optimize: speed up deployment while ensuring latest images and better diagnostics

- Force pull latest images with hash verification before deployment
- Reduce timeouts: deployment 3min, health check 90s, stack removal 60s
- Check every 2s instead of 5s for faster feedback
- Exit early on clear failures to avoid waiting
- Comprehensive container-level diagnostics on failure
- Show task status, container logs, and service inspection
- Verify deployed image matches pulled image hash
- Minimal 10s initialization wait instead of 30s
- Better error messages with image hash tracking
This commit is contained in:
		
							parent
							
								
									bf09520c1d
								
							
						
					
					
						commit
						993b32af46
					
				|  | @ -29,10 +29,11 @@ set -euo pipefail | ||||||
| readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" | readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" | ||||||
| readonly LOCK_FILE="/tmp/authelia-deploy.lock" | readonly LOCK_FILE="/tmp/authelia-deploy.lock" | ||||||
| readonly MAX_RETRIES=3 | readonly MAX_RETRIES=3 | ||||||
| readonly RETRY_DELAY=10 | readonly RETRY_DELAY=5  # Reduced from 10s to 5s | ||||||
| readonly DEPLOYMENT_TIMEOUT=300  # 5 minutes | readonly DEPLOYMENT_TIMEOUT=180  # Reduced from 300s to 180s (3 minutes) | ||||||
| readonly HEALTH_CHECK_TIMEOUT=120  # 2 minutes | readonly HEALTH_CHECK_TIMEOUT=90  # Reduced from 120s to 90s | ||||||
| readonly MIN_DISK_SPACE_MB=1000 | readonly MIN_DISK_SPACE_MB=500  # Reduced from 1000MB to 500MB | ||||||
|  | readonly FORCE_PULL=true  # Always pull latest images | ||||||
| 
 | 
 | ||||||
| # Color codes for output | # Color codes for output | ||||||
| readonly RED='\033[0;31m' | readonly RED='\033[0;31m' | ||||||
|  | @ -40,11 +41,13 @@ readonly GREEN='\033[0;32m' | ||||||
| readonly YELLOW='\033[1;33m' | readonly YELLOW='\033[1;33m' | ||||||
| readonly BLUE='\033[0;34m' | readonly BLUE='\033[0;34m' | ||||||
| readonly PURPLE='\033[0;35m' | readonly PURPLE='\033[0;35m' | ||||||
|  | readonly CYAN='\033[0;36m' | ||||||
| readonly NC='\033[0m' # No Color | readonly NC='\033[0m' # No Color | ||||||
| 
 | 
 | ||||||
| # Global variables for cleanup | # Global variables for cleanup | ||||||
| DEPLOYMENT_STARTED=false | DEPLOYMENT_STARTED=false | ||||||
| OLD_IMAGE_ID="" | OLD_IMAGE_HASH="" | ||||||
|  | NEW_IMAGE_HASH="" | ||||||
| ROLLBACK_NEEDED=false | ROLLBACK_NEEDED=false | ||||||
| 
 | 
 | ||||||
| # Logging functions | # Logging functions | ||||||
|  | @ -179,23 +182,23 @@ pre_flight_checks() { | ||||||
| # Get current image ID for rollback purposes | # Get current image ID for rollback purposes | ||||||
| get_current_image_id() { | get_current_image_id() { | ||||||
|     if docker stack ps "${CI_REPO_NAME}" >/dev/null 2>&1; then |     if docker stack ps "${CI_REPO_NAME}" >/dev/null 2>&1; then | ||||||
|         OLD_IMAGE_ID=$(docker stack ps "${CI_REPO_NAME}" --format "table {{.Image}}" | grep authelia | head -n1 || echo "") |         OLD_IMAGE_HASH=$(docker stack ps "${CI_REPO_NAME}" --format "table {{.Image}}" | grep authelia | head -n1 || echo "") | ||||||
|         if [[ -n "$OLD_IMAGE_ID" ]]; then |         if [[ -n "$OLD_IMAGE_HASH" ]]; then | ||||||
|             debug "Current image for rollback: $OLD_IMAGE_ID" |             debug "Current image for rollback: $OLD_IMAGE_HASH" | ||||||
|         fi |         fi | ||||||
|     fi |     fi | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| # Rollback function | # Rollback function | ||||||
| attempt_rollback() { | attempt_rollback() { | ||||||
|     if [[ -n "$OLD_IMAGE_ID" && "$OLD_IMAGE_ID" != "IMAGE" ]]; then |     if [[ -n "$OLD_IMAGE_HASH" && "$OLD_IMAGE_HASH" != "IMAGE" ]]; then | ||||||
|         warning "Attempting rollback to previous image: $OLD_IMAGE_ID" |         warning "Attempting rollback to previous image: $OLD_IMAGE_HASH" | ||||||
|          |          | ||||||
|         # This would require a more complex rollback mechanism |         # This would require a more complex rollback mechanism | ||||||
|         # For now, just log the attempt |         # For now, just log the attempt | ||||||
|         error "Rollback mechanism not yet implemented" |         error "Rollback mechanism not yet implemented" | ||||||
|         error "Manual intervention required" |         error "Manual intervention required" | ||||||
|         error "Previous image was: $OLD_IMAGE_ID" |         error "Previous image was: $OLD_IMAGE_HASH" | ||||||
|     else |     else | ||||||
|         error "No previous image information available for rollback" |         error "No previous image information available for rollback" | ||||||
|     fi |     fi | ||||||
|  | @ -209,10 +212,107 @@ docker_registry_login() { | ||||||
|     retry_command "$login_cmd" "Docker registry login" |     retry_command "$login_cmd" "Docker registry login" | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| # Wait for stack removal with timeout | # Force pull latest images to ensure we deploy the newest version | ||||||
|  | force_pull_latest_images() { | ||||||
|  |     # Pull the Authelia, MariaDB and Redis production images and record the | ||||||
|  |     # local ID of the freshly pulled Authelia image in NEW_IMAGE_HASH. | ||||||
|  |     # Globals: reads OLD_IMAGE_HASH, writes NEW_IMAGE_HASH. | ||||||
|  |     # Returns: 1 if the Authelia pull fails or its image ID cannot be read. | ||||||
|  |     log "🚀 Force pulling latest images to ensure fresh deployment" | ||||||
|  |      | ||||||
|  |     # Image references are hard-coded here; keep them in sync with the | ||||||
|  |     # production compose file. | ||||||
|  |     local authelia_image="git.nixc.us/nixius/authelia:production-authelia" | ||||||
|  |     local mariadb_image="git.nixc.us/nixius/authelia:production-mariadb" | ||||||
|  |     local redis_image="git.nixc.us/nixius/authelia:production-redis" | ||||||
|  |      | ||||||
|  |     log "Pulling Authelia image..." | ||||||
|  |     if ! docker pull "$authelia_image"; then | ||||||
|  |         error "❌ Failed to pull Authelia image" | ||||||
|  |         return 1 | ||||||
|  |     fi | ||||||
|  |      | ||||||
|  |     # Inspect the exact reference that was pulled instead of grepping the | ||||||
|  |     # `docker images` table: the grep could match another repository that | ||||||
|  |     # carries the same tag, and under `set -euo pipefail` a non-matching grep | ||||||
|  |     # aborted the script before the empty-hash branch could report anything. | ||||||
|  |     local image_id | ||||||
|  |     image_id=$(docker image inspect --format '{{.Id}}' "$authelia_image" 2>/dev/null || true) | ||||||
|  |     image_id="${image_id#sha256:}" | ||||||
|  |     # Keep the 12-character short form that `docker images` prints. | ||||||
|  |     NEW_IMAGE_HASH="${image_id:0:12}" | ||||||
|  |      | ||||||
|  |     # Fail before announcing success or pulling the remaining images. | ||||||
|  |     if [[ -z "$NEW_IMAGE_HASH" ]]; then | ||||||
|  |         error "❌ Could not determine new image hash" | ||||||
|  |         return 1 | ||||||
|  |     fi | ||||||
|  |     success "✅ Authelia image pulled: $NEW_IMAGE_HASH" | ||||||
|  |      | ||||||
|  |     log "Pulling MariaDB image..." | ||||||
|  |     retry_command "docker pull $mariadb_image" "MariaDB image pull" | ||||||
|  |      | ||||||
|  |     log "Pulling Redis image..." | ||||||
|  |     retry_command "docker pull $redis_image" "Redis image pull" | ||||||
|  |      | ||||||
|  |     # NOTE(review): get_current_image_id fills OLD_IMAGE_HASH from the | ||||||
|  |     # `.Image` column of `docker stack ps`, which looks like an image | ||||||
|  |     # reference rather than an ID, so this comparison probably always | ||||||
|  |     # reports a new image - confirm and align the two values. | ||||||
|  |     if [[ "$NEW_IMAGE_HASH" != "$OLD_IMAGE_HASH" ]]; then | ||||||
|  |         success "🔄 New image detected: $OLD_IMAGE_HASH → $NEW_IMAGE_HASH" | ||||||
|  |     else | ||||||
|  |         warning "⚠️ Same image hash detected: $NEW_IMAGE_HASH (this may be expected)" | ||||||
|  |     fi | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | # Get detailed container information for debugging | ||||||
|  | get_container_diagnostics() { | ||||||
|  |     # Print task states, recent task logs, service inspection output and | ||||||
|  |     # container logs for one service of the stack. | ||||||
|  |     # Arguments: $1 - short service name; the swarm service inspected is | ||||||
|  |     #            "${CI_REPO_NAME}_$1". | ||||||
|  |     # Best-effort: a failing docker command is reported or skipped so that | ||||||
|  |     # `set -e` cannot cut the diagnostics short. | ||||||
|  |     local service_name="$1" | ||||||
|  |     local full_name="${CI_REPO_NAME}_${service_name}" | ||||||
|  |      | ||||||
|  |     error "=== 🔍 DETAILED DIAGNOSTICS FOR ${service_name} ===" | ||||||
|  |      | ||||||
|  |     # Get all tasks for this service; an error here must not abort the script | ||||||
|  |     local tasks | ||||||
|  |     tasks=$(docker service ps "$full_name" --format "{{.ID}}\t{{.Name}}\t{{.CurrentState}}\t{{.Error}}" --no-trunc 2>/dev/null || true) | ||||||
|  |      | ||||||
|  |     if [[ -n "$tasks" ]]; then | ||||||
|  |         error "Service tasks:" | ||||||
|  |         echo "$tasks" | while IFS=$'\t' read -r task_id name state task_error; do | ||||||
|  |             error "  Task: $name" | ||||||
|  |             error "    ID: $task_id" | ||||||
|  |             error "    State: $state" | ||||||
|  |             if [[ -n "$task_error" ]]; then | ||||||
|  |                 error "    Error: $task_error" | ||||||
|  |             fi | ||||||
|  |              | ||||||
|  |             # Ask for this task's own logs; querying the service name here | ||||||
|  |             # printed the same service-wide lines once per task. | ||||||
|  |             log "Attempting to get logs for task $task_id..." | ||||||
|  |             local task_logs | ||||||
|  |             task_logs=$(docker service logs "$task_id" --raw --tail 20 2>/dev/null || echo "No logs available") | ||||||
|  |             if [[ "$task_logs" != "No logs available" ]]; then | ||||||
|  |                 error "    Recent logs:" | ||||||
|  |                 echo "$task_logs" | sed 's/^/      /' | ||||||
|  |             fi | ||||||
|  |         done | ||||||
|  |     else | ||||||
|  |         error "No service tasks found for ${service_name}" | ||||||
|  |     fi | ||||||
|  |      | ||||||
|  |     # Capture the inspection first so that only a real inspect failure is | ||||||
|  |     # reported; piping docker straight into `head` under pipefail could flag | ||||||
|  |     # a failure merely because head closed the pipe early. | ||||||
|  |     error "Service inspection:" | ||||||
|  |     local inspect_output | ||||||
|  |     if inspect_output=$(docker service inspect "$full_name" --pretty 2>/dev/null); then | ||||||
|  |         head -n 20 <<<"$inspect_output" | sed 's/^/  /' | ||||||
|  |     else | ||||||
|  |         error "  Service inspect failed" | ||||||
|  |     fi | ||||||
|  |      | ||||||
|  |     # List containers labelled with this service name (running or exited) | ||||||
|  |     local containers | ||||||
|  |     containers=$(docker ps -a --filter "label=com.docker.swarm.service.name=${full_name}" --format "{{.ID}}\t{{.Status}}\t{{.Names}}" 2>/dev/null || echo "") | ||||||
|  |      | ||||||
|  |     if [[ -n "$containers" ]]; then | ||||||
|  |         error "Associated containers:" | ||||||
|  |         echo "$containers" | while IFS=$'\t' read -r container_id status name; do | ||||||
|  |             error "  Container: $name ($container_id)" | ||||||
|  |             error "    Status: $status" | ||||||
|  |              | ||||||
|  |             # Merge stderr into the capture so startup errors are shown too | ||||||
|  |             local container_logs | ||||||
|  |             container_logs=$(docker logs "$container_id" --tail 15 2>&1 || echo "No container logs available") | ||||||
|  |             error "    Container logs (last 15 lines):" | ||||||
|  |             echo "$container_logs" | sed 's/^/      /' | ||||||
|  |         done | ||||||
|  |     else | ||||||
|  |         error "No containers found for service ${service_name}" | ||||||
|  |     fi | ||||||
|  |      | ||||||
|  |     error "=== END DIAGNOSTICS FOR ${service_name} ===" | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | # Optimized wait for stack removal | ||||||
| wait_for_stack_removal() { | wait_for_stack_removal() { | ||||||
|     log "Verifying stack removal completed" |     log "Verifying stack removal completed" | ||||||
|     local timeout=$((DEPLOYMENT_TIMEOUT)) |     local timeout=60  # Reduced timeout for faster deployment | ||||||
|     local elapsed=0 |     local elapsed=0 | ||||||
|      |      | ||||||
|     while docker stack ls | grep -q "${CI_REPO_NAME}"; do  |     while docker stack ls | grep -q "${CI_REPO_NAME}"; do  | ||||||
|  | @ -221,9 +321,11 @@ wait_for_stack_removal() { | ||||||
|             return 1 |             return 1 | ||||||
|         fi |         fi | ||||||
|          |          | ||||||
|  |         if [[ $((elapsed % 10)) -eq 0 ]]; then  # Log every 10 seconds instead of 5 | ||||||
|             log "Stack still exists, waiting... (${elapsed}s/${timeout}s)" |             log "Stack still exists, waiting... (${elapsed}s/${timeout}s)" | ||||||
|         sleep 5 |         fi | ||||||
|         elapsed=$((elapsed + 5)) |         sleep 2  # Check every 2 seconds instead of 5 | ||||||
|  |         elapsed=$((elapsed + 2)) | ||||||
|     done |     done | ||||||
|      |      | ||||||
|     success "Stack removal completed in ${elapsed} seconds" |     success "Stack removal completed in ${elapsed} seconds" | ||||||
|  | @ -302,56 +404,80 @@ deploy_stack() { | ||||||
|     success "Stack deployment command completed" |     success "Stack deployment command completed" | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| # Enhanced health checking with multiple validation methods | # Enhanced health checking optimized for speed and accuracy | ||||||
| comprehensive_health_check() { | comprehensive_health_check() { | ||||||
|     log "Starting comprehensive health check (${HEALTH_CHECK_TIMEOUT}s timeout)" |     log "🔍 Starting rapid health verification (${HEALTH_CHECK_TIMEOUT}s timeout)" | ||||||
|     local start_time=$(date +%s) |     local start_time=$(date +%s) | ||||||
|     local timeout=$HEALTH_CHECK_TIMEOUT |     local timeout=$HEALTH_CHECK_TIMEOUT | ||||||
|      |      | ||||||
|     # Wait for services to initialize |     # Minimal initial wait - just 10 seconds instead of 30 | ||||||
|     log "Waiting for services to initialize (30 seconds)" |     log "Brief initialization wait (10 seconds)..." | ||||||
|     sleep 30 |     sleep 10 | ||||||
|      |      | ||||||
|     # Check deployment status |     # Get immediate deployment status | ||||||
|     log "Checking deployment status" |     log "Checking deployment status" | ||||||
|     docker stack ps "${CI_REPO_NAME}" |     docker stack ps "${CI_REPO_NAME}" | ||||||
|      |      | ||||||
|     # Health check loop with multiple validation methods |     # Fast health check loop with 2-second intervals | ||||||
|     local check_count=0 |     local check_count=0 | ||||||
|     local max_checks=$((timeout / 5)) |     local max_checks=$((timeout / 2))  # Check every 2 seconds | ||||||
|  |     local authelia_healthy=false | ||||||
|  |     local last_status="" | ||||||
|      |      | ||||||
|     while [[ $check_count -lt $max_checks ]]; do |     while [[ $check_count -lt $max_checks ]]; do | ||||||
|         local current_time=$(date +%s) |         local current_time=$(date +%s) | ||||||
|         local elapsed=$((current_time - start_time)) |         local elapsed=$((current_time - start_time)) | ||||||
|          |          | ||||||
|         log "Health check attempt $((check_count + 1))/${max_checks} (${elapsed}s elapsed)" |         # Only log every 10 seconds to reduce noise | ||||||
|  |         if [[ $((check_count % 5)) -eq 0 ]]; then | ||||||
|  |             log "Health check ${check_count}/${max_checks} (${elapsed}s elapsed)" | ||||||
|  |         fi | ||||||
|          |          | ||||||
|         # Check if authelia service is running |         # Get current service status | ||||||
|         if docker stack ps "${CI_REPO_NAME}" | grep -q "authelia_authelia.*Running"; then |         local service_status | ||||||
|             success "✅ Authelia service is running!" |         service_status=$(docker stack ps "${CI_REPO_NAME}" --format "{{.Name}}\t{{.CurrentState}}\t{{.Error}}" | grep "authelia_authelia" | head -n1) | ||||||
|          |          | ||||||
|             # Additional verification checks |         if [[ -n "$service_status" ]]; then | ||||||
|             log "Performing additional health verification..." |             local name=$(echo "$service_status" | cut -f1) | ||||||
|             sleep 5 |             local state=$(echo "$service_status" | cut -f2) | ||||||
|  |             local error_msg=$(echo "$service_status" | cut -f3) | ||||||
|              |              | ||||||
|             # Check service is stable (not restarting) |             # Check for Running state | ||||||
|             local service_info |             if echo "$state" | grep -q "Running"; then | ||||||
|             service_info=$(docker stack ps "${CI_REPO_NAME}" | grep "authelia_authelia" | head -n1) |                 # Verify it's actually stable by checking for a few seconds | ||||||
|              |                 if [[ "$last_status" == "Running" ]]; then | ||||||
|             if echo "$service_info" | grep -q "Running"; then |                     # Double-check: no recent failures | ||||||
|                 # Check if there are any failed instances |  | ||||||
|                     local failed_count |                     local failed_count | ||||||
|                     failed_count=$(docker stack ps "${CI_REPO_NAME}" | grep "authelia_authelia" | grep -c "Failed" || echo "0") |                     failed_count=$(docker stack ps "${CI_REPO_NAME}" | grep "authelia_authelia" | grep -c "Failed" || echo "0") | ||||||
|                      |                      | ||||||
|                     if [[ $failed_count -eq 0 ]]; then |                     if [[ $failed_count -eq 0 ]]; then | ||||||
|                     success "🎉 Production deployment completed successfully!" |                         # Final verification: ensure we're using the new image | ||||||
|                     success "Authelia service is healthy and stable" |                         local current_image | ||||||
|                     success "Total deployment time: ${elapsed} seconds" |                         current_image=$(docker stack ps "${CI_REPO_NAME}" --format "{{.Image}}" | grep authelia | head -n1) | ||||||
|  |                          | ||||||
|  |                         if [[ "$current_image" == *"$NEW_IMAGE_HASH"* ]] || [[ -z "$NEW_IMAGE_HASH" ]]; then | ||||||
|  |                             success "✅ Authelia service is healthy and running!" | ||||||
|  |                             success "🎯 Using correct image: $current_image" | ||||||
|  |                             success "⚡ Total deployment time: ${elapsed} seconds" | ||||||
|                             ROLLBACK_NEEDED=false |                             ROLLBACK_NEEDED=false | ||||||
|                             return 0 |                             return 0 | ||||||
|                         else |                         else | ||||||
|                     warning "Found $failed_count failed service instances, continuing health checks..." |                             warning "⚠️ Service running but using wrong image: $current_image (expected: $NEW_IMAGE_HASH)" | ||||||
|                         fi |                         fi | ||||||
|  |                     else | ||||||
|  |                         warning "⚠️ Service running but found $failed_count failed instances" | ||||||
|  |                     fi | ||||||
|  |                 fi | ||||||
|  |                 last_status="Running" | ||||||
|  |             elif echo "$state" | grep -q "Failed\|Rejected\|Shutdown"; then | ||||||
|  |                 error "❌ Service failed: $state" | ||||||
|  |                 if [[ -n "$error_msg" ]]; then | ||||||
|  |                     error "Error: $error_msg" | ||||||
|  |                 fi | ||||||
|  |                 break  # Exit early on clear failure | ||||||
|  |             else | ||||||
|  |                 last_status="$state" | ||||||
|  |                 debug "Service state: $state" | ||||||
|             fi |             fi | ||||||
|         fi |         fi | ||||||
|          |          | ||||||
|  | @ -359,25 +485,36 @@ comprehensive_health_check() { | ||||||
|             break |             break | ||||||
|         fi |         fi | ||||||
|          |          | ||||||
|         log "Waiting for authelia service... (${elapsed}s/${timeout}s)" |         sleep 2 | ||||||
|         sleep 5 |  | ||||||
|         ((check_count++)) |         ((check_count++)) | ||||||
|     done |     done | ||||||
|      |      | ||||||
|     # Health check failed |     # Health check failed - provide comprehensive diagnostics | ||||||
|     error "❌ Health check failed after ${timeout} seconds" |     error "❌ Health check failed after ${elapsed} seconds" | ||||||
|     error "Deployment verification failed" |     error "Deployment verification failed" | ||||||
|      |      | ||||||
|     # Show detailed debugging information |     # Get detailed diagnostics for each service | ||||||
|     error "=== DEBUGGING INFORMATION ===" |     log "🔍 Gathering comprehensive diagnostics..." | ||||||
|     error "Stack status:" |  | ||||||
|     docker stack ps "${CI_REPO_NAME}" || true |  | ||||||
|      |      | ||||||
|     error "Authelia service logs (last 30 lines):" |     local services=("authelia" "mariadb" "redis") | ||||||
|     docker service logs "${CI_REPO_NAME}_authelia" --tail 30 || true |     for service in "${services[@]}"; do | ||||||
|  |         if docker service ls --format "{{.Name}}" | grep -q "${CI_REPO_NAME}_${service}"; then | ||||||
|  |             get_container_diagnostics "$service" | ||||||
|  |         else | ||||||
|  |             error "Service ${CI_REPO_NAME}_${service} not found!" | ||||||
|  |         fi | ||||||
|  |     done | ||||||
|      |      | ||||||
|     error "Docker service inspect:" |     # Additional stack-level diagnostics | ||||||
|     docker service inspect "${CI_REPO_NAME}_authelia" --pretty || true |     error "=== 📊 STACK-LEVEL DIAGNOSTICS ===" | ||||||
|  |     error "Full stack status:" | ||||||
|  |     docker stack ps "${CI_REPO_NAME}" --no-trunc || true | ||||||
|  |      | ||||||
|  |     error "Stack services:" | ||||||
|  |     docker stack services "${CI_REPO_NAME}" || true | ||||||
|  |      | ||||||
|  |     error "Recent Docker events:" | ||||||
|  |     docker events --since="$((elapsed + 60))s" --until="now" --filter "container" 2>/dev/null | tail -10 || true | ||||||
|      |      | ||||||
|     return 1 |     return 1 | ||||||
| } | } | ||||||
|  | @ -395,13 +532,16 @@ main() { | ||||||
|     # Step 1: Docker registry login |     # Step 1: Docker registry login | ||||||
|     docker_registry_login |     docker_registry_login | ||||||
|      |      | ||||||
|  |     # Step 1.5: Force pull latest images to ensure fresh deployment | ||||||
|  |     force_pull_latest_images | ||||||
|  |      | ||||||
|     # Step 2: Remove old stack to release secrets |     # Step 2: Remove old stack to release secrets | ||||||
|     log "Removing old stack to release secrets" |     log "Removing old stack to release secrets" | ||||||
|     docker stack rm "${CI_REPO_NAME}" || true |     docker stack rm "${CI_REPO_NAME}" || true | ||||||
|      |      | ||||||
|     # Step 3: Wait for complete stack removal with timeout |     # Step 3: Wait for complete stack removal with optimized timeout | ||||||
|     log "Waiting for complete stack removal (30 seconds minimum)" |     log "Waiting for complete stack removal (minimum 15 seconds)" | ||||||
|     sleep 30 |     sleep 15  # Reduced from 30 seconds | ||||||
|     wait_for_stack_removal |     wait_for_stack_removal | ||||||
|      |      | ||||||
|     # Step 4 & 5: Manage secrets (remove old, create new) |     # Step 4 & 5: Manage secrets (remove old, create new) | ||||||
|  | @ -410,10 +550,11 @@ main() { | ||||||
|     # Step 6: Deploy new stack |     # Step 6: Deploy new stack | ||||||
|     deploy_stack |     deploy_stack | ||||||
|      |      | ||||||
|     # Step 7-9: Comprehensive health checking |     # Step 7-9: Rapid health checking with container diagnostics | ||||||
|     comprehensive_health_check |     comprehensive_health_check | ||||||
|      |      | ||||||
|     success "🎉 Production deployment completed successfully!" |     success "🎉 Production deployment completed successfully!" | ||||||
|  |     success "🏆 Deployed image: $NEW_IMAGE_HASH" | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| # Run main function | # Run main function | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	 Your Name
						Your Name