From 993b32af4691ebe3028abc1d98c51302db94e2aa Mon Sep 17 00:00:00 2001
From: Your Name <you@example.com>
Date: Thu, 5 Jun 2025 09:15:00 -0400
Subject: [PATCH] optimize: speed up deployment while ensuring latest images
 and better diagnostics

- Force pull latest images with hash verification before deployment
- Reduce timeouts: deployment 3min, health check 90s, stack removal 60s
- Check every 2s instead of 5s for faster feedback
- Exit early on clear failures to avoid waiting
- Comprehensive container-level diagnostics on failure
- Show task status, container logs, and service inspection
- Verify deployed image matches pulled image hash
- Minimal 10s initialization wait instead of 30s
- Better error messages with image hash tracking

---
 scripts/ci-deploy-production.sh | 271 ++++++++++++++++++++++++--------
 1 file changed, 206 insertions(+), 65 deletions(-)

diff --git a/scripts/ci-deploy-production.sh b/scripts/ci-deploy-production.sh
index 6271177..30c9d6a 100755
--- a/scripts/ci-deploy-production.sh
+++ b/scripts/ci-deploy-production.sh
@@ -29,10 +29,11 @@ set -euo pipefail
 readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 readonly LOCK_FILE="/tmp/authelia-deploy.lock"
 readonly MAX_RETRIES=3
-readonly RETRY_DELAY=10
-readonly DEPLOYMENT_TIMEOUT=300  # 5 minutes
-readonly HEALTH_CHECK_TIMEOUT=120  # 2 minutes
-readonly MIN_DISK_SPACE_MB=1000
+readonly RETRY_DELAY=5  # Retry delay in seconds (reduced from 10)
+readonly DEPLOYMENT_TIMEOUT=180  # Seconds, i.e. 3 minutes (reduced from 300)
+readonly HEALTH_CHECK_TIMEOUT=90  # Seconds allowed for comprehensive_health_check (reduced from 120)
+readonly MIN_DISK_SPACE_MB=500  # Disk-space threshold in MB (reduced from 1000)
+readonly FORCE_PULL=true  # Always pull latest images. NOTE(review): no code in this patch reads this flag - confirm it is used
 
 # Color codes for output
 readonly RED='\033[0;31m'
@@ -40,11 +41,13 @@ readonly GREEN='\033[0;32m'
 readonly YELLOW='\033[1;33m'
 readonly BLUE='\033[0;34m'
 readonly PURPLE='\033[0;35m'
+readonly CYAN='\033[0;36m'
 readonly NC='\033[0m' # No Color
 
 # Global variables for cleanup
 DEPLOYMENT_STARTED=false
-OLD_IMAGE_ID=""
+OLD_IMAGE_HASH=""  # Image column of `docker stack ps`, captured by get_current_image_id (an image reference, not a bare hash)
+NEW_IMAGE_HASH=""  # Local image ID of the pulled Authelia image, set by force_pull_latest_images
 ROLLBACK_NEEDED=false
 
 # Logging functions
@@ -179,23 +182,23 @@ pre_flight_checks() {
 # Get current image ID for rollback purposes
 get_current_image_id() {
     if docker stack ps "${CI_REPO_NAME}" >/dev/null 2>&1; then
-        OLD_IMAGE_ID=$(docker stack ps "${CI_REPO_NAME}" --format "table {{.Image}}" | grep authelia | head -n1 || echo "")
-        if [[ -n "$OLD_IMAGE_ID" ]]; then
-            debug "Current image for rollback: $OLD_IMAGE_ID"
+        OLD_IMAGE_HASH=$(docker stack ps "${CI_REPO_NAME}" --no-trunc --format "{{.Name}}\t{{.Image}}" | grep -F "${CI_REPO_NAME}_authelia" | head -n1 | cut -f2 || echo "")  # select by task name (all three images contain "authelia"); --no-trunc keeps the pinned @sha256 digest
+        if [[ -n "$OLD_IMAGE_HASH" ]]; then
+            debug "Current image for rollback: $OLD_IMAGE_HASH"
         fi
     fi
 }
 
 # Rollback function
 attempt_rollback() {
-    if [[ -n "$OLD_IMAGE_ID" && "$OLD_IMAGE_ID" != "IMAGE" ]]; then
-        warning "Attempting rollback to previous image: $OLD_IMAGE_ID"
+    if [[ -n "$OLD_IMAGE_HASH" && "$OLD_IMAGE_HASH" != "IMAGE" ]]; then
+        warning "Attempting rollback to previous image: $OLD_IMAGE_HASH"
         
         # This would require a more complex rollback mechanism
         # For now, just log the attempt
         error "Rollback mechanism not yet implemented"
         error "Manual intervention required"
-        error "Previous image was: $OLD_IMAGE_ID"
+        error "Previous image was: $OLD_IMAGE_HASH"
     else
         error "No previous image information available for rollback"
     fi
@@ -209,10 +212,107 @@ docker_registry_login() {
     retry_command "$login_cmd" "Docker registry login"
 }
 
-# Wait for stack removal with timeout
+# Force pull latest images so the newest build is deployed; sets NEW_IMAGE_HASH, returns 1 on failure.
+force_pull_latest_images() {
+    log "🚀 Force pulling latest images to ensure fresh deployment"
+    # Image references; these must match the production docker-compose file
+    local authelia_image="git.nixc.us/nixius/authelia:production-authelia"
+    local mariadb_image="git.nixc.us/nixius/authelia:production-mariadb"
+    local redis_image="git.nixc.us/nixius/authelia:production-redis"
+
+    # Remember the tag's local image ID before pulling so a changed image is detectable
+    local previous_id
+    previous_id=$(docker images -q "$authelia_image" | head -n1 || true)
+    log "Pulling Authelia image..."
+    if retry_command "docker pull $authelia_image" "Authelia image pull"; then
+        NEW_IMAGE_HASH=$(docker images -q "$authelia_image" | head -n1 || true)
+        success "✅ Authelia image pulled: $NEW_IMAGE_HASH"
+    else
+        error "❌ Failed to pull Authelia image"
+        return 1
+    fi
+
+    log "Pulling MariaDB image..."
+    retry_command "docker pull $mariadb_image" "MariaDB image pull"
+    log "Pulling Redis image..."
+    retry_command "docker pull $redis_image" "Redis image pull"
+
+    # Compare with the pre-pull ID: OLD_IMAGE_HASH holds a stack image reference, not an image ID
+    if [[ -z "$NEW_IMAGE_HASH" ]]; then
+        error "❌ Could not determine new image hash"
+        return 1
+    elif [[ "$NEW_IMAGE_HASH" != "$previous_id" ]]; then
+        success "🔄 New image detected: ${previous_id:-none} → $NEW_IMAGE_HASH"
+    else
+        warning "⚠️ Same image hash detected: $NEW_IMAGE_HASH (this may be expected)"
+    fi
+}
+
+# Get detailed container information for debugging (best-effort: docker failures are tolerated).
+# Arguments: $1 - service name without the "${CI_REPO_NAME}_" stack prefix
+get_container_diagnostics() {
+    local service_name="$1"
+
+    error "=== 🔍 DETAILED DIAGNOSTICS FOR ${service_name} ==="
+
+    # Get all tasks for this service; '|| true' keeps a failed lookup from aborting under set -e
+    local tasks
+    tasks=$(docker service ps "${CI_REPO_NAME}_${service_name}" --format "{{.ID}}\t{{.Name}}\t{{.CurrentState}}\t{{.Error}}" --no-trunc 2>/dev/null || true)
+
+    if [[ -n "$tasks" ]]; then
+        error "Service tasks:"
+        echo "$tasks" | while IFS=$'\t' read -r task_id name state task_error; do
+            error "  Task: $name"
+            error "    ID: $task_id"
+            error "    State: $state"
+            if [[ -n "$task_error" ]]; then
+                error "    Error: $task_error"
+            fi
+
+            # Fetch this task's own logs; querying the service repeated the same lines for every task
+            log "Attempting to get logs for task $task_id..."
+            local task_logs
+            task_logs=$(docker service logs "$task_id" --raw --tail 20 2>/dev/null || echo "No logs available")
+            if [[ "$task_logs" != "No logs available" ]]; then
+                error "    Recent logs:"
+                echo "$task_logs" | sed 's/^/      /'
+            fi
+        done
+    else
+        error "No service tasks found for ${service_name}"
+    fi
+
+    # Get service inspection details
+    error "Service inspection:"
+    docker service inspect "${CI_REPO_NAME}_${service_name}" --pretty 2>/dev/null | head -20 | sed 's/^/  /' || error "  Service inspect failed"
+
+    # Check if there are any containers running for this service
+    local containers
+    containers=$(docker ps -a --filter "label=com.docker.swarm.service.name=${CI_REPO_NAME}_${service_name}" --format "{{.ID}}\t{{.Status}}\t{{.Names}}" 2>/dev/null || echo "")
+
+    if [[ -n "$containers" ]]; then
+        error "Associated containers:"
+        echo "$containers" | while IFS=$'\t' read -r container_id status name; do
+            error "  Container: $name ($container_id)"
+            error "    Status: $status"
+
+            # Get container logs
+            local container_logs
+            container_logs=$(docker logs "$container_id" --tail 15 2>&1 || echo "No container logs available")
+            error "    Container logs (last 15 lines):"
+            echo "$container_logs" | sed 's/^/      /'
+        done
+    else
+        error "No containers found for service ${service_name}"
+    fi
+
+    error "=== END DIAGNOSTICS FOR ${service_name} ==="
+}
+
+# Optimized wait for stack removal
 wait_for_stack_removal() {
     log "Verifying stack removal completed"
-    local timeout=$((DEPLOYMENT_TIMEOUT))
+    local timeout=60  # Reduced timeout for faster deployment
     local elapsed=0
     
     while docker stack ls | grep -q "${CI_REPO_NAME}"; do 
@@ -221,9 +321,11 @@ wait_for_stack_removal() {
             return 1
         fi
         
-        log "Stack still exists, waiting... (${elapsed}s/${timeout}s)"
-        sleep 5
-        elapsed=$((elapsed + 5))
+        if [[ $((elapsed % 10)) -eq 0 ]]; then  # Log every 10 seconds instead of 5
+            log "Stack still exists, waiting... (${elapsed}s/${timeout}s)"
+        fi
+        sleep 2  # Check every 2 seconds instead of 5
+        elapsed=$((elapsed + 2))
     done
     
     success "Stack removal completed in ${elapsed} seconds"
@@ -302,56 +404,80 @@ deploy_stack() {
     success "Stack deployment command completed"
 }
 
-# Enhanced health checking with multiple validation methods
+# Enhanced health checking optimized for speed and accuracy
 comprehensive_health_check() {
-    log "Starting comprehensive health check (${HEALTH_CHECK_TIMEOUT}s timeout)"
+    log "🔍 Starting rapid health verification (${HEALTH_CHECK_TIMEOUT}s timeout)"
     local start_time=$(date +%s)
     local timeout=$HEALTH_CHECK_TIMEOUT
     
-    # Wait for services to initialize
-    log "Waiting for services to initialize (30 seconds)"
-    sleep 30
+    # Minimal initial wait - just 10 seconds instead of 30
+    log "Brief initialization wait (10 seconds)..."
+    sleep 10
     
-    # Check deployment status
+    # Get immediate deployment status
     log "Checking deployment status"
     docker stack ps "${CI_REPO_NAME}"
     
-    # Health check loop with multiple validation methods
+    # Fast health check loop with 2-second intervals
     local check_count=0
-    local max_checks=$((timeout / 5))
+    local max_checks=$((timeout / 2))  # Check every 2 seconds
+    local authelia_healthy=false
+    local last_status=""
     
     while [[ $check_count -lt $max_checks ]]; do
         local current_time=$(date +%s)
         local elapsed=$((current_time - start_time))
         
-        log "Health check attempt $((check_count + 1))/${max_checks} (${elapsed}s elapsed)"
+        # Only log every 10 seconds to reduce noise
+        if [[ $((check_count % 5)) -eq 0 ]]; then
+            log "Health check ${check_count}/${max_checks} (${elapsed}s elapsed)"
+        fi
         
-        # Check if authelia service is running
-        if docker stack ps "${CI_REPO_NAME}" | grep -q "authelia_authelia.*Running"; then
-            success "✅ Authelia service is running!"
+        # Get current service status. '|| true': before the task is listed grep matches
+        # nothing, and under 'set -euo pipefail' that would abort the whole deployment.
+        local service_status state error_msg
+        service_status=$(docker stack ps "${CI_REPO_NAME}" --format "{{.Name}}\t{{.CurrentState}}\t{{.Error}}" | grep "authelia_authelia" | head -n1 || true)
+
+        if [[ -n "$service_status" ]]; then
+            state=$(echo "$service_status" | cut -f2)
+            error_msg=$(echo "$service_status" | cut -f3)
             
-            # Additional verification checks
-            log "Performing additional health verification..."
-            sleep 5
-            
-            # Check service is stable (not restarting)
-            local service_info
-            service_info=$(docker stack ps "${CI_REPO_NAME}" | grep "authelia_authelia" | head -n1)
-            
-            if echo "$service_info" | grep -q "Running"; then
-                # Check if there are any failed instances
-                local failed_count
-                failed_count=$(docker stack ps "${CI_REPO_NAME}" | grep "authelia_authelia" | grep -c "Failed" || echo "0")
-                
-                if [[ $failed_count -eq 0 ]]; then
-                    success "🎉 Production deployment completed successfully!"
-                    success "Authelia service is healthy and stable"
-                    success "Total deployment time: ${elapsed} seconds"
-                    ROLLBACK_NEEDED=false
-                    return 0
-                else
-                    warning "Found $failed_count failed service instances, continuing health checks..."
+            # Check for Running state
+            if [[ "$state" == *Running* ]]; then
+                # Stable only when the previous poll also reported Running
+                if [[ "$last_status" == "Running" ]]; then
+                    # grep -c prints 0 itself when nothing matches; '|| echo "0"' appended a second 0 and broke -eq
+                    local failed_count
+                    failed_count=$(docker stack ps "${CI_REPO_NAME}" | grep "authelia_authelia" | grep -c "Failed" || true)
+                    if [[ "${failed_count:-0}" -eq 0 ]]; then
+                        # Compare image IDs: the task's image reference (name:tag[@digest]) never contains a local image ID
+                        local current_image deployed_id
+                        current_image=$(docker stack ps "${CI_REPO_NAME}" --no-trunc --format "{{.Name}}\t{{.Image}}" | grep "authelia_authelia" | head -n1 | cut -f2 || true)
+                        deployed_id=$(docker image inspect --format '{{.Id}}' "$current_image" 2>/dev/null || true)
+                        # Verification is skipped when either ID could not be determined
+                        if [[ -z "$NEW_IMAGE_HASH" || -z "$deployed_id" || "$deployed_id" == *"$NEW_IMAGE_HASH"* ]]; then
+                            success "✅ Authelia service is healthy and running!"
+                            success "🎯 Using correct image: $current_image"
+                            success "⚡ Total deployment time: ${elapsed} seconds"
+                            ROLLBACK_NEEDED=false
+                            return 0
+                        else
+                            warning "⚠️ Service running but using wrong image: $current_image (expected: $NEW_IMAGE_HASH)"
+                        fi
+                    else
+                        warning "⚠️ Service running but found $failed_count failed instances"
+                    fi
                 fi
+                last_status="Running"
+            elif echo "$state" | grep -q "Failed\|Rejected\|Shutdown"; then
+                error "❌ Service failed: $state"
+                if [[ -n "$error_msg" ]]; then
+                    error "Error: $error_msg"
+                fi
+                break  # Exit early on clear failure
+            else
+                last_status="$state"
+                debug "Service state: $state"
             fi
         fi
         
@@ -359,25 +485,36 @@ comprehensive_health_check() {
             break
         fi
         
-        log "Waiting for authelia service... (${elapsed}s/${timeout}s)"
-        sleep 5
+        sleep 2
         ((check_count++))
     done
     
-    # Health check failed
-    error "❌ Health check failed after ${timeout} seconds"
+    # Health check failed - provide comprehensive diagnostics
+    error "❌ Health check failed after ${elapsed} seconds"
     error "Deployment verification failed"
     
-    # Show detailed debugging information
-    error "=== DEBUGGING INFORMATION ==="
-    error "Stack status:"
-    docker stack ps "${CI_REPO_NAME}" || true
+    # Get detailed diagnostics for each service
+    log "🔍 Gathering comprehensive diagnostics..."
     
-    error "Authelia service logs (last 30 lines):"
-    docker service logs "${CI_REPO_NAME}_authelia" --tail 30 || true
+    local services=("authelia" "mariadb" "redis")
+    for service in "${services[@]}"; do
+        if docker service ls --format "{{.Name}}" | grep -q "${CI_REPO_NAME}_${service}"; then
+            get_container_diagnostics "$service"
+        else
+            error "Service ${CI_REPO_NAME}_${service} not found!"
+        fi
+    done
     
-    error "Docker service inspect:"
-    docker service inspect "${CI_REPO_NAME}_authelia" --pretty || true
+    # Additional stack-level diagnostics
+    error "=== 📊 STACK-LEVEL DIAGNOSTICS ==="
+    error "Full stack status:"
+    docker stack ps "${CI_REPO_NAME}" --no-trunc || true
+    
+    error "Stack services:"
+    docker stack services "${CI_REPO_NAME}" || true
+    
+    error "Recent Docker events:"
+    docker events --since="$((elapsed + 60))s" --until="0s" --filter "type=container" 2>/dev/null | tail -10 || true  # filters must be key=value; "0s" is a relative duration meaning now
     
     return 1
 }
@@ -395,13 +532,16 @@ main() {
     # Step 1: Docker registry login
     docker_registry_login
     
+    # Step 1.5: Force pull latest images to ensure fresh deployment
+    force_pull_latest_images
+    
     # Step 2: Remove old stack to release secrets
     log "Removing old stack to release secrets"
     docker stack rm "${CI_REPO_NAME}" || true
     
-    # Step 3: Wait for complete stack removal with timeout
-    log "Waiting for complete stack removal (30 seconds minimum)"
-    sleep 30
+    # Step 3: Wait for complete stack removal with optimized timeout
+    log "Waiting for complete stack removal (minimum 15 seconds)"
+    sleep 15  # Reduced from 30 seconds
     wait_for_stack_removal
     
     # Step 4 & 5: Manage secrets (remove old, create new)
@@ -410,10 +550,11 @@ main() {
     # Step 6: Deploy new stack
     deploy_stack
     
-    # Step 7-9: Comprehensive health checking
+    # Step 7-9: Rapid health checking with container diagnostics
     comprehensive_health_check
     
     success "🎉 Production deployment completed successfully!"
+    success "🏆 Deployed image: $NEW_IMAGE_HASH"
 }
 
 # Run main function