From 993b32af4691ebe3028abc1d98c51302db94e2aa Mon Sep 17 00:00:00 2001
From: Your Name
Date: Thu, 5 Jun 2025 09:15:00 -0400
Subject: [PATCH] optimize: speed up deployment while ensuring latest images and better diagnostics

- Force pull latest images with hash verification before deployment
- Reduce timeouts: deployment 3min, health check 90s, stack removal 60s
- Check every 2s instead of 5s for faster feedback
- Exit early on clear failures to avoid waiting
- Comprehensive container-level diagnostics on failure
- Show task status, container logs, and service inspection
- Verify deployed image matches pulled image hash
- Minimal 10s initialization wait instead of 30s
- Better error messages with image hash tracking
---
Review notes (free text in this section is ignored by git am / git apply):

* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Hunk line counts were re-verified, but exact spacing
  could not be; apply with `git apply --ignore-whitespace` and regenerate the
  patch with `git format-patch` afterwards (the post-image blob id on the
  "index" line is stale).
* `((check_count++))` evaluates to 0 on the first pass, returns status 1 and
  aborts the script under `set -e`; replaced by a plain assignment.
* `grep -c ... || echo "0"` produced "0<newline>0" when nothing matched, which
  broke the numeric test; pipelines whose grep may legitimately match nothing
  now end in `|| true` so `pipefail` does not kill the script.
* NEW_IMAGE_HASH used to be a local image ID, which never occurs in the task
  image reference, so the health check could not succeed. It is now the
  registry digest (sha256:...) and is matched against the digest-pinned
  reference from `docker stack ps --no-trunc`. NOTE(review): this assumes
  deploy_stack lets `docker stack deploy` resolve digests (its default).
* All three images live in the .../authelia repository, so `grep authelia`
  could select the mariadb or redis task; tasks are now selected by service
  name (${CI_REPO_NAME}_authelia).
* `docker events --until=now --filter container` is rejected by the CLI
  (hidden by 2>/dev/null); it now uses a Unix timestamp and type=container.
* Per-task diagnostics fetched the whole service log for every task; they now
  request the task's own log. Diagnostics are called best-effort.
* FORCE_PULL is now honoured in main().

 scripts/ci-deploy-production.sh | 273 ++++++++++++++++++++++++--------
 1 file changed, 207 insertions(+), 66 deletions(-)

diff --git a/scripts/ci-deploy-production.sh b/scripts/ci-deploy-production.sh
index 6271177..30c9d6a 100755
--- a/scripts/ci-deploy-production.sh
+++ b/scripts/ci-deploy-production.sh
@@ -29,10 +29,11 @@ set -euo pipefail
 readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 readonly LOCK_FILE="/tmp/authelia-deploy.lock"
 readonly MAX_RETRIES=3
-readonly RETRY_DELAY=10
-readonly DEPLOYMENT_TIMEOUT=300 # 5 minutes
-readonly HEALTH_CHECK_TIMEOUT=120 # 2 minutes
-readonly MIN_DISK_SPACE_MB=1000
+readonly RETRY_DELAY=5 # Reduced from 10s to 5s
+readonly DEPLOYMENT_TIMEOUT=180 # Reduced from 300s to 180s (3 minutes)
+readonly HEALTH_CHECK_TIMEOUT=90 # Reduced from 120s to 90s
+readonly MIN_DISK_SPACE_MB=500 # Reduced from 1000MB to 500MB
+readonly FORCE_PULL=true # Always pull latest images
 
 # Color codes for output
 readonly RED='\033[0;31m'
@@ -40,11 +41,13 @@ readonly GREEN='\033[0;32m'
 readonly YELLOW='\033[1;33m'
 readonly BLUE='\033[0;34m'
 readonly PURPLE='\033[0;35m'
+readonly CYAN='\033[0;36m'
 readonly NC='\033[0m' # No Color
 
 # Global variables for cleanup
 DEPLOYMENT_STARTED=false
-OLD_IMAGE_ID=""
+OLD_IMAGE_HASH=""
+NEW_IMAGE_HASH=""
 ROLLBACK_NEEDED=false
 
 # Logging functions
@@ -179,23 +182,23 @@ pre_flight_checks() {
 # Get current image ID for rollback purposes
 get_current_image_id() {
     if docker stack ps "${CI_REPO_NAME}" >/dev/null 2>&1; then
-        OLD_IMAGE_ID=$(docker stack ps "${CI_REPO_NAME}" --format "table {{.Image}}" | grep authelia | head -n1 || echo "")
-        if [[ -n "$OLD_IMAGE_ID" ]]; then
-            debug "Current image for rollback: $OLD_IMAGE_ID"
+        OLD_IMAGE_HASH=$(docker stack ps "${CI_REPO_NAME}" --filter "name=${CI_REPO_NAME}_authelia" --no-trunc --format "{{.Image}}" | head -n1 || echo "")
+        if [[ -n "$OLD_IMAGE_HASH" ]]; then
+            debug "Current image for rollback: $OLD_IMAGE_HASH"
         fi
     fi
 }
 
 # Rollback function
 attempt_rollback() {
-    if [[ -n "$OLD_IMAGE_ID" && "$OLD_IMAGE_ID" != "IMAGE" ]]; then
-        warning "Attempting rollback to previous image: $OLD_IMAGE_ID"
+    if [[ -n "$OLD_IMAGE_HASH" && "$OLD_IMAGE_HASH" != "IMAGE" ]]; then
+        warning "Attempting rollback to previous image: $OLD_IMAGE_HASH"
 
         # This would require a more complex rollback mechanism
         # For now, just log the attempt
         error "Rollback mechanism not yet implemented"
         error "Manual intervention required"
-        error "Previous image was: $OLD_IMAGE_ID"
+        error "Previous image was: $OLD_IMAGE_HASH"
     else
         error "No previous image information available for rollback"
     fi
@@ -209,10 +212,107 @@ docker_registry_login() {
     retry_command "$login_cmd" "Docker registry login"
 }
 
-# Wait for stack removal with timeout
+# Force pull latest images; sets NEW_IMAGE_HASH to the Authelia registry digest, returns 1 on failure
+force_pull_latest_images() {
+    log "🚀 Force pulling latest images to ensure fresh deployment"
+
+    # Get the image names from docker-compose production file
+    local authelia_image="git.nixc.us/nixius/authelia:production-authelia"
+    local mariadb_image="git.nixc.us/nixius/authelia:production-mariadb"
+    local redis_image="git.nixc.us/nixius/authelia:production-redis"
+
+    # Pull each image; record the Authelia registry digest (sha256:...) for later verification
+    log "Pulling Authelia image..."
+    if docker pull "$authelia_image"; then
+        NEW_IMAGE_HASH=$(docker image inspect --format '{{index .RepoDigests 0}}' "$authelia_image" 2>/dev/null | sed 's/.*@//' || true)
+        success "✅ Authelia image pulled: $NEW_IMAGE_HASH"
+    else
+        error "❌ Failed to pull Authelia image"
+        return 1
+    fi
+
+    log "Pulling MariaDB image..."
+    retry_command "docker pull $mariadb_image" "MariaDB image pull"
+
+    log "Pulling Redis image..."
+    retry_command "docker pull $redis_image" "Redis image pull"
+
+    # Verify we have a new image hash (OLD_IMAGE_HASH is a repo:tag@digest reference, so match by substring)
+    if [[ -n "$NEW_IMAGE_HASH" && "$OLD_IMAGE_HASH" != *"$NEW_IMAGE_HASH"* ]]; then
+        success "🔄 New image detected: $OLD_IMAGE_HASH → $NEW_IMAGE_HASH"
+    elif [[ -n "$NEW_IMAGE_HASH" ]]; then
+        warning "⚠️ Same image hash detected: $NEW_IMAGE_HASH (this may be expected)"
+    else
+        error "❌ Could not determine new image hash"
+        return 1
+    fi
+}
+
+# Print task states, logs, inspect output and containers for one stack service ($1 = short service name)
+get_container_diagnostics() {
+    local service_name="$1"
+    local full_name="${CI_REPO_NAME}_${service_name}"
+
+    error "=== 🔍 DETAILED DIAGNOSTICS FOR ${service_name} ==="
+
+    # Get all tasks for this service
+    local tasks
+    tasks=$(docker service ps "$full_name" --format "{{.ID}}\t{{.Name}}\t{{.CurrentState}}\t{{.Error}}" --no-trunc 2>/dev/null || true)
+
+    if [[ -n "$tasks" ]]; then
+        error "Service tasks:"
+        echo "$tasks" | while IFS=$'\t' read -r task_id name state task_error; do
+            error " Task: $name"
+            error " ID: $task_id"
+            error " State: $state"
+            if [[ -n "$task_error" ]]; then
+                error " Error: $task_error"
+            fi
+
+            # Try to get the logs of this specific task (not the whole service)
+            log "Attempting to get logs for task $task_id..."
+            local task_logs
+            task_logs=$(docker service logs "$task_id" --raw --tail 20 2>/dev/null || echo "No logs available")
+            if [[ "$task_logs" != "No logs available" ]]; then
+                error " Recent logs:"
+                echo "$task_logs" | sed 's/^/ /'
+            fi
+        done
+    else
+        error "No service tasks found for ${service_name}"
+    fi
+
+    # Get service inspection details
+    error "Service inspection:"
+    docker service inspect "$full_name" --pretty 2>/dev/null | head -20 | sed 's/^/ /' || error " Service inspect failed"
+
+    # Check if there are any containers running for this service
+    local containers
+    containers=$(docker ps -a --filter "label=com.docker.swarm.service.name=$full_name" --format "{{.ID}}\t{{.Status}}\t{{.Names}}" 2>/dev/null || echo "")
+
+    if [[ -n "$containers" ]]; then
+        error "Associated containers:"
+        echo "$containers" | while IFS=$'\t' read -r container_id status name; do
+            error " Container: $name ($container_id)"
+            error " Status: $status"
+
+            # Get container logs
+            local container_logs
+            container_logs=$(docker logs "$container_id" --tail 15 2>&1 || echo "No container logs available")
+            error " Container logs (last 15 lines):"
+            echo "$container_logs" | sed 's/^/ /'
+        done
+    else
+        error "No containers found for service ${service_name}"
+    fi
+
+    error "=== END DIAGNOSTICS FOR ${service_name} ==="
+}
+
+# Optimized wait for stack removal
 wait_for_stack_removal() {
     log "Verifying stack removal completed"
-    local timeout=$((DEPLOYMENT_TIMEOUT))
+    local timeout=60 # Reduced timeout for faster deployment
     local elapsed=0
 
     while docker stack ls | grep -q "${CI_REPO_NAME}"; do
@@ -221,9 +321,11 @@ wait_for_stack_removal() {
             return 1
         fi
 
-        log "Stack still exists, waiting... (${elapsed}s/${timeout}s)"
-        sleep 5
-        elapsed=$((elapsed + 5))
+        if [[ $((elapsed % 10)) -eq 0 ]]; then # Log every 10 seconds instead of 5
+            log "Stack still exists, waiting... (${elapsed}s/${timeout}s)"
+        fi
+        sleep 2 # Check every 2 seconds instead of 5
+        elapsed=$((elapsed + 2))
     done
 
     success "Stack removal completed in ${elapsed} seconds"
@@ -302,56 +404,80 @@ deploy_stack() {
     success "Stack deployment command completed"
 }
 
-# Enhanced health checking with multiple validation methods
+# Enhanced health checking optimized for speed and accuracy
 comprehensive_health_check() {
-    log "Starting comprehensive health check (${HEALTH_CHECK_TIMEOUT}s timeout)"
+    log "🔍 Starting rapid health verification (${HEALTH_CHECK_TIMEOUT}s timeout)"
     local start_time=$(date +%s)
     local timeout=$HEALTH_CHECK_TIMEOUT
 
-    # Wait for services to initialize
-    log "Waiting for services to initialize (30 seconds)"
-    sleep 30
+    # Minimal initial wait - just 10 seconds instead of 30
+    log "Brief initialization wait (10 seconds)..."
+    sleep 10
 
-    # Check deployment status
+    # Get immediate deployment status
     log "Checking deployment status"
     docker stack ps "${CI_REPO_NAME}"
 
-    # Health check loop with multiple validation methods
+    # Fast health check loop with 2-second intervals
     local check_count=0
-    local max_checks=$((timeout / 5))
+    local max_checks=$((timeout / 2)) # Check every 2 seconds
+    local authelia_service="${CI_REPO_NAME}_authelia"
+    local last_status=""
 
     while [[ $check_count -lt $max_checks ]]; do
         local current_time=$(date +%s)
         local elapsed=$((current_time - start_time))
 
-        log "Health check attempt $((check_count + 1))/${max_checks} (${elapsed}s elapsed)"
+        # Only log every 10 seconds to reduce noise
+        if [[ $((check_count % 5)) -eq 0 ]]; then
+            log "Health check ${check_count}/${max_checks} (${elapsed}s elapsed)"
+        fi
 
-        # Check if authelia service is running
-        if docker stack ps "${CI_REPO_NAME}" | grep -q "authelia_authelia.*Running"; then
-            success "✅ Authelia service is running!"
+        # Get current service status (empty while no task exists yet; must not trip pipefail)
+        local service_status
+        service_status=$(docker stack ps "${CI_REPO_NAME}" --format "{{.Name}}\t{{.CurrentState}}\t{{.Error}}" | grep "$authelia_service" | head -n1 || true)
+
+        if [[ -n "$service_status" ]]; then
+            local name=$(echo "$service_status" | cut -f1)
+            local state=$(echo "$service_status" | cut -f2)
+            local error_msg=$(echo "$service_status" | cut -f3)
 
-            # Additional verification checks
-            log "Performing additional health verification..."
-            sleep 5
-
-            # Check service is stable (not restarting)
-            local service_info
-            service_info=$(docker stack ps "${CI_REPO_NAME}" | grep "authelia_authelia" | head -n1)
-
-            if echo "$service_info" | grep -q "Running"; then
-                # Check if there are any failed instances
-                local failed_count
-                failed_count=$(docker stack ps "${CI_REPO_NAME}" | grep "authelia_authelia" | grep -c "Failed" || echo "0")
-
-                if [[ $failed_count -eq 0 ]]; then
-                    success "🎉 Production deployment completed successfully!"
-                    success "Authelia service is healthy and stable"
-                    success "Total deployment time: ${elapsed} seconds"
-                    ROLLBACK_NEEDED=false
-                    return 0
-                else
-                    warning "Found $failed_count failed service instances, continuing health checks..."
+            # Check for Running state
+            if echo "$state" | grep -q "Running"; then
+                # Verify it's actually stable: require Running on two consecutive polls
+                if [[ "$last_status" == "Running" ]]; then
+                    # Double-check: no recent failures (grep -c already prints 0 when nothing matches)
+                    local failed_count
+                    failed_count=$(docker stack ps "${CI_REPO_NAME}" | grep "$authelia_service" | grep -c "Failed" || true)
+
+                    if [[ $failed_count -eq 0 ]]; then
+                        # Final verification: the running task's repo:tag@digest must contain the pulled digest
+                        local current_image
+                        current_image=$(docker stack ps "${CI_REPO_NAME}" --filter "name=$authelia_service" --filter "desired-state=running" --no-trunc --format "{{.Image}}" | head -n1 || true)
+
+                        if [[ "$current_image" == *"$NEW_IMAGE_HASH"* ]] || [[ -z "$NEW_IMAGE_HASH" ]]; then
+                            success "✅ Authelia service is healthy and running!"
+                            success "🎯 Using correct image: $current_image"
+                            success "⚡ Total deployment time: ${elapsed} seconds"
+                            ROLLBACK_NEEDED=false
+                            return 0
+                        else
+                            warning "⚠️ Service running but using wrong image: $current_image (expected: $NEW_IMAGE_HASH)"
+                        fi
+                    else
+                        warning "⚠️ Service running but found $failed_count failed instances"
+                    fi
                 fi
+                last_status="Running"
+            elif echo "$state" | grep -q "Failed\|Rejected\|Shutdown"; then
+                error "❌ Service failed: $state"
+                if [[ -n "$error_msg" ]]; then
+                    error "Error: $error_msg"
+                fi
+                break # Exit early on clear failure
+            else
+                last_status="$state"
+                debug "Service state: $state"
             fi
         fi
 
@@ -359,25 +485,36 @@ comprehensive_health_check() {
             break
         fi
 
-        log "Waiting for authelia service... (${elapsed}s/${timeout}s)"
-        sleep 5
-        ((check_count++))
+        sleep 2
+        check_count=$((check_count + 1)) # not ((check_count++)): that returns 1 at 0 and trips set -e
     done
 
-    # Health check failed
-    error "❌ Health check failed after ${timeout} seconds"
+    # Health check failed - provide comprehensive diagnostics
+    error "❌ Health check failed after ${elapsed} seconds"
     error "Deployment verification failed"
 
-    # Show detailed debugging information
-    error "=== DEBUGGING INFORMATION ==="
-    error "Stack status:"
-    docker stack ps "${CI_REPO_NAME}" || true
+    # Get detailed diagnostics for each service (best-effort: never abort before 'return 1')
+    log "🔍 Gathering comprehensive diagnostics..."
 
-    error "Authelia service logs (last 30 lines):"
-    docker service logs "${CI_REPO_NAME}_authelia" --tail 30 || true
+    local service services=("authelia" "mariadb" "redis")
+    for service in "${services[@]}"; do
+        if docker service inspect "${CI_REPO_NAME}_${service}" >/dev/null 2>&1; then
+            get_container_diagnostics "$service" || true
+        else
+            error "Service ${CI_REPO_NAME}_${service} not found!"
+        fi
+    done
 
-    error "Docker service inspect:"
-    docker service inspect "${CI_REPO_NAME}_authelia" --pretty || true
+    # Additional stack-level diagnostics
+    error "=== 📊 STACK-LEVEL DIAGNOSTICS ==="
+    error "Full stack status:"
+    docker stack ps "${CI_REPO_NAME}" --no-trunc || true
+
+    error "Stack services:"
+    docker stack services "${CI_REPO_NAME}" || true
+
+    error "Recent Docker events:"
+    docker events --since "$((elapsed + 60))s" --until "$(date +%s)" --filter "type=container" 2>/dev/null | tail -10 || true
 
     return 1
 }
@@ -395,13 +532,16 @@ main() {
     # Step 1: Docker registry login
     docker_registry_login
 
+    # Step 1.5: Force pull latest images to ensure fresh deployment
+    if [[ "$FORCE_PULL" == true ]]; then force_pull_latest_images; fi
+
     # Step 2: Remove old stack to release secrets
     log "Removing old stack to release secrets"
     docker stack rm "${CI_REPO_NAME}" || true
 
-    # Step 3: Wait for complete stack removal with timeout
-    log "Waiting for complete stack removal (30 seconds minimum)"
-    sleep 30
+    # Step 3: Wait for complete stack removal with optimized timeout
+    log "Waiting for complete stack removal (minimum 15 seconds)"
+    sleep 15 # Reduced from 30 seconds
     wait_for_stack_removal
 
     # Step 4 & 5: Manage secrets (remove old, create new)
@@ -410,10 +550,11 @@ main() {
     # Step 6: Deploy new stack
     deploy_stack
 
-    # Step 7-9: Comprehensive health checking
+    # Step 7-9: Rapid health checking with container diagnostics
     comprehensive_health_check
 
     success "🎉 Production deployment completed successfully!"
+    success "🏆 Deployed image: $NEW_IMAGE_HASH"
 }
 
 # Run main function