|
|
|
@ -29,10 +29,11 @@ set -euo pipefail
|
|
|
|
|
# Deployment configuration constants.
#
# Each constant is declared exactly once: re-declaring a readonly variable is
# an error in bash and would abort the script under `set -e`.

# Directory containing this script. Assigned first and marked readonly
# afterwards so that a failing `cd` is not masked by the `readonly` builtin's
# own (always successful) exit status.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR

readonly LOCK_FILE="/tmp/authelia-deploy.lock"
readonly MAX_RETRIES=3
readonly RETRY_DELAY=5 # Reduced from 10s to 5s
readonly DEPLOYMENT_TIMEOUT=180 # Reduced from 300s to 180s (3 minutes)
readonly HEALTH_CHECK_TIMEOUT=90 # Reduced from 120s to 90s
readonly MIN_DISK_SPACE_MB=500 # Reduced from 1000MB to 500MB
readonly FORCE_PULL=true # Always pull latest images
|
|
|
|
|
|
|
|
|
|
# Color codes for output
|
|
|
|
|
readonly RED='\033[0;31m'
|
|
|
|
@ -40,11 +41,13 @@ readonly GREEN='\033[0;32m'
|
|
|
|
|
readonly YELLOW='\033[1;33m'
|
|
|
|
|
readonly BLUE='\033[0;34m'
|
|
|
|
|
readonly PURPLE='\033[0;35m'
|
|
|
|
|
readonly CYAN='\033[0;36m'
|
|
|
|
|
readonly NC='\033[0m' # No Color
|
|
|
|
|
|
|
|
|
|
# Global variables for cleanup
|
|
|
|
|
# Not referenced in the visible part of the script; presumably flipped once a
# deployment has begun -- TODO confirm against the cleanup handler.
DEPLOYMENT_STARTED=false
|
|
|
|
|
# Image of the currently deployed authelia task, filled in by
# get_current_image_id and reported by attempt_rollback.
OLD_IMAGE_ID=""
|
|
|
|
|
# Same value as OLD_IMAGE_ID under the newer name; also read by
# attempt_rollback and force_pull_latest_images.
OLD_IMAGE_HASH=""
|
|
|
|
|
# Local image ID of the freshly pulled Authelia image, set by
# force_pull_latest_images and checked by comprehensive_health_check.
NEW_IMAGE_HASH=""
|
|
|
|
|
# Reset to false by comprehensive_health_check when the deployment is healthy.
ROLLBACK_NEEDED=false
|
|
|
|
|
|
|
|
|
|
# Logging functions
|
|
|
|
@ -179,23 +182,23 @@ pre_flight_checks() {
|
|
|
|
|
# Get current image ID for rollback purposes
|
|
|
|
|
# Records the image used by the currently deployed authelia task so that a
# failed deployment can report what was running before.
#
# Globals:
#   CI_REPO_NAME   (read)    name of the Docker stack
#   OLD_IMAGE_HASH (written) image of the first authelia task, or "" if none
#   OLD_IMAGE_ID   (written) same value, kept for code that still reads the
#                            older variable name (e.g. attempt_rollback)
# Returns:
#   0 always; a missing stack or missing authelia task leaves both globals empty.
get_current_image_id() {
    local current_image=""

    # Only query tasks when the stack exists; `docker stack ps` fails otherwise.
    if docker stack ps "${CI_REPO_NAME}" >/dev/null 2>&1; then
        # Plain (non-table) format: no "IMAGE" header row can end up in the value.
        # `|| echo ""` keeps `set -e`/`pipefail` from aborting when grep finds nothing.
        current_image=$(docker stack ps "${CI_REPO_NAME}" --format "{{.Image}}" | grep authelia | head -n1 || echo "")

        OLD_IMAGE_ID="$current_image"
        OLD_IMAGE_HASH="$current_image"

        if [[ -n "$OLD_IMAGE_HASH" ]]; then
            debug "Current image for rollback: $OLD_IMAGE_HASH"
        fi
    fi
}
|
|
|
|
|
|
|
|
|
|
# Rollback function
|
|
|
|
|
attempt_rollback() {
|
|
|
|
|
if [[ -n "$OLD_IMAGE_ID" && "$OLD_IMAGE_ID" != "IMAGE" ]]; then
|
|
|
|
|
warning "Attempting rollback to previous image: $OLD_IMAGE_ID"
|
|
|
|
|
if [[ -n "$OLD_IMAGE_HASH" && "$OLD_IMAGE_HASH" != "IMAGE" ]]; then
|
|
|
|
|
warning "Attempting rollback to previous image: $OLD_IMAGE_HASH"
|
|
|
|
|
|
|
|
|
|
# This would require a more complex rollback mechanism
|
|
|
|
|
# For now, just log the attempt
|
|
|
|
|
error "Rollback mechanism not yet implemented"
|
|
|
|
|
error "Manual intervention required"
|
|
|
|
|
error "Previous image was: $OLD_IMAGE_ID"
|
|
|
|
|
error "Previous image was: $OLD_IMAGE_HASH"
|
|
|
|
|
else
|
|
|
|
|
error "No previous image information available for rollback"
|
|
|
|
|
fi
|
|
|
|
@ -209,10 +212,107 @@ docker_registry_login() {
|
|
|
|
|
retry_command "$login_cmd" "Docker registry login"
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Wait for stack removal with timeout
|
|
|
|
|
# Force pull latest images to ensure we deploy the newest version
|
|
|
|
|
# Pulls the Authelia, MariaDB and Redis production images and records the
# local image ID of the freshly pulled Authelia image.
#
# Globals:
#   NEW_IMAGE_HASH (written) short local image ID of the Authelia image after
#                            the pull
# Returns:
#   0 when the Authelia image was pulled and its image ID could be resolved,
#   1 when the Authelia pull failed or no image ID could be determined.
#   MariaDB/Redis pull failures surface through retry_command (defined
#   elsewhere in this script).
force_pull_latest_images() {
    log "🚀 Force pulling latest images to ensure fresh deployment"

    # Image names as referenced by the docker-compose production file
    local authelia_image="git.nixc.us/nixius/authelia:production-authelia"
    local mariadb_image="git.nixc.us/nixius/authelia:production-mariadb"
    local redis_image="git.nixc.us/nixius/authelia:production-redis"

    # Remember the image ID present locally BEFORE the pull, so the
    # "new vs. unchanged" comparison below compares image ID with image ID.
    # (OLD_IMAGE_HASH holds an image *name* from `docker stack ps` and can
    # never equal an image ID.)
    local previous_image_id
    previous_image_id=$(docker images --quiet "$authelia_image" 2>/dev/null | head -n1 || true)

    # Pull each image and capture the new image ID
    log "Pulling Authelia image..."
    if retry_command "docker pull $authelia_image" "Authelia image pull"; then
        # Query by exact repository:tag instead of grepping the full image
        # list; `|| true` keeps set -e/pipefail from aborting before the
        # explicit empty-ID check at the end of this function.
        NEW_IMAGE_HASH=$(docker images --quiet "$authelia_image" | head -n1 || true)
        success "✅ Authelia image pulled: $NEW_IMAGE_HASH"
    else
        error "❌ Failed to pull Authelia image"
        return 1
    fi

    log "Pulling MariaDB image..."
    retry_command "docker pull $mariadb_image" "MariaDB image pull"

    log "Pulling Redis image..."
    retry_command "docker pull $redis_image" "Redis image pull"

    # Verify we have a new image ID
    if [[ -z "$NEW_IMAGE_HASH" ]]; then
        error "❌ Could not determine new image hash"
        return 1
    fi

    if [[ "$NEW_IMAGE_HASH" != "$previous_image_id" ]]; then
        success "🔄 New image detected: ${previous_image_id:-none} → $NEW_IMAGE_HASH"
    else
        warning "⚠️ Same image hash detected: $NEW_IMAGE_HASH (this may be expected)"
    fi
}
|
|
|
|
|
|
|
|
|
|
# Get detailed container information for debugging
|
|
|
|
|
# Prints task, service and container diagnostics for one service of the stack.
#
# Arguments:
#   $1 - service name without the stack prefix (e.g. "authelia")
# Globals:
#   CI_REPO_NAME (read) stack name; the full service name is
#                       "${CI_REPO_NAME}_<service name>"
# Every docker call is guarded so that a failing query cannot abort the script
# (set -e) in the middle of reporting a failure.
get_container_diagnostics() {
    local service_name="$1"
    local full_service_name="${CI_REPO_NAME}_${service_name}"
    local tasks containers inspect_output
    local task_id name state task_error task_logs
    local container_id status container_logs

    error "=== 🔍 DETAILED DIAGNOSTICS FOR ${service_name} ==="

    # Get all tasks for this service
    tasks=$(docker service ps "$full_service_name" --format "{{.ID}}\t{{.Name}}\t{{.CurrentState}}\t{{.Error}}" --no-trunc 2>/dev/null || echo "")

    if [[ -n "$tasks" ]]; then
        error "Service tasks:"
        echo "$tasks" | while IFS=$'\t' read -r task_id name state task_error; do
            error " Task: $name"
            error " ID: $task_id"
            error " State: $state"
            if [[ -n "$task_error" ]]; then
                error " Error: $task_error"
            fi

            # `docker service logs` accepts a task ID, so fetch the logs of this
            # specific task rather than repeating the whole service's logs for
            # every task.
            log "Attempting to get logs for task $task_id..."
            task_logs=$(docker service logs --raw --tail 20 "$task_id" 2>/dev/null || echo "No logs available")
            if [[ -n "$task_logs" && "$task_logs" != "No logs available" ]]; then
                error " Recent logs:"
                echo "$task_logs" | sed 's/^/ /'
            fi
        done
    else
        error "No service tasks found for ${service_name}"
    fi

    # Get service inspection details. The output is captured first and trimmed
    # with sed alone: piping docker into `head` could close the pipe early and,
    # under pipefail, be misreported as an inspect failure.
    error "Service inspection:"
    if inspect_output=$(docker service inspect "$full_service_name" --pretty 2>/dev/null); then
        printf '%s\n' "$inspect_output" | sed -n '1,20s/^/ /p'
    else
        error " Service inspect failed"
    fi

    # Check if there are any containers (running or exited) for this service
    containers=$(docker ps -a --filter "label=com.docker.swarm.service.name=${full_service_name}" --format "{{.ID}}\t{{.Status}}\t{{.Names}}" 2>/dev/null || echo "")

    if [[ -n "$containers" ]]; then
        error "Associated containers:"
        echo "$containers" | while IFS=$'\t' read -r container_id status name; do
            error " Container: $name ($container_id)"
            error " Status: $status"

            # Get container logs (stderr included, as containers often log there)
            container_logs=$(docker logs "$container_id" --tail 15 2>&1 || echo "No container logs available")
            error " Container logs (last 15 lines):"
            echo "$container_logs" | sed 's/^/ /'
        done
    else
        error "No containers found for service ${service_name}"
    fi

    error "=== END DIAGNOSTICS FOR ${service_name} ==="
}
|
|
|
|
|
|
|
|
|
|
# Optimized wait for stack removal
|
|
|
|
|
wait_for_stack_removal() {
|
|
|
|
|
log "Verifying stack removal completed"
|
|
|
|
|
local timeout=$((DEPLOYMENT_TIMEOUT))
|
|
|
|
|
local timeout=60 # Reduced timeout for faster deployment
|
|
|
|
|
local elapsed=0
|
|
|
|
|
|
|
|
|
|
while docker stack ls | grep -q "${CI_REPO_NAME}"; do
|
|
|
|
@ -221,9 +321,11 @@ wait_for_stack_removal() {
|
|
|
|
|
return 1
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
log "Stack still exists, waiting... (${elapsed}s/${timeout}s)"
|
|
|
|
|
sleep 5
|
|
|
|
|
elapsed=$((elapsed + 5))
|
|
|
|
|
if [[ $((elapsed % 10)) -eq 0 ]]; then # Log every 10 seconds instead of 5
|
|
|
|
|
log "Stack still exists, waiting... (${elapsed}s/${timeout}s)"
|
|
|
|
|
fi
|
|
|
|
|
sleep 2 # Check every 2 seconds instead of 5
|
|
|
|
|
elapsed=$((elapsed + 2))
|
|
|
|
|
done
|
|
|
|
|
|
|
|
|
|
success "Stack removal completed in ${elapsed} seconds"
|
|
|
|
@ -302,56 +404,80 @@ deploy_stack() {
|
|
|
|
|
success "Stack deployment command completed"
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Enhanced health checking with multiple validation methods
|
|
|
|
|
# Enhanced health checking optimized for speed and accuracy
|
|
|
|
|
comprehensive_health_check() {
|
|
|
|
|
log "Starting comprehensive health check (${HEALTH_CHECK_TIMEOUT}s timeout)"
|
|
|
|
|
log "🔍 Starting rapid health verification (${HEALTH_CHECK_TIMEOUT}s timeout)"
|
|
|
|
|
local start_time=$(date +%s)
|
|
|
|
|
local timeout=$HEALTH_CHECK_TIMEOUT
|
|
|
|
|
|
|
|
|
|
# Wait for services to initialize
|
|
|
|
|
log "Waiting for services to initialize (30 seconds)"
|
|
|
|
|
sleep 30
|
|
|
|
|
# Minimal initial wait - just 10 seconds instead of 30
|
|
|
|
|
log "Brief initialization wait (10 seconds)..."
|
|
|
|
|
sleep 10
|
|
|
|
|
|
|
|
|
|
# Check deployment status
|
|
|
|
|
# Get immediate deployment status
|
|
|
|
|
log "Checking deployment status"
|
|
|
|
|
docker stack ps "${CI_REPO_NAME}"
|
|
|
|
|
|
|
|
|
|
# Health check loop with multiple validation methods
|
|
|
|
|
# Fast health check loop with 2-second intervals
|
|
|
|
|
local check_count=0
|
|
|
|
|
local max_checks=$((timeout / 5))
|
|
|
|
|
local max_checks=$((timeout / 2)) # Check every 2 seconds
|
|
|
|
|
local authelia_healthy=false
|
|
|
|
|
local last_status=""
|
|
|
|
|
|
|
|
|
|
while [[ $check_count -lt $max_checks ]]; do
|
|
|
|
|
local current_time=$(date +%s)
|
|
|
|
|
local elapsed=$((current_time - start_time))
|
|
|
|
|
|
|
|
|
|
log "Health check attempt $((check_count + 1))/${max_checks} (${elapsed}s elapsed)"
|
|
|
|
|
# Only log every 10 seconds to reduce noise
|
|
|
|
|
if [[ $((check_count % 5)) -eq 0 ]]; then
|
|
|
|
|
log "Health check ${check_count}/${max_checks} (${elapsed}s elapsed)"
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
# Check if authelia service is running
|
|
|
|
|
if docker stack ps "${CI_REPO_NAME}" | grep -q "authelia_authelia.*Running"; then
|
|
|
|
|
success "✅ Authelia service is running!"
|
|
|
|
|
# Get current service status
|
|
|
|
|
local service_status
|
|
|
|
|
service_status=$(docker stack ps "${CI_REPO_NAME}" --format "{{.Name}}\t{{.CurrentState}}\t{{.Error}}" | grep "authelia_authelia" | head -n1)
|
|
|
|
|
|
|
|
|
|
if [[ -n "$service_status" ]]; then
|
|
|
|
|
local name=$(echo "$service_status" | cut -f1)
|
|
|
|
|
local state=$(echo "$service_status" | cut -f2)
|
|
|
|
|
local error_msg=$(echo "$service_status" | cut -f3)
|
|
|
|
|
|
|
|
|
|
# Additional verification checks
|
|
|
|
|
log "Performing additional health verification..."
|
|
|
|
|
sleep 5
|
|
|
|
|
|
|
|
|
|
# Check service is stable (not restarting)
|
|
|
|
|
local service_info
|
|
|
|
|
service_info=$(docker stack ps "${CI_REPO_NAME}" | grep "authelia_authelia" | head -n1)
|
|
|
|
|
|
|
|
|
|
if echo "$service_info" | grep -q "Running"; then
|
|
|
|
|
# Check if there are any failed instances
|
|
|
|
|
local failed_count
|
|
|
|
|
failed_count=$(docker stack ps "${CI_REPO_NAME}" | grep "authelia_authelia" | grep -c "Failed" || echo "0")
|
|
|
|
|
|
|
|
|
|
if [[ $failed_count -eq 0 ]]; then
|
|
|
|
|
success "🎉 Production deployment completed successfully!"
|
|
|
|
|
success "Authelia service is healthy and stable"
|
|
|
|
|
success "Total deployment time: ${elapsed} seconds"
|
|
|
|
|
ROLLBACK_NEEDED=false
|
|
|
|
|
return 0
|
|
|
|
|
else
|
|
|
|
|
warning "Found $failed_count failed service instances, continuing health checks..."
|
|
|
|
|
# Check for Running state
|
|
|
|
|
if echo "$state" | grep -q "Running"; then
|
|
|
|
|
# Verify it's actually stable by checking for a few seconds
|
|
|
|
|
if [[ "$last_status" == "Running" ]]; then
|
|
|
|
|
# Double-check: no recent failures
|
|
|
|
|
local failed_count
|
|
|
|
|
failed_count=$(docker stack ps "${CI_REPO_NAME}" | grep "authelia_authelia" | grep -c "Failed" || echo "0")
|
|
|
|
|
|
|
|
|
|
if [[ $failed_count -eq 0 ]]; then
|
|
|
|
|
# Final verification: ensure we're using the new image
|
|
|
|
|
local current_image
|
|
|
|
|
current_image=$(docker stack ps "${CI_REPO_NAME}" --format "{{.Image}}" | grep authelia | head -n1)
|
|
|
|
|
|
|
|
|
|
if [[ "$current_image" == *"$NEW_IMAGE_HASH"* ]] || [[ -z "$NEW_IMAGE_HASH" ]]; then
|
|
|
|
|
success "✅ Authelia service is healthy and running!"
|
|
|
|
|
success "🎯 Using correct image: $current_image"
|
|
|
|
|
success "⚡ Total deployment time: ${elapsed} seconds"
|
|
|
|
|
ROLLBACK_NEEDED=false
|
|
|
|
|
return 0
|
|
|
|
|
else
|
|
|
|
|
warning "⚠️ Service running but using wrong image: $current_image (expected: $NEW_IMAGE_HASH)"
|
|
|
|
|
fi
|
|
|
|
|
else
|
|
|
|
|
warning "⚠️ Service running but found $failed_count failed instances"
|
|
|
|
|
fi
|
|
|
|
|
fi
|
|
|
|
|
last_status="Running"
|
|
|
|
|
elif echo "$state" | grep -q "Failed\|Rejected\|Shutdown"; then
|
|
|
|
|
error "❌ Service failed: $state"
|
|
|
|
|
if [[ -n "$error_msg" ]]; then
|
|
|
|
|
error "Error: $error_msg"
|
|
|
|
|
fi
|
|
|
|
|
break # Exit early on clear failure
|
|
|
|
|
else
|
|
|
|
|
last_status="$state"
|
|
|
|
|
debug "Service state: $state"
|
|
|
|
|
fi
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
@ -359,25 +485,36 @@ comprehensive_health_check() {
|
|
|
|
|
break
|
|
|
|
|
fi
|
|
|
|
|
|
|
|
|
|
log "Waiting for authelia service... (${elapsed}s/${timeout}s)"
|
|
|
|
|
sleep 5
|
|
|
|
|
sleep 2
|
|
|
|
|
((check_count++))
|
|
|
|
|
done
|
|
|
|
|
|
|
|
|
|
# Health check failed
|
|
|
|
|
error "❌ Health check failed after ${timeout} seconds"
|
|
|
|
|
# Health check failed - provide comprehensive diagnostics
|
|
|
|
|
error "❌ Health check failed after ${elapsed} seconds"
|
|
|
|
|
error "Deployment verification failed"
|
|
|
|
|
|
|
|
|
|
# Show detailed debugging information
|
|
|
|
|
error "=== DEBUGGING INFORMATION ==="
|
|
|
|
|
error "Stack status:"
|
|
|
|
|
docker stack ps "${CI_REPO_NAME}" || true
|
|
|
|
|
# Get detailed diagnostics for each service
|
|
|
|
|
log "🔍 Gathering comprehensive diagnostics..."
|
|
|
|
|
|
|
|
|
|
error "Authelia service logs (last 30 lines):"
|
|
|
|
|
docker service logs "${CI_REPO_NAME}_authelia" --tail 30 || true
|
|
|
|
|
local services=("authelia" "mariadb" "redis")
|
|
|
|
|
for service in "${services[@]}"; do
|
|
|
|
|
if docker service ls --format "{{.Name}}" | grep -q "${CI_REPO_NAME}_${service}"; then
|
|
|
|
|
get_container_diagnostics "$service"
|
|
|
|
|
else
|
|
|
|
|
error "Service ${CI_REPO_NAME}_${service} not found!"
|
|
|
|
|
fi
|
|
|
|
|
done
|
|
|
|
|
|
|
|
|
|
error "Docker service inspect:"
|
|
|
|
|
docker service inspect "${CI_REPO_NAME}_authelia" --pretty || true
|
|
|
|
|
# Additional stack-level diagnostics
|
|
|
|
|
error "=== 📊 STACK-LEVEL DIAGNOSTICS ==="
|
|
|
|
|
error "Full stack status:"
|
|
|
|
|
docker stack ps "${CI_REPO_NAME}" --no-trunc || true
|
|
|
|
|
|
|
|
|
|
error "Stack services:"
|
|
|
|
|
docker stack services "${CI_REPO_NAME}" || true
|
|
|
|
|
|
|
|
|
|
error "Recent Docker events:"
|
|
|
|
|
docker events --since="$((elapsed + 60))s" --until="now" --filter "container" 2>/dev/null | tail -10 || true
|
|
|
|
|
|
|
|
|
|
return 1
|
|
|
|
|
}
|
|
|
|
@ -395,13 +532,16 @@ main() {
|
|
|
|
|
# Step 1: Docker registry login
|
|
|
|
|
docker_registry_login
|
|
|
|
|
|
|
|
|
|
# Step 1.5: Force pull latest images to ensure fresh deployment
|
|
|
|
|
force_pull_latest_images
|
|
|
|
|
|
|
|
|
|
# Step 2: Remove old stack to release secrets
|
|
|
|
|
log "Removing old stack to release secrets"
|
|
|
|
|
docker stack rm "${CI_REPO_NAME}" || true
|
|
|
|
|
|
|
|
|
|
# Step 3: Wait for complete stack removal with timeout
|
|
|
|
|
log "Waiting for complete stack removal (30 seconds minimum)"
|
|
|
|
|
sleep 30
|
|
|
|
|
# Step 3: Wait for complete stack removal with optimized timeout
|
|
|
|
|
log "Waiting for complete stack removal (minimum 15 seconds)"
|
|
|
|
|
sleep 15 # Reduced from 30 seconds
|
|
|
|
|
wait_for_stack_removal
|
|
|
|
|
|
|
|
|
|
# Step 4 & 5: Manage secrets (remove old, create new)
|
|
|
|
@ -410,10 +550,11 @@ main() {
|
|
|
|
|
# Step 6: Deploy new stack
|
|
|
|
|
deploy_stack
|
|
|
|
|
|
|
|
|
|
# Step 7-9: Comprehensive health checking
|
|
|
|
|
# Step 7-9: Rapid health checking with container diagnostics
|
|
|
|
|
comprehensive_health_check
|
|
|
|
|
|
|
|
|
|
success "🎉 Production deployment completed successfully!"
|
|
|
|
|
success "🏆 Deployed image: $NEW_IMAGE_HASH"
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# Run main function
|
|
|
|
|