#!/bin/sh
################################################################################
# WOODPECKER CI PRODUCTION DEPLOYMENT SCRIPT
################################################################################
#
# ⚠️  WARNING: THIS SCRIPT IS EXCLUSIVELY FOR WOODPECKER CI USE
#
# This script is designed to run within the Woodpecker CI environment with
# specific environment variables and Docker socket access.
#
# 🚫 DO NOT RUN THIS ON A DEVELOPER WORKSTATION
# 🚫 This will attempt to remove production Docker stacks and secrets
# 🚫 This requires access to production Docker swarm manager nodes
#
# This script handles:
# - Production stack removal and cleanup
# - Docker secrets recreation with fresh values
# - New stack deployment with verification
# - Health checking and deployment validation
# - Rollback reporting on failure (automatic rollback is NOT implemented;
#   the previous image reference is logged for manual intervention)
# - Concurrent execution prevention (lock file)
#
# Required environment variables:
#   CI_REPO_NAME, REGISTRY_USER, REGISTRY_PASSWORD and every name listed in
#   SECRET_NAMES below.
#
# Portability: the script targets /bin/sh. It relies on `local`, which is not
# POSIX but is provided by dash, BusyBox ash and bash.
#
################################################################################

set -eu
# `pipefail` is not available in every /bin/sh (e.g. older dash); enable it
# only when the running shell supports it. Pipelines whose failure is expected
# are guarded explicitly, so the script behaves the same either way.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

# Configuration
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
LOCK_FILE="/tmp/authelia-deploy.lock"
MAX_RETRIES=3
RETRY_DELAY=5            # Reduced from 10s to 5s
DEPLOYMENT_TIMEOUT=180   # Reduced from 300s to 180s (3 minutes)
HEALTH_CHECK_TIMEOUT=90  # Reduced from 120s to 90s
FORCE_PULL=true          # Always pull latest images

# Names of the Docker secrets; each one is read from the environment variable
# of the same name.
SECRET_NAMES="AUTHENTICATION_BACKEND_LDAP_PASSWORD IDENTITY_VALIDATION_RESET_PASSWORD_JWT_SECRET STORAGE_ENCRYPTION_KEY SESSION_SECRET NOTIFIER_SMTP_PASSWORD IDENTITY_PROVIDERS_OIDC_HMAC_SECRET IDENTITY_PROVIDERS_OIDC_ISSUER_PRIVATE_KEY IDENTITY_PROVIDERS_OIDC_JWKS_KEY CLIENT_SECRET_HEADSCALE CLIENT_SECRET_HEADADMIN"

# Color codes for output (expanded by printf '%b')
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Literal tab, used as a field separator ($'\t' is not available in /bin/sh)
TAB="$(printf '\t')"

# Global state consulted by cleanup()
LOCK_ACQUIRED=false      # true only once THIS process created the lock file
DEPLOYMENT_STARTED=false
OLD_IMAGE_HASH=""        # image reference of the Authelia task before deploy
NEW_IMAGE_HASH=""        # digest (or image ID fallback) of the pulled image
ROLLBACK_NEEDED=false

# Logging functions. Only the color codes are escape-expanded (%b); the
# message itself is printed verbatim (%s). Output stays on stdout so that the
# CI log keeps its ordering.
log() {
  printf '%b[%s] %s%b\n' "$BLUE" "$(date +'%Y-%m-%d %H:%M:%S')" "$1" "$NC"
}

error() {
  printf '%b[ERROR] %s%b\n' "$RED" "$1" "$NC"
}

success() {
  printf '%b[SUCCESS] %s%b\n' "$GREEN" "$1" "$NC"
}

warning() {
  printf '%b[WARNING] %s%b\n' "$YELLOW" "$1" "$NC"
}

debug() {
  printf '%b[DEBUG] %s%b\n' "$PURPLE" "$1" "$NC"
}

# Succeeds when $1 equals one whole line of the newline-separated list $2.
# grep reads all of its input (no -q), so the producer never gets SIGPIPE.
name_in_list() {
  printf '%s\n' "$2" | grep -Fx -- "$1" >/dev/null
}

# Succeeds when a stack named exactly $1 exists. A failing `docker stack ls`
# is reported as "does not exist".
stack_exists() {
  local stacks
  stacks=$(docker stack ls --format '{{.Name}}') || return 1
  name_in_list "$1" "$stacks"
}

# Prints "name<TAB>state<TAB>error<TAB>image" for the first listed task of the
# ${CI_REPO_NAME}_authelia service, or nothing when there is no such task.
# --no-trunc makes the image column include the pinned digest when the service
# has one.
authelia_task_status() {
  docker stack ps "${CI_REPO_NAME}" --no-trunc \
    --format '{{.Name}}\t{{.CurrentState}}\t{{.Error}}\t{{.Image}}' \
    | awk -F '\t' -v prefix="${CI_REPO_NAME}_authelia." \
        'index($1, prefix) == 1 && !seen { print; seen = 1 }'
}

# Prints an identifier for the locally stored image $1: its first registry
# digest ("sha256:...") when available, otherwise its local image ID.
# NOTE(review): assumes the first RepoDigest is the one the swarm service gets
# pinned to — confirm if the image is pushed to more than one registry.
resolve_image_digest() {
  local image="$1"
  local digests first image_id

  digests=$(docker image inspect \
    --format '{{range .RepoDigests}}{{println .}}{{end}}' "$image" 2>/dev/null) || digests=""
  first=$(printf '%s\n' "$digests" | awk 'NF && !seen { print; seen = 1 }')
  case "$first" in
    *@sha256:*)
      printf 'sha256:%s\n' "${first##*@sha256:}"
      return 0
      ;;
  esac

  image_id=$(docker image inspect --format '{{.Id}}' "$image" 2>/dev/null) || image_id=""
  printf '%s\n' "$image_id"
}

# Rollback function. Automatic rollback is not implemented; this only reports
# the previous image so an operator can act.
attempt_rollback() {
  if [ -n "$OLD_IMAGE_HASH" ] && [ "$OLD_IMAGE_HASH" != "IMAGE" ]; then
    warning "Attempting rollback to previous image: $OLD_IMAGE_HASH"
    # This would require a more complex rollback mechanism
    # For now, just log the attempt
    error "Rollback mechanism not yet implemented"
    error "Manual intervention required"
    error "Previous image was: $OLD_IMAGE_HASH"
  else
    error "No previous image information available for rollback"
  fi
}

# Cleanup function - runs exactly once, on script exit (EXIT trap).
# Removes the lock file only if this process created it, prints the final
# stack status on failure and re-exits with the original exit code.
cleanup() {
  local exit_code=$?

  # A failing diagnostic command must not replace the original exit code.
  set +e

  if [ "$LOCK_ACQUIRED" = true ] && [ -f "$LOCK_FILE" ]; then
    debug "Removing deployment lock file"
    rm -f "$LOCK_FILE"
  fi

  if [ "$exit_code" -ne 0 ]; then
    error "Deployment failed with exit code: $exit_code"

    if [ "$DEPLOYMENT_STARTED" = true ]; then
      ROLLBACK_NEEDED=true
      attempt_rollback
    fi

    log "📊 Providing final deployment status for debugging..."
    # CI_REPO_NAME may be unset when the failure is the missing-CI check.
    if [ -n "${CI_REPO_NAME:-}" ]; then
      if stack_exists "${CI_REPO_NAME}"; then
        error "=== FINAL STACK STATUS ==="
        docker stack ps "${CI_REPO_NAME}" --no-trunc || true
        docker stack services "${CI_REPO_NAME}" || true
      else
        warning "Stack ${CI_REPO_NAME} no longer exists"
      fi
    fi
  fi

  debug "Cleanup completed with exit code: $exit_code"
  exit "$exit_code"
}

# Set up cleanup trap. Signals are turned into a normal exit so that the EXIT
# trap runs cleanup once (128 + signal number as exit code).
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Retry function for operations that might fail transiently.
# Arguments: $1 - command string, run with eval on every attempt
#            $2 - human-readable description for the log
# Returns:   0 as soon as one attempt succeeds, 1 after MAX_RETRIES failures
retry_command() {
  local cmd="$1"
  local description="$2"
  local attempt=1

  while [ "$attempt" -le "$MAX_RETRIES" ]; do
    log "Attempt $attempt/$MAX_RETRIES: $description"
    if eval "$cmd"; then
      success "$description completed successfully"
      return 0
    fi
    if [ "$attempt" -lt "$MAX_RETRIES" ]; then
      warning "$description failed, retrying in ${RETRY_DELAY}s..."
      sleep "$RETRY_DELAY"
    fi
    attempt=$((attempt + 1))
  done

  error "$description failed after $MAX_RETRIES attempts"
  return 1
}

# Pre-flight checks: CI environment, lock file, Docker daemon, required
# variables and the stack file. Exits the script on the first failed check.
pre_flight_checks() {
  local var var_value

  log "Running pre-flight checks..."

  # Verify we're running in CI environment (before touching the lock file)
  if [ -z "${CI_REPO_NAME:-}" ]; then
    error "This script must only be run in Woodpecker CI environment!"
    error "Missing CI_REPO_NAME environment variable"
    exit 1
  fi

  # Check if another deployment is running. noclobber (set -C) makes the
  # redirection fail when the file already exists, so check-and-create is a
  # single atomic step.
  if ! (set -C; printf '%s\n' "$$" > "$LOCK_FILE") 2>/dev/null; then
    error "Another deployment is already running (lock file exists: $LOCK_FILE)"
    error "If you're sure no other deployment is running, remove the lock file manually"
    exit 1
  fi
  LOCK_ACQUIRED=true
  debug "Created deployment lock file"

  # Check Docker daemon is responsive
  if ! docker info >/dev/null 2>&1; then
    error "Docker daemon is not responsive"
    exit 1
  fi

  # Verify required environment variables. The names come from the fixed
  # lists in this file, so eval only ever sees trusted identifiers; ':-' keeps
  # `set -u` from aborting before the friendly message is printed.
  for var in REGISTRY_USER REGISTRY_PASSWORD CI_REPO_NAME $SECRET_NAMES; do
    eval "var_value=\${$var:-}"
    if [ -z "$var_value" ]; then
      error "Required environment variable $var is not set"
      exit 1
    fi
  done

  # Check if stack file exists
  if [ ! -f "./stack.production.yml" ]; then
    error "Production stack file not found: ./stack.production.yml"
    exit 1
  fi

  success "Pre-flight checks completed"
}

# Record the image reference of the currently deployed Authelia task (if any)
# in OLD_IMAGE_HASH, for rollback reporting.
get_current_image_id() {
  local task_line

  if docker stack ps "${CI_REPO_NAME}" >/dev/null 2>&1; then
    task_line=$(authelia_task_status 2>/dev/null) || task_line=""
    OLD_IMAGE_HASH=$(printf '%s\n' "$task_line" | cut -f4)
    if [ -n "$OLD_IMAGE_HASH" ]; then
      debug "Current image for rollback: $OLD_IMAGE_HASH"
    fi
  fi
}

# Enhanced Docker registry login with retries
docker_registry_login() {
  log "Logging into Docker registry"

  # Single quotes are intentional: the variables are expanded when
  # retry_command evals the string, so the password is never spliced into
  # shell source (a quote character in it cannot break or inject commands).
  # shellcheck disable=SC2016
  local login_cmd='printf "%s\n" "$REGISTRY_PASSWORD" | docker login -u "$REGISTRY_USER" --password-stdin git.nixc.us'

  retry_command "$login_cmd" "Docker registry login"
}

# Force pull latest images to ensure we deploy the newest version.
# Sets NEW_IMAGE_HASH; returns 1 when a pull fails or no identifier for the
# Authelia image can be determined.
force_pull_latest_images() {
  log "🚀 Force pulling latest images to ensure fresh deployment"

  # Image names as used by the production stack file
  local authelia_image="git.nixc.us/nixius/authelia:production-authelia"
  local mariadb_image="git.nixc.us/nixius/authelia:production-mariadb"
  local redis_image="git.nixc.us/nixius/authelia:production-redis"

  log "Pulling Authelia image..."
  if retry_command "docker pull '$authelia_image'" "Authelia image pull"; then
    NEW_IMAGE_HASH=$(resolve_image_digest "$authelia_image") || NEW_IMAGE_HASH=""
    success "✅ Authelia image pulled: $NEW_IMAGE_HASH"
  else
    error "❌ Failed to pull Authelia image"
    return 1
  fi

  log "Pulling MariaDB image..."
  retry_command "docker pull '$mariadb_image'" "MariaDB image pull"

  log "Pulling Redis image..."
  retry_command "docker pull '$redis_image'" "Redis image pull"

  # Compare against the previously deployed image reference
  if [ -z "$NEW_IMAGE_HASH" ]; then
    error "❌ Could not determine new image hash"
    return 1
  fi
  case "$OLD_IMAGE_HASH" in
    *"$NEW_IMAGE_HASH"*)
      warning "⚠️ Same image hash detected: $NEW_IMAGE_HASH (this may be expected)"
      ;;
    *)
      success "🔄 New image detected: $OLD_IMAGE_HASH → $NEW_IMAGE_HASH"
      ;;
  esac
}

# Print detailed diagnostics (tasks, logs, inspect output, containers) for
# the service ${CI_REPO_NAME}_$1. Best effort: never fails the script.
# Not invoked by main(); available for on-demand debugging.
get_container_diagnostics() {
  local service_name="$1"
  local full_name="${CI_REPO_NAME}_${service_name}"
  local tasks service_logs containers container_logs

  error "=== 🔍 DETAILED DIAGNOSTICS FOR ${service_name} ==="

  # Get all tasks for this service
  tasks=$(docker service ps "$full_name" \
    --format '{{.ID}}\t{{.Name}}\t{{.CurrentState}}\t{{.Error}}' --no-trunc 2>/dev/null) || tasks=""

  if [ -n "$tasks" ]; then
    error "Service tasks:"
    printf '%s\n' "$tasks" | while IFS="$TAB" read -r task_id name state task_error; do
      error "  Task: $name"
      error "    ID: $task_id"
      error "    State: $state"
      if [ -n "$task_error" ]; then
        error "    Error: $task_error"
      fi
    done

    # Service logs cover every task, so fetch them once instead of per task.
    log "Attempting to get logs for service $full_name..."
    service_logs=$(docker service logs "$full_name" --raw --tail 20 2>/dev/null) || service_logs=""
    if [ -n "$service_logs" ]; then
      error "  Recent logs:"
      printf '%s\n' "$service_logs" | sed 's/^/      /'
    fi
  else
    error "No service tasks found for ${service_name}"
  fi

  # Get service inspection details (first 20 lines; sed reads all input so
  # docker is never cut off by a closed pipe)
  error "Service inspection:"
  docker service inspect "$full_name" --pretty 2>/dev/null \
    | sed -n '1,20s/^/    /p' || error "  Service inspect failed"

  # Check if there are any containers on this node for the service
  containers=$(docker ps -a \
    --filter "label=com.docker.swarm.service.name=${full_name}" \
    --format '{{.ID}}\t{{.Status}}\t{{.Names}}' 2>/dev/null) || containers=""

  if [ -n "$containers" ]; then
    error "Associated containers:"
    printf '%s\n' "$containers" | while IFS="$TAB" read -r container_id status name; do
      error "  Container: $name ($container_id)"
      error "    Status: $status"
      container_logs=$(docker logs "$container_id" --tail 15 2>&1) \
        || container_logs="No container logs available"
      error "    Container logs (last 15 lines):"
      printf '%s\n' "$container_logs" | sed 's/^/      /'
    done
  else
    error "No containers found for service ${service_name}"
  fi

  error "=== END DIAGNOSTICS FOR ${service_name} ==="
}

# Poll until the stack is gone. Returns 1 after 60 seconds.
wait_for_stack_removal() {
  log "Verifying stack removal completed"

  local timeout=60 # Reduced timeout for faster deployment
  local elapsed=0

  while stack_exists "${CI_REPO_NAME}"; do
    if [ "$elapsed" -ge "$timeout" ]; then
      error "Stack removal timeout after ${timeout} seconds"
      return 1
    fi

    if [ $((elapsed % 10)) -eq 0 ]; then # Log every 10 seconds
      log "Stack still exists, waiting... (${elapsed}s/${timeout}s)"
    fi

    sleep 2 # Check every 2 seconds
    elapsed=$((elapsed + 2))
  done

  success "Stack removal completed in ${elapsed} seconds"
}

# Recreate every Docker secret in SECRET_NAMES from the environment variable
# of the same name, then verify that all of them exist.
manage_secrets() {
  local secret secret_value existing

  log "Managing Docker secrets"

  # Remove old secrets
  log "Removing old Docker secrets"
  existing=$(docker secret ls --format '{{.Name}}')
  for secret in $SECRET_NAMES; do
    if name_in_list "$secret" "$existing"; then
      if docker secret rm "$secret"; then
        debug "Removed secret: $secret"
      else
        # Kept best-effort: creation below reports the hard failure.
        warning "Could not remove secret: $secret"
      fi
    else
      debug "Secret $secret did not exist"
    fi
  done

  # Create new secrets with validation
  log "Creating new Docker secrets with updated values"
  for secret in $SECRET_NAMES; do
    # Indirect variable access; names come from the fixed list above.
    eval "secret_value=\${$secret:-}"

    if [ -z "$secret_value" ]; then
      error "Environment variable $secret is not set!"
      return 1
    fi

    # printf emits the value verbatim plus one newline; unlike echo it never
    # interprets backslashes or a leading '-n'.
    if printf '%s\n' "$secret_value" | docker secret create "$secret" -; then
      success "Created secret: $secret"
    else
      error "Failed to create secret: $secret"
      return 1
    fi
  done

  # Verify all secrets were created
  log "Verifying secret creation"
  existing=$(docker secret ls --format '{{.Name}}')
  for secret in $SECRET_NAMES; do
    if ! name_in_list "$secret" "$existing"; then
      error "Secret verification failed: $secret was not created"
      return 1
    fi
  done

  success "All secrets created and verified"
}

# Deploy the stack from ./stack.production.yml, with retries.
deploy_stack() {
  log "Deploying new stack with fresh secrets"

  DEPLOYMENT_STARTED=true

  # Single quotes are intentional: expanded at eval time by retry_command.
  # shellcheck disable=SC2016
  local deploy_cmd='docker stack deploy --with-registry-auth -c ./stack.production.yml "$CI_REPO_NAME"'

  if ! retry_command "$deploy_cmd" "Stack deployment"; then
    error "Stack deployment failed"
    return 1
  fi

  success "Stack deployment command completed"
}

# Wait for the Authelia task to reach Running with the expected image.
# Deliberately always returns 0: on timeout the stack is left running and the
# collected output is meant for manual debugging.
comprehensive_health_check() {
  log "🔍 Starting deployment verification (${HEALTH_CHECK_TIMEOUT}s timeout)"

  local timeout="$HEALTH_CHECK_TIMEOUT"
  local max_checks=$((timeout / 10)) # Check every 10 seconds
  local check_count=0
  local elapsed=0
  local start_time current_time
  local task_line state error_msg current_image
  local service service_names

  start_time=$(date +%s)

  # Database initialization wait - giving MariaDB time to start
  log "Database initialization wait (45 seconds)..."
  sleep 45

  # Get immediate deployment status
  log "Checking deployment status"
  docker stack ps "${CI_REPO_NAME}" || true

  # Image verification loop
  while [ "$check_count" -lt "$max_checks" ]; do
    current_time=$(date +%s)
    elapsed=$((current_time - start_time))

    log "Verification check $((check_count + 1))/${max_checks} (${elapsed}s elapsed)"

    # No matching task yet is a normal state early on, not an error.
    task_line=$(authelia_task_status 2>/dev/null) || task_line=""

    if [ -n "$task_line" ]; then
      state=$(printf '%s\n' "$task_line" | cut -f2)
      error_msg=$(printf '%s\n' "$task_line" | cut -f3)
      current_image=$(printf '%s\n' "$task_line" | cut -f4)

      log "Current Authelia state: $state"

      case "$state" in
        *Running*)
          log "🎯 Current image: $current_image"
          log "🎯 Expected image hash: $NEW_IMAGE_HASH"

          # An empty NEW_IMAGE_HASH matches any image (pattern becomes '*').
          case "$current_image" in
            *"$NEW_IMAGE_HASH"*)
              success "✅ Authelia service is healthy and running with correct image!"
              success "🎯 Using image: $current_image"
              success "⚡ Total deployment time: ${elapsed} seconds"
              return 0
              ;;
            *)
              warning "⚠️ Service running but using different image than expected"
              warning "Current: $current_image"
              warning "Expected hash: $NEW_IMAGE_HASH"
              warning "This may be normal if the image hasn't changed"
              ;;
          esac
          ;;
        *Failed* | *Rejected* | *Shutdown*)
          warning "❌ Service failed: $state"
          if [ -n "$error_msg" ]; then
            error "Error: $error_msg"
          fi

          # Get recent logs for debugging
          log "📋 Getting recent logs for debugging..."
          docker service logs "${CI_REPO_NAME}_authelia" --tail 20 2>/dev/null \
            || echo "No logs available"
          ;;
        *)
          debug "Service state: $state (still starting up)"
          ;;
      esac
    fi

    if [ "$elapsed" -ge "$timeout" ]; then
      warning "⏰ Reached timeout after ${elapsed} seconds"
      log "📊 Final status for debugging:"
      docker stack ps "${CI_REPO_NAME}" --no-trunc || true
      break
    fi

    sleep 10
    check_count=$((check_count + 1))
  done

  # Deployment verification completed
  warning "📊 Deployment verification completed - check logs above for status"

  # Get final diagnostic info
  log "🔍 Final diagnostics..."
  service_names=$(docker service ls --format '{{.Name}}' 2>/dev/null) || service_names=""
  for service in authelia mariadb redis; do
    if name_in_list "${CI_REPO_NAME}_${service}" "$service_names"; then
      log "=== ${service} STATUS ==="
      docker service logs "${CI_REPO_NAME}_${service}" --tail 10 2>/dev/null \
        || echo "No logs available"
    fi
  done

  # Don't fail - let it run for debugging
  warning "Deployment may still be starting - leaving stack running for debugging"
  return 0
}

# Main deployment function
main() {
  # CI_REPO_NAME is validated in pre_flight_checks; default it here so that
  # `set -u` does not abort before the friendly error can be shown.
  log "🚀 Starting production deployment for ${CI_REPO_NAME:-}"

  # Pre-flight checks
  pre_flight_checks

  # Get current state for potential rollback
  get_current_image_id

  # Step 1: Docker registry login
  docker_registry_login

  # Step 1.5: Force pull latest images to ensure fresh deployment
  force_pull_latest_images

  # Step 2: Remove old stack to release secrets
  log "Removing old stack to release secrets"
  docker stack rm "${CI_REPO_NAME}" || true

  # Step 3: Wait for complete stack removal with optimized timeout
  log "Waiting for complete stack removal (minimum 15 seconds)"
  sleep 15 # Reduced from 30 seconds
  wait_for_stack_removal

  # Step 4 & 5: Manage secrets (remove old, create new)
  manage_secrets

  # Step 6: Deploy new stack
  deploy_stack

  # Step 7-9: Health checking
  comprehensive_health_check

  success "🎉 Production deployment completed successfully!"
  success "🏆 Deployed image: $NEW_IMAGE_HASH"
}

# Run main function
main "$@"