diff --git a/scripts/ci-deploy-production.sh b/scripts/ci-deploy-production.sh
index e539fe0..6271177 100755
--- a/scripts/ci-deploy-production.sh
+++ b/scripts/ci-deploy-production.sh
@@ -18,19 +18,36 @@
 # - Docker secrets recreation with fresh values
 # - New stack deployment with verification
 # - Health checking and deployment validation
+# - Rollback hook on failure (automatic rollback is not implemented yet)
+# - Concurrent execution prevention
 #
 ################################################################################
 
 set -euo pipefail
 
-# Color codes for output
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-BLUE='\033[0;34m'
-NC='\033[0m' # No Color
+# Configuration
+readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+readonly LOCK_FILE="/tmp/authelia-deploy.lock"
+readonly MAX_RETRIES=3
+readonly RETRY_DELAY=10
+readonly DEPLOYMENT_TIMEOUT=300  # 5 minutes
+readonly HEALTH_CHECK_TIMEOUT=120  # 2 minutes
+readonly MIN_DISK_SPACE_MB=1000
 
-# Logging function
+# Color codes for output
+readonly RED='\033[0;31m'
+readonly GREEN='\033[0;32m'
+readonly YELLOW='\033[1;33m'
+readonly BLUE='\033[0;34m'
+readonly PURPLE='\033[0;35m'
+readonly NC='\033[0m' # No Color
+
+# Global variables for cleanup
+DEPLOYMENT_STARTED=false
+OLD_IMAGE_ID=""
+ROLLBACK_NEEDED=false
+
+# Logging functions
 log() {
     echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}"
 }
@@ -47,107 +64,430 @@ warning() {
     echo -e "${YELLOW}[WARNING] $1${NC}"
 }
 
-# Verify we're running in CI environment
-if [[ -z "${CI_REPO_NAME:-}" ]]; then
-    error "This script must only be run in Woodpecker CI environment!"
-    error "Missing CI_REPO_NAME environment variable"
-    exit 1
-fi
-
-log "Starting production deployment for ${CI_REPO_NAME}"
-
-# Step 1: Docker registry login
-log "Logging into Docker registry"
-echo "${REGISTRY_PASSWORD}" | docker login -u "${REGISTRY_USER}" --password-stdin git.nixc.us
-
-# Step 2: Remove old stack to release secrets
-log "Removing old stack to release secrets"
-docker stack rm "${CI_REPO_NAME}" || true
-
-# Step 3: Wait for complete stack removal
-log "Waiting for complete stack removal (30 seconds)"
-sleep 30
-
-log "Verifying stack removal completed"
-while docker stack ls | grep -q "${CI_REPO_NAME}"; do
-    log "Stack still exists, waiting..."
-    sleep 5
-done
-success "Stack removal completed"
-
-# Step 4: Remove old Docker secrets
-log "Removing old Docker secrets"
-declare -a SECRETS=(
-    "AUTHENTICATION_BACKEND_LDAP_PASSWORD"
-    "IDENTITY_VALIDATION_RESET_PASSWORD_JWT_SECRET"
-    "STORAGE_ENCRYPTION_KEY"
-    "SESSION_SECRET"
-    "NOTIFIER_SMTP_PASSWORD"
-    "IDENTITY_PROVIDERS_OIDC_HMAC_SECRET"
-    "IDENTITY_PROVIDERS_OIDC_ISSUER_PRIVATE_KEY"
-    "IDENTITY_PROVIDERS_OIDC_JWKS_KEY"
-    "CLIENT_SECRET_HEADSCALE"
-    "CLIENT_SECRET_HEADADMIN"
-)
-
-for secret in "${SECRETS[@]}"; do
-    docker secret rm "$secret" || true
-    log "Removed secret: $secret"
-done
-
-# Step 5: Create new Docker secrets with updated values
-log "Creating new Docker secrets with updated values"
-for secret in "${SECRETS[@]}"; do
-    env_var="${secret}"
-    if [[ -n "${!env_var:-}" ]]; then
-        echo "${!env_var}" | docker secret create "$secret" -
-        success "Created secret: $secret"
-    else
-        error "Environment variable $env_var is not set!"
-        exit 1
-    fi
-done
-
-# Step 6: Deploy new stack with fresh secrets
-log "Deploying new stack with fresh secrets"
-docker stack deploy --with-registry-auth -c ./stack.production.yml "${CI_REPO_NAME}"
-
-# Step 7: Wait for services to initialize
-log "Waiting for services to initialize (30 seconds)"
-sleep 30
-
-# Step 8: Check deployment status
-log "Checking deployment status"
-docker stack ps "${CI_REPO_NAME}"
-
-# Step 9: Health check loop for authelia service
-log "Checking service health for 60 seconds"
-for i in {1..12}; do
-    if docker stack ps "${CI_REPO_NAME}" | grep Running | grep -q "authelia_authelia"; then
-        success "✅ Authelia service is running!"
-        
-        # Additional health verification
-        log "Performing additional health checks..."
-        sleep 5
-        
-        # Check if service is actually healthy (not just running)
-        if docker stack ps "${CI_REPO_NAME}" | grep -A 5 "authelia_authelia" | grep -q "Running"; then
-            success "🎉 Production deployment completed successfully!"
-            success "Authelia service is healthy and running"
-            exit 0
-        fi
-    elif [ $i -eq 12 ]; then
-        error "❌ Deployment verification failed after 60 seconds"
-        error "Showing service logs for debugging:"
-        docker service logs "${CI_REPO_NAME}_authelia" --tail 20
-        error "Showing stack status:"
-        docker stack ps "${CI_REPO_NAME}"
-        exit 1
-    else
-        log "Attempt $i/12: Waiting for authelia service..."
-        sleep 5
-    fi
-done
-
-error "Health check timeout - this should not be reached"
-exit 1
\ No newline at end of file
+debug() {
+    echo -e "${PURPLE}[DEBUG] $1${NC}"
+}
+
+# Docker secrets managed by this deployment. Each name is both the Docker
+# secret name and the environment variable that supplies its value.
+readonly -a SECRET_NAMES=(
+    "AUTHENTICATION_BACKEND_LDAP_PASSWORD"
+    "IDENTITY_VALIDATION_RESET_PASSWORD_JWT_SECRET"
+    "STORAGE_ENCRYPTION_KEY"
+    "SESSION_SECRET"
+    "NOTIFIER_SMTP_PASSWORD"
+    "IDENTITY_PROVIDERS_OIDC_HMAC_SECRET"
+    "IDENTITY_PROVIDERS_OIDC_ISSUER_PRIVATE_KEY"
+    "IDENTITY_PROVIDERS_OIDC_JWKS_KEY"
+    "CLIENT_SECRET_HEADSCALE"
+    "CLIENT_SECRET_HEADADMIN"
+)
+
+# True only once this process has created LOCK_FILE, so cleanup never
+# deletes a lock that belongs to another deployment that is still running.
+LOCK_ACQUIRED=false
+
+# Cleanup function - runs on script exit (EXIT trap).
+# Releases the lock if we own it and triggers the rollback hook when a
+# deployment was started but the script is exiting with a failure status.
+cleanup() {
+    local exit_code=$?
+
+    if [[ "$LOCK_ACQUIRED" == "true" ]]; then
+        debug "Removing deployment lock file"
+        rm -f "$LOCK_FILE"
+    fi
+
+    if [[ $exit_code -ne 0 && "$ROLLBACK_NEEDED" == "true" ]]; then
+        error "Deployment failed - attempting rollback..."
+        attempt_rollback
+    fi
+
+    debug "Cleanup completed with exit code: $exit_code"
+    exit "$exit_code"
+}
+
+# Set up cleanup trap. INT/TERM are converted into a non-zero exit so that
+# cleanup runs exactly once (via EXIT) and an interrupted deployment is never
+# reported as successful.
+trap cleanup EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
+# Retry function for operations that might fail transiently.
+# Arguments: $1 - command string, run with eval (pass a function name or a
+#                 trusted literal; never interpolate secrets into it)
+#            $2 - human-readable description used in log messages
+# Returns:   0 on first success, 1 after MAX_RETRIES failed attempts
+retry_command() {
+    local cmd="$1"
+    local description="$2"
+    local attempt=1
+
+    while [[ $attempt -le $MAX_RETRIES ]]; do
+        log "Attempt $attempt/$MAX_RETRIES: $description"
+
+        if eval "$cmd"; then
+            success "$description completed successfully"
+            return 0
+        fi
+
+        if [[ $attempt -lt $MAX_RETRIES ]]; then
+            warning "$description failed, retrying in ${RETRY_DELAY}s..."
+            sleep "$RETRY_DELAY"
+        fi
+
+        # Plain assignment: ((attempt++)) can return non-zero and trip set -e
+        attempt=$((attempt + 1))
+    done
+
+    error "$description failed after $MAX_RETRIES attempts"
+    return 1
+}
+
+# Pre-flight checks: acquire the deployment lock and validate the environment
+# before anything destructive happens. Exits the script on any failure.
+pre_flight_checks() {
+    log "Running pre-flight checks..."
+
+    # Take the lock atomically: with noclobber the redirection fails if the
+    # file already exists, so two concurrent runs cannot both pass this point.
+    if ! (set -o noclobber; echo "$$" > "$LOCK_FILE") 2>/dev/null; then
+        error "Another deployment is already running (lock file exists: $LOCK_FILE)"
+        error "If you're sure no other deployment is running, remove the lock file manually"
+        exit 1
+    fi
+    LOCK_ACQUIRED=true
+    debug "Created deployment lock file"
+
+    # Verify we're running in CI environment
+    if [[ -z "${CI_REPO_NAME:-}" ]]; then
+        error "This script must only be run in Woodpecker CI environment!"
+        error "Missing CI_REPO_NAME environment variable"
+        exit 1
+    fi
+
+    # Check Docker daemon is responsive
+    if ! docker info >/dev/null 2>&1; then
+        error "Docker daemon is not responsive"
+        exit 1
+    fi
+
+    # Check available disk space (df failure is reported instead of silently
+    # aborting the script through set -e / pipefail)
+    local available_space
+    if ! available_space=$(df /var/lib/docker --output=avail --block-size=1M | tail -n1 | tr -d ' '); then
+        error "Unable to determine free disk space for /var/lib/docker"
+        exit 1
+    fi
+    if [[ $available_space -lt $MIN_DISK_SPACE_MB ]]; then
+        error "Insufficient disk space: ${available_space}MB available, ${MIN_DISK_SPACE_MB}MB required"
+        exit 1
+    fi
+
+    # Verify required environment variables
+    local var
+    local required_vars=(
+        "REGISTRY_USER" "REGISTRY_PASSWORD" "CI_REPO_NAME"
+        "${SECRET_NAMES[@]}"
+    )
+
+    for var in "${required_vars[@]}"; do
+        if [[ -z "${!var:-}" ]]; then
+            error "Required environment variable $var is not set"
+            exit 1
+        fi
+    done
+
+    # Check if stack file exists
+    if [[ ! -f "./stack.production.yml" ]]; then
+        error "Production stack file not found: ./stack.production.yml"
+        exit 1
+    fi
+
+    success "Pre-flight checks completed"
+}
+
+# Get current image ID for rollback purposes
+get_current_image_id() {
+    local images
+    if images=$(docker stack ps "${CI_REPO_NAME}" --format '{{.Image}}' 2>/dev/null); then
+        # No "table" directive, so there is no IMAGE header row to filter out
+        OLD_IMAGE_ID=$(grep -m1 authelia <<<"$images" || true)
+        if [[ -n "$OLD_IMAGE_ID" ]]; then
+            debug "Current image for rollback: $OLD_IMAGE_ID"
+        fi
+    fi
+}
+
+# Rollback function
+attempt_rollback() {
+    if [[ -n "$OLD_IMAGE_ID" && "$OLD_IMAGE_ID" != "IMAGE" ]]; then
+        warning "Attempting rollback to previous image: $OLD_IMAGE_ID"
+
+        # This would require a more complex rollback mechanism
+        # For now, just log the attempt
+        error "Rollback mechanism not yet implemented"
+        error "Manual intervention required"
+        error "Previous image was: $OLD_IMAGE_ID"
+    else
+        error "No previous image information available for rollback"
+    fi
+}
+
+# Single registry login attempt. The password goes through stdin only, so it
+# is never spliced into an eval'd string where a quote could break or inject.
+registry_login_once() {
+    printf '%s\n' "${REGISTRY_PASSWORD}" \
+        | docker login -u "${REGISTRY_USER}" --password-stdin git.nixc.us
+}
+
+# Enhanced Docker registry login with retries
+docker_registry_login() {
+    log "Logging into Docker registry"
+    retry_command registry_login_once "Docker registry login"
+}
+
+# Succeeds while the stack is still listed. The listing is captured first so
+# grep -q cannot SIGPIPE docker under pipefail, and the name is matched as a
+# whole line so "foo" does not match "foo-staging".
+stack_exists() {
+    local stacks
+    if ! stacks=$(docker stack ls --format '{{.Name}}'); then
+        warning "Unable to list stacks; assuming ${CI_REPO_NAME} still exists"
+        return 0
+    fi
+    grep -Fxq -- "${CI_REPO_NAME}" <<<"$stacks"
+}
+
+# Wait for stack removal with timeout
+wait_for_stack_removal() {
+    log "Verifying stack removal completed"
+    local timeout=$DEPLOYMENT_TIMEOUT
+    local elapsed=0
+
+    while stack_exists; do
+        if [[ $elapsed -ge $timeout ]]; then
+            error "Stack removal timeout after ${timeout} seconds"
+            return 1
+        fi
+
+        log "Stack still exists, waiting... (${elapsed}s/${timeout}s)"
+        sleep 5
+        elapsed=$((elapsed + 5))
+    done
+
+    success "Stack removal completed in ${elapsed} seconds"
+}
+
+# Succeeds when a Docker secret named $1 exists (exact, literal match).
+secret_exists() {
+    local names
+    names=$(docker secret ls --format '{{.Name}}') || return 1
+    grep -Fxq -- "$1" <<<"$names"
+}
+
+# Enhanced secret management with validation.
+# Removes every secret in SECRET_NAMES, recreates it from the environment
+# variable of the same name, then verifies it exists. Returns 1 on failure.
+manage_secrets() {
+    log "Managing Docker secrets"
+    local secret
+
+    # Remove old secrets
+    log "Removing old Docker secrets"
+    for secret in "${SECRET_NAMES[@]}"; do
+        if ! secret_exists "$secret"; then
+            debug "Secret $secret did not exist"
+        elif docker secret rm "$secret"; then
+            debug "Removed secret: $secret"
+        else
+            # Best effort, as before: creation below reports the real failure
+            warning "Could not remove secret: $secret"
+        fi
+    done
+
+    # Create new secrets with validation
+    log "Creating new Docker secrets with updated values"
+    for secret in "${SECRET_NAMES[@]}"; do
+        if [[ -z "${!secret:-}" ]]; then
+            error "Environment variable $secret is not set!"
+            return 1
+        fi
+
+        # printf instead of echo: a value that looks like an echo option
+        # (e.g. "-n") is written verbatim; trailing newline kept as before
+        if printf '%s\n' "${!secret}" | docker secret create "$secret" -; then
+            success "Created secret: $secret"
+        else
+            error "Failed to create secret: $secret"
+            return 1
+        fi
+    done
+
+    # Verify all secrets were created
+    log "Verifying secret creation"
+    for secret in "${SECRET_NAMES[@]}"; do
+        if ! secret_exists "$secret"; then
+            error "Secret verification failed: $secret was not created"
+            return 1
+        fi
+    done
+
+    success "All secrets created and verified"
+}
+
+# Single deploy attempt, wrapped in a function so retry_command does not
+# have to eval a string with the stack name interpolated into it.
+run_stack_deploy() {
+    docker stack deploy --with-registry-auth -c ./stack.production.yml "${CI_REPO_NAME}"
+}
+
+# Enhanced deployment with better error handling
+deploy_stack() {
+    log "Deploying new stack with fresh secrets"
+    ROLLBACK_NEEDED=true
+    DEPLOYMENT_STARTED=true
+
+    if ! retry_command run_stack_deploy "Stack deployment"; then
+        error "Stack deployment failed"
+        return 1
+    fi
+
+    success "Stack deployment command completed"
+}
+
+# Prints "<task name> <current state>" for every task of the stack.
+# CurrentState is used on purpose: the default table also has a DESIRED STATE
+# column that reads "Running" for tasks that are still starting.
+stack_task_states() {
+    docker stack ps "${CI_REPO_NAME}" --format '{{.Name}} {{.CurrentState}}' 2>/dev/null || true
+}
+
+# Enhanced health checking with multiple validation methods.
+# Returns 0 (and clears ROLLBACK_NEEDED) once an authelia task is Running,
+# is still Running 5 seconds later and no authelia task is Failed; returns 1
+# after HEALTH_CHECK_TIMEOUT seconds otherwise.
+comprehensive_health_check() {
+    log "Starting comprehensive health check (${HEALTH_CHECK_TIMEOUT}s timeout)"
+    local start_time
+    start_time=$(date +%s)
+    local timeout=$HEALTH_CHECK_TIMEOUT
+    local running_re='authelia_authelia[^ ]* Running'
+    local failed_re='authelia_authelia[^ ]* Failed'
+
+    # Wait for services to initialize
+    log "Waiting for services to initialize (30 seconds)"
+    sleep 30
+
+    # Check deployment status
+    log "Checking deployment status"
+    docker stack ps "${CI_REPO_NAME}"
+
+    # Health check loop with multiple validation methods
+    local check_count=0
+    local max_checks=$((timeout / 5))
+    local current_time elapsed states failed_count
+
+    while [[ $check_count -lt $max_checks ]]; do
+        current_time=$(date +%s)
+        elapsed=$((current_time - start_time))
+
+        log "Health check attempt $((check_count + 1))/${max_checks} (${elapsed}s elapsed)"
+
+        # Check if authelia service is running
+        states=$(stack_task_states)
+        if grep -Eq "$running_re" <<<"$states"; then
+            success "✅ Authelia service is running!"
+
+            # Additional verification checks
+            log "Performing additional health verification..."
+            sleep 5
+
+            # Check service is stable (not restarting)
+            states=$(stack_task_states)
+            if grep -Eq "$running_re" <<<"$states"; then
+                # grep -c prints 0 itself when nothing matches; "|| true"
+                # only absorbs its exit status 1 (an extra "echo 0" would
+                # yield "0<newline>0" and break the numeric test below)
+                failed_count=$(grep -Ec "$failed_re" <<<"$states" || true)
+
+                if [[ $failed_count -eq 0 ]]; then
+                    success "🎉 Production deployment completed successfully!"
+                    success "Authelia service is healthy and stable"
+                    success "Total deployment time: ${elapsed} seconds"
+                    ROLLBACK_NEEDED=false
+                    return 0
+                else
+                    warning "Found $failed_count failed service instances, continuing health checks..."
+                fi
+            fi
+        fi
+
+        if [[ $elapsed -ge $timeout ]]; then
+            break
+        fi
+
+        log "Waiting for authelia service... (${elapsed}s/${timeout}s)"
+        sleep 5
+        # Plain assignment: ((check_count++)) returns 1 when the counter is 0,
+        # which under set -e killed the script after the first failed check
+        check_count=$((check_count + 1))
+    done
+
+    # Health check failed
+    error "❌ Health check failed after ${timeout} seconds"
+    error "Deployment verification failed"
+
+    # Show detailed debugging information
+    error "=== DEBUGGING INFORMATION ==="
+    error "Stack status:"
+    docker stack ps "${CI_REPO_NAME}" || true
+
+    error "Authelia service logs (last 30 lines):"
+    docker service logs "${CI_REPO_NAME}_authelia" --tail 30 || true
+
+    error "Docker service inspect:"
+    docker service inspect "${CI_REPO_NAME}_authelia" --pretty || true
+
+    return 1
+}
+
+# Main deployment function
+main() {
+    # Default expansion: under set -u an unset CI_REPO_NAME would otherwise
+    # abort here, before pre_flight_checks can print its explanation
+    log "🚀 Starting production deployment for ${CI_REPO_NAME:-unknown}"
+
+    # Pre-flight checks
+    pre_flight_checks
+
+    # Get current state for potential rollback
+    get_current_image_id
+
+    # Step 1: Docker registry login
+    docker_registry_login
+
+    # Step 2: Remove old stack to release secrets
+    log "Removing old stack to release secrets"
+    docker stack rm "${CI_REPO_NAME}" || true
+
+    # Step 3: Wait for complete stack removal with timeout
+    log "Waiting for complete stack removal (30 seconds minimum)"
+    sleep 30
+    wait_for_stack_removal
+
+    # Step 4 & 5: Manage secrets (remove old, create new)
+    manage_secrets
+
+    # Step 6: Deploy new stack
+    deploy_stack
+
+    # Step 7-9: Comprehensive health checking
+    comprehensive_health_check
+
+    success "🎉 Production deployment completed successfully!"
+}
+
+# Run main function
+main "$@"
\ No newline at end of file