authelia/scripts/ci-deploy-production.sh

#!/bin/bash

################################################################################
# WOODPECKER CI PRODUCTION DEPLOYMENT SCRIPT
################################################################################
#
# ⚠️  WARNING: THIS SCRIPT IS EXCLUSIVELY FOR WOODPECKER CI USE
#
# This script is designed to run within the Woodpecker CI environment with
# specific environment variables and Docker socket access.
#
# 🚫 DO NOT RUN THIS ON A DEVELOPER WORKSTATION
# 🚫 This will attempt to remove production Docker stacks and secrets
# 🚫 This requires access to production Docker swarm manager nodes
#
# This script handles:
# - Production stack removal and cleanup
# - Docker secrets recreation with fresh values
# - New stack deployment with verification
# - Health checking and deployment validation
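# - Rollback reporting on failure (automatic rollback is not yet implemented;
#   the previous image is logged and manual intervention is required)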
# - Concurrent execution prevention
#
################################################################################

# Strict mode: exit on unhandled errors (-e), treat unset variables as errors
# (-u), and make a pipeline fail when any of its stages fails (pipefail).
set -euo pipefail

# Configuration
# Directory containing this script. Assigned first and marked readonly
# afterwards so that a failing `cd`/`pwd` is not masked by the exit status of
# the `readonly` builtin itself.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
readonly LOCK_FILE="/tmp/authelia-deploy.lock"  # Lock file checked/created by pre_flight_checks
readonly MAX_RETRIES=3  # Attempts made by retry_command before giving up
readonly RETRY_DELAY=5  # Seconds between retry attempts (reduced from 10s to 5s)
readonly DEPLOYMENT_TIMEOUT=180  # Seconds; reduced from 300s to 180s (3 minutes)
readonly HEALTH_CHECK_TIMEOUT=90  # Seconds; reduced from 120s to 90s
readonly MIN_DISK_SPACE_MB=500  # Minimum free MB required; reduced from 1000MB to 500MB
readonly FORCE_PULL=true  # Always pull latest images

# Color codes for output (expanded by the logging functions)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly PURPLE='\033[0;35m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m' # No Color

# Global variables for cleanup
DEPLOYMENT_STARTED=false  # Set to true by deploy_stack
OLD_IMAGE_HASH=""         # Image reference recorded by get_current_image_id
NEW_IMAGE_HASH=""         # Image ID recorded by force_pull_latest_images
ROLLBACK_NEEDED=false     # True between deploy_stack and a successful health check

# Logging functions

# Print an informational message in blue, prefixed with the current local
# timestamp (YYYY-MM-DD HH:MM:SS).
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Outputs:   the formatted line on stdout
log() {
    printf '%b\n' "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}"
}

# Print an error message in red with an "[ERROR]" prefix.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Outputs:   the formatted line on stdout
error() {
    printf '%b\n' "${RED}[ERROR] $1${NC}"
}

# Print a success message in green with a "[SUCCESS]" prefix.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Outputs:   the formatted line on stdout
success() {
    printf '%b\n' "${GREEN}[SUCCESS] $1${NC}"
}

# Print a warning message in yellow with a "[WARNING]" prefix.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Outputs:   the formatted line on stdout
warning() {
    printf '%b\n' "${YELLOW}[WARNING] $1${NC}"
}

# Print a debug message in purple with a "[DEBUG]" prefix.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
# Outputs:   the formatted line on stdout
debug() {
    printf '%b\n' "${PURPLE}[DEBUG] $1${NC}"
}

# Cleanup function - runs exactly once when the script exits.
#
# Globals:   LOCK_FILE (read), ROLLBACK_NEEDED (read)
# Arguments: none; the status the script is exiting with is taken from $?
# Returns:   does not return - exits with the captured status
cleanup() {
    # Must be the first statement: any other command would overwrite $?.
    local exit_code=$?
    local lock_owner=""

    # Disarm the traps so the `exit` at the end cannot re-enter this handler.
    trap - EXIT INT TERM

    # Only release the lock if this process created it. The lock file holds
    # the PID of its creator; when this run aborted because another deployment
    # holds the lock, deleting the file would let a third run start
    # concurrently with the one still in progress.
    if [[ -f "$LOCK_FILE" ]]; then
        lock_owner="$(cat "$LOCK_FILE" 2>/dev/null || true)"
        if [[ "$lock_owner" == "$$" ]]; then
            debug "Removing deployment lock file"
            rm -f "$LOCK_FILE"
        else
            debug "Lock file belongs to another deployment (PID: ${lock_owner:-unknown}) - leaving it in place"
        fi
    fi

    if [[ $exit_code -ne 0 && "$ROLLBACK_NEEDED" == "true" ]]; then
        error "Deployment failed - attempting rollback..."
        attempt_rollback
    fi

    debug "Cleanup completed with exit code: $exit_code"
    exit "$exit_code"
}

# Set up cleanup trap. Signals are converted into a regular exit with the
# conventional 128+signal status; that exit then fires the EXIT trap, so
# cleanup runs once and always sees a non-zero status after an interruption.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Run a command string up to MAX_RETRIES times until it succeeds.
#
# Globals:   MAX_RETRIES (read), RETRY_DELAY (read)
# Arguments: $1 - command line, evaluated with `eval` in the current shell
#            $2 - human-readable description used in log messages
# Returns:   0 as soon as one attempt succeeds, 1 once every attempt failed
retry_command() {
    local command_line="$1"
    local label="$2"
    local try

    for ((try = 1; try <= MAX_RETRIES; try++)); do
        log "Attempt $try/$MAX_RETRIES: $label"

        if eval "$command_line"; then
            success "$label completed successfully"
            return 0
        fi

        # The final failure is reported without waiting again.
        if ((try == MAX_RETRIES)); then
            error "$label failed after $MAX_RETRIES attempts"
            return 1
        fi

        warning "$label failed, retrying in ${RETRY_DELAY}s..."
        sleep "$RETRY_DELAY"
    done
}

# Pre-flight checks: take the deployment lock and validate the environment
# before anything destructive happens.
#
# Globals:   LOCK_FILE (read), MIN_DISK_SPACE_MB (read), CI_REPO_NAME and the
#            registry/secret environment variables listed below (read)
# Arguments: none
# Returns:   0 when every check passes; exits the script with status 1 on the
#            first failed check
pre_flight_checks() {
    log "Running pre-flight checks..."

    # Take the lock atomically. With noclobber the redirection refuses to
    # overwrite an existing file, so two deployments starting at the same
    # moment cannot both pass a separate "exists?" test and then both write.
    # The file stores our PID.
    if ! (set -o noclobber; printf '%s\n' "$$" > "$LOCK_FILE") 2>/dev/null; then
        if [[ -e "$LOCK_FILE" ]]; then
            error "Another deployment is already running (lock file exists: $LOCK_FILE)"
            error "If you're sure no other deployment is running, remove the lock file manually"
        else
            error "Unable to create deployment lock file: $LOCK_FILE"
        fi
        exit 1
    fi
    debug "Created deployment lock file"

    # Verify we're running in CI environment
    if [[ -z "${CI_REPO_NAME:-}" ]]; then
        error "This script must only be run in Woodpecker CI environment!"
        error "Missing CI_REPO_NAME environment variable"
        exit 1
    fi

    # Check Docker daemon is responsive
    if ! docker info >/dev/null 2>&1; then
        error "Docker daemon is not responsive"
        exit 1
    fi

    # Check available disk space. A failing `df` (for example because the path
    # is missing) is reported explicitly instead of aborting silently under
    # `set -e`/`pipefail` or being treated as zero.
    local available_space
    available_space=$(df /var/lib/docker --output=avail --block-size=1M | tail -n1 | tr -d ' ') || available_space=""
    if [[ ! "$available_space" =~ ^[0-9]+$ ]]; then
        error "Unable to determine free disk space for /var/lib/docker"
        exit 1
    fi
    if [[ $available_space -lt $MIN_DISK_SPACE_MB ]]; then
        error "Insufficient disk space: ${available_space}MB available, ${MIN_DISK_SPACE_MB}MB required"
        exit 1
    fi

    # Verify required environment variables
    local required_vars=(
        "REGISTRY_USER" "REGISTRY_PASSWORD" "CI_REPO_NAME"
        "AUTHENTICATION_BACKEND_LDAP_PASSWORD" "IDENTITY_VALIDATION_RESET_PASSWORD_JWT_SECRET"
        "STORAGE_ENCRYPTION_KEY" "SESSION_SECRET" "NOTIFIER_SMTP_PASSWORD"
        "IDENTITY_PROVIDERS_OIDC_HMAC_SECRET" "IDENTITY_PROVIDERS_OIDC_ISSUER_PRIVATE_KEY"
        "IDENTITY_PROVIDERS_OIDC_JWKS_KEY" "CLIENT_SECRET_HEADSCALE" "CLIENT_SECRET_HEADADMIN"
    )

    local var
    for var in "${required_vars[@]}"; do
        # ${!var} is an indirect expansion: the value of the variable named by $var.
        if [[ -z "${!var:-}" ]]; then
            error "Required environment variable $var is not set"
            exit 1
        fi
    done

    # Check if stack file exists (relative to the current working directory)
    if [[ ! -f "./stack.production.yml" ]]; then
        error "Production stack file not found: ./stack.production.yml"
        exit 1
    fi

    success "Pre-flight checks completed"
}

# Record the image of the currently deployed stack for rollback reporting.
#
# Globals:   CI_REPO_NAME (read), OLD_IMAGE_HASH (written when the stack exists)
# Arguments: none
# Outputs:   a debug line when an image was found
get_current_image_id() {
    # Nothing to record when the stack cannot be listed.
    docker stack ps "${CI_REPO_NAME}" >/dev/null 2>&1 || return 0

    # First listed task image containing "authelia"; empty when none matches.
    OLD_IMAGE_HASH=$(docker stack ps "${CI_REPO_NAME}" --format "table {{.Image}}" | grep authelia | head -n1 || echo "")

    [[ -z "$OLD_IMAGE_HASH" ]] || debug "Current image for rollback: $OLD_IMAGE_HASH"
}

# Report what would be needed to roll back. No rollback is performed: the
# function only logs the previously recorded image so an operator can act.
#
# Globals:   OLD_IMAGE_HASH (read)
# Arguments: none
attempt_rollback() {
    # An empty value or the literal table header "IMAGE" means nothing usable
    # was recorded.
    if [[ -z "$OLD_IMAGE_HASH" || "$OLD_IMAGE_HASH" == "IMAGE" ]]; then
        error "No previous image information available for rollback"
        return
    fi

    warning "Attempting rollback to previous image: $OLD_IMAGE_HASH"

    # A real rollback needs a more complex mechanism; for now only log.
    error "Rollback mechanism not yet implemented"
    error "Manual intervention required"
    error "Previous image was: $OLD_IMAGE_HASH"
}

# Perform one registry login attempt.
# The credentials are expanded here, at call time and inside double quotes,
# so they are never spliced into a string that is later re-parsed by `eval`.
# A password containing quotes, `$(...)` or other shell syntax is passed
# through verbatim on stdin.
_docker_registry_login_once() {
    printf '%s\n' "${REGISTRY_PASSWORD}" \
        | docker login -u "${REGISTRY_USER}" --password-stdin git.nixc.us
}

# Enhanced Docker registry login with retries
#
# Globals:   REGISTRY_USER (read), REGISTRY_PASSWORD (read)
# Arguments: none
# Returns:   the status of retry_command
docker_registry_login() {
    log "Logging into Docker registry"

    # retry_command evaluates its first argument; handing it only the helper's
    # name keeps the secret values out of the evaluated command text.
    retry_command "_docker_registry_login_once" "Docker registry login"
}

# Force pull latest images to ensure we deploy the newest version
#
# Globals:   NEW_IMAGE_HASH (written: 12-character ID of the pulled Authelia
#            image), OLD_IMAGE_HASH (read, for the log message only)
# Arguments: none
# Returns:   0 when all three images were pulled and the Authelia image ID
#            could be determined, 1 otherwise
force_pull_latest_images() {
    log "🚀 Force pulling latest images to ensure fresh deployment"

    # Image references are hard-coded here.
    # NOTE(review): presumably these must match ./stack.production.yml - confirm.
    local authelia_image="git.nixc.us/nixius/authelia:production-authelia"
    local mariadb_image="git.nixc.us/nixius/authelia:production-mariadb"
    local redis_image="git.nixc.us/nixius/authelia:production-redis"
    local image_id

    # Pull each image; all three go through the same retry logic.
    log "Pulling Authelia image..."
    if ! retry_command "docker pull $authelia_image" "Authelia image pull"; then
        error "❌ Failed to pull Authelia image"
        return 1
    fi

    # Ask Docker for the ID of exactly the image that was pulled instead of
    # searching the `docker images` table for a tag substring. The value is
    # reduced to the 12-character short form that `docker images` prints.
    image_id=$(docker image inspect --format '{{.Id}}' "$authelia_image" 2>/dev/null || true)
    image_id="${image_id#sha256:}"
    NEW_IMAGE_HASH="${image_id:0:12}"
    if [[ -n "$NEW_IMAGE_HASH" ]]; then
        success "✅ Authelia image pulled: $NEW_IMAGE_HASH"
    fi

    log "Pulling MariaDB image..."
    retry_command "docker pull $mariadb_image" "MariaDB image pull" || return 1

    log "Pulling Redis image..."
    retry_command "docker pull $redis_image" "Redis image pull" || return 1

    # Verify we have a new image hash
    if [[ -n "$NEW_IMAGE_HASH" && "$NEW_IMAGE_HASH" != "$OLD_IMAGE_HASH" ]]; then
        success "🔄 New image detected: $OLD_IMAGE_HASH → $NEW_IMAGE_HASH"
    elif [[ -n "$NEW_IMAGE_HASH" ]]; then
        warning "⚠️ Same image hash detected: $NEW_IMAGE_HASH (this may be expected)"
    else
        error "❌ Could not determine new image hash"
        return 1
    fi
}

# Get detailed container information for debugging
#
# Prints, for one service of the stack: its tasks (with each task's own
# recent logs), the head of `docker service inspect --pretty`, and the
# containers labelled with the service name together with their last log
# lines. Every lookup is best-effort.
#
# Globals:   CI_REPO_NAME (read)
# Arguments: $1 - service name without the stack prefix (e.g. "authelia")
# Outputs:   diagnostics on stdout
get_container_diagnostics() {
    local service_name="$1"
    local full_service_name="${CI_REPO_NAME}_${service_name}"
    local tasks containers
    local task_id name state task_error task_logs
    local container_id status container_logs

    error "=== 🔍 DETAILED DIAGNOSTICS FOR ${service_name} ==="

    # Get all tasks for this service. A failing lookup must not abort the
    # remaining diagnostics, so it is treated as "no tasks".
    tasks=$(docker service ps "$full_service_name" --format "{{.ID}}\t{{.Name}}\t{{.CurrentState}}\t{{.Error}}" --no-trunc || true)

    if [[ -n "$tasks" ]]; then
        error "Service tasks:"
        while IFS=$'\t' read -r task_id name state task_error; do
            [[ -n "$task_id" ]] || continue
            error "  Task: $name"
            error "    ID: $task_id"
            error "    State: $state"
            if [[ -n "$task_error" ]]; then
                error "    Error: $task_error"
            fi

            # Fetch the logs of this specific task. `docker service logs`
            # accepts a task ID, so each task shows its own output rather than
            # the same service-wide tail repeated once per task.
            log "Attempting to get logs for task $task_id..."
            task_logs=$(docker service logs "$task_id" --raw --tail 20 2>/dev/null < /dev/null || echo "No logs available")
            if [[ "$task_logs" != "No logs available" ]]; then
                error "    Recent logs:"
                echo "$task_logs" | sed 's/^/      /'
            fi
        done <<< "$tasks"
    else
        error "No service tasks found for ${service_name}"
    fi

    # Get service inspection details
    error "Service inspection:"
    docker service inspect "$full_service_name" --pretty 2>/dev/null | head -20 | sed 's/^/  /' || error "  Service inspect failed"

    # Check if there are any containers running for this service
    containers=$(docker ps -a --filter "label=com.docker.swarm.service.name=${full_service_name}" --format "{{.ID}}\t{{.Status}}\t{{.Names}}" 2>/dev/null || echo "")

    if [[ -n "$containers" ]]; then
        error "Associated containers:"
        while IFS=$'\t' read -r container_id status name; do
            [[ -n "$container_id" ]] || continue
            error "  Container: $name ($container_id)"
            error "    Status: $status"

            # Get container logs (stdout and stderr combined)
            container_logs=$(docker logs "$container_id" --tail 15 2>&1 < /dev/null || echo "No container logs available")
            error "    Container logs (last 15 lines):"
            echo "$container_logs" | sed 's/^/      /'
        done <<< "$containers"
    else
        error "No containers found for service ${service_name}"
    fi

    error "=== END DIAGNOSTICS FOR ${service_name} ==="
}

# Optimized wait for stack removal
#
# Polls `docker stack ls` every 2 seconds until the stack named exactly
# CI_REPO_NAME is no longer listed, for at most 60 seconds.
#
# Globals:   CI_REPO_NAME (read)
# Arguments: none
# Returns:   0 once the stack is gone, 1 on timeout
wait_for_stack_removal() {
    log "Verifying stack removal completed"
    local timeout=60  # Reduced timeout for faster deployment
    local poll_interval=2
    local elapsed=0
    local stacks

    while true; do
        # Capture the list first and match afterwards: with `pipefail`, a
        # `docker ... | grep -q` pipeline can report failure when grep exits
        # early. A failing listing is treated like an empty one.
        stacks=$(docker stack ls --format '{{.Name}}' 2>/dev/null || true)

        # -F -x: compare whole names literally, so a different stack whose
        # name merely contains ours (e.g. "<name>-staging") does not count.
        if ! grep -Fxq -- "${CI_REPO_NAME}" <<< "$stacks"; then
            break
        fi

        if [[ $elapsed -ge $timeout ]]; then
            error "Stack removal timeout after ${timeout} seconds"
            return 1
        fi

        if [[ $((elapsed % 10)) -eq 0 ]]; then  # Log every 10 seconds
            log "Stack still exists, waiting... (${elapsed}s/${timeout}s)"
        fi
        sleep "$poll_interval"
        elapsed=$((elapsed + poll_interval))
    done

    success "Stack removal completed in ${elapsed} seconds"
}

# Enhanced secret management with validation
#
# Removes each listed Docker secret if present, recreates it from the
# environment variable of the same name, then verifies that all exist.
#
# Globals:   one environment variable per secret name listed below (read)
# Arguments: none
# Returns:   0 when every secret was created and verified, 1 otherwise
manage_secrets() {
    log "Managing Docker secrets"

    local -a SECRETS=(
        "AUTHENTICATION_BACKEND_LDAP_PASSWORD"
        "IDENTITY_VALIDATION_RESET_PASSWORD_JWT_SECRET"
        "STORAGE_ENCRYPTION_KEY"
        "SESSION_SECRET"
        "NOTIFIER_SMTP_PASSWORD"
        "IDENTITY_PROVIDERS_OIDC_HMAC_SECRET"
        "IDENTITY_PROVIDERS_OIDC_ISSUER_PRIVATE_KEY"
        "IDENTITY_PROVIDERS_OIDC_JWKS_KEY"
        "CLIENT_SECRET_HEADSCALE"
        "CLIENT_SECRET_HEADADMIN"
    )
    local secret existing

    # Remove old secrets. The listing is captured once and matched with a
    # literal whole-line grep; this avoids `docker ... | grep -q` pipelines,
    # which can fail spuriously under `pipefail`.
    log "Removing old Docker secrets"
    existing=$(docker secret ls --format "{{.Name}}") || existing=""
    for secret in "${SECRETS[@]}"; do
        if grep -Fxq -- "$secret" <<< "$existing"; then
            # Removal stays best-effort, but only claim success when it
            # actually succeeded.
            if docker secret rm "$secret"; then
                debug "Removed secret: $secret"
            else
                warning "Could not remove secret: $secret"
            fi
        else
            debug "Secret $secret did not exist"
        fi
    done

    # Create new secrets with validation
    log "Creating new Docker secrets with updated values"
    for secret in "${SECRETS[@]}"; do
        # ${!secret} is the value of the environment variable named like the secret.
        if [[ -z "${!secret:-}" ]]; then
            error "Environment variable $secret is not set!"
            return 1
        fi
        # printf instead of echo: a value such as "-n" or "-e" must be stored
        # literally, not consumed as an echo option. The trailing newline that
        # echo produced is kept so stored values stay byte-compatible.
        if printf '%s\n' "${!secret}" | docker secret create "$secret" -; then
            success "Created secret: $secret"
        else
            error "Failed to create secret: $secret"
            return 1
        fi
    done

    # Verify all secrets were created
    log "Verifying secret creation"
    existing=$(docker secret ls --format "{{.Name}}") || existing=""
    for secret in "${SECRETS[@]}"; do
        if ! grep -Fxq -- "$secret" <<< "$existing"; then
            error "Secret verification failed: $secret was not created"
            return 1
        fi
    done

    success "All secrets created and verified"
}

# Deploy the production stack through retry_command.
#
# Globals:   CI_REPO_NAME (read), DEPLOYMENT_STARTED (set to true),
#            ROLLBACK_NEEDED (set to true)
# Arguments: none
# Returns:   1 when the deployment command failed on every retry
deploy_stack() {
    log "Deploying new stack with fresh secrets"

    # From here on a failure should trigger the rollback path in cleanup.
    DEPLOYMENT_STARTED=true
    ROLLBACK_NEEDED=true

    if retry_command "docker stack deploy --with-registry-auth -c ./stack.production.yml '${CI_REPO_NAME}'" "Stack deployment"; then
        success "Stack deployment command completed"
    else
        error "Stack deployment failed"
        return 1
    fi
}

# Enhanced health checking optimized for speed and accuracy
#
# Polls the tasks of the "<stack>_authelia" service every 2 seconds. The
# service counts as healthy once its newest task reports "Running" on two
# consecutive polls, no task of the service shows "Failed", and the task's
# image resolves to the image recorded in NEW_IMAGE_HASH.
#
# Globals:   CI_REPO_NAME, HEALTH_CHECK_TIMEOUT, NEW_IMAGE_HASH (read);
#            ROLLBACK_NEEDED (set to false on success)
# Arguments: none
# Returns:   0 when healthy; 1 after a clear failure or timeout (diagnostics
#            for all services are printed first)
comprehensive_health_check() {
    log "🔍 Starting rapid health verification (${HEALTH_CHECK_TIMEOUT}s timeout)"
    local start_time
    start_time=$(date +%s)
    local timeout=$HEALTH_CHECK_TIMEOUT
    local service_task_prefix="${CI_REPO_NAME}_authelia"

    # Minimal initial wait - just 10 seconds instead of 30
    log "Brief initialization wait (10 seconds)..."
    sleep 10

    # Get immediate deployment status (informational only)
    log "Checking deployment status"
    docker stack ps "${CI_REPO_NAME}" || true

    # Fast health check loop with 2-second intervals
    local check_count=0
    local max_checks=$((timeout / 2))  # Check every 2 seconds
    local elapsed=0
    local last_status=""
    local tasks service_status state error_msg
    local failed_count current_image current_image_id

    while [[ $check_count -lt $max_checks ]]; do
        elapsed=$(( $(date +%s) - start_time ))

        # Only log every 10 seconds to reduce noise
        if [[ $((check_count % 5)) -eq 0 ]]; then
            log "Health check ${check_count}/${max_checks} (${elapsed}s elapsed)"
        fi

        # One listing per poll; every later decision is derived from it. A
        # failed or empty listing means "no information yet" and must not
        # abort the script under `set -e`/`pipefail`.
        tasks=$(docker stack ps "${CI_REPO_NAME}" --format "{{.Name}}\t{{.Image}}\t{{.CurrentState}}\t{{.Error}}" 2>/dev/null || true)

        # First listed task whose name contains the service name.
        service_status=$(awk -F'\t' -v svc="$service_task_prefix" 'index($1, svc) { print; exit }' <<< "$tasks")

        if [[ -n "$service_status" ]]; then
            current_image=$(cut -f2 <<< "$service_status")
            state=$(cut -f3 <<< "$service_status")
            error_msg=$(cut -f4 <<< "$service_status")

            # Check for Running state
            if [[ "$state" == *Running* ]]; then
                # Require "Running" on two consecutive polls before trusting it.
                if [[ "$last_status" == "Running" ]]; then
                    # Double-check: no failed task of this service. awk always
                    # prints exactly one number (`grep -c ... || echo 0` would
                    # print "0" twice when nothing matches).
                    failed_count=$(awk -F'\t' -v svc="$service_task_prefix" 'index($1, svc) && /Failed/ { n++ } END { print n + 0 }' <<< "$tasks")

                    if [[ $failed_count -eq 0 ]]; then
                        # Final verification: ensure we're using the new image.
                        # The task lists an image *reference* (name:tag) while
                        # NEW_IMAGE_HASH is an image *ID*, so the reference is
                        # resolved to an ID on this node before comparing.
                        current_image_id=""
                        if [[ -n "$NEW_IMAGE_HASH" && "$current_image" != *"$NEW_IMAGE_HASH"* ]]; then
                            current_image_id=$(docker image inspect --format '{{.Id}}' "$current_image" 2>/dev/null || true)
                        fi

                        if [[ -z "$NEW_IMAGE_HASH" || "$current_image" == *"$NEW_IMAGE_HASH"* || "$current_image_id" == *"$NEW_IMAGE_HASH"* ]]; then
                            success "✅ Authelia service is healthy and running!"
                            success "🎯 Using correct image: $current_image"
                            success "⚡ Total deployment time: ${elapsed} seconds"
                            ROLLBACK_NEEDED=false
                            return 0
                        else
                            warning "⚠️ Service running but using wrong image: $current_image (expected: $NEW_IMAGE_HASH)"
                        fi
                    else
                        warning "⚠️ Service running but found $failed_count failed instances"
                    fi
                fi
                last_status="Running"
            elif [[ "$state" == *Failed* || "$state" == *Rejected* || "$state" == *Shutdown* ]]; then
                error "❌ Service failed: $state"
                if [[ -n "$error_msg" ]]; then
                    error "Error: $error_msg"
                fi
                break  # Exit early on clear failure
            else
                last_status="$state"
                debug "Service state: $state"
            fi
        fi

        if [[ $elapsed -ge $timeout ]]; then
            break
        fi

        sleep 2
        check_count=$((check_count + 1))
    done

    # Health check failed - provide comprehensive diagnostics
    error "❌ Health check failed after ${elapsed} seconds"
    error "Deployment verification failed"

    # Get detailed diagnostics for each service
    log "🔍 Gathering comprehensive diagnostics..."

    local services=("authelia" "mariadb" "redis")
    local service service_names
    service_names=$(docker service ls --format "{{.Name}}" 2>/dev/null || true)
    for service in "${services[@]}"; do
        if grep -Fxq -- "${CI_REPO_NAME}_${service}" <<< "$service_names"; then
            get_container_diagnostics "$service"
        else
            error "Service ${CI_REPO_NAME}_${service} not found!"
        fi
    done

    # Additional stack-level diagnostics
    error "=== 📊 STACK-LEVEL DIAGNOSTICS ==="
    error "Full stack status:"
    docker stack ps "${CI_REPO_NAME}" --no-trunc || true

    error "Stack services:"
    docker stack services "${CI_REPO_NAME}" || true

    # Filters take key=value form and --since/--until take durations relative
    # to now; the window covers the health check plus one minute before it.
    error "Recent Docker events:"
    docker events --since="$((elapsed + 60))s" --until="0s" --filter "type=container" 2>/dev/null | tail -10 || true

    return 1
}

# Main deployment function
#
# Runs the deployment steps in order. The script runs under `set -e`, so a
# failing step aborts the run (only the stack removal is tolerated to fail).
#
# Globals:   CI_REPO_NAME (read), NEW_IMAGE_HASH (read for the final message)
# Arguments: none are used
main() {
    # Default expansion: with `set -u`, a bare ${CI_REPO_NAME} would abort here
    # with "unbound variable" before pre_flight_checks can print its
    # explanation that the script must run inside CI.
    log "🚀 Starting production deployment for ${CI_REPO_NAME:-(unset)}"

    # Pre-flight checks
    pre_flight_checks

    # Get current state for potential rollback
    get_current_image_id

    # Step 1: Docker registry login
    docker_registry_login

    # Step 1.5: Force pull latest images to ensure fresh deployment
    force_pull_latest_images

    # Step 2: Remove old stack to release secrets
    log "Removing old stack to release secrets"
    docker stack rm "${CI_REPO_NAME}" || true  # best-effort; failure is ignored

    # Step 3: Wait for complete stack removal with optimized timeout
    log "Waiting for complete stack removal (minimum 15 seconds)"
    sleep 15  # Reduced from 30 seconds
    wait_for_stack_removal

    # Step 4 & 5: Manage secrets (remove old, create new)
    manage_secrets

    # Step 6: Deploy new stack
    deploy_stack

    # Step 7-9: Rapid health checking with container diagnostics
    comprehensive_health_check

    success "🎉 Production deployment completed successfully!"
    success "🏆 Deployed image: $NEW_IMAGE_HASH"
}

# Run main function, passing the script's command-line arguments through unchanged
main "$@"