authelia/scripts/ci-deploy-production.sh

531 lines
18 KiB
Bash
Executable File

#!/bin/sh
################################################################################
# WOODPECKER CI PRODUCTION DEPLOYMENT SCRIPT
################################################################################
#
# ⚠️ WARNING: THIS SCRIPT IS EXCLUSIVELY FOR WOODPECKER CI USE
#
# This script is designed to run within the Woodpecker CI environment with
# specific environment variables and Docker socket access.
#
# 🚫 DO NOT RUN THIS ON A DEVELOPER WORKSTATION
# 🚫 This will attempt to remove production Docker stacks and secrets
# 🚫 This requires access to production Docker swarm manager nodes
#
# This script handles:
# - Production stack removal and cleanup
# - Docker secrets recreation with fresh values
# - New stack deployment with verification
# - Health checking and deployment validation
# - Rollback reporting on failure (automatic rollback is not yet implemented;
#   manual intervention is required)
# - Concurrent execution prevention
#
################################################################################
# Strict mode: abort on command failure (-e) and on use of unset variables (-u).
# `pipefail` is enabled only where the running shell supports it. The shebang
# is /bin/sh, and on a shell without that option an unconditional
# `set -o pipefail` is a fatal error that stops the script before it does
# anything. The probe runs in a subshell so a failed probe is harmless.
set -eu
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

# Configuration
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" # Absolute directory of this script
LOCK_FILE="/tmp/authelia-deploy.lock"       # Marks a deployment in progress
MAX_RETRIES=3                               # Attempts made by retry_command
RETRY_DELAY=5 # Reduced from 10s to 5s
DEPLOYMENT_TIMEOUT=180 # Reduced from 300s to 180s (3 minutes)
HEALTH_CHECK_TIMEOUT=90 # Reduced from 120s to 90s
FORCE_PULL=true # Always pull latest images

# Color codes for output (escape sequences are expanded by the logging helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Global variables for cleanup
DEPLOYMENT_STARTED=false # Set to true by deploy_stack
OLD_IMAGE_HASH=""        # Set by get_current_image_id
NEW_IMAGE_HASH=""        # Set by force_pull_latest_images
ROLLBACK_NEEDED=false
# Logging functions
#
# log MESSAGE
# Print MESSAGE to stdout in blue, prefixed with a local timestamp.
# printf is used instead of `echo -e` so that only the color codes are
# expanded; backslashes in MESSAGE (paths, docker output) are printed verbatim.
log() {
  printf '%b[%s] %s%b\n' "$BLUE" "$(date +'%Y-%m-%d %H:%M:%S')" "$1" "$NC"
}
# error MESSAGE
# Print MESSAGE to stdout in red with an [ERROR] prefix.
# Only the color codes are escape-expanded; MESSAGE is printed verbatim.
error() {
  printf '%b[ERROR] %s%b\n' "$RED" "$1" "$NC"
}
# success MESSAGE
# Print MESSAGE to stdout in green with a [SUCCESS] prefix.
# Only the color codes are escape-expanded; MESSAGE is printed verbatim.
success() {
  printf '%b[SUCCESS] %s%b\n' "$GREEN" "$1" "$NC"
}
# warning MESSAGE
# Print MESSAGE to stdout in yellow with a [WARNING] prefix.
# Only the color codes are escape-expanded; MESSAGE is printed verbatim.
warning() {
  printf '%b[WARNING] %s%b\n' "$YELLOW" "$1" "$NC"
}
# debug MESSAGE
# Print MESSAGE to stdout in purple with a [DEBUG] prefix.
# Only the color codes are escape-expanded; MESSAGE is printed verbatim.
debug() {
  printf '%b[DEBUG] %s%b\n' "$PURPLE" "$1" "$NC"
}
# Cleanup handler - invoked once, from the EXIT trap installed below.
#
# Captures the status the script is exiting with, releases the deployment
# lock (only when this process owns it), prints the final stack state when the
# status is non-zero, and exits again with the captured status.
cleanup() {
  local exit_code=$? # must be first: any other command would overwrite $?
  local lock_owner=""
  local stack_name="${CI_REPO_NAME:-}"

  # pre_flight_checks writes this process's PID into the lock file. When this
  # run bailed out because ANOTHER deployment holds the lock, removing the file
  # here would silently disable the concurrent-execution guard for that run.
  if [ -f "$LOCK_FILE" ]; then
    lock_owner=$(cat "$LOCK_FILE" 2>/dev/null || true)
    if [ "$lock_owner" = "$$" ]; then
      debug "Removing deployment lock file"
      rm -f "$LOCK_FILE"
    else
      debug "Leaving lock file owned by another process (PID: ${lock_owner:-unknown})"
    fi
  fi

  if [ "$exit_code" -ne 0 ]; then
    error "Deployment failed with exit code: $exit_code"
    log "📊 Providing final deployment status for debugging..."
    # CI_REPO_NAME can be unset when the failure happened before or during
    # pre-flight checks; expanding it unguarded would trip `set -u` in here.
    if [ -z "$stack_name" ]; then
      warning "CI_REPO_NAME is not set - skipping final stack status"
    elif docker stack ls | grep -q "$stack_name"; then
      # Show final stack status for debugging
      error "=== FINAL STACK STATUS ==="
      docker stack ps "$stack_name" --no-trunc || true
      docker stack services "$stack_name" || true
    else
      warning "Stack ${stack_name} no longer exists"
    fi
  fi

  debug "Cleanup completed with exit code: $exit_code"
  exit "$exit_code"
}

# Set up cleanup trap.
# Cleanup is bound to EXIT only. INT/TERM are turned into an ordinary exit with
# the conventional 128+signal status, which then fires the EXIT trap exactly
# once. Binding cleanup to the signals directly would run it twice and could
# report success (the status of whatever command ran last) after an interrupt.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
# Run a shell command string until it succeeds or MAX_RETRIES attempts are used.
#
# Arguments:
#   $1 - command line, executed with `eval`
#   $2 - human-readable description used in log messages
# Globals read: MAX_RETRIES, RETRY_DELAY
# Returns: 0 as soon as one attempt succeeds, 1 once the last attempt has failed.
retry_command() {
  local command_line="$1"
  local label="$2"
  local tries_done=0

  while [ "$tries_done" -lt "$MAX_RETRIES" ]; do
    tries_done=$((tries_done + 1))
    log "Attempt $tries_done/$MAX_RETRIES: $label"

    if eval "$command_line"; then
      success "$label completed successfully"
      return 0
    fi

    # Out of attempts: report and give up without sleeping again.
    if [ "$tries_done" -eq "$MAX_RETRIES" ]; then
      error "$label failed after $MAX_RETRIES attempts"
      return 1
    fi

    warning "$label failed, retrying in ${RETRY_DELAY}s..."
    sleep "$RETRY_DELAY"
  done
}
# Pre-flight checks
#
# Acquires the deployment lock and validates the environment before anything
# destructive happens. Exits the script with status 1 on the first failed
# check:
#   - another deployment holds the lock file ($LOCK_FILE)
#   - CI_REPO_NAME is missing (not running under Woodpecker CI)
#   - the Docker daemon does not answer `docker info`
#   - a required environment variable is unset or empty
#   - ./stack.production.yml does not exist
# On success the lock file contains this process's PID.
pre_flight_checks() {
  local var
  local var_value

  log "Running pre-flight checks..."

  # Take the lock atomically. With noclobber set, the redirection fails if the
  # file already exists, so two deployments starting at the same moment cannot
  # both pass a separate "does it exist?" test and then both create it.
  # The subshell keeps noclobber from leaking into the rest of the script.
  if ! (set -o noclobber; echo "$$" > "$LOCK_FILE") 2>/dev/null; then
    if [ -e "$LOCK_FILE" ]; then
      error "Another deployment is already running (lock file exists: $LOCK_FILE)"
      error "If you're sure no other deployment is running, remove the lock file manually"
    else
      error "Unable to create deployment lock file: $LOCK_FILE"
    fi
    exit 1
  fi
  debug "Created deployment lock file"

  # Verify we're running in CI environment
  if [ -z "${CI_REPO_NAME:-}" ]; then
    error "This script must only be run in Woodpecker CI environment!"
    error "Missing CI_REPO_NAME environment variable"
    exit 1
  fi

  # Check Docker daemon is responsive
  if ! docker info >/dev/null 2>&1; then
    error "Docker daemon is not responsive"
    exit 1
  fi

  # Verify required environment variables
  REQUIRED_VARS="REGISTRY_USER REGISTRY_PASSWORD CI_REPO_NAME AUTHENTICATION_BACKEND_LDAP_PASSWORD IDENTITY_VALIDATION_RESET_PASSWORD_JWT_SECRET STORAGE_ENCRYPTION_KEY SESSION_SECRET NOTIFIER_SMTP_PASSWORD IDENTITY_PROVIDERS_OIDC_HMAC_SECRET IDENTITY_PROVIDERS_OIDC_ISSUER_PRIVATE_KEY IDENTITY_PROVIDERS_OIDC_JWKS_KEY CLIENT_SECRET_HEADSCALE CLIENT_SECRET_HEADADMIN"
  for var in $REQUIRED_VARS; do
    # Indirect lookup in POSIX sh. The `:-` default matters: under `set -u` a
    # plain `$NAME` of an unset variable aborts the shell with an "unbound
    # variable" error before the readable message below can be printed.
    eval "var_value=\${$var:-}"
    if [ -z "$var_value" ]; then
      error "Required environment variable $var is not set"
      exit 1
    fi
  done
  var_value="" # do not keep a secret value around longer than needed

  # Check if stack file exists
  if [ ! -f "./stack.production.yml" ]; then
    error "Production stack file not found: ./stack.production.yml"
    exit 1
  fi

  success "Pre-flight checks completed"
}
# Record the image of the currently deployed Authelia service for rollback
# reporting.
#
# Sets the global OLD_IMAGE_HASH to the image reference of the first task of
# the "${CI_REPO_NAME}_authelia" service, or to "" when the stack has no such
# task. Leaves OLD_IMAGE_HASH untouched when the stack cannot be listed (e.g.
# first deployment). Always returns 0.
get_current_image_id() {
  local task_rows
  local authelia_service="${CI_REPO_NAME}_authelia"

  # A failing `docker stack ps` just means there is nothing to roll back to.
  task_rows=$(docker stack ps "${CI_REPO_NAME}" --format '{{.Name}}\t{{.Image}}' 2>/dev/null) || return 0

  # Select by task NAME, not by image text: every image of this stack lives in
  # the ".../authelia" repository, so matching "authelia" against the image
  # column would pick whichever service happens to be listed first.
  # awk reads all rows (no early exit) so the docker side of the pipe never
  # sees a closed pipe under pipefail.
  OLD_IMAGE_HASH=$(printf '%s\n' "$task_rows" | awk -F '\t' -v svc="$authelia_service" '
    !found && index($1, svc ".") > 0 { image = $2; found = 1 }
    END { if (found) print image }')

  if [ -n "$OLD_IMAGE_HASH" ]; then
    debug "Current image for rollback: $OLD_IMAGE_HASH"
  fi
}
# Rollback placeholder.
#
# Reads the global OLD_IMAGE_HASH. No rollback is performed: the function only
# reports which image was previously deployed (or that none is known) and that
# manual intervention is required. The literal value "IMAGE" is treated the
# same as an empty value.
attempt_rollback() {
  case "$OLD_IMAGE_HASH" in
    "" | "IMAGE")
      error "No previous image information available for rollback"
      return
      ;;
  esac

  warning "Attempting rollback to previous image: $OLD_IMAGE_HASH"
  # This would require a more complex rollback mechanism
  # For now, just log the attempt
  error "Rollback mechanism not yet implemented"
  error "Manual intervention required"
  error "Previous image was: $OLD_IMAGE_HASH"
}
# Log in to the git.nixc.us registry, retrying via retry_command.
#
# Globals read: REGISTRY_USER, REGISTRY_PASSWORD
# Returns: the status of retry_command.
docker_registry_login() {
  log "Logging into Docker registry"

  # Single-quoted on purpose: retry_command runs this string with `eval`, so
  # the variables are expanded there, as properly quoted words. Splicing the
  # password VALUE into the string instead would break the command (or run
  # injected shell) as soon as the password contains a quote character.
  # The password still travels on stdin, followed by a newline, never in argv.
  local login_cmd='printf "%s\n" "$REGISTRY_PASSWORD" | docker login -u "$REGISTRY_USER" --password-stdin git.nixc.us'

  retry_command "$login_cmd" "Docker registry login"
}
# Force pull latest images to ensure we deploy the newest version.
#
# Pulls the Authelia, MariaDB and Redis production images (each through
# retry_command) and sets the global NEW_IMAGE_HASH to the local image ID the
# Authelia tag resolves to after the pull.
# Returns: 0 on success; 1 when a pull fails or the image ID cannot be found.
force_pull_latest_images() {
  log "🚀 Force pulling latest images to ensure fresh deployment"

  local authelia_image="git.nixc.us/nixius/authelia:production-authelia"
  local mariadb_image="git.nixc.us/nixius/authelia:production-mariadb"
  local redis_image="git.nixc.us/nixius/authelia:production-redis"
  local previous_image_id

  # Remember what the tag pointed at locally BEFORE pulling. This is the only
  # value the new image ID can be meaningfully compared with: OLD_IMAGE_HASH
  # holds an image reference (name:tag), which never equals an image ID.
  previous_image_id=$(docker images --quiet "$authelia_image" 2>/dev/null | head -n1 || true)

  log "Pulling Authelia image..."
  if retry_command "docker pull $authelia_image" "Authelia image pull"; then
    # Ask for this exact reference instead of grepping a table for the tag,
    # which could match a different repository carrying the same tag.
    NEW_IMAGE_HASH=$(docker images --quiet "$authelia_image" | head -n1 || true)
    success "✅ Authelia image pulled: $NEW_IMAGE_HASH"
  else
    error "❌ Failed to pull Authelia image"
    return 1
  fi

  log "Pulling MariaDB image..."
  retry_command "docker pull $mariadb_image" "MariaDB image pull" || return 1

  log "Pulling Redis image..."
  retry_command "docker pull $redis_image" "Redis image pull" || return 1

  # Report whether the pull actually changed the image.
  if [ -z "$NEW_IMAGE_HASH" ]; then
    error "❌ Could not determine new image hash"
    return 1
  elif [ "$NEW_IMAGE_HASH" != "$previous_image_id" ]; then
    success "🔄 New image detected: ${previous_image_id:-<none>} -> $NEW_IMAGE_HASH"
  else
    warning "⚠️ Same image hash detected: $NEW_IMAGE_HASH (this may be expected)"
  fi
}
# Print detailed diagnostics for one service of the stack.
#
# Arguments:
#   $1 - short service name; the swarm service is "${CI_REPO_NAME}_$1"
# Output (stdout): every task of the service with its state, error and recent
# logs, the first 20 lines of `docker service inspect --pretty`, and the status
# and last 15 log lines of every local container labelled with the service.
# Best effort: a failing docker command is reported or skipped, never fatal.
get_container_diagnostics() {
  local service_name="$1"
  local swarm_service="${CI_REPO_NAME}_${service_name}"
  local tab tasks containers
  local task_id name state task_error task_logs
  local container_id status container_logs
  tab=$(printf '\t') # portable spelling of a literal tab for IFS

  error "=== 🔍 DETAILED DIAGNOSTICS FOR ${service_name} ==="

  # Get all tasks for this service. `|| true`: under `set -e` a failed listing
  # would otherwise abort the script in the middle of printing diagnostics.
  tasks=$(docker service ps "$swarm_service" --format "{{.ID}}\t{{.Name}}\t{{.CurrentState}}\t{{.Error}}" --no-trunc || true)
  if [ -n "$tasks" ]; then
    error "Service tasks:"
    printf '%s\n' "$tasks" | while IFS="$tab" read -r task_id name state task_error; do
      error " Task: $name"
      error " ID: $task_id"
      error " State: $state"
      if [ -n "$task_error" ]; then
        error " Error: $task_error"
      fi
      # Fetch the logs of THIS task (docker service logs accepts a task ID),
      # rather than repeating the whole service's log once per task.
      log "Attempting to get logs for task $task_id..."
      task_logs=$(docker service logs "$task_id" --raw --tail 20 2>/dev/null || echo "No logs available")
      if [ "$task_logs" != "No logs available" ]; then
        error " Recent logs:"
        printf '%s\n' "$task_logs" | sed 's/^/ /'
      fi
    done
  else
    error "No service tasks found for ${service_name}"
  fi

  # Get service inspection details
  error "Service inspection:"
  docker service inspect "$swarm_service" --pretty 2>/dev/null | head -20 | sed 's/^/ /' || error " Service inspect failed"

  # Check if there are any containers running for this service
  containers=$(docker ps -a --filter "label=com.docker.swarm.service.name=${swarm_service}" --format "{{.ID}}\t{{.Status}}\t{{.Names}}" 2>/dev/null || echo "")
  if [ -n "$containers" ]; then
    error "Associated containers:"
    printf '%s\n' "$containers" | while IFS="$tab" read -r container_id status name; do
      error " Container: $name ($container_id)"
      error " Status: $status"
      # Get container logs
      container_logs=$(docker logs "$container_id" --tail 15 2>&1 || echo "No container logs available")
      error " Container logs (last 15 lines):"
      printf '%s\n' "$container_logs" | sed 's/^/ /'
    done
  else
    error "No containers found for service ${service_name}"
  fi

  error "=== END DIAGNOSTICS FOR ${service_name} ==="
}
# Wait until the stack named ${CI_REPO_NAME} no longer appears in
# `docker stack ls`.
#
# Polls every 2 seconds for up to 60 seconds.
# Returns: 0 once the stack is gone, 1 on timeout.
wait_for_stack_removal() {
  local timeout=60 # Reduced timeout for faster deployment
  local elapsed=0
  local stack_names

  log "Verifying stack removal completed"

  while :; do
    # Compare whole names. A substring match against the full table would keep
    # "seeing" the stack whenever another stack merely contains its name
    # (e.g. "<name>-staging"), and the deployment would fail on the timeout.
    # A failing listing yields an empty list, i.e. "stack is gone".
    stack_names=$(docker stack ls --format '{{.Name}}' || true)
    # No `grep -q`: reading all input avoids closing the pipe early.
    if ! printf '%s\n' "$stack_names" | grep -Fx -- "${CI_REPO_NAME}" >/dev/null; then
      break
    fi

    if [ "$elapsed" -ge "$timeout" ]; then
      error "Stack removal timeout after ${timeout} seconds"
      return 1
    fi
    if [ $((elapsed % 10)) -eq 0 ]; then # Log every 10 seconds instead of 5
      log "Stack still exists, waiting... (${elapsed}s/${timeout}s)"
    fi
    sleep 2 # Check every 2 seconds instead of 5
    elapsed=$((elapsed + 2))
  done

  success "Stack removal completed in ${elapsed} seconds"
}
# Recreate the stack's Docker secrets from the environment.
#
# Each name in SECRETS is both an environment variable and a Docker secret.
# Steps: validate that every value is present, remove the existing secrets,
# create them again from the current values, then verify that all exist.
# Returns: 0 on success; 1 when a value is missing (nothing is removed in that
# case), a secret cannot be created, or verification fails.
manage_secrets() {
  local secret secret_value existing_secrets
  local missing=false

  log "Managing Docker secrets"

  # List of secrets (space-separated instead of array)
  SECRETS="AUTHENTICATION_BACKEND_LDAP_PASSWORD IDENTITY_VALIDATION_RESET_PASSWORD_JWT_SECRET STORAGE_ENCRYPTION_KEY SESSION_SECRET NOTIFIER_SMTP_PASSWORD IDENTITY_PROVIDERS_OIDC_HMAC_SECRET IDENTITY_PROVIDERS_OIDC_ISSUER_PRIVATE_KEY IDENTITY_PROVIDERS_OIDC_JWKS_KEY CLIENT_SECRET_HEADSCALE CLIENT_SECRET_HEADADMIN"

  # Validate everything BEFORE removing anything, so a missing value cannot
  # leave the swarm with its old secrets deleted and only some new ones created.
  for secret in $SECRETS; do
    # Indirect lookup in POSIX sh; `:-` keeps `set -u` from aborting the shell
    # on an unset variable before the error below is printed.
    eval "secret_value=\${$secret:-}"
    if [ -z "$secret_value" ]; then
      error "Environment variable $secret is not set!"
      missing=true
    fi
  done
  secret_value=""
  if [ "$missing" = true ]; then
    return 1
  fi

  # Remove old secrets
  log "Removing old Docker secrets"
  existing_secrets=$(docker secret ls --format "{{.Name}}" || true)
  for secret in $SECRETS; do
    if printf '%s\n' "$existing_secrets" | grep -Fx -- "$secret" >/dev/null; then
      if docker secret rm "$secret"; then
        debug "Removed secret: $secret"
      else
        # Creation below will fail for this secret and report the error.
        warning "Could not remove secret: $secret"
      fi
    else
      debug "Secret $secret did not exist"
    fi
  done

  # Create new secrets
  log "Creating new Docker secrets with updated values"
  for secret in $SECRETS; do
    eval "secret_value=\${$secret:-}"
    # printf writes the value verbatim plus a newline; `echo` would swallow
    # values such as "-n" and, in some shells, rewrite backslash sequences.
    if printf '%s\n' "$secret_value" | docker secret create "$secret" -; then
      success "Created secret: $secret"
    else
      error "Failed to create secret: $secret"
      secret_value=""
      return 1
    fi
  done
  secret_value=""

  # Verify all secrets were created
  log "Verifying secret creation"
  existing_secrets=$(docker secret ls --format "{{.Name}}" || true)
  for secret in $SECRETS; do
    if ! printf '%s\n' "$existing_secrets" | grep -Fx -- "$secret" >/dev/null; then
      error "Secret verification failed: $secret was not created"
      return 1
    fi
  done

  success "All secrets created and verified"
}
# Deploy ./stack.production.yml as the stack named ${CI_REPO_NAME}.
#
# Marks the deployment as started (global DEPLOYMENT_STARTED=true) and runs
# `docker stack deploy --with-registry-auth` through retry_command.
# Returns: 1 when every deploy attempt failed; otherwise the status of the
# final success log call.
deploy_stack() {
  local stack_deploy_invocation

  log "Deploying new stack with fresh secrets"
  DEPLOYMENT_STARTED=true

  # Built as a single string because retry_command evaluates its first argument.
  stack_deploy_invocation="docker stack deploy --with-registry-auth -c ./stack.production.yml '${CI_REPO_NAME}'"

  if retry_command "$stack_deploy_invocation" "Stack deployment"; then
    success "Stack deployment command completed"
  else
    error "Stack deployment failed"
    return 1
  fi
}
# Verify the deployment: wait for the Authelia task to reach Running with the
# freshly pulled image, printing diagnostics along the way.
#
# Globals read: CI_REPO_NAME, HEALTH_CHECK_TIMEOUT, NEW_IMAGE_HASH
# Flow: fixed 45 s wait for the database, then one check every 10 s until the
# Authelia task is Running on the expected image or HEALTH_CHECK_TIMEOUT
# seconds have elapsed since the function started.
# Returns: always 0. A deployment that could not be verified is deliberately
# left running for debugging and reported with warnings only.
comprehensive_health_check() {
  local timeout="$HEALTH_CHECK_TIMEOUT"
  local authelia_service="${CI_REPO_NAME}_authelia"
  local max_checks=$((timeout / 10)) # Check every 10 seconds
  local check_count=0
  local start_time current_time elapsed
  local task_row state error_msg current_image
  local expected_digests="" digest image_ok
  local service

  log "🔍 Starting deployment verification (${HEALTH_CHECK_TIMEOUT}s timeout)"
  start_time=$(date +%s)

  # Database initialization wait - giving MariaDB time to start
  log "Database initialization wait (45 seconds)..."
  sleep 45

  # Get immediate deployment status (informational; must not abort the check)
  log "Checking deployment status"
  docker stack ps "${CI_REPO_NAME}" || true

  # NEW_IMAGE_HASH is a local image ID, while swarm tasks report their image as
  # "name:tag@sha256:<digest>". Look up the repo digest(s) of the pulled image
  # so the two can be compared; without this the "correct image" branch below
  # could never be taken.
  if [ -n "$NEW_IMAGE_HASH" ]; then
    expected_digests=$(docker image inspect --format '{{range .RepoDigests}}{{println .}}{{end}}' "$NEW_IMAGE_HASH" 2>/dev/null |
      sed -n 's/^.*@//p' || true)
  fi

  # Image verification loop
  while [ "$check_count" -lt "$max_checks" ]; do
    current_time=$(date +%s)
    elapsed=$((current_time - start_time))
    log "Verification check ${check_count}/${max_checks} (${elapsed}s elapsed)"

    # First task row of the Authelia service: name, state, error, image.
    # --no-trunc keeps the digest in the image column. The trailing `|| true`
    # matters: under `set -e` with pipefail, a failing listing would abort the
    # whole script from inside this assignment.
    task_row=$(docker stack ps "${CI_REPO_NAME}" --no-trunc --format '{{.Name}}\t{{.CurrentState}}\t{{.Error}}\t{{.Image}}' |
      awk -F '\t' -v svc="$authelia_service" '
        !found && index($1, svc ".") > 0 { row = $0; found = 1 }
        END { if (found) print row }' || true)

    if [ -n "$task_row" ]; then
      state=$(printf '%s\n' "$task_row" | cut -f2)
      error_msg=$(printf '%s\n' "$task_row" | cut -f3)
      current_image=$(printf '%s\n' "$task_row" | cut -f4)
      log "Current Authelia state: $state"

      case "$state" in
        *Running*)
          log "🎯 Current image: $current_image"
          log "🎯 Expected image hash: $NEW_IMAGE_HASH"

          # Accept when nothing is expected, when the reference literally
          # contains the expected value, or when it carries one of the repo
          # digests of the pulled image.
          image_ok=false
          if [ -z "$NEW_IMAGE_HASH" ]; then
            image_ok=true
          else
            case "$current_image" in
              *"$NEW_IMAGE_HASH"*) image_ok=true ;;
            esac
            for digest in $expected_digests; do
              case "$current_image" in
                *"@$digest"*) image_ok=true ;;
              esac
            done
          fi

          if [ "$image_ok" = true ]; then
            success "✅ Authelia service is healthy and running with correct image!"
            success "🎯 Using image: $current_image"
            success "⚡ Total deployment time: ${elapsed} seconds"
            return 0
          fi
          warning "⚠️ Service running but using different image than expected"
          warning "Current: $current_image"
          warning "Expected hash: $NEW_IMAGE_HASH"
          warning "This may be normal if the image hasn't changed"
          ;;
        *Failed* | *Rejected* | *Shutdown*)
          warning "❌ Service failed: $state"
          if [ -n "$error_msg" ]; then
            error "Error: $error_msg"
          fi
          # Get recent logs for debugging
          log "📋 Getting recent logs for debugging..."
          docker service logs "$authelia_service" --tail 20 2>/dev/null || echo "No logs available"
          ;;
        *)
          debug "Service state: $state (still starting up)"
          ;;
      esac
    fi

    if [ "$elapsed" -ge "$timeout" ]; then
      warning "⏰ Reached timeout after ${elapsed} seconds"
      log "📊 Final status for debugging:"
      docker stack ps "${CI_REPO_NAME}" --no-trunc || true
      break
    fi

    sleep 10
    check_count=$((check_count + 1))
  done

  # Deployment verification completed
  warning "📊 Deployment verification completed - check logs above for status"

  # Get final diagnostic info
  log "🔍 Final diagnostics..."
  for service in authelia mariadb redis; do
    if docker service ls --format "{{.Name}}" | grep -Fx -- "${CI_REPO_NAME}_${service}" >/dev/null; then
      log "=== ${service} STATUS ==="
      docker service logs "${CI_REPO_NAME}_${service}" --tail 10 2>/dev/null || echo "No logs available"
    fi
  done

  # Don't fail - let it run for debugging
  warning "Deployment may still be starting - leaving stack running for debugging"
  return 0
}
# Main deployment function.
#
# Runs the deployment steps in order. The script runs with `set -e`, so the
# first failing step aborts the deployment and the EXIT trap reports it.
main() {
  # ${CI_REPO_NAME:-...}: under `set -u` an unset CI_REPO_NAME would abort
  # right here with an "unbound variable" error, before pre_flight_checks can
  # explain that the script must run inside Woodpecker CI.
  log "🚀 Starting production deployment for ${CI_REPO_NAME:-<CI_REPO_NAME not set>}"

  # Pre-flight checks
  pre_flight_checks

  # Get current state for potential rollback
  get_current_image_id

  # Step 1: Docker registry login
  docker_registry_login

  # Step 1.5: Force pull latest images to ensure fresh deployment
  force_pull_latest_images

  # Step 2: Remove old stack to release secrets
  # (`|| true`: there is nothing to remove on a first deployment)
  log "Removing old stack to release secrets"
  docker stack rm "${CI_REPO_NAME}" || true

  # Step 3: Wait for complete stack removal with optimized timeout
  log "Waiting for complete stack removal (minimum 15 seconds)"
  sleep 15 # Reduced from 30 seconds
  wait_for_stack_removal

  # Step 4 & 5: Manage secrets (remove old, create new)
  manage_secrets

  # Step 6: Deploy new stack
  deploy_stack

  # Step 7-9: Rapid health checking with container diagnostics
  comprehensive_health_check

  success "🎉 Production deployment completed successfully!"
  success "🏆 Deployed image: $NEW_IMAGE_HASH"
}
# Script entry point: run the deployment, forwarding the command-line
# arguments (main itself does not read any positional parameters).
main "$@"