From b2820ed47f32f04e9687cda0add8d86a379c0f47 Mon Sep 17 00:00:00 2001 From: Leopere Date: Sun, 8 Feb 2026 18:24:13 -0500 Subject: [PATCH] Rework for Swarm deploy on ingress.nixc.us - labels.go: use `docker service update --label-add/rm` via SSH to dynamically manage Traefik labels on the Swarm service itself, matching how traefik-http discovers routes from Docker swarm labels - stack.production.yml: constrain to ingress.nixc.us, host-mode port 2222, base traefik.enable labels, SWARM_SERVICE_NAME env - cmd/server/main.go: SWARM_SERVICE_NAME replaces TRAEFIK_CONFIG_DIR - .woodpecker.yml: hardcode stack name better-argo-tunnels, update smoke test env vars Co-authored-by: Cursor --- .woodpecker.yml | 11 ++- cmd/server/main.go | 12 ++-- internal/server/labels.go | 138 ++++++++++++++++++++------------------ stack.production.yml | 22 ++++-- 4 files changed, 99 insertions(+), 84 deletions(-) diff --git a/.woodpecker.yml b/.woodpecker.yml index 262a648..19b5742 100644 --- a/.woodpecker.yml +++ b/.woodpecker.yml @@ -49,7 +49,6 @@ steps: - echo "$${REGISTRY_PASSWORD}" | docker login -u "$${REGISTRY_USER}" --password-stdin git.nixc.us - apk add --no-cache git || true - export GIT_COMMIT=$${CI_COMMIT_SHA} - - export GIT_COMMIT_DATE=$(git log -1 --format=%ci HEAD 2>/dev/null || echo "unknown") - echo "Building GIT_COMMIT=$GIT_COMMIT" # Build server image - docker build --target server -t git.nixc.us/colin/better-argo-tunnels:production . @@ -79,7 +78,7 @@ steps: - echo "$${REGISTRY_PASSWORD}" | docker login git.nixc.us -u "$${REGISTRY_USER}" --password-stdin - docker pull git.nixc.us/colin/better-argo-tunnels:production - docker rm -f tunnel-smoke || true - # Smoke: just verify the binary runs and prints startup log + # Smoke: verify the binary runs and prints startup log - mkdir -p /tmp/smoke-keys - ssh-keygen -t ed25519 -f /tmp/smoke-keys/host_key -N "" -q - ssh-keygen -t ed25519 -f /tmp/smoke-keys/client_key -N "" -q @@ -91,7 +90,7 @@ steps: -e AUTHORIZED_KEYS=/keys/authorized_keys \ -e TRAEFIK_SSH_HOST=127.0.0.1 \ -e TRAEFIK_SSH_KEY=/keys/host_key \ - -e TRAEFIK_CONFIG_DIR=/tmp/dynamic \ + -e SWARM_SERVICE_NAME=smoke-test \ -v /tmp/smoke-keys:/keys:ro \ git.nixc.us/colin/better-argo-tunnels:production - sleep 3 @@ -103,7 +102,7 @@ steps: branch: main event: [push, cron] - # Deploy to Swarm + # Deploy to Swarm on ingress.nixc.us deploy-production: name: deploy-production image: woodpeckerci/plugin-docker-buildx @@ -129,7 +128,7 @@ steps: - echo "$${REGISTRY_PASSWORD}" | docker login -u "$${REGISTRY_USER}" --password-stdin git.nixc.us # Remove old stack - echo "Removing old stack..." - - docker stack rm $${CI_REPO_NAME} || true + - docker stack rm better-argo-tunnels || true - sleep 10 # Refresh secrets - | @@ -148,7 +147,7 @@ steps: docker secret ls | grep tunnel_ # Deploy stack - echo "Deploying stack..." - - docker stack deploy --with-registry-auth -c ./stack.production.yml $${CI_REPO_NAME} + - docker stack deploy --with-registry-auth -c ./stack.production.yml better-argo-tunnels when: branch: main event: [push, cron] diff --git a/cmd/server/main.go b/cmd/server/main.go index 0196e29..91c4794 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -49,29 +49,29 @@ func main() { portStart := envInt("PORT_RANGE_START", 10000) portEnd := envInt("PORT_RANGE_END", 10100) - // Remote Traefik host config (SSH into the ingress host to manage routes). + // Swarm manager SSH config (for updating service labels). traefikHost := envRequired("TRAEFIK_SSH_HOST") traefikUser := envOr("TRAEFIK_SSH_USER", "root") traefikKey := envRequired("TRAEFIK_SSH_KEY") - traefikConfigDir := envOr("TRAEFIK_CONFIG_DIR", "/root/traefik/dynamic") + serviceName := envOr("SWARM_SERVICE_NAME", "better-argo-tunnels_tunnel-server") entrypoint := envOr("TRAEFIK_ENTRYPOINT", "websecure") certResolver := envOr("TRAEFIK_CERT_RESOLVER", "letsencryptresolver") - // Load the SSH key for connecting to the Traefik host. + // Load the SSH key for connecting to the Swarm manager. traefikSigner, err := server.LoadSigner(traefikKey) if err != nil { log.Fatalf("Failed to load Traefik SSH key: %v", err) } - log.Printf("Loaded Traefik host SSH key") + log.Printf("Loaded Swarm manager SSH key") // Initialize port pool. pool := server.NewPortPool(portStart, portEnd) log.Printf("Port pool: %d-%d (%d ports)", portStart, portEnd, portEnd-portStart+1) - // Initialize Traefik label manager (remote SSH). + // Initialize label manager (Swarm service update via SSH). labels, err := server.NewLabelManager( traefikHost, traefikUser, traefikSigner, - traefikConfigDir, entrypoint, certResolver, + serviceName, entrypoint, certResolver, ) if err != nil { log.Fatalf("Failed to init label manager: %v", err) diff --git a/internal/server/labels.go b/internal/server/labels.go index 2005434..3b1f3b5 100644 --- a/internal/server/labels.go +++ b/internal/server/labels.go @@ -9,85 +9,126 @@ import ( "golang.org/x/crypto/ssh" ) -// LabelManager manages Traefik dynamic config on a remote host via SSH. -// It SSHs into the Traefik host and writes per-tunnel YAML config files -// into the Traefik file provider directory. +// LabelManager manages Traefik routing labels on its own Swarm service +// by SSHing into the Swarm manager and running docker service update. type LabelManager struct { mu sync.Mutex - remoteHost string // e.g. "ingress.nixc.us" or "ingress.nixc.us:22" - remoteUser string // SSH user on the Traefik host + remoteHost string // Swarm manager, e.g. "ingress.nixc.us" + remoteUser string // SSH user signer ssh.Signer - configDir string // remote path where Traefik watches for file provider + serviceName string // Swarm service name, e.g. "better-argo-tunnels_tunnel-server" entrypoint string // e.g. "websecure" certResolver string // e.g. "letsencryptresolver" + labels map[string]bool // track which tunnel keys we've added } -// NewLabelManager creates a label manager that writes Traefik config via SSH. +// NewLabelManager creates a label manager that updates Swarm service labels via SSH. func NewLabelManager( remoteHost, remoteUser string, signer ssh.Signer, - configDir, entrypoint, certResolver string, + serviceName, entrypoint, certResolver string, ) (*LabelManager, error) { lm := &LabelManager{ remoteHost: remoteHost, remoteUser: remoteUser, signer: signer, - configDir: configDir, + serviceName: serviceName, entrypoint: entrypoint, certResolver: certResolver, + labels: make(map[string]bool), } - // Ensure the remote config directory exists. - if err := lm.runRemote(fmt.Sprintf("mkdir -p %s", configDir)); err != nil { - return nil, fmt.Errorf("ensure remote config dir: %w", err) + // Verify we can reach the Swarm manager and the service exists. + cmd := fmt.Sprintf("docker service inspect --format '{{.Spec.Name}}' %s", serviceName) + if err := lm.runRemote(cmd); err != nil { + log.Printf("WARN: could not verify service %s (may not exist yet): %v", serviceName, err) + } else { + log.Printf("Verified Swarm service: %s", serviceName) } - log.Printf("Label manager ready (host=%s, dir=%s, ep=%s, resolver=%s)", - remoteHost, configDir, entrypoint, certResolver) + log.Printf("Label manager ready (host=%s, service=%s, ep=%s, resolver=%s)", + remoteHost, serviceName, entrypoint, certResolver) return lm, nil } -// Add writes a Traefik dynamic config file on the remote host for a tunnel. +// Add adds Traefik routing labels to the Swarm service for a tunnel. func (lm *LabelManager) Add(tunKey, domain string, port int) error { lm.mu.Lock() defer lm.mu.Unlock() routerName := fmt.Sprintf("tunnel-%s-router", tunKey) serviceName := fmt.Sprintf("tunnel-%s-service", tunKey) - cfg := buildRouteConfig(routerName, serviceName, domain, port, lm.entrypoint, lm.certResolver) - remotePath := fmt.Sprintf("%s/tunnel-%s.yml", lm.configDir, tunKey) - - // Write the config file via SSH using cat heredoc. - cmd := fmt.Sprintf("cat > %s << 'TRAEFIKEOF'\n%sTRAEFIKEOF", remotePath, cfg) - - if err := lm.runRemote(cmd); err != nil { - return fmt.Errorf("write remote config %s: %w", remotePath, err) + // Build the label-add flags for docker service update. + labelArgs := []string{ + labelFlag(fmt.Sprintf("traefik.http.routers.%s.rule", routerName), + fmt.Sprintf("Host(`%s`)", domain)), + labelFlag(fmt.Sprintf("traefik.http.routers.%s.entrypoints", routerName), + lm.entrypoint), + labelFlag(fmt.Sprintf("traefik.http.routers.%s.tls", routerName), + "true"), + labelFlag(fmt.Sprintf("traefik.http.routers.%s.tls.certresolver", routerName), + lm.certResolver), + labelFlag(fmt.Sprintf("traefik.http.routers.%s.service", routerName), + serviceName), + labelFlag(fmt.Sprintf("traefik.http.services.%s.loadbalancer.server.port", serviceName), + fmt.Sprintf("%d", port)), } - log.Printf("Wrote remote Traefik config: %s (domain=%s port=%d)", remotePath, domain, port) + cmd := fmt.Sprintf("docker service update --label-add %s %s", + strings.Join(labelArgs, " --label-add "), lm.serviceName) + + if err := lm.runRemote(cmd); err != nil { + return fmt.Errorf("add labels for %s: %w", domain, err) + } + + lm.labels[tunKey] = true + log.Printf("Added Swarm labels: %s -> %s:%d", domain, lm.serviceName, port) return nil } -// Remove deletes the Traefik dynamic config file on the remote host. +// Remove removes Traefik routing labels from the Swarm service for a tunnel. func (lm *LabelManager) Remove(tunKey string) error { lm.mu.Lock() defer lm.mu.Unlock() - remotePath := fmt.Sprintf("%s/tunnel-%s.yml", lm.configDir, tunKey) - cmd := fmt.Sprintf("rm -f %s", remotePath) - - if err := lm.runRemote(cmd); err != nil { - return fmt.Errorf("remove remote config %s: %w", remotePath, err) + if !lm.labels[tunKey] { + return nil // nothing to remove } - log.Printf("Removed remote Traefik config: %s", remotePath) + routerName := fmt.Sprintf("tunnel-%s-router", tunKey) + serviceName := fmt.Sprintf("tunnel-%s-service", tunKey) + + // Build the label-rm flags. + rmLabels := []string{ + fmt.Sprintf("traefik.http.routers.%s.rule", routerName), + fmt.Sprintf("traefik.http.routers.%s.entrypoints", routerName), + fmt.Sprintf("traefik.http.routers.%s.tls", routerName), + fmt.Sprintf("traefik.http.routers.%s.tls.certresolver", routerName), + fmt.Sprintf("traefik.http.routers.%s.service", routerName), + fmt.Sprintf("traefik.http.services.%s.loadbalancer.server.port", serviceName), + } + + cmd := fmt.Sprintf("docker service update --label-rm %s %s", + strings.Join(rmLabels, " --label-rm "), lm.serviceName) + + if err := lm.runRemote(cmd); err != nil { + return fmt.Errorf("remove labels for %s: %w", tunKey, err) + } + + delete(lm.labels, tunKey) + log.Printf("Removed Swarm labels for tunnel: %s", tunKey) return nil } -// runRemote executes a command on the remote Traefik host via SSH. +// labelFlag formats a --label-add value, quoting properly for shell. +func labelFlag(key, value string) string { + return fmt.Sprintf("'%s=%s'", key, value) +} + +// runRemote executes a command on the Swarm manager via SSH. func (lm *LabelManager) runRemote(cmd string) error { addr := lm.remoteHost if !strings.Contains(addr, ":") { @@ -122,39 +163,6 @@ func (lm *LabelManager) runRemote(cmd string) error { return nil } -// buildRouteConfig generates Traefik dynamic config YAML for one tunnel. -func buildRouteConfig( - routerName, serviceName, domain string, - port int, - entrypoint, certResolver string, -) string { - var b strings.Builder - - b.WriteString("# Auto-generated by tunnel-server. Do not edit.\n") - b.WriteString("http:\n") - - // Router - b.WriteString(" routers:\n") - b.WriteString(fmt.Sprintf(" %s:\n", routerName)) - b.WriteString(fmt.Sprintf(" rule: \"Host(`%s`)\"\n", domain)) - b.WriteString(" entryPoints:\n") - b.WriteString(fmt.Sprintf(" - %s\n", entrypoint)) - b.WriteString(" tls:\n") - b.WriteString(fmt.Sprintf(" certResolver: %s\n", certResolver)) - b.WriteString(fmt.Sprintf(" service: %s\n", serviceName)) - - // Service — points to the tunnel-server's allocated port. - // The tunnel-server container is on the same network as Traefik, - // so Traefik can reach it by container name or IP. - b.WriteString(" services:\n") - b.WriteString(fmt.Sprintf(" %s:\n", serviceName)) - b.WriteString(" loadBalancer:\n") - b.WriteString(" servers:\n") - b.WriteString(fmt.Sprintf(" - url: \"http://tunnel-server:%d\"\n", port)) - - return b.String() -} - // Close is a no-op — SSH connections are opened/closed per operation. func (lm *LabelManager) Close() error { return nil diff --git a/stack.production.yml b/stack.production.yml index 7ff32b1..e6fce2e 100644 --- a/stack.production.yml +++ b/stack.production.yml @@ -34,7 +34,7 @@ services: TRAEFIK_SSH_HOST: "ingress.nixc.us" TRAEFIK_SSH_USER: "root" TRAEFIK_SSH_KEY: "/run/secrets/traefik_deploy_key" - TRAEFIK_CONFIG_DIR: "/root/traefik/dynamic" + SWARM_SERVICE_NAME: "better-argo-tunnels_tunnel-server" TRAEFIK_ENTRYPOINT: "websecure" TRAEFIK_CERT_RESOLVER: "letsencryptresolver" HOSTNAME: "{{.Node.Hostname}}" @@ -43,16 +43,24 @@ services: TASK_ID: "{{.Task.ID}}" ENVIRONMENT: "production" ports: - - "2222:2222" - - "10000-10100:10000-10100" + - target: 2222 + published: 2222 + protocol: tcp + mode: host deploy: replicas: 1 placement: constraints: - - node.hostname == macmini1 + - node.hostname == ingress.nixc.us labels: traefik.enable: "true" - traefik.tcp.routers.tunnel-ssh-router.rule: "HostSNI(`*`)" - traefik.tcp.routers.tunnel-ssh-router.entrypoints: "ssh" - traefik.tcp.services.tunnel-ssh-service.loadbalancer.server.port: "2222" traefik.docker.network: "traefik" + # Dynamic tunnel labels are added at runtime via docker service update. + # The base labels below just enable Traefik discovery. + update_config: + order: stop-first + failure_action: rollback + delay: 0s + parallelism: 1 + restart_policy: + condition: on-failure