From 8024cddc7cc3b31463f93fdd1749e19ae79c9fc2 Mon Sep 17 00:00:00 2001 From: Leopere Date: Sat, 28 Feb 2026 20:03:57 -0500 Subject: [PATCH] Fix stale Traefik labels causing 502 on tunnel reconnect Three failure modes fixed: - Startup: purge all tunnel-* labels so container restarts don't leave dead routes pointing to ports that no longer exist - Reconnect: tear down old listener/port before registering new one for the same domain, preventing port leaks - Race: cleanupConnection skips label removal if a newer connection has already taken over the tunnel key Made-with: Cursor --- cmd/server/main.go | 5 +++ internal/server/labels.go | 64 +++++++++++++++++++++++++++++++++++++ internal/server/tunnel.go | 66 ++++++++++++++++++++++++++++++--------- 3 files changed, 121 insertions(+), 14 deletions(-) diff --git a/cmd/server/main.go b/cmd/server/main.go index 91c4794..4053c24 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -78,6 +78,11 @@ func main() { } defer labels.Close() + // Purge stale tunnel labels left by a previous container instance. + if err := labels.PurgeAllTunnelLabels(); err != nil { + log.Printf("WARN: failed to purge stale labels: %v", err) + } + // Initialize SSH server for tunnel clients. sshSrv, err := server.NewSSHServer(hostKeyPath, authKeysPath, pool, labels) if err != nil { diff --git a/internal/server/labels.go b/internal/server/labels.go index bb9cd83..f4353f5 100644 --- a/internal/server/labels.go +++ b/internal/server/labels.go @@ -1,6 +1,7 @@ package server import ( + "encoding/json" "fmt" "log" "strings" @@ -56,6 +57,69 @@ func NewLabelManager( return lm, nil } +// PurgeAllTunnelLabels removes every tunnel-* Traefik label from the Swarm +// service. Called on startup so stale labels from a previous instance don't +// route traffic to ports that no longer exist. 
+func (lm *LabelManager) PurgeAllTunnelLabels() error {
+	addr := lm.remoteHost
+	if !strings.Contains(addr, ":") {
+		addr += ":22" // remoteHost carries no port: fall back to SSH's default
+	}
+
+	config := &ssh.ClientConfig{
+		User:            lm.remoteUser,
+		Auth:            []ssh.AuthMethod{ssh.PublicKeys(lm.signer)},
+		HostKeyCallback: ssh.InsecureIgnoreHostKey(), // NOTE(review): remote host key is not verified
+	}
+
+	client, err := ssh.Dial("tcp", addr, config)
+	if err != nil {
+		return fmt.Errorf("SSH dial %s: %w", addr, err)
+	}
+	defer client.Close()
+
+	session, err := client.NewSession()
+	if err != nil {
+		return fmt.Errorf("SSH session: %w", err)
+	}
+	defer session.Close()
+
+	var stderr strings.Builder
+	session.Stderr = &stderr // keep the remote stderr out of the JSON read from stdout
+	inspectCmd := fmt.Sprintf("docker service inspect --format '{{json .Spec.Labels}}' %s", lm.serviceName)
+	output, err := session.Output(inspectCmd)
+	if err != nil {
+		return fmt.Errorf("inspect labels: %w (%s)", err, strings.TrimSpace(stderr.String()))
+	}
+
+	var labels map[string]string
+	if err := json.Unmarshal(output, &labels); err != nil {
+		return fmt.Errorf("parse labels JSON: %w", err)
+	}
+
+	var rmLabels []string
+	for key := range labels {
+		if strings.HasPrefix(key, "traefik.http.routers.tunnel-") ||
+			strings.HasPrefix(key, "traefik.http.services.tunnel-") ||
+			strings.HasPrefix(key, "traefik.http.middlewares.tunnel-") {
+			rmLabels = append(rmLabels, key)
+		}
+	}
+
+	if len(rmLabels) == 0 {
+		log.Println("No stale tunnel labels to purge")
+		return nil
+	}
+
+	rmCmd := fmt.Sprintf("docker service update --label-rm %s %s", strings.Join(rmLabels, " --label-rm "), lm.serviceName)
+	if err := lm.runRemote(rmCmd); err != nil {
+		return fmt.Errorf("purge tunnel labels: %w", err)
+	}
+
+	log.Printf("Purged %d stale tunnel labels on startup", len(rmLabels))
+	return nil
+}
+
 // Add adds Traefik routing labels to the Swarm service for a tunnel.
 // If authUser and authPass are non-empty, a basicauth middleware is also added.
func (lm *LabelManager) Add(tunKey, domain string, port int, authUser, authPass string) error { diff --git a/internal/server/tunnel.go b/internal/server/tunnel.go index 6237ecb..0a135b8 100644 --- a/internal/server/tunnel.go +++ b/internal/server/tunnel.go @@ -5,6 +5,7 @@ import ( "io" "log" "net" + "strings" "sync" "golang.org/x/crypto/ssh" @@ -161,7 +162,14 @@ func (s *SSHServer) handleForwardRequest( authPass: authPass, } + // If a previous tunnel exists for this domain (reconnect), tear it down + // so we don't leak ports/listeners and so Traefik labels stay consistent. s.mu.Lock() + if old, exists := s.activeTuns[tunKey]; exists && old.listener != nil { + log.Printf("Replacing stale tunnel %s (old port %d, new port %d)", tunKey, old.port, port) + old.listener.Close() + s.pool.Release(old.port) + } s.activeTuns[tunKey] = tun s.mu.Unlock() @@ -231,26 +239,56 @@ func forwardConnection( wg.Wait() } -// cleanupConnection removes all tunnels associated with a closed SSH connection. +// cleanupConnection removes tunnels associated with a closed SSH connection. +// It collects work under the lock, then performs slow label removal outside it. 
func (s *SSHServer) cleanupConnection(connKey string) {
-	s.mu.Lock()
-	defer s.mu.Unlock()
+	// cleanupItem snapshots what is needed to finish a tunnel's teardown
+	// once the map lock has been released.
+	type cleanupItem struct {
+		key      string
+		port     int
+		listener net.Listener
+	}
 
+	s.mu.Lock()
+	var items []cleanupItem
 	for key, tun := range s.activeTuns {
 		if tun.connKey != connKey {
 			continue
 		}
-
-		if tun.listener != nil {
-			tun.listener.Close()
-			s.pool.Release(tun.port)
-		}
-
-		if err := s.labels.Remove(key); err != nil {
-			log.Printf("WARN: failed to remove labels for %s: %v", key, err)
-		}
-
-		log.Printf("Cleaned up tunnel %s (port %d, conn=%s)", key, tun.port, connKey)
+		items = append(items, cleanupItem{
+			key:      key,
+			port:     tun.port,
+			listener: tun.listener,
+		})
 		delete(s.activeTuns, key)
 	}
+	s.mu.Unlock()
+
+	for _, item := range items {
+		if item.listener != nil {
+			item.listener.Close()
+			s.pool.Release(item.port)
+		}
+
+		// Re-check ownership: a reconnecting client may have inserted a new
+		// entry for this key between our Unlock above and now.
+		s.mu.Lock()
+		current, replaced := s.activeTuns[item.key]
+		s.mu.Unlock() // NOTE(review): a takeover after this point can still race with Remove below
+
+		if replaced && current.connKey != connKey {
+			log.Printf("Skipping label removal for %s — replaced by conn %s", item.key, current.connKey)
+			continue
+		}
+
+		if strings.HasSuffix(item.key, "-meta") { // NOTE(review): presumably "-meta" keys carry no labels — confirm
+			continue
+		}
+
+		if err := s.labels.Remove(item.key); err != nil {
+			log.Printf("WARN: failed to remove labels for %s: %v", item.key, err)
+		}
+		log.Printf("Cleaned up tunnel %s (port %d, conn=%s)", item.key, item.port, connKey)
+	}
 }