Fix stale Traefik labels causing 502 on tunnel reconnect

Three failure modes fixed:
- Startup: purge all tunnel-* labels so container restarts don't leave
  dead routes pointing to ports that no longer exist
- Reconnect: tear down old listener/port before registering new one for
  the same domain, preventing port leaks
- Race: cleanupConnection skips label removal if a newer connection has
  already taken over the tunnel key

Made-with: Cursor
Leopere 2026-02-28 20:03:57 -05:00
parent fd81852ea5
commit 8024cddc7c
Signed by: colin
SSH Key Fingerprint: SHA256:nRPCQTeMFLdGytxRQmPVK9VXY3/ePKQ5lGRyJhT5DY8
3 changed files with 121 additions and 14 deletions


@@ -78,6 +78,11 @@ func main() {
	}
	defer labels.Close()

	// Purge stale tunnel labels left by a previous container instance.
	if err := labels.PurgeAllTunnelLabels(); err != nil {
		log.Printf("WARN: failed to purge stale labels: %v", err)
	}

	// Initialize SSH server for tunnel clients.
	sshSrv, err := server.NewSSHServer(hostKeyPath, authKeysPath, pool, labels)
	if err != nil {
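For context on this first fix: after a container restart the port pool starts empty, so nothing listens on the ports that stale labels still point at, and Traefik answers 502 for those routes until the labels are removed. A minimal, self-contained sketch of the prefix match the purge relies on; the label names, domain, and port below are illustrative, not taken from this repository:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical labels left on the Swarm service by a previous instance.
	// Nothing listens on 42017 after the restart, so Traefik would keep
	// returning 502 for app1.example.com until these labels are removed.
	stale := map[string]string{
		"traefik.http.routers.tunnel-app1.rule":                      "Host(`app1.example.com`)",
		"traefik.http.services.tunnel-app1.loadbalancer.server.port": "42017",
		"traefik.enable": "true", // not tunnel-*: never touched by the purge
	}

	// Same prefix test PurgeAllTunnelLabels applies to .Spec.Labels.
	for key := range stale {
		if strings.HasPrefix(key, "traefik.http.routers.tunnel-") ||
			strings.HasPrefix(key, "traefik.http.services.tunnel-") ||
			strings.HasPrefix(key, "traefik.http.middlewares.tunnel-") {
			fmt.Println("would purge:", key)
		}
	}
}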


@@ -1,6 +1,7 @@
package server

import (
	"encoding/json"
	"fmt"
	"log"
	"strings"
@@ -56,6 +57,69 @@ func NewLabelManager(
	return lm, nil
}

// PurgeAllTunnelLabels removes every tunnel-* Traefik label from the Swarm
// service. Called on startup so stale labels from a previous instance don't
// route traffic to ports that no longer exist.
func (lm *LabelManager) PurgeAllTunnelLabels() error {
	addr := lm.remoteHost
	if !strings.Contains(addr, ":") {
		addr = addr + ":22"
	}

	config := &ssh.ClientConfig{
		User:            lm.remoteUser,
		Auth:            []ssh.AuthMethod{ssh.PublicKeys(lm.signer)},
		HostKeyCallback: ssh.InsecureIgnoreHostKey(),
	}

	client, err := ssh.Dial("tcp", addr, config)
	if err != nil {
		return fmt.Errorf("SSH dial %s: %w", addr, err)
	}
	defer client.Close()

	session, err := client.NewSession()
	if err != nil {
		return fmt.Errorf("SSH session: %w", err)
	}
	defer session.Close()

	inspectCmd := fmt.Sprintf(
		"docker service inspect --format '{{json .Spec.Labels}}' %s", lm.serviceName)
	output, err := session.CombinedOutput(inspectCmd)
	if err != nil {
		return fmt.Errorf("inspect labels: %w (%s)", err, string(output))
	}

	var labels map[string]string
	if err := json.Unmarshal(output, &labels); err != nil {
		return fmt.Errorf("parse labels JSON: %w", err)
	}

	var rmLabels []string
	for key := range labels {
		if strings.HasPrefix(key, "traefik.http.routers.tunnel-") ||
			strings.HasPrefix(key, "traefik.http.services.tunnel-") ||
			strings.HasPrefix(key, "traefik.http.middlewares.tunnel-") {
			rmLabels = append(rmLabels, key)
		}
	}

	if len(rmLabels) == 0 {
		log.Println("No stale tunnel labels to purge")
		return nil
	}

	rmCmd := fmt.Sprintf("docker service update --label-rm %s %s",
		strings.Join(rmLabels, " --label-rm "), lm.serviceName)
	if err := lm.runRemote(rmCmd); err != nil {
		return fmt.Errorf("purge tunnel labels: %w", err)
	}

	log.Printf("Purged %d stale tunnel labels on startup", len(rmLabels))
	return nil
}

// Add adds Traefik routing labels to the Swarm service for a tunnel.
// If authUser and authPass are non-empty, a basicauth middleware is also added.
func (lm *LabelManager) Add(tunKey, domain string, port int, authUser, authPass string) error {
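Two details of PurgeAllTunnelLabels are worth noting, shown in a small self-contained sketch (the simulated inspect output and label keys are hypothetical, not from the repository): a label set that comes back empty or as `null` unmarshals into a map with zero keys, so the range loop does nothing and the function just logs that there is nothing to purge; and the removal is a single `docker service update` built with one `--label-rm` per stale key.

package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

func main() {
	// Simulated `docker service inspect --format '{{json .Spec.Labels}}'`
	// output for a service with no labels at all.
	output := []byte("null\n")

	var labels map[string]string
	if err := json.Unmarshal(output, &labels); err != nil {
		panic(err) // "null" is valid JSON for a map, so this does not fire
	}
	fmt.Println("nil map, zero keys to purge:", labels == nil, len(labels))

	// Shape of the removal command for two hypothetical stale keys.
	rm := []string{
		"traefik.http.routers.tunnel-app1.rule",
		"traefik.http.services.tunnel-app1.loadbalancer.server.port",
	}
	fmt.Printf("docker service update --label-rm %s my-service\n",
		strings.Join(rm, " --label-rm "))
}

For the Add signature shown above, a call such as lm.Add(tunKey, "app1.example.com", 42018, "", "") (values illustrative) registers the routing labels without the optional basicauth middleware, since both credentials are empty.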


@@ -5,6 +5,7 @@ import (
	"io"
	"log"
	"net"
	"strings"
	"sync"

	"golang.org/x/crypto/ssh"
@@ -161,7 +162,14 @@ func (s *SSHServer) handleForwardRequest(
		authPass: authPass,
	}

	// If a previous tunnel exists for this domain (reconnect), tear it down
	// so we don't leak ports/listeners and so Traefik labels stay consistent.
	s.mu.Lock()
	if old, exists := s.activeTuns[tunKey]; exists && old.listener != nil {
		log.Printf("Replacing stale tunnel %s (old port %d, new port %d)", tunKey, old.port, port)
		old.listener.Close()
		s.pool.Release(old.port)
	}
	s.activeTuns[tunKey] = tun
	s.mu.Unlock()
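The reconnect path above closes the old listener and returns its port to the pool before the map entry is overwritten; skipping that teardown is exactly how ports leak. A standalone sketch of the same pattern, using toy types and a map as a stand-in for the repo's port pool (none of these names come from the repository):

package main

import (
	"fmt"
	"net"
	"sync"
)

type tunnel struct {
	port     int
	listener net.Listener
}

func main() {
	var mu sync.Mutex
	active := map[string]*tunnel{}
	inUse := map[int]bool{} // toy stand-in for the port pool

	register := func(key string) *tunnel {
		ln, err := net.Listen("tcp", "127.0.0.1:0")
		if err != nil {
			panic(err)
		}
		port := ln.Addr().(*net.TCPAddr).Port

		mu.Lock()
		defer mu.Unlock()
		// Reconnect: tear down the previous tunnel for this key first,
		// otherwise its listener stays bound and its port is never released.
		if old, ok := active[key]; ok && old.listener != nil {
			old.listener.Close()
			delete(inUse, old.port)
		}
		inUse[port] = true
		t := &tunnel{port: port, listener: ln}
		active[key] = t
		return t
	}

	first := register("app1.example.com")
	second := register("app1.example.com") // same domain reconnects
	fmt.Printf("old port %d released: %v; new port %d in use: %v\n",
		first.port, !inUse[first.port], second.port, inUse[second.port])
}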
@@ -231,26 +239,56 @@ func forwardConnection(
	wg.Wait()
}

// cleanupConnection removes all tunnels associated with a closed SSH connection.
// cleanupConnection removes tunnels associated with a closed SSH connection.
// It collects work under the lock, then performs slow label removal outside it.
func (s *SSHServer) cleanupConnection(connKey string) {
	s.mu.Lock()
	defer s.mu.Unlock()
	type cleanupItem struct {
		key      string
		port     int
		listener net.Listener
		ownedNow bool // true if this connKey still owns the map entry
	}

	s.mu.Lock()
	var items []cleanupItem
	for key, tun := range s.activeTuns {
		if tun.connKey != connKey {
			continue
		}
		if tun.listener != nil {
			tun.listener.Close()
			s.pool.Release(tun.port)
		}
		if err := s.labels.Remove(key); err != nil {
			log.Printf("WARN: failed to remove labels for %s: %v", key, err)
		}
		log.Printf("Cleaned up tunnel %s (port %d, conn=%s)", key, tun.port, connKey)
		items = append(items, cleanupItem{
			key:      key,
			port:     tun.port,
			listener: tun.listener,
			ownedNow: true,
		})
		delete(s.activeTuns, key)
	}
	s.mu.Unlock()

	for _, item := range items {
		if item.listener != nil {
			item.listener.Close()
			s.pool.Release(item.port)
		}

		// Re-check ownership: a reconnecting client may have inserted a new
		// entry for this key between our Unlock above and now.
		s.mu.Lock()
		current, replaced := s.activeTuns[item.key]
		s.mu.Unlock()
		if replaced && current.connKey != connKey {
			log.Printf("Skipping label removal for %s — replaced by conn %s", item.key, current.connKey)
			continue
		}

		if strings.HasSuffix(item.key, "-meta") {
			continue
		}

		if err := s.labels.Remove(item.key); err != nil {
			log.Printf("WARN: failed to remove labels for %s: %v", item.key, err)
		}
		log.Printf("Cleaned up tunnel %s (port %d, conn=%s)", item.key, item.port, connKey)
	}
}
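The ownership re-check above covers the window between releasing the lock and removing labels: a reconnecting client can re-register the same tunnel key in that gap, and removing its labels would break the brand-new tunnel. A toy, sequential reproduction of that ordering; the key and connection names are illustrative, not the repo's actual values:

package main

import "fmt"

type tun struct{ connKey string }

func main() {
	active := map[string]*tun{"app1.example.com": {connKey: "conn-old"}}

	// 1. cleanupConnection("conn-old") collects its tunnels and deletes the
	//    map entries while holding the lock.
	var items []string
	for key, t := range active {
		if t.connKey == "conn-old" {
			items = append(items, key)
			delete(active, key)
		}
	}

	// 2. Before the slow label removal runs, the client reconnects and
	//    handleForwardRequest registers a fresh tunnel under the same key.
	active["app1.example.com"] = &tun{connKey: "conn-new"}

	// 3. The re-check: if the key is now owned by a different connection,
	//    removing its Traefik labels would take down the new tunnel.
	for _, key := range items {
		if cur, ok := active[key]; ok && cur.connKey != "conn-old" {
			fmt.Println("skip label removal for", key, "- now owned by", cur.connKey)
			continue
		}
		fmt.Println("remove labels for", key)
	}
}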