Fix stale Traefik labels causing 502 on tunnel reconnect
ci/woodpecker/push/woodpecker Pipeline was successful
Details
ci/woodpecker/push/woodpecker Pipeline was successful
Details
Three failure modes fixed:
- Startup: purge all tunnel-* labels so container restarts don't leave dead routes pointing to ports that no longer exist.
- Reconnect: tear down the old listener/port before registering a new one for the same domain, preventing port leaks.
- Race: cleanupConnection skips label removal if a newer connection has already taken over the tunnel key.

Made-with: Cursor
This commit is contained in:
parent
fd81852ea5
commit
8024cddc7c
|
|
@ -78,6 +78,11 @@ func main() {
|
|||
}
|
||||
defer labels.Close()
|
||||
|
||||
// Purge stale tunnel labels left by a previous container instance.
|
||||
if err := labels.PurgeAllTunnelLabels(); err != nil {
|
||||
log.Printf("WARN: failed to purge stale labels: %v", err)
|
||||
}
|
||||
|
||||
// Initialize SSH server for tunnel clients.
|
||||
sshSrv, err := server.NewSSHServer(hostKeyPath, authKeysPath, pool, labels)
|
||||
if err != nil {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package server
|
||||
|
||||
import (
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"net"
	"strings"
|
||||
|
|
@ -56,6 +57,69 @@ func NewLabelManager(
|
|||
return lm, nil
|
||||
}
|
||||
|
||||
// PurgeAllTunnelLabels removes every tunnel-* Traefik label from the Swarm
|
||||
// service. Called on startup so stale labels from a previous instance don't
|
||||
// route traffic to ports that no longer exist.
|
||||
func (lm *LabelManager) PurgeAllTunnelLabels() error {
|
||||
addr := lm.remoteHost
|
||||
if !strings.Contains(addr, ":") {
|
||||
addr = addr + ":22"
|
||||
}
|
||||
|
||||
config := &ssh.ClientConfig{
|
||||
User: lm.remoteUser,
|
||||
Auth: []ssh.AuthMethod{ssh.PublicKeys(lm.signer)},
|
||||
HostKeyCallback: ssh.InsecureIgnoreHostKey(),
|
||||
}
|
||||
|
||||
client, err := ssh.Dial("tcp", addr, config)
|
||||
if err != nil {
|
||||
return fmt.Errorf("SSH dial %s: %w", addr, err)
|
||||
}
|
||||
defer client.Close()
|
||||
|
||||
session, err := client.NewSession()
|
||||
if err != nil {
|
||||
return fmt.Errorf("SSH session: %w", err)
|
||||
}
|
||||
defer session.Close()
|
||||
|
||||
inspectCmd := fmt.Sprintf(
|
||||
"docker service inspect --format '{{json .Spec.Labels}}' %s", lm.serviceName)
|
||||
output, err := session.CombinedOutput(inspectCmd)
|
||||
if err != nil {
|
||||
return fmt.Errorf("inspect labels: %w (%s)", err, string(output))
|
||||
}
|
||||
|
||||
var labels map[string]string
|
||||
if err := json.Unmarshal(output, &labels); err != nil {
|
||||
return fmt.Errorf("parse labels JSON: %w", err)
|
||||
}
|
||||
|
||||
var rmLabels []string
|
||||
for key := range labels {
|
||||
if strings.HasPrefix(key, "traefik.http.routers.tunnel-") ||
|
||||
strings.HasPrefix(key, "traefik.http.services.tunnel-") ||
|
||||
strings.HasPrefix(key, "traefik.http.middlewares.tunnel-") {
|
||||
rmLabels = append(rmLabels, key)
|
||||
}
|
||||
}
|
||||
|
||||
if len(rmLabels) == 0 {
|
||||
log.Println("No stale tunnel labels to purge")
|
||||
return nil
|
||||
}
|
||||
|
||||
rmCmd := fmt.Sprintf("docker service update --label-rm %s %s",
|
||||
strings.Join(rmLabels, " --label-rm "), lm.serviceName)
|
||||
if err := lm.runRemote(rmCmd); err != nil {
|
||||
return fmt.Errorf("purge tunnel labels: %w", err)
|
||||
}
|
||||
|
||||
log.Printf("Purged %d stale tunnel labels on startup", len(rmLabels))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Add adds Traefik routing labels to the Swarm service for a tunnel.
|
||||
// If authUser and authPass are non-empty, a basicauth middleware is also added.
|
||||
func (lm *LabelManager) Add(tunKey, domain string, port int, authUser, authPass string) error {
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import (
|
|||
"io"
|
||||
"log"
|
||||
"net"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"golang.org/x/crypto/ssh"
|
||||
|
|
@ -161,7 +162,14 @@ func (s *SSHServer) handleForwardRequest(
|
|||
authPass: authPass,
|
||||
}
|
||||
|
||||
// If a previous tunnel exists for this domain (reconnect), tear it down
|
||||
// so we don't leak ports/listeners and so Traefik labels stay consistent.
|
||||
s.mu.Lock()
|
||||
if old, exists := s.activeTuns[tunKey]; exists && old.listener != nil {
|
||||
log.Printf("Replacing stale tunnel %s (old port %d, new port %d)", tunKey, old.port, port)
|
||||
old.listener.Close()
|
||||
s.pool.Release(old.port)
|
||||
}
|
||||
s.activeTuns[tunKey] = tun
|
||||
s.mu.Unlock()
|
||||
|
||||
|
|
@ -231,26 +239,56 @@ func forwardConnection(
|
|||
wg.Wait()
|
||||
}
|
||||
|
||||
// cleanupConnection removes all tunnels associated with a closed SSH connection.
|
||||
// cleanupConnection removes tunnels associated with a closed SSH connection.
|
||||
// It collects work under the lock, then performs slow label removal outside it.
|
||||
func (s *SSHServer) cleanupConnection(connKey string) {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
type cleanupItem struct {
|
||||
key string
|
||||
port int
|
||||
listener net.Listener
|
||||
ownedNow bool // true if this connKey still owns the map entry
|
||||
}
|
||||
|
||||
s.mu.Lock()
|
||||
var items []cleanupItem
|
||||
for key, tun := range s.activeTuns {
|
||||
if tun.connKey != connKey {
|
||||
continue
|
||||
}
|
||||
|
||||
if tun.listener != nil {
|
||||
tun.listener.Close()
|
||||
s.pool.Release(tun.port)
|
||||
}
|
||||
|
||||
if err := s.labels.Remove(key); err != nil {
|
||||
log.Printf("WARN: failed to remove labels for %s: %v", key, err)
|
||||
}
|
||||
|
||||
log.Printf("Cleaned up tunnel %s (port %d, conn=%s)", key, tun.port, connKey)
|
||||
items = append(items, cleanupItem{
|
||||
key: key,
|
||||
port: tun.port,
|
||||
listener: tun.listener,
|
||||
ownedNow: true,
|
||||
})
|
||||
delete(s.activeTuns, key)
|
||||
}
|
||||
s.mu.Unlock()
|
||||
|
||||
for _, item := range items {
|
||||
if item.listener != nil {
|
||||
item.listener.Close()
|
||||
s.pool.Release(item.port)
|
||||
}
|
||||
|
||||
// Re-check ownership: a reconnecting client may have inserted a new
|
||||
// entry for this key between our Unlock above and now.
|
||||
s.mu.Lock()
|
||||
current, replaced := s.activeTuns[item.key]
|
||||
s.mu.Unlock()
|
||||
|
||||
if replaced && current.connKey != connKey {
|
||||
log.Printf("Skipping label removal for %s — replaced by conn %s", item.key, current.connKey)
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.HasSuffix(item.key, "-meta") {
|
||||
continue
|
||||
}
|
||||
|
||||
if err := s.labels.Remove(item.key); err != nil {
|
||||
log.Printf("WARN: failed to remove labels for %s: %v", item.key, err)
|
||||
}
|
||||
log.Printf("Cleaned up tunnel %s (port %d, conn=%s)", item.key, item.port, connKey)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue