wgengine/magicsock: move UDP relay path discovery to heartbeat() (#16407)

This was previously hooked around direct UDP path discovery /
CallMeMaybe transmission, and related conditions. Now it is subject to
relay-specific considerations.

Updates tailscale/corp#27502

Signed-off-by: Jordan Whited <jordan@tailscale.com>
This commit is contained in:
Jordan Whited 2025-06-27 13:56:55 -07:00 committed by GitHub
parent 711698f5a9
commit 0a64e86a0d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 118 additions and 18 deletions

View File

@ -75,11 +75,12 @@ type endpoint struct {
// mu protects all following fields.
mu sync.Mutex // Lock ordering: Conn.mu, then endpoint.mu
heartBeatTimer *time.Timer // nil when idle
lastSendExt mono.Time // last time there were outgoing packets sent to this peer from an external trigger (e.g. wireguard-go or disco pingCLI)
lastSendAny mono.Time // last time there were outgoing packets sent this peer from any trigger, internal or external to magicsock
lastFullPing mono.Time // last time we pinged all disco or wireguard only endpoints
derpAddr netip.AddrPort // fallback/bootstrap path, if non-zero (non-zero for well-behaved clients)
heartBeatTimer *time.Timer // nil when idle
lastSendExt mono.Time // last time there were outgoing packets sent to this peer from an external trigger (e.g. wireguard-go or disco pingCLI)
lastSendAny mono.Time // last time there were outgoing packets sent this peer from any trigger, internal or external to magicsock
lastFullPing mono.Time // last time we pinged all disco or wireguard only endpoints
lastUDPRelayPathDiscovery mono.Time // last time we ran UDP relay path discovery
derpAddr netip.AddrPort // fallback/bootstrap path, if non-zero (non-zero for well-behaved clients)
bestAddr addrQuality // best non-DERP path; zero if none; mutate via setBestAddrLocked()
bestAddrAt mono.Time // time best address re-confirmed
@ -851,6 +852,10 @@ func (de *endpoint) heartbeat() {
de.sendDiscoPingsLocked(now, true)
}
if de.wantUDPRelayPathDiscoveryLocked(now) {
de.discoverUDPRelayPathsLocked(now)
}
de.heartBeatTimer = time.AfterFunc(heartbeatInterval, de.heartbeat)
}
@ -861,6 +866,45 @@ func (de *endpoint) setHeartbeatDisabled(v bool) {
de.heartbeatDisabled = v
}
// discoverUDPRelayPathsLocked starts UDP relay path discovery.
func (de *endpoint) discoverUDPRelayPathsLocked(now mono.Time) {
// TODO(jwhited): return early if there are no relay servers set, otherwise
// we spin up and down relayManager.runLoop unnecessarily.
de.lastUDPRelayPathDiscovery = now
de.c.relayManager.allocateAndHandshakeAllServers(de)
}
// wantUDPRelayPathDiscoveryLocked reports whether we should kick off UDP relay
// path discovery.
func (de *endpoint) wantUDPRelayPathDiscoveryLocked(now mono.Time) bool {
if runtime.GOOS == "js" {
return false
}
if !de.relayCapable {
return false
}
if de.bestAddr.isDirect() && now.Before(de.trustBestAddrUntil) {
return false
}
if !de.lastUDPRelayPathDiscovery.IsZero() && now.Sub(de.lastUDPRelayPathDiscovery) < discoverUDPRelayPathsInterval {
return false
}
// TODO(jwhited): consider applying 'goodEnoughLatency' suppression here,
// but not until we have a strategy for triggering CallMeMaybeVia regularly
// and/or enabling inbound packets to act as a UDP relay path discovery
// trigger, otherwise clients without relay servers may fall off a UDP
// relay path and never come back. They are dependent on the remote side
// regularly TX'ing CallMeMaybeVia, which currently only happens as part
// of full UDP relay path discovery.
if now.After(de.trustBestAddrUntil) {
return true
}
if !de.lastUDPRelayPathDiscovery.IsZero() && now.Sub(de.lastUDPRelayPathDiscovery) >= upgradeUDPRelayInterval {
return true
}
return false
}
// wantFullPingLocked reports whether we should ping to all our peers looking for
// a better path.
//
@ -869,7 +913,7 @@ func (de *endpoint) wantFullPingLocked(now mono.Time) bool {
if runtime.GOOS == "js" {
return false
}
if !de.bestAddr.ap.IsValid() || de.lastFullPing.IsZero() {
if !de.bestAddr.isDirect() || de.lastFullPing.IsZero() {
return true
}
if now.After(de.trustBestAddrUntil) {
@ -878,7 +922,7 @@ func (de *endpoint) wantFullPingLocked(now mono.Time) bool {
if de.bestAddr.latency <= goodEnoughLatency {
return false
}
if now.Sub(de.lastFullPing) >= upgradeInterval {
if now.Sub(de.lastFullPing) >= upgradeUDPDirectInterval {
return true
}
return false
@ -964,9 +1008,16 @@ func (de *endpoint) send(buffs [][]byte, offset int) error {
if startWGPing {
de.sendWireGuardOnlyPingsLocked(now)
}
} else if !udpAddr.ap.IsValid() || now.After(de.trustBestAddrUntil) {
} else if !udpAddr.isDirect() || now.After(de.trustBestAddrUntil) {
de.sendDiscoPingsLocked(now, true)
}
// TODO(jwhited): consider triggering UDP relay path discovery here under
// certain conditions. We currently only trigger it in heartbeat(), which
// is both good and bad. It's good because the first heartbeat() tick is 3s
// after the first packet, which gives us time to discover a UDP direct
// path and potentially avoid what would be wasted UDP relay path discovery
// work. It's bad because we might not discover a UDP direct path, and we
// incur a 3s delay before we try to discover a UDP relay path.
de.noteTxActivityExtTriggerLocked(now)
de.lastSendAny = now
de.mu.Unlock()
@ -1275,13 +1326,6 @@ func (de *endpoint) sendDiscoPingsLocked(now mono.Time, sendCallMeMaybe bool) {
// sent so our firewall ports are probably open and now
// would be a good time for them to connect.
go de.c.enqueueCallMeMaybe(derpAddr, de)
// Schedule allocation of relay endpoints. We make no considerations for
// current relay endpoints or best UDP path state for now, keep it
// simple.
if de.relayCapable {
go de.c.relayManager.allocateAndHandshakeAllServers(de)
}
}
}
@ -1703,6 +1747,12 @@ type epAddr struct {
vni virtualNetworkID // vni.isSet() indicates if this [epAddr] involves a Geneve header
}
// isDirect returns true if e.ap is valid and not tailcfg.DerpMagicIPAddr,
// and a VNI is not set.
func (e epAddr) isDirect() bool {
return e.ap.IsValid() && e.ap.Addr() != tailcfg.DerpMagicIPAddr && !e.vni.isSet()
}
func (e epAddr) String() string {
if !e.vni.isSet() {
return e.ap.String()

View File

@ -8,6 +8,7 @@ import (
"testing"
"time"
"tailscale.com/tailcfg"
"tailscale.com/types/key"
)
@ -323,3 +324,44 @@ func Test_endpoint_maybeProbeUDPLifetimeLocked(t *testing.T) {
})
}
}
func Test_epAddr_isDirectUDP(t *testing.T) {
vni := virtualNetworkID{}
vni.set(7)
tests := []struct {
name string
ap netip.AddrPort
vni virtualNetworkID
want bool
}{
{
name: "true",
ap: netip.MustParseAddrPort("192.0.2.1:7"),
vni: virtualNetworkID{},
want: true,
},
{
name: "false derp magic addr",
ap: netip.AddrPortFrom(tailcfg.DerpMagicIPAddr, 0),
vni: virtualNetworkID{},
want: false,
},
{
name: "false vni set",
ap: netip.MustParseAddrPort("192.0.2.1:7"),
vni: vni,
want: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
e := epAddr{
ap: tt.ap,
vni: tt.vni,
}
if got := e.isDirect(); got != tt.want {
t.Errorf("isDirect() = %v, want %v", got, tt.want)
}
})
}
}

View File

@ -3494,9 +3494,17 @@ const (
// keep NAT mappings alive.
sessionActiveTimeout = 45 * time.Second
// upgradeInterval is how often we try to upgrade to a better path
// even if we have some non-DERP route that works.
upgradeInterval = 1 * time.Minute
// upgradeUDPDirectInterval is how often we try to upgrade to a better,
// direct UDP path even if we have some direct UDP path that works.
upgradeUDPDirectInterval = 1 * time.Minute
// upgradeUDPRelayInterval is how often we try to discover UDP relay paths
// even if we have a UDP relay path that works.
upgradeUDPRelayInterval = 1 * time.Minute
// discoverUDPRelayPathsInterval is the minimum time between UDP relay path
// discovery.
discoverUDPRelayPathsInterval = 30 * time.Second
// heartbeatInterval is how often pings to the best UDP address
// are sent.