diff --git a/wgengine/magicsock/endpoint.go b/wgengine/magicsock/endpoint.go
index 48d5ef5a1..4df2a7315 100644
--- a/wgengine/magicsock/endpoint.go
+++ b/wgengine/magicsock/endpoint.go
@@ -899,8 +899,16 @@ func (de *endpoint) wantUDPRelayPathDiscoveryLocked(now mono.Time) bool {
 	if de.bestAddr.isDirect() && now.Before(de.trustBestAddrUntil) {
 		return false
 	}
-	if !de.lastUDPRelayPathDiscovery.IsZero() && now.Sub(de.lastUDPRelayPathDiscovery) < discoverUDPRelayPathsInterval {
-		return false
+	if de.lastUDPRelayPathDiscovery.IsZero() {
+		return true
+	}
+	// TODO(jwhited): consider suppressing peer relay path discovery if
+	// time since de.lastSendExt is greater than some value (maybe 2x
+	// heartbeatInterval). Otherwise, [endpoint.heartbeat] can be extended
+	// fairly inefficiently into the future as peer relay path discovery moves
+	// de.lastSendExt forward.
+	if now.Sub(de.lastUDPRelayPathDiscovery) > discoverUDPRelayPathsInterval {
+		return true
 	}
 	// TODO(jwhited): consider applying 'goodEnoughLatency' suppression here,
 	// but not until we have a strategy for triggering CallMeMaybeVia regularly
@@ -909,12 +917,6 @@ func (de *endpoint) wantUDPRelayPathDiscoveryLocked(now mono.Time) bool {
 	// relay path and never come back. They are dependent on the remote side
 	// regularly TX'ing CallMeMaybeVia, which currently only happens as part
 	// of full UDP relay path discovery.
-	if now.After(de.trustBestAddrUntil) {
-		return true
-	}
-	if !de.lastUDPRelayPathDiscovery.IsZero() && now.Sub(de.lastUDPRelayPathDiscovery) >= upgradeUDPRelayInterval {
-		return true
-	}
 	return false
 }
 
diff --git a/wgengine/magicsock/endpoint_test.go b/wgengine/magicsock/endpoint_test.go
index 92f4ef1d3..e1d52c44b 100644
--- a/wgengine/magicsock/endpoint_test.go
+++ b/wgengine/magicsock/endpoint_test.go
@@ -454,3 +454,11 @@ func Test_endpoint_udpRelayEndpointReady(t *testing.T) {
 		})
 	}
 }
+
+func TestPeerRelayPathDiscoveryInterval(t *testing.T) {
+	schedulingJitter := time.Second * 3
+	mustBeGreaterThan := sessionActiveTimeout + peerRelayAllocReqMaxSendDuration + schedulingJitter
+	if discoverUDPRelayPathsInterval <= mustBeGreaterThan {
+		t.Fatalf("discoverUDPRelayPathsInterval is too short (%v) want > %v, see http://go/corp/30534", discoverUDPRelayPathsInterval, mustBeGreaterThan)
+	}
+}
diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go
index ad07003f7..954e4d8a8 100644
--- a/wgengine/magicsock/magicsock.go
+++ b/wgengine/magicsock/magicsock.go
@@ -3597,13 +3597,12 @@ const (
 	// direct UDP path even if we have some direct UDP path that works.
 	upgradeUDPDirectInterval = 1 * time.Minute
 
-	// upgradeUDPRelayInterval is how often we try to discover UDP relay paths
-	// even if we have a UDP relay path that works.
-	upgradeUDPRelayInterval = 1 * time.Minute
-
 	// discoverUDPRelayPathsInterval is the minimum time between UDP relay path
-	// discovery.
-	discoverUDPRelayPathsInterval = 30 * time.Second
+	// discovery. discoverUDPRelayPathsInterval MUST be greater than the sum of
+	// [sessionActiveTimeout] + [peerRelayAllocReqMaxSendDuration] + some jitter
+	// to account for delays in scheduling work. Otherwise, peer relay path
+	// discovery can hold a session active, forever.
+	discoverUDPRelayPathsInterval = 90 * time.Second
 
 	// heartbeatInterval is how often pings to the best UDP address
 	// are sent.
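
Note: the interval invariant documented above (discoverUDPRelayPathsInterval must exceed sessionActiveTimeout + peerRelayAllocReqMaxSendDuration plus scheduling jitter) is exactly what TestPeerRelayPathDiscoveryInterval enforces. For reference, here is a minimal, self-contained sketch of the restructured wantUDPRelayPathDiscoveryLocked gating. The names and types are illustrative assumptions (plain time.Time instead of mono.Time, a stand-in peerState struct, discoverInterval), not the real magicsock API.

```go
// Sketch only: a simplified model of the restructured peer relay path
// discovery gating. Not the magicsock types.
package main

import (
	"fmt"
	"time"
)

// discoverInterval stands in for discoverUDPRelayPathsInterval (90s above).
const discoverInterval = 90 * time.Second

type peerState struct {
	bestAddrIsDirect   bool
	trustBestAddrUntil time.Time
	lastRelayDiscovery time.Time // zero value means discovery has never run
}

// wantRelayDiscovery mirrors the new early-return structure: a trusted direct
// path suppresses discovery, a zero last-discovery time forces it, and
// otherwise only the elapsed-interval check decides.
func wantRelayDiscovery(s peerState, now time.Time) bool {
	if s.bestAddrIsDirect && now.Before(s.trustBestAddrUntil) {
		return false
	}
	if s.lastRelayDiscovery.IsZero() {
		return true
	}
	return now.Sub(s.lastRelayDiscovery) > discoverInterval
}

func main() {
	now := time.Now()
	fmt.Println(wantRelayDiscovery(peerState{}, now))                                               // true: never discovered
	fmt.Println(wantRelayDiscovery(peerState{lastRelayDiscovery: now.Add(-30 * time.Second)}, now)) // false: inside the interval
	fmt.Println(wantRelayDiscovery(peerState{lastRelayDiscovery: now.Add(-2 * time.Minute)}, now))  // true: interval elapsed
}
```

With the constants above, a peer that ran discovery 30 seconds ago is suppressed, while one that has never run it, or ran it more than 90 seconds ago, triggers a new cycle.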
diff --git a/wgengine/magicsock/relaymanager.go b/wgengine/magicsock/relaymanager.go
index d7acf80b5..5c9751a86 100644
--- a/wgengine/magicsock/relaymanager.go
+++ b/wgengine/magicsock/relaymanager.go
@@ -850,7 +850,18 @@ func (e errNotReady) Error() string {
 	return fmt.Sprintf("server not ready, retry after %v", e.retryAfter)
 }
 
-const reqTimeout = time.Second * 10
+const (
+	// peerRelayAllocReqTimeout is the timeout for a single peer relay binding
+	// allocation request.
+	peerRelayAllocReqTimeout = time.Second * 10
+	// peerRelayAllocReqMaxRetryWait is the maximum time we can spend waiting to
+	// retry a peer relay binding allocation request (we only retry once).
+	peerRelayAllocReqMaxRetryWait = time.Second * 3
+	// peerRelayAllocReqMaxSendDuration is the maximum time we can spend
+	// transmitting for peer relay binding allocation requests associated with
+	// a single path discovery cycle.
+	peerRelayAllocReqMaxSendDuration = peerRelayAllocReqTimeout*2 + peerRelayAllocReqMaxRetryWait
+)
 
 func doAllocate(ctx context.Context, server netip.AddrPort, discoKeys [2]key.DiscoPublic) (udprelay.ServerEndpoint, error) {
 	var reqBody bytes.Buffer
@@ -864,7 +875,7 @@ func doAllocate(ctx context.Context, server netip.AddrPort, discoKeys [2]key.Dis
 	if err != nil {
 		return udprelay.ServerEndpoint{}, err
 	}
-	reqCtx, cancel := context.WithTimeout(ctx, reqTimeout)
+	reqCtx, cancel := context.WithTimeout(ctx, peerRelayAllocReqTimeout)
 	defer cancel()
 	req, err := http.NewRequestWithContext(reqCtx, httpm.POST, "http://"+server.String()+"/v0/relay/endpoint", &reqBody)
 	if err != nil {
@@ -900,8 +911,10 @@ func (r *relayManager) allocateSingleServer(ctx context.Context, wg *sync.WaitGr
 		return
 	}
 	firstTry := true
+	doAllocateCtx, cancel := context.WithTimeout(ctx, peerRelayAllocReqMaxSendDuration)
+	defer cancel()
 	for {
-		se, err := doAllocate(ctx, server, [2]key.DiscoPublic{wlb.ep.c.discoPublic, remoteDisco.key})
+		se, err := doAllocate(doAllocateCtx, server, [2]key.DiscoPublic{wlb.ep.c.discoPublic, remoteDisco.key})
 		if err == nil {
 			relayManagerInputEvent(r, ctx, &r.newServerEndpointCh, newRelayServerEndpointEvent{
 				wlb: wlb,
@@ -916,7 +929,7 @@ func (r *relayManager) allocateSingleServer(ctx context.Context, wg *sync.WaitGr
 			select {
 			case <-ctx.Done():
 				return
-			case <-time.After(min(notReady.retryAfter, reqTimeout)):
+			case <-time.After(min(notReady.retryAfter, peerRelayAllocReqMaxRetryWait)):
 				firstTry = false
 				continue
 			}
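
For reference, a minimal sketch of the bounded-retry pattern this change applies in allocateSingleServer: a derived context caps an entire allocation cycle at peerRelayAllocReqMaxSendDuration, and the wait before the single retry is capped at peerRelayAllocReqMaxRetryWait rather than the full request timeout. The helpers here (allocate, allocateWithRetry) and their behavior are stand-ins, not the real doAllocate/relayManager code.

```go
// Sketch only: bounding a request-plus-one-retry cycle with a derived
// context. Constants mirror the new relaymanager values; everything else
// is illustrative.
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

const (
	allocReqTimeout      = 10 * time.Second                      // per-request timeout
	allocMaxRetryWait    = 3 * time.Second                       // cap on the retry wait (we only retry once)
	allocMaxSendDuration = allocReqTimeout*2 + allocMaxRetryWait // bound on the whole cycle
)

type errNotReady struct{ retryAfter time.Duration }

func (e errNotReady) Error() string { return fmt.Sprintf("not ready, retry after %v", e.retryAfter) }

// allocate stands in for a single HTTP allocation request that honors ctx.
// Here it always reports "not ready" after a short delay.
func allocate(ctx context.Context) error {
	reqCtx, cancel := context.WithTimeout(ctx, allocReqTimeout)
	defer cancel()
	select {
	case <-reqCtx.Done():
		return reqCtx.Err()
	case <-time.After(100 * time.Millisecond):
		return errNotReady{retryAfter: 30 * time.Second}
	}
}

// allocateWithRetry bounds the whole attempt (request + one retry) with a
// derived context so it cannot outlive allocMaxSendDuration.
func allocateWithRetry(ctx context.Context) error {
	ctx, cancel := context.WithTimeout(ctx, allocMaxSendDuration)
	defer cancel()
	firstTry := true
	for {
		err := allocate(ctx)
		var notReady errNotReady
		if err == nil || !errors.As(err, &notReady) || !firstTry {
			return err
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(min(notReady.retryAfter, allocMaxRetryWait)):
			firstTry = false
		}
	}
}

func main() {
	// Retries once after ~3s (the capped wait), then gives up.
	fmt.Println(allocateWithRetry(context.Background()))
}
```

This bound is what the magicsock.go comment and TestPeerRelayPathDiscoveryInterval rely on: with transmissions capped at peerRelayAllocReqMaxSendDuration, the 90s discovery interval can outlast sessionActiveTimeout plus the send window, so peer relay path discovery no longer holds a session active forever.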