From b0f7b23efe1c7d02e8caec2a5ad74ab2d5cb138a Mon Sep 17 00:00:00 2001 From: James Tucker Date: Wed, 11 Jun 2025 15:57:55 -0700 Subject: [PATCH] net/netcheck: preserve live home DERP through packet loss During a short period of packet loss, a TCP connection to the home DERP may be maintained. If no other regions emerge as winners, such as when all regions but one are avoided/disallowed as candidates, ensure that the current home region, if still active, is not dropped as the preferred region until it has failed two keepalives. Relatedly apply avoid and no measure no home to ICMP and HTTP checks as intended. Updates tailscale/corp#12894 Updates tailscale/corp#29491 Signed-off-by: James Tucker --- cmd/tailscale/depaware.txt | 2 +- derp/derp.go | 6 +++++- derp/derp_server.go | 2 +- net/netcheck/netcheck.go | 38 ++++++++++++++++++++++------------- net/netcheck/netcheck_test.go | 34 +++++++++++++++++++++++++++++++ 5 files changed, 65 insertions(+), 17 deletions(-) diff --git a/cmd/tailscale/depaware.txt b/cmd/tailscale/depaware.txt index 8c3b404b1..69d054ea4 100644 --- a/cmd/tailscale/depaware.txt +++ b/cmd/tailscale/depaware.txt @@ -85,7 +85,7 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep tailscale.com/control/controlhttp from tailscale.com/cmd/tailscale/cli tailscale.com/control/controlhttp/controlhttpcommon from tailscale.com/control/controlhttp tailscale.com/control/controlknobs from tailscale.com/net/portmapper - tailscale.com/derp from tailscale.com/derp/derphttp + tailscale.com/derp from tailscale.com/derp/derphttp+ tailscale.com/derp/derpconst from tailscale.com/derp+ tailscale.com/derp/derphttp from tailscale.com/net/netcheck tailscale.com/disco from tailscale.com/derp diff --git a/derp/derp.go b/derp/derp.go index 65acd4321..24c1ca65c 100644 --- a/derp/derp.go +++ b/derp/derp.go @@ -36,9 +36,13 @@ const ( frameHeaderLen = 1 + 4 // frameType byte + 4 byte length keyLen = 32 maxInfoLen = 1 << 20 - keepAlive = 60 * time.Second ) +// KeepAlive is the minimum frequency at which the DERP server sends +// keep alive frames. The server adds some jitter, so this timing is not +// exact, but 2x this value can be considered a missed keep alive. +const KeepAlive = 60 * time.Second + // ProtocolVersion is bumped whenever there's a wire-incompatible change. // - version 1 (zero on wire): consistent box headers, in use by employee dev nodes a bit // - version 2: received packets have src addrs in frameRecvPacket at beginning diff --git a/derp/derp_server.go b/derp/derp_server.go index c6a749485..bd67e7eec 100644 --- a/derp/derp_server.go +++ b/derp/derp_server.go @@ -1789,7 +1789,7 @@ func (c *sclient) sendLoop(ctx context.Context) error { defer c.onSendLoopDone() jitter := rand.N(5 * time.Second) - keepAliveTick, keepAliveTickChannel := c.s.clock.NewTicker(keepAlive + jitter) + keepAliveTick, keepAliveTickChannel := c.s.clock.NewTicker(KeepAlive + jitter) defer keepAliveTick.Stop() var werr error // last write error diff --git a/net/netcheck/netcheck.go b/net/netcheck/netcheck.go index 54627f713..cb622a339 100644 --- a/net/netcheck/netcheck.go +++ b/net/netcheck/netcheck.go @@ -23,6 +23,7 @@ import ( "syscall" "time" + "tailscale.com/derp" "tailscale.com/derp/derphttp" "tailscale.com/envknob" "tailscale.com/hostinfo" @@ -449,7 +450,7 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report, pre // restoration back to the home DERP on the next full netcheck ~5 minutes later // - which is highly disruptive when it causes shifts in geo routed subnet // routers. By always including the home DERP in the incremental netcheck, we - // ensure that the home DERP is always probed, even if it observed a recenet + // ensure that the home DERP is always probed, even if it observed a recent // poor latency sample. This inclusion enables the latency history checks in // home DERP selection to still take effect. // planContainsHome indicates whether the home DERP has been added to the probePlan, @@ -989,7 +990,7 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe var wg sync.WaitGroup var need []*tailcfg.DERPRegion for rid, reg := range dm.Regions { - if !rs.haveRegionLatency(rid) && regionHasDERPNode(reg) { + if !rs.haveRegionLatency(rid) && regionHasDERPNode(reg) && !reg.Avoid && !reg.NoMeasureNoHome { need = append(need, reg) } } @@ -1371,6 +1372,15 @@ const ( // even without receiving a STUN response. // Note: must remain higher than the derp package frameReceiveRecordRate PreferredDERPFrameTime = 8 * time.Second + // PreferredDERPKeepAliveTimeout is 2x the DERP Keep Alive timeout. If there + // is no latency data to make judgements from, but we have heard from our + // current DERP region inside of 2x the KeepAlive window, don't switch DERP + // regions yet, keep the current region. This prevents region flapping / + // home DERP removal during short periods of packet loss where the DERP TCP + // connection may itself naturally recover. + // TODO(raggi): expose shared time bounds from the DERP package rather than + // duplicating them here. + PreferredDERPKeepAliveTimeout = 2 * derp.KeepAlive ) // addReportHistoryAndSetPreferredDERP adds r to the set of recent Reports @@ -1455,13 +1465,10 @@ func (c *Client) addReportHistoryAndSetPreferredDERP(rs *reportState, r *Report, // the STUN probe) since we started the netcheck, or in the past 2s, as // another signal for "this region is still working". heardFromOldRegionRecently := false + prevRegionLastHeard := rs.opts.getLastDERPActivity(prevDERP) if changingPreferred { - if lastHeard := rs.opts.getLastDERPActivity(prevDERP); !lastHeard.IsZero() { - now := c.timeNow() - - heardFromOldRegionRecently = lastHeard.After(rs.start) - heardFromOldRegionRecently = heardFromOldRegionRecently || lastHeard.After(now.Add(-PreferredDERPFrameTime)) - } + heardFromOldRegionRecently = prevRegionLastHeard.After(rs.start) + heardFromOldRegionRecently = heardFromOldRegionRecently || prevRegionLastHeard.After(now.Add(-PreferredDERPFrameTime)) } // The old region is accessible if we've heard from it via a non-STUN @@ -1488,17 +1495,20 @@ func (c *Client) addReportHistoryAndSetPreferredDERP(rs *reportState, r *Report, // If the forced DERP region probed successfully, or has recent traffic, // use it. _, haveLatencySample := r.RegionLatency[c.ForcePreferredDERP] - var recentActivity bool - if lastHeard := rs.opts.getLastDERPActivity(c.ForcePreferredDERP); !lastHeard.IsZero() { - now := c.timeNow() - recentActivity = lastHeard.After(rs.start) - recentActivity = recentActivity || lastHeard.After(now.Add(-PreferredDERPFrameTime)) - } + lastHeard := rs.opts.getLastDERPActivity(c.ForcePreferredDERP) + recentActivity := lastHeard.After(rs.start) + recentActivity = recentActivity || lastHeard.After(now.Add(-PreferredDERPFrameTime)) if haveLatencySample || recentActivity { r.PreferredDERP = c.ForcePreferredDERP } } + // If there was no latency data to make judgements on, but there is an + // active DERP connection that has at least been doing KeepAlive recently, + // keep it, rather than dropping it. + if r.PreferredDERP == 0 && prevRegionLastHeard.After(now.Add(-PreferredDERPKeepAliveTimeout)) { + r.PreferredDERP = prevDERP + } } func updateLatency(m map[int]time.Duration, regionID int, d time.Duration) { diff --git a/net/netcheck/netcheck_test.go b/net/netcheck/netcheck_test.go index 3affa614d..6830e7f27 100644 --- a/net/netcheck/netcheck_test.go +++ b/net/netcheck/netcheck_test.go @@ -18,6 +18,7 @@ import ( "testing" "time" + "tailscale.com/derp" "tailscale.com/net/netmon" "tailscale.com/net/stun/stuntest" "tailscale.com/tailcfg" @@ -419,6 +420,39 @@ func TestAddReportHistoryAndSetPreferredDERP(t *testing.T) { wantPrevLen: 2, wantDERP: 1, }, + { + name: "no_data_keep_home", + steps: []step{ + {0, report("d1", 2, "d2", 3)}, + {30 * time.Second, report()}, + {2 * time.Second, report()}, + {2 * time.Second, report()}, + {2 * time.Second, report()}, + {2 * time.Second, report()}, + }, + opts: &GetReportOpts{ + GetLastDERPActivity: mkLDAFunc(map[int]time.Time{ + 1: startTime, + }), + }, + wantPrevLen: 6, + wantDERP: 1, + }, + { + name: "no_data_home_expires", + steps: []step{ + {0, report("d1", 2, "d2", 3)}, + {30 * time.Second, report()}, + {2 * derp.KeepAlive, report()}, + }, + opts: &GetReportOpts{ + GetLastDERPActivity: mkLDAFunc(map[int]time.Time{ + 1: startTime, + }), + }, + wantPrevLen: 3, + wantDERP: 0, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) {