net/netcheck: preserve live home DERP through packet loss

During a short period of packet loss, a TCP connection to the home DERP
may be maintained. If no other regions emerge as winners, such as when
all regions but one are avoided/disallowed as candidates, ensure that
the current home region, if still active, is not dropped as the
preferred region until it has failed two keepalives.

Relatedly apply avoid and no measure no home to ICMP and HTTP checks as
intended.

Updates tailscale/corp#12894
Updates tailscale/corp#29491

Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
James Tucker 2025-06-11 15:57:55 -07:00 committed by James Tucker
parent 3ed76ceed3
commit b0f7b23efe
5 changed files with 65 additions and 17 deletions

View File

@ -85,7 +85,7 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep
tailscale.com/control/controlhttp from tailscale.com/cmd/tailscale/cli
tailscale.com/control/controlhttp/controlhttpcommon from tailscale.com/control/controlhttp
tailscale.com/control/controlknobs from tailscale.com/net/portmapper
tailscale.com/derp from tailscale.com/derp/derphttp
tailscale.com/derp from tailscale.com/derp/derphttp+
tailscale.com/derp/derpconst from tailscale.com/derp+
tailscale.com/derp/derphttp from tailscale.com/net/netcheck
tailscale.com/disco from tailscale.com/derp

View File

@ -36,9 +36,13 @@ const (
frameHeaderLen = 1 + 4 // frameType byte + 4 byte length
keyLen = 32
maxInfoLen = 1 << 20
keepAlive = 60 * time.Second
)
// KeepAlive is the minimum frequency at which the DERP server sends
// keep alive frames. The server adds some jitter, so this timing is not
// exact, but 2x this value can be considered a missed keep alive.
const KeepAlive = 60 * time.Second
// ProtocolVersion is bumped whenever there's a wire-incompatible change.
// - version 1 (zero on wire): consistent box headers, in use by employee dev nodes a bit
// - version 2: received packets have src addrs in frameRecvPacket at beginning

View File

@ -1789,7 +1789,7 @@ func (c *sclient) sendLoop(ctx context.Context) error {
defer c.onSendLoopDone()
jitter := rand.N(5 * time.Second)
keepAliveTick, keepAliveTickChannel := c.s.clock.NewTicker(keepAlive + jitter)
keepAliveTick, keepAliveTickChannel := c.s.clock.NewTicker(KeepAlive + jitter)
defer keepAliveTick.Stop()
var werr error // last write error

View File

@ -23,6 +23,7 @@ import (
"syscall"
"time"
"tailscale.com/derp"
"tailscale.com/derp/derphttp"
"tailscale.com/envknob"
"tailscale.com/hostinfo"
@ -449,7 +450,7 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report, pre
// restoration back to the home DERP on the next full netcheck ~5 minutes later
// - which is highly disruptive when it causes shifts in geo routed subnet
// routers. By always including the home DERP in the incremental netcheck, we
// ensure that the home DERP is always probed, even if it observed a recenet
// ensure that the home DERP is always probed, even if it observed a recent
// poor latency sample. This inclusion enables the latency history checks in
// home DERP selection to still take effect.
// planContainsHome indicates whether the home DERP has been added to the probePlan,
@ -989,7 +990,7 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe
var wg sync.WaitGroup
var need []*tailcfg.DERPRegion
for rid, reg := range dm.Regions {
if !rs.haveRegionLatency(rid) && regionHasDERPNode(reg) {
if !rs.haveRegionLatency(rid) && regionHasDERPNode(reg) && !reg.Avoid && !reg.NoMeasureNoHome {
need = append(need, reg)
}
}
@ -1371,6 +1372,15 @@ const (
// even without receiving a STUN response.
// Note: must remain higher than the derp package frameReceiveRecordRate
PreferredDERPFrameTime = 8 * time.Second
// PreferredDERPKeepAliveTimeout is 2x the DERP Keep Alive timeout. If there
// is no latency data to make judgements from, but we have heard from our
// current DERP region inside of 2x the KeepAlive window, don't switch DERP
// regions yet, keep the current region. This prevents region flapping /
// home DERP removal during short periods of packet loss where the DERP TCP
// connection may itself naturally recover.
// TODO(raggi): expose shared time bounds from the DERP package rather than
// duplicating them here.
PreferredDERPKeepAliveTimeout = 2 * derp.KeepAlive
)
// addReportHistoryAndSetPreferredDERP adds r to the set of recent Reports
@ -1455,13 +1465,10 @@ func (c *Client) addReportHistoryAndSetPreferredDERP(rs *reportState, r *Report,
// the STUN probe) since we started the netcheck, or in the past 2s, as
// another signal for "this region is still working".
heardFromOldRegionRecently := false
prevRegionLastHeard := rs.opts.getLastDERPActivity(prevDERP)
if changingPreferred {
if lastHeard := rs.opts.getLastDERPActivity(prevDERP); !lastHeard.IsZero() {
now := c.timeNow()
heardFromOldRegionRecently = lastHeard.After(rs.start)
heardFromOldRegionRecently = heardFromOldRegionRecently || lastHeard.After(now.Add(-PreferredDERPFrameTime))
}
heardFromOldRegionRecently = prevRegionLastHeard.After(rs.start)
heardFromOldRegionRecently = heardFromOldRegionRecently || prevRegionLastHeard.After(now.Add(-PreferredDERPFrameTime))
}
// The old region is accessible if we've heard from it via a non-STUN
@ -1488,17 +1495,20 @@ func (c *Client) addReportHistoryAndSetPreferredDERP(rs *reportState, r *Report,
// If the forced DERP region probed successfully, or has recent traffic,
// use it.
_, haveLatencySample := r.RegionLatency[c.ForcePreferredDERP]
var recentActivity bool
if lastHeard := rs.opts.getLastDERPActivity(c.ForcePreferredDERP); !lastHeard.IsZero() {
now := c.timeNow()
recentActivity = lastHeard.After(rs.start)
recentActivity = recentActivity || lastHeard.After(now.Add(-PreferredDERPFrameTime))
}
lastHeard := rs.opts.getLastDERPActivity(c.ForcePreferredDERP)
recentActivity := lastHeard.After(rs.start)
recentActivity = recentActivity || lastHeard.After(now.Add(-PreferredDERPFrameTime))
if haveLatencySample || recentActivity {
r.PreferredDERP = c.ForcePreferredDERP
}
}
// If there was no latency data to make judgements on, but there is an
// active DERP connection that has at least been doing KeepAlive recently,
// keep it, rather than dropping it.
if r.PreferredDERP == 0 && prevRegionLastHeard.After(now.Add(-PreferredDERPKeepAliveTimeout)) {
r.PreferredDERP = prevDERP
}
}
func updateLatency(m map[int]time.Duration, regionID int, d time.Duration) {

View File

@ -18,6 +18,7 @@ import (
"testing"
"time"
"tailscale.com/derp"
"tailscale.com/net/netmon"
"tailscale.com/net/stun/stuntest"
"tailscale.com/tailcfg"
@ -419,6 +420,39 @@ func TestAddReportHistoryAndSetPreferredDERP(t *testing.T) {
wantPrevLen: 2,
wantDERP: 1,
},
{
name: "no_data_keep_home",
steps: []step{
{0, report("d1", 2, "d2", 3)},
{30 * time.Second, report()},
{2 * time.Second, report()},
{2 * time.Second, report()},
{2 * time.Second, report()},
{2 * time.Second, report()},
},
opts: &GetReportOpts{
GetLastDERPActivity: mkLDAFunc(map[int]time.Time{
1: startTime,
}),
},
wantPrevLen: 6,
wantDERP: 1,
},
{
name: "no_data_home_expires",
steps: []step{
{0, report("d1", 2, "d2", 3)},
{30 * time.Second, report()},
{2 * derp.KeepAlive, report()},
},
opts: &GetReportOpts{
GetLastDERPActivity: mkLDAFunc(map[int]time.Time{
1: startTime,
}),
},
wantPrevLen: 3,
wantDERP: 0,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {