mirror of
https://github.com/tailscale/tailscale.git
synced 2025-06-16 21:29:14 +00:00
net/netcheck: preserve live home DERP through packet loss
During a short period of packet loss, a TCP connection to the home DERP may be maintained. If no other regions emerge as winners, such as when all regions but one are avoided/disallowed as candidates, ensure that the current home region, if still active, is not dropped as the preferred region until it has failed two keepalives. Relatedly apply avoid and no measure no home to ICMP and HTTP checks as intended. Updates tailscale/corp#12894 Updates tailscale/corp#29491 Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
parent
3ed76ceed3
commit
b0f7b23efe
@ -85,7 +85,7 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep
|
||||
tailscale.com/control/controlhttp from tailscale.com/cmd/tailscale/cli
|
||||
tailscale.com/control/controlhttp/controlhttpcommon from tailscale.com/control/controlhttp
|
||||
tailscale.com/control/controlknobs from tailscale.com/net/portmapper
|
||||
tailscale.com/derp from tailscale.com/derp/derphttp
|
||||
tailscale.com/derp from tailscale.com/derp/derphttp+
|
||||
tailscale.com/derp/derpconst from tailscale.com/derp+
|
||||
tailscale.com/derp/derphttp from tailscale.com/net/netcheck
|
||||
tailscale.com/disco from tailscale.com/derp
|
||||
|
@ -36,9 +36,13 @@ const (
|
||||
frameHeaderLen = 1 + 4 // frameType byte + 4 byte length
|
||||
keyLen = 32
|
||||
maxInfoLen = 1 << 20
|
||||
keepAlive = 60 * time.Second
|
||||
)
|
||||
|
||||
// KeepAlive is the minimum frequency at which the DERP server sends
|
||||
// keep alive frames. The server adds some jitter, so this timing is not
|
||||
// exact, but 2x this value can be considered a missed keep alive.
|
||||
const KeepAlive = 60 * time.Second
|
||||
|
||||
// ProtocolVersion is bumped whenever there's a wire-incompatible change.
|
||||
// - version 1 (zero on wire): consistent box headers, in use by employee dev nodes a bit
|
||||
// - version 2: received packets have src addrs in frameRecvPacket at beginning
|
||||
|
@ -1789,7 +1789,7 @@ func (c *sclient) sendLoop(ctx context.Context) error {
|
||||
defer c.onSendLoopDone()
|
||||
|
||||
jitter := rand.N(5 * time.Second)
|
||||
keepAliveTick, keepAliveTickChannel := c.s.clock.NewTicker(keepAlive + jitter)
|
||||
keepAliveTick, keepAliveTickChannel := c.s.clock.NewTicker(KeepAlive + jitter)
|
||||
defer keepAliveTick.Stop()
|
||||
|
||||
var werr error // last write error
|
||||
|
@ -23,6 +23,7 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"tailscale.com/derp"
|
||||
"tailscale.com/derp/derphttp"
|
||||
"tailscale.com/envknob"
|
||||
"tailscale.com/hostinfo"
|
||||
@ -449,7 +450,7 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report, pre
|
||||
// restoration back to the home DERP on the next full netcheck ~5 minutes later
|
||||
// - which is highly disruptive when it causes shifts in geo routed subnet
|
||||
// routers. By always including the home DERP in the incremental netcheck, we
|
||||
// ensure that the home DERP is always probed, even if it observed a recenet
|
||||
// ensure that the home DERP is always probed, even if it observed a recent
|
||||
// poor latency sample. This inclusion enables the latency history checks in
|
||||
// home DERP selection to still take effect.
|
||||
// planContainsHome indicates whether the home DERP has been added to the probePlan,
|
||||
@ -989,7 +990,7 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe
|
||||
var wg sync.WaitGroup
|
||||
var need []*tailcfg.DERPRegion
|
||||
for rid, reg := range dm.Regions {
|
||||
if !rs.haveRegionLatency(rid) && regionHasDERPNode(reg) {
|
||||
if !rs.haveRegionLatency(rid) && regionHasDERPNode(reg) && !reg.Avoid && !reg.NoMeasureNoHome {
|
||||
need = append(need, reg)
|
||||
}
|
||||
}
|
||||
@ -1371,6 +1372,15 @@ const (
|
||||
// even without receiving a STUN response.
|
||||
// Note: must remain higher than the derp package frameReceiveRecordRate
|
||||
PreferredDERPFrameTime = 8 * time.Second
|
||||
// PreferredDERPKeepAliveTimeout is 2x the DERP Keep Alive timeout. If there
|
||||
// is no latency data to make judgements from, but we have heard from our
|
||||
// current DERP region inside of 2x the KeepAlive window, don't switch DERP
|
||||
// regions yet, keep the current region. This prevents region flapping /
|
||||
// home DERP removal during short periods of packet loss where the DERP TCP
|
||||
// connection may itself naturally recover.
|
||||
// TODO(raggi): expose shared time bounds from the DERP package rather than
|
||||
// duplicating them here.
|
||||
PreferredDERPKeepAliveTimeout = 2 * derp.KeepAlive
|
||||
)
|
||||
|
||||
// addReportHistoryAndSetPreferredDERP adds r to the set of recent Reports
|
||||
@ -1455,13 +1465,10 @@ func (c *Client) addReportHistoryAndSetPreferredDERP(rs *reportState, r *Report,
|
||||
// the STUN probe) since we started the netcheck, or in the past 2s, as
|
||||
// another signal for "this region is still working".
|
||||
heardFromOldRegionRecently := false
|
||||
prevRegionLastHeard := rs.opts.getLastDERPActivity(prevDERP)
|
||||
if changingPreferred {
|
||||
if lastHeard := rs.opts.getLastDERPActivity(prevDERP); !lastHeard.IsZero() {
|
||||
now := c.timeNow()
|
||||
|
||||
heardFromOldRegionRecently = lastHeard.After(rs.start)
|
||||
heardFromOldRegionRecently = heardFromOldRegionRecently || lastHeard.After(now.Add(-PreferredDERPFrameTime))
|
||||
}
|
||||
heardFromOldRegionRecently = prevRegionLastHeard.After(rs.start)
|
||||
heardFromOldRegionRecently = heardFromOldRegionRecently || prevRegionLastHeard.After(now.Add(-PreferredDERPFrameTime))
|
||||
}
|
||||
|
||||
// The old region is accessible if we've heard from it via a non-STUN
|
||||
@ -1488,17 +1495,20 @@ func (c *Client) addReportHistoryAndSetPreferredDERP(rs *reportState, r *Report,
|
||||
// If the forced DERP region probed successfully, or has recent traffic,
|
||||
// use it.
|
||||
_, haveLatencySample := r.RegionLatency[c.ForcePreferredDERP]
|
||||
var recentActivity bool
|
||||
if lastHeard := rs.opts.getLastDERPActivity(c.ForcePreferredDERP); !lastHeard.IsZero() {
|
||||
now := c.timeNow()
|
||||
recentActivity = lastHeard.After(rs.start)
|
||||
recentActivity = recentActivity || lastHeard.After(now.Add(-PreferredDERPFrameTime))
|
||||
}
|
||||
lastHeard := rs.opts.getLastDERPActivity(c.ForcePreferredDERP)
|
||||
recentActivity := lastHeard.After(rs.start)
|
||||
recentActivity = recentActivity || lastHeard.After(now.Add(-PreferredDERPFrameTime))
|
||||
|
||||
if haveLatencySample || recentActivity {
|
||||
r.PreferredDERP = c.ForcePreferredDERP
|
||||
}
|
||||
}
|
||||
// If there was no latency data to make judgements on, but there is an
|
||||
// active DERP connection that has at least been doing KeepAlive recently,
|
||||
// keep it, rather than dropping it.
|
||||
if r.PreferredDERP == 0 && prevRegionLastHeard.After(now.Add(-PreferredDERPKeepAliveTimeout)) {
|
||||
r.PreferredDERP = prevDERP
|
||||
}
|
||||
}
|
||||
|
||||
func updateLatency(m map[int]time.Duration, regionID int, d time.Duration) {
|
||||
|
@ -18,6 +18,7 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"tailscale.com/derp"
|
||||
"tailscale.com/net/netmon"
|
||||
"tailscale.com/net/stun/stuntest"
|
||||
"tailscale.com/tailcfg"
|
||||
@ -419,6 +420,39 @@ func TestAddReportHistoryAndSetPreferredDERP(t *testing.T) {
|
||||
wantPrevLen: 2,
|
||||
wantDERP: 1,
|
||||
},
|
||||
{
|
||||
name: "no_data_keep_home",
|
||||
steps: []step{
|
||||
{0, report("d1", 2, "d2", 3)},
|
||||
{30 * time.Second, report()},
|
||||
{2 * time.Second, report()},
|
||||
{2 * time.Second, report()},
|
||||
{2 * time.Second, report()},
|
||||
{2 * time.Second, report()},
|
||||
},
|
||||
opts: &GetReportOpts{
|
||||
GetLastDERPActivity: mkLDAFunc(map[int]time.Time{
|
||||
1: startTime,
|
||||
}),
|
||||
},
|
||||
wantPrevLen: 6,
|
||||
wantDERP: 1,
|
||||
},
|
||||
{
|
||||
name: "no_data_home_expires",
|
||||
steps: []step{
|
||||
{0, report("d1", 2, "d2", 3)},
|
||||
{30 * time.Second, report()},
|
||||
{2 * derp.KeepAlive, report()},
|
||||
},
|
||||
opts: &GetReportOpts{
|
||||
GetLastDERPActivity: mkLDAFunc(map[int]time.Time{
|
||||
1: startTime,
|
||||
}),
|
||||
},
|
||||
wantPrevLen: 3,
|
||||
wantDERP: 0,
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user