net/netcheck: ensure prior preferred DERP is always in netchecks

In an environment with unstable latency, such as upstream bufferbloat,
there are cases where a full netcheck could drop the prior preferred
DERP (likely home DERP) from future netcheck probe plans. This will then
likely result in a home DERP having a missing sample on the next
incremental netcheck, ultimately resulting in a home DERP move.

This change does not fix our overall response to highly unstable
latency, but it is an incremental improvement to prevent single spurious
samples during a full netcheck from alone triggering a flapping
condition, as now the prior changes to include historical latency will
still provide the desired resistance, and the home DERP should not move
unless latency is consistently worse over a 5 minute period.

Note that there is a nomenclature and semantics issue remaining in the
difference between a report preferred DERP and a home DERP. A report
preferred DERP is aspirational, it is what will be picked as a home DERP
if a home DERP connection needs to be established. A nodes home DERP may
be different than a recent preferred DERP, in which case a lot of
netcheck logic is fallible. In future enhancements much of the DERP move
logic should move to consider the home DERP, rather than recent report
preferred DERP.

Updates 
Updates 

Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
James Tucker 2024-10-30 11:31:36 -07:00 committed by James Tucker
parent b73831b2b5
commit 5280738690
2 changed files with 93 additions and 17 deletions

@ -391,10 +391,11 @@ type probePlan map[string][]probe
// sortRegions returns the regions of dm first sorted
// from fastest to slowest (based on the 'last' report),
// end in regions that have no data.
func sortRegions(dm *tailcfg.DERPMap, last *Report) (prev []*tailcfg.DERPRegion) {
func sortRegions(dm *tailcfg.DERPMap, last *Report, preferredDERP int) (prev []*tailcfg.DERPRegion) {
prev = make([]*tailcfg.DERPRegion, 0, len(dm.Regions))
for _, reg := range dm.Regions {
if reg.Avoid {
// include an otherwise avoid region if it is the current preferred region
if reg.Avoid && reg.RegionID != preferredDERP {
continue
}
prev = append(prev, reg)
@ -419,9 +420,19 @@ func sortRegions(dm *tailcfg.DERPMap, last *Report) (prev []*tailcfg.DERPRegion)
// a full report, all regions are scanned.)
const numIncrementalRegions = 3
// makeProbePlan generates the probe plan for a DERPMap, given the most
// recent report and whether IPv6 is configured on an interface.
func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (plan probePlan) {
// makeProbePlan generates the probe plan for a DERPMap, given the most recent
// report and the current home DERP. preferredDERP is passed independently of
// last (report) because last is currently nil'd to indicate a desire for a full
// netcheck.
//
// TODO(raggi,jwhited): refactor the callers and this function to be more clear
// about full vs. incremental netchecks, and remove the need for the history
// hiding. This was avoided in an incremental change due to exactly this kind of
// distant coupling.
// TODO(raggi): change from "preferred DERP" from a historical report to "home
// DERP" as in what DERP is the current home connection, this would further
// reduce flap events.
func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report, preferredDERP int) (plan probePlan) {
if last == nil || len(last.RegionLatency) == 0 {
return makeProbePlanInitial(dm, ifState)
}
@ -432,10 +443,35 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (pl
had4 := len(last.RegionV4Latency) > 0
had6 := len(last.RegionV6Latency) > 0
hadBoth := have6if && had4 && had6
for ri, reg := range sortRegions(dm, last) {
if ri == numIncrementalRegions {
// #13969 ensure that the home region is always probed.
// If a netcheck has unstable latency, such as a user with large amounts of
// bufferbloat or a highly congested connection, there are cases where a full
// netcheck may observe a one-off high latency to the current home DERP. Prior
// to the forced inclusion of the home DERP, this would result in an
// incremental netcheck following such an event to cause a home DERP move, with
// restoration back to the home DERP on the next full netcheck ~5 minutes later
// - which is highly disruptive when it causes shifts in geo routed subnet
// routers. By always including the home DERP in the incremental netcheck, we
// ensure that the home DERP is always probed, even if it observed a recenet
// poor latency sample. This inclusion enables the latency history checks in
// home DERP selection to still take effect.
// planContainsHome indicates whether the home DERP has been added to the probePlan,
// if there is no prior home, then there's no home to additionally include.
planContainsHome := preferredDERP == 0
for ri, reg := range sortRegions(dm, last, preferredDERP) {
regIsHome := reg.RegionID == preferredDERP
if ri >= numIncrementalRegions {
// planned at least numIncrementalRegions regions and that includes the
// last home region (or there was none), plan complete.
if planContainsHome {
break
}
// planned at least numIncrementalRegions regions, but not the home region,
// check if this is the home region, if not, skip it.
if !regIsHome {
continue
}
}
var p4, p6 []probe
do4 := have4if
do6 := have6if
@ -445,7 +481,7 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (pl
tries := 1
isFastestTwo := ri < 2
if isFastestTwo {
if isFastestTwo || regIsHome {
tries = 2
} else if hadBoth {
// For dual stack machines, make the 3rd & slower nodes alternate
@ -456,14 +492,15 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (pl
do4, do6 = false, true
}
}
if !isFastestTwo && !had6 {
if !regIsHome && !isFastestTwo && !had6 {
do6 = false
}
if reg.RegionID == last.PreferredDERP {
if regIsHome {
// But if we already had a DERP home, try extra hard to
// make sure it's there so we don't flip flop around.
tries = 4
planContainsHome = true
}
for try := 0; try < tries; try++ {
@ -788,9 +825,10 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe
c.curState = rs
last := c.last
// Even if we're doing a non-incremental update, we may want to try our
// preferred DERP region for captive portal detection. Save that, if we
// have it.
// Extract preferredDERP from the last report, if available. This will be used
// in captive portal detection and DERP flapping suppression. Ideally this would
// be the current active home DERP rather than the last report preferred DERP,
// but only the latter is presently available.
var preferredDERP int
if last != nil {
preferredDERP = last.PreferredDERP
@ -847,7 +885,7 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe
var plan probePlan
if opts == nil || !opts.OnlyTCP443 {
plan = makeProbePlan(dm, ifState, last)
plan = makeProbePlan(dm, ifState, last, preferredDERP)
}
// If we're doing a full probe, also check for a captive portal. We

@ -576,6 +576,40 @@ func TestMakeProbePlan(t *testing.T) {
"region-3-v4": []probe{p("3a", 4)},
},
},
{
// #13969: ensure that the prior/current home region is always included in
// probe plans, so that we don't flap between regions due to a single major
// netcheck having excluded the home region due to a spuriously high sample.
name: "ensure_home_region_inclusion",
dm: basicMap,
have6if: true,
last: &Report{
RegionLatency: map[int]time.Duration{
1: 50 * time.Millisecond,
2: 20 * time.Millisecond,
3: 30 * time.Millisecond,
4: 40 * time.Millisecond,
},
RegionV4Latency: map[int]time.Duration{
1: 50 * time.Millisecond,
2: 20 * time.Millisecond,
},
RegionV6Latency: map[int]time.Duration{
3: 30 * time.Millisecond,
4: 40 * time.Millisecond,
},
PreferredDERP: 1,
},
want: probePlan{
"region-1-v4": []probe{p("1a", 4), p("1a", 4, 60*ms), p("1a", 4, 220*ms), p("1a", 4, 330*ms)},
"region-1-v6": []probe{p("1a", 6), p("1a", 6, 60*ms), p("1a", 6, 220*ms), p("1a", 6, 330*ms)},
"region-2-v4": []probe{p("2a", 4), p("2b", 4, 24*ms)},
"region-2-v6": []probe{p("2a", 6), p("2b", 6, 24*ms)},
"region-3-v4": []probe{p("3a", 4), p("3b", 4, 36*ms)},
"region-3-v6": []probe{p("3a", 6), p("3b", 6, 36*ms)},
"region-4-v4": []probe{p("4a", 4)},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
@ -583,7 +617,11 @@ func TestMakeProbePlan(t *testing.T) {
HaveV6: tt.have6if,
HaveV4: !tt.no4,
}
got := makeProbePlan(tt.dm, ifState, tt.last)
preferredDERP := 0
if tt.last != nil {
preferredDERP = tt.last.PreferredDERP
}
got := makeProbePlan(tt.dm, ifState, tt.last, preferredDERP)
if !reflect.DeepEqual(got, tt.want) {
t.Errorf("unexpected plan; got:\n%v\nwant:\n%v\n", got, tt.want)
}
@ -756,7 +794,7 @@ func TestSortRegions(t *testing.T) {
report.RegionLatency[3] = time.Second * time.Duration(6)
report.RegionLatency[4] = time.Second * time.Duration(0)
report.RegionLatency[5] = time.Second * time.Duration(2)
sortedMap := sortRegions(unsortedMap, report)
sortedMap := sortRegions(unsortedMap, report, 0)
// Sorting by latency this should result in rid: 5, 2, 1, 3
// rid 4 with latency 0 should be at the end