mirror of https://github.com/tailscale/tailscale.git (synced 2024-11-25 19:15:34 +00:00)
net/netcheck: ensure prior preferred DERP is always in netchecks
In an environment with unstable latency, such as upstream bufferbloat, there are cases where a full netcheck could drop the prior preferred DERP (likely the home DERP) from future netcheck probe plans. The home DERP would then likely have a missing sample on the next incremental netcheck, ultimately resulting in a home DERP move.

This change does not fix our overall response to highly unstable latency, but it is an incremental improvement that prevents a single spurious sample during a full netcheck from triggering a flapping condition on its own. The earlier changes that factor in historical latency still provide the desired resistance, and the home DERP should not move unless latency is consistently worse over a 5-minute period.

Note that a nomenclature and semantics issue remains in the difference between a report's preferred DERP and a home DERP. A report's preferred DERP is aspirational: it is what will be picked as the home DERP if a home DERP connection needs to be established. A node's home DERP may differ from a recent report's preferred DERP, in which case much of the netcheck logic is fallible. Future enhancements should move much of the DERP move logic to consider the home DERP rather than the most recent report's preferred DERP.

Updates #8603
Updates #13969

Signed-off-by: James Tucker <james@tailscale.com>
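As an illustration only (not the actual netcheck API), the shape of the change can be sketched as a small standalone Go program. The function and variable names below are assumptions made for the example, and plain region IDs stand in for the real DERP region values:

package main

import "fmt"

// pickIncrementalRegions mirrors, in simplified form, the idea of this change:
// take the fastest few regions from the previous report, but always keep the
// prior preferred (home) DERP region in the plan, even if a one-off bad sample
// pushed it out of the fastest set. A preferredDERP of 0 means "no prior home".
func pickIncrementalRegions(sortedByLatency []int, preferredDERP, numFastest int) []int {
	planned := make([]int, 0, numFastest+1)
	containsHome := preferredDERP == 0
	for i, regionID := range sortedByLatency {
		if i >= numFastest {
			if containsHome {
				break // enough regions planned, and the home region is already included
			}
			if regionID != preferredDERP {
				continue // keep scanning only to find the home region
			}
		}
		if regionID == preferredDERP {
			containsHome = true
		}
		planned = append(planned, regionID)
	}
	return planned
}

func main() {
	// Regions sorted fastest-to-slowest from the last report; region 1 is the
	// prior home DERP, but a latency spike pushed it to the back of the list.
	sorted := []int{2, 3, 4, 1}
	fmt.Println(pickIncrementalRegions(sorted, 1, 3)) // [2 3 4 1]
	fmt.Println(pickIncrementalRegions(sorted, 0, 3)) // [2 3 4]
}

With a prior home region of 1 that recently returned a poor sample, the plan still probes region 1, so the historical-latency checks can prevent an unnecessary home DERP move.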
This commit is contained in:
parent f81348a16b
commit e1e22785b4
net/netcheck/netcheck.go
@@ -392,10 +392,11 @@ type probe struct {
 // sortRegions returns the regions of dm first sorted
 // from fastest to slowest (based on the 'last' report),
 // end in regions that have no data.
-func sortRegions(dm *tailcfg.DERPMap, last *Report) (prev []*tailcfg.DERPRegion) {
+func sortRegions(dm *tailcfg.DERPMap, last *Report, preferredDERP int) (prev []*tailcfg.DERPRegion) {
 	prev = make([]*tailcfg.DERPRegion, 0, len(dm.Regions))
 	for _, reg := range dm.Regions {
-		if reg.Avoid {
+		// include an otherwise avoid region if it is the current preferred region
+		if reg.Avoid && reg.RegionID != preferredDERP {
 			continue
 		}
 		prev = append(prev, reg)
@@ -420,9 +421,19 @@ func sortRegions(dm *tailcfg.DERPMap, last *Report) (prev []*tailcfg.DERPRegion)
 // a full report, all regions are scanned.)
 const numIncrementalRegions = 3
 
-// makeProbePlan generates the probe plan for a DERPMap, given the most
-// recent report and whether IPv6 is configured on an interface.
-func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (plan probePlan) {
+// makeProbePlan generates the probe plan for a DERPMap, given the most recent
+// report and the current home DERP. preferredDERP is passed independently of
+// last (report) because last is currently nil'd to indicate a desire for a full
+// netcheck.
+//
+// TODO(raggi,jwhited): refactor the callers and this function to be more clear
+// about full vs. incremental netchecks, and remove the need for the history
+// hiding. This was avoided in an incremental change due to exactly this kind of
+// distant coupling.
+// TODO(raggi): change from "preferred DERP" from a historical report to "home
+// DERP" as in what DERP is the current home connection, this would further
+// reduce flap events.
+func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report, preferredDERP int) (plan probePlan) {
 	if last == nil || len(last.RegionLatency) == 0 {
 		return makeProbePlanInitial(dm, ifState)
 	}
@@ -433,10 +444,35 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (pl
 	had4 := len(last.RegionV4Latency) > 0
 	had6 := len(last.RegionV6Latency) > 0
 	hadBoth := have6if && had4 && had6
-	for ri, reg := range sortRegions(dm, last) {
-		if ri == numIncrementalRegions {
+	// #13969 ensure that the home region is always probed.
+	// If a netcheck has unstable latency, such as a user with large amounts of
+	// bufferbloat or a highly congested connection, there are cases where a full
+	// netcheck may observe a one-off high latency to the current home DERP. Prior
+	// to the forced inclusion of the home DERP, this would result in an
+	// incremental netcheck following such an event to cause a home DERP move, with
+	// restoration back to the home DERP on the next full netcheck ~5 minutes later
+	// - which is highly disruptive when it causes shifts in geo routed subnet
+	// routers. By always including the home DERP in the incremental netcheck, we
+	// ensure that the home DERP is always probed, even if it observed a recent
+	// poor latency sample. This inclusion enables the latency history checks in
+	// home DERP selection to still take effect.
+	// planContainsHome indicates whether the home DERP has been added to the probePlan,
+	// if there is no prior home, then there's no home to additionally include.
+	planContainsHome := preferredDERP == 0
+	for ri, reg := range sortRegions(dm, last, preferredDERP) {
+		regIsHome := reg.RegionID == preferredDERP
+		if ri >= numIncrementalRegions {
+			// planned at least numIncrementalRegions regions and that includes the
+			// last home region (or there was none), plan complete.
+			if planContainsHome {
 				break
 			}
+			// planned at least numIncrementalRegions regions, but not the home region,
+			// check if this is the home region, if not, skip it.
+			if !regIsHome {
+				continue
+			}
+		}
 		var p4, p6 []probe
 		do4 := have4if
 		do6 := have6if
@@ -446,7 +482,7 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (pl
 		tries := 1
 		isFastestTwo := ri < 2
 
-		if isFastestTwo {
+		if isFastestTwo || regIsHome {
 			tries = 2
 		} else if hadBoth {
 			// For dual stack machines, make the 3rd & slower nodes alternate
@@ -457,14 +493,15 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (pl
 				do4, do6 = false, true
 			}
 		}
-		if !isFastestTwo && !had6 {
+		if !regIsHome && !isFastestTwo && !had6 {
 			do6 = false
 		}
 
-		if reg.RegionID == last.PreferredDERP {
+		if regIsHome {
 			// But if we already had a DERP home, try extra hard to
 			// make sure it's there so we don't flip flop around.
 			tries = 4
+			planContainsHome = true
 		}
 
 		for try := 0; try < tries; try++ {
@@ -789,9 +826,10 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe
 	c.curState = rs
 	last := c.last
 
-	// Even if we're doing a non-incremental update, we may want to try our
-	// preferred DERP region for captive portal detection. Save that, if we
-	// have it.
+	// Extract preferredDERP from the last report, if available. This will be used
+	// in captive portal detection and DERP flapping suppression. Ideally this would
+	// be the current active home DERP rather than the last report preferred DERP,
+	// but only the latter is presently available.
 	var preferredDERP int
 	if last != nil {
 		preferredDERP = last.PreferredDERP
@@ -848,7 +886,7 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe
 
 	var plan probePlan
 	if opts == nil || !opts.OnlyTCP443 {
-		plan = makeProbePlan(dm, ifState, last)
+		plan = makeProbePlan(dm, ifState, last, preferredDERP)
 	}
 
 	// If we're doing a full probe, also check for a captive portal. We
net/netcheck/netcheck_test.go
@@ -590,6 +590,40 @@ func TestMakeProbePlan(t *testing.T) {
 				"region-3-v4": []probe{p("3a", 4)},
 			},
 		},
+		{
+			// #13969: ensure that the prior/current home region is always included in
+			// probe plans, so that we don't flap between regions due to a single major
+			// netcheck having excluded the home region due to a spuriously high sample.
+			name:    "ensure_home_region_inclusion",
+			dm:      basicMap,
+			have6if: true,
+			last: &Report{
+				RegionLatency: map[int]time.Duration{
+					1: 50 * time.Millisecond,
+					2: 20 * time.Millisecond,
+					3: 30 * time.Millisecond,
+					4: 40 * time.Millisecond,
+				},
+				RegionV4Latency: map[int]time.Duration{
+					1: 50 * time.Millisecond,
+					2: 20 * time.Millisecond,
+				},
+				RegionV6Latency: map[int]time.Duration{
+					3: 30 * time.Millisecond,
+					4: 40 * time.Millisecond,
+				},
+				PreferredDERP: 1,
+			},
+			want: probePlan{
+				"region-1-v4": []probe{p("1a", 4), p("1a", 4, 60*ms), p("1a", 4, 220*ms), p("1a", 4, 330*ms)},
+				"region-1-v6": []probe{p("1a", 6), p("1a", 6, 60*ms), p("1a", 6, 220*ms), p("1a", 6, 330*ms)},
+				"region-2-v4": []probe{p("2a", 4), p("2b", 4, 24*ms)},
+				"region-2-v6": []probe{p("2a", 6), p("2b", 6, 24*ms)},
+				"region-3-v4": []probe{p("3a", 4), p("3b", 4, 36*ms)},
+				"region-3-v6": []probe{p("3a", 6), p("3b", 6, 36*ms)},
+				"region-4-v4": []probe{p("4a", 4)},
+			},
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
@@ -597,7 +631,11 @@ func TestMakeProbePlan(t *testing.T) {
 				HaveV6: tt.have6if,
 				HaveV4: !tt.no4,
 			}
-			got := makeProbePlan(tt.dm, ifState, tt.last)
+			preferredDERP := 0
+			if tt.last != nil {
+				preferredDERP = tt.last.PreferredDERP
+			}
+			got := makeProbePlan(tt.dm, ifState, tt.last, preferredDERP)
 			if !reflect.DeepEqual(got, tt.want) {
 				t.Errorf("unexpected plan; got:\n%v\nwant:\n%v\n", got, tt.want)
 			}
@@ -770,7 +808,7 @@ func TestSortRegions(t *testing.T) {
 	report.RegionLatency[3] = time.Second * time.Duration(6)
 	report.RegionLatency[4] = time.Second * time.Duration(0)
 	report.RegionLatency[5] = time.Second * time.Duration(2)
-	sortedMap := sortRegions(unsortedMap, report)
+	sortedMap := sortRegions(unsortedMap, report, 0)
 
 	// Sorting by latency this should result in rid: 5, 2, 1, 3
 	// rid 4 with latency 0 should be at the end
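To exercise the new behavior locally, the affected tests can be run from a checkout of the repository with standard Go tooling (the package path is assumed from the commit title):

go test ./net/netcheck -run 'TestMakeProbePlan|TestSortRegions'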