From 727acf96a6864493d1741c78d06662739b139ec2 Mon Sep 17 00:00:00 2001 From: Andrew Dunham Date: Fri, 8 Dec 2023 15:03:15 -0500 Subject: [PATCH] net/netcheck: use DERP frames as a signal for home region liveness This uses the fact that we've received a frame from a given DERP region within a certain time as a signal that the region is stil present (and thus can still be a node's PreferredDERP / home region) even if we don't get a STUN response from that region during a netcheck. This should help avoid DERP flaps that occur due to losing STUN probes while still having a valid and active TCP connection to the DERP server. RELNOTE=Reduce home DERP flapping when there's still an active connection Updates #8603 Signed-off-by: Andrew Dunham Change-Id: If7da6312581e1d434d5c0811697319c621e187a0 --- cmd/tailscale/cli/netcheck.go | 2 +- health/health.go | 11 ++++++ net/netcheck/netcheck.go | 59 +++++++++++++++++++++++++++++---- net/netcheck/netcheck_test.go | 53 ++++++++++++++++++++++++++--- wgengine/magicsock/magicsock.go | 12 ++++++- 5 files changed, 123 insertions(+), 14 deletions(-) diff --git a/cmd/tailscale/cli/netcheck.go b/cmd/tailscale/cli/netcheck.go index 95ba5ccfd..f430414eb 100644 --- a/cmd/tailscale/cli/netcheck.go +++ b/cmd/tailscale/cli/netcheck.go @@ -85,7 +85,7 @@ func runNetcheck(ctx context.Context, args []string) error { } for { t0 := time.Now() - report, err := c.GetReport(ctx, dm) + report, err := c.GetReport(ctx, dm, nil) d := time.Since(t0) if netcheckArgs.verbose { c.Logf("GetReport took %v; err=%v", d.Round(time.Millisecond), err) diff --git a/health/health.go b/health/health.go index 92fa01e7d..c040b8b03 100644 --- a/health/health.go +++ b/health/health.go @@ -360,6 +360,8 @@ func SetDERPRegionHealth(region int, problem string) { selfCheckLocked() } +// NoteDERPRegionReceivedFrame is called to note that a frame was received from +// the given DERP region at the current time. func NoteDERPRegionReceivedFrame(region int) { mu.Lock() defer mu.Unlock() @@ -367,6 +369,15 @@ func NoteDERPRegionReceivedFrame(region int) { selfCheckLocked() } +// GetDERPRegionReceivedTime returns the last time that a frame was received +// from the given DERP region, or the zero time if no communication with that +// region has occurred. +func GetDERPRegionReceivedTime(region int) time.Time { + mu.Lock() + defer mu.Unlock() + return derpRegionLastFrame[region] +} + // state is an ipn.State.String() value: "Running", "Stopped", "NeedsLogin", etc. func SetIPNState(state string, wantRunning bool) { mu.Lock() diff --git a/net/netcheck/netcheck.go b/net/netcheck/netcheck.go index 1c2b58853..0cea345c3 100644 --- a/net/netcheck/netcheck.go +++ b/net/netcheck/netcheck.go @@ -523,6 +523,8 @@ func nodeMight4(n *tailcfg.DERPNode) bool { // reportState holds the state for a single invocation of Client.GetReport. type reportState struct { c *Client + start time.Time + opts *GetReportOpts hairTX stun.TxID gotHairSTUN chan netip.AddrPort hairTimeout chan struct{} // closed on timeout @@ -736,10 +738,32 @@ func newReport() *Report { } } -// GetReport gets a report. +// GetReportOpts contains options that can be passed to GetReport. Unless +// specified, all fields are optional and can be left as their zero value. +type GetReportOpts struct { + // GetLastDERPActivity is a callback that, if provided, should return + // the absolute time that the calling code last communicated with a + // given DERP region. This is used to assist in avoiding PreferredDERP + // ("home DERP") flaps. + // + // If no communication with that region has occurred, or it occurred + // too far in the past, this function should return the zero time. + GetLastDERPActivity func(int) time.Time +} + +// getLastDERPActivity calls o.GetLastDERPActivity if both o and +// o.GetLastDERPActivity are non-nil; otherwise it returns the zero time. +func (o *GetReportOpts) getLastDERPActivity(region int) time.Time { + if o == nil || o.GetLastDERPActivity == nil { + return time.Time{} + } + return o.GetLastDERPActivity(region) +} + +// GetReport gets a report. The 'opts' argument is optional and can be nil. // // It may not be called concurrently with itself. -func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (_ *Report, reterr error) { +func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetReportOpts) (_ *Report, reterr error) { defer func() { if reterr != nil { metricNumGetReportError.Add(1) @@ -763,8 +787,11 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (_ *Report, c.mu.Unlock() return nil, errors.New("invalid concurrent call to GetReport") } + now := c.timeNow() rs := &reportState{ c: c, + start: now, + opts: opts, report: newReport(), inFlight: map[stun.TxID]func(netip.AddrPort){}, hairTX: stun.NewTxID(), // random payload @@ -783,8 +810,6 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (_ *Report, preferredDERP = last.PreferredDERP } - now := c.timeNow() - doFull := false if c.nextFull || now.Sub(c.lastFull) > 5*time.Minute { doFull = true @@ -1022,7 +1047,7 @@ func (c *Client) finishAndStoreReport(rs *reportState, dm *tailcfg.DERPMap) *Rep report := rs.report.Clone() rs.mu.Unlock() - c.addReportHistoryAndSetPreferredDERP(report, dm.View()) + c.addReportHistoryAndSetPreferredDERP(rs, report, dm.View()) c.logConciseReport(report, dm) return report @@ -1363,11 +1388,15 @@ func (c *Client) timeNow() time.Time { // node is near region 1 @ 4ms and region 2 @ 5ms, region 1 getting // 5ms slower would cause a flap). preferredDERPAbsoluteDiff = 10 * time.Millisecond + // preferredDERPFrameTime is the time which, if a DERP frame has been + // received within that period, we treat that region as being present + // even without receiving a STUN response. + preferredDERPFrameTime = 2 * time.Second ) // addReportHistoryAndSetPreferredDERP adds r to the set of recent Reports // and mutates r.PreferredDERP to contain the best recent one. -func (c *Client) addReportHistoryAndSetPreferredDERP(r *Report, dm tailcfg.DERPMapView) { +func (c *Client) addReportHistoryAndSetPreferredDERP(rs *reportState, r *Report, dm tailcfg.DERPMapView) { c.mu.Lock() defer c.mu.Unlock() @@ -1441,7 +1470,23 @@ func (c *Client) addReportHistoryAndSetPreferredDERP(r *Report, dm tailcfg.DERPM // still accessible and one of the conditions below is true. keepOld := false changingPreferred := prevDERP != 0 && r.PreferredDERP != prevDERP - oldRegionIsAccessible := oldRegionCurLatency != 0 + + // See if we've heard from our previous preferred DERP (other than via + // the STUN probe) since we started the netcheck, or in the past 2s, as + // another signal for "this region is still working". + heardFromOldRegionRecently := false + if changingPreferred { + if lastHeard := rs.opts.getLastDERPActivity(prevDERP); !lastHeard.IsZero() { + now := c.timeNow() + + heardFromOldRegionRecently = lastHeard.After(rs.start) + heardFromOldRegionRecently = heardFromOldRegionRecently || lastHeard.After(now.Add(-preferredDERPFrameTime)) + } + } + + // The old region is accessible if we've heard from it via a non-STUN + // mechanism, or have a latency (and thus heard back via STUN). + oldRegionIsAccessible := oldRegionCurLatency != 0 || heardFromOldRegionRecently if changingPreferred && oldRegionIsAccessible { // bestAny < any other value, so oldRegionCurLatency - bestAny >= 0 if oldRegionCurLatency-bestAny < preferredDERPAbsoluteDiff { diff --git a/net/netcheck/netcheck_test.go b/net/netcheck/netcheck_test.go index 1ed728a00..80aa5a657 100644 --- a/net/netcheck/netcheck_test.go +++ b/net/netcheck/netcheck_test.go @@ -169,7 +169,7 @@ func TestBasic(t *testing.T) { t.Fatal(err) } - r, err := c.GetReport(ctx, stuntest.DERPMapOf(stunAddr.String())) + r, err := c.GetReport(ctx, stuntest.DERPMapOf(stunAddr.String()), nil) if err != nil { t.Fatal(err) } @@ -208,7 +208,7 @@ func TestWorksWhenUDPBlocked(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond) defer cancel() - r, err := c.GetReport(ctx, dm) + r, err := c.GetReport(ctx, dm, nil) if err != nil { t.Fatal(err) } @@ -260,14 +260,21 @@ func TestAddReportHistoryAndSetPreferredDERP(t *testing.T) { } return r } + mkLDAFunc := func(mm map[int]time.Time) func(int) time.Time { + return func(region int) time.Time { + return mm[region] + } + } type step struct { after time.Duration r *Report } + startTime := time.Unix(123, 0) tests := []struct { name string steps []step homeParams *tailcfg.DERPHomeParams + opts *GetReportOpts wantDERP int // want PreferredDERP on final step wantPrevLen int // wanted len(c.prev) }{ @@ -394,17 +401,53 @@ type step struct { wantPrevLen: 1, wantDERP: 1, }, + { + name: "saw_derp_traffic", + steps: []step{ + {0, report("d1", 2, "d2", 3)}, // (1) initially pick d1 + {2 * time.Second, report("d1", 4, "d2", 3)}, // (2) still d1 + {2 * time.Second, report("d2", 3)}, // (3) d1 gone, but have traffic + }, + opts: &GetReportOpts{ + GetLastDERPActivity: mkLDAFunc(map[int]time.Time{ + 1: startTime.Add(2*time.Second + preferredDERPFrameTime/2), // within active window of step (3) + }), + }, + wantPrevLen: 3, + wantDERP: 1, // still on 1 since we got traffic from it + }, + { + name: "saw_derp_traffic_history", + steps: []step{ + {0, report("d1", 2, "d2", 3)}, // (1) initially pick d1 + {2 * time.Second, report("d1", 4, "d2", 3)}, // (2) still d1 + {2 * time.Second, report("d2", 3)}, // (3) d1 gone, but have traffic + }, + opts: &GetReportOpts{ + GetLastDERPActivity: mkLDAFunc(map[int]time.Time{ + 1: startTime.Add(4*time.Second - preferredDERPFrameTime - 1), // not within active window of (3) + }), + }, + wantPrevLen: 3, + wantDERP: 2, // moved to d2 since d1 is gone + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - fakeTime := time.Unix(123, 0) + fakeTime := startTime c := &Client{ TimeNow: func() time.Time { return fakeTime }, } dm := &tailcfg.DERPMap{HomeParams: tt.homeParams} + rs := &reportState{ + c: c, + start: fakeTime, + opts: tt.opts, + } for _, s := range tt.steps { fakeTime = fakeTime.Add(s.after) - c.addReportHistoryAndSetPreferredDERP(s.r, dm.View()) + rs.start = fakeTime.Add(-100 * time.Millisecond) + c.addReportHistoryAndSetPreferredDERP(rs, s.r, dm.View()) } lastReport := tt.steps[len(tt.steps)-1].r if got, want := len(c.prev), tt.wantPrevLen; got != want { @@ -868,7 +911,7 @@ func TestNoCaptivePortalWhenUDP(t *testing.T) { t.Fatal(err) } - r, err := c.GetReport(ctx, stuntest.DERPMapOf(stunAddr.String())) + r, err := c.GetReport(ctx, stuntest.DERPMapOf(stunAddr.String()), nil) if err != nil { t.Fatal(err) } diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index 537901cc0..ea4ebb05f 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -628,7 +628,17 @@ func (c *Conn) updateNetInfo(ctx context.Context) (*netcheck.Report, error) { ctx, cancel := context.WithTimeout(ctx, 2*time.Second) defer cancel() - report, err := c.netChecker.GetReport(ctx, dm) + report, err := c.netChecker.GetReport(ctx, dm, &netcheck.GetReportOpts{ + // Pass information about the last time that we received a + // frame from a DERP server to our netchecker to help avoid + // flapping the home region while there's still active + // communication. + // + // NOTE(andrew-d): I don't love that we're depending on the + // health package here, but I'd rather do that and not store + // the exact same state in two different places. + GetLastDERPActivity: health.GetDERPRegionReceivedTime, + }) if err != nil { return nil, err }