mirror of
https://github.com/tailscale/tailscale.git
synced 2025-02-18 02:48:40 +00:00
net/netcheck: use DERP frames as a signal for home region liveness
This uses the fact that we've received a frame from a given DERP region within a certain time as a signal that the region is stil present (and thus can still be a node's PreferredDERP / home region) even if we don't get a STUN response from that region during a netcheck. This should help avoid DERP flaps that occur due to losing STUN probes while still having a valid and active TCP connection to the DERP server. RELNOTE=Reduce home DERP flapping when there's still an active connection Updates #8603 Signed-off-by: Andrew Dunham <andrew@du.nham.ca> Change-Id: If7da6312581e1d434d5c0811697319c621e187a0
This commit is contained in:
parent
bac4890467
commit
727acf96a6
@ -85,7 +85,7 @@ func runNetcheck(ctx context.Context, args []string) error {
|
|||||||
}
|
}
|
||||||
for {
|
for {
|
||||||
t0 := time.Now()
|
t0 := time.Now()
|
||||||
report, err := c.GetReport(ctx, dm)
|
report, err := c.GetReport(ctx, dm, nil)
|
||||||
d := time.Since(t0)
|
d := time.Since(t0)
|
||||||
if netcheckArgs.verbose {
|
if netcheckArgs.verbose {
|
||||||
c.Logf("GetReport took %v; err=%v", d.Round(time.Millisecond), err)
|
c.Logf("GetReport took %v; err=%v", d.Round(time.Millisecond), err)
|
||||||
|
@ -360,6 +360,8 @@ func SetDERPRegionHealth(region int, problem string) {
|
|||||||
selfCheckLocked()
|
selfCheckLocked()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NoteDERPRegionReceivedFrame is called to note that a frame was received from
|
||||||
|
// the given DERP region at the current time.
|
||||||
func NoteDERPRegionReceivedFrame(region int) {
|
func NoteDERPRegionReceivedFrame(region int) {
|
||||||
mu.Lock()
|
mu.Lock()
|
||||||
defer mu.Unlock()
|
defer mu.Unlock()
|
||||||
@ -367,6 +369,15 @@ func NoteDERPRegionReceivedFrame(region int) {
|
|||||||
selfCheckLocked()
|
selfCheckLocked()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetDERPRegionReceivedTime returns the last time that a frame was received
|
||||||
|
// from the given DERP region, or the zero time if no communication with that
|
||||||
|
// region has occurred.
|
||||||
|
func GetDERPRegionReceivedTime(region int) time.Time {
|
||||||
|
mu.Lock()
|
||||||
|
defer mu.Unlock()
|
||||||
|
return derpRegionLastFrame[region]
|
||||||
|
}
|
||||||
|
|
||||||
// state is an ipn.State.String() value: "Running", "Stopped", "NeedsLogin", etc.
|
// state is an ipn.State.String() value: "Running", "Stopped", "NeedsLogin", etc.
|
||||||
func SetIPNState(state string, wantRunning bool) {
|
func SetIPNState(state string, wantRunning bool) {
|
||||||
mu.Lock()
|
mu.Lock()
|
||||||
|
@ -523,6 +523,8 @@ func nodeMight4(n *tailcfg.DERPNode) bool {
|
|||||||
// reportState holds the state for a single invocation of Client.GetReport.
|
// reportState holds the state for a single invocation of Client.GetReport.
|
||||||
type reportState struct {
|
type reportState struct {
|
||||||
c *Client
|
c *Client
|
||||||
|
start time.Time
|
||||||
|
opts *GetReportOpts
|
||||||
hairTX stun.TxID
|
hairTX stun.TxID
|
||||||
gotHairSTUN chan netip.AddrPort
|
gotHairSTUN chan netip.AddrPort
|
||||||
hairTimeout chan struct{} // closed on timeout
|
hairTimeout chan struct{} // closed on timeout
|
||||||
@ -736,10 +738,32 @@ func newReport() *Report {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetReport gets a report.
|
// GetReportOpts contains options that can be passed to GetReport. Unless
|
||||||
|
// specified, all fields are optional and can be left as their zero value.
|
||||||
|
type GetReportOpts struct {
|
||||||
|
// GetLastDERPActivity is a callback that, if provided, should return
|
||||||
|
// the absolute time that the calling code last communicated with a
|
||||||
|
// given DERP region. This is used to assist in avoiding PreferredDERP
|
||||||
|
// ("home DERP") flaps.
|
||||||
|
//
|
||||||
|
// If no communication with that region has occurred, or it occurred
|
||||||
|
// too far in the past, this function should return the zero time.
|
||||||
|
GetLastDERPActivity func(int) time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
// getLastDERPActivity calls o.GetLastDERPActivity if both o and
|
||||||
|
// o.GetLastDERPActivity are non-nil; otherwise it returns the zero time.
|
||||||
|
func (o *GetReportOpts) getLastDERPActivity(region int) time.Time {
|
||||||
|
if o == nil || o.GetLastDERPActivity == nil {
|
||||||
|
return time.Time{}
|
||||||
|
}
|
||||||
|
return o.GetLastDERPActivity(region)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetReport gets a report. The 'opts' argument is optional and can be nil.
|
||||||
//
|
//
|
||||||
// It may not be called concurrently with itself.
|
// It may not be called concurrently with itself.
|
||||||
func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (_ *Report, reterr error) {
|
func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetReportOpts) (_ *Report, reterr error) {
|
||||||
defer func() {
|
defer func() {
|
||||||
if reterr != nil {
|
if reterr != nil {
|
||||||
metricNumGetReportError.Add(1)
|
metricNumGetReportError.Add(1)
|
||||||
@ -763,8 +787,11 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (_ *Report,
|
|||||||
c.mu.Unlock()
|
c.mu.Unlock()
|
||||||
return nil, errors.New("invalid concurrent call to GetReport")
|
return nil, errors.New("invalid concurrent call to GetReport")
|
||||||
}
|
}
|
||||||
|
now := c.timeNow()
|
||||||
rs := &reportState{
|
rs := &reportState{
|
||||||
c: c,
|
c: c,
|
||||||
|
start: now,
|
||||||
|
opts: opts,
|
||||||
report: newReport(),
|
report: newReport(),
|
||||||
inFlight: map[stun.TxID]func(netip.AddrPort){},
|
inFlight: map[stun.TxID]func(netip.AddrPort){},
|
||||||
hairTX: stun.NewTxID(), // random payload
|
hairTX: stun.NewTxID(), // random payload
|
||||||
@ -783,8 +810,6 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (_ *Report,
|
|||||||
preferredDERP = last.PreferredDERP
|
preferredDERP = last.PreferredDERP
|
||||||
}
|
}
|
||||||
|
|
||||||
now := c.timeNow()
|
|
||||||
|
|
||||||
doFull := false
|
doFull := false
|
||||||
if c.nextFull || now.Sub(c.lastFull) > 5*time.Minute {
|
if c.nextFull || now.Sub(c.lastFull) > 5*time.Minute {
|
||||||
doFull = true
|
doFull = true
|
||||||
@ -1022,7 +1047,7 @@ func (c *Client) finishAndStoreReport(rs *reportState, dm *tailcfg.DERPMap) *Rep
|
|||||||
report := rs.report.Clone()
|
report := rs.report.Clone()
|
||||||
rs.mu.Unlock()
|
rs.mu.Unlock()
|
||||||
|
|
||||||
c.addReportHistoryAndSetPreferredDERP(report, dm.View())
|
c.addReportHistoryAndSetPreferredDERP(rs, report, dm.View())
|
||||||
c.logConciseReport(report, dm)
|
c.logConciseReport(report, dm)
|
||||||
|
|
||||||
return report
|
return report
|
||||||
@ -1363,11 +1388,15 @@ const (
|
|||||||
// node is near region 1 @ 4ms and region 2 @ 5ms, region 1 getting
|
// node is near region 1 @ 4ms and region 2 @ 5ms, region 1 getting
|
||||||
// 5ms slower would cause a flap).
|
// 5ms slower would cause a flap).
|
||||||
preferredDERPAbsoluteDiff = 10 * time.Millisecond
|
preferredDERPAbsoluteDiff = 10 * time.Millisecond
|
||||||
|
// preferredDERPFrameTime is the time which, if a DERP frame has been
|
||||||
|
// received within that period, we treat that region as being present
|
||||||
|
// even without receiving a STUN response.
|
||||||
|
preferredDERPFrameTime = 2 * time.Second
|
||||||
)
|
)
|
||||||
|
|
||||||
// addReportHistoryAndSetPreferredDERP adds r to the set of recent Reports
|
// addReportHistoryAndSetPreferredDERP adds r to the set of recent Reports
|
||||||
// and mutates r.PreferredDERP to contain the best recent one.
|
// and mutates r.PreferredDERP to contain the best recent one.
|
||||||
func (c *Client) addReportHistoryAndSetPreferredDERP(r *Report, dm tailcfg.DERPMapView) {
|
func (c *Client) addReportHistoryAndSetPreferredDERP(rs *reportState, r *Report, dm tailcfg.DERPMapView) {
|
||||||
c.mu.Lock()
|
c.mu.Lock()
|
||||||
defer c.mu.Unlock()
|
defer c.mu.Unlock()
|
||||||
|
|
||||||
@ -1441,7 +1470,23 @@ func (c *Client) addReportHistoryAndSetPreferredDERP(r *Report, dm tailcfg.DERPM
|
|||||||
// still accessible and one of the conditions below is true.
|
// still accessible and one of the conditions below is true.
|
||||||
keepOld := false
|
keepOld := false
|
||||||
changingPreferred := prevDERP != 0 && r.PreferredDERP != prevDERP
|
changingPreferred := prevDERP != 0 && r.PreferredDERP != prevDERP
|
||||||
oldRegionIsAccessible := oldRegionCurLatency != 0
|
|
||||||
|
// See if we've heard from our previous preferred DERP (other than via
|
||||||
|
// the STUN probe) since we started the netcheck, or in the past 2s, as
|
||||||
|
// another signal for "this region is still working".
|
||||||
|
heardFromOldRegionRecently := false
|
||||||
|
if changingPreferred {
|
||||||
|
if lastHeard := rs.opts.getLastDERPActivity(prevDERP); !lastHeard.IsZero() {
|
||||||
|
now := c.timeNow()
|
||||||
|
|
||||||
|
heardFromOldRegionRecently = lastHeard.After(rs.start)
|
||||||
|
heardFromOldRegionRecently = heardFromOldRegionRecently || lastHeard.After(now.Add(-preferredDERPFrameTime))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// The old region is accessible if we've heard from it via a non-STUN
|
||||||
|
// mechanism, or have a latency (and thus heard back via STUN).
|
||||||
|
oldRegionIsAccessible := oldRegionCurLatency != 0 || heardFromOldRegionRecently
|
||||||
if changingPreferred && oldRegionIsAccessible {
|
if changingPreferred && oldRegionIsAccessible {
|
||||||
// bestAny < any other value, so oldRegionCurLatency - bestAny >= 0
|
// bestAny < any other value, so oldRegionCurLatency - bestAny >= 0
|
||||||
if oldRegionCurLatency-bestAny < preferredDERPAbsoluteDiff {
|
if oldRegionCurLatency-bestAny < preferredDERPAbsoluteDiff {
|
||||||
|
@ -169,7 +169,7 @@ func TestBasic(t *testing.T) {
|
|||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
r, err := c.GetReport(ctx, stuntest.DERPMapOf(stunAddr.String()))
|
r, err := c.GetReport(ctx, stuntest.DERPMapOf(stunAddr.String()), nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@ -208,7 +208,7 @@ func TestWorksWhenUDPBlocked(t *testing.T) {
|
|||||||
ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond)
|
ctx, cancel := context.WithTimeout(context.Background(), 250*time.Millisecond)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
r, err := c.GetReport(ctx, dm)
|
r, err := c.GetReport(ctx, dm, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
@ -260,14 +260,21 @@ func TestAddReportHistoryAndSetPreferredDERP(t *testing.T) {
|
|||||||
}
|
}
|
||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
|
mkLDAFunc := func(mm map[int]time.Time) func(int) time.Time {
|
||||||
|
return func(region int) time.Time {
|
||||||
|
return mm[region]
|
||||||
|
}
|
||||||
|
}
|
||||||
type step struct {
|
type step struct {
|
||||||
after time.Duration
|
after time.Duration
|
||||||
r *Report
|
r *Report
|
||||||
}
|
}
|
||||||
|
startTime := time.Unix(123, 0)
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
steps []step
|
steps []step
|
||||||
homeParams *tailcfg.DERPHomeParams
|
homeParams *tailcfg.DERPHomeParams
|
||||||
|
opts *GetReportOpts
|
||||||
wantDERP int // want PreferredDERP on final step
|
wantDERP int // want PreferredDERP on final step
|
||||||
wantPrevLen int // wanted len(c.prev)
|
wantPrevLen int // wanted len(c.prev)
|
||||||
}{
|
}{
|
||||||
@ -394,17 +401,53 @@ func TestAddReportHistoryAndSetPreferredDERP(t *testing.T) {
|
|||||||
wantPrevLen: 1,
|
wantPrevLen: 1,
|
||||||
wantDERP: 1,
|
wantDERP: 1,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "saw_derp_traffic",
|
||||||
|
steps: []step{
|
||||||
|
{0, report("d1", 2, "d2", 3)}, // (1) initially pick d1
|
||||||
|
{2 * time.Second, report("d1", 4, "d2", 3)}, // (2) still d1
|
||||||
|
{2 * time.Second, report("d2", 3)}, // (3) d1 gone, but have traffic
|
||||||
|
},
|
||||||
|
opts: &GetReportOpts{
|
||||||
|
GetLastDERPActivity: mkLDAFunc(map[int]time.Time{
|
||||||
|
1: startTime.Add(2*time.Second + preferredDERPFrameTime/2), // within active window of step (3)
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
wantPrevLen: 3,
|
||||||
|
wantDERP: 1, // still on 1 since we got traffic from it
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "saw_derp_traffic_history",
|
||||||
|
steps: []step{
|
||||||
|
{0, report("d1", 2, "d2", 3)}, // (1) initially pick d1
|
||||||
|
{2 * time.Second, report("d1", 4, "d2", 3)}, // (2) still d1
|
||||||
|
{2 * time.Second, report("d2", 3)}, // (3) d1 gone, but have traffic
|
||||||
|
},
|
||||||
|
opts: &GetReportOpts{
|
||||||
|
GetLastDERPActivity: mkLDAFunc(map[int]time.Time{
|
||||||
|
1: startTime.Add(4*time.Second - preferredDERPFrameTime - 1), // not within active window of (3)
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
wantPrevLen: 3,
|
||||||
|
wantDERP: 2, // moved to d2 since d1 is gone
|
||||||
|
},
|
||||||
}
|
}
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
fakeTime := time.Unix(123, 0)
|
fakeTime := startTime
|
||||||
c := &Client{
|
c := &Client{
|
||||||
TimeNow: func() time.Time { return fakeTime },
|
TimeNow: func() time.Time { return fakeTime },
|
||||||
}
|
}
|
||||||
dm := &tailcfg.DERPMap{HomeParams: tt.homeParams}
|
dm := &tailcfg.DERPMap{HomeParams: tt.homeParams}
|
||||||
|
rs := &reportState{
|
||||||
|
c: c,
|
||||||
|
start: fakeTime,
|
||||||
|
opts: tt.opts,
|
||||||
|
}
|
||||||
for _, s := range tt.steps {
|
for _, s := range tt.steps {
|
||||||
fakeTime = fakeTime.Add(s.after)
|
fakeTime = fakeTime.Add(s.after)
|
||||||
c.addReportHistoryAndSetPreferredDERP(s.r, dm.View())
|
rs.start = fakeTime.Add(-100 * time.Millisecond)
|
||||||
|
c.addReportHistoryAndSetPreferredDERP(rs, s.r, dm.View())
|
||||||
}
|
}
|
||||||
lastReport := tt.steps[len(tt.steps)-1].r
|
lastReport := tt.steps[len(tt.steps)-1].r
|
||||||
if got, want := len(c.prev), tt.wantPrevLen; got != want {
|
if got, want := len(c.prev), tt.wantPrevLen; got != want {
|
||||||
@ -868,7 +911,7 @@ func TestNoCaptivePortalWhenUDP(t *testing.T) {
|
|||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
r, err := c.GetReport(ctx, stuntest.DERPMapOf(stunAddr.String()))
|
r, err := c.GetReport(ctx, stuntest.DERPMapOf(stunAddr.String()), nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
@ -628,7 +628,17 @@ func (c *Conn) updateNetInfo(ctx context.Context) (*netcheck.Report, error) {
|
|||||||
ctx, cancel := context.WithTimeout(ctx, 2*time.Second)
|
ctx, cancel := context.WithTimeout(ctx, 2*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
report, err := c.netChecker.GetReport(ctx, dm)
|
report, err := c.netChecker.GetReport(ctx, dm, &netcheck.GetReportOpts{
|
||||||
|
// Pass information about the last time that we received a
|
||||||
|
// frame from a DERP server to our netchecker to help avoid
|
||||||
|
// flapping the home region while there's still active
|
||||||
|
// communication.
|
||||||
|
//
|
||||||
|
// NOTE(andrew-d): I don't love that we're depending on the
|
||||||
|
// health package here, but I'd rather do that and not store
|
||||||
|
// the exact same state in two different places.
|
||||||
|
GetLastDERPActivity: health.GetDERPRegionReceivedTime,
|
||||||
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user