wgengine/magicsock: don't change DERP home when not connected to control

This pretty much always results in an outage because peers won't
discover our new home region and thus won't be able to establish
connectivity.

Updates tailscale/corp#18095

Signed-off-by: Andrew Dunham <andrew@du.nham.ca>
Change-Id: Ic0d09133f198b528dd40c6383b16d7663d9d37a7
This commit is contained in:
Andrew Dunham 2024-03-08 12:32:15 -05:00
parent 54e52532eb
commit f072d017bd
3 changed files with 172 additions and 11 deletions

View File

@ -23,6 +23,7 @@
"tailscale.com/health"
"tailscale.com/logtail/backoff"
"tailscale.com/net/dnscache"
"tailscale.com/net/netcheck"
"tailscale.com/net/tsaddr"
"tailscale.com/syncs"
"tailscale.com/tailcfg"
@ -31,6 +32,7 @@
"tailscale.com/types/logger"
"tailscale.com/util/mak"
"tailscale.com/util/sysresources"
"tailscale.com/util/testenv"
)
// useDerpRoute reports whether magicsock should enable the DERP
@ -85,7 +87,10 @@ type activeDerp struct {
createTime time.Time
}
var processStartUnixNano = time.Now().UnixNano()
var (
processStartUnixNano = time.Now().UnixNano()
pickDERPFallbackForTests func() int
)
// pickDERPFallback returns a non-zero but deterministic DERP node to
// connect to. This is only used if netcheck couldn't find the
@ -121,11 +126,59 @@ func (c *Conn) pickDERPFallback() int {
return c.myDerp
}
if pickDERPFallbackForTests != nil {
return pickDERPFallbackForTests()
}
h := fnv.New64()
fmt.Fprintf(h, "%p/%d", c, processStartUnixNano) // arbitrary
return ids[rand.New(rand.NewSource(int64(h.Sum64()))).Intn(len(ids))]
}
// This allows existing tests to pass, but allows us to still test the
// behaviour during tests.
var checkControlHealthDuringNearestDERPInTests = false
// maybeSetNearestDERP selects and changes the nearest/preferred DERP server
// based on the netcheck report and other heuristics. It returns the DERP
// region that it selected and set (via setNearestDERP).
//
// c.mu must NOT be held.
func (c *Conn) maybeSetNearestDERP(report *netcheck.Report) (preferredDERP int) {
// Don't change our PreferredDERP if we don't have a connection to
// control; if we don't, then we can't inform peers about a DERP home
// change, which breaks all connectivity. Even if this DERP region is
// down, changing our home DERP isn't correct since peers can't
// discover that change.
//
// See https://github.com/tailscale/corp/issues/18095
//
// For tests, always assume we're connected to control unless we're
// explicitly testing this behaviour.
var connectedToControl bool
if testenv.InTest() && !checkControlHealthDuringNearestDERPInTests {
connectedToControl = true
} else {
connectedToControl = health.GetInPollNetMap()
}
if !connectedToControl {
c.mu.Lock()
defer c.mu.Unlock()
return c.myDerp
}
preferredDERP = report.PreferredDERP
if preferredDERP == 0 {
// Perhaps UDP is blocked. Pick a deterministic but arbitrary
// one.
preferredDERP = c.pickDERPFallback()
}
if !c.setNearestDERP(preferredDERP) {
preferredDERP = 0
}
return
}
func (c *Conn) derpRegionCodeLocked(regionID int) string {
if c.derpMap == nil {
return ""

View File

@ -682,16 +682,7 @@ func (c *Conn) updateNetInfo(ctx context.Context) (*netcheck.Report, error) {
ni.OSHasIPv6.Set(report.OSHasIPv6)
ni.WorkingUDP.Set(report.UDP)
ni.WorkingICMPv4.Set(report.ICMPv4)
ni.PreferredDERP = report.PreferredDERP
if ni.PreferredDERP == 0 {
// Perhaps UDP is blocked. Pick a deterministic but arbitrary
// one.
ni.PreferredDERP = c.pickDERPFallback()
}
if !c.setNearestDERP(ni.PreferredDERP) {
ni.PreferredDERP = 0
}
ni.PreferredDERP = c.maybeSetNearestDERP(report)
ni.FirewallMode = hostinfo.FirewallMode()
c.callNetInfoCallback(ni)

View File

@ -41,9 +41,11 @@
"tailscale.com/derp/derphttp"
"tailscale.com/disco"
"tailscale.com/envknob"
"tailscale.com/health"
"tailscale.com/ipn/ipnstate"
"tailscale.com/net/connstats"
"tailscale.com/net/netaddr"
"tailscale.com/net/netcheck"
"tailscale.com/net/packet"
"tailscale.com/net/ping"
"tailscale.com/net/stun/stuntest"
@ -3017,3 +3019,118 @@ func TestAddrForPingSizeLocked(t *testing.T) {
})
}
}
func TestMaybeSetNearestDERP(t *testing.T) {
derpMap := &tailcfg.DERPMap{
Regions: map[int]*tailcfg.DERPRegion{
1: {
RegionID: 1,
RegionCode: "test",
Nodes: []*tailcfg.DERPNode{
{
Name: "t1",
RegionID: 1,
HostName: "test-node.unused",
IPv4: "127.0.0.1",
IPv6: "none",
},
},
},
21: {
RegionID: 21,
RegionCode: "tor",
Nodes: []*tailcfg.DERPNode{
{
Name: "21b",
RegionID: 21,
HostName: "tor.test-node.unused",
IPv4: "127.0.0.1",
IPv6: "none",
},
},
},
31: {
RegionID: 31,
RegionCode: "fallback",
Nodes: []*tailcfg.DERPNode{
{
Name: "31b",
RegionID: 31,
HostName: "fallback.test-node.unused",
IPv4: "127.0.0.1",
IPv6: "none",
},
},
},
},
}
// Ensure that our fallback code always picks a deterministic value.
tstest.Replace(t, &pickDERPFallbackForTests, func() int { return 31 })
// Actually test this code path.
tstest.Replace(t, &checkControlHealthDuringNearestDERPInTests, true)
testCases := []struct {
name string
old int
reportDERP int
connectedToControl bool
want int
}{
{
name: "connected_with_report_derp",
old: 1,
reportDERP: 21,
connectedToControl: true,
want: 21,
},
{
name: "not_connected_with_report_derp",
old: 1,
reportDERP: 21,
connectedToControl: false,
want: 1, // no change
},
{
name: "connected_no_derp",
old: 1,
reportDERP: 0,
connectedToControl: true,
want: 1, // no change
},
{
name: "connected_no_derp_fallback",
old: 0,
reportDERP: 0,
connectedToControl: true,
want: 31, // deterministic fallback
},
}
for _, tt := range testCases {
t.Run(tt.name, func(t *testing.T) {
c := newConn()
c.logf = t.Logf
c.myDerp = tt.old
c.derpMap = derpMap
report := &netcheck.Report{PreferredDERP: tt.reportDERP}
oldConnected := health.GetInPollNetMap()
if tt.connectedToControl != oldConnected {
if tt.connectedToControl {
health.GotStreamedMapResponse()
t.Cleanup(health.SetOutOfPollNetMap)
} else {
health.SetOutOfPollNetMap()
t.Cleanup(health.GotStreamedMapResponse)
}
}
got := c.maybeSetNearestDERP(report)
if got != tt.want {
t.Errorf("got new DERP region %d, want %d", got, tt.want)
}
})
}
}