ipn/ipnlocal: fix race in enterState

It would acquire the lock, calculate `nextState`, relase
the lock, then call `enterState` which would acquire the lock
again. There were obvious races there which could lead to
nil panics as seen in a test in a different repo.

```
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x2 addr=0x70 pc=0x1050f2c7c]

goroutine 42240 [running]:
tailscale.com/ipn/ipnlocal.(*LocalBackend).enterStateLockedOnEntry(0x14002154e00, 0x6)
        tailscale.com/ipn/ipnlocal/local.go:3715 +0x30c
tailscale.com/ipn/ipnlocal.(*LocalBackend).enterState(0x14002154e00?, 0x14002e3a140?)
        tailscale.com/ipn/ipnlocal/local.go:3663 +0x8c
tailscale.com/ipn/ipnlocal.(*LocalBackend).stateMachine(0x14001f5e280?)
        tailscale.com/ipn/ipnlocal/local.go:3836 +0x2c
tailscale.com/ipn/ipnlocal.(*LocalBackend).setWgengineStatus(0x14002154e00, 0x14002e3a190, {0x0?, 0x0?})
        tailscale.com/ipn/ipnlocal/local.go:1193 +0x4d0
tailscale.com/wgengine.(*userspaceEngine).RequestStatus(0x14005d90300)
        tailscale.com/wgengine/userspace.go:1051 +0x80
tailscale.com/wgengine.NewUserspaceEngine.func2({0x14002e3a0a0, 0x2, 0x140025cce40?})
        tailscale.com/wgengine/userspace.go:318 +0x1a0
tailscale.com/wgengine/magicsock.(*Conn).updateEndpoints(0x14002154700, {0x105c13eaf, 0xf})
        tailscale.com/wgengine/magicsock/magicsock.go:531 +0x424
created by tailscale.com/wgengine/magicsock.(*Conn).ReSTUN in goroutine 42077
        tailscale.com/wgengine/magicsock/magicsock.go:2142 +0x3a4
```

Updates tailscale/corp#14480

Signed-off-by: Maisem Ali <maisem@tailscale.com>
This commit is contained in:
Maisem Ali 2023-09-05 10:28:01 -07:00 committed by Maisem Ali
parent a4aa6507fa
commit cb3b281e98

View File

@ -3753,10 +3753,11 @@ func (b *LocalBackend) NodeKey() key.NodePublic {
return b.pm.CurrentPrefs().Persist().PublicNodeKey()
}
// nextState returns the state the backend seems to be in, based on
// nextStateLocked returns the state the backend seems to be in, based on
// its internal state.
func (b *LocalBackend) nextState() ipn.State {
b.mu.Lock()
//
// b.mu must be held
func (b *LocalBackend) nextStateLocked() ipn.State {
var (
cc = b.cc
netMap = b.netMap
@ -3772,10 +3773,9 @@ func (b *LocalBackend) nextState() ipn.State {
wantRunning = p.WantRunning()
loggedOut = p.LoggedOut()
}
b.mu.Unlock()
switch {
case !wantRunning && !loggedOut && !blocked && b.hasNodeKey():
case !wantRunning && !loggedOut && !blocked && b.hasNodeKeyLocked():
return ipn.Stopped
case netMap == nil:
if (cc != nil && cc.AuthCantContinue()) || loggedOut {
@ -3838,7 +3838,8 @@ func (b *LocalBackend) RequestEngineStatus() {
// TODO(apenwarr): use a channel or something to prevent reentrancy?
// Or maybe just call the state machine from fewer places.
func (b *LocalBackend) stateMachine() {
b.enterState(b.nextState())
b.mu.Lock()
b.enterStateLockedOnEntry(b.nextStateLocked())
}
// stopEngineAndWait deconfigures the local network data plane, and
@ -3900,7 +3901,6 @@ func (b *LocalBackend) resetControlClientLocked() controlclient.Client {
// don't want to the user to have to reauthenticate in the future
// when they restart the GUI.
func (b *LocalBackend) ResetForClientDisconnect() {
defer b.enterState(ipn.Stopped)
b.logf("LocalBackend.ResetForClientDisconnect")
b.mu.Lock()
@ -3909,7 +3909,6 @@ func (b *LocalBackend) ResetForClientDisconnect() {
// Needs to happen without b.mu held.
defer prevCC.Shutdown()
}
defer b.mu.Unlock()
b.setNetMapLocked(nil)
b.pm.Reset()
@ -3918,6 +3917,7 @@ func (b *LocalBackend) ResetForClientDisconnect() {
b.authURLSticky = ""
b.activeLogin = ""
b.setAtomicValuesFromPrefsLocked(ipn.PrefsView{})
b.enterStateLockedOnEntry(ipn.Stopped)
}
func (b *LocalBackend) ShouldRunSSH() bool { return b.sshAtomicBool.Load() && envknob.CanSSHD() }