ipn/ipnlocal: fix setAuthURL / setWgengineStatus race condition (#17408)

If a wgengine status update was received while we were processing an auth URL,
there was a race condition where authURL could be reset to "" immediately after
we set it.

To fix this, we now check that we are moving from a non-Running state into the
Running state before resetting the URL, rather than resetting it whenever we
"move" into Running even if Running is already the current state.
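
As a rough sketch of that check (simplified stand-ins, not the real ipn.State
or LocalBackend types; the actual change is in enterStateLockedOnEntry in the
diff below):

	package main

	import "fmt"

	type state int

	const (
		needsLogin state = iota
		starting
		running
	)

	type backend struct {
		state   state
		authURL string
	}

	// enterState models the fix: clear the auth URL only when moving from a
	// non-Running state into Running. Previously the URL was cleared whenever
	// the new state was Running, so an engine status that re-asserted Running
	// while an auth URL was being processed wiped the URL out.
	func (b *backend) enterState(newState state) {
		oldState := b.state
		b.state = newState
		if newState == running && oldState != running {
			b.authURL = ""
		}
	}

	func main() {
		b := &backend{state: running}
		b.authURL = "https://example.com/a/123" // control sends an auth URL
		b.enterState(running)                   // in-flight engine status re-asserts Running
		fmt.Printf("authURL kept: %q\n", b.authURL)
	}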

We also need to make sure that we do not return from stopEngineAndWait until
the engine is stopped: previously we returned as soon as we received any engine
status update, but that update might already have been in flight before we
asked the engine to stop. Now we wait until we see an update that indicates a
stopped engine, or until we see that the engine is unblocked again, which means
the engine stopped and then started again before we had a chance to observe the
stopped state.
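
A minimal sketch of that wait loop using the standard sync.Cond pattern, with
simplified stand-in fields (backend, numLive, liveDERPs, blocked) in place of
the real engineStatus and blocked state; the actual change is in
requestEngineStatusAndWaitForStopped in the diff below:

	package main

	import (
		"fmt"
		"sync"
		"time"
	)

	type backend struct {
		mu            sync.Mutex
		statusChanged *sync.Cond
		numLive       int  // live peers in the last engine status
		liveDERPs     int  // live DERP connections in the last engine status
		blocked       bool // engine updates are blocked pending auth
	}

	// waitForStopped keeps waiting until a status shows the engine is down, or
	// until the engine has been unblocked again (it stopped and restarted while
	// we were waiting). Returning on the first status would accept an update
	// that was already in flight before the stop was requested.
	func (b *backend) waitForStopped() {
		b.mu.Lock()
		defer b.mu.Unlock()
		for {
			b.statusChanged.Wait() // releases b.mu while waiting
			if !b.blocked {
				return // engine restarted; not safe to keep waiting
			}
			if b.numLive == 0 && b.liveDERPs == 0 {
				return // engine is stopped
			}
		}
	}

	// onStatus records an engine status and wakes any waiter. Broadcasting
	// under the same mutex the waiter uses mirrors broadcastStatusChanged in
	// the real code.
	func (b *backend) onStatus(numLive, liveDERPs int) {
		b.mu.Lock()
		b.numLive, b.liveDERPs = numLive, liveDERPs
		b.statusChanged.Broadcast()
		b.mu.Unlock()
	}

	func main() {
		b := &backend{numLive: 1, liveDERPs: 1, blocked: true}
		b.statusChanged = sync.NewCond(&b.mu)
		go func() {
			time.Sleep(10 * time.Millisecond)
			b.onStatus(1, 1) // stale "still up" update: keep waiting
			time.Sleep(10 * time.Millisecond)
			b.onStatus(0, 0) // "stopped" update: waitForStopped returns
		}()
		b.waitForStopped()
		fmt.Println("engine stopped")
	}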

Updates #17388

Signed-off-by: James Sanderson <jsanderson@tailscale.com>
Co-authored-by: Nick Khyl <nickk@tailscale.com>
Author:    James 'zofrex' Sanderson
Committed: 2025-10-06 22:48:43 +01:00 by GitHub
Commit:    7407f404d9 (parent d816454a88)

3 changed files with 278 additions and 24 deletions


@@ -313,9 +313,8 @@ type LocalBackend struct {
 	serveListeners     map[netip.AddrPort]*localListener // listeners for local serve traffic
 	serveProxyHandlers sync.Map                          // string (HTTPHandler.Proxy) => *reverseProxy
-	// statusLock must be held before calling statusChanged.Wait() or
+	// mu must be held before calling statusChanged.Wait() or
 	// statusChanged.Broadcast().
-	statusLock    sync.Mutex
 	statusChanged *sync.Cond
 	// dialPlan is any dial plan that we've received from the control
@@ -542,7 +541,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
 	b.setTCPPortsIntercepted(nil)
-	b.statusChanged = sync.NewCond(&b.statusLock)
+	b.statusChanged = sync.NewCond(&b.mu)
 	b.e.SetStatusCallback(b.setWgengineStatus)
 	b.prevIfState = netMon.InterfaceState()
@@ -2265,14 +2264,15 @@ func (b *LocalBackend) setWgengineStatus(s *wgengine.Status, err error) {
 	b.send(ipn.Notify{Engine: &es})
 }
+// broadcastStatusChanged must not be called with b.mu held.
 func (b *LocalBackend) broadcastStatusChanged() {
 	// The sync.Cond docs say: "It is allowed but not required for the caller to hold c.L during the call."
-	// In this particular case, we must acquire b.statusLock. Otherwise we might broadcast before
+	// In this particular case, we must acquire b.mu. Otherwise we might broadcast before
 	// the waiter (in requestEngineStatusAndWait) starts to wait, in which case
 	// the waiter can get stuck indefinitely. See PR 2865.
-	b.statusLock.Lock()
+	b.mu.Lock()
 	b.statusChanged.Broadcast()
-	b.statusLock.Unlock()
+	b.mu.Unlock()
 }
 // SetNotifyCallback sets the function to call when the backend has something to
@@ -3343,11 +3343,12 @@ func (b *LocalBackend) popBrowserAuthNow(url string, keyExpired bool, recipient
 	if !b.seamlessRenewalEnabled() || keyExpired {
 		b.blockEngineUpdates(true)
 		b.stopEngineAndWait()
+		if b.State() == ipn.Running {
+			b.enterState(ipn.Starting)
+		}
 	}
 	b.tellRecipientToBrowseToURL(url, toNotificationTarget(recipient))
-	if b.State() == ipn.Running {
-		b.enterState(ipn.Starting)
-	}
 }
 // validPopBrowserURL reports whether urlStr is a valid value for a
@@ -5513,7 +5514,13 @@ func (b *LocalBackend) enterStateLockedOnEntry(newState ipn.State, unlock unlock
 	activeLogin := b.activeLogin
 	authURL := b.authURL
 	if newState == ipn.Running {
-		b.resetAuthURLLocked()
+		// TODO(zofrex): Is this needed? As of 2025-10-03 it doesn't seem to be
+		// necessary when logging in or authenticating. When do we need to reset it
+		// here, rather than the other places it is reset? We should test if it is
+		// necessary and add unit tests to cover those cases, or remove it.
+		if oldState != ipn.Running {
+			b.resetAuthURLLocked()
+		}
 		// Start a captive portal detection loop if none has been
 		// started. Create a new context if none is present, since it
@@ -5750,29 +5757,38 @@ func (u unlockOnce) UnlockEarly() {
 }
 // stopEngineAndWait deconfigures the local network data plane, and
-// waits for it to deliver a status update before returning.
-//
-// TODO(danderson): this may be racy. We could unblock upon receiving
-// a status update that predates the "I've shut down" update.
+// waits for it to deliver a status update indicating it has stopped
+// before returning.
 func (b *LocalBackend) stopEngineAndWait() {
 	b.logf("stopEngineAndWait...")
 	b.e.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{})
-	b.requestEngineStatusAndWait()
+	b.requestEngineStatusAndWaitForStopped()
 	b.logf("stopEngineAndWait: done.")
 }
-// Requests the wgengine status, and does not return until the status
-// was delivered (to the usual callback).
-func (b *LocalBackend) requestEngineStatusAndWait() {
-	b.logf("requestEngineStatusAndWait")
+// Requests the wgengine status, and does not return until a status was
+// delivered (to the usual callback) that indicates the engine is stopped.
+func (b *LocalBackend) requestEngineStatusAndWaitForStopped() {
+	b.logf("requestEngineStatusAndWaitForStopped")
-	b.statusLock.Lock()
-	defer b.statusLock.Unlock()
+	b.mu.Lock()
+	defer b.mu.Unlock()
 	b.goTracker.Go(b.e.RequestStatus)
-	b.logf("requestEngineStatusAndWait: waiting...")
-	b.statusChanged.Wait() // temporarily releases lock while waiting
-	b.logf("requestEngineStatusAndWait: got status update.")
+	b.logf("requestEngineStatusAndWaitForStopped: waiting...")
+	for {
+		b.statusChanged.Wait() // temporarily releases lock while waiting
+		if !b.blocked {
+			b.logf("requestEngineStatusAndWaitForStopped: engine is no longer blocked, must have stopped and started again, not safe to wait.")
+			break
+		}
+		if b.engineStatus.NumLive == 0 && b.engineStatus.LiveDERPs == 0 {
+			b.logf("requestEngineStatusAndWaitForStopped: engine is stopped.")
+			break
+		}
+		b.logf("requestEngineStatusAndWaitForStopped: engine is still running. Waiting...")
+	}
 }
 // setControlClientLocked sets the control client to cc,


@@ -1505,6 +1505,15 @@ func wantExitNodeIDNotify(want tailcfg.StableNodeID) wantedNotification {
 	}
 }
+func wantStateNotify(want ipn.State) wantedNotification {
+	return wantedNotification{
+		name: "State=" + want.String(),
+		cond: func(_ testing.TB, _ ipnauth.Actor, n *ipn.Notify) bool {
+			return n.State != nil && *n.State == want
+		},
+	}
+}
 func TestInternalAndExternalInterfaces(t *testing.T) {
 	type interfacePrefix struct {
 		i netmon.Interface


@@ -1561,6 +1561,235 @@ func TestEngineReconfigOnStateChange(t *testing.T) {
 	}
 }
+// TestStateMachineURLRace tests that wgengine updates arriving in the middle of
+// processing an auth URL doesn't result in the auth URL being cleared.
+func TestStateMachineURLRace(t *testing.T) {
+	runTestStateMachineURLRace(t, false)
+}
+func TestStateMachineURLRaceSeamless(t *testing.T) {
+	runTestStateMachineURLRace(t, true)
+}
+func runTestStateMachineURLRace(t *testing.T, seamless bool) {
+	var cc *mockControl
+	b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
+		cc = newClient(t, opts)
+		return cc
+	})
+	nw := newNotificationWatcher(t, b, &ipnauth.TestActor{})
+	t.Logf("Start")
+	nw.watch(0, []wantedNotification{
+		wantStateNotify(ipn.NeedsLogin)})
+	b.Start(ipn.Options{
+		UpdatePrefs: &ipn.Prefs{
+			WantRunning: true,
+			ControlURL:  "https://localhost:1/",
+		},
+	})
+	nw.check()
+	t.Logf("LoginFinished")
+	cc.persist.UserProfile.LoginName = "user1"
+	cc.persist.NodeID = "node1"
+	if seamless {
+		b.sys.ControlKnobs().SeamlessKeyRenewal.Store(true)
+	}
+	nw.watch(0, []wantedNotification{
+		wantStateNotify(ipn.Starting)})
+	cc.send(nil, "", true, &netmap.NetworkMap{
+		SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
+	})
+	nw.check()
+	t.Logf("Running")
+	nw.watch(0, []wantedNotification{
+		wantStateNotify(ipn.Running)})
+	b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
+	nw.check()
+	t.Logf("Re-auth (StartLoginInteractive)")
+	b.StartLoginInteractive(t.Context())
+	stop := make(chan struct{})
+	stopSpamming := sync.OnceFunc(func() {
+		stop <- struct{}{}
+	})
+	// if seamless renewal is enabled, the engine won't be disabled, and we won't
+	// ever call stopSpamming, so make sure it does get called
+	defer stopSpamming()
+	// Intercept updates between the engine and localBackend, so that we can see
+	// when the "stopped" update comes in and ensure we stop sending our "we're
+	// up" updates after that point.
+	b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
+		// This is not one of our fake status updates, this is generated from the
+		// engine in response to LocalBackend calling RequestStatus. Stop spamming
+		// our fake statuses.
+		//
+		// TODO(zofrex): This is fragile, it works right now but would break if the
+		// calling pattern of RequestStatus changes. We should ensure that we keep
+		// sending "we're up" statuses right until Reconfig is called with
+		// zero-valued configs, and after that point only send "stopped" statuses.
+		stopSpamming()
+		// Once stopSpamming returns we are guaranteed to not send any more updates,
+		// so we can now send the real update (indicating shutdown) and be certain
+		// it will be received after any fake updates we sent. This is possibly a
+		// stronger guarantee than we get from the real engine?
+		b.setWgengineStatus(s, err)
+	})
+	// time needs to be >= last time for the status to be accepted, send all our
+	// spam with the same stale time so that when a real update comes in it will
+	// definitely be accepted.
+	time := b.lastStatusTime
+	// Flood localBackend with a lot of wgengine status updates, so if there are
+	// any race conditions in the multiple locks/unlocks that happen as we process
+	// the received auth URL, we will hit them.
+	go func() {
+		t.Logf("sending lots of fake wgengine status updates")
+		for {
+			select {
+			case <-stop:
+				t.Logf("stopping fake wgengine status updates")
+				return
+			default:
+				b.setWgengineStatus(&wgengine.Status{AsOf: time, DERPs: 1}, nil)
+			}
+		}
+	}()
+	t.Logf("Re-auth (receive URL)")
+	url1 := "https://localhost:1/1"
+	cc.send(nil, url1, false, nil)
+	// Don't need to wait on anything else - once .send completes, authURL should
+	// be set, and once .send has completed, any opportunities for a WG engine
+	// status update to trample it have ended as well.
+	if b.authURL == "" {
+		t.Fatalf("expected authURL to be set")
+	}
+}
+func TestWGEngineDownThenUpRace(t *testing.T) {
+	var cc *mockControl
+	b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
+		cc = newClient(t, opts)
+		return cc
+	})
+	nw := newNotificationWatcher(t, b, &ipnauth.TestActor{})
+	t.Logf("Start")
+	nw.watch(0, []wantedNotification{
+		wantStateNotify(ipn.NeedsLogin)})
+	b.Start(ipn.Options{
+		UpdatePrefs: &ipn.Prefs{
+			WantRunning: true,
+			ControlURL:  "https://localhost:1/",
+		},
+	})
+	nw.check()
+	t.Logf("LoginFinished")
+	cc.persist.UserProfile.LoginName = "user1"
+	cc.persist.NodeID = "node1"
+	nw.watch(0, []wantedNotification{
+		wantStateNotify(ipn.Starting)})
+	cc.send(nil, "", true, &netmap.NetworkMap{
+		SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
+	})
+	nw.check()
+	nw.watch(0, []wantedNotification{
+		wantStateNotify(ipn.Running)})
+	b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
+	nw.check()
+	t.Logf("Re-auth (StartLoginInteractive)")
+	b.StartLoginInteractive(t.Context())
+	var timeLock sync.RWMutex
+	timestamp := b.lastStatusTime
+	engineShutdown := make(chan struct{})
+	gotShutdown := sync.OnceFunc(func() {
+		t.Logf("engineShutdown")
+		engineShutdown <- struct{}{}
+	})
+	b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
+		timeLock.Lock()
+		if s.AsOf.After(timestamp) {
+			timestamp = s.AsOf
+		}
+		timeLock.Unlock()
+		if err != nil || (s.DERPs == 0 && len(s.Peers) == 0) {
+			gotShutdown()
+		} else {
+			b.setWgengineStatus(s, err)
+		}
+	})
+	t.Logf("Re-auth (receive URL)")
+	url1 := "https://localhost:1/1"
+	done := make(chan struct{})
+	var wg sync.WaitGroup
+	wg.Go(func() {
+		t.Log("cc.send starting")
+		cc.send(nil, url1, false, nil) // will block until engine stops
+		t.Log("cc.send returned")
+	})
+	<-engineShutdown // will get called once cc.send is blocked
+	gotShutdown = sync.OnceFunc(func() {
+		t.Logf("engineShutdown")
+		engineShutdown <- struct{}{}
+	})
+	wg.Go(func() {
+		t.Log("StartLoginInteractive starting")
+		b.StartLoginInteractive(t.Context()) // will also block until engine stops
+		t.Log("StartLoginInteractive returned")
+	})
+	<-engineShutdown // will get called once StartLoginInteractive is blocked
+	st := controlclient.Status{}
+	st.SetStateForTest(controlclient.StateAuthenticated)
+	b.SetControlClientStatus(cc, st)
+	timeLock.RLock()
+	b.setWgengineStatus(&wgengine.Status{AsOf: timestamp}, nil)           // engine is down event finally arrives
+	b.setWgengineStatus(&wgengine.Status{AsOf: timestamp, DERPs: 1}, nil) // engine is back up
+	timeLock.RUnlock()
+	go func() {
+		wg.Wait()
+		done <- struct{}{}
+	}()
+	t.Log("waiting for .send and .StartLoginInteractive to return")
+	select {
+	case <-done:
+	case <-time.After(10 * time.Second):
+		t.Fatalf("timed out waiting")
+	}
+	t.Log("both returned")
+}
 func buildNetmapWithPeers(self tailcfg.NodeView, peers ...tailcfg.NodeView) *netmap.NetworkMap {
 	const (
 		firstAutoUserID = tailcfg.UserID(10000)