Mirror of https://github.com/tailscale/tailscale.git (synced 2025-11-16 19:07:42 +00:00)
ipn/ipnlocal: fix setAuthURL / setWgengineStatus race condition (#17408)
If we received a wg engine status while processing an auth URL, there was a race condition where the authURL could be reset to "" immediately after we set it.

To fix this, we need to check that we are moving from a non-Running state to a Running state, rather than resetting the URL every time we "move" into the Running state even when that is already the current state.

We also need to make sure that we do not return from stopEngineAndWait until the engine is actually stopped: before, we would return as soon as we received any engine status update, but that might have been an update already in flight before we asked the engine to stop. Now we wait until we see an update that indicates a stopped engine, or until we see that the engine is unblocked again, which indicates that the engine stopped and then started again while we were waiting, before we check the state.

Updates #17388

Signed-off-by: James Sanderson <jsanderson@tailscale.com>
Co-authored-by: Nick Khyl <nickk@tailscale.com>
Commit 7407f404d9 (parent d816454a88), committed via GitHub.
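To make the commit message concrete, here is a minimal, self-contained sketch of the two ideas in the fix, using hypothetical names (backend, numLive, liveDERPs, blocked) rather than the real LocalBackend fields: clear the auth URL only on a genuine non-Running to Running transition, and loop on the status condition variable until the engine actually reports that it has stopped. The hunks below show the actual change.

// Minimal sketch only: hypothetical types and field names, not the real LocalBackend.
package main

import (
	"fmt"
	"sync"
)

type backend struct {
	mu            sync.Mutex
	statusChanged *sync.Cond // created with &mu as its Locker

	state     string // e.g. "Starting", "Running"
	authURL   string
	blocked   bool // engine updates are blocked while we wait for it to stop
	numLive   int  // live WireGuard peers reported by the last engine status
	liveDERPs int  // live DERP connections reported by the last engine status
}

// enterState clears the auth URL only on a genuine transition into "Running".
// Before the fix, re-entering "Running" (e.g. because of a stale engine status)
// would also clear a freshly received auth URL.
func (b *backend) enterState(newState string) {
	b.mu.Lock()
	defer b.mu.Unlock()
	if newState == "Running" && b.state != "Running" {
		b.authURL = ""
	}
	b.state = newState
}

// waitForStopped loops on the condition variable until a status indicates the
// engine is actually stopped, instead of returning on the first status update
// (which may have been in flight before the stop was requested).
func (b *backend) waitForStopped() {
	b.mu.Lock()
	defer b.mu.Unlock()
	for {
		b.statusChanged.Wait() // releases mu while waiting
		if !b.blocked {
			// Engine updates were unblocked again: the engine stopped and
			// restarted while we were waiting, so stop waiting.
			return
		}
		if b.numLive == 0 && b.liveDERPs == 0 {
			return // no live peers and no live DERPs: the engine is stopped
		}
	}
}

func main() {
	b := &backend{}
	b.statusChanged = sync.NewCond(&b.mu)
	b.enterState("Running")
	fmt.Println("state:", b.state)
}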
@@ -313,9 +313,8 @@ type LocalBackend struct {
 	serveListeners map[netip.AddrPort]*localListener // listeners for local serve traffic
 	serveProxyHandlers sync.Map // string (HTTPHandler.Proxy) => *reverseProxy
 
-	// statusLock must be held before calling statusChanged.Wait() or
+	// mu must be held before calling statusChanged.Wait() or
 	// statusChanged.Broadcast().
-	statusLock sync.Mutex
 	statusChanged *sync.Cond
 
 	// dialPlan is any dial plan that we've received from the control
@@ -542,7 +541,7 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
 
 	b.setTCPPortsIntercepted(nil)
 
-	b.statusChanged = sync.NewCond(&b.statusLock)
+	b.statusChanged = sync.NewCond(&b.mu)
 	b.e.SetStatusCallback(b.setWgengineStatus)
 
 	b.prevIfState = netMon.InterfaceState()
@@ -2265,14 +2264,15 @@ func (b *LocalBackend) setWgengineStatus(s *wgengine.Status, err error) {
 	b.send(ipn.Notify{Engine: &es})
 }
 
+// broadcastStatusChanged must not be called with b.mu held.
 func (b *LocalBackend) broadcastStatusChanged() {
 	// The sync.Cond docs say: "It is allowed but not required for the caller to hold c.L during the call."
-	// In this particular case, we must acquire b.statusLock. Otherwise we might broadcast before
+	// In this particular case, we must acquire b.mu. Otherwise we might broadcast before
 	// the waiter (in requestEngineStatusAndWait) starts to wait, in which case
 	// the waiter can get stuck indefinitely. See PR 2865.
-	b.statusLock.Lock()
+	b.mu.Lock()
 	b.statusChanged.Broadcast()
-	b.statusLock.Unlock()
+	b.mu.Unlock()
 }
 
 // SetNotifyCallback sets the function to call when the backend has something to
@@ -3343,11 +3343,12 @@ func (b *LocalBackend) popBrowserAuthNow(url string, keyExpired bool, recipient
 	if !b.seamlessRenewalEnabled() || keyExpired {
 		b.blockEngineUpdates(true)
 		b.stopEngineAndWait()
+
+		if b.State() == ipn.Running {
+			b.enterState(ipn.Starting)
+		}
 	}
 	b.tellRecipientToBrowseToURL(url, toNotificationTarget(recipient))
-	if b.State() == ipn.Running {
-		b.enterState(ipn.Starting)
-	}
 }
 
 // validPopBrowserURL reports whether urlStr is a valid value for a
@@ -5513,7 +5514,13 @@ func (b *LocalBackend) enterStateLockedOnEntry(newState ipn.State, unlock unlock
 	activeLogin := b.activeLogin
 	authURL := b.authURL
 	if newState == ipn.Running {
-		b.resetAuthURLLocked()
+		// TODO(zofrex): Is this needed? As of 2025-10-03 it doesn't seem to be
+		// necessary when logging in or authenticating. When do we need to reset it
+		// here, rather than the other places it is reset? We should test if it is
+		// necessary and add unit tests to cover those cases, or remove it.
+		if oldState != ipn.Running {
+			b.resetAuthURLLocked()
+		}
 
 		// Start a captive portal detection loop if none has been
 		// started. Create a new context if none is present, since it
@@ -5750,29 +5757,38 @@ func (u unlockOnce) UnlockEarly() {
 }
 
 // stopEngineAndWait deconfigures the local network data plane, and
-// waits for it to deliver a status update before returning.
-//
-// TODO(danderson): this may be racy. We could unblock upon receiving
-// a status update that predates the "I've shut down" update.
+// waits for it to deliver a status update indicating it has stopped
+// before returning.
 func (b *LocalBackend) stopEngineAndWait() {
 	b.logf("stopEngineAndWait...")
 	b.e.Reconfig(&wgcfg.Config{}, &router.Config{}, &dns.Config{})
-	b.requestEngineStatusAndWait()
+	b.requestEngineStatusAndWaitForStopped()
 	b.logf("stopEngineAndWait: done.")
 }
 
-// Requests the wgengine status, and does not return until the status
-// was delivered (to the usual callback).
-func (b *LocalBackend) requestEngineStatusAndWait() {
-	b.logf("requestEngineStatusAndWait")
+// Requests the wgengine status, and does not return until a status was
+// delivered (to the usual callback) that indicates the engine is stopped.
+func (b *LocalBackend) requestEngineStatusAndWaitForStopped() {
+	b.logf("requestEngineStatusAndWaitForStopped")
 
-	b.statusLock.Lock()
-	defer b.statusLock.Unlock()
+	b.mu.Lock()
+	defer b.mu.Unlock()
 
 	b.goTracker.Go(b.e.RequestStatus)
-	b.logf("requestEngineStatusAndWait: waiting...")
-	b.statusChanged.Wait() // temporarily releases lock while waiting
-	b.logf("requestEngineStatusAndWait: got status update.")
+	b.logf("requestEngineStatusAndWaitForStopped: waiting...")
+	for {
+		b.statusChanged.Wait() // temporarily releases lock while waiting
+
+		if !b.blocked {
+			b.logf("requestEngineStatusAndWaitForStopped: engine is no longer blocked, must have stopped and started again, not safe to wait.")
+			break
+		}
+		if b.engineStatus.NumLive == 0 && b.engineStatus.LiveDERPs == 0 {
+			b.logf("requestEngineStatusAndWaitForStopped: engine is stopped.")
+			break
+		}
+		b.logf("requestEngineStatusAndWaitForStopped: engine is still running. Waiting...")
+	}
 }
 
 // setControlClientLocked sets the control client to cc,
@@ -1505,6 +1505,15 @@ func wantExitNodeIDNotify(want tailcfg.StableNodeID) wantedNotification {
 	}
 }
 
+func wantStateNotify(want ipn.State) wantedNotification {
+	return wantedNotification{
+		name: "State=" + want.String(),
+		cond: func(_ testing.TB, _ ipnauth.Actor, n *ipn.Notify) bool {
+			return n.State != nil && *n.State == want
+		},
+	}
+}
+
 func TestInternalAndExternalInterfaces(t *testing.T) {
 	type interfacePrefix struct {
 		i netmon.Interface
@@ -1561,6 +1561,235 @@ func TestEngineReconfigOnStateChange(t *testing.T) {
 	}
 }
 
+// TestStateMachineURLRace tests that wgengine updates arriving in the middle of
+// processing an auth URL doesn't result in the auth URL being cleared.
+func TestStateMachineURLRace(t *testing.T) {
+	runTestStateMachineURLRace(t, false)
+}
+
+func TestStateMachineURLRaceSeamless(t *testing.T) {
+	runTestStateMachineURLRace(t, true)
+}
+
+func runTestStateMachineURLRace(t *testing.T, seamless bool) {
+	var cc *mockControl
+	b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
+		cc = newClient(t, opts)
+		return cc
+	})
+
+	nw := newNotificationWatcher(t, b, &ipnauth.TestActor{})
+
+	t.Logf("Start")
+	nw.watch(0, []wantedNotification{
+		wantStateNotify(ipn.NeedsLogin)})
+	b.Start(ipn.Options{
+		UpdatePrefs: &ipn.Prefs{
+			WantRunning: true,
+			ControlURL:  "https://localhost:1/",
+		},
+	})
+	nw.check()
+
+	t.Logf("LoginFinished")
+	cc.persist.UserProfile.LoginName = "user1"
+	cc.persist.NodeID = "node1"
+
+	if seamless {
+		b.sys.ControlKnobs().SeamlessKeyRenewal.Store(true)
+	}
+
+	nw.watch(0, []wantedNotification{
+		wantStateNotify(ipn.Starting)})
+	cc.send(nil, "", true, &netmap.NetworkMap{
+		SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
+	})
+	nw.check()
+
+	t.Logf("Running")
+	nw.watch(0, []wantedNotification{
+		wantStateNotify(ipn.Running)})
+	b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
+	nw.check()
+
+	t.Logf("Re-auth (StartLoginInteractive)")
+	b.StartLoginInteractive(t.Context())
+
+	stop := make(chan struct{})
+	stopSpamming := sync.OnceFunc(func() {
+		stop <- struct{}{}
+	})
+	// if seamless renewal is enabled, the engine won't be disabled, and we won't
+	// ever call stopSpamming, so make sure it does get called
+	defer stopSpamming()
+
+	// Intercept updates between the engine and localBackend, so that we can see
+	// when the "stopped" update comes in and ensure we stop sending our "we're
+	// up" updates after that point.
+	b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
+		// This is not one of our fake status updates, this is generated from the
+		// engine in response to LocalBackend calling RequestStatus. Stop spamming
+		// our fake statuses.
+		//
+		// TODO(zofrex): This is fragile, it works right now but would break if the
+		// calling pattern of RequestStatus changes. We should ensure that we keep
+		// sending "we're up" statuses right until Reconfig is called with
+		// zero-valued configs, and after that point only send "stopped" statuses.
+		stopSpamming()
+
+		// Once stopSpamming returns we are guaranteed to not send any more updates,
+		// so we can now send the real update (indicating shutdown) and be certain
+		// it will be received after any fake updates we sent. This is possibly a
+		// stronger guarantee than we get from the real engine?
+		b.setWgengineStatus(s, err)
+	})
+
+	// time needs to be >= last time for the status to be accepted, send all our
+	// spam with the same stale time so that when a real update comes in it will
+	// definitely be accepted.
+	time := b.lastStatusTime
+
+	// Flood localBackend with a lot of wgengine status updates, so if there are
+	// any race conditions in the multiple locks/unlocks that happen as we process
+	// the received auth URL, we will hit them.
+	go func() {
+		t.Logf("sending lots of fake wgengine status updates")
+		for {
+			select {
+			case <-stop:
+				t.Logf("stopping fake wgengine status updates")
+				return
+			default:
+				b.setWgengineStatus(&wgengine.Status{AsOf: time, DERPs: 1}, nil)
+			}
+		}
+	}()
+
+	t.Logf("Re-auth (receive URL)")
+	url1 := "https://localhost:1/1"
+	cc.send(nil, url1, false, nil)
+
+	// Don't need to wait on anything else - once .send completes, authURL should
+	// be set, and once .send has completed, any opportunities for a WG engine
+	// status update to trample it have ended as well.
+	if b.authURL == "" {
+		t.Fatalf("expected authURL to be set")
+	}
+}
+
+func TestWGEngineDownThenUpRace(t *testing.T) {
+	var cc *mockControl
+	b := newLocalBackendWithTestControl(t, true, func(tb testing.TB, opts controlclient.Options) controlclient.Client {
+		cc = newClient(t, opts)
+		return cc
+	})
+
+	nw := newNotificationWatcher(t, b, &ipnauth.TestActor{})
+
+	t.Logf("Start")
+	nw.watch(0, []wantedNotification{
+		wantStateNotify(ipn.NeedsLogin)})
+	b.Start(ipn.Options{
+		UpdatePrefs: &ipn.Prefs{
+			WantRunning: true,
+			ControlURL:  "https://localhost:1/",
+		},
+	})
+	nw.check()
+
+	t.Logf("LoginFinished")
+	cc.persist.UserProfile.LoginName = "user1"
+	cc.persist.NodeID = "node1"
+
+	nw.watch(0, []wantedNotification{
+		wantStateNotify(ipn.Starting)})
+	cc.send(nil, "", true, &netmap.NetworkMap{
+		SelfNode: (&tailcfg.Node{MachineAuthorized: true}).View(),
+	})
+	nw.check()
+
+	nw.watch(0, []wantedNotification{
+		wantStateNotify(ipn.Running)})
+	b.setWgengineStatus(&wgengine.Status{AsOf: time.Now(), DERPs: 1}, nil)
+	nw.check()
+
+	t.Logf("Re-auth (StartLoginInteractive)")
+	b.StartLoginInteractive(t.Context())
+
+	var timeLock sync.RWMutex
+	timestamp := b.lastStatusTime
+
+	engineShutdown := make(chan struct{})
+	gotShutdown := sync.OnceFunc(func() {
+		t.Logf("engineShutdown")
+		engineShutdown <- struct{}{}
+	})
+
+	b.e.SetStatusCallback(func(s *wgengine.Status, err error) {
+		timeLock.Lock()
+		if s.AsOf.After(timestamp) {
+			timestamp = s.AsOf
+		}
+		timeLock.Unlock()
+
+		if err != nil || (s.DERPs == 0 && len(s.Peers) == 0) {
+			gotShutdown()
+		} else {
+			b.setWgengineStatus(s, err)
+		}
+	})
+
+	t.Logf("Re-auth (receive URL)")
+	url1 := "https://localhost:1/1"
+
+	done := make(chan struct{})
+	var wg sync.WaitGroup
+
+	wg.Go(func() {
+		t.Log("cc.send starting")
+		cc.send(nil, url1, false, nil) // will block until engine stops
+		t.Log("cc.send returned")
+	})
+
+	<-engineShutdown // will get called once cc.send is blocked
+	gotShutdown = sync.OnceFunc(func() {
+		t.Logf("engineShutdown")
+		engineShutdown <- struct{}{}
+	})
+
+	wg.Go(func() {
+		t.Log("StartLoginInteractive starting")
+		b.StartLoginInteractive(t.Context()) // will also block until engine stops
+		t.Log("StartLoginInteractive returned")
+	})
+
+	<-engineShutdown // will get called once StartLoginInteractive is blocked
+
+	st := controlclient.Status{}
+	st.SetStateForTest(controlclient.StateAuthenticated)
+	b.SetControlClientStatus(cc, st)
+
+	timeLock.RLock()
+	b.setWgengineStatus(&wgengine.Status{AsOf: timestamp}, nil)           // engine is down event finally arrives
+	b.setWgengineStatus(&wgengine.Status{AsOf: timestamp, DERPs: 1}, nil) // engine is back up
+	timeLock.RUnlock()
+
+	go func() {
+		wg.Wait()
+		done <- struct{}{}
+	}()
+
+	t.Log("waiting for .send and .StartLoginInteractive to return")
+
+	select {
+	case <-done:
+	case <-time.After(10 * time.Second):
+		t.Fatalf("timed out waiting")
+	}
+
+	t.Log("both returned")
+}
+
 func buildNetmapWithPeers(self tailcfg.NodeView, peers ...tailcfg.NodeView) *netmap.NetworkMap {
 	const (
 		firstAutoUserID = tailcfg.UserID(10000)