ipn/ipnlocal: fix deadlock in resetControlClientLocked

resetControlClientLocked is called while b.mu was held and
would call cc.Shutdown which would wait for the observer queue
to drain.
However, there may be active callbacks from cc already waiting for
b.mu resulting in a deadlock.

This makes it so that resetControlClientLocked does not call
Shutdown, and instead just returns the value.
It also makes it so that any status received from previous cc
are ignored.

Updates tailscale/corp#12827

Signed-off-by: Maisem Ali <maisem@tailscale.com>
This commit is contained in:
Maisem Ali 2023-09-02 12:04:03 -07:00 committed by Maisem Ali
parent c6fadd6d71
commit d06a75dcd0
6 changed files with 53 additions and 28 deletions

View File

@ -595,7 +595,7 @@ func (c *Auto) sendStatus(who string, err error, url string, nm *netmap.NetworkM
// Launch a new goroutine to avoid blocking the caller while the observer
// does its thing, which may result in a call back into the client.
c.observerQueue.Add(func() {
c.observer.SetControlClientStatus(new)
c.observer.SetControlClientStatus(c, new)
})
}
@ -667,6 +667,7 @@ func (c *Auto) Shutdown() {
direct := c.direct
if !closed {
c.closed = true
c.observerQueue.shutdown()
c.cancelAuthCtxLocked()
c.cancelMapCtxLocked()
for _, w := range c.unpauseWaiters {

View File

@ -25,6 +25,9 @@
// Client represents a client connection to the control server.
// Currently this is done through a pair of polling https requests in
// the Auto client, but that might change eventually.
//
// The Client must be comparable as it is used by the Observer to detect stale
// clients.
type Client interface {
// Shutdown closes this session, which should not be used any further
// afterwards.

View File

@ -104,7 +104,11 @@ type Direct struct {
// Observer is implemented by users of the control client (such as LocalBackend)
// to get notified of changes in the control client's status.
type Observer interface {
SetControlClientStatus(Status)
// SetControlClientStatus is called when the client has a new status to
// report. The Client is provided to allow the Observer to track which
// Client is reporting the status, allowing it to ignore stale status
// reports from previous Clients.
SetControlClientStatus(Client, Status)
}
type Options struct {

View File

@ -897,9 +897,16 @@ func (b *LocalBackend) SetDecompressor(fn func() (controlclient.Decompressor, er
// SetControlClientStatus is the callback invoked by the control client whenever it posts a new status.
// Among other things, this is where we update the netmap, packet filters, DNS and DERP maps.
func (b *LocalBackend) SetControlClientStatus(st controlclient.Status) {
func (b *LocalBackend) SetControlClientStatus(c controlclient.Client, st controlclient.Status) {
b.mu.Lock()
if b.cc != c {
b.logf("Ignoring SetControlClientStatus from old client")
b.mu.Unlock()
return
}
// The following do not depend on any data for which we need to lock b.
if st.Err != nil {
b.mu.Unlock()
if errors.Is(st.Err, io.EOF) {
b.logf("[v1] Received error: EOF")
return
@ -916,8 +923,6 @@ func (b *LocalBackend) SetControlClientStatus(st controlclient.Status) {
// Track the number of calls
currCall := b.numClientStatusCalls.Add(1)
b.mu.Lock()
// Handle node expiry in the netmap
if st.NetMap != nil {
now := b.clock.Now()
@ -953,7 +958,7 @@ func (b *LocalBackend) SetControlClientStatus(st controlclient.Status) {
// Call ourselves with the current status again; the logic in
// setClientStatus will take care of updating the expired field
// of peers in the netmap.
b.SetControlClientStatus(st)
b.SetControlClientStatus(c, st)
})
}
}
@ -1340,16 +1345,14 @@ func (b *LocalBackend) Start(opts ipn.Options) error {
hostinfo.Userspace.Set(b.sys.IsNetstack())
hostinfo.UserspaceRouter.Set(b.sys.IsNetstackRouter())
if b.cc != nil {
// TODO(apenwarr): avoid the need to reinit controlclient.
// This will trigger a full relogin/reconfigure cycle every
// time a Handle reconnects to the backend. Ideally, we
// would send the new Prefs and everything would get back
// into sync with the minimal changes. But that's not how it
// is right now, which is a sign that the code is still too
// complicated.
b.resetControlClientLockedAsync()
}
// TODO(apenwarr): avoid the need to reinit controlclient.
// This will trigger a full relogin/reconfigure cycle every
// time a Handle reconnects to the backend. Ideally, we
// would send the new Prefs and everything would get back
// into sync with the minimal changes. But that's not how it
// is right now, which is a sign that the code is still too
// complicated.
prevCC := b.resetControlClientLocked()
httpTestClient := b.httpTestClient
if b.hostinfo != nil {
@ -1429,6 +1432,10 @@ func (b *LocalBackend) Start(opts ipn.Options) error {
debugFlags = append([]string{"netstack"}, debugFlags...)
}
if prevCC != nil {
prevCC.Shutdown()
}
// TODO(apenwarr): The only way to change the ServerURL is to
// re-run b.Start(), because this is the only place we create a
// new controlclient. SetPrefs() allows you to overwrite ServerURL,
@ -3847,12 +3854,12 @@ func (b *LocalBackend) requestEngineStatusAndWait() {
b.statusLock.Unlock()
}
// resetControlClientLockedAsync sets b.cc to nil, and starts a
// goroutine to Shutdown the old client. It does not wait for the
// shutdown to complete.
func (b *LocalBackend) resetControlClientLockedAsync() {
// resetControlClientLocked sets b.cc to nil and returns the old value. If the
// returned value is non-nil, the caller must call Shutdown on it after
// releasing b.mu.
func (b *LocalBackend) resetControlClientLocked() controlclient.Client {
if b.cc == nil {
return
return nil
}
// When we clear the control client, stop any outstanding netmap expiry
@ -3868,9 +3875,10 @@ func (b *LocalBackend) resetControlClientLockedAsync() {
// will abort.
b.numClientStatusCalls.Add(1)
}
b.cc.Shutdown()
prev := b.cc
b.cc = nil
b.ccAuto = nil
return prev
}
// ResetForClientDisconnect resets the backend for GUI clients running
@ -3881,10 +3889,16 @@ func (b *LocalBackend) resetControlClientLockedAsync() {
// when they restart the GUI.
func (b *LocalBackend) ResetForClientDisconnect() {
defer b.enterState(ipn.Stopped)
b.mu.Lock()
defer b.mu.Unlock()
b.logf("LocalBackend.ResetForClientDisconnect")
b.resetControlClientLockedAsync()
b.mu.Lock()
prevCC := b.resetControlClientLocked()
if prevCC != nil {
// Needs to happen without b.mu held.
defer prevCC.Shutdown()
}
defer b.mu.Unlock()
b.setNetMapLocked(nil)
b.pm.Reset()
b.keyExpired = false
@ -4997,7 +5011,10 @@ func (b *LocalBackend) ListProfiles() []ipn.LoginProfile {
// called to register it as new node.
func (b *LocalBackend) ResetAuth() error {
b.mu.Lock()
b.resetControlClientLockedAsync()
prevCC := b.resetControlClientLocked()
if prevCC != nil {
defer prevCC.Shutdown() // call must happen after release b.mu
}
if err := b.clearMachineKeyLocked(); err != nil {
b.mu.Unlock()
return err

View File

@ -31,7 +31,7 @@
type observerFunc func(controlclient.Status)
func (f observerFunc) SetControlClientStatus(s controlclient.Status) {
func (f observerFunc) SetControlClientStatus(_ controlclient.Client, s controlclient.Status) {
f(s)
}

View File

@ -173,7 +173,7 @@ func (cc *mockControl) send(err error, url string, loginFinished bool, nm *netma
} else if url == "" && err == nil && nm == nil {
s.SetStateForTest(controlclient.StateNotAuthenticated)
}
cc.opts.Observer.SetControlClientStatus(s)
cc.opts.Observer.SetControlClientStatus(cc, s)
}
}