wgengine/magicsock: fix lazy config deadlock, document more lock ordering

This removes the atomic bool that tried to track whether we needed to acquire
the lock on a future recursive call back into magicsock. Unfortunately that
hack doesn't work because we also had a lock ordering issue between magicsock
and userspaceEngine (see issue). This documents that too.

Fixes #644
This commit is contained in:
Brad Fitzpatrick 2020-08-06 08:43:48 -07:00
parent 43bc86588e
commit cff737786e

View File

@ -128,13 +128,6 @@ type Conn struct {
mu sync.Mutex // guards all following fields; see userspaceEngine lock ordering rules mu sync.Mutex // guards all following fields; see userspaceEngine lock ordering rules
muCond *sync.Cond muCond *sync.Cond
// canCreateEPUnlocked tracks at one place whether mu is
// already held. It's then checked in CreateEndpoint to avoid
// double-locking mu and thus deadlocking. mu should be held
// while setting this; but can be read without mu held.
// TODO(bradfitz): delete this shameful hack; refactor the one use
canCreateEPUnlocked syncs.AtomicBool
started bool // Start was called started bool // Start was called
closed bool // Close was called closed bool // Close was called
@ -289,8 +282,10 @@ type Options struct {
// sole user just doesn't need or want it called on every // sole user just doesn't need or want it called on every
// packet, just every minute or two for Wireguard timeouts, // packet, just every minute or two for Wireguard timeouts,
// and 10 seconds seems like a good trade-off between often // and 10 seconds seems like a good trade-off between often
// enough and not too often.) The provided func likely calls // enough and not too often.) The provided func is called while
// Conn.CreateEndpoint, which acquires Conn.mu. // holding userspaceEngine.wgLock and likely calls
// Conn.CreateEndpoint, which acquires Conn.mu. As such, you should
// not hold
NoteRecvActivity func(tailcfg.DiscoKey) NoteRecvActivity func(tailcfg.DiscoKey)
} }
@ -1633,16 +1628,26 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netaddr.IPPort) bool {
c.logf("magicsock: [unexpected] have node without endpoint, without c.noteRecvActivity hook") c.logf("magicsock: [unexpected] have node without endpoint, without c.noteRecvActivity hook")
return false return false
} }
// noteRecvActivity calls back into CreateEndpoint, which we can't easily control, // We can't hold Conn.mu while calling noteRecvActivity.
// and CreateEndpoint expects to be called with c.mu held, but we hold it here, and // noteRecvActivity acquires userspaceEngine.wgLock (and per our
// it's too invasive for now to release it here and recheck invariants. So instead, // lock ordering rules: wgLock must come first), and also calls
// use this unfortunate hack: set canCreateEPUnlocked which CreateEndpoint then // back into our Conn.CreateEndpoint, which would double-acquire
// checks to conditionally acquire the mutex. I'm so sorry. // Conn.mu.
c.canCreateEPUnlocked.Set(true) c.mu.Unlock()
c.noteRecvActivity(sender) c.noteRecvActivity(sender)
c.canCreateEPUnlocked.Set(false) c.mu.Lock() // re-acquire
// Now, recheck invariants that might've changed while we'd
// released the lock, which isn't much:
if c.closed || c.privateKey.IsZero() {
return true
}
de, ok = c.endpointOfDisco[sender] de, ok = c.endpointOfDisco[sender]
if !ok { if !ok {
if _, ok := c.nodeOfDisco[sender]; !ok {
// They just disappeared while we'd released the lock.
return false
}
c.logf("magicsock: [unexpected] lazy endpoint not created for %v, %v", peerNode.Key.ShortString(), sender.ShortString()) c.logf("magicsock: [unexpected] lazy endpoint not created for %v, %v", peerNode.Key.ShortString(), sender.ShortString())
return false return false
} }
@ -2650,14 +2655,12 @@ func (c *Conn) CreateBind(uint16) (conn.Bind, uint16, error) {
// //
func (c *Conn) CreateEndpoint(pubKey [32]byte, addrs string) (conn.Endpoint, error) { func (c *Conn) CreateEndpoint(pubKey [32]byte, addrs string) (conn.Endpoint, error) {
c.mu.Lock()
defer c.mu.Unlock()
pk := key.Public(pubKey) pk := key.Public(pubKey)
c.logf("magicsock: CreateEndpoint: key=%s: %s", pk.ShortString(), derpStr(addrs)) c.logf("magicsock: CreateEndpoint: key=%s: %s", pk.ShortString(), derpStr(addrs))
if !c.canCreateEPUnlocked.Get() { // sorry
c.mu.Lock()
defer c.mu.Unlock()
}
if strings.HasSuffix(addrs, controlclient.EndpointDiscoSuffix) { if strings.HasSuffix(addrs, controlclient.EndpointDiscoSuffix) {
discoHex := strings.TrimSuffix(addrs, controlclient.EndpointDiscoSuffix) discoHex := strings.TrimSuffix(addrs, controlclient.EndpointDiscoSuffix)
discoKey, err := key.NewPublicFromHexMem(mem.S(discoHex)) discoKey, err := key.NewPublicFromHexMem(mem.S(discoHex))