mirror of
https://github.com/tailscale/tailscale.git
synced 2025-04-22 08:51:41 +00:00
wgengine/magicsock: fix lazy config deadlock, document more lock ordering
This removes the atomic bool that tried to track whether we needed to acquire the lock on a future recursive call back into magicsock. Unfortunately that hack doesn't work because we also had a lock ordering issue between magicsock and userspaceEngine (see issue). This documents that too. Fixes #644
This commit is contained in:
parent
43bc86588e
commit
cff737786e
@ -128,13 +128,6 @@ type Conn struct {
|
|||||||
mu sync.Mutex // guards all following fields; see userspaceEngine lock ordering rules
|
mu sync.Mutex // guards all following fields; see userspaceEngine lock ordering rules
|
||||||
muCond *sync.Cond
|
muCond *sync.Cond
|
||||||
|
|
||||||
// canCreateEPUnlocked tracks at one place whether mu is
|
|
||||||
// already held. It's then checked in CreateEndpoint to avoid
|
|
||||||
// double-locking mu and thus deadlocking. mu should be held
|
|
||||||
// while setting this; but can be read without mu held.
|
|
||||||
// TODO(bradfitz): delete this shameful hack; refactor the one use
|
|
||||||
canCreateEPUnlocked syncs.AtomicBool
|
|
||||||
|
|
||||||
started bool // Start was called
|
started bool // Start was called
|
||||||
closed bool // Close was called
|
closed bool // Close was called
|
||||||
|
|
||||||
@ -289,8 +282,10 @@ type Options struct {
|
|||||||
// sole user just doesn't need or want it called on every
|
// sole user just doesn't need or want it called on every
|
||||||
// packet, just every minute or two for Wireguard timeouts,
|
// packet, just every minute or two for Wireguard timeouts,
|
||||||
// and 10 seconds seems like a good trade-off between often
|
// and 10 seconds seems like a good trade-off between often
|
||||||
// enough and not too often.) The provided func likely calls
|
// enough and not too often.) The provided func is called while
|
||||||
// Conn.CreateEndpoint, which acquires Conn.mu.
|
// holding userspaceEngine.wgLock and likely calls
|
||||||
|
// Conn.CreateEndpoint, which acquires Conn.mu. As such, you should
|
||||||
|
// not hold
|
||||||
NoteRecvActivity func(tailcfg.DiscoKey)
|
NoteRecvActivity func(tailcfg.DiscoKey)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1633,16 +1628,26 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netaddr.IPPort) bool {
|
|||||||
c.logf("magicsock: [unexpected] have node without endpoint, without c.noteRecvActivity hook")
|
c.logf("magicsock: [unexpected] have node without endpoint, without c.noteRecvActivity hook")
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
// noteRecvActivity calls back into CreateEndpoint, which we can't easily control,
|
// We can't hold Conn.mu while calling noteRecvActivity.
|
||||||
// and CreateEndpoint expects to be called with c.mu held, but we hold it here, and
|
// noteRecvActivity acquires userspaceEngine.wgLock (and per our
|
||||||
// it's too invasive for now to release it here and recheck invariants. So instead,
|
// lock ordering rules: wgLock must come first), and also calls
|
||||||
// use this unfortunate hack: set canCreateEPUnlocked which CreateEndpoint then
|
// back into our Conn.CreateEndpoint, which would double-acquire
|
||||||
// checks to conditionally acquire the mutex. I'm so sorry.
|
// Conn.mu.
|
||||||
c.canCreateEPUnlocked.Set(true)
|
c.mu.Unlock()
|
||||||
c.noteRecvActivity(sender)
|
c.noteRecvActivity(sender)
|
||||||
c.canCreateEPUnlocked.Set(false)
|
c.mu.Lock() // re-acquire
|
||||||
|
|
||||||
|
// Now, recheck invariants that might've changed while we'd
|
||||||
|
// released the lock, which isn't much:
|
||||||
|
if c.closed || c.privateKey.IsZero() {
|
||||||
|
return true
|
||||||
|
}
|
||||||
de, ok = c.endpointOfDisco[sender]
|
de, ok = c.endpointOfDisco[sender]
|
||||||
if !ok {
|
if !ok {
|
||||||
|
if _, ok := c.nodeOfDisco[sender]; !ok {
|
||||||
|
// They just disappeared while we'd released the lock.
|
||||||
|
return false
|
||||||
|
}
|
||||||
c.logf("magicsock: [unexpected] lazy endpoint not created for %v, %v", peerNode.Key.ShortString(), sender.ShortString())
|
c.logf("magicsock: [unexpected] lazy endpoint not created for %v, %v", peerNode.Key.ShortString(), sender.ShortString())
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
@ -2650,14 +2655,12 @@ func (c *Conn) CreateBind(uint16) (conn.Bind, uint16, error) {
|
|||||||
//
|
//
|
||||||
|
|
||||||
func (c *Conn) CreateEndpoint(pubKey [32]byte, addrs string) (conn.Endpoint, error) {
|
func (c *Conn) CreateEndpoint(pubKey [32]byte, addrs string) (conn.Endpoint, error) {
|
||||||
|
c.mu.Lock()
|
||||||
|
defer c.mu.Unlock()
|
||||||
|
|
||||||
pk := key.Public(pubKey)
|
pk := key.Public(pubKey)
|
||||||
c.logf("magicsock: CreateEndpoint: key=%s: %s", pk.ShortString(), derpStr(addrs))
|
c.logf("magicsock: CreateEndpoint: key=%s: %s", pk.ShortString(), derpStr(addrs))
|
||||||
|
|
||||||
if !c.canCreateEPUnlocked.Get() { // sorry
|
|
||||||
c.mu.Lock()
|
|
||||||
defer c.mu.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
if strings.HasSuffix(addrs, controlclient.EndpointDiscoSuffix) {
|
if strings.HasSuffix(addrs, controlclient.EndpointDiscoSuffix) {
|
||||||
discoHex := strings.TrimSuffix(addrs, controlclient.EndpointDiscoSuffix)
|
discoHex := strings.TrimSuffix(addrs, controlclient.EndpointDiscoSuffix)
|
||||||
discoKey, err := key.NewPublicFromHexMem(mem.S(discoHex))
|
discoKey, err := key.NewPublicFromHexMem(mem.S(discoHex))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user