wgengine: lazily create peer wireguard configs more explicitly

Rather than treat big jumps in last-received-from activity as a signal
to possibly reconfigure the set of configured wireguard peers, just
track the set of peers that are currently excluded from the
configuration. Easier to reason about.
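
To make the difference concrete, here is a minimal sketch of the two
policies under hypothetical, simplified types (peerKey and the threshold
value are illustrative, not the engine's real definitions):

package sketch

import "time"

// Illustrative only; the real engine keys peers by tailcfg.DiscoKey
// and uses its own idle threshold.
const lazyPeerIdleThreshold = 5 * time.Minute

type peerKey string

type engine struct {
	recvActivityAt map[peerKey]time.Time
	trimmedDisco   map[peerKey]bool // peers currently excluded from the wireguard config
}

// noteActivityOld models the previous policy: consider reconfiguring
// when the last-received-from time jumps by a lot.
func (e *engine) noteActivityOld(k peerKey, now time.Time) (reconfig bool) {
	was := e.recvActivityAt[k]
	e.recvActivityAt[k] = now
	return was.IsZero() || now.Sub(was) > lazyPeerIdleThreshold/2
}

// noteActivityNew models the new policy: reconfigure exactly when a
// peer that is currently trimmed out of the config shows activity.
func (e *engine) noteActivityNew(k peerKey, now time.Time) (reconfig bool) {
	e.recvActivityAt[k] = now
	return e.trimmedDisco[k]
}

The new predicate depends only on current state rather than on comparing
timestamps, which is what makes it easier to reason about.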

Also adds a bit more logging.

This might fix an error we saw on a machine running a recent unstable
build:

2020-08-26 17:54:11.528033751 +0000 UTC: 8.6M/92.6M magicsock: [unexpected] lazy endpoint not created for [UcppE], d:42a770f678357249
2020-08-26 17:54:13.691305296 +0000 UTC: 8.7M/92.6M magicsock: DERP packet received from idle peer [UcppE]; created=false
2020-08-26 17:54:13.691383687 +0000 UTC: 8.7M/92.6M magicsock: DERP packet from unknown key: [UcppE]

If it does happen again, though, we'll have more logs.
Author: Brad Fitzpatrick
Date:   2020-08-26 12:20:09 -07:00
Commit: 756d6a72bd (parent 483141094c)

3 changed files with 30 additions and 28 deletions

wgengine/magicsock/magicsock.go:

@@ -1765,6 +1765,7 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netaddr.IPPort) bool {
 		needsRecvActivityCall = de.isFirstRecvActivityInAwhile()
 	}
 	if needsRecvActivityCall && c.noteRecvActivity != nil {
+		c.logf("magicsock: got disco message from idle peer, starting lazy conf for %v, %v", peerNode.Key.ShortString(), sender.ShortString())
 		// We can't hold Conn.mu while calling noteRecvActivity.
 		// noteRecvActivity acquires userspaceEngine.wgLock (and per our
 		// lock ordering rules: wgLock must come first), and also calls
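
The constraint in that comment is the classic lock-inversion hazard. A
minimal, hypothetical sketch (simplified types, not the actual magicsock
API) of why the callback must run only after Conn.mu is released:

package main

import (
	"fmt"
	"sync"
)

// engine stands in for userspaceEngine; per the ordering rule,
// wgLock must always be acquired before conn.mu.
type engine struct {
	wgLock sync.Mutex
}

func (e *engine) noteRecvActivity() {
	e.wgLock.Lock() // first lock in the ordering
	defer e.wgLock.Unlock()
	fmt.Println("lazily configuring peer under wgLock")
}

// conn stands in for magicsock.Conn.
type conn struct {
	mu     sync.Mutex
	onRecv func() // acquires engine.wgLock internally
}

func (c *conn) handlePacket() {
	c.mu.Lock()
	needsCall := c.onRecv != nil
	c.mu.Unlock() // drop mu first: calling onRecv while holding it
	// would take the two locks in the wrong order and risk deadlock

	if needsCall {
		c.onRecv() // safe: we no longer hold c.mu
	}
}

func main() {
	e := &engine{}
	c := &conn{onRecv: e.noteRecvActivity}
	c.handlePacket()
}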

wgengine/userspace.go:

@@ -105,7 +105,8 @@ type userspaceEngine struct {
 	lastEngineSigFull string // of full wireguard config
 	lastEngineSigTrim string // of trimmed wireguard config
 	recvActivityAt    map[tailcfg.DiscoKey]time.Time
-	sentActivityAt    map[packet.IP]*int64 // value is atomic int64 of unixtime
+	trimmedDisco      map[tailcfg.DiscoKey]bool // set of disco keys of peers currently excluded from wireguard config
+	sentActivityAt    map[packet.IP]*int64      // value is atomic int64 of unixtime
 	destIPActivityFuncs map[packet.IP]func()

 	mu sync.Mutex // guards following; see lock order comment below
@@ -636,9 +637,11 @@ func (e *userspaceEngine) noteReceiveActivity(dk tailcfg.DiscoKey) {
 	e.wgLock.Lock()
 	defer e.wgLock.Unlock()

-	was, ok := e.recvActivityAt[dk]
-	if !ok {
+	if _, ok := e.recvActivityAt[dk]; !ok {
 		// Not a trimmable peer we care about tracking. (See isTrimmablePeer)
+		if e.trimmedDisco[dk] {
+			e.logf("wgengine: [unexpected] noteReceiveActivity called on idle discokey %v that's not in recvActivityAt", dk.ShortString())
+		}
 		return
 	}
 	now := e.timeNow()
@@ -650,7 +653,8 @@ func (e *userspaceEngine) noteReceiveActivity(dk tailcfg.DiscoKey) {
 	// lazyPeerIdleThreshold without the divide by 2, but
 	// maybeReconfigWireguardLocked is cheap enough to call every
 	// couple minutes (just not on every packet).
-	if was.IsZero() || now.Sub(was) > lazyPeerIdleThreshold/2 {
+	if e.trimmedDisco[dk] {
+		e.logf("wgengine: idle peer %v now active, reconfiguring wireguard", dk.ShortString())
 		e.maybeReconfigWireguardLocked()
 	}
 }
@@ -718,6 +722,8 @@ func (e *userspaceEngine) maybeReconfigWireguardLocked() error {
 	trackDisco := make([]tailcfg.DiscoKey, 0, len(full.Peers))
 	trackIPs := make([]wgcfg.IP, 0, len(full.Peers))
+
+	trimmedDisco := map[tailcfg.DiscoKey]bool{} // TODO: don't re-alloc this map each time

 	for i := range full.Peers {
 		p := &full.Peers[i]
 		if !isTrimmablePeer(p, len(full.Peers)) {
@@ -730,6 +736,8 @@ func (e *userspaceEngine) maybeReconfigWireguardLocked() error {
 		trackDisco = append(trackDisco, dk)
 		trackIPs = append(trackIPs, tsIP)
 		if e.isActiveSince(dk, tsIP, activeCutoff) {
 			min.Peers = append(min.Peers, *p)
+		} else {
+			trimmedDisco[dk] = true
 		}
 	}
@@ -738,6 +746,8 @@ func (e *userspaceEngine) maybeReconfigWireguardLocked() error {
 		return nil
 	}

+	e.trimmedDisco = trimmedDisco
+
 	e.updateActivityMapsLocked(trackDisco, trackIPs)
 	e.logf("wgengine: Reconfig: configuring userspace wireguard config (with %d/%d peers)", len(min.Peers), len(full.Peers))

wgengine/userspace_test.go:

@@ -17,7 +17,6 @@
 func TestNoteReceiveActivity(t *testing.T) {
 	now := time.Unix(1, 0)
-	tick := func(d time.Duration) { now = now.Add(d) }
 	var logBuf bytes.Buffer

 	confc := make(chan bool, 1)
@@ -37,6 +36,7 @@ func TestNoteReceiveActivity(t *testing.T) {
 		},
 		tundev:                new(tstun.TUN),
 		testMaybeReconfigHook: func() { confc <- true },
+		trimmedDisco:          map[tailcfg.DiscoKey]bool{},
 	}
 	ra := e.recvActivityAt
@@ -51,7 +51,7 @@ func TestNoteReceiveActivity(t *testing.T) {
 		t.Fatalf("unexpected log write (and thus activity): %s", logBuf.Bytes())
 	}
-	// Now track it and expect updates.
+	// Now track it, but don't mark it trimmed, so shouldn't update.
 	ra[dk] = time.Time{}
 	e.noteReceiveActivity(dk)
 	if len(ra) != 1 {
@@ -60,29 +60,20 @@ func TestNoteReceiveActivity(t *testing.T) {
 	if got := ra[dk]; got != now {
 		t.Fatalf("time in map = %v; want %v", got, now)
 	}
+	if gotConf() {
+		t.Fatalf("unexpected reconfig")
+	}
+
+	// Now mark it trimmed and expect an update.
+	e.trimmedDisco[dk] = true
+	e.noteReceiveActivity(dk)
+	if len(ra) != 1 {
+		t.Fatalf("unexpected growth in map: now has %d keys; want 1", len(ra))
+	}
+	if got := ra[dk]; got != now {
+		t.Fatalf("time in map = %v; want %v", got, now)
+	}
 	if !gotConf() {
 		t.Fatalf("didn't get expected reconfig")
 	}
-
-	// With updates 1 second apart, don't expect a reconfig.
-	for i := 0; i < 300; i++ {
-		tick(time.Second)
-		e.noteReceiveActivity(dk)
-		if len(ra) != 1 {
-			t.Fatalf("map len = %d; want 1", len(ra))
-		}
-		if got := ra[dk]; got != now {
-			t.Fatalf("time in map = %v; want %v", got, now)
-		}
-		if gotConf() {
-			t.Fatalf("unexpected reconfig")
-		}
-	}
-
-	// But if there's a big jump it should get an update.
-	tick(3 * time.Minute)
-	e.noteReceiveActivity(dk)
-	if !gotConf() {
-		t.Fatalf("expected config")
-	}
 }