mirror of
https://github.com/tailscale/tailscale.git
synced 2024-11-25 19:15:34 +00:00
6d84f3409b
We now handle the case where the NetworkMap.SelfNode has already expired and do not return an expiry time in the past (which causes an ~infinite loop of timers to fire). Additionally, we now add an explicit check to ensure that the next expiry time is never before the current local-to-the-system time, to ensure that we don't end up in a similar situation due to clock skew. Finally, we add more tests for this logic to ensure that we don't regress on these edge cases. Fixes #7193 Change-Id: Iaf8e3d83be1d133a7aab7f8d62939e508cc53f9c Signed-off-by: Andrew Dunham <andrew@du.nham.ca>
215 lines
7.9 KiB
Go
215 lines
7.9 KiB
Go
// Copyright (c) Tailscale Inc & AUTHORS
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
package ipnlocal
|
|
|
|
import (
|
|
"time"
|
|
|
|
"tailscale.com/syncs"
|
|
"tailscale.com/tailcfg"
|
|
"tailscale.com/types/key"
|
|
"tailscale.com/types/logger"
|
|
"tailscale.com/types/netmap"
|
|
)
|
|
|
|
// flagExpiredPeersEpoch is a defense-in-depth lower bound on plausible control
// times. It is Unix time 1673373066 (2023-01-10 17:51:06 UTC), roughly when
// this code was written. If the delta-adjusted current time falls before it,
// the time received from control (or Headscale, etc.) is too far in the past
// to be trusted, and expiry processing is skipped.
var flagExpiredPeersEpoch = time.Unix(1673373066, 0)
|
|
|
|
// minClockDelta is the threshold for correcting clock skew. When the local
// clock and the time received from control differ by more than this, the
// difference is stored in the expiryManager and applied to later expiry
// calculations; smaller differences are treated as no skew.
const minClockDelta = time.Minute
|
|
|
|
// expiryManager tracks the state of expired nodes and the delta from the
|
|
// current clock time to the time returned from control, and allows mutating a
|
|
// netmap to mark peers as expired based on the current delta-adjusted time.
|
|
type expiryManager struct {
|
|
// previouslyExpired stores nodes that have already expired so we can
|
|
// only log on state transitions.
|
|
previouslyExpired map[tailcfg.StableNodeID]bool
|
|
|
|
// clockDelta stores the delta between the current time and the time
|
|
// received from control such that:
|
|
// time.Now().Add(clockDelta) == MapResponse.ControlTime
|
|
clockDelta syncs.AtomicValue[time.Duration]
|
|
|
|
logf logger.Logf
|
|
timeNow func() time.Time
|
|
}
|
|
|
|
func newExpiryManager(logf logger.Logf) *expiryManager {
|
|
return &expiryManager{
|
|
previouslyExpired: map[tailcfg.StableNodeID]bool{},
|
|
logf: logf,
|
|
timeNow: time.Now,
|
|
}
|
|
}
|
|
|
|
// onControlTime is called whenever we receive a new timestamp from the control
|
|
// server to store the delta.
|
|
func (em *expiryManager) onControlTime(t time.Time) {
|
|
localNow := em.timeNow()
|
|
delta := t.Sub(localNow)
|
|
if delta.Abs() > minClockDelta {
|
|
em.logf("[v1] netmap: flagExpiredPeers: setting clock delta to %v", delta)
|
|
em.clockDelta.Store(delta)
|
|
} else {
|
|
em.clockDelta.Store(0)
|
|
}
|
|
}
|
|
|
|
// flagExpiredPeers updates mapRes.Peers, mutating all peers that have expired,
|
|
// taking into account any clock skew detected by using the ControlTime field
|
|
// in the MapResponse. We don't actually remove expired peers from the Peers
|
|
// array; instead, we clear some fields of the Node object, and set
|
|
// Node.Expired so other parts of the codebase can provide more clear error
|
|
// messages when attempting to e.g. ping an expired node.
|
|
//
|
|
// The localNow time should be the output of time.Now for the local system; it
|
|
// will be adjusted by any stored clock skew from ControlTime.
|
|
//
|
|
// This is additionally a defense-in-depth against something going wrong with
|
|
// control such that we start seeing expired peers with a valid Endpoints or
|
|
// DERP field.
|
|
//
|
|
// This function is safe to call concurrently with onControlTime but not
|
|
// concurrently with any other call to flagExpiredPeers.
|
|
func (em *expiryManager) flagExpiredPeers(netmap *netmap.NetworkMap, localNow time.Time) {
|
|
// Adjust our current time by any saved delta to adjust for clock skew.
|
|
controlNow := localNow.Add(em.clockDelta.Load())
|
|
if controlNow.Before(flagExpiredPeersEpoch) {
|
|
em.logf("netmap: flagExpiredPeers: [unexpected] delta-adjusted current time is before hardcoded epoch; skipping")
|
|
return
|
|
}
|
|
|
|
for _, peer := range netmap.Peers {
|
|
// Nodes that don't expire have KeyExpiry set to the zero time;
|
|
// skip those and peers that are already marked as expired
|
|
// (e.g. from control).
|
|
if peer.KeyExpiry.IsZero() || peer.KeyExpiry.After(controlNow) {
|
|
delete(em.previouslyExpired, peer.StableID)
|
|
continue
|
|
} else if peer.Expired {
|
|
continue
|
|
}
|
|
|
|
if !em.previouslyExpired[peer.StableID] {
|
|
em.logf("[v1] netmap: flagExpiredPeers: clearing expired peer %v", peer.StableID)
|
|
em.previouslyExpired[peer.StableID] = true
|
|
}
|
|
|
|
// Actually mark the node as expired
|
|
peer.Expired = true
|
|
|
|
// Control clears the Endpoints and DERP fields of expired
|
|
// nodes; do so here as well. The Expired bool is the correct
|
|
// thing to set, but this replicates the previous behaviour.
|
|
//
|
|
// NOTE: this is insufficient to actually break connectivity,
|
|
// since we discover endpoints via DERP, and due to DERP return
|
|
// path optimization.
|
|
peer.Endpoints = nil
|
|
peer.DERP = ""
|
|
|
|
// Defense-in-depth: break the node's public key as well, in
|
|
// case something tries to communicate.
|
|
peer.Key = key.NodePublicWithBadOldPrefix(peer.Key)
|
|
}
|
|
}
|
|
|
|
// nextPeerExpiry returns the time that the next node in the netmap expires
|
|
// (including the self node), based on their KeyExpiry. It skips nodes that are
|
|
// already marked as Expired. If there are no nodes expiring in the future,
|
|
// then the zero Time will be returned.
|
|
//
|
|
// The localNow time should be the output of time.Now for the local system; it
|
|
// will be adjusted by any stored clock skew from ControlTime.
|
|
//
|
|
// This function is safe to call concurrently with other methods of this expiryManager.
|
|
func (em *expiryManager) nextPeerExpiry(nm *netmap.NetworkMap, localNow time.Time) time.Time {
|
|
if nm == nil {
|
|
return time.Time{}
|
|
}
|
|
|
|
controlNow := localNow.Add(em.clockDelta.Load())
|
|
if controlNow.Before(flagExpiredPeersEpoch) {
|
|
em.logf("netmap: nextPeerExpiry: [unexpected] delta-adjusted current time is before hardcoded epoch; skipping")
|
|
return time.Time{}
|
|
}
|
|
|
|
var nextExpiry time.Time // zero if none
|
|
for _, peer := range nm.Peers {
|
|
if peer.KeyExpiry.IsZero() {
|
|
continue // tagged node
|
|
} else if peer.Expired {
|
|
// Peer already expired; Expired is set by the
|
|
// flagExpiredPeers function, above.
|
|
continue
|
|
} else if peer.KeyExpiry.Before(controlNow) {
|
|
// This peer already expired, and peer.Expired
|
|
// isn't set for some reason. Skip this node.
|
|
continue
|
|
}
|
|
|
|
// nextExpiry being zero is a sentinel that we haven't yet set
|
|
// an expiry; otherwise, only update if this node's expiry is
|
|
// sooner than the currently-stored one (since we want the
|
|
// soonest-occuring expiry time).
|
|
if nextExpiry.IsZero() || peer.KeyExpiry.Before(nextExpiry) {
|
|
nextExpiry = peer.KeyExpiry
|
|
}
|
|
}
|
|
|
|
// Ensure that we also fire this timer if our own node key expires.
|
|
if nm.SelfNode != nil {
|
|
selfExpiry := nm.SelfNode.KeyExpiry
|
|
|
|
if selfExpiry.IsZero() {
|
|
// No expiry for self node
|
|
} else if selfExpiry.Before(controlNow) {
|
|
// Self node already expired; we don't want to return a
|
|
// time in the past, so skip this.
|
|
} else if nextExpiry.IsZero() || selfExpiry.Before(nextExpiry) {
|
|
// Self node expires after now, but before the soonest
|
|
// peer in the netmap; update our next expiry to this
|
|
// time.
|
|
nextExpiry = selfExpiry
|
|
}
|
|
}
|
|
|
|
// As an additional defense in depth, never return a time that is
|
|
// before the current time from the perspective of the local system
|
|
// (since timers with a zero or negative duration will fire
|
|
// immediately and can cause unnecessary reconfigurations).
|
|
//
|
|
// This can happen if the local clock is running fast; for example:
|
|
// localTime = 2pm
|
|
// controlTime = 1pm (real time)
|
|
// nextExpiry = 1:30pm (real time)
|
|
//
|
|
// In the above case, we'd return a nextExpiry of 1:30pm while the
|
|
// current clock reads 2pm; in this case, setting a timer for
|
|
// nextExpiry.Sub(now) would result in a negative duration and a timer
|
|
// that fired immediately.
|
|
//
|
|
// In this particular edge-case, return an expiry time 30 seconds after
|
|
// the local time so that any timers created based on this expiry won't
|
|
// fire too quickly.
|
|
//
|
|
// The alternative would be to do all comparisons in local time,
|
|
// unadjusted for clock skew, but that doesn't handle cases where the
|
|
// local clock is "fixed" between netmap updates.
|
|
if !nextExpiry.IsZero() && nextExpiry.Before(localNow) {
|
|
em.logf("netmap: nextPeerExpiry: skipping nextExpiry %q before local time %q due to clock skew",
|
|
nextExpiry.UTC().Format(time.RFC3339),
|
|
localNow.UTC().Format(time.RFC3339))
|
|
return localNow.Add(30 * time.Second)
|
|
}
|
|
|
|
return nextExpiry
|
|
}
|