wgengine/wgcfg: recover from mismatched PublicKey/Endpoints

In rare circumstances (tailscale/corp#3016), the PublicKey
and Endpoints can diverge.

This by itself doesn't cause any harm, but our early exit
in response did, because it prevented us from recovering from it.

Remove the early exit.

Signed-off-by: Josh Bleecher Snyder <josh@tailscale.com>
This commit is contained in:
Josh Bleecher Snyder 2021-11-10 18:42:16 -08:00 committed by Josh Bleecher Snyder
parent 9fc4e876e3
commit 85184a58ed
7 changed files with 27 additions and 9 deletions

View File

@ -57,4 +57,5 @@ func (src *Peer) Clone() *Peer {
DiscoKey key.DiscoPublic
AllowedIPs []netaddr.IPPrefix
PersistentKeepalive uint16
WGEndpoint key.NodePublic
}{})

View File

@ -28,6 +28,11 @@ type Peer struct {
DiscoKey key.DiscoPublic // present only so we can handle restarts within wgengine, not passed to WireGuard
AllowedIPs []netaddr.IPPrefix
PersistentKeepalive uint16
// wireguard-go's endpoint for this peer. It should always equal Peer.PublicKey.
// We represent it explicitly so that we can detect if they diverge and recover.
// There is no need to set WGEndpoint explicitly when constructing a Peer by hand.
// It is only populated when reading Peers from wireguard-go.
WGEndpoint key.NodePublic
}
// PeerWithKey returns the Peer with key k and reports whether it was found.

View File

@ -62,7 +62,7 @@ func ReconfigDevice(d *device.Device, cfg *Config, logf logger.Logf) (err error)
r.Close()
}()
toErr := cfg.ToUAPI(w, prev)
toErr := cfg.ToUAPI(logf, w, prev)
w.Close()
setErr := <-errc
return multierr.New(setErr, toErr)

View File

@ -68,14 +68,14 @@ func TestDeviceConfig(t *testing.T) {
}
prev := new(Config)
gotbuf := new(strings.Builder)
err = got.ToUAPI(gotbuf, prev)
err = got.ToUAPI(t.Logf, gotbuf, prev)
gotStr := gotbuf.String()
if err != nil {
t.Errorf("got.ToUAPI(): error: %v", err)
return
}
wantbuf := new(strings.Builder)
err = want.ToUAPI(wantbuf, prev)
err = want.ToUAPI(t.Logf, wantbuf, prev)
wantStr := wantbuf.String()
if err != nil {
t.Errorf("want.ToUAPI(): error: %v", err)

View File

@ -151,9 +151,11 @@ func (cfg *Config) handlePeerLine(peer *Peer, k, value mem.RO, valueBytes []byte
if err != nil {
return fmt.Errorf("invalid endpoint %q for peer %q, expected a hex public key", value.StringCopy(), peer.PublicKey.ShortString())
}
if nk != peer.PublicKey {
return fmt.Errorf("unexpected endpoint %q for peer %q, expected the peer's public key", value.StringCopy(), peer.PublicKey.ShortString())
}
// nk ought to equal peer.PublicKey.
// Under some rare circumstances, it might not. See corp issue #3016.
// Even if that happens, don't stop early, so that we can recover from it.
// Instead, note the value of nk so we can fix as needed.
peer.WGEndpoint = nk
case k.EqualString("persistent_keepalive_interval"):
n, err := mem.ParseUint(value, 10, 16)
if err != nil {

View File

@ -80,7 +80,7 @@ func BenchmarkFromUAPI(b *testing.B) {
buf := new(bytes.Buffer)
w := bufio.NewWriter(buf)
if err := cfg1.ToUAPI(w, &Config{}); err != nil {
if err := cfg1.ToUAPI(b.Logf, w, &Config{}); err != nil {
b.Fatal(err)
}
w.Flush()

View File

@ -11,13 +11,14 @@
"inet.af/netaddr"
"tailscale.com/types/key"
"tailscale.com/types/logger"
)
// ToUAPI writes cfg in UAPI format to w.
// Prev is the previous device Config.
// Prev is required so that we can remove now-defunct peers
// without having to remove and re-add all peers.
func (cfg *Config) ToUAPI(w io.Writer, prev *Config) error {
func (cfg *Config) ToUAPI(logf logger.Logf, w io.Writer, prev *Config) error {
var stickyErr error
set := func(key, value string) {
if stickyErr != nil {
@ -54,7 +55,16 @@ func (cfg *Config) ToUAPI(w io.Writer, prev *Config) error {
// Avoid setting endpoints if the correct one is already known
// to WireGuard, because doing so generates a bit more work in
// calling magicsock's ParseEndpoint for effectively a no-op.
if !wasPresent {
if oldPeer.WGEndpoint != p.PublicKey {
if wasPresent {
// We had an endpoint, and it was wrong.
// By construction, this should not happen.
// If it does, keep going so that we can recover from it,
// but log so that we know about it,
// because it is an indicator of other failed invariants.
// See corp issue 3016.
logf("[unexpected] endpoint changed from %s to %s", oldPeer.WGEndpoint, p.PublicKey)
}
set("endpoint", p.PublicKey.UntypedHexString())
}