wgengine/magicsock: rebind on EPIPE/ECONNRESET

Observed in the wild some macOS machines gain broken sockets coming out
of sleep (we observe "time jumped", followed by EPIPE on sendto). The
cause of this in the platform is unclear, but the fix is clear: always
rebind if the socket is broken. This can also be created artificially on
Linux via `ss -K`, and other conditions or software on a system could
also lead to the same outcomes.

Updates tailscale/corp#25648

Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
James Tucker 2025-01-06 12:32:13 -08:00 committed by James Tucker
parent 6e45a8304e
commit f4f57b815b
3 changed files with 61 additions and 29 deletions

View File

@ -21,7 +21,6 @@ import (
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/tailscale/wireguard-go/conn"
@ -1290,34 +1289,6 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte, isDisco bool) (sent bool, e
return
}
// maybeRebindOnError performs a rebind and restun if the error is defined and
// any conditionals are met.
func (c *Conn) maybeRebindOnError(os string, err error) bool {
switch {
case errors.Is(err, syscall.EPERM):
why := "operation-not-permitted-rebind"
switch os {
// We currently will only rebind and restun on a syscall.EPERM if it is experienced
// on a client running darwin.
// TODO(charlotte, raggi): expand os options if required.
case "darwin":
// TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
// EPERMs.
if c.lastEPERMRebind.Load().Before(time.Now().Add(-5 * time.Second)) {
c.logf("magicsock: performing %q", why)
c.lastEPERMRebind.Store(time.Now())
c.Rebind()
go c.ReSTUN(why)
return true
}
default:
c.logf("magicsock: not performing %q", why)
return false
}
}
return false
}
// sendUDPNetcheck sends b via UDP to addr. It is used exclusively by netcheck.
// It returns the number of bytes sent along with any error encountered. It
// returns errors.ErrUnsupported if the client is explicitly configured to only

View File

@ -0,0 +1,49 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
//go:build !plan9
package magicsock
import (
"errors"
"syscall"
"time"
)
// maybeRebindOnError performs a rebind and restun if the error is defined and
// any conditionals are met.
func (c *Conn) maybeRebindOnError(os string, err error) bool {
switch {
case errors.Is(err, syscall.EPIPE) || errors.Is(err, syscall.ENOTCONN):
// EPIPE/ENOTCONN are common errors when a send fails due to a closed
// socket. There is some platform and version inconsistency in which
// error is returned, but the meaning is the same.
why := "broken-pipe-rebind"
c.logf("magicsock: performing %q", why)
c.Rebind()
go c.ReSTUN(why)
return true
case errors.Is(err, syscall.EPERM):
why := "operation-not-permitted-rebind"
switch os {
// We currently will only rebind and restun on a syscall.EPERM if it is experienced
// on a client running darwin.
// TODO(charlotte, raggi): expand os options if required.
case "darwin":
// TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
// EPERMs.
if c.lastEPERMRebind.Load().Before(time.Now().Add(-5 * time.Second)) {
c.logf("magicsock: performing %q", why)
c.lastEPERMRebind.Store(time.Now())
c.Rebind()
go c.ReSTUN(why)
return true
}
default:
c.logf("magicsock: not performing %q", why)
return false
}
}
return false
}

View File

@ -0,0 +1,12 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
//go:build plan9
package magicsock
// maybeRebindOnError performs a rebind and restun if the error is defined and
// any conditionals are met.
func (c *Conn) maybeRebindOnError(os string, err error) bool {
return false
}