From f4f57b815bf9804badf449e91a42ff80a08ea59d Mon Sep 17 00:00:00 2001
From: James Tucker
Date: Mon, 6 Jan 2025 12:32:13 -0800
Subject: [PATCH] wgengine/magicsock: rebind on EPIPE/ENOTCONN

Observed in the wild some macOS machines gain broken sockets coming out
of sleep (we observe "time jumped", followed by EPIPE on sendto). The
cause of this in the platform is unclear, but the fix is clear: always
rebind if the socket is broken. This can also be created artificially on
Linux via `ss -K`, and other conditions or software on a system could
also lead to the same outcomes.

Updates tailscale/corp#25648

Signed-off-by: James Tucker
---
Review note (not part of the commit message): the new EPIPE/ENOTCONN
branch rebinds unconditionally, whereas the darwin EPERM branch only
rebinds when lastEPERMRebind is more than 5 seconds old. Please confirm
that repeated send failures on a broken socket cannot trigger
back-to-back Rebind/ReSTUN calls.

 wgengine/magicsock/magicsock.go          | 29 --------------
 wgengine/magicsock/magicsock_notplan9.go | 49 ++++++++++++++++++++++++
 wgengine/magicsock/magicsock_plan9.go    | 12 ++++++
 3 files changed, 61 insertions(+), 29 deletions(-)
 create mode 100644 wgengine/magicsock/magicsock_notplan9.go
 create mode 100644 wgengine/magicsock/magicsock_plan9.go

diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go
index bff905caa..188933c0e 100644
--- a/wgengine/magicsock/magicsock.go
+++ b/wgengine/magicsock/magicsock.go
@@ -21,7 +21,6 @@ import (
 	"strings"
 	"sync"
 	"sync/atomic"
-	"syscall"
 	"time"
 
 	"github.com/tailscale/wireguard-go/conn"
@@ -1290,34 +1289,6 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte, isDisco bool) (sent bool, e
 	return
 }
 
-// maybeRebindOnError performs a rebind and restun if the error is defined and
-// any conditionals are met.
-func (c *Conn) maybeRebindOnError(os string, err error) bool {
-	switch {
-	case errors.Is(err, syscall.EPERM):
-		why := "operation-not-permitted-rebind"
-		switch os {
-		// We currently will only rebind and restun on a syscall.EPERM if it is experienced
-		// on a client running darwin.
-		// TODO(charlotte, raggi): expand os options if required.
-		case "darwin":
-			// TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
-			// EPERMs.
-			if c.lastEPERMRebind.Load().Before(time.Now().Add(-5 * time.Second)) {
-				c.logf("magicsock: performing %q", why)
-				c.lastEPERMRebind.Store(time.Now())
-				c.Rebind()
-				go c.ReSTUN(why)
-				return true
-			}
-		default:
-			c.logf("magicsock: not performing %q", why)
-			return false
-		}
-	}
-	return false
-}
-
 // sendUDPNetcheck sends b via UDP to addr. It is used exclusively by netcheck.
 // It returns the number of bytes sent along with any error encountered. It
 // returns errors.ErrUnsupported if the client is explicitly configured to only
diff --git a/wgengine/magicsock/magicsock_notplan9.go b/wgengine/magicsock/magicsock_notplan9.go
new file mode 100644
index 000000000..44f08cb1c
--- /dev/null
+++ b/wgengine/magicsock/magicsock_notplan9.go
@@ -0,0 +1,49 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build !plan9
+
+package magicsock
+
+import (
+	"errors"
+	"syscall"
+	"time"
+)
+
+// maybeRebindOnError performs a rebind and restun if the error is defined and
+// any conditionals are met.
+func (c *Conn) maybeRebindOnError(os string, err error) bool {
+	switch {
+	case errors.Is(err, syscall.EPIPE) || errors.Is(err, syscall.ENOTCONN):
+		// EPIPE/ENOTCONN are common errors when a send fails due to a closed
+		// socket. There is some platform and version inconsistency in which
+		// error is returned, but the meaning is the same.
+		why := "broken-pipe-rebind"
+		c.logf("magicsock: performing %q", why)
+		c.Rebind()
+		go c.ReSTUN(why)
+		return true
+	case errors.Is(err, syscall.EPERM):
+		why := "operation-not-permitted-rebind"
+		switch os {
+		// We currently will only rebind and restun on a syscall.EPERM if it is experienced
+		// on a client running darwin.
+		// TODO(charlotte, raggi): expand os options if required.
+		case "darwin":
+			// TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
+			// EPERMs.
+			if c.lastEPERMRebind.Load().Before(time.Now().Add(-5 * time.Second)) {
+				c.logf("magicsock: performing %q", why)
+				c.lastEPERMRebind.Store(time.Now())
+				c.Rebind()
+				go c.ReSTUN(why)
+				return true
+			}
+		default:
+			c.logf("magicsock: not performing %q", why)
+			return false
+		}
+	}
+	return false
+}
diff --git a/wgengine/magicsock/magicsock_plan9.go b/wgengine/magicsock/magicsock_plan9.go
new file mode 100644
index 000000000..23f710430
--- /dev/null
+++ b/wgengine/magicsock/magicsock_plan9.go
@@ -0,0 +1,12 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build plan9
+
+package magicsock
+
+// maybeRebindOnError performs a rebind and restun if the error is defined and
+// any conditionals are met.
+func (c *Conn) maybeRebindOnError(os string, err error) bool {
+	return false
+}