From 0b4d702a543079453e6e59de7d340299b5ca89ed Mon Sep 17 00:00:00 2001 From: James Tucker Date: Wed, 25 Sep 2024 16:33:59 -0700 Subject: [PATCH] wgengine/magicsock: report health warnings when blocked by firewalls macOS and Linux both return EPERM when sendto(2) is blocked by the firewall. Sometimes these blocks and transient, in the case of a fault between EDR software and a kernel, and at other times this may be a persistent state. Report a health warning for the state, and rebind only up to once every 15m in order to avoid excess workload. Updates #11710 Updates #12891 Updates #13511 Signed-off-by: James Tucker --- net/netcheck/netcheck.go | 2 +- net/neterror/neterror.go | 35 +++-------- ...eterror_linux_test.go => neterror_test.go} | 4 +- net/portmapper/portmapper.go | 6 +- wgengine/magicsock/magicsock.go | 61 +++++++++++-------- wgengine/magicsock/magicsock_test.go | 17 ++---- 6 files changed, 52 insertions(+), 73 deletions(-) rename net/neterror/{neterror_linux_test.go => neterror_test.go} (89%) diff --git a/net/netcheck/netcheck.go b/net/netcheck/netcheck.go index 003b5fbf8..e3c6f0346 100644 --- a/net/netcheck/netcheck.go +++ b/net/netcheck/netcheck.go @@ -1470,7 +1470,7 @@ func (rs *reportState) runProbe(ctx context.Context, dm *tailcfg.DERPMap, probe } n, err := rs.c.SendPacket(req, addr) - if n == len(req) && err == nil || neterror.TreatAsLostUDP(err) { + if n == len(req) && err == nil || neterror.IsEPERM(err) { rs.mu.Lock() switch probe.proto { case probeIPv4: diff --git a/net/neterror/neterror.go b/net/neterror/neterror.go index e2387440d..553f6f774 100644 --- a/net/neterror/neterror.go +++ b/net/neterror/neterror.go @@ -7,38 +7,19 @@ import ( "errors" "fmt" - "runtime" "syscall" ) var errEPERM error = syscall.EPERM // box it into interface just once -// TreatAsLostUDP reports whether err is an error from a UDP send -// operation that should be treated as a UDP packet that just got -// lost. -// -// Notably, on Linux this reports true for EPERM errors (from outbound -// firewall blocks) which aren't really send errors; they're just -// sends that are never going to make it because the local OS blocked -// it. -func TreatAsLostUDP(err error) bool { - if err == nil { - return false - } - switch runtime.GOOS { - case "linux": - // Linux, while not documented in the man page, - // returns EPERM when there's an OUTPUT rule with -j - // DROP or -j REJECT. We use this very specific - // Linux+EPERM check rather than something super broad - // like net.Error.Temporary which could be anything. - // - // For now we only do this on Linux, as such outgoing - // firewall violations mapping to syscall errors - // hasn't yet been observed on other OSes. - return errors.Is(err, errEPERM) - } - return false +// IsEPERM returns true if the error is or wraps EPERM. +func IsEPERM(err error) bool { + // Linux and macOS, while not documented in the man page, returns EPERM when + // there's a rule rejecting matching sendto(2) destinations. + // + // We use this very specific Linux+EPERM check rather than something super + // broad like net.Error.Temporary which could be anything. + return errors.Is(err, errEPERM) } var packetWasTruncated func(error) bool // non-nil on Windows at least diff --git a/net/neterror/neterror_linux_test.go b/net/neterror/neterror_test.go similarity index 89% rename from net/neterror/neterror_linux_test.go rename to net/neterror/neterror_test.go index 5b9906074..ec2632166 100644 --- a/net/neterror/neterror_linux_test.go +++ b/net/neterror/neterror_test.go @@ -11,7 +11,7 @@ "testing" ) -func TestTreatAsLostUDP(t *testing.T) { +func TestIsEPERM(t *testing.T) { tests := []struct { name string err error @@ -45,7 +45,7 @@ func TestTreatAsLostUDP(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := TreatAsLostUDP(tt.err); got != tt.want { + if got := IsEPERM(tt.err); got != tt.want { t.Errorf("got = %v; want %v", got, tt.want) } }) diff --git a/net/portmapper/portmapper.go b/net/portmapper/portmapper.go index 7cdca1fb3..55acb1f31 100644 --- a/net/portmapper/portmapper.go +++ b/net/portmapper/portmapper.go @@ -613,7 +613,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor // Only do PCP mapping in the case when PMP did not appear to be available recently. pkt := buildPCPRequestMappingPacket(myIP, localPort, prevPort, pcpMapLifetimeSec, wildcardIP) if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil { - if neterror.TreatAsLostUDP(err) { + if neterror.IsEPERM(err) { err = NoMappingError{ErrNoPortMappingServices} } return netip.AddrPort{}, err @@ -622,7 +622,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor // Ask for our external address if needed. if !m.external.Addr().IsValid() { if _, err := uc.WriteToUDPAddrPort(pmpReqExternalAddrPacket, pxpAddr); err != nil { - if neterror.TreatAsLostUDP(err) { + if neterror.IsEPERM(err) { err = NoMappingError{ErrNoPortMappingServices} } return netip.AddrPort{}, err @@ -631,7 +631,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor pkt := buildPMPRequestMappingPacket(localPort, prevPort, pmpMapLifetimeSec) if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil { - if neterror.TreatAsLostUDP(err) { + if neterror.IsEPERM(err) { err = NoMappingError{ErrNoPortMappingServices} } return netip.AddrPort{}, err diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index aceab3be5..575fcb823 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -20,7 +20,6 @@ "strings" "sync" "sync/atomic" - "syscall" "time" "github.com/tailscale/wireguard-go/conn" @@ -323,6 +322,18 @@ type Conn struct { staticEndpoints views.Slice[netip.AddrPort] } +// sendToBlockedWithEPERMWarning is set when a sendto call returns an error containing EPERM. +var sendToBlockedWithEPERMWarning = health.Register(&health.Warnable{ + Code: "firewall-blocking-udp", + Severity: health.SeverityMedium, + Title: "Tailscale blocked by system firewall", + Text: func(args health.Args) string { + return "The operating system firewall is blocking UDP sends, preventing direct connections. Try reconfiguring your firewall or checking the configuration of EDR software." + }, + // TODO(raggi): we could do with a state indicating that we'll have degraded connectivity, as in this example we'll likely fail to relayed conns. + ImpactsConnectivity: false, +}) + // SetDebugLoggingEnabled controls whether spammy debug logging is enabled. // // Note that this is currently independent from the log levels, even though @@ -1122,7 +1133,7 @@ func (c *Conn) sendUDPBatch(addr netip.AddrPort, buffs [][]byte) (sent bool, err c.logf("magicsock: %s", errGSO.Error()) err = errGSO.RetryErr } else { - _ = c.maybeRebindOnError(runtime.GOOS, err) + _ = c.maybeRebindOnError(err) } } return err == nil, err @@ -1137,7 +1148,7 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte) (sent bool, err error) { sent, err = c.sendUDPStd(ipp, b) if err != nil { metricSendUDPError.Add(1) - _ = c.maybeRebindOnError(runtime.GOOS, err) + _ = c.maybeRebindOnError(err) } else { if sent { metricSendUDP.Add(1) @@ -1146,29 +1157,22 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte) (sent bool, err error) { return } -// maybeRebindOnError performs a rebind and restun if the error is defined and -// any conditionals are met. -func (c *Conn) maybeRebindOnError(os string, err error) bool { - switch { - case errors.Is(err, syscall.EPERM): +// maybeRebindOnError performs a rebind and restun if the error may potentially +// be fixed by performing a rebind and one has not been performed recently. It +// returns true if a rebind was performed. +func (c *Conn) maybeRebindOnError(err error) bool { + if neterror.IsEPERM(err) { + c.health.SetUnhealthy(sendToBlockedWithEPERMWarning, nil) + why := "operation-not-permitted-rebind" - switch os { - // We currently will only rebind and restun on a syscall.EPERM if it is experienced - // on a client running darwin. - // TODO(charlotte, raggi): expand os options if required. - case "darwin": - // TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent - // EPERMs. - if c.lastEPERMRebind.Load().Before(time.Now().Add(-5 * time.Second)) { - c.logf("magicsock: performing %q", why) - c.lastEPERMRebind.Store(time.Now()) - c.Rebind() - go c.ReSTUN(why) - return true - } - default: - c.logf("magicsock: not performing %q", why) - return false + // TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent + // EPERMs. + if c.lastEPERMRebind.Load().Before(time.Now().Add(-15 * time.Minute)) { + c.logf("magicsock: performing %q", why) + c.lastEPERMRebind.Store(time.Now()) + c.Rebind() + go c.ReSTUN(why) + return true } } return false @@ -1201,12 +1205,14 @@ func (c *Conn) sendUDPStd(addr netip.AddrPort, b []byte) (sent bool, err error) switch { case addr.Addr().Is4(): _, err = c.pconn4.WriteToUDPAddrPort(b, addr) - if err != nil && (c.noV4.Load() || neterror.TreatAsLostUDP(err)) { + if err != nil && (c.noV4.Load() || neterror.IsEPERM(err)) { + c.maybeRebindOnError(err) return false, nil } case addr.Addr().Is6(): _, err = c.pconn6.WriteToUDPAddrPort(b, addr) - if err != nil && (c.noV6.Load() || neterror.TreatAsLostUDP(err)) { + if err != nil && (c.noV6.Load() || neterror.IsEPERM(err)) { + c.maybeRebindOnError(err) return false, nil } default: @@ -2607,6 +2613,7 @@ func (c *Conn) rebind(curPortFate currentPortFate) error { // It should be followed by a call to ReSTUN. func (c *Conn) Rebind() { metricRebindCalls.Add(1) + c.health.SetHealthy(sendToBlockedWithEPERMWarning) if err := c.rebind(keepCurrentPort); err != nil { c.logf("%v", err) return diff --git a/wgengine/magicsock/magicsock_test.go b/wgengine/magicsock/magicsock_test.go index 6b2d961b9..86c181892 100644 --- a/wgengine/magicsock/magicsock_test.go +++ b/wgengine/magicsock/magicsock_test.go @@ -2967,29 +2967,20 @@ func TestMaybeRebindOnError(t *testing.T) { err := fmt.Errorf("outer err: %w", syscall.EPERM) - t.Run("darwin-rebind", func(t *testing.T) { + t.Run("rebind", func(t *testing.T) { conn := newTestConn(t) defer conn.Close() - rebound := conn.maybeRebindOnError("darwin", err) + rebound := conn.maybeRebindOnError(err) if !rebound { t.Errorf("darwin should rebind on syscall.EPERM") } }) - t.Run("linux-not-rebind", func(t *testing.T) { - conn := newTestConn(t) - defer conn.Close() - rebound := conn.maybeRebindOnError("linux", err) - if rebound { - t.Errorf("linux should not rebind on syscall.EPERM") - } - }) - t.Run("no-frequent-rebind", func(t *testing.T) { conn := newTestConn(t) defer conn.Close() - conn.lastEPERMRebind.Store(time.Now().Add(-1 * time.Second)) - rebound := conn.maybeRebindOnError("darwin", err) + conn.lastEPERMRebind.Store(time.Now().Add(-60 * time.Second)) + rebound := conn.maybeRebindOnError(err) if rebound { t.Errorf("darwin should not rebind on syscall.EPERM within 5 seconds of last") }