wgengine/magicsock: report health warnings when blocked by firewalls

macOS and Linux both return EPERM when sendto(2) is blocked by the
firewall. Sometimes these blocks are transient, as in the case of a fault
between EDR software and the kernel, and at other times this may be a
persistent state.

Report a health warning for the state, and rebind at most once every
15 minutes in order to avoid excess workload.

Updates #11710
Updates #12891
Updates #13511

Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
James Tucker 2024-09-25 16:33:59 -07:00
parent 9eb59c72c1
commit 0b4d702a54
No known key found for this signature in database
6 changed files with 52 additions and 73 deletions

View File

@ -1470,7 +1470,7 @@ func (rs *reportState) runProbe(ctx context.Context, dm *tailcfg.DERPMap, probe
} }
n, err := rs.c.SendPacket(req, addr) n, err := rs.c.SendPacket(req, addr)
if n == len(req) && err == nil || neterror.TreatAsLostUDP(err) { if n == len(req) && err == nil || neterror.IsEPERM(err) {
rs.mu.Lock() rs.mu.Lock()
switch probe.proto { switch probe.proto {
case probeIPv4: case probeIPv4:

View File

@ -7,38 +7,19 @@
import ( import (
"errors" "errors"
"fmt" "fmt"
"runtime"
"syscall" "syscall"
) )
var errEPERM error = syscall.EPERM // box it into interface just once var errEPERM error = syscall.EPERM // box it into interface just once
// TreatAsLostUDP reports whether err is an error from a UDP send // IsEPERM returns true if the error is or wraps EPERM.
// operation that should be treated as a UDP packet that just got func IsEPERM(err error) bool {
// lost. // Linux and macOS, while not documented in the man page, returns EPERM when
// // there's a rule rejecting matching sendto(2) destinations.
// Notably, on Linux this reports true for EPERM errors (from outbound
// firewall blocks) which aren't really send errors; they're just
// sends that are never going to make it because the local OS blocked
// it.
func TreatAsLostUDP(err error) bool {
if err == nil {
return false
}
switch runtime.GOOS {
case "linux":
// Linux, while not documented in the man page,
// returns EPERM when there's an OUTPUT rule with -j
// DROP or -j REJECT. We use this very specific
// Linux+EPERM check rather than something super broad
// like net.Error.Temporary which could be anything.
// //
// For now we only do this on Linux, as such outgoing // We use this very specific Linux+EPERM check rather than something super
// firewall violations mapping to syscall errors // broad like net.Error.Temporary which could be anything.
// hasn't yet been observed on other OSes.
return errors.Is(err, errEPERM) return errors.Is(err, errEPERM)
}
return false
} }
var packetWasTruncated func(error) bool // non-nil on Windows at least var packetWasTruncated func(error) bool // non-nil on Windows at least

View File

@ -11,7 +11,7 @@
"testing" "testing"
) )
func TestTreatAsLostUDP(t *testing.T) { func TestIsEPERM(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
err error err error
@ -45,7 +45,7 @@ func TestTreatAsLostUDP(t *testing.T) {
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
if got := TreatAsLostUDP(tt.err); got != tt.want { if got := IsEPERM(tt.err); got != tt.want {
t.Errorf("got = %v; want %v", got, tt.want) t.Errorf("got = %v; want %v", got, tt.want)
} }
}) })

View File

@ -613,7 +613,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
// Only do PCP mapping in the case when PMP did not appear to be available recently. // Only do PCP mapping in the case when PMP did not appear to be available recently.
pkt := buildPCPRequestMappingPacket(myIP, localPort, prevPort, pcpMapLifetimeSec, wildcardIP) pkt := buildPCPRequestMappingPacket(myIP, localPort, prevPort, pcpMapLifetimeSec, wildcardIP)
if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil { if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil {
if neterror.TreatAsLostUDP(err) { if neterror.IsEPERM(err) {
err = NoMappingError{ErrNoPortMappingServices} err = NoMappingError{ErrNoPortMappingServices}
} }
return netip.AddrPort{}, err return netip.AddrPort{}, err
@ -622,7 +622,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
// Ask for our external address if needed. // Ask for our external address if needed.
if !m.external.Addr().IsValid() { if !m.external.Addr().IsValid() {
if _, err := uc.WriteToUDPAddrPort(pmpReqExternalAddrPacket, pxpAddr); err != nil { if _, err := uc.WriteToUDPAddrPort(pmpReqExternalAddrPacket, pxpAddr); err != nil {
if neterror.TreatAsLostUDP(err) { if neterror.IsEPERM(err) {
err = NoMappingError{ErrNoPortMappingServices} err = NoMappingError{ErrNoPortMappingServices}
} }
return netip.AddrPort{}, err return netip.AddrPort{}, err
@ -631,7 +631,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
pkt := buildPMPRequestMappingPacket(localPort, prevPort, pmpMapLifetimeSec) pkt := buildPMPRequestMappingPacket(localPort, prevPort, pmpMapLifetimeSec)
if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil { if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil {
if neterror.TreatAsLostUDP(err) { if neterror.IsEPERM(err) {
err = NoMappingError{ErrNoPortMappingServices} err = NoMappingError{ErrNoPortMappingServices}
} }
return netip.AddrPort{}, err return netip.AddrPort{}, err

View File

@ -20,7 +20,6 @@
"strings" "strings"
"sync" "sync"
"sync/atomic" "sync/atomic"
"syscall"
"time" "time"
"github.com/tailscale/wireguard-go/conn" "github.com/tailscale/wireguard-go/conn"
@ -323,6 +322,18 @@ type Conn struct {
staticEndpoints views.Slice[netip.AddrPort] staticEndpoints views.Slice[netip.AddrPort]
} }
// sendToBlockedWithEPERMWarning is set when a sendto call returns an error containing EPERM.
var sendToBlockedWithEPERMWarning = health.Register(&health.Warnable{
Code: "firewall-blocking-udp",
Severity: health.SeverityMedium,
Title: "Tailscale blocked by system firewall",
Text: func(args health.Args) string {
return "The operating system firewall is blocking UDP sends, preventing direct connections. Try reconfiguring your firewall or checking the configuration of EDR software."
},
// TODO(raggi): we could do with a state indicating that we'll have degraded connectivity, as in this example we'll likely fail to relayed conns.
ImpactsConnectivity: false,
})
// SetDebugLoggingEnabled controls whether spammy debug logging is enabled. // SetDebugLoggingEnabled controls whether spammy debug logging is enabled.
// //
// Note that this is currently independent from the log levels, even though // Note that this is currently independent from the log levels, even though
@ -1122,7 +1133,7 @@ func (c *Conn) sendUDPBatch(addr netip.AddrPort, buffs [][]byte) (sent bool, err
c.logf("magicsock: %s", errGSO.Error()) c.logf("magicsock: %s", errGSO.Error())
err = errGSO.RetryErr err = errGSO.RetryErr
} else { } else {
_ = c.maybeRebindOnError(runtime.GOOS, err) _ = c.maybeRebindOnError(err)
} }
} }
return err == nil, err return err == nil, err
@ -1137,7 +1148,7 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte) (sent bool, err error) {
sent, err = c.sendUDPStd(ipp, b) sent, err = c.sendUDPStd(ipp, b)
if err != nil { if err != nil {
metricSendUDPError.Add(1) metricSendUDPError.Add(1)
_ = c.maybeRebindOnError(runtime.GOOS, err) _ = c.maybeRebindOnError(err)
} else { } else {
if sent { if sent {
metricSendUDP.Add(1) metricSendUDP.Add(1)
@ -1146,30 +1157,23 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte) (sent bool, err error) {
return return
} }
// maybeRebindOnError performs a rebind and restun if the error is defined and // maybeRebindOnError performs a rebind and restun if the error may potentially
// any conditionals are met. // be fixed by performing a rebind and one has not been performed recently. It
func (c *Conn) maybeRebindOnError(os string, err error) bool { // returns true if a rebind was performed.
switch { func (c *Conn) maybeRebindOnError(err error) bool {
case errors.Is(err, syscall.EPERM): if neterror.IsEPERM(err) {
c.health.SetUnhealthy(sendToBlockedWithEPERMWarning, nil)
why := "operation-not-permitted-rebind" why := "operation-not-permitted-rebind"
switch os {
// We currently will only rebind and restun on a syscall.EPERM if it is experienced
// on a client running darwin.
// TODO(charlotte, raggi): expand os options if required.
case "darwin":
// TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent // TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
// EPERMs. // EPERMs.
if c.lastEPERMRebind.Load().Before(time.Now().Add(-5 * time.Second)) { if c.lastEPERMRebind.Load().Before(time.Now().Add(-15 * time.Minute)) {
c.logf("magicsock: performing %q", why) c.logf("magicsock: performing %q", why)
c.lastEPERMRebind.Store(time.Now()) c.lastEPERMRebind.Store(time.Now())
c.Rebind() c.Rebind()
go c.ReSTUN(why) go c.ReSTUN(why)
return true return true
} }
default:
c.logf("magicsock: not performing %q", why)
return false
}
} }
return false return false
} }
@ -1201,12 +1205,14 @@ func (c *Conn) sendUDPStd(addr netip.AddrPort, b []byte) (sent bool, err error)
switch { switch {
case addr.Addr().Is4(): case addr.Addr().Is4():
_, err = c.pconn4.WriteToUDPAddrPort(b, addr) _, err = c.pconn4.WriteToUDPAddrPort(b, addr)
if err != nil && (c.noV4.Load() || neterror.TreatAsLostUDP(err)) { if err != nil && (c.noV4.Load() || neterror.IsEPERM(err)) {
c.maybeRebindOnError(err)
return false, nil return false, nil
} }
case addr.Addr().Is6(): case addr.Addr().Is6():
_, err = c.pconn6.WriteToUDPAddrPort(b, addr) _, err = c.pconn6.WriteToUDPAddrPort(b, addr)
if err != nil && (c.noV6.Load() || neterror.TreatAsLostUDP(err)) { if err != nil && (c.noV6.Load() || neterror.IsEPERM(err)) {
c.maybeRebindOnError(err)
return false, nil return false, nil
} }
default: default:
@ -2607,6 +2613,7 @@ func (c *Conn) rebind(curPortFate currentPortFate) error {
// It should be followed by a call to ReSTUN. // It should be followed by a call to ReSTUN.
func (c *Conn) Rebind() { func (c *Conn) Rebind() {
metricRebindCalls.Add(1) metricRebindCalls.Add(1)
c.health.SetHealthy(sendToBlockedWithEPERMWarning)
if err := c.rebind(keepCurrentPort); err != nil { if err := c.rebind(keepCurrentPort); err != nil {
c.logf("%v", err) c.logf("%v", err)
return return

View File

@ -2967,29 +2967,20 @@ func TestMaybeRebindOnError(t *testing.T) {
err := fmt.Errorf("outer err: %w", syscall.EPERM) err := fmt.Errorf("outer err: %w", syscall.EPERM)
t.Run("darwin-rebind", func(t *testing.T) { t.Run("rebind", func(t *testing.T) {
conn := newTestConn(t) conn := newTestConn(t)
defer conn.Close() defer conn.Close()
rebound := conn.maybeRebindOnError("darwin", err) rebound := conn.maybeRebindOnError(err)
if !rebound { if !rebound {
t.Errorf("darwin should rebind on syscall.EPERM") t.Errorf("darwin should rebind on syscall.EPERM")
} }
}) })
t.Run("linux-not-rebind", func(t *testing.T) {
conn := newTestConn(t)
defer conn.Close()
rebound := conn.maybeRebindOnError("linux", err)
if rebound {
t.Errorf("linux should not rebind on syscall.EPERM")
}
})
t.Run("no-frequent-rebind", func(t *testing.T) { t.Run("no-frequent-rebind", func(t *testing.T) {
conn := newTestConn(t) conn := newTestConn(t)
defer conn.Close() defer conn.Close()
conn.lastEPERMRebind.Store(time.Now().Add(-1 * time.Second)) conn.lastEPERMRebind.Store(time.Now().Add(-60 * time.Second))
rebound := conn.maybeRebindOnError("darwin", err) rebound := conn.maybeRebindOnError(err)
if rebound { if rebound {
t.Errorf("darwin should not rebind on syscall.EPERM within 5 seconds of last") t.Errorf("darwin should not rebind on syscall.EPERM within 5 seconds of last")
} }