wgengine/magicsock: report health warnings when blocked by firewalls

macOS and Linux both return EPERM when sendto(2) is blocked by the
firewall. Sometimes these blocks and transient, in the case of a fault
between EDR software and a kernel, and at other times this may be a
persistent state.

Report a health warning for the state, and rebind only up to once every
15m in order to avoid excess workload.

Updates #11710
Updates #12891
Updates #13511

Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
James Tucker 2024-09-25 16:33:59 -07:00
parent 9eb59c72c1
commit 0b4d702a54
No known key found for this signature in database
6 changed files with 52 additions and 73 deletions

View File

@ -1470,7 +1470,7 @@ func (rs *reportState) runProbe(ctx context.Context, dm *tailcfg.DERPMap, probe
}
n, err := rs.c.SendPacket(req, addr)
if n == len(req) && err == nil || neterror.TreatAsLostUDP(err) {
if n == len(req) && err == nil || neterror.IsEPERM(err) {
rs.mu.Lock()
switch probe.proto {
case probeIPv4:

View File

@ -7,38 +7,19 @@
import (
"errors"
"fmt"
"runtime"
"syscall"
)
var errEPERM error = syscall.EPERM // box it into interface just once
// TreatAsLostUDP reports whether err is an error from a UDP send
// operation that should be treated as a UDP packet that just got
// lost.
//
// Notably, on Linux this reports true for EPERM errors (from outbound
// firewall blocks) which aren't really send errors; they're just
// sends that are never going to make it because the local OS blocked
// it.
func TreatAsLostUDP(err error) bool {
if err == nil {
return false
}
switch runtime.GOOS {
case "linux":
// Linux, while not documented in the man page,
// returns EPERM when there's an OUTPUT rule with -j
// DROP or -j REJECT. We use this very specific
// Linux+EPERM check rather than something super broad
// like net.Error.Temporary which could be anything.
//
// For now we only do this on Linux, as such outgoing
// firewall violations mapping to syscall errors
// hasn't yet been observed on other OSes.
return errors.Is(err, errEPERM)
}
return false
// IsEPERM returns true if the error is or wraps EPERM.
func IsEPERM(err error) bool {
// Linux and macOS, while not documented in the man page, returns EPERM when
// there's a rule rejecting matching sendto(2) destinations.
//
// We use this very specific Linux+EPERM check rather than something super
// broad like net.Error.Temporary which could be anything.
return errors.Is(err, errEPERM)
}
var packetWasTruncated func(error) bool // non-nil on Windows at least

View File

@ -11,7 +11,7 @@
"testing"
)
func TestTreatAsLostUDP(t *testing.T) {
func TestIsEPERM(t *testing.T) {
tests := []struct {
name string
err error
@ -45,7 +45,7 @@ func TestTreatAsLostUDP(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := TreatAsLostUDP(tt.err); got != tt.want {
if got := IsEPERM(tt.err); got != tt.want {
t.Errorf("got = %v; want %v", got, tt.want)
}
})

View File

@ -613,7 +613,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
// Only do PCP mapping in the case when PMP did not appear to be available recently.
pkt := buildPCPRequestMappingPacket(myIP, localPort, prevPort, pcpMapLifetimeSec, wildcardIP)
if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil {
if neterror.TreatAsLostUDP(err) {
if neterror.IsEPERM(err) {
err = NoMappingError{ErrNoPortMappingServices}
}
return netip.AddrPort{}, err
@ -622,7 +622,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
// Ask for our external address if needed.
if !m.external.Addr().IsValid() {
if _, err := uc.WriteToUDPAddrPort(pmpReqExternalAddrPacket, pxpAddr); err != nil {
if neterror.TreatAsLostUDP(err) {
if neterror.IsEPERM(err) {
err = NoMappingError{ErrNoPortMappingServices}
}
return netip.AddrPort{}, err
@ -631,7 +631,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
pkt := buildPMPRequestMappingPacket(localPort, prevPort, pmpMapLifetimeSec)
if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil {
if neterror.TreatAsLostUDP(err) {
if neterror.IsEPERM(err) {
err = NoMappingError{ErrNoPortMappingServices}
}
return netip.AddrPort{}, err

View File

@ -20,7 +20,6 @@
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/tailscale/wireguard-go/conn"
@ -323,6 +322,18 @@ type Conn struct {
staticEndpoints views.Slice[netip.AddrPort]
}
// sendToBlockedWithEPERMWarning is set when a sendto call returns an error containing EPERM.
var sendToBlockedWithEPERMWarning = health.Register(&health.Warnable{
Code: "firewall-blocking-udp",
Severity: health.SeverityMedium,
Title: "Tailscale blocked by system firewall",
Text: func(args health.Args) string {
return "The operating system firewall is blocking UDP sends, preventing direct connections. Try reconfiguring your firewall or checking the configuration of EDR software."
},
// TODO(raggi): we could do with a state indicating that we'll have degraded connectivity, as in this example we'll likely fail to relayed conns.
ImpactsConnectivity: false,
})
// SetDebugLoggingEnabled controls whether spammy debug logging is enabled.
//
// Note that this is currently independent from the log levels, even though
@ -1122,7 +1133,7 @@ func (c *Conn) sendUDPBatch(addr netip.AddrPort, buffs [][]byte) (sent bool, err
c.logf("magicsock: %s", errGSO.Error())
err = errGSO.RetryErr
} else {
_ = c.maybeRebindOnError(runtime.GOOS, err)
_ = c.maybeRebindOnError(err)
}
}
return err == nil, err
@ -1137,7 +1148,7 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte) (sent bool, err error) {
sent, err = c.sendUDPStd(ipp, b)
if err != nil {
metricSendUDPError.Add(1)
_ = c.maybeRebindOnError(runtime.GOOS, err)
_ = c.maybeRebindOnError(err)
} else {
if sent {
metricSendUDP.Add(1)
@ -1146,29 +1157,22 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte) (sent bool, err error) {
return
}
// maybeRebindOnError performs a rebind and restun if the error is defined and
// any conditionals are met.
func (c *Conn) maybeRebindOnError(os string, err error) bool {
switch {
case errors.Is(err, syscall.EPERM):
// maybeRebindOnError performs a rebind and restun if the error may potentially
// be fixed by performing a rebind and one has not been performed recently. It
// returns true if a rebind was performed.
func (c *Conn) maybeRebindOnError(err error) bool {
if neterror.IsEPERM(err) {
c.health.SetUnhealthy(sendToBlockedWithEPERMWarning, nil)
why := "operation-not-permitted-rebind"
switch os {
// We currently will only rebind and restun on a syscall.EPERM if it is experienced
// on a client running darwin.
// TODO(charlotte, raggi): expand os options if required.
case "darwin":
// TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
// EPERMs.
if c.lastEPERMRebind.Load().Before(time.Now().Add(-5 * time.Second)) {
c.logf("magicsock: performing %q", why)
c.lastEPERMRebind.Store(time.Now())
c.Rebind()
go c.ReSTUN(why)
return true
}
default:
c.logf("magicsock: not performing %q", why)
return false
// TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
// EPERMs.
if c.lastEPERMRebind.Load().Before(time.Now().Add(-15 * time.Minute)) {
c.logf("magicsock: performing %q", why)
c.lastEPERMRebind.Store(time.Now())
c.Rebind()
go c.ReSTUN(why)
return true
}
}
return false
@ -1201,12 +1205,14 @@ func (c *Conn) sendUDPStd(addr netip.AddrPort, b []byte) (sent bool, err error)
switch {
case addr.Addr().Is4():
_, err = c.pconn4.WriteToUDPAddrPort(b, addr)
if err != nil && (c.noV4.Load() || neterror.TreatAsLostUDP(err)) {
if err != nil && (c.noV4.Load() || neterror.IsEPERM(err)) {
c.maybeRebindOnError(err)
return false, nil
}
case addr.Addr().Is6():
_, err = c.pconn6.WriteToUDPAddrPort(b, addr)
if err != nil && (c.noV6.Load() || neterror.TreatAsLostUDP(err)) {
if err != nil && (c.noV6.Load() || neterror.IsEPERM(err)) {
c.maybeRebindOnError(err)
return false, nil
}
default:
@ -2607,6 +2613,7 @@ func (c *Conn) rebind(curPortFate currentPortFate) error {
// It should be followed by a call to ReSTUN.
func (c *Conn) Rebind() {
metricRebindCalls.Add(1)
c.health.SetHealthy(sendToBlockedWithEPERMWarning)
if err := c.rebind(keepCurrentPort); err != nil {
c.logf("%v", err)
return

View File

@ -2967,29 +2967,20 @@ func TestMaybeRebindOnError(t *testing.T) {
err := fmt.Errorf("outer err: %w", syscall.EPERM)
t.Run("darwin-rebind", func(t *testing.T) {
t.Run("rebind", func(t *testing.T) {
conn := newTestConn(t)
defer conn.Close()
rebound := conn.maybeRebindOnError("darwin", err)
rebound := conn.maybeRebindOnError(err)
if !rebound {
t.Errorf("darwin should rebind on syscall.EPERM")
}
})
t.Run("linux-not-rebind", func(t *testing.T) {
conn := newTestConn(t)
defer conn.Close()
rebound := conn.maybeRebindOnError("linux", err)
if rebound {
t.Errorf("linux should not rebind on syscall.EPERM")
}
})
t.Run("no-frequent-rebind", func(t *testing.T) {
conn := newTestConn(t)
defer conn.Close()
conn.lastEPERMRebind.Store(time.Now().Add(-1 * time.Second))
rebound := conn.maybeRebindOnError("darwin", err)
conn.lastEPERMRebind.Store(time.Now().Add(-60 * time.Second))
rebound := conn.maybeRebindOnError(err)
if rebound {
t.Errorf("darwin should not rebind on syscall.EPERM within 5 seconds of last")
}