mirror of
https://github.com/tailscale/tailscale.git
synced 2024-11-25 11:05:45 +00:00
wgengine/magicsock: report health warnings when blocked by firewalls
macOS and Linux both return EPERM when sendto(2) is blocked by the firewall. Sometimes these blocks and transient, in the case of a fault between EDR software and a kernel, and at other times this may be a persistent state. Report a health warning for the state, and rebind only up to once every 15m in order to avoid excess workload. Updates #11710 Updates #12891 Updates #13511 Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
parent
9eb59c72c1
commit
0b4d702a54
@ -1470,7 +1470,7 @@ func (rs *reportState) runProbe(ctx context.Context, dm *tailcfg.DERPMap, probe
|
||||
}
|
||||
|
||||
n, err := rs.c.SendPacket(req, addr)
|
||||
if n == len(req) && err == nil || neterror.TreatAsLostUDP(err) {
|
||||
if n == len(req) && err == nil || neterror.IsEPERM(err) {
|
||||
rs.mu.Lock()
|
||||
switch probe.proto {
|
||||
case probeIPv4:
|
||||
|
@ -7,38 +7,19 @@
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"runtime"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
var errEPERM error = syscall.EPERM // box it into interface just once
|
||||
|
||||
// TreatAsLostUDP reports whether err is an error from a UDP send
|
||||
// operation that should be treated as a UDP packet that just got
|
||||
// lost.
|
||||
//
|
||||
// Notably, on Linux this reports true for EPERM errors (from outbound
|
||||
// firewall blocks) which aren't really send errors; they're just
|
||||
// sends that are never going to make it because the local OS blocked
|
||||
// it.
|
||||
func TreatAsLostUDP(err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
switch runtime.GOOS {
|
||||
case "linux":
|
||||
// Linux, while not documented in the man page,
|
||||
// returns EPERM when there's an OUTPUT rule with -j
|
||||
// DROP or -j REJECT. We use this very specific
|
||||
// Linux+EPERM check rather than something super broad
|
||||
// like net.Error.Temporary which could be anything.
|
||||
//
|
||||
// For now we only do this on Linux, as such outgoing
|
||||
// firewall violations mapping to syscall errors
|
||||
// hasn't yet been observed on other OSes.
|
||||
return errors.Is(err, errEPERM)
|
||||
}
|
||||
return false
|
||||
// IsEPERM returns true if the error is or wraps EPERM.
|
||||
func IsEPERM(err error) bool {
|
||||
// Linux and macOS, while not documented in the man page, returns EPERM when
|
||||
// there's a rule rejecting matching sendto(2) destinations.
|
||||
//
|
||||
// We use this very specific Linux+EPERM check rather than something super
|
||||
// broad like net.Error.Temporary which could be anything.
|
||||
return errors.Is(err, errEPERM)
|
||||
}
|
||||
|
||||
var packetWasTruncated func(error) bool // non-nil on Windows at least
|
||||
|
@ -11,7 +11,7 @@
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestTreatAsLostUDP(t *testing.T) {
|
||||
func TestIsEPERM(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
err error
|
||||
@ -45,7 +45,7 @@ func TestTreatAsLostUDP(t *testing.T) {
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := TreatAsLostUDP(tt.err); got != tt.want {
|
||||
if got := IsEPERM(tt.err); got != tt.want {
|
||||
t.Errorf("got = %v; want %v", got, tt.want)
|
||||
}
|
||||
})
|
@ -613,7 +613,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
|
||||
// Only do PCP mapping in the case when PMP did not appear to be available recently.
|
||||
pkt := buildPCPRequestMappingPacket(myIP, localPort, prevPort, pcpMapLifetimeSec, wildcardIP)
|
||||
if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil {
|
||||
if neterror.TreatAsLostUDP(err) {
|
||||
if neterror.IsEPERM(err) {
|
||||
err = NoMappingError{ErrNoPortMappingServices}
|
||||
}
|
||||
return netip.AddrPort{}, err
|
||||
@ -622,7 +622,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
|
||||
// Ask for our external address if needed.
|
||||
if !m.external.Addr().IsValid() {
|
||||
if _, err := uc.WriteToUDPAddrPort(pmpReqExternalAddrPacket, pxpAddr); err != nil {
|
||||
if neterror.TreatAsLostUDP(err) {
|
||||
if neterror.IsEPERM(err) {
|
||||
err = NoMappingError{ErrNoPortMappingServices}
|
||||
}
|
||||
return netip.AddrPort{}, err
|
||||
@ -631,7 +631,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
|
||||
|
||||
pkt := buildPMPRequestMappingPacket(localPort, prevPort, pmpMapLifetimeSec)
|
||||
if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil {
|
||||
if neterror.TreatAsLostUDP(err) {
|
||||
if neterror.IsEPERM(err) {
|
||||
err = NoMappingError{ErrNoPortMappingServices}
|
||||
}
|
||||
return netip.AddrPort{}, err
|
||||
|
@ -20,7 +20,6 @@
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/tailscale/wireguard-go/conn"
|
||||
@ -323,6 +322,18 @@ type Conn struct {
|
||||
staticEndpoints views.Slice[netip.AddrPort]
|
||||
}
|
||||
|
||||
// sendToBlockedWithEPERMWarning is set when a sendto call returns an error containing EPERM.
|
||||
var sendToBlockedWithEPERMWarning = health.Register(&health.Warnable{
|
||||
Code: "firewall-blocking-udp",
|
||||
Severity: health.SeverityMedium,
|
||||
Title: "Tailscale blocked by system firewall",
|
||||
Text: func(args health.Args) string {
|
||||
return "The operating system firewall is blocking UDP sends, preventing direct connections. Try reconfiguring your firewall or checking the configuration of EDR software."
|
||||
},
|
||||
// TODO(raggi): we could do with a state indicating that we'll have degraded connectivity, as in this example we'll likely fail to relayed conns.
|
||||
ImpactsConnectivity: false,
|
||||
})
|
||||
|
||||
// SetDebugLoggingEnabled controls whether spammy debug logging is enabled.
|
||||
//
|
||||
// Note that this is currently independent from the log levels, even though
|
||||
@ -1122,7 +1133,7 @@ func (c *Conn) sendUDPBatch(addr netip.AddrPort, buffs [][]byte) (sent bool, err
|
||||
c.logf("magicsock: %s", errGSO.Error())
|
||||
err = errGSO.RetryErr
|
||||
} else {
|
||||
_ = c.maybeRebindOnError(runtime.GOOS, err)
|
||||
_ = c.maybeRebindOnError(err)
|
||||
}
|
||||
}
|
||||
return err == nil, err
|
||||
@ -1137,7 +1148,7 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte) (sent bool, err error) {
|
||||
sent, err = c.sendUDPStd(ipp, b)
|
||||
if err != nil {
|
||||
metricSendUDPError.Add(1)
|
||||
_ = c.maybeRebindOnError(runtime.GOOS, err)
|
||||
_ = c.maybeRebindOnError(err)
|
||||
} else {
|
||||
if sent {
|
||||
metricSendUDP.Add(1)
|
||||
@ -1146,29 +1157,22 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte) (sent bool, err error) {
|
||||
return
|
||||
}
|
||||
|
||||
// maybeRebindOnError performs a rebind and restun if the error is defined and
|
||||
// any conditionals are met.
|
||||
func (c *Conn) maybeRebindOnError(os string, err error) bool {
|
||||
switch {
|
||||
case errors.Is(err, syscall.EPERM):
|
||||
// maybeRebindOnError performs a rebind and restun if the error may potentially
|
||||
// be fixed by performing a rebind and one has not been performed recently. It
|
||||
// returns true if a rebind was performed.
|
||||
func (c *Conn) maybeRebindOnError(err error) bool {
|
||||
if neterror.IsEPERM(err) {
|
||||
c.health.SetUnhealthy(sendToBlockedWithEPERMWarning, nil)
|
||||
|
||||
why := "operation-not-permitted-rebind"
|
||||
switch os {
|
||||
// We currently will only rebind and restun on a syscall.EPERM if it is experienced
|
||||
// on a client running darwin.
|
||||
// TODO(charlotte, raggi): expand os options if required.
|
||||
case "darwin":
|
||||
// TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
|
||||
// EPERMs.
|
||||
if c.lastEPERMRebind.Load().Before(time.Now().Add(-5 * time.Second)) {
|
||||
c.logf("magicsock: performing %q", why)
|
||||
c.lastEPERMRebind.Store(time.Now())
|
||||
c.Rebind()
|
||||
go c.ReSTUN(why)
|
||||
return true
|
||||
}
|
||||
default:
|
||||
c.logf("magicsock: not performing %q", why)
|
||||
return false
|
||||
// TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
|
||||
// EPERMs.
|
||||
if c.lastEPERMRebind.Load().Before(time.Now().Add(-15 * time.Minute)) {
|
||||
c.logf("magicsock: performing %q", why)
|
||||
c.lastEPERMRebind.Store(time.Now())
|
||||
c.Rebind()
|
||||
go c.ReSTUN(why)
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
@ -1201,12 +1205,14 @@ func (c *Conn) sendUDPStd(addr netip.AddrPort, b []byte) (sent bool, err error)
|
||||
switch {
|
||||
case addr.Addr().Is4():
|
||||
_, err = c.pconn4.WriteToUDPAddrPort(b, addr)
|
||||
if err != nil && (c.noV4.Load() || neterror.TreatAsLostUDP(err)) {
|
||||
if err != nil && (c.noV4.Load() || neterror.IsEPERM(err)) {
|
||||
c.maybeRebindOnError(err)
|
||||
return false, nil
|
||||
}
|
||||
case addr.Addr().Is6():
|
||||
_, err = c.pconn6.WriteToUDPAddrPort(b, addr)
|
||||
if err != nil && (c.noV6.Load() || neterror.TreatAsLostUDP(err)) {
|
||||
if err != nil && (c.noV6.Load() || neterror.IsEPERM(err)) {
|
||||
c.maybeRebindOnError(err)
|
||||
return false, nil
|
||||
}
|
||||
default:
|
||||
@ -2607,6 +2613,7 @@ func (c *Conn) rebind(curPortFate currentPortFate) error {
|
||||
// It should be followed by a call to ReSTUN.
|
||||
func (c *Conn) Rebind() {
|
||||
metricRebindCalls.Add(1)
|
||||
c.health.SetHealthy(sendToBlockedWithEPERMWarning)
|
||||
if err := c.rebind(keepCurrentPort); err != nil {
|
||||
c.logf("%v", err)
|
||||
return
|
||||
|
@ -2967,29 +2967,20 @@ func TestMaybeRebindOnError(t *testing.T) {
|
||||
|
||||
err := fmt.Errorf("outer err: %w", syscall.EPERM)
|
||||
|
||||
t.Run("darwin-rebind", func(t *testing.T) {
|
||||
t.Run("rebind", func(t *testing.T) {
|
||||
conn := newTestConn(t)
|
||||
defer conn.Close()
|
||||
rebound := conn.maybeRebindOnError("darwin", err)
|
||||
rebound := conn.maybeRebindOnError(err)
|
||||
if !rebound {
|
||||
t.Errorf("darwin should rebind on syscall.EPERM")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("linux-not-rebind", func(t *testing.T) {
|
||||
conn := newTestConn(t)
|
||||
defer conn.Close()
|
||||
rebound := conn.maybeRebindOnError("linux", err)
|
||||
if rebound {
|
||||
t.Errorf("linux should not rebind on syscall.EPERM")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("no-frequent-rebind", func(t *testing.T) {
|
||||
conn := newTestConn(t)
|
||||
defer conn.Close()
|
||||
conn.lastEPERMRebind.Store(time.Now().Add(-1 * time.Second))
|
||||
rebound := conn.maybeRebindOnError("darwin", err)
|
||||
conn.lastEPERMRebind.Store(time.Now().Add(-60 * time.Second))
|
||||
rebound := conn.maybeRebindOnError(err)
|
||||
if rebound {
|
||||
t.Errorf("darwin should not rebind on syscall.EPERM within 5 seconds of last")
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user