mirror of
https://github.com/tailscale/tailscale.git
synced 2024-11-25 19:15:34 +00:00
wgengine/magicsock: report health warnings when blocked by firewalls
macOS and Linux both return EPERM when sendto(2) is blocked by the firewall. Sometimes these blocks are transient, in the case of a fault between EDR software and a kernel, and at other times this may be a persistent state. Report a health warning for the state, and rebind only up to once every 15m in order to avoid excess workload. Updates #11710 Updates #12891 Updates #13511 Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
parent
9eb59c72c1
commit
0b4d702a54
@ -1470,7 +1470,7 @@ func (rs *reportState) runProbe(ctx context.Context, dm *tailcfg.DERPMap, probe
|
|||||||
}
|
}
|
||||||
|
|
||||||
n, err := rs.c.SendPacket(req, addr)
|
n, err := rs.c.SendPacket(req, addr)
|
||||||
if n == len(req) && err == nil || neterror.TreatAsLostUDP(err) {
|
if n == len(req) && err == nil || neterror.IsEPERM(err) {
|
||||||
rs.mu.Lock()
|
rs.mu.Lock()
|
||||||
switch probe.proto {
|
switch probe.proto {
|
||||||
case probeIPv4:
|
case probeIPv4:
|
||||||
|
@ -7,38 +7,19 @@
|
|||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"runtime"
|
|
||||||
"syscall"
|
"syscall"
|
||||||
)
|
)
|
||||||
|
|
||||||
var errEPERM error = syscall.EPERM // box it into interface just once
|
var errEPERM error = syscall.EPERM // box it into interface just once
|
||||||
|
|
||||||
// TreatAsLostUDP reports whether err is an error from a UDP send
|
// IsEPERM returns true if the error is or wraps EPERM.
|
||||||
// operation that should be treated as a UDP packet that just got
|
func IsEPERM(err error) bool {
|
||||||
// lost.
|
// Linux and macOS, while not documented in the man page, returns EPERM when
|
||||||
//
|
// there's a rule rejecting matching sendto(2) destinations.
|
||||||
// Notably, on Linux this reports true for EPERM errors (from outbound
|
//
|
||||||
// firewall blocks) which aren't really send errors; they're just
|
// We use this very specific Linux+EPERM check rather than something super
|
||||||
// sends that are never going to make it because the local OS blocked
|
// broad like net.Error.Temporary which could be anything.
|
||||||
// it.
|
return errors.Is(err, errEPERM)
|
||||||
func TreatAsLostUDP(err error) bool {
|
|
||||||
if err == nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
switch runtime.GOOS {
|
|
||||||
case "linux":
|
|
||||||
// Linux, while not documented in the man page,
|
|
||||||
// returns EPERM when there's an OUTPUT rule with -j
|
|
||||||
// DROP or -j REJECT. We use this very specific
|
|
||||||
// Linux+EPERM check rather than something super broad
|
|
||||||
// like net.Error.Temporary which could be anything.
|
|
||||||
//
|
|
||||||
// For now we only do this on Linux, as such outgoing
|
|
||||||
// firewall violations mapping to syscall errors
|
|
||||||
// hasn't yet been observed on other OSes.
|
|
||||||
return errors.Is(err, errEPERM)
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var packetWasTruncated func(error) bool // non-nil on Windows at least
|
var packetWasTruncated func(error) bool // non-nil on Windows at least
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestTreatAsLostUDP(t *testing.T) {
|
func TestIsEPERM(t *testing.T) {
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
err error
|
err error
|
||||||
@ -45,7 +45,7 @@ func TestTreatAsLostUDP(t *testing.T) {
|
|||||||
}
|
}
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
if got := TreatAsLostUDP(tt.err); got != tt.want {
|
if got := IsEPERM(tt.err); got != tt.want {
|
||||||
t.Errorf("got = %v; want %v", got, tt.want)
|
t.Errorf("got = %v; want %v", got, tt.want)
|
||||||
}
|
}
|
||||||
})
|
})
|
@ -613,7 +613,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
|
|||||||
// Only do PCP mapping in the case when PMP did not appear to be available recently.
|
// Only do PCP mapping in the case when PMP did not appear to be available recently.
|
||||||
pkt := buildPCPRequestMappingPacket(myIP, localPort, prevPort, pcpMapLifetimeSec, wildcardIP)
|
pkt := buildPCPRequestMappingPacket(myIP, localPort, prevPort, pcpMapLifetimeSec, wildcardIP)
|
||||||
if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil {
|
if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil {
|
||||||
if neterror.TreatAsLostUDP(err) {
|
if neterror.IsEPERM(err) {
|
||||||
err = NoMappingError{ErrNoPortMappingServices}
|
err = NoMappingError{ErrNoPortMappingServices}
|
||||||
}
|
}
|
||||||
return netip.AddrPort{}, err
|
return netip.AddrPort{}, err
|
||||||
@ -622,7 +622,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
|
|||||||
// Ask for our external address if needed.
|
// Ask for our external address if needed.
|
||||||
if !m.external.Addr().IsValid() {
|
if !m.external.Addr().IsValid() {
|
||||||
if _, err := uc.WriteToUDPAddrPort(pmpReqExternalAddrPacket, pxpAddr); err != nil {
|
if _, err := uc.WriteToUDPAddrPort(pmpReqExternalAddrPacket, pxpAddr); err != nil {
|
||||||
if neterror.TreatAsLostUDP(err) {
|
if neterror.IsEPERM(err) {
|
||||||
err = NoMappingError{ErrNoPortMappingServices}
|
err = NoMappingError{ErrNoPortMappingServices}
|
||||||
}
|
}
|
||||||
return netip.AddrPort{}, err
|
return netip.AddrPort{}, err
|
||||||
@ -631,7 +631,7 @@ func (c *Client) createOrGetMapping(ctx context.Context) (external netip.AddrPor
|
|||||||
|
|
||||||
pkt := buildPMPRequestMappingPacket(localPort, prevPort, pmpMapLifetimeSec)
|
pkt := buildPMPRequestMappingPacket(localPort, prevPort, pmpMapLifetimeSec)
|
||||||
if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil {
|
if _, err := uc.WriteToUDPAddrPort(pkt, pxpAddr); err != nil {
|
||||||
if neterror.TreatAsLostUDP(err) {
|
if neterror.IsEPERM(err) {
|
||||||
err = NoMappingError{ErrNoPortMappingServices}
|
err = NoMappingError{ErrNoPortMappingServices}
|
||||||
}
|
}
|
||||||
return netip.AddrPort{}, err
|
return netip.AddrPort{}, err
|
||||||
|
@ -20,7 +20,6 @@
|
|||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"syscall"
|
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/tailscale/wireguard-go/conn"
|
"github.com/tailscale/wireguard-go/conn"
|
||||||
@ -323,6 +322,18 @@ type Conn struct {
|
|||||||
staticEndpoints views.Slice[netip.AddrPort]
|
staticEndpoints views.Slice[netip.AddrPort]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sendToBlockedWithEPERMWarning is set when a sendto call returns an error containing EPERM.
|
||||||
|
var sendToBlockedWithEPERMWarning = health.Register(&health.Warnable{
|
||||||
|
Code: "firewall-blocking-udp",
|
||||||
|
Severity: health.SeverityMedium,
|
||||||
|
Title: "Tailscale blocked by system firewall",
|
||||||
|
Text: func(args health.Args) string {
|
||||||
|
return "The operating system firewall is blocking UDP sends, preventing direct connections. Try reconfiguring your firewall or checking the configuration of EDR software."
|
||||||
|
},
|
||||||
|
// TODO(raggi): we could do with a state indicating that we'll have degraded connectivity, as in this example we'll likely fail to relayed conns.
|
||||||
|
ImpactsConnectivity: false,
|
||||||
|
})
|
||||||
|
|
||||||
// SetDebugLoggingEnabled controls whether spammy debug logging is enabled.
|
// SetDebugLoggingEnabled controls whether spammy debug logging is enabled.
|
||||||
//
|
//
|
||||||
// Note that this is currently independent from the log levels, even though
|
// Note that this is currently independent from the log levels, even though
|
||||||
@ -1122,7 +1133,7 @@ func (c *Conn) sendUDPBatch(addr netip.AddrPort, buffs [][]byte) (sent bool, err
|
|||||||
c.logf("magicsock: %s", errGSO.Error())
|
c.logf("magicsock: %s", errGSO.Error())
|
||||||
err = errGSO.RetryErr
|
err = errGSO.RetryErr
|
||||||
} else {
|
} else {
|
||||||
_ = c.maybeRebindOnError(runtime.GOOS, err)
|
_ = c.maybeRebindOnError(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return err == nil, err
|
return err == nil, err
|
||||||
@ -1137,7 +1148,7 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte) (sent bool, err error) {
|
|||||||
sent, err = c.sendUDPStd(ipp, b)
|
sent, err = c.sendUDPStd(ipp, b)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
metricSendUDPError.Add(1)
|
metricSendUDPError.Add(1)
|
||||||
_ = c.maybeRebindOnError(runtime.GOOS, err)
|
_ = c.maybeRebindOnError(err)
|
||||||
} else {
|
} else {
|
||||||
if sent {
|
if sent {
|
||||||
metricSendUDP.Add(1)
|
metricSendUDP.Add(1)
|
||||||
@ -1146,29 +1157,22 @@ func (c *Conn) sendUDP(ipp netip.AddrPort, b []byte) (sent bool, err error) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// maybeRebindOnError performs a rebind and restun if the error is defined and
|
// maybeRebindOnError performs a rebind and restun if the error may potentially
|
||||||
// any conditionals are met.
|
// be fixed by performing a rebind and one has not been performed recently. It
|
||||||
func (c *Conn) maybeRebindOnError(os string, err error) bool {
|
// returns true if a rebind was performed.
|
||||||
switch {
|
func (c *Conn) maybeRebindOnError(err error) bool {
|
||||||
case errors.Is(err, syscall.EPERM):
|
if neterror.IsEPERM(err) {
|
||||||
|
c.health.SetUnhealthy(sendToBlockedWithEPERMWarning, nil)
|
||||||
|
|
||||||
why := "operation-not-permitted-rebind"
|
why := "operation-not-permitted-rebind"
|
||||||
switch os {
|
// TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
|
||||||
// We currently will only rebind and restun on a syscall.EPERM if it is experienced
|
// EPERMs.
|
||||||
// on a client running darwin.
|
if c.lastEPERMRebind.Load().Before(time.Now().Add(-15 * time.Minute)) {
|
||||||
// TODO(charlotte, raggi): expand os options if required.
|
c.logf("magicsock: performing %q", why)
|
||||||
case "darwin":
|
c.lastEPERMRebind.Store(time.Now())
|
||||||
// TODO(charlotte): implement a backoff, so we don't end up in a rebind loop for persistent
|
c.Rebind()
|
||||||
// EPERMs.
|
go c.ReSTUN(why)
|
||||||
if c.lastEPERMRebind.Load().Before(time.Now().Add(-5 * time.Second)) {
|
return true
|
||||||
c.logf("magicsock: performing %q", why)
|
|
||||||
c.lastEPERMRebind.Store(time.Now())
|
|
||||||
c.Rebind()
|
|
||||||
go c.ReSTUN(why)
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
c.logf("magicsock: not performing %q", why)
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
@ -1201,12 +1205,14 @@ func (c *Conn) sendUDPStd(addr netip.AddrPort, b []byte) (sent bool, err error)
|
|||||||
switch {
|
switch {
|
||||||
case addr.Addr().Is4():
|
case addr.Addr().Is4():
|
||||||
_, err = c.pconn4.WriteToUDPAddrPort(b, addr)
|
_, err = c.pconn4.WriteToUDPAddrPort(b, addr)
|
||||||
if err != nil && (c.noV4.Load() || neterror.TreatAsLostUDP(err)) {
|
if err != nil && (c.noV4.Load() || neterror.IsEPERM(err)) {
|
||||||
|
c.maybeRebindOnError(err)
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
case addr.Addr().Is6():
|
case addr.Addr().Is6():
|
||||||
_, err = c.pconn6.WriteToUDPAddrPort(b, addr)
|
_, err = c.pconn6.WriteToUDPAddrPort(b, addr)
|
||||||
if err != nil && (c.noV6.Load() || neterror.TreatAsLostUDP(err)) {
|
if err != nil && (c.noV6.Load() || neterror.IsEPERM(err)) {
|
||||||
|
c.maybeRebindOnError(err)
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
@ -2607,6 +2613,7 @@ func (c *Conn) rebind(curPortFate currentPortFate) error {
|
|||||||
// It should be followed by a call to ReSTUN.
|
// It should be followed by a call to ReSTUN.
|
||||||
func (c *Conn) Rebind() {
|
func (c *Conn) Rebind() {
|
||||||
metricRebindCalls.Add(1)
|
metricRebindCalls.Add(1)
|
||||||
|
c.health.SetHealthy(sendToBlockedWithEPERMWarning)
|
||||||
if err := c.rebind(keepCurrentPort); err != nil {
|
if err := c.rebind(keepCurrentPort); err != nil {
|
||||||
c.logf("%v", err)
|
c.logf("%v", err)
|
||||||
return
|
return
|
||||||
|
@ -2967,29 +2967,20 @@ func TestMaybeRebindOnError(t *testing.T) {
|
|||||||
|
|
||||||
err := fmt.Errorf("outer err: %w", syscall.EPERM)
|
err := fmt.Errorf("outer err: %w", syscall.EPERM)
|
||||||
|
|
||||||
t.Run("darwin-rebind", func(t *testing.T) {
|
t.Run("rebind", func(t *testing.T) {
|
||||||
conn := newTestConn(t)
|
conn := newTestConn(t)
|
||||||
defer conn.Close()
|
defer conn.Close()
|
||||||
rebound := conn.maybeRebindOnError("darwin", err)
|
rebound := conn.maybeRebindOnError(err)
|
||||||
if !rebound {
|
if !rebound {
|
||||||
t.Errorf("darwin should rebind on syscall.EPERM")
|
t.Errorf("darwin should rebind on syscall.EPERM")
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
t.Run("linux-not-rebind", func(t *testing.T) {
|
|
||||||
conn := newTestConn(t)
|
|
||||||
defer conn.Close()
|
|
||||||
rebound := conn.maybeRebindOnError("linux", err)
|
|
||||||
if rebound {
|
|
||||||
t.Errorf("linux should not rebind on syscall.EPERM")
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
t.Run("no-frequent-rebind", func(t *testing.T) {
|
t.Run("no-frequent-rebind", func(t *testing.T) {
|
||||||
conn := newTestConn(t)
|
conn := newTestConn(t)
|
||||||
defer conn.Close()
|
defer conn.Close()
|
||||||
conn.lastEPERMRebind.Store(time.Now().Add(-1 * time.Second))
|
conn.lastEPERMRebind.Store(time.Now().Add(-60 * time.Second))
|
||||||
rebound := conn.maybeRebindOnError("darwin", err)
|
rebound := conn.maybeRebindOnError(err)
|
||||||
if rebound {
|
if rebound {
|
||||||
t.Errorf("darwin should not rebind on syscall.EPERM within 5 seconds of last")
|
t.Errorf("darwin should not rebind on syscall.EPERM within 5 seconds of last")
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user