net/netcheck,wgengine/magicsock: add potential workaround for Palo Alto DIPP behavior

Palo Alto firewalls have a typically hard NAT, but also have a mode
called Persistent DIPP that is supposed to provide consistent port
mapping suitable for STUN resolution of public ports. Persistent DIPP
works initially on most Palo Alto firewalls, but some models/software
versions have a bug which this works around.

The bug symptom presents as follows:

- STUN sessions resolve a correct public IP:port to start with
- Much later, netchecks still return the same IP:port for a subset of
  connections, most often the user's active DERP, and/or the port related
  to sustained traffic.
- However, new public addresses — such as those observed via every other
  DERP node in a full netcheck — will now observe a new IP:port.
- At this point of observation, new inbound connections will only
  succeed to the new IP:Port observed, and existing/old sessions will
  only work to the old binding.

In this patch we now advertise the lowest latency global endpoint
discovered as we always have, but in addition any global endpoints that
are observed more than once in a single netcheck report. This should
provide viable endpoints for potential connection establishment across
a NAT with this behavior.

Updates #19106

Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
James Tucker 2024-05-16 15:58:19 -07:00
parent 024376e7e1
commit 7c15156d4c
No known key found for this signature in database
3 changed files with 98 additions and 8 deletions

View File

@ -115,6 +115,9 @@ type Report struct {
RegionV4Latency map[int]time.Duration // keyed by DERP Region ID
RegionV6Latency map[int]time.Duration // keyed by DERP Region ID
GlobalV4Counters map[string]int // keyed by IP:port, number of times observed
GlobalV6Counters map[string]int // keyed by [IP]:port, number of times observed
GlobalV4 string // ip:port of global IPv4
GlobalV6 string // [ip]:port of global IPv6
@ -125,6 +128,44 @@ type Report struct {
// TODO: update Clone when adding new fields
}
// GetGlobalAddrs returns the global IPv4 and IPv6 address/port pairs observed
// during the netcheck. For each family, the best-latency endpoint (if any)
// comes first, followed by every other endpoint that was seen more than once
// in the report. Endpoints observed only a single time are omitted, as they
// are most likely artifacts of a hard NAT.
func (r *Report) GetGlobalAddrs() ([]netip.AddrPort, []netip.AddrPort) {
	// collect builds the endpoint list for one address family: the
	// best-latency endpoint first, then any repeatedly observed endpoints.
	//
	// Including repeated observations works around bad NATs that hand out
	// new mappings for new STUN sessions mid-expiration while a live mapping
	// for the best-latency endpoint still exists. This has been observed on
	// some Palo Alto Networks firewalls, where new traffic to the old
	// endpoint fails but new traffic to the newly discovered endpoints
	// succeeds.
	collect := func(best string, counters map[string]int) []netip.AddrPort {
		var addrs []netip.AddrPort
		if best != "" {
			addrs = append(addrs, netip.MustParseAddrPort(best))
		}
		for ipp, n := range counters {
			if ipp == best {
				// Already added above.
				continue
			}
			if n > 1 {
				addrs = append(addrs, netip.MustParseAddrPort(ipp))
			}
		}
		return addrs
	}
	return collect(r.GlobalV4, r.GlobalV4Counters), collect(r.GlobalV6, r.GlobalV6Counters)
}
// AnyPortMappingChecked reports whether any of UPnP, PMP, or PCP are non-empty.
func (r *Report) AnyPortMappingChecked() bool {
return r.UPnP != "" || r.PMP != "" || r.PCP != ""
@ -675,11 +716,13 @@ func (rs *reportState) addNodeLatency(node *tailcfg.DERPNode, ipp netip.AddrPort
updateLatency(ret.RegionV6Latency, node.RegionID, d)
ret.IPv6 = true
ret.GlobalV6 = ipPortStr
mak.Set(&ret.GlobalV6Counters, ipPortStr, ret.GlobalV6Counters[ipPortStr]+1)
// TODO: track MappingVariesByDestIP for IPv6
// too? Would be sad if so, but who knows.
case ipp.Addr().Is4():
updateLatency(ret.RegionV4Latency, node.RegionID, d)
ret.IPv4 = true
mak.Set(&ret.GlobalV4Counters, ipPortStr, ret.GlobalV4Counters[ipPortStr]+1)
if rs.gotEP4 == "" {
rs.gotEP4 = ipPortStr
ret.GlobalV4 = ipPortStr

View File

@ -11,6 +11,7 @@ import (
"net/http"
"net/netip"
"reflect"
"slices"
"sort"
"strconv"
"strings"
@ -188,6 +189,51 @@ func TestBasic(t *testing.T) {
if r.PreferredDERP != 1 {
t.Errorf("PreferredDERP = %v; want 1", r.PreferredDERP)
}
v4Addrs, _ := r.GetGlobalAddrs()
if len(v4Addrs) != 1 {
t.Error("expected one global IPv4 address")
}
if got, want := v4Addrs[0], netip.MustParseAddrPort(r.GlobalV4); got != want {
t.Errorf("got %v; want %v", got, want)
}
}
// TestMultiGlobalAddressMapping verifies that GetGlobalAddrs reports the
// best-latency endpoint plus any endpoint observed more than once, while
// dropping endpoints seen only a single time.
func TestMultiGlobalAddressMapping(t *testing.T) {
	client := &Client{
		Logf: t.Logf,
	}
	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
	defer cancel()
	if err := client.Standalone(ctx, "127.0.0.1:0"); err != nil {
		t.Fatal(err)
	}
	state := &reportState{
		c:             client,
		start:         time.Now(),
		report:        newReport(),
		sentHairCheck: true, // prevent hair check start, not relevant here
	}
	node := &tailcfg.DERPNode{}
	addrA := netip.MustParseAddrPort("127.0.0.1:1234")
	addrB := netip.MustParseAddrPort("127.0.0.1:2345")
	addrC := netip.MustParseAddrPort("127.0.0.1:3456")
	// addrA: observed once, becomes the best-latency endpoint.
	state.addNodeLatency(node, addrA, 10*time.Millisecond)
	// addrB: observed once only; should be excluded from the result.
	state.addNodeLatency(node, addrB, 11*time.Millisecond)
	// addrC: observed twice; should be included in the result.
	state.addNodeLatency(node, addrC, 12*time.Millisecond)
	state.addNodeLatency(node, addrC, 13*time.Millisecond)
	got, _ := state.report.GetGlobalAddrs()
	want := []netip.AddrPort{addrA, addrC}
	if !slices.Equal(got, want) {
		t.Errorf("got global addresses: %v, want %v", got, want)
	}
}
func TestWorksWhenUDPBlocked(t *testing.T) {

View File

@ -884,23 +884,24 @@ func (c *Conn) determineEndpoints(ctx context.Context) ([]tailcfg.Endpoint, erro
c.setNetInfoHavePortMap()
}
if nr.GlobalV4 != "" {
addAddr(ipp(nr.GlobalV4), tailcfg.EndpointSTUN)
v4Addrs, v6Addrs := nr.GetGlobalAddrs()
for _, addr := range v4Addrs {
addAddr(addr, tailcfg.EndpointSTUN)
}
for _, addr := range v6Addrs {
addAddr(addr, tailcfg.EndpointSTUN)
}
if len(v4Addrs) == 1 {
// If they're behind a hard NAT and are using a fixed
// port locally, assume they might've added a static
// port mapping on their router to the same explicit
// port that tailscaled is running with. Worst case
// it's an invalid candidate mapping.
if port := c.port.Load(); nr.MappingVariesByDestIP.EqualBool(true) && port != 0 {
if ip, _, err := net.SplitHostPort(nr.GlobalV4); err == nil {
addAddr(ipp(net.JoinHostPort(ip, strconv.Itoa(int(port)))), tailcfg.EndpointSTUN4LocalPort)
}
addAddr(netip.AddrPortFrom(v4Addrs[0].Addr(), uint16(port)), tailcfg.EndpointSTUN4LocalPort)
}
}
if nr.GlobalV6 != "" {
addAddr(ipp(nr.GlobalV6), tailcfg.EndpointSTUN)
}
// Update our set of endpoints by adding any endpoints that we
// previously found but haven't expired yet. This also updates the