mirror of
https://github.com/tailscale/tailscale.git
synced 2025-01-05 14:57:49 +00:00
wgengine/magicsock: use AF_PACKET socket + BPF to read disco messages
This is entirely optional (i.e. failing in this code is non-fatal) and only enabled on Linux for now. Additionally, this new behaviour can be disabled by setting the TS_DEBUG_DISABLE_AF_PACKET environment variable. Updates #3824 Replaces #5474 Co-authored-by: Andrew Dunham <andrew@du.nham.ca> Signed-off-by: David Anderson <danderson@tailscale.com>
This commit is contained in:
parent
58f35261d0
commit
c72caa6672
@ -290,7 +290,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
|
||||
W tailscale.com/wf from tailscale.com/cmd/tailscaled
|
||||
tailscale.com/wgengine from tailscale.com/ipn/ipnlocal+
|
||||
tailscale.com/wgengine/filter from tailscale.com/control/controlclient+
|
||||
tailscale.com/wgengine/magicsock from tailscale.com/ipn/ipnlocal+
|
||||
💣 tailscale.com/wgengine/magicsock from tailscale.com/ipn/ipnlocal+
|
||||
tailscale.com/wgengine/monitor from tailscale.com/control/controlclient+
|
||||
tailscale.com/wgengine/netstack from tailscale.com/cmd/tailscaled+
|
||||
tailscale.com/wgengine/router from tailscale.com/ipn/ipnlocal+
|
||||
|
@ -14,6 +14,7 @@
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"io"
|
||||
"math"
|
||||
"math/rand"
|
||||
"net"
|
||||
@ -60,6 +61,16 @@
|
||||
"tailscale.com/wgengine/monitor"
|
||||
)
|
||||
|
||||
const (
|
||||
// These are disco.Magic in big-endian form, 4 then 2 bytes. The
|
||||
// BPF filters need the magic in this format to match on it. Used
|
||||
// only in magicsock_linux.go, but defined here so that the test
|
||||
// which verifies this is the correct magic doesn't also need a
|
||||
// _linux variant.
|
||||
discoMagic1 = 0x5453f09f
|
||||
discoMagic2 = 0x92ac
|
||||
)
|
||||
|
||||
// useDerpRoute reports whether magicsock should enable the DERP
|
||||
// return path optimization (Issue 150).
|
||||
func useDerpRoute() bool {
|
||||
@ -254,6 +265,12 @@ type Conn struct {
|
||||
pconn4 *RebindingUDPConn
|
||||
pconn6 *RebindingUDPConn
|
||||
|
||||
// closeDisco4 and closeDisco6 are io.Closers to shut down the raw
|
||||
// disco packet receivers. If nil, no raw disco receiver is
|
||||
// running for the given family.
|
||||
closeDisco4 io.Closer
|
||||
closeDisco6 io.Closer
|
||||
|
||||
// netChecker is the prober that discovers local network
|
||||
// conditions, including the closest DERP relay and NAT mappings.
|
||||
netChecker *netcheck.Client
|
||||
@ -572,6 +589,19 @@ func NewConn(opts Options) (*Conn, error) {
|
||||
|
||||
c.ignoreSTUNPackets()
|
||||
|
||||
if d4, err := c.listenRawDisco("ip4"); err == nil {
|
||||
c.logf("[v1] using BPF disco receiver for IPv4")
|
||||
c.closeDisco4 = d4
|
||||
} else {
|
||||
c.logf("[v1] couldn't create raw v4 disco listener, using regular listener instead: %v", err)
|
||||
}
|
||||
if d6, err := c.listenRawDisco("ip6"); err == nil {
|
||||
c.logf("[v1] using BPF disco receiver for IPv6")
|
||||
c.closeDisco6 = d6
|
||||
} else {
|
||||
c.logf("[v1] couldn't create raw v6 disco listener, using regular listener instead: %v", err)
|
||||
}
|
||||
|
||||
return c, nil
|
||||
}
|
||||
|
||||
@ -1638,7 +1668,7 @@ func (c *Conn) receiveIPv6(b []byte) (int, conn.Endpoint, error) {
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint6); ok {
|
||||
if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint6, c.closeDisco6 == nil); ok {
|
||||
metricRecvDataIPv6.Add(1)
|
||||
return n, ep, nil
|
||||
}
|
||||
@ -1654,7 +1684,7 @@ func (c *Conn) receiveIPv4(b []byte) (n int, ep conn.Endpoint, err error) {
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint4); ok {
|
||||
if ep, ok := c.receiveIP(b[:n], ipp, &c.ippEndpoint4, c.closeDisco4 == nil); ok {
|
||||
metricRecvDataIPv4.Add(1)
|
||||
return n, ep, nil
|
||||
}
|
||||
@ -1665,12 +1695,18 @@ func (c *Conn) receiveIPv4(b []byte) (n int, ep conn.Endpoint, err error) {
|
||||
//
|
||||
// ok is whether this read should be reported up to wireguard-go (our
|
||||
// caller).
|
||||
func (c *Conn) receiveIP(b []byte, ipp netip.AddrPort, cache *ippEndpointCache) (ep *endpoint, ok bool) {
|
||||
func (c *Conn) receiveIP(b []byte, ipp netip.AddrPort, cache *ippEndpointCache, checkDisco bool) (ep *endpoint, ok bool) {
|
||||
if stun.Is(b) {
|
||||
c.stunReceiveFunc.Load()(b, ipp)
|
||||
return nil, false
|
||||
}
|
||||
if c.handleDiscoMessage(b, ipp, key.NodePublic{}) {
|
||||
if checkDisco {
|
||||
if c.handleDiscoMessage(b, ipp, key.NodePublic{}) {
|
||||
return nil, false
|
||||
}
|
||||
} else if disco.LooksLikeDiscoWrapper(b) {
|
||||
// Caller told us to ignore disco traffic, don't let it fall
|
||||
// through to wireguard-go.
|
||||
return nil, false
|
||||
}
|
||||
if !c.havePrivateKey.Load() {
|
||||
@ -2632,6 +2668,12 @@ func (c *connBind) Close() error {
|
||||
if c.pconn6 != nil {
|
||||
c.pconn6.Close()
|
||||
}
|
||||
if c.closeDisco4 != nil {
|
||||
c.closeDisco4.Close()
|
||||
}
|
||||
if c.closeDisco6 != nil {
|
||||
c.closeDisco6.Close()
|
||||
}
|
||||
// Send an empty read result to unblock receiveDERP,
|
||||
// which will then check connBind.Closed.
|
||||
// connBind.Closed takes c.mu, but c.derpRecvCh is buffered.
|
||||
@ -4192,4 +4234,8 @@ func (s derpAddrFamSelector) PreferIPv6() bool {
|
||||
// metricDERPHomeChange is how many times our DERP home region DI has
|
||||
// changed from non-zero to a different non-zero.
|
||||
metricDERPHomeChange = clientmetric.NewCounter("derp_home_change")
|
||||
|
||||
// Disco packets received bpf read path
|
||||
metricRecvDiscoPacketIPv4 = clientmetric.NewCounter("magicsock_disco_recv_bpf_ipv4")
|
||||
metricRecvDiscoPacketIPv6 = clientmetric.NewCounter("magicsock_disco_recv_bpf_ipv6")
|
||||
)
|
||||
|
17
wgengine/magicsock/magicsock_default.go
Normal file
17
wgengine/magicsock/magicsock_default.go
Normal file
@ -0,0 +1,17 @@
|
||||
// Copyright (c) 2022 Tailscale Inc & AUTHORS All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build !linux
|
||||
// +build !linux
|
||||
|
||||
package magicsock
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
)
|
||||
|
||||
func (c *Conn) listenRawDisco(family string) (io.Closer, error) {
|
||||
return nil, errors.New("raw disco listening not supported on this OS")
|
||||
}
|
260
wgengine/magicsock/magicsock_linux.go
Normal file
260
wgengine/magicsock/magicsock_linux.go
Normal file
@ -0,0 +1,260 @@
|
||||
// Copyright (c) 2022 Tailscale Inc & AUTHORS All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package magicsock
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
"net/netip"
|
||||
"time"
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/net/bpf"
|
||||
"golang.org/x/sys/unix"
|
||||
"tailscale.com/envknob"
|
||||
"tailscale.com/types/key"
|
||||
)
|
||||
|
||||
const (
|
||||
udpHeaderSize = 8
|
||||
ipv6FragmentHeaderSize = 8
|
||||
)
|
||||
|
||||
// Enable/disable using raw sockets to receive disco traffic.
|
||||
var debugDisableRawDisco = envknob.Bool("TS_DEBUG_DISABLE_RAW_DISCO")
|
||||
|
||||
// These are our BPF filters that we use for testing packets.
|
||||
var (
|
||||
magicsockFilterV4 = []bpf.Instruction{
|
||||
// For raw UDPv4 sockets, BPF receives the entire IP packet to
|
||||
// inspect.
|
||||
|
||||
// Disco packets are so small they should never get
|
||||
// fragmented, and we don't want to handle reassembly.
|
||||
bpf.LoadAbsolute{Off: 6, Size: 2},
|
||||
// More Fragments bit set means this is part of a fragmented packet.
|
||||
bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 0x2000, SkipTrue: 7, SkipFalse: 0},
|
||||
// Non-zero fragment offset with MF=0 means this is the last
|
||||
// fragment of packet.
|
||||
bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 0x1fff, SkipTrue: 6, SkipFalse: 0},
|
||||
|
||||
// Load IP header length into X register.
|
||||
bpf.LoadMemShift{Off: 0},
|
||||
|
||||
// Get the first 4 bytes of the UDP packet, compare with our magic number
|
||||
bpf.LoadIndirect{Off: udpHeaderSize, Size: 4},
|
||||
bpf.JumpIf{Cond: bpf.JumpEqual, Val: discoMagic1, SkipTrue: 0, SkipFalse: 3},
|
||||
|
||||
// Compare the next 2 bytes
|
||||
bpf.LoadIndirect{Off: udpHeaderSize + 4, Size: 2},
|
||||
bpf.JumpIf{Cond: bpf.JumpEqual, Val: uint32(discoMagic2), SkipTrue: 0, SkipFalse: 1},
|
||||
|
||||
// Accept the whole packet
|
||||
bpf.RetConstant{Val: 0xFFFFFFFF},
|
||||
|
||||
// Skip the packet
|
||||
bpf.RetConstant{Val: 0x0},
|
||||
}
|
||||
|
||||
// IPv6 is more complicated to filter, since we can have 0-to-N
|
||||
// extension headers following the IPv6 header. Since BPF can't
|
||||
// loop, we can't really parse these in a general way; instead, we
|
||||
// simply handle the case where we have no extension headers; any
|
||||
// packets with headers will be skipped. IPv6 extension headers
|
||||
// are sufficiently uncommon that we're willing to accept false
|
||||
// negatives here.
|
||||
//
|
||||
// The "proper" way to handle this would be to do minimal parsing in
|
||||
// BPF and more in-depth parsing of all IPv6 packets in userspace, but
|
||||
// on systems with a high volume of UDP that would be unacceptably slow
|
||||
// and thus we'd rather be conservative here and possibly not receive
|
||||
// disco packets rather than slow down the system.
|
||||
magicsockFilterV6 = []bpf.Instruction{
|
||||
// For raw UDPv6 sockets, BPF receives _only_ the UDP header onwards, not an entire IP packet.
|
||||
//
|
||||
// https://stackoverflow.com/questions/24514333/using-bpf-with-sock-dgram-on-linux-machine
|
||||
// https://blog.cloudflare.com/epbf_sockets_hop_distance/
|
||||
//
|
||||
// This is especially confusing because this *isn't* true for
|
||||
// IPv4; see the following code from the 'ping' utility that
|
||||
// corroborates this:
|
||||
//
|
||||
// https://github.com/iputils/iputils/blob/1ab5fa/ping/ping.c#L1667-L1676
|
||||
// https://github.com/iputils/iputils/blob/1ab5fa/ping/ping6_common.c#L933-L941
|
||||
|
||||
// Compare with our magic number. Start by loading and
|
||||
// comparing the first 4 bytes of the UDP payload.
|
||||
bpf.LoadAbsolute{Off: udpHeaderSize, Size: 4},
|
||||
bpf.JumpIf{Cond: bpf.JumpEqual, Val: discoMagic1, SkipTrue: 0, SkipFalse: 3},
|
||||
|
||||
// Compare the next 2 bytes
|
||||
bpf.LoadAbsolute{Off: udpHeaderSize + 4, Size: 2},
|
||||
bpf.JumpIf{Cond: bpf.JumpEqual, Val: discoMagic2, SkipTrue: 0, SkipFalse: 1},
|
||||
|
||||
// Accept the whole packet
|
||||
bpf.RetConstant{Val: 0xFFFFFFFF},
|
||||
|
||||
// Skip the packet
|
||||
bpf.RetConstant{Val: 0x0},
|
||||
}
|
||||
|
||||
testDiscoPacket = []byte{
|
||||
// Disco magic
|
||||
0x54, 0x53, 0xf0, 0x9f, 0x92, 0xac,
|
||||
// Sender key
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
// Nonce
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
}
|
||||
)
|
||||
|
||||
// listenRawDisco starts listening for disco packets on the given
|
||||
// address family, which must be "ip4" or "ip6", using a raw socket
|
||||
// and BPF filter.
|
||||
// https://github.com/tailscale/tailscale/issues/3824
|
||||
func (c *Conn) listenRawDisco(family string) (io.Closer, error) {
|
||||
if debugDisableRawDisco {
|
||||
return nil, errors.New("raw disco listening disabled by debug flag")
|
||||
}
|
||||
|
||||
var (
|
||||
network string
|
||||
addr string
|
||||
testAddr string
|
||||
prog []bpf.Instruction
|
||||
)
|
||||
switch family {
|
||||
case "ip4":
|
||||
network = "ip4:17"
|
||||
addr = "0.0.0.0"
|
||||
testAddr = "127.0.0.1:1"
|
||||
prog = magicsockFilterV4
|
||||
case "ip6":
|
||||
network = "ip6:17"
|
||||
addr = "::"
|
||||
testAddr = "[::1]:1"
|
||||
prog = magicsockFilterV6
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported address family %q", family)
|
||||
}
|
||||
|
||||
asm, err := bpf.Assemble(prog)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("assembling filter: %w", err)
|
||||
}
|
||||
|
||||
pc, err := net.ListenPacket(network, addr)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("creating packet conn: %w", err)
|
||||
}
|
||||
|
||||
if err := setBPF(pc, asm); err != nil {
|
||||
pc.Close()
|
||||
return nil, fmt.Errorf("installing BPF filter: %w", err)
|
||||
}
|
||||
|
||||
// If all the above succeeds, we should be ready to receive. Just
|
||||
// out of paranoia, check that we do receive a well-formed disco
|
||||
// packet.
|
||||
tc, err := net.ListenPacket("udp", net.JoinHostPort(addr, "0"))
|
||||
if err != nil {
|
||||
pc.Close()
|
||||
return nil, fmt.Errorf("creating disco test socket: %w", err)
|
||||
}
|
||||
defer tc.Close()
|
||||
if _, err := tc.(*net.UDPConn).WriteToUDPAddrPort(testDiscoPacket, netip.MustParseAddrPort(testAddr)); err != nil {
|
||||
pc.Close()
|
||||
return nil, fmt.Errorf("writing disco test packet: %w", err)
|
||||
}
|
||||
pc.SetReadDeadline(time.Now().Add(100 * time.Millisecond))
|
||||
var buf [1500]byte
|
||||
for {
|
||||
n, _, err := pc.ReadFrom(buf[:])
|
||||
if err != nil {
|
||||
pc.Close()
|
||||
return nil, fmt.Errorf("reading during raw disco self-test: %w", err)
|
||||
}
|
||||
if n < udpHeaderSize {
|
||||
continue
|
||||
}
|
||||
if !bytes.Equal(buf[udpHeaderSize:n], testDiscoPacket) {
|
||||
continue
|
||||
}
|
||||
break
|
||||
}
|
||||
pc.SetReadDeadline(time.Time{})
|
||||
|
||||
go c.receiveDisco(pc)
|
||||
return pc, nil
|
||||
}
|
||||
|
||||
func (c *Conn) receiveDisco(pc net.PacketConn) {
|
||||
var buf [1500]byte
|
||||
for {
|
||||
n, src, err := pc.ReadFrom(buf[:])
|
||||
if errors.Is(err, net.ErrClosed) {
|
||||
return
|
||||
} else if err != nil {
|
||||
c.logf("disco raw reader failed: %v", err)
|
||||
return
|
||||
}
|
||||
if n < udpHeaderSize {
|
||||
// Too small to be a valid UDP datagram, drop.
|
||||
continue
|
||||
}
|
||||
srcIP, ok := netip.AddrFromSlice(src.(*net.IPAddr).IP)
|
||||
if !ok {
|
||||
c.logf("[unexpected] PacketConn.ReadFrom returned not-an-IP %v in from", src)
|
||||
continue
|
||||
}
|
||||
srcPort := binary.BigEndian.Uint16(buf[:2])
|
||||
|
||||
if srcIP.Is4() {
|
||||
metricRecvDiscoPacketIPv4.Add(1)
|
||||
} else {
|
||||
metricRecvDiscoPacketIPv6.Add(1)
|
||||
}
|
||||
|
||||
c.handleDiscoMessage(buf[udpHeaderSize:n], netip.AddrPortFrom(srcIP, srcPort), key.NodePublic{})
|
||||
}
|
||||
}
|
||||
|
||||
// setBPF installs filter as the BPF filter on conn.
|
||||
// Ideally we would just use SetBPF as implemented in x/net/ipv4,
|
||||
// but x/net/ipv6 doesn't implement it. And once you've written
|
||||
// this code once, it turns out to be address family agnostic, so
|
||||
// we might as well use it on both and get to use a net.PacketConn
|
||||
// directly for both families instead of being stuck with
|
||||
// different types.
|
||||
func setBPF(conn net.PacketConn, filter []bpf.RawInstruction) error {
|
||||
sc, err := conn.(*net.IPConn).SyscallConn()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
prog := &unix.SockFprog{
|
||||
Len: uint16(len(filter)),
|
||||
Filter: (*unix.SockFilter)(unsafe.Pointer(&filter[0])),
|
||||
}
|
||||
var setErr error
|
||||
err = sc.Control(func(fd uintptr) {
|
||||
setErr = unix.SetsockoptSockFprog(int(fd), unix.SOL_SOCKET, unix.SO_ATTACH_FILTER, prog)
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if setErr != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
@ -32,6 +32,7 @@
|
||||
"golang.zx2c4.com/wireguard/tun/tuntest"
|
||||
"tailscale.com/derp"
|
||||
"tailscale.com/derp/derphttp"
|
||||
"tailscale.com/disco"
|
||||
"tailscale.com/ipn/ipnstate"
|
||||
"tailscale.com/net/netaddr"
|
||||
"tailscale.com/net/stun/stuntest"
|
||||
@ -1799,3 +1800,21 @@ func TestBlockForeverConnUnblocks(t *testing.T) {
|
||||
t.Fatal("timeout")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDiscoMagicMatches(t *testing.T) {
|
||||
// Convert our disco magic number into a uint32 and uint16 to test
|
||||
// against. We panic on an incorrect length here rather than try to be
|
||||
// generic with our BPF instructions below.
|
||||
//
|
||||
// Note that BPF uses network byte order (big-endian) when loading data
|
||||
// from a packet, so that is what we use to generate our magic numbers.
|
||||
if len(disco.Magic) != 6 {
|
||||
t.Fatalf("expected disco.Magic to be of length 6")
|
||||
}
|
||||
if m1 := binary.BigEndian.Uint32([]byte(disco.Magic[:4])); m1 != discoMagic1 {
|
||||
t.Errorf("first 4 bytes of disco magic don't match, got %v want %v", discoMagic1, m1)
|
||||
}
|
||||
if m2 := binary.BigEndian.Uint16([]byte(disco.Magic[4:6])); m2 != discoMagic2 {
|
||||
t.Errorf("last 2 bytes of disco magic don't match, got %v want %v", discoMagic2, m2)
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user