mirror of
https://github.com/tailscale/tailscale.git
synced 2024-12-03 23:15:42 +00:00
cb96b14bf4
Upstream netaddr has a change that makes it alloc-free, so it's safe to use in hot codepaths. This gets rid of one of the many IP types in our codebase. Performance is currently worse across the board. This is likely due in part to netaddr.IP being a larger value type (4b -> 24b for IPv4, 16b -> 24b for IPv6), and in other part due to missing low-hanging fruit optimizations in netaddr. However, the regression is less bad than it looks at first glance, because we'd micro-optimized packet.IP* in the past few weeks. This change drops us back to roughly where we were at the 1.2 release, but with the benefit of a significant code and architectural simplification. name old time/op new time/op delta pkg:tailscale.com/net/packet goos:linux goarch:amd64 Decode/tcp4-8 12.2ns ± 5% 29.7ns ± 2% +142.32% (p=0.008 n=5+5) Decode/tcp6-8 12.6ns ± 3% 65.1ns ± 2% +418.47% (p=0.008 n=5+5) Decode/udp4-8 11.8ns ± 3% 30.5ns ± 2% +157.94% (p=0.008 n=5+5) Decode/udp6-8 27.1ns ± 1% 65.7ns ± 2% +142.36% (p=0.016 n=4+5) Decode/icmp4-8 24.6ns ± 2% 30.5ns ± 2% +23.65% (p=0.016 n=4+5) Decode/icmp6-8 22.9ns ±51% 65.5ns ± 2% +186.19% (p=0.008 n=5+5) Decode/igmp-8 18.1ns ±44% 30.2ns ± 1% +66.89% (p=0.008 n=5+5) Decode/unknown-8 20.8ns ± 1% 10.6ns ± 9% -49.11% (p=0.016 n=4+5) pkg:tailscale.com/wgengine/filter goos:linux goarch:amd64 Filter/icmp4-8 30.5ns ± 1% 77.9ns ± 3% +155.01% (p=0.008 n=5+5) Filter/tcp4_syn_in-8 43.7ns ± 3% 123.0ns ± 3% +181.72% (p=0.008 n=5+5) Filter/tcp4_syn_out-8 24.5ns ± 2% 45.7ns ± 6% +86.22% (p=0.008 n=5+5) Filter/udp4_in-8 64.8ns ± 1% 210.0ns ± 2% +223.87% (p=0.008 n=5+5) Filter/udp4_out-8 119ns ± 0% 278ns ± 0% +133.78% (p=0.016 n=4+5) Filter/icmp6-8 40.3ns ± 2% 204.4ns ± 4% +407.70% (p=0.008 n=5+5) Filter/tcp6_syn_in-8 35.3ns ± 3% 199.2ns ± 2% +464.95% (p=0.008 n=5+5) Filter/tcp6_syn_out-8 32.8ns ± 2% 81.0ns ± 2% +147.10% (p=0.008 n=5+5) Filter/udp6_in-8 106ns ± 2% 290ns ± 2% +174.48% (p=0.008 n=5+5) Filter/udp6_out-8 184ns ± 2% 314ns ± 3% +70.43% (p=0.016 n=4+5) pkg:tailscale.com/wgengine/tstun goos:linux goarch:amd64 Write-8 9.02ns ± 3% 8.92ns ± 1% ~ (p=0.421 n=5+5) name old alloc/op new alloc/op delta pkg:tailscale.com/net/packet goos:linux goarch:amd64 Decode/tcp4-8 0.00B 0.00B ~ (all equal) Decode/tcp6-8 0.00B 0.00B ~ (all equal) Decode/udp4-8 0.00B 0.00B ~ (all equal) Decode/udp6-8 0.00B 0.00B ~ (all equal) Decode/icmp4-8 0.00B 0.00B ~ (all equal) Decode/icmp6-8 0.00B 0.00B ~ (all equal) Decode/igmp-8 0.00B 0.00B ~ (all equal) Decode/unknown-8 0.00B 0.00B ~ (all equal) pkg:tailscale.com/wgengine/filter goos:linux goarch:amd64 Filter/icmp4-8 0.00B 0.00B ~ (all equal) Filter/tcp4_syn_in-8 0.00B 0.00B ~ (all equal) Filter/tcp4_syn_out-8 0.00B 0.00B ~ (all equal) Filter/udp4_in-8 0.00B 0.00B ~ (all equal) Filter/udp4_out-8 16.0B ± 0% 64.0B ± 0% +300.00% (p=0.008 n=5+5) Filter/icmp6-8 0.00B 0.00B ~ (all equal) Filter/tcp6_syn_in-8 0.00B 0.00B ~ (all equal) Filter/tcp6_syn_out-8 0.00B 0.00B ~ (all equal) Filter/udp6_in-8 0.00B 0.00B ~ (all equal) Filter/udp6_out-8 48.0B ± 0% 64.0B ± 0% +33.33% (p=0.008 n=5+5) name old allocs/op new allocs/op delta pkg:tailscale.com/net/packet goos:linux goarch:amd64 Decode/tcp4-8 0.00 0.00 ~ (all equal) Decode/tcp6-8 0.00 0.00 ~ (all equal) Decode/udp4-8 0.00 0.00 ~ (all equal) Decode/udp6-8 0.00 0.00 ~ (all equal) Decode/icmp4-8 0.00 0.00 ~ (all equal) Decode/icmp6-8 0.00 0.00 ~ (all equal) Decode/igmp-8 0.00 0.00 ~ (all equal) Decode/unknown-8 0.00 0.00 ~ (all equal) pkg:tailscale.com/wgengine/filter goos:linux goarch:amd64 Filter/icmp4-8 0.00 0.00 ~ (all equal) Filter/tcp4_syn_in-8 0.00 0.00 ~ (all equal) Filter/tcp4_syn_out-8 0.00 0.00 ~ (all equal) Filter/udp4_in-8 0.00 0.00 ~ (all equal) Filter/udp4_out-8 1.00 ± 0% 1.00 ± 0% ~ (all equal) Filter/icmp6-8 0.00 0.00 ~ (all equal) Filter/tcp6_syn_in-8 0.00 0.00 ~ (all equal) Filter/tcp6_syn_out-8 0.00 0.00 ~ (all equal) Filter/udp6_in-8 0.00 0.00 ~ (all equal) Filter/udp6_out-8 1.00 ± 0% 1.00 ± 0% ~ (all equal) Signed-off-by: David Anderson <danderson@tailscale.com>
423 lines
11 KiB
Go
423 lines
11 KiB
Go
// Copyright (c) 2020 Tailscale Inc & AUTHORS All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package packet
|
|
|
|
import (
|
|
"encoding/binary"
|
|
"fmt"
|
|
"net"
|
|
"strings"
|
|
|
|
"inet.af/netaddr"
|
|
"tailscale.com/types/strbuilder"
|
|
)
|
|
|
|
// RFC1858: prevent overlapping fragment attacks.
|
|
const minFrag = 60 + 20 // max IPv4 header + basic TCP header
|
|
|
|
const (
|
|
TCPSyn = 0x02
|
|
TCPAck = 0x10
|
|
TCPSynAck = TCPSyn | TCPAck
|
|
)
|
|
|
|
// Parsed is a minimal decoding of a packet suitable for use in filters.
|
|
type Parsed struct {
|
|
// b is the byte buffer that this decodes.
|
|
b []byte
|
|
// subofs is the offset of IP subprotocol.
|
|
subofs int
|
|
// dataofs is the offset of IP subprotocol payload.
|
|
dataofs int
|
|
// length is the total length of the packet.
|
|
// This is not the same as len(b) because b can have trailing zeros.
|
|
length int
|
|
|
|
// IPVersion is the IP protocol version of the packet (4 or
|
|
// 6), or 0 if the packet doesn't look like IPv4 or IPv6.
|
|
IPVersion uint8
|
|
// IPProto is the IP subprotocol (UDP, TCP, etc.). Valid iff IPVersion != 0.
|
|
IPProto IPProto
|
|
// SrcIP4 is the source address. Family matches IPVersion. Port is
|
|
// valid iff IPProto == TCP || IPProto == UDP.
|
|
Src netaddr.IPPort
|
|
// DstIP4 is the destination address. Family matches IPVersion.
|
|
Dst netaddr.IPPort
|
|
// TCPFlags is the packet's TCP flag bigs. Valid iff IPProto == TCP.
|
|
TCPFlags uint8
|
|
}
|
|
|
|
func (p *Parsed) String() string {
|
|
if p.IPVersion != 4 && p.IPVersion != 6 {
|
|
return "Unknown{???}"
|
|
}
|
|
|
|
sb := strbuilder.Get()
|
|
sb.WriteString(p.IPProto.String())
|
|
sb.WriteByte('{')
|
|
writeIPPort(sb, p.Src)
|
|
sb.WriteString(" > ")
|
|
writeIPPort(sb, p.Dst)
|
|
sb.WriteByte('}')
|
|
return sb.String()
|
|
}
|
|
|
|
// writeIPPort writes ipp.String() into sb, with fewer allocations.
|
|
//
|
|
// TODO: make netaddr more efficient in this area, and retire this func.
|
|
func writeIPPort(sb *strbuilder.Builder, ipp netaddr.IPPort) {
|
|
if ipp.IP.Is4() {
|
|
raw := ipp.IP.As4()
|
|
sb.WriteUint(uint64(raw[0]))
|
|
sb.WriteByte('.')
|
|
sb.WriteUint(uint64(raw[1]))
|
|
sb.WriteByte('.')
|
|
sb.WriteUint(uint64(raw[2]))
|
|
sb.WriteByte('.')
|
|
sb.WriteUint(uint64(raw[3]))
|
|
sb.WriteByte(':')
|
|
} else {
|
|
sb.WriteByte('[')
|
|
sb.WriteString(ipp.IP.String()) // TODO: faster?
|
|
sb.WriteString("]:")
|
|
}
|
|
sb.WriteUint(uint64(ipp.Port))
|
|
}
|
|
|
|
// Decode extracts data from the packet in b into q.
|
|
// It performs extremely simple packet decoding for basic IPv4 packet types.
|
|
// It extracts only the subprotocol id, IP addresses, and (if any) ports,
|
|
// and shouldn't need any memory allocation.
|
|
func (q *Parsed) Decode(b []byte) {
|
|
q.b = b
|
|
|
|
if len(b) < 1 {
|
|
q.IPVersion = 0
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
|
|
q.IPVersion = b[0] >> 4
|
|
switch q.IPVersion {
|
|
case 4:
|
|
q.decode4(b)
|
|
case 6:
|
|
q.decode6(b)
|
|
default:
|
|
q.IPVersion = 0
|
|
q.IPProto = Unknown
|
|
}
|
|
}
|
|
|
|
func (q *Parsed) decode4(b []byte) {
|
|
if len(b) < ip4HeaderLength {
|
|
q.IPVersion = 0
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
|
|
// Check that it's IPv4.
|
|
q.IPProto = IPProto(b[9])
|
|
q.length = int(binary.BigEndian.Uint16(b[2:4]))
|
|
if len(b) < q.length {
|
|
// Packet was cut off before full IPv4 length.
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
|
|
// If it's valid IPv4, then the IP addresses are valid
|
|
q.Src.IP = netaddr.IPv4(b[12], b[13], b[14], b[15])
|
|
q.Dst.IP = netaddr.IPv4(b[16], b[17], b[18], b[19])
|
|
|
|
q.subofs = int((b[0] & 0x0F) << 2)
|
|
if q.subofs > q.length {
|
|
// next-proto starts beyond end of packet.
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
sub := b[q.subofs:]
|
|
sub = sub[:len(sub):len(sub)] // help the compiler do bounds check elimination
|
|
|
|
// We don't care much about IP fragmentation, except insofar as it's
|
|
// used for firewall bypass attacks. The trick is make the first
|
|
// fragment of a TCP or UDP packet so short that it doesn't fit
|
|
// the TCP or UDP header, so we can't read the port, in hope that
|
|
// it'll sneak past. Then subsequent fragments fill it in, but we're
|
|
// missing the first part of the header, so we can't read that either.
|
|
//
|
|
// A "perfectly correct" implementation would have to reassemble
|
|
// fragments before deciding what to do. But the truth is there's
|
|
// zero reason to send such a short first fragment, so we can treat
|
|
// it as Unknown. We can also treat any subsequent fragment that starts
|
|
// at such a low offset as Unknown.
|
|
fragFlags := binary.BigEndian.Uint16(b[6:8])
|
|
moreFrags := (fragFlags & 0x20) != 0
|
|
fragOfs := fragFlags & 0x1FFF
|
|
if fragOfs == 0 {
|
|
// This is the first fragment
|
|
if moreFrags && len(sub) < minFrag {
|
|
// Suspiciously short first fragment, dump it.
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
// otherwise, this is either non-fragmented (the usual case)
|
|
// or a big enough initial fragment that we can read the
|
|
// whole subprotocol header.
|
|
switch q.IPProto {
|
|
case ICMPv4:
|
|
if len(sub) < icmp4HeaderLength {
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
q.Src.Port = 0
|
|
q.Dst.Port = 0
|
|
q.dataofs = q.subofs + icmp4HeaderLength
|
|
return
|
|
case IGMP:
|
|
// Keep IPProto, but don't parse anything else
|
|
// out.
|
|
return
|
|
case TCP:
|
|
if len(sub) < tcpHeaderLength {
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
q.Src.Port = binary.BigEndian.Uint16(sub[0:2])
|
|
q.Dst.Port = binary.BigEndian.Uint16(sub[2:4])
|
|
q.TCPFlags = sub[13] & 0x3F
|
|
headerLength := (sub[12] & 0xF0) >> 2
|
|
q.dataofs = q.subofs + int(headerLength)
|
|
return
|
|
case UDP:
|
|
if len(sub) < udpHeaderLength {
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
q.Src.Port = binary.BigEndian.Uint16(sub[0:2])
|
|
q.Dst.Port = binary.BigEndian.Uint16(sub[2:4])
|
|
q.dataofs = q.subofs + udpHeaderLength
|
|
return
|
|
default:
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
} else {
|
|
// This is a fragment other than the first one.
|
|
if fragOfs < minFrag {
|
|
// First frag was suspiciously short, so we can't
|
|
// trust the followup either.
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
// otherwise, we have to permit the fragment to slide through.
|
|
// Second and later fragments don't have sub-headers.
|
|
// Ideally, we would drop fragments that we can't identify,
|
|
// but that would require statefulness. Anyway, receivers'
|
|
// kernels know to drop fragments where the initial fragment
|
|
// doesn't arrive.
|
|
q.IPProto = Fragment
|
|
return
|
|
}
|
|
}
|
|
|
|
func (q *Parsed) decode6(b []byte) {
|
|
if len(b) < ip6HeaderLength {
|
|
q.IPVersion = 0
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
|
|
q.IPProto = IPProto(b[6])
|
|
q.length = int(binary.BigEndian.Uint16(b[4:6])) + ip6HeaderLength
|
|
if len(b) < q.length {
|
|
// Packet was cut off before the full IPv6 length.
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
|
|
// okay to ignore `ok` here, because IPs pulled from packets are
|
|
// always well-formed stdlib IPs.
|
|
q.Src.IP, _ = netaddr.FromStdIP(net.IP(b[8:24]))
|
|
q.Dst.IP, _ = netaddr.FromStdIP(net.IP(b[24:40]))
|
|
|
|
// We don't support any IPv6 extension headers. Don't try to
|
|
// be clever. Therefore, the IP subprotocol always starts at
|
|
// byte 40.
|
|
//
|
|
// Note that this means we don't support fragmentation in
|
|
// IPv6. This is fine, because IPv6 strongly mandates that you
|
|
// should not fragment, which makes fragmentation on the open
|
|
// internet extremely uncommon.
|
|
//
|
|
// This also means we don't support IPSec headers (AH/ESP), or
|
|
// IPv6 jumbo frames. Those will get marked Unknown and
|
|
// dropped.
|
|
q.subofs = 40
|
|
sub := b[q.subofs:]
|
|
sub = sub[:len(sub):len(sub)] // help the compiler do bounds check elimination
|
|
|
|
switch q.IPProto {
|
|
case ICMPv6:
|
|
if len(sub) < icmp6HeaderLength {
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
q.Src.Port = 0
|
|
q.Dst.Port = 0
|
|
q.dataofs = q.subofs + icmp6HeaderLength
|
|
case TCP:
|
|
if len(sub) < tcpHeaderLength {
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
q.Src.Port = binary.BigEndian.Uint16(sub[0:2])
|
|
q.Dst.Port = binary.BigEndian.Uint16(sub[2:4])
|
|
q.TCPFlags = sub[13] & 0x3F
|
|
headerLength := (sub[12] & 0xF0) >> 2
|
|
q.dataofs = q.subofs + int(headerLength)
|
|
return
|
|
case UDP:
|
|
if len(sub) < udpHeaderLength {
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
q.Src.Port = binary.BigEndian.Uint16(sub[0:2])
|
|
q.Dst.Port = binary.BigEndian.Uint16(sub[2:4])
|
|
q.dataofs = q.subofs + udpHeaderLength
|
|
default:
|
|
q.IPProto = Unknown
|
|
return
|
|
}
|
|
}
|
|
|
|
func (q *Parsed) IP4Header() IP4Header {
|
|
if q.IPVersion != 4 {
|
|
panic("IP4Header called on non-IPv4 Parsed")
|
|
}
|
|
ipid := binary.BigEndian.Uint16(q.b[4:6])
|
|
return IP4Header{
|
|
IPID: ipid,
|
|
IPProto: q.IPProto,
|
|
Src: q.Src.IP,
|
|
Dst: q.Dst.IP,
|
|
}
|
|
}
|
|
|
|
func (q *Parsed) ICMP4Header() ICMP4Header {
|
|
if q.IPVersion != 4 {
|
|
panic("IP4Header called on non-IPv4 Parsed")
|
|
}
|
|
return ICMP4Header{
|
|
IP4Header: q.IP4Header(),
|
|
Type: ICMP4Type(q.b[q.subofs+0]),
|
|
Code: ICMP4Code(q.b[q.subofs+1]),
|
|
}
|
|
}
|
|
|
|
func (q *Parsed) UDP4Header() UDP4Header {
|
|
if q.IPVersion != 4 {
|
|
panic("IP4Header called on non-IPv4 Parsed")
|
|
}
|
|
return UDP4Header{
|
|
IP4Header: q.IP4Header(),
|
|
SrcPort: q.Src.Port,
|
|
DstPort: q.Dst.Port,
|
|
}
|
|
}
|
|
|
|
// Buffer returns the entire packet buffer.
|
|
// This is a read-only view; that is, q retains the ownership of the buffer.
|
|
func (q *Parsed) Buffer() []byte {
|
|
return q.b
|
|
}
|
|
|
|
// Payload returns the payload of the IP subprotocol section.
|
|
// This is a read-only view; that is, q retains the ownership of the buffer.
|
|
func (q *Parsed) Payload() []byte {
|
|
return q.b[q.dataofs:q.length]
|
|
}
|
|
|
|
// IsTCPSyn reports whether q is a TCP SYN packet
|
|
// (i.e. the first packet in a new connection).
|
|
func (q *Parsed) IsTCPSyn() bool {
|
|
return (q.TCPFlags & TCPSynAck) == TCPSyn
|
|
}
|
|
|
|
// IsError reports whether q is an ICMP "Error" packet.
|
|
func (q *Parsed) IsError() bool {
|
|
switch q.IPProto {
|
|
case ICMPv4:
|
|
if len(q.b) < q.subofs+8 {
|
|
return false
|
|
}
|
|
t := ICMP4Type(q.b[q.subofs])
|
|
return t == ICMP4Unreachable || t == ICMP4TimeExceeded
|
|
case ICMPv6:
|
|
if len(q.b) < q.subofs+8 {
|
|
return false
|
|
}
|
|
t := ICMP6Type(q.b[q.subofs])
|
|
return t == ICMP6Unreachable || t == ICMP6TimeExceeded
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
// IsEchoRequest reports whether q is an ICMP Echo Request.
|
|
func (q *Parsed) IsEchoRequest() bool {
|
|
switch q.IPProto {
|
|
case ICMPv4:
|
|
return len(q.b) >= q.subofs+8 && ICMP4Type(q.b[q.subofs]) == ICMP4EchoRequest && ICMP4Code(q.b[q.subofs+1]) == ICMP4NoCode
|
|
case ICMPv6:
|
|
return len(q.b) >= q.subofs+8 && ICMP6Type(q.b[q.subofs]) == ICMP6EchoRequest && ICMP6Code(q.b[q.subofs+1]) == ICMP6NoCode
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
// IsEchoRequest reports whether q is an IPv4 ICMP Echo Response.
|
|
func (q *Parsed) IsEchoResponse() bool {
|
|
switch q.IPProto {
|
|
case ICMPv4:
|
|
return len(q.b) >= q.subofs+8 && ICMP4Type(q.b[q.subofs]) == ICMP4EchoReply && ICMP4Code(q.b[q.subofs+1]) == ICMP4NoCode
|
|
case ICMPv6:
|
|
return len(q.b) >= q.subofs+8 && ICMP6Type(q.b[q.subofs]) == ICMP6EchoReply && ICMP6Code(q.b[q.subofs+1]) == ICMP6NoCode
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
func Hexdump(b []byte) string {
|
|
out := new(strings.Builder)
|
|
for i := 0; i < len(b); i += 16 {
|
|
if i > 0 {
|
|
fmt.Fprintf(out, "\n")
|
|
}
|
|
fmt.Fprintf(out, " %04x ", i)
|
|
j := 0
|
|
for ; j < 16 && i+j < len(b); j++ {
|
|
if j == 8 {
|
|
fmt.Fprintf(out, " ")
|
|
}
|
|
fmt.Fprintf(out, "%02x ", b[i+j])
|
|
}
|
|
for ; j < 16; j++ {
|
|
if j == 8 {
|
|
fmt.Fprintf(out, " ")
|
|
}
|
|
fmt.Fprintf(out, " ")
|
|
}
|
|
fmt.Fprintf(out, " ")
|
|
for j = 0; j < 16 && i+j < len(b); j++ {
|
|
if b[i+j] >= 32 && b[i+j] < 128 {
|
|
fmt.Fprintf(out, "%c", b[i+j])
|
|
} else {
|
|
fmt.Fprintf(out, ".")
|
|
}
|
|
}
|
|
}
|
|
return out.String()
|
|
}
|