go.mod,net/tstun,wgengine/netstack: implement gVisor TCP GSO for Linux (#12869)

This commit implements TCP GSO for packets being read from gVisor on
Linux. Windows support will follow later. The wireguard-go dependency is
updated in order to make use of newly exported GSO logic from its tun
package.

A new gVisor stack.LinkEndpoint implementation has been established
(linkEndpoint) that is loosely modeled after its predecessor
(channel.Endpoint). This new implementation supports GSO of monster TCP
segments up to 64K in size, whereas channel.Endpoint only supports up to
32K. linkEndpoint will also be required for GRO, which will be
implemented in a follow-on commit.

TCP throughput from gVisor, i.e. TUN read direction, is dramatically
improved as a result of this commit. Benchmarks show substantial
improvement through a wide range of RTT and loss conditions, sometimes
as high as 5x.

The iperf3 results below demonstrate the effect of this commit between
two Linux computers with i5-12400 CPUs. There is roughly ~13us of round
trip latency between them.

The first result is from commit 57856fc without TCP GSO.

Starting Test: protocol: TCP, 1 streams, 131072 byte blocks
- - - - - - - - - - - - - - - - - - - - - - - - -
Test Complete. Summary Results:
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-10.00  sec  2.51 GBytes  2.15 Gbits/sec  154 sender
[  5]   0.00-10.00  sec  2.49 GBytes  2.14 Gbits/sec      receiver

The second result is from this commit with TCP GSO.

Starting Test: protocol: TCP, 1 streams, 131072 byte blocks
- - - - - - - - - - - - - - - - - - - - - - - - -
Test Complete. Summary Results:
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-10.00  sec  12.6 GBytes  10.8 Gbits/sec    6 sender
[  5]   0.00-10.00  sec  12.6 GBytes  10.8 Gbits/sec      receiver

Updates #6816

Signed-off-by: Jordan Whited <jordan@tailscale.com>
This commit is contained in:
Jordan Whited
2024-07-31 09:42:11 -07:00
committed by GitHub
parent 949b15d858
commit 7bc2ddaedc
7 changed files with 361 additions and 29 deletions

View File

@@ -10,6 +10,7 @@ import (
"net/netip"
"os"
"reflect"
"runtime"
"slices"
"strings"
"sync"
@@ -17,6 +18,7 @@ import (
"time"
"github.com/gaissmai/bart"
"github.com/tailscale/wireguard-go/conn"
"github.com/tailscale/wireguard-go/device"
"github.com/tailscale/wireguard-go/tun"
"go4.org/mem"
@@ -894,13 +896,7 @@ func (t *Wrapper) Read(buffs [][]byte, sizes []int, offset int) (int, error) {
return 0, res.err
}
if res.data == nil {
n, err := t.injectedRead(res.injected, buffs[0], offset)
sizes[0] = n
if err != nil && n == 0 {
return 0, err
}
return 1, err
return t.injectedRead(res.injected, buffs, sizes, offset)
}
metricPacketOut.Add(int64(len(res.data)))
@@ -955,27 +951,85 @@ func (t *Wrapper) Read(buffs [][]byte, sizes []int, offset int) (int, error) {
return buffsPos, res.err
}
const (
minTCPHeaderSize = 20
)
func stackGSOToTunGSO(pkt []byte, gso stack.GSO) (tun.GSOOptions, error) {
options := tun.GSOOptions{
CsumStart: gso.L3HdrLen,
CsumOffset: gso.CsumOffset,
GSOSize: gso.MSS,
NeedsCsum: gso.NeedsCsum,
}
switch gso.Type {
case stack.GSONone:
options.GSOType = tun.GSONone
return options, nil
case stack.GSOTCPv4:
options.GSOType = tun.GSOTCPv4
case stack.GSOTCPv6:
options.GSOType = tun.GSOTCPv6
default:
return tun.GSOOptions{}, fmt.Errorf("unsupported gVisor GSOType: %v", gso.Type)
}
// options.HdrLen is both layer 3 and 4 together, whereas gVisor only
// gives us layer 3 length. We have to gather TCP header length
// ourselves.
if len(pkt) < int(gso.L3HdrLen)+minTCPHeaderSize {
return tun.GSOOptions{}, errors.New("gVisor GSOTCP packet length too short")
}
tcphLen := uint16(pkt[int(gso.L3HdrLen)+12] >> 4 * 4)
options.HdrLen = gso.L3HdrLen + tcphLen
return options, nil
}
func invertGSOChecksum(pkt []byte, gso stack.GSO) {
if gso.NeedsCsum != true {
return
}
at := int(gso.L3HdrLen + gso.CsumOffset)
if at+1 > len(pkt)-1 {
return
}
pkt[at] = ^pkt[at]
pkt[at+1] = ^pkt[at+1]
}
// injectedRead handles injected reads, which bypass filters.
func (t *Wrapper) injectedRead(res tunInjectedRead, buf []byte, offset int) (int, error) {
metricPacketOut.Add(1)
func (t *Wrapper) injectedRead(res tunInjectedRead, outBuffs [][]byte, sizes []int, offset int) (n int, err error) {
var gso stack.GSO
var n int
if !res.packet.IsNil() {
n = copy(buf[offset:], res.packet.NetworkHeader().Slice())
n += copy(buf[offset+n:], res.packet.TransportHeader().Slice())
n += copy(buf[offset+n:], res.packet.Data().AsRange().ToSlice())
res.packet.DecRef()
pkt := outBuffs[0][offset:]
if res.packet != nil {
bufN := copy(pkt, res.packet.NetworkHeader().Slice())
bufN += copy(pkt[bufN:], res.packet.TransportHeader().Slice())
bufN += copy(pkt[bufN:], res.packet.Data().AsRange().ToSlice())
gso = res.packet.GSOOptions
pkt = pkt[:bufN]
defer res.packet.DecRef() // defer DecRef so we may continue to reference it
} else {
n = copy(buf[offset:], res.data)
sizes[0] = copy(pkt, res.data)
pkt = pkt[:sizes[0]]
n = 1
}
pc := t.peerConfig.Load()
p := parsedPacketPool.Get().(*packet.Parsed)
defer parsedPacketPool.Put(p)
p.Decode(buf[offset : offset+n])
p.Decode(pkt)
// We invert the transport layer checksum before and after snat() if gVisor
// handed us a segment with a partial checksum. A partial checksum is not a
// ones' complement of the sum, and incremental checksum updating that could
// occur as a result of snat() is not aware of this. Alternatively we could
// plumb partial transport layer checksum awareness down through snat(),
// but the surface area of such a change is much larger, and not yet
// justified by this singular case.
invertGSOChecksum(pkt, gso)
pc.snat(p)
invertGSOChecksum(pkt, gso)
if m := t.destIPActivity.Load(); m != nil {
if fn := m[p.Dst.Addr()]; fn != nil {
@@ -983,11 +1037,24 @@ func (t *Wrapper) injectedRead(res tunInjectedRead, buf []byte, offset int) (int
}
}
if stats := t.stats.Load(); stats != nil {
stats.UpdateTxVirtual(buf[offset:][:n])
if res.packet != nil {
var gsoOptions tun.GSOOptions
gsoOptions, err = stackGSOToTunGSO(pkt, gso)
if err != nil {
return 0, err
}
n, err = tun.GSOSplit(pkt, gsoOptions, outBuffs, sizes, offset)
}
if stats := t.stats.Load(); stats != nil {
for i := 0; i < n; i++ {
stats.UpdateTxVirtual(outBuffs[i][offset : offset+sizes[i]])
}
}
t.noteActivity()
return n, nil
metricPacketOut.Add(int64(n))
return n, err
}
func (t *Wrapper) filterPacketInboundFromWireGuard(p *packet.Parsed, captHook capture.Callback, pc *peerConfigTable) filter.Response {
@@ -1288,6 +1355,14 @@ func (t *Wrapper) InjectOutboundPacketBuffer(pkt *stack.PacketBuffer) error {
}
func (t *Wrapper) BatchSize() int {
if runtime.GOOS == "linux" {
// Always setup Linux to handle vectors, even in the very rare case that
// the underlying t.tdev returns 1. gVisor GSO is always enabled for
// Linux, and we cannot make a determination on gVisor usage at
// wireguard-go.Device startup, which is when this value matters for
// packet memory init.
return conn.IdealBatchSize
}
return t.tdev.BatchSize()
}