mirror of
https://github.com/tailscale/tailscale.git
synced 2025-08-11 21:27:31 +00:00
go.mod,net/tstun,wgengine/netstack: implement gVisor TCP GSO for Linux (#12869)
This commit implements TCP GSO for packets being read from gVisor on
Linux. Windows support will follow later. The wireguard-go dependency is
updated in order to make use of newly exported GSO logic from its tun
package.
A new gVisor stack.LinkEndpoint implementation has been established
(linkEndpoint) that is loosely modeled after its predecessor
(channel.Endpoint). This new implementation supports GSO of monster TCP
segments up to 64K in size, whereas channel.Endpoint only supports up to
32K. linkEndpoint will also be required for GRO, which will be
implemented in a follow-on commit.
TCP throughput from gVisor, i.e. TUN read direction, is dramatically
improved as a result of this commit. Benchmarks show substantial
improvement through a wide range of RTT and loss conditions, sometimes
as high as 5x.
The iperf3 results below demonstrate the effect of this commit between
two Linux computers with i5-12400 CPUs. There is roughly ~13us of round
trip latency between them.
The first result is from commit 57856fc
without TCP GSO.
Starting Test: protocol: TCP, 1 streams, 131072 byte blocks
- - - - - - - - - - - - - - - - - - - - - - - - -
Test Complete. Summary Results:
[ ID] Interval Transfer Bitrate Retr
[ 5] 0.00-10.00 sec 2.51 GBytes 2.15 Gbits/sec 154 sender
[ 5] 0.00-10.00 sec 2.49 GBytes 2.14 Gbits/sec receiver
The second result is from this commit with TCP GSO.
Starting Test: protocol: TCP, 1 streams, 131072 byte blocks
- - - - - - - - - - - - - - - - - - - - - - - - -
Test Complete. Summary Results:
[ ID] Interval Transfer Bitrate Retr
[ 5] 0.00-10.00 sec 12.6 GBytes 10.8 Gbits/sec 6 sender
[ 5] 0.00-10.00 sec 12.6 GBytes 10.8 Gbits/sec receiver
Updates #6816
Signed-off-by: Jordan Whited <jordan@tailscale.com>
This commit is contained in:
@@ -10,6 +10,7 @@ import (
|
||||
"net/netip"
|
||||
"os"
|
||||
"reflect"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -17,6 +18,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/gaissmai/bart"
|
||||
"github.com/tailscale/wireguard-go/conn"
|
||||
"github.com/tailscale/wireguard-go/device"
|
||||
"github.com/tailscale/wireguard-go/tun"
|
||||
"go4.org/mem"
|
||||
@@ -894,13 +896,7 @@ func (t *Wrapper) Read(buffs [][]byte, sizes []int, offset int) (int, error) {
|
||||
return 0, res.err
|
||||
}
|
||||
if res.data == nil {
|
||||
n, err := t.injectedRead(res.injected, buffs[0], offset)
|
||||
sizes[0] = n
|
||||
if err != nil && n == 0 {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return 1, err
|
||||
return t.injectedRead(res.injected, buffs, sizes, offset)
|
||||
}
|
||||
|
||||
metricPacketOut.Add(int64(len(res.data)))
|
||||
@@ -955,27 +951,85 @@ func (t *Wrapper) Read(buffs [][]byte, sizes []int, offset int) (int, error) {
|
||||
return buffsPos, res.err
|
||||
}
|
||||
|
||||
const (
|
||||
minTCPHeaderSize = 20
|
||||
)
|
||||
|
||||
func stackGSOToTunGSO(pkt []byte, gso stack.GSO) (tun.GSOOptions, error) {
|
||||
options := tun.GSOOptions{
|
||||
CsumStart: gso.L3HdrLen,
|
||||
CsumOffset: gso.CsumOffset,
|
||||
GSOSize: gso.MSS,
|
||||
NeedsCsum: gso.NeedsCsum,
|
||||
}
|
||||
switch gso.Type {
|
||||
case stack.GSONone:
|
||||
options.GSOType = tun.GSONone
|
||||
return options, nil
|
||||
case stack.GSOTCPv4:
|
||||
options.GSOType = tun.GSOTCPv4
|
||||
case stack.GSOTCPv6:
|
||||
options.GSOType = tun.GSOTCPv6
|
||||
default:
|
||||
return tun.GSOOptions{}, fmt.Errorf("unsupported gVisor GSOType: %v", gso.Type)
|
||||
}
|
||||
// options.HdrLen is both layer 3 and 4 together, whereas gVisor only
|
||||
// gives us layer 3 length. We have to gather TCP header length
|
||||
// ourselves.
|
||||
if len(pkt) < int(gso.L3HdrLen)+minTCPHeaderSize {
|
||||
return tun.GSOOptions{}, errors.New("gVisor GSOTCP packet length too short")
|
||||
}
|
||||
tcphLen := uint16(pkt[int(gso.L3HdrLen)+12] >> 4 * 4)
|
||||
options.HdrLen = gso.L3HdrLen + tcphLen
|
||||
return options, nil
|
||||
}
|
||||
|
||||
func invertGSOChecksum(pkt []byte, gso stack.GSO) {
|
||||
if gso.NeedsCsum != true {
|
||||
return
|
||||
}
|
||||
at := int(gso.L3HdrLen + gso.CsumOffset)
|
||||
if at+1 > len(pkt)-1 {
|
||||
return
|
||||
}
|
||||
pkt[at] = ^pkt[at]
|
||||
pkt[at+1] = ^pkt[at+1]
|
||||
}
|
||||
|
||||
// injectedRead handles injected reads, which bypass filters.
|
||||
func (t *Wrapper) injectedRead(res tunInjectedRead, buf []byte, offset int) (int, error) {
|
||||
metricPacketOut.Add(1)
|
||||
func (t *Wrapper) injectedRead(res tunInjectedRead, outBuffs [][]byte, sizes []int, offset int) (n int, err error) {
|
||||
var gso stack.GSO
|
||||
|
||||
var n int
|
||||
if !res.packet.IsNil() {
|
||||
|
||||
n = copy(buf[offset:], res.packet.NetworkHeader().Slice())
|
||||
n += copy(buf[offset+n:], res.packet.TransportHeader().Slice())
|
||||
n += copy(buf[offset+n:], res.packet.Data().AsRange().ToSlice())
|
||||
res.packet.DecRef()
|
||||
pkt := outBuffs[0][offset:]
|
||||
if res.packet != nil {
|
||||
bufN := copy(pkt, res.packet.NetworkHeader().Slice())
|
||||
bufN += copy(pkt[bufN:], res.packet.TransportHeader().Slice())
|
||||
bufN += copy(pkt[bufN:], res.packet.Data().AsRange().ToSlice())
|
||||
gso = res.packet.GSOOptions
|
||||
pkt = pkt[:bufN]
|
||||
defer res.packet.DecRef() // defer DecRef so we may continue to reference it
|
||||
} else {
|
||||
n = copy(buf[offset:], res.data)
|
||||
sizes[0] = copy(pkt, res.data)
|
||||
pkt = pkt[:sizes[0]]
|
||||
n = 1
|
||||
}
|
||||
|
||||
pc := t.peerConfig.Load()
|
||||
|
||||
p := parsedPacketPool.Get().(*packet.Parsed)
|
||||
defer parsedPacketPool.Put(p)
|
||||
p.Decode(buf[offset : offset+n])
|
||||
p.Decode(pkt)
|
||||
|
||||
// We invert the transport layer checksum before and after snat() if gVisor
|
||||
// handed us a segment with a partial checksum. A partial checksum is not a
|
||||
// ones' complement of the sum, and incremental checksum updating that could
|
||||
// occur as a result of snat() is not aware of this. Alternatively we could
|
||||
// plumb partial transport layer checksum awareness down through snat(),
|
||||
// but the surface area of such a change is much larger, and not yet
|
||||
// justified by this singular case.
|
||||
invertGSOChecksum(pkt, gso)
|
||||
pc.snat(p)
|
||||
invertGSOChecksum(pkt, gso)
|
||||
|
||||
if m := t.destIPActivity.Load(); m != nil {
|
||||
if fn := m[p.Dst.Addr()]; fn != nil {
|
||||
@@ -983,11 +1037,24 @@ func (t *Wrapper) injectedRead(res tunInjectedRead, buf []byte, offset int) (int
|
||||
}
|
||||
}
|
||||
|
||||
if stats := t.stats.Load(); stats != nil {
|
||||
stats.UpdateTxVirtual(buf[offset:][:n])
|
||||
if res.packet != nil {
|
||||
var gsoOptions tun.GSOOptions
|
||||
gsoOptions, err = stackGSOToTunGSO(pkt, gso)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
n, err = tun.GSOSplit(pkt, gsoOptions, outBuffs, sizes, offset)
|
||||
}
|
||||
|
||||
if stats := t.stats.Load(); stats != nil {
|
||||
for i := 0; i < n; i++ {
|
||||
stats.UpdateTxVirtual(outBuffs[i][offset : offset+sizes[i]])
|
||||
}
|
||||
}
|
||||
|
||||
t.noteActivity()
|
||||
return n, nil
|
||||
metricPacketOut.Add(int64(n))
|
||||
return n, err
|
||||
}
|
||||
|
||||
func (t *Wrapper) filterPacketInboundFromWireGuard(p *packet.Parsed, captHook capture.Callback, pc *peerConfigTable) filter.Response {
|
||||
@@ -1288,6 +1355,14 @@ func (t *Wrapper) InjectOutboundPacketBuffer(pkt *stack.PacketBuffer) error {
|
||||
}
|
||||
|
||||
func (t *Wrapper) BatchSize() int {
|
||||
if runtime.GOOS == "linux" {
|
||||
// Always setup Linux to handle vectors, even in the very rare case that
|
||||
// the underlying t.tdev returns 1. gVisor GSO is always enabled for
|
||||
// Linux, and we cannot make a determination on gVisor usage at
|
||||
// wireguard-go.Device startup, which is when this value matters for
|
||||
// packet memory init.
|
||||
return conn.IdealBatchSize
|
||||
}
|
||||
return t.tdev.BatchSize()
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user