Revert "go.mod,net/tstun,wgengine/netstack: implement gVisor TCP GSO for Linux (#12869)"

This reverts commit 7bc2ddaedc2ac5965f02a76453147e23d58687bd.

Updates tailscale/corp#22348

Signed-off-by: Jordan Whited <jordan@tailscale.com>
This commit is contained in:
Jordan Whited 2024-08-15 09:16:50 -07:00
parent 657f17a805
commit 93ce2833fd
No known key found for this signature in database
GPG Key ID: 33DF352F65991EB8
7 changed files with 28 additions and 360 deletions

View File

@ -305,6 +305,7 @@ tailscale.com/cmd/k8s-operator dependencies: (generated by github.com/tailscale/
gvisor.dev/gvisor/pkg/tcpip/header from gvisor.dev/gvisor/pkg/tcpip/header/parse+
gvisor.dev/gvisor/pkg/tcpip/header/parse from gvisor.dev/gvisor/pkg/tcpip/network/ipv4+
gvisor.dev/gvisor/pkg/tcpip/internal/tcp from gvisor.dev/gvisor/pkg/tcpip/stack+
gvisor.dev/gvisor/pkg/tcpip/link/channel from tailscale.com/wgengine/netstack
gvisor.dev/gvisor/pkg/tcpip/network/hash from gvisor.dev/gvisor/pkg/tcpip/network/ipv4
gvisor.dev/gvisor/pkg/tcpip/network/internal/fragmentation from gvisor.dev/gvisor/pkg/tcpip/network/ipv4+
gvisor.dev/gvisor/pkg/tcpip/network/internal/ip from gvisor.dev/gvisor/pkg/tcpip/network/ipv4+

View File

@ -216,6 +216,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
gvisor.dev/gvisor/pkg/tcpip/header from gvisor.dev/gvisor/pkg/tcpip/header/parse+
gvisor.dev/gvisor/pkg/tcpip/header/parse from gvisor.dev/gvisor/pkg/tcpip/network/ipv4+
gvisor.dev/gvisor/pkg/tcpip/internal/tcp from gvisor.dev/gvisor/pkg/tcpip/stack+
gvisor.dev/gvisor/pkg/tcpip/link/channel from tailscale.com/wgengine/netstack
gvisor.dev/gvisor/pkg/tcpip/network/hash from gvisor.dev/gvisor/pkg/tcpip/network/ipv4
gvisor.dev/gvisor/pkg/tcpip/network/internal/fragmentation from gvisor.dev/gvisor/pkg/tcpip/network/ipv4+
gvisor.dev/gvisor/pkg/tcpip/network/internal/ip from gvisor.dev/gvisor/pkg/tcpip/network/ipv4+

2
go.mod
View File

@ -82,7 +82,7 @@ require (
github.com/tailscale/peercred v0.0.0-20240214030740-b535050b2aa4
github.com/tailscale/web-client-prebuilt v0.0.0-20240226180453-5db17b287bf1
github.com/tailscale/wf v0.0.0-20240214030419-6fbb0a674ee6
github.com/tailscale/wireguard-go v0.0.0-20240724015428-60eeedfd624b
github.com/tailscale/wireguard-go v0.0.0-20240705152531-2f5d148bcfe1
github.com/tailscale/xnet v0.0.0-20240729143630-8497ac4dab2e
github.com/tc-hib/winres v0.2.1
github.com/tcnksm/go-httpstat v0.2.0

4
go.sum
View File

@ -938,8 +938,8 @@ github.com/tailscale/web-client-prebuilt v0.0.0-20240226180453-5db17b287bf1 h1:t
github.com/tailscale/web-client-prebuilt v0.0.0-20240226180453-5db17b287bf1/go.mod h1:agQPE6y6ldqCOui2gkIh7ZMztTkIQKH049tv8siLuNQ=
github.com/tailscale/wf v0.0.0-20240214030419-6fbb0a674ee6 h1:l10Gi6w9jxvinoiq15g8OToDdASBni4CyJOdHY1Hr8M=
github.com/tailscale/wf v0.0.0-20240214030419-6fbb0a674ee6/go.mod h1:ZXRML051h7o4OcI0d3AaILDIad/Xw0IkXaHM17dic1Y=
github.com/tailscale/wireguard-go v0.0.0-20240724015428-60eeedfd624b h1:8U9NaPB32iFoNjJ+H/yPkAVqXw/dudtj+fLTE4edF+Q=
github.com/tailscale/wireguard-go v0.0.0-20240724015428-60eeedfd624b/go.mod h1:BOm5fXUBFM+m9woLNBoxI9TaBXXhGNP50LX/TGIvGb4=
github.com/tailscale/wireguard-go v0.0.0-20240705152531-2f5d148bcfe1 h1:ycpNCSYwzZ7x4G4ioPNtKQmIY0G/3o4pVf8wCZq6blY=
github.com/tailscale/wireguard-go v0.0.0-20240705152531-2f5d148bcfe1/go.mod h1:BOm5fXUBFM+m9woLNBoxI9TaBXXhGNP50LX/TGIvGb4=
github.com/tailscale/xnet v0.0.0-20240729143630-8497ac4dab2e h1:zOGKqN5D5hHhiYUp091JqK7DPCqSARyUfduhGUY8Bek=
github.com/tailscale/xnet v0.0.0-20240729143630-8497ac4dab2e/go.mod h1:orPd6JZXXRyuDusYilywte7k094d7dycXXU5YnWsrwg=
github.com/tc-hib/winres v0.2.1 h1:YDE0FiP0VmtRaDn7+aaChp1KiF4owBiJa5l964l5ujA=

View File

@ -10,7 +10,6 @@ import (
"net/netip"
"os"
"reflect"
"runtime"
"slices"
"strings"
"sync"
@ -18,7 +17,6 @@ import (
"time"
"github.com/gaissmai/bart"
"github.com/tailscale/wireguard-go/conn"
"github.com/tailscale/wireguard-go/device"
"github.com/tailscale/wireguard-go/tun"
"go4.org/mem"
@ -896,7 +894,13 @@ func (t *Wrapper) Read(buffs [][]byte, sizes []int, offset int) (int, error) {
return 0, res.err
}
if res.data == nil {
return t.injectedRead(res.injected, buffs, sizes, offset)
n, err := t.injectedRead(res.injected, buffs[0], offset)
sizes[0] = n
if err != nil && n == 0 {
return 0, err
}
return 1, err
}
metricPacketOut.Add(int64(len(res.data)))
@ -951,85 +955,27 @@ func (t *Wrapper) Read(buffs [][]byte, sizes []int, offset int) (int, error) {
return buffsPos, res.err
}
const (
minTCPHeaderSize = 20
)
func stackGSOToTunGSO(pkt []byte, gso stack.GSO) (tun.GSOOptions, error) {
options := tun.GSOOptions{
CsumStart: gso.L3HdrLen,
CsumOffset: gso.CsumOffset,
GSOSize: gso.MSS,
NeedsCsum: gso.NeedsCsum,
}
switch gso.Type {
case stack.GSONone:
options.GSOType = tun.GSONone
return options, nil
case stack.GSOTCPv4:
options.GSOType = tun.GSOTCPv4
case stack.GSOTCPv6:
options.GSOType = tun.GSOTCPv6
default:
return tun.GSOOptions{}, fmt.Errorf("unsupported gVisor GSOType: %v", gso.Type)
}
// options.HdrLen is both layer 3 and 4 together, whereas gVisor only
// gives us layer 3 length. We have to gather TCP header length
// ourselves.
if len(pkt) < int(gso.L3HdrLen)+minTCPHeaderSize {
return tun.GSOOptions{}, errors.New("gVisor GSOTCP packet length too short")
}
tcphLen := uint16(pkt[int(gso.L3HdrLen)+12] >> 4 * 4)
options.HdrLen = gso.L3HdrLen + tcphLen
return options, nil
}
func invertGSOChecksum(pkt []byte, gso stack.GSO) {
if gso.NeedsCsum != true {
return
}
at := int(gso.L3HdrLen + gso.CsumOffset)
if at+1 > len(pkt)-1 {
return
}
pkt[at] = ^pkt[at]
pkt[at+1] = ^pkt[at+1]
}
// injectedRead handles injected reads, which bypass filters.
func (t *Wrapper) injectedRead(res tunInjectedRead, outBuffs [][]byte, sizes []int, offset int) (n int, err error) {
var gso stack.GSO
func (t *Wrapper) injectedRead(res tunInjectedRead, buf []byte, offset int) (int, error) {
metricPacketOut.Add(1)
pkt := outBuffs[0][offset:]
if res.packet != nil {
bufN := copy(pkt, res.packet.NetworkHeader().Slice())
bufN += copy(pkt[bufN:], res.packet.TransportHeader().Slice())
bufN += copy(pkt[bufN:], res.packet.Data().AsRange().ToSlice())
gso = res.packet.GSOOptions
pkt = pkt[:bufN]
defer res.packet.DecRef() // defer DecRef so we may continue to reference it
var n int
if !res.packet.IsNil() {
n = copy(buf[offset:], res.packet.NetworkHeader().Slice())
n += copy(buf[offset+n:], res.packet.TransportHeader().Slice())
n += copy(buf[offset+n:], res.packet.Data().AsRange().ToSlice())
res.packet.DecRef()
} else {
sizes[0] = copy(pkt, res.data)
pkt = pkt[:sizes[0]]
n = 1
n = copy(buf[offset:], res.data)
}
pc := t.peerConfig.Load()
p := parsedPacketPool.Get().(*packet.Parsed)
defer parsedPacketPool.Put(p)
p.Decode(pkt)
// We invert the transport layer checksum before and after snat() if gVisor
// handed us a segment with a partial checksum. A partial checksum is not a
// ones' complement of the sum, and incremental checksum updating that could
// occur as a result of snat() is not aware of this. Alternatively we could
// plumb partial transport layer checksum awareness down through snat(),
// but the surface area of such a change is much larger, and not yet
// justified by this singular case.
invertGSOChecksum(pkt, gso)
p.Decode(buf[offset : offset+n])
pc.snat(p)
invertGSOChecksum(pkt, gso)
if m := t.destIPActivity.Load(); m != nil {
if fn := m[p.Dst.Addr()]; fn != nil {
@ -1037,24 +983,11 @@ func (t *Wrapper) injectedRead(res tunInjectedRead, outBuffs [][]byte, sizes []i
}
}
if res.packet != nil {
var gsoOptions tun.GSOOptions
gsoOptions, err = stackGSOToTunGSO(pkt, gso)
if err != nil {
return 0, err
}
n, err = tun.GSOSplit(pkt, gsoOptions, outBuffs, sizes, offset)
}
if stats := t.stats.Load(); stats != nil {
for i := 0; i < n; i++ {
stats.UpdateTxVirtual(outBuffs[i][offset : offset+sizes[i]])
}
stats.UpdateTxVirtual(buf[offset:][:n])
}
t.noteActivity()
metricPacketOut.Add(int64(n))
return n, err
return n, nil
}
func (t *Wrapper) filterPacketInboundFromWireGuard(p *packet.Parsed, captHook capture.Callback, pc *peerConfigTable) filter.Response {
@ -1355,14 +1288,6 @@ func (t *Wrapper) InjectOutboundPacketBuffer(pkt *stack.PacketBuffer) error {
}
func (t *Wrapper) BatchSize() int {
if runtime.GOOS == "linux" {
// Always setup Linux to handle vectors, even in the very rare case that
// the underlying t.tdev returns 1. gVisor GSO is always enabled for
// Linux, and we cannot make a determination on gVisor usage at
// wireguard-go.Device startup, which is when this value matters for
// packet memory init.
return conn.IdealBatchSize
}
return t.tdev.BatchSize()
}

View File

@ -1,256 +0,0 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
package netstack
import (
"context"
"sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/stack"
)
type queue struct {
// TODO(jwhited): evaluate performance with mu as Mutex and/or alternative
// non-channel buffer.
c chan *stack.PacketBuffer
mu sync.RWMutex // mu guards closed
closed bool
}
func (q *queue) Close() {
q.mu.Lock()
defer q.mu.Unlock()
if !q.closed {
close(q.c)
}
q.closed = true
}
func (q *queue) Read() *stack.PacketBuffer {
select {
case p := <-q.c:
return p
default:
return nil
}
}
func (q *queue) ReadContext(ctx context.Context) *stack.PacketBuffer {
select {
case pkt := <-q.c:
return pkt
case <-ctx.Done():
return nil
}
}
func (q *queue) Write(pkt *stack.PacketBuffer) tcpip.Error {
// q holds the PacketBuffer.
q.mu.RLock()
defer q.mu.RUnlock()
if q.closed {
return &tcpip.ErrClosedForSend{}
}
wrote := false
select {
case q.c <- pkt.IncRef():
wrote = true
default:
// TODO(jwhited): reconsider/count
pkt.DecRef()
}
if wrote {
return nil
}
return &tcpip.ErrNoBufferSpace{}
}
func (q *queue) Num() int {
return len(q.c)
}
var _ stack.LinkEndpoint = (*linkEndpoint)(nil)
var _ stack.GSOEndpoint = (*linkEndpoint)(nil)
// linkEndpoint implements stack.LinkEndpoint and stack.GSOEndpoint. Outbound
// packets written by gVisor towards Tailscale are stored in a channel.
// Inbound is fed to gVisor via InjectInbound. This is loosely modeled after
// gvisor.dev/pkg/tcpip/link/channel.Endpoint.
type linkEndpoint struct {
LinkEPCapabilities stack.LinkEndpointCapabilities
SupportedGSOKind stack.SupportedGSO
mu sync.RWMutex // mu guards the following fields
dispatcher stack.NetworkDispatcher
linkAddr tcpip.LinkAddress
mtu uint32
q *queue // outbound
}
func newLinkEndpoint(size int, mtu uint32, linkAddr tcpip.LinkAddress) *linkEndpoint {
return &linkEndpoint{
q: &queue{
c: make(chan *stack.PacketBuffer, size),
},
mtu: mtu,
linkAddr: linkAddr,
}
}
// Close closes l. Further packet injections will return an error, and all
// pending packets are discarded. Close may be called concurrently with
// WritePackets.
func (l *linkEndpoint) Close() {
l.q.Close()
l.Drain()
}
// Read does non-blocking read one packet from the outbound packet queue.
func (l *linkEndpoint) Read() *stack.PacketBuffer {
return l.q.Read()
}
// ReadContext does blocking read for one packet from the outbound packet queue.
// It can be cancelled by ctx, and in this case, it returns nil.
func (l *linkEndpoint) ReadContext(ctx context.Context) *stack.PacketBuffer {
return l.q.ReadContext(ctx)
}
// Drain removes all outbound packets from the channel and counts them.
func (l *linkEndpoint) Drain() int {
c := 0
for pkt := l.Read(); pkt != nil; pkt = l.Read() {
pkt.DecRef()
c++
}
return c
}
// NumQueued returns the number of packet queued for outbound.
func (l *linkEndpoint) NumQueued() int {
return l.q.Num()
}
// InjectInbound injects an inbound packet. If the endpoint is not attached, the
// packet is not delivered.
func (l *linkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
l.mu.RLock()
d := l.dispatcher
l.mu.RUnlock()
if d != nil {
d.DeliverNetworkPacket(protocol, pkt)
}
}
// Attach saves the stack network-layer dispatcher for use later when packets
// are injected.
func (l *linkEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
l.mu.Lock()
defer l.mu.Unlock()
l.dispatcher = dispatcher
}
// IsAttached implements stack.LinkEndpoint.IsAttached.
func (l *linkEndpoint) IsAttached() bool {
l.mu.RLock()
defer l.mu.RUnlock()
return l.dispatcher != nil
}
// MTU implements stack.LinkEndpoint.MTU.
func (l *linkEndpoint) MTU() uint32 {
l.mu.RLock()
defer l.mu.RUnlock()
return l.mtu
}
// SetMTU implements stack.LinkEndpoint.SetMTU.
func (l *linkEndpoint) SetMTU(mtu uint32) {
l.mu.Lock()
defer l.mu.Unlock()
l.mtu = mtu
}
// Capabilities implements stack.LinkEndpoint.Capabilities.
func (l *linkEndpoint) Capabilities() stack.LinkEndpointCapabilities {
return l.LinkEPCapabilities
}
// GSOMaxSize implements stack.GSOEndpoint.
func (*linkEndpoint) GSOMaxSize() uint32 {
// This an increase from 32k returned by channel.Endpoint.GSOMaxSize() to
// 64k, which improves throughput.
return (1 << 16) - 1
}
// SupportedGSO implements stack.GSOEndpoint.
func (l *linkEndpoint) SupportedGSO() stack.SupportedGSO {
return l.SupportedGSOKind
}
// MaxHeaderLength returns the maximum size of the link layer header. Given it
// doesn't have a header, it just returns 0.
func (*linkEndpoint) MaxHeaderLength() uint16 {
return 0
}
// LinkAddress returns the link address of this endpoint.
func (l *linkEndpoint) LinkAddress() tcpip.LinkAddress {
l.mu.RLock()
defer l.mu.RUnlock()
return l.linkAddr
}
// SetLinkAddress implements stack.LinkEndpoint.SetLinkAddress.
func (l *linkEndpoint) SetLinkAddress(addr tcpip.LinkAddress) {
l.mu.Lock()
defer l.mu.Unlock()
l.linkAddr = addr
}
// WritePackets stores outbound packets into the channel.
// Multiple concurrent calls are permitted.
func (l *linkEndpoint) WritePackets(pkts stack.PacketBufferList) (int, tcpip.Error) {
n := 0
// TODO(jwhited): evaluate writing a stack.PacketBufferList instead of a
// single packet. We can split 2 x 64K GSO across
// wireguard-go/conn.IdealBatchSize (128 slots) @ 1280 MTU, and non-GSO we
// could do more. Read API would need to change to take advantage. Verify
// gVisor limits around max number of segments packed together. Since we
// control MTU (and by effect TCP MSS in gVisor) we *shouldn't* expect to
// ever overflow 128 slots (see wireguard-go/tun.ErrTooManySegments usage).
for _, pkt := range pkts.AsSlice() {
if err := l.q.Write(pkt); err != nil {
if _, ok := err.(*tcpip.ErrNoBufferSpace); !ok && n == 0 {
return 0, err
}
break
}
n++
}
return n, nil
}
// Wait implements stack.LinkEndpoint.Wait.
func (*linkEndpoint) Wait() {}
// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
func (*linkEndpoint) ARPHardwareType() header.ARPHardwareType {
return header.ARPHardwareNone
}
// AddHeader implements stack.LinkEndpoint.AddHeader.
func (*linkEndpoint) AddHeader(*stack.PacketBuffer) {}
// ParseHeader implements stack.LinkEndpoint.ParseHeader.
func (*linkEndpoint) ParseHeader(*stack.PacketBuffer) bool { return true }
// SetOnCloseAction implements stack.LinkEndpoint.
func (*linkEndpoint) SetOnCloseAction(func()) {}

View File

@ -26,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/link/channel"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
@ -175,7 +176,7 @@ type Impl struct {
ProcessSubnets bool
ipstack *stack.Stack
linkEP *linkEndpoint
linkEP *channel.Endpoint
tundev *tstun.Wrapper
e wgengine.Engine
pm *proxymap.Mapper
@ -326,11 +327,7 @@ func Create(logf logger.Logf, tundev *tstun.Wrapper, e wgengine.Engine, mc *magi
if err != nil {
return nil, err
}
linkEP := newLinkEndpoint(512, uint32(tstun.DefaultTUNMTU()), "")
if runtime.GOOS == "linux" {
// TODO(jwhited): add Windows support https://github.com/tailscale/corp/issues/21874
linkEP.SupportedGSOKind = stack.HostGSOSupported
}
linkEP := channel.New(512, uint32(tstun.DefaultTUNMTU()), "")
if tcpipProblem := ipstack.CreateNIC(nicID, linkEP); tcpipProblem != nil {
return nil, fmt.Errorf("could not create netstack NIC: %v", tcpipProblem)
}