net/neterror, wgengine/magicsock: use UDP GSO and GRO on Linux (#7791)

This commit implements UDP offloading for Linux. GSO size is passed to
and from the kernel via socket control messages. Support is probed at
runtime.

UDP GSO is dependent on checksum offload support on the egress netdev.
UDP GSO will be disabled in the event sendmmsg() returns EIO, which is
a strong signal that the egress netdev does not support checksum
offload.

Updates tailscale/corp#8734

Signed-off-by: Jordan Whited <jordan@tailscale.com>
2025-08-25 04:37:42 +00:00 · 2023-04-04 16:32:16 -07:00
parent 45138fcfba
commit f475e5550c
6 changed files with 780 additions and 198 deletions
--- a/wgengine/magicsock/magicsock.go
+++ b/wgengine/magicsock/magicsock.go
@@ -44,7 +44,6 @@ import (
 	"tailscale.com/net/connstats"
 	"tailscale.com/net/dnscache"
 	"tailscale.com/net/interfaces"
-	"tailscale.com/net/netaddr"
 	"tailscale.com/net/netcheck"
 	"tailscale.com/net/neterror"
 	"tailscale.com/net/netns"
@@ -281,7 +280,6 @@ type Conn struct {
 	pconn6 RebindingUDPConn

 	receiveBatchPool sync.Pool
-	sendBatchPool    sync.Pool

 	// closeDisco4 and closeDisco6 are io.Closers to shut down the raw
 	// disco packet receivers. If nil, no raw disco receiver is
@@ -597,26 +595,13 @@ func newConn() *Conn {
 		msgs := make([]ipv6.Message, c.bind.BatchSize())
 		for i := range msgs {
 			msgs[i].Buffers = make([][]byte, 1)
+			msgs[i].OOB = make([]byte, controlMessageSize)
 		}
 		batch := &receiveBatch{
 			msgs: msgs,
 		}
 		return batch
 	}}
-	c.sendBatchPool = sync.Pool{New: func() any {
-		ua := &net.UDPAddr{
-			IP: make([]byte, 16),
-		}
-		msgs := make([]ipv6.Message, c.bind.BatchSize())
-		for i := range msgs {
-			msgs[i].Buffers = make([][]byte, 1)
-			msgs[i].Addr = ua
-		}
-		return &sendBatch{
-			ua:   ua,
-			msgs: msgs,
-		}
-	}}
 	c.muCond = sync.NewCond(&c.mu)
 	c.networkUp.Store(true) // assume up until told otherwise
 	return c
@@ -1301,19 +1286,11 @@ var errNoUDP = errors.New("no UDP available on platform")

 var (
 	// This acts as a compile-time check for our usage of ipv6.Message in
-	// udpConnWithBatchOps for both IPv6 and IPv4 operations.
+	// batchingUDPConn for both IPv6 and IPv4 operations.
 	_ ipv6.Message = ipv4.Message{}
 )

-type sendBatch struct {
-	ua   *net.UDPAddr
-	msgs []ipv6.Message // ipv4.Message and ipv6.Message are the same underlying type
-}
-
 func (c *Conn) sendUDPBatch(addr netip.AddrPort, buffs [][]byte) (sent bool, err error) {
-	batch := c.sendBatchPool.Get().(*sendBatch)
-	defer c.sendBatchPool.Put(batch)
-
 	isIPv6 := false
 	switch {
 	case addr.Addr().Is4():
@@ -1322,19 +1299,17 @@ func (c *Conn) sendUDPBatch(addr netip.AddrPort, buffs [][]byte) (sent bool, err
 	default:
 		panic("bogus sendUDPBatch addr type")
 	}
-
-	as16 := addr.Addr().As16()
-	copy(batch.ua.IP, as16[:])
-	batch.ua.Port = int(addr.Port())
-	for i, buff := range buffs {
-		batch.msgs[i].Buffers[0] = buff
-		batch.msgs[i].Addr = batch.ua
-	}
-
 	if isIPv6 {
-		_, err = c.pconn6.WriteBatch(batch.msgs[:len(buffs)], 0)
+		err = c.pconn6.WriteBatchTo(buffs, addr)
 	} else {
-		_, err = c.pconn4.WriteBatch(batch.msgs[:len(buffs)], 0)
+		err = c.pconn4.WriteBatchTo(buffs, addr)
+	}
+	if err != nil {
+		var errGSO neterror.ErrUDPGSODisabled
+		if errors.As(err, &errGSO) {
+			c.logf("magicsock: %s", errGSO.Error())
+			err = errGSO.RetryErr
+		}
 	}
 	return err == nil, err
 }
@@ -1844,14 +1819,18 @@ type receiveBatch struct {
 	msgs []ipv6.Message
 }

-func (c *Conn) getReceiveBatch() *receiveBatch {
+func (c *Conn) getReceiveBatchForBuffs(buffs [][]byte) *receiveBatch {
 	batch := c.receiveBatchPool.Get().(*receiveBatch)
+	for i := range buffs {
+		batch.msgs[i].Buffers[0] = buffs[i]
+		batch.msgs[i].OOB = batch.msgs[i].OOB[:cap(batch.msgs[i].OOB)]
+	}
 	return batch
 }

 func (c *Conn) putReceiveBatch(batch *receiveBatch) {
 	for i := range batch.msgs {
-		batch.msgs[i] = ipv6.Message{Buffers: batch.msgs[i].Buffers}
+		batch.msgs[i] = ipv6.Message{Buffers: batch.msgs[i].Buffers, OOB: batch.msgs[i].OOB}
 	}
 	c.receiveBatchPool.Put(batch)
 }
@@ -1860,13 +1839,10 @@ func (c *Conn) receiveIPv6(buffs [][]byte, sizes []int, eps []conn.Endpoint) (in
 	health.ReceiveIPv6.Enter()
 	defer health.ReceiveIPv6.Exit()

-	batch := c.getReceiveBatch()
+	batch := c.getReceiveBatchForBuffs(buffs)
 	defer c.putReceiveBatch(batch)
 	for {
-		for i := range buffs {
-			batch.msgs[i].Buffers[0] = buffs[i]
-		}
-		numMsgs, err := c.pconn6.ReadBatch(batch.msgs, 0)
+		numMsgs, err := c.pconn6.ReadBatch(batch.msgs[:len(buffs)], 0)
 		if err != nil {
 			if neterror.PacketWasTruncated(err) {
 				// TODO(raggi): discuss whether to log?
@@ -1877,6 +1853,10 @@ func (c *Conn) receiveIPv6(buffs [][]byte, sizes []int, eps []conn.Endpoint) (in

 		reportToCaller := false
 		for i, msg := range batch.msgs[:numMsgs] {
+			if msg.N == 0 {
+				sizes[i] = 0
+				continue
+			}
 			ipp := msg.Addr.(*net.UDPAddr).AddrPort()
 			if ep, ok := c.receiveIP(msg.Buffers[0][:msg.N], ipp, &c.ippEndpoint6); ok {
 				metricRecvDataIPv6.Add(1)
@@ -1898,13 +1878,10 @@ func (c *Conn) receiveIPv4(buffs [][]byte, sizes []int, eps []conn.Endpoint) (in
 	health.ReceiveIPv4.Enter()
 	defer health.ReceiveIPv4.Exit()

-	batch := c.getReceiveBatch()
+	batch := c.getReceiveBatchForBuffs(buffs)
 	defer c.putReceiveBatch(batch)
 	for {
-		for i := range buffs {
-			batch.msgs[i].Buffers[0] = buffs[i]
-		}
-		numMsgs, err := c.pconn4.ReadBatch(batch.msgs, 0)
+		numMsgs, err := c.pconn4.ReadBatch(batch.msgs[:len(buffs)], 0)
 		if err != nil {
 			if neterror.PacketWasTruncated(err) {
 				// TODO(raggi): discuss whether to log?
@@ -1915,6 +1892,10 @@ func (c *Conn) receiveIPv4(buffs [][]byte, sizes []int, eps []conn.Endpoint) (in

 		reportToCaller := false
 		for i, msg := range batch.msgs[:numMsgs] {
+			if msg.N == 0 {
+				sizes[i] = 0
+				continue
+			}
 			ipp := msg.Addr.(*net.UDPAddr).AddrPort()
 			if ep, ok := c.receiveIP(msg.Buffers[0][:msg.N], ipp, &c.ippEndpoint4); ok {
 				metricRecvDataIPv4.Add(1)
@@ -1940,7 +1921,7 @@ func (c *Conn) receiveIP(b []byte, ipp netip.AddrPort, cache *ippEndpointCache)
 		c.stunReceiveFunc.Load()(b, ipp)
 		return nil, false
 	}
-	if c.handleDiscoMessage(b, ipp, key.NodePublic{}) {
+	if c.handleDiscoMessage(b, ipp, key.NodePublic{}, discoRXPathUDP) {
 		return nil, false
 	}
 	if !c.havePrivateKey.Load() {
@@ -2005,7 +1986,7 @@ func (c *Conn) processDERPReadResult(dm derpReadResult, b []byte) (n int, ep *en
 	}

 	ipp := netip.AddrPortFrom(derpMagicIPAddr, uint16(regionID))
-	if c.handleDiscoMessage(b[:n], ipp, dm.src) {
+	if c.handleDiscoMessage(b[:n], ipp, dm.src, discoRXPathDERP) {
 		return 0, nil
 	}

@@ -2139,6 +2120,14 @@ func discoPcapFrame(src netip.AddrPort, derpNodeSrc key.NodePublic, payload []by
 	return b.Bytes()
 }

+type discoRXPath string
+
+const (
+	discoRXPathUDP       discoRXPath = "UDP socket"
+	discoRXPathDERP      discoRXPath = "DERP"
+	discoRXPathRawSocket discoRXPath = "raw socket"
+)
+
 // handleDiscoMessage handles a discovery message and reports whether
 // msg was a Tailscale inter-node discovery message.
 //
@@ -2153,7 +2142,7 @@ func discoPcapFrame(src netip.AddrPort, derpNodeSrc key.NodePublic, payload []by
 // src.Port() being the region ID) and the derpNodeSrc will be the node key
 // it was received from at the DERP layer. derpNodeSrc is zero when received
 // over UDP.
-func (c *Conn) handleDiscoMessage(msg []byte, src netip.AddrPort, derpNodeSrc key.NodePublic) (isDiscoMsg bool) {
+func (c *Conn) handleDiscoMessage(msg []byte, src netip.AddrPort, derpNodeSrc key.NodePublic, via discoRXPath) (isDiscoMsg bool) {
 	const headerLen = len(disco.Magic) + key.DiscoPublicRawLen
 	if len(msg) < headerLen || string(msg[:len(disco.Magic)]) != disco.Magic {
 		return false
@@ -2174,7 +2163,7 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netip.AddrPort, derpNodeSrc ke
 		return
 	}
 	if debugDisco() {
-		c.logf("magicsock: disco: got disco-looking frame from %v", sender.ShortString())
+		c.logf("magicsock: disco: got disco-looking frame from %v via %s", sender.ShortString(), via)
 	}
 	if c.privateKey.IsZero() {
 		// Ignore disco messages when we're stopped.
@@ -2210,12 +2199,14 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netip.AddrPort, derpNodeSrc ke
 		// disco key.  When we restart we get a new disco key
 		// and old packets might've still been in flight (or
 		// scheduled). This is particularly the case for LANs
-		// or non-NATed endpoints.
-		// Don't log in normal case. Pass on to wireguard, in case
-		// it's actually a wireguard packet (super unlikely,
-		// but).
+		// or non-NATed endpoints. UDP offloading on Linux
+		// can also cause this when a disco message is
+		// received via raw socket at the head of a coalesced
+		// group of messages. Don't log in normal case.
+		// Callers may choose to pass on to wireguard, in case
+		// it's actually a wireguard packet (super unlikely, but).
 		if debugDisco() {
-			c.logf("magicsock: disco: failed to open naclbox from %v (wrong rcpt?)", sender)
+			c.logf("magicsock: disco: failed to open naclbox from %v (wrong rcpt?) via %s", sender, via)
 		}
 		metricRecvDiscoBadKey.Add(1)
 		return
@@ -3205,13 +3196,13 @@ func (c *Conn) bindSocket(ruc *RebindingUDPConn, network string, curPortFate cur
 	defer ruc.mu.Unlock()

 	if runtime.GOOS == "js" {
-		ruc.setConnLocked(newBlockForeverConn(), "")
+		ruc.setConnLocked(newBlockForeverConn(), "", c.bind.BatchSize())
 		return nil
 	}

 	if debugAlwaysDERP() {
 		c.logf("disabled %v per TS_DEBUG_ALWAYS_USE_DERP", network)
-		ruc.setConnLocked(newBlockForeverConn(), "")
+		ruc.setConnLocked(newBlockForeverConn(), "", c.bind.BatchSize())
 		return nil
 	}

@@ -3253,7 +3244,7 @@ func (c *Conn) bindSocket(ruc *RebindingUDPConn, network string, curPortFate cur
 		if debugBindSocket() {
 			c.logf("magicsock: bindSocket: successfully listened %v port %d", network, port)
 		}
-		ruc.setConnLocked(pconn, network)
+		ruc.setConnLocked(pconn, network, c.bind.BatchSize())
 		if network == "udp4" {
 			health.SetUDP4Unbound(false)
 		}
@@ -3264,7 +3255,7 @@ func (c *Conn) bindSocket(ruc *RebindingUDPConn, network string, curPortFate cur
 	// Set pconn to a dummy conn whose reads block until closed.
 	// This keeps the receive funcs alive for a future in which
 	// we get a link change and we can try binding again.
-	ruc.setConnLocked(newBlockForeverConn(), "")
+	ruc.setConnLocked(newBlockForeverConn(), "", c.bind.BatchSize())
 	if network == "udp4" {
 		health.SetUDP4Unbound(true)
 	}
@@ -3361,49 +3352,332 @@ func (c *Conn) ParseEndpoint(nodeKeyStr string) (conn.Endpoint, error) {
 	return ep, nil
 }

-type batchReaderWriter interface {
-	batchReader
-	batchWriter
+// xnetBatchReaderWriter defines the batching i/o methods of
+// golang.org/x/net/ipv4.PacketConn (and ipv6.PacketConn).
+// TODO(jwhited): This should eventually be replaced with the standard library
+// implementation of https://github.com/golang/go/issues/45886
+type xnetBatchReaderWriter interface {
+	xnetBatchReader
+	xnetBatchWriter
 }

-type batchWriter interface {
-	WriteBatch([]ipv6.Message, int) (int, error)
-}
-
-type batchReader interface {
+type xnetBatchReader interface {
 	ReadBatch([]ipv6.Message, int) (int, error)
 }

-// udpConnWithBatchOps wraps a *net.UDPConn in order to extend it to support
-// batch operations.
-//
-// TODO(jwhited): This wrapping is temporary. https://github.com/golang/go/issues/45886
-type udpConnWithBatchOps struct {
-	*net.UDPConn
-	xpc batchReaderWriter
+type xnetBatchWriter interface {
+	WriteBatch([]ipv6.Message, int) (int, error)
 }

-func newUDPConnWithBatchOps(conn *net.UDPConn, network string) udpConnWithBatchOps {
-	ucbo := udpConnWithBatchOps{
-		UDPConn: conn,
+// batchingUDPConn is a UDP socket that provides batched i/o.
+type batchingUDPConn struct {
+	pc                    nettype.PacketConn
+	xpc                   xnetBatchReaderWriter
+	rxOffload             bool                                  // supports UDP GRO or similar
+	txOffload             atomic.Bool                           // supports UDP GSO or similar
+	setGSOSizeInControl   func(control *[]byte, gsoSize uint16) // typically setGSOSizeInControl(); swappable for testing
+	getGSOSizeFromControl func(control []byte) (int, error)     // typically getGSOSizeFromControl(); swappable for testing
+	sendBatchPool         sync.Pool
+}
+
+func (c *batchingUDPConn) ReadFrom(p []byte) (n int, addr net.Addr, err error) {
+	if c.rxOffload {
+		// UDP_GRO is opt-in on Linux via setsockopt(). Once enabled you may
+		// receive a "monster datagram" from any read call. The ReadFrom() API
+		// does not support passing the GSO size and is unsafe to use in such a
+		// case. Other platforms may vary in behavior, but we go with the most
+		// conservative approach to prevent this from becoming a footgun in the
+		// future.
+		return 0, nil, errors.New("rx UDP offload is enabled on this socket, single packet reads are unavailable")
+	}
+	return c.pc.ReadFrom(p)
+}
+
+func (c *batchingUDPConn) WriteTo(b []byte, addr net.Addr) (n int, err error) {
+	return c.pc.WriteTo(b, addr)
+}
+
+func (c *batchingUDPConn) SetDeadline(t time.Time) error {
+	return c.pc.SetDeadline(t)
+}
+
+func (c *batchingUDPConn) SetReadDeadline(t time.Time) error {
+	return c.pc.SetReadDeadline(t)
+}
+
+func (c *batchingUDPConn) SetWriteDeadline(t time.Time) error {
+	return c.pc.SetWriteDeadline(t)
+}
+
+const (
+	// This was initially established for Linux, but may split out to
+	// GOOS-specific values later. It originates as UDP_MAX_SEGMENTS in the
+	// kernel's TX path, and UDP_GRO_CNT_MAX for RX.
+	udpSegmentMaxDatagrams = 64
+)
+
+const (
+	// Exceeding these values results in EMSGSIZE.
+	maxIPv4PayloadLen = 1<<16 - 1 - 20 - 8
+	maxIPv6PayloadLen = 1<<16 - 1 - 8
+)
+
+// coalesceMessages iterates msgs, coalescing them where possible while
+// maintaining datagram order. All msgs have their Addr field set to addr.
+func (c *batchingUDPConn) coalesceMessages(addr *net.UDPAddr, buffs [][]byte, msgs []ipv6.Message) int {
+	var (
+		base     = -1 // index of msg we are currently coalescing into
+		gsoSize  int  // segmentation size of msgs[base]
+		dgramCnt int  // number of dgrams coalesced into msgs[base]
+		endBatch bool // tracking flag to start a new batch on next iteration of buffs
+	)
+	maxPayloadLen := maxIPv4PayloadLen
+	if addr.IP.To4() == nil {
+		maxPayloadLen = maxIPv6PayloadLen
+	}
+	for i, buff := range buffs {
+		if i > 0 {
+			msgLen := len(buff)
+			baseLenBefore := len(msgs[base].Buffers[0])
+			freeBaseCap := cap(msgs[base].Buffers[0]) - baseLenBefore
+			if msgLen+baseLenBefore <= maxPayloadLen &&
+				msgLen <= gsoSize &&
+				msgLen <= freeBaseCap &&
+				dgramCnt < udpSegmentMaxDatagrams &&
+				!endBatch {
+				msgs[base].Buffers[0] = append(msgs[base].Buffers[0], make([]byte, msgLen)...)
+				copy(msgs[base].Buffers[0][baseLenBefore:], buff)
+				if i == len(buffs)-1 {
+					c.setGSOSizeInControl(&msgs[base].OOB, uint16(gsoSize))
+				}
+				dgramCnt++
+				if msgLen < gsoSize {
+					// A smaller than gsoSize packet on the tail is legal, but
+					// it must end the batch.
+					endBatch = true
+				}
+				continue
+			}
+		}
+		if dgramCnt > 1 {
+			c.setGSOSizeInControl(&msgs[base].OOB, uint16(gsoSize))
+		}
+		// Reset prior to incrementing base since we are preparing to start a
+		// new potential batch.
+		endBatch = false
+		base++
+		gsoSize = len(buff)
+		msgs[base].OOB = msgs[base].OOB[:0]
+		msgs[base].Buffers[0] = buff
+		msgs[base].Addr = addr
+		dgramCnt = 1
+	}
+	return base + 1
+}
+
+type sendBatch struct {
+	msgs []ipv6.Message
+	ua   *net.UDPAddr
+}
+
+func (c *batchingUDPConn) getSendBatch() *sendBatch {
+	batch := c.sendBatchPool.Get().(*sendBatch)
+	return batch
+}
+
+func (c *batchingUDPConn) putSendBatch(batch *sendBatch) {
+	for i := range batch.msgs {
+		batch.msgs[i] = ipv6.Message{Buffers: batch.msgs[i].Buffers, OOB: batch.msgs[i].OOB}
+	}
+	c.sendBatchPool.Put(batch)
+}
+
+func (c *batchingUDPConn) WriteBatchTo(buffs [][]byte, addr netip.AddrPort) error {
+	batch := c.getSendBatch()
+	defer c.putSendBatch(batch)
+	if addr.Addr().Is6() {
+		as16 := addr.Addr().As16()
+		copy(batch.ua.IP, as16[:])
+		batch.ua.IP = batch.ua.IP[:16]
+	} else {
+		as4 := addr.Addr().As4()
+		copy(batch.ua.IP, as4[:])
+		batch.ua.IP = batch.ua.IP[:4]
+	}
+	batch.ua.Port = int(addr.Port())
+	var (
+		n       int
+		retried bool
+	)
+retry:
+	if c.txOffload.Load() {
+		n = c.coalesceMessages(batch.ua, buffs, batch.msgs)
+	} else {
+		for i := range buffs {
+			batch.msgs[i].Buffers[0] = buffs[i]
+			batch.msgs[i].Addr = batch.ua
+			batch.msgs[i].OOB = batch.msgs[i].OOB[:0]
+		}
+		n = len(buffs)
+	}
+
+	err := c.writeBatch(batch.msgs[:n])
+	if err != nil && c.txOffload.Load() && neterror.ShouldDisableUDPGSO(err) {
+		c.txOffload.Store(false)
+		retried = true
+		goto retry
+	}
+	if retried {
+		return neterror.ErrUDPGSODisabled{OnLaddr: c.pc.LocalAddr().String(), RetryErr: err}
+	}
+	return err
+}
+
+func (c *batchingUDPConn) writeBatch(msgs []ipv6.Message) error {
+	var head int
+	for {
+		n, err := c.xpc.WriteBatch(msgs[head:], 0)
+		if err != nil || n == len(msgs[head:]) {
+			// Returning the number of packets written would require
+			// unraveling individual msg len and gso size during a coalesced
+			// write. The top of the call stack disregards partial success,
+			// so keep this simple for now.
+			return err
+		}
+		head += n
+	}
+}
+
+// splitCoalescedMessages splits coalesced messages from the tail of msgs
+// beginning at index 'firstMsgAt' into the head of the same slice. It reports
+// the number of elements to evaluate in msgs for nonzero len (msgs[i].N). An
+// error is returned if a socket control message cannot be parsed or a split
+// operation would overflow msgs.
+func (c *batchingUDPConn) splitCoalescedMessages(msgs []ipv6.Message, firstMsgAt int) (n int, err error) {
+	for i := firstMsgAt; i < len(msgs); i++ {
+		msg := &msgs[i]
+		if msg.N == 0 {
+			return n, err
+		}
+		var (
+			gsoSize    int
+			start      int
+			end        = msg.N
+			numToSplit = 1
+		)
+		gsoSize, err = c.getGSOSizeFromControl(msg.OOB[:msg.NN])
+		if err != nil {
+			return n, err
+		}
+		if gsoSize > 0 {
+			numToSplit = (msg.N + gsoSize - 1) / gsoSize
+			end = gsoSize
+		}
+		for j := 0; j < numToSplit; j++ {
+			if n > i {
+				return n, errors.New("splitting coalesced packet resulted in overflow")
+			}
+			copied := copy(msgs[n].Buffers[0], msg.Buffers[0][start:end])
+			msgs[n].N = copied
+			msgs[n].Addr = msg.Addr
+			start = end
+			end += gsoSize
+			if end > msg.N {
+				end = msg.N
+			}
+			n++
+		}
+		if i != n-1 {
+			// It is legal for bytes to move within msg.Buffers[0] as a result
+			// of splitting, so we only zero the source msg len when it is not
+			// the destination of the last split operation above.
+			msg.N = 0
+		}
+	}
+	return n, nil
+}
+
+func (c *batchingUDPConn) ReadBatch(msgs []ipv6.Message, flags int) (n int, err error) {
+	if !c.rxOffload || len(msgs) < 2 {
+		return c.xpc.ReadBatch(msgs, flags)
+	}
+	// Read into the tail of msgs, split into the head.
+	readAt := len(msgs) - 2
+	numRead, err := c.xpc.ReadBatch(msgs[readAt:], 0)
+	if err != nil || numRead == 0 {
+		return 0, err
+	}
+	return c.splitCoalescedMessages(msgs, readAt)
+}
+
+func (c *batchingUDPConn) LocalAddr() net.Addr {
+	return c.pc.LocalAddr().(*net.UDPAddr)
+}
+
+func (c *batchingUDPConn) WriteToUDPAddrPort(b []byte, addr netip.AddrPort) (int, error) {
+	return c.pc.WriteToUDPAddrPort(b, addr)
+}
+
+func (c *batchingUDPConn) Close() error {
+	return c.pc.Close()
+}
+
+// tryUpgradeToBatchingUDPConn probes the capabilities of the OS and pconn, and
+// upgrades pconn to a *batchingUDPConn if appropriate.
+func tryUpgradeToBatchingUDPConn(pconn nettype.PacketConn, network string, batchSize int) nettype.PacketConn {
+	if network != "udp4" && network != "udp6" {
+		return pconn
+	}
+	if runtime.GOOS != "linux" {
+		return pconn
+	}
+	if strings.HasPrefix(hostinfo.GetOSVersion(), "2.") {
+		// recvmmsg/sendmmsg were added in 2.6.33, but we support down to
+		// 2.6.32 for old NAS devices. See https://github.com/tailscale/tailscale/issues/6807.
+		// As a cheap heuristic: if the Linux kernel starts with "2", just
+		// consider it too old for mmsg. Nobody who cares about performance runs
+		// such ancient kernels. UDP offload was added much later, so no
+		// upgrades are available.
+		return pconn
+	}
+	uc, ok := pconn.(*net.UDPConn)
+	if !ok {
+		return pconn
+	}
+	b := &batchingUDPConn{
+		pc:                    pconn,
+		getGSOSizeFromControl: getGSOSizeFromControl,
+		setGSOSizeInControl:   setGSOSizeInControl,
+		sendBatchPool: sync.Pool{
+			New: func() any {
+				ua := &net.UDPAddr{
+					IP: make([]byte, 16),
+				}
+				msgs := make([]ipv6.Message, batchSize)
+				for i := range msgs {
+					msgs[i].Buffers = make([][]byte, 1)
+					msgs[i].Addr = ua
+					msgs[i].OOB = make([]byte, controlMessageSize)
+				}
+				return &sendBatch{
+					ua:   ua,
+					msgs: msgs,
+				}
+			},
+		},
 	}
 	switch network {
 	case "udp4":
-		ucbo.xpc = ipv4.NewPacketConn(conn)
+		b.xpc = ipv4.NewPacketConn(uc)
 	case "udp6":
-		ucbo.xpc = ipv6.NewPacketConn(conn)
+		b.xpc = ipv6.NewPacketConn(uc)
 	default:
 		panic("bogus network")
 	}
-	return ucbo
-}
-
-func (u udpConnWithBatchOps) WriteBatch(ms []ipv6.Message, flags int) (int, error) {
-	return u.xpc.WriteBatch(ms, flags)
-}
-
-func (u udpConnWithBatchOps) ReadBatch(ms []ipv6.Message, flags int) (int, error) {
-	return u.xpc.ReadBatch(ms, flags)
+	var txOffload bool
+	txOffload, b.rxOffload = tryEnableUDPOffload(uc)
+	b.txOffload.Store(txOffload)
+	return b
 }

 // RebindingUDPConn is a UDP socket that can be re-bound.
@@ -3423,34 +3697,14 @@ type RebindingUDPConn struct {
 	port  uint16
 }

-// upgradePacketConn may upgrade a nettype.PacketConn to a udpConnWithBatchOps.
-func upgradePacketConn(p nettype.PacketConn, network string) nettype.PacketConn {
-	uc, ok := p.(*net.UDPConn)
-	if ok && runtime.GOOS == "linux" && (network == "udp4" || network == "udp6") {
-		// recvmmsg/sendmmsg were added in 2.6.33 but we support down to 2.6.32
-		// for old NAS devices. See https://github.com/tailscale/tailscale/issues/6807.
-		// As a cheap heuristic: if the Linux kernel starts with "2", just consider
-		// it too old for the fast paths. Nobody who cares about performance runs such
-		// ancient kernels.
-		if strings.HasPrefix(hostinfo.GetOSVersion(), "2") {
-			return p
-		}
-		// Non-Linux does not support batch operations. x/net will fall back to
-		// recv/sendmsg, but not all platforms have recv/sendmsg support. Keep
-		// this simple for now.
-		return newUDPConnWithBatchOps(uc, network)
-	}
-	return p
-}
-
 // setConnLocked sets the provided nettype.PacketConn. It should be called only
 // after acquiring RebindingUDPConn.mu. It upgrades the provided
-// nettype.PacketConn to a udpConnWithBatchOps when appropriate. This upgrade
+// nettype.PacketConn to a *batchingUDPConn when appropriate. This upgrade
 // is intentionally pushed closest to where read/write ops occur in order to
 // avoid disrupting surrounding code that assumes nettype.PacketConn is a
 // *net.UDPConn.
-func (c *RebindingUDPConn) setConnLocked(p nettype.PacketConn, network string) {
-	upc := upgradePacketConn(p, network)
+func (c *RebindingUDPConn) setConnLocked(p nettype.PacketConn, network string, batchSize int) {
+	upc := tryUpgradeToBatchingUDPConn(p, network, batchSize)
 	c.pconn = upc
 	c.pconnAtomic.Store(&upc)
 	c.port = uint16(c.localAddrLocked().Port)
@@ -3480,83 +3734,38 @@ func (c *RebindingUDPConn) ReadFrom(b []byte) (int, net.Addr, error) {
 	return c.readFromWithInitPconn(*c.pconnAtomic.Load(), b)
 }

-// ReadFromNetaddr reads a packet from c into b.
-// It returns the number of bytes copied and the return address.
-// It is identical to c.ReadFrom, except that it returns a netip.AddrPort instead of a net.Addr.
-// ReadFromNetaddr is designed to work with specific underlying connection types.
-// If c's underlying connection returns a non-*net.UPDAddr return address, ReadFromNetaddr will return an error.
-// ReadFromNetaddr exists because it removes an allocation per read,
-// when c's underlying connection is a net.UDPConn.
-func (c *RebindingUDPConn) ReadFromNetaddr(b []byte) (n int, ipp netip.AddrPort, err error) {
+// WriteBatchTo writes buffs to addr.
+func (c *RebindingUDPConn) WriteBatchTo(buffs [][]byte, addr netip.AddrPort) error {
 	for {
 		pconn := *c.pconnAtomic.Load()
-
-		// Optimization: Treat *net.UDPConn specially.
-		// This lets us avoid allocations by calling ReadFromUDPAddrPort.
-		// The non-*net.UDPConn case works, but it allocates.
-		if udpConn, ok := pconn.(*net.UDPConn); ok {
-			n, ipp, err = udpConn.ReadFromUDPAddrPort(b)
-		} else {
-			var addr net.Addr
-			n, addr, err = pconn.ReadFrom(b)
-			pAddr, ok := addr.(*net.UDPAddr)
-			if addr != nil && !ok {
-				return 0, netip.AddrPort{}, fmt.Errorf("RebindingUDPConn.ReadFromNetaddr: underlying connection returned address of type %T, want *netaddr.UDPAddr", addr)
-			}
-			if pAddr != nil {
-				ipp = netaddr.Unmap(pAddr.AddrPort())
-				if !ipp.IsValid() {
-					return 0, netip.AddrPort{}, errors.New("netaddr.FromStdAddr failed")
-				}
-			}
-		}
-
-		if err != nil && pconn != c.currentConn() {
-			// The connection changed underfoot. Try again.
-			continue
-		}
-		return n, ipp, err
-	}
-}
-
-func (c *RebindingUDPConn) WriteBatch(msgs []ipv6.Message, flags int) (int, error) {
-	var (
-		n     int
-		err   error
-		start int
-	)
-	for {
-		pconn := *c.pconnAtomic.Load()
-		bw, ok := pconn.(batchWriter)
+		b, ok := pconn.(*batchingUDPConn)
 		if !ok {
-			for _, msg := range msgs {
-				_, err = c.writeToWithInitPconn(pconn, msg.Buffers[0], msg.Addr)
+			for _, buf := range buffs {
+				_, err := c.writeToUDPAddrPortWithInitPconn(pconn, buf, addr)
 				if err != nil {
-					return n, err
+					return err
 				}
-				n++
 			}
-			return n, nil
+			return nil
 		}
-
-		n, err = bw.WriteBatch(msgs[start:], flags)
+		err := b.WriteBatchTo(buffs, addr)
 		if err != nil {
 			if pconn != c.currentConn() {
 				continue
 			}
-			return n, err
-		} else if n == len(msgs[start:]) {
-			return len(msgs), nil
-		} else {
-			start += n
+			return err
 		}
+		return err
 	}
 }

+// ReadBatch reads messages from c into msgs. It returns the number of messages
+// the caller should evaluate for nonzero len, as a zero len message may fall
+// on either side of a nonzero.
 func (c *RebindingUDPConn) ReadBatch(msgs []ipv6.Message, flags int) (int, error) {
 	for {
 		pconn := *c.pconnAtomic.Load()
-		br, ok := pconn.(batchReader)
+		b, ok := pconn.(*batchingUDPConn)
 		if !ok {
 			var err error
 			msgs[0].N, msgs[0].Addr, err = c.readFromWithInitPconn(pconn, msgs[0].Buffers[0])
@@ -3565,7 +3774,7 @@ func (c *RebindingUDPConn) ReadBatch(msgs []ipv6.Message, flags int) (int, error
 			}
 			return 0, err
 		}
-		n, err := br.ReadBatch(msgs, flags)
+		n, err := b.ReadBatch(msgs, flags)
 		if err != nil && pconn != c.currentConn() {
 			continue
 		}
@@ -3607,9 +3816,9 @@ func (c *RebindingUDPConn) closeLocked() error {
 	return c.pconn.Close()
 }

-func (c *RebindingUDPConn) writeToWithInitPconn(pconn nettype.PacketConn, b []byte, addr net.Addr) (int, error) {
+func (c *RebindingUDPConn) writeToUDPAddrPortWithInitPconn(pconn nettype.PacketConn, b []byte, addr netip.AddrPort) (int, error) {
 	for {
-		n, err := pconn.WriteTo(b, addr)
+		n, err := pconn.WriteToUDPAddrPort(b, addr)
 		if err != nil && pconn != c.currentConn() {
 			pconn = *c.pconnAtomic.Load()
 			continue
@@ -3619,13 +3828,9 @@ func (c *RebindingUDPConn) writeToWithInitPconn(pconn nettype.PacketConn, b []by
 }

 func (c *RebindingUDPConn) WriteTo(b []byte, addr net.Addr) (int, error) {
-	return c.writeToWithInitPconn(*c.pconnAtomic.Load(), b, addr)
-}
-
-func (c *RebindingUDPConn) WriteToUDPAddrPort(b []byte, addr netip.AddrPort) (int, error) {
 	for {
 		pconn := *c.pconnAtomic.Load()
-		n, err := pconn.WriteToUDPAddrPort(b, addr)
+		n, err := pconn.WriteTo(b, addr)
 		if err != nil && pconn != c.currentConn() {
 			continue
 		}
@@ -3633,6 +3838,10 @@ func (c *RebindingUDPConn) WriteToUDPAddrPort(b []byte, addr netip.AddrPort) (in
 	}
 }

+func (c *RebindingUDPConn) WriteToUDPAddrPort(b []byte, addr netip.AddrPort) (int, error) {
+	return c.writeToUDPAddrPortWithInitPconn(*c.pconnAtomic.Load(), b, addr)
+}
+
 func newBlockForeverConn() *blockForeverConn {
 	c := new(blockForeverConn)
 	c.cond = sync.NewCond(&c.mu)
@@ -3665,20 +3874,6 @@ func (c *blockForeverConn) WriteToUDPAddrPort(p []byte, addr netip.AddrPort) (in
 	return len(p), nil
 }

-func (c *blockForeverConn) ReadBatch(p []ipv6.Message, flags int) (int, error) {
-	c.mu.Lock()
-	for !c.closed {
-		c.cond.Wait()
-	}
-	c.mu.Unlock()
-	return 0, net.ErrClosed
-}
-
-func (c *blockForeverConn) WriteBatch(p []ipv6.Message, flags int) (int, error) {
-	// Silently drop writes.
-	return len(p), nil
-}
-
 func (c *blockForeverConn) LocalAddr() net.Addr {
 	// Return a *net.UDPAddr because lots of code assumes that it will.
 	return new(net.UDPAddr)