// Copyright (c) 2022 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package tunstats maintains statistics about connections
// flowing through a TUN device (which operates at the IP layer).
package tunstats

import (
	"encoding/binary"
	"hash/maphash"
	"math/bits"
	"net/netip"
	"sync"
	"sync/atomic"

	"tailscale.com/net/flowtrack"
	"tailscale.com/types/ipproto"
)

// Statistics maintains counters for every connection.
// All methods are safe for concurrent use.
// The zero value is ready for use.
type Statistics struct {
	v4 hashTable[addrsPortsV4]
	v6 hashTable[addrsPortsV6]
}

// Counts are statistics about a particular connection.
type Counts struct {
	TxPackets uint64 `json:"txPkts,omitempty"`
	TxBytes   uint64 `json:"txBytes,omitempty"`
	RxPackets uint64 `json:"rxPkts,omitempty"`
	RxBytes   uint64 `json:"rxBytes,omitempty"`
}

const (
	minTableLen = 8
	maxProbeLen = 64
)

// hashTable is a hash table that uses open addressing with probing.
// See https://en.wikipedia.org/wiki/Hash_table#Open_addressing.
// The primary table is in the active field and can be retrieved atomically.
// In the common case, this data structure is mostly lock free.
//
// If the current table is too small, a new table is allocated that
// replaces the current active table. The contents of the older table are
// NOT copied to the new table, but rather the older table is appended
// to a list of outgrown tables. Re-growth happens under a lock,
// but is expected to happen rarely as the table size grows exponentially.
//
// To reduce memory usage, the counters use 32-bit unsigned integers,
// which carry the risk of overflowing. If an overflow is detected,
// we add the amount overflowed to the overflow map. This is a naive Go map
// protected by a sync.Mutex. Overflow is rare enough that contention is not a concern.
//
// To extract all counters, we replace the active table with a zeroed table,
// and clear out the outgrown and overflow tables.
// We take advantage of the fact that all the tables can be merged together
// by simply adding up all the counters for each connection.
type hashTable[AddrsPorts addrsPorts] struct {
	// TODO: Get rid of this. It is just an atomic update in the common case,
	// but contention updating the same word still incurs a 25% performance hit.
	mu sync.RWMutex // RLock held while updating, Lock held while extracting

	active  atomic.Pointer[countsTable[AddrsPorts]]
	inserts atomic.Uint32 // heuristic for next active table to allocate

	muGrow   sync.Mutex // muGrow.Lock implies that mu.RLock held
	outgrown []countsTable[AddrsPorts]

	muOverflow sync.Mutex // muOverflow.Lock implies that mu.RLock held
	overflow   map[flowtrack.Tuple]Counts
}

type countsTable[AddrsPorts addrsPorts] []counts[AddrsPorts]

func (t *countsTable[AddrsPorts]) len() int {
	if t == nil {
		return 0
	}
	return len(*t)
}

type counts[AddrsPorts addrsPorts] struct {
	// initProto is both an initialization flag and the IP protocol.
	// It is 0 if uninitialized, 1 if initializing, and
	// 2+ipproto.Proto if initialized.
	initProto  atomic.Uint32
	addrsPorts AddrsPorts // only valid if initProto is initialized
	txPackets  atomic.Uint32
	txBytes    atomic.Uint32
	rxPackets  atomic.Uint32
	rxBytes    atomic.Uint32
}

// NOTE: There is some degree of duplicated code.
// For example, the functionality to swap the addrsPorts and compute the hash
// should be performed by hashTable.update rather than Statistics.update.
// However, Go generics cannot invoke pointer methods on addressable values.
// See https://go.googlesource.com/proposal/+/refs/heads/master/design/43651-type-parameters.md#no-way-to-require-pointer-methods
type addrsPorts interface {
	comparable
	asTuple(ipproto.Proto) flowtrack.Tuple
}

type addrsPortsV4 [4 + 4 + 2 + 2]byte

func (x *addrsPortsV4) addrs() *[8]byte { return (*[8]byte)(x[:]) }
func (x *addrsPortsV4) ports() *[4]byte { return (*[4]byte)(x[8:]) }
func (x *addrsPortsV4) swap() {
	*(*[4]byte)(x[0:]), *(*[4]byte)(x[4:]) = *(*[4]byte)(x[4:]), *(*[4]byte)(x[0:])
	*(*[2]byte)(x[8:]), *(*[2]byte)(x[10:]) = *(*[2]byte)(x[10:]), *(*[2]byte)(x[8:])
}
func (x addrsPortsV4) asTuple(proto ipproto.Proto) flowtrack.Tuple {
	return flowtrack.Tuple{Proto: proto,
		Src: netip.AddrPortFrom(netip.AddrFrom4(*(*[4]byte)(x[0:])), binary.BigEndian.Uint16(x[8:])),
		Dst: netip.AddrPortFrom(netip.AddrFrom4(*(*[4]byte)(x[4:])), binary.BigEndian.Uint16(x[10:])),
	}
}

type addrsPortsV6 [16 + 16 + 2 + 2]byte

func (x *addrsPortsV6) addrs() *[32]byte { return (*[32]byte)(x[:]) }
func (x *addrsPortsV6) ports() *[4]byte  { return (*[4]byte)(x[32:]) }
func (x *addrsPortsV6) swap() {
	*(*[16]byte)(x[0:]), *(*[16]byte)(x[16:]) = *(*[16]byte)(x[16:]), *(*[16]byte)(x[0:])
	*(*[2]byte)(x[32:]), *(*[2]byte)(x[34:]) = *(*[2]byte)(x[34:]), *(*[2]byte)(x[32:])
}
func (x addrsPortsV6) asTuple(proto ipproto.Proto) flowtrack.Tuple {
	return flowtrack.Tuple{Proto: proto,
		Src: netip.AddrPortFrom(netip.AddrFrom16(*(*[16]byte)(x[0:])), binary.BigEndian.Uint16(x[32:])),
		Dst: netip.AddrPortFrom(netip.AddrFrom16(*(*[16]byte)(x[16:])), binary.BigEndian.Uint16(x[34:])),
	}
}

// UpdateTx updates the statistics for a transmitted IP packet.
func (s *Statistics) UpdateTx(b []byte) {
	s.update(b, false)
}

// UpdateRx updates the statistics for a received IP packet.
func (s *Statistics) UpdateRx(b []byte) {
	s.update(b, true)
}

var seed = maphash.MakeSeed()

func (s *Statistics) update(b []byte, receive bool) {
	switch {
	case len(b) >= 20 && b[0]>>4 == 4: // IPv4
		proto := ipproto.Proto(b[9])
		hasPorts := proto == ipproto.TCP || proto == ipproto.UDP
		var addrsPorts addrsPortsV4
		if hdrLen := int(4 * (b[0] & 0xf)); hdrLen == 20 && len(b) >= 24 && hasPorts {
			addrsPorts = *(*addrsPortsV4)(b[12:]) // addresses and ports are contiguous
		} else {
			*addrsPorts.addrs() = *(*[8]byte)(b[12:])
			// May have IPv4 options in-between addresses and ports.
			if len(b) >= hdrLen+4 && hasPorts {
				*addrsPorts.ports() = *(*[4]byte)(b[hdrLen:])
			}
		}
		if receive {
			addrsPorts.swap()
		}
		hash := maphash.Bytes(seed, addrsPorts[:]) ^ uint64(proto) // TODO: Hash proto better?
		s.v4.update(receive, proto, &addrsPorts, hash, uint32(len(b)))
		return
	case len(b) >= 40 && b[0]>>4 == 6: // IPv6
		proto := ipproto.Proto(b[6])
		hasPorts := proto == ipproto.TCP || proto == ipproto.UDP
		var addrsPorts addrsPortsV6
		if len(b) >= 44 && hasPorts {
			addrsPorts = *(*addrsPortsV6)(b[8:]) // addresses and ports are contiguous
		} else {
			*addrsPorts.addrs() = *(*[32]byte)(b[8:])
			// TODO: Support IPv6 extension headers?
			if hdrLen := 40; len(b) > hdrLen+4 && hasPorts {
				*addrsPorts.ports() = *(*[4]byte)(b[hdrLen:])
			}
		}
		if receive {
			addrsPorts.swap()
		}
		hash := maphash.Bytes(seed, addrsPorts[:]) ^ uint64(proto) // TODO: Hash proto better?
		s.v6.update(receive, proto, &addrsPorts, hash, uint32(len(b)))
		return
	}
	// TODO: Track malformed packets?
}

func (h *hashTable[AddrsPorts]) update(receive bool, proto ipproto.Proto, addrsPorts *AddrsPorts, hash uint64, size uint32) {
	h.mu.RLock()
	defer h.mu.RUnlock()
	table := h.active.Load()
	for {
		// Start with an initialized table.
		if table.len() == 0 {
			table = h.grow(table)
		}

		// Try to update an entry in the currently active table.
		for i := 0; i < len(*table) && i < maxProbeLen; i++ {
			probe := uint64(i) // linear probing for small tables
			if len(*table) > 2*maxProbeLen {
				probe *= probe // quadratic probing for large tables
			}
			entry := &(*table)[(hash+probe)%uint64(len(*table))]

			// Spin-lock waiting for the entry to be initialized,
			// which should be quick as it only stores the AddrsPorts.
		retry:
			switch initProto := entry.initProto.Load(); initProto {
			case 0: // uninitialized
				if !entry.initProto.CompareAndSwap(0, 1) {
					goto retry // raced with another initialization attempt
				}
				entry.addrsPorts = *addrsPorts
				entry.initProto.Store(uint32(proto) + 2) // initialization done
				h.inserts.Add(1)
			case 1: // initializing
				goto retry
			default: // initialized
				if ipproto.Proto(initProto-2) != proto || entry.addrsPorts != *addrsPorts {
					continue // this entry is for a different connection; try next entry
				}
			}

			// Atomically update the counters for the connection entry.
			var overflowPackets, overflowBytes bool
			if receive {
				overflowPackets = entry.rxPackets.Add(1) < 1
				overflowBytes = entry.rxBytes.Add(size) < size
			} else {
				overflowPackets = entry.txPackets.Add(1) < 1
				overflowBytes = entry.txBytes.Add(size) < size
			}
			if overflowPackets || overflowBytes {
				h.updateOverflow(receive, proto, addrsPorts, overflowPackets, overflowBytes)
			}
			return
		}

		// Unable to update, so grow the table and try again.
		// TODO: Use overflow map instead if table utilization is too low.
		table = h.grow(table)
	}
}

// grow grows the table unless the active table is larger than oldTable.
func (h *hashTable[AddrsPorts]) grow(oldTable *countsTable[AddrsPorts]) (newTable *countsTable[AddrsPorts]) {
	h.muGrow.Lock()
	defer h.muGrow.Unlock()
	if newTable = h.active.Load(); newTable.len() > oldTable.len() {
		return newTable // raced with another grow
	}
	newTable = new(countsTable[AddrsPorts])
	if oldTable.len() == 0 {
		*newTable = make(countsTable[AddrsPorts], minTableLen)
	} else {
		*newTable = make(countsTable[AddrsPorts], 2*len(*oldTable))
		h.outgrown = append(h.outgrown, *oldTable)
	}
	h.active.Store(newTable)
	return newTable
}

// updateOverflow updates the overflow map for counters that overflowed.
// Using 32-bit counters, this condition happens rarely as it only triggers
// after every 4 GiB of unidirectional network traffic on the same connection.
func (h *hashTable[AddrsPorts]) updateOverflow(receive bool, proto ipproto.Proto, addrsPorts *AddrsPorts, overflowPackets, overflowBytes bool) {
	h.muOverflow.Lock()
	defer h.muOverflow.Unlock()
	if h.overflow == nil {
		h.overflow = make(map[flowtrack.Tuple]Counts)
	}
	tuple := (*addrsPorts).asTuple(proto)
	cnts := h.overflow[tuple]
	if overflowPackets {
		if receive {
			cnts.RxPackets += 1 << 32
		} else {
			cnts.TxPackets += 1 << 32
		}
	}
	if overflowBytes {
		if receive {
			cnts.RxBytes += 1 << 32
		} else {
			cnts.TxBytes += 1 << 32
		}
	}
	h.overflow[tuple] = cnts
}

func (h *hashTable[AddrsPorts]) extractInto(out map[flowtrack.Tuple]Counts) {
	// Allocate a new table based on previous usage.
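	// Size it for the number of inserts seen in the last cycle: take 4/3 of
	// that count and round up to a power of two (keeping the load factor
	// under 3/4), with a floor of minTableLen.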
	var newTable *countsTable[AddrsPorts]
	if numInserts := h.inserts.Load(); numInserts > 0 {
		newLen := 1 << bits.Len(uint(4*numInserts/3)|uint(minTableLen-1))
		newTable = new(countsTable[AddrsPorts])
		*newTable = make(countsTable[AddrsPorts], newLen)
	}

	// Swap out the old tables for new tables.
	// We do not need to lock h.muGrow or h.muOverflow since holding h.mu
	// implies that nothing else could be holding those locks.
	h.mu.Lock()
	oldTable := h.active.Swap(newTable)
	oldOutgrown := h.outgrown
	oldOverflow := h.overflow
	h.outgrown = nil
	h.overflow = nil
	h.inserts.Store(0)
	h.mu.Unlock()

	// Merge tables into output.
	if oldTable != nil {
		mergeTable(out, *oldTable)
	}
	for _, table := range oldOutgrown {
		mergeTable(out, table)
	}
	mergeMap(out, oldOverflow)
}

// Extract extracts and resets the counters for all active connections.
// It must be called periodically; otherwise the memory used is unbounded.
func (s *Statistics) Extract() map[flowtrack.Tuple]Counts {
	out := make(map[flowtrack.Tuple]Counts)
	s.v4.extractInto(out)
	s.v6.extractInto(out)
	return out
}

func mergeTable[AddrsPorts addrsPorts](dst map[flowtrack.Tuple]Counts, src countsTable[AddrsPorts]) {
	for i := range src {
		entry := &src[i]
		if initProto := entry.initProto.Load(); initProto > 0 {
			tuple := entry.addrsPorts.asTuple(ipproto.Proto(initProto - 2))
			cnts := dst[tuple]
			cnts.TxPackets += uint64(entry.txPackets.Load())
			cnts.TxBytes += uint64(entry.txBytes.Load())
			cnts.RxPackets += uint64(entry.rxPackets.Load())
			cnts.RxBytes += uint64(entry.rxBytes.Load())
			dst[tuple] = cnts
		}
	}
}

func mergeMap(dst, src map[flowtrack.Tuple]Counts) {
	for tuple, cntsSrc := range src {
		cntsDst := dst[tuple]
		cntsDst.TxPackets += cntsSrc.TxPackets
		cntsDst.TxBytes += cntsSrc.TxBytes
		cntsDst.RxPackets += cntsSrc.RxPackets
		cntsDst.RxBytes += cntsSrc.RxBytes
		dst[tuple] = cntsDst
	}
}
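
// A minimal usage sketch, assuming the caller already has raw IP packet bytes
// from a TUN device; readPacketFromTUN and the 5-second interval below are
// hypothetical stand-ins, not provided by this package:
//
//	var stats tunstats.Statistics
//
//	// On the data path, record every packet crossing the TUN device.
//	go func() {
//		for {
//			pkt := readPacketFromTUN() // hypothetical source of raw IP packets
//			stats.UpdateRx(pkt)        // use stats.UpdateTx(pkt) for transmitted packets
//		}
//	}()
//
//	// Periodically extract (and reset) the per-connection counters so that
//	// memory usage stays bounded.
//	for range time.Tick(5 * time.Second) {
//		for tuple, cnts := range stats.Extract() {
//			log.Printf("%v: tx=%d pkts/%d bytes, rx=%d pkts/%d bytes",
//				tuple, cnts.TxPackets, cnts.TxBytes, cnts.RxPackets, cnts.RxBytes)
//		}
//	}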