net/connstats: invert network logging data flow (#6272)

Previously, tstun.Wrapper and magicsock.Conn managed their
own statistics data structure and relied on an external call to
Extract to extract (and reset) the statistics.
This makes it difficult to ensure a maximum size on the statistics
as the caller has no introspection into whether the number
of unique connections is getting too large.

Invert the control flow such that a *connstats.Statistics
is registered with tstun.Wrapper and magicsock.Conn.
Methods on non-nil *connstats.Statistics are called for every packet.
This allows the implementation of connstats.Statistics (in the future)
to better control when it needs to flush to ensure
bounds on maximum sizes.

The value registered into tstun.Wrapper and magicsock.Conn could
be an interface, but that has two performance detriments:

1. Method calls on interface values are more expensive since
they must go through a virtual method dispatch.

2. The implementation would need a sync.Mutex to protect the
statistics value instead of using an atomic.Pointer.

Given that methods on constats.Statistics are called for every packet,
we want reduce the CPU cost on this hot path.

Signed-off-by: Joe Tsai <joetsai@digital-static.net>
This commit is contained in:
Joe Tsai
2022-11-28 15:59:33 -08:00
committed by GitHub
parent 35c10373b5
commit 2e5d08ec4f
10 changed files with 172 additions and 252 deletions

View File

@@ -20,6 +20,7 @@ import (
"golang.org/x/sync/errgroup"
"tailscale.com/logpolicy"
"tailscale.com/logtail"
"tailscale.com/net/connstats"
"tailscale.com/net/tsaddr"
"tailscale.com/smallzstd"
"tailscale.com/tailcfg"
@@ -34,14 +35,12 @@ const pollPeriod = 5 * time.Second
// *tstun.Wrapper implements this interface.
// *magicsock.Conn implements this interface.
type Device interface {
SetStatisticsEnabled(bool)
ExtractStatistics() map[netlogtype.Connection]netlogtype.Counts
SetStatistics(*connstats.Statistics)
}
type noopDevice struct{}
func (noopDevice) SetStatisticsEnabled(bool) {}
func (noopDevice) ExtractStatistics() map[netlogtype.Connection]netlogtype.Counts { return nil }
func (noopDevice) SetStatistics(*connstats.Statistics) {}
// Logger logs statistics about every connection.
// At present, it only logs connections within a tailscale network.
@@ -130,16 +129,15 @@ func (nl *Logger) Startup(nodeID tailcfg.StableNodeID, nodeLogID, domainLogID lo
}, log.Printf)
nl.logger = logger
stats := new(connstats.Statistics)
ctx, cancel := context.WithCancel(context.Background())
nl.cancel = cancel
nl.group.Go(func() error {
tun.SetStatisticsEnabled(true)
defer tun.SetStatisticsEnabled(false)
tun.ExtractStatistics() // clear out any stale statistics
tun.SetStatistics(stats)
defer tun.SetStatistics(nil)
sock.SetStatisticsEnabled(true)
defer sock.SetStatisticsEnabled(false)
sock.ExtractStatistics() // clear out any stale statistics
sock.SetStatistics(stats)
defer sock.SetStatistics(nil)
start := time.Now()
ticker := time.NewTicker(pollPeriod)
@@ -147,22 +145,20 @@ func (nl *Logger) Startup(nodeID tailcfg.StableNodeID, nodeLogID, domainLogID lo
var end time.Time
select {
case <-ctx.Done():
tun.SetStatisticsEnabled(false)
end = time.Now()
case end = <-ticker.C:
}
// NOTE: tunStats and sockStats will always be slightly out-of-sync.
// NOTE: connstats and sockStats will always be slightly out-of-sync.
// It is impossible to have an atomic snapshot of statistics
// at both layers without a global mutex that spans all layers.
tunStats := tun.ExtractStatistics()
sockStats := sock.ExtractStatistics()
if len(tunStats)+len(sockStats) > 0 {
connstats, sockStats := stats.Extract()
if len(connstats)+len(sockStats) > 0 {
nl.mu.Lock()
addrs := nl.addrs
prefixes := nl.prefixes
nl.mu.Unlock()
recordStatistics(logger, nodeID, start, end, tunStats, sockStats, addrs, prefixes)
recordStatistics(logger, nodeID, start, end, connstats, sockStats, addrs, prefixes)
}
if ctx.Err() != nil {
@@ -175,7 +171,7 @@ func (nl *Logger) Startup(nodeID tailcfg.StableNodeID, nodeLogID, domainLogID lo
return nil
}
func recordStatistics(logger *logtail.Logger, nodeID tailcfg.StableNodeID, start, end time.Time, tunStats, sockStats map[netlogtype.Connection]netlogtype.Counts, addrs map[netip.Addr]bool, prefixes map[netip.Prefix]bool) {
func recordStatistics(logger *logtail.Logger, nodeID tailcfg.StableNodeID, start, end time.Time, connstats, sockStats map[netlogtype.Connection]netlogtype.Counts, addrs map[netip.Addr]bool, prefixes map[netip.Prefix]bool) {
m := netlogtype.Message{NodeID: nodeID, Start: start.UTC(), End: end.UTC()}
classifyAddr := func(a netip.Addr) (isTailscale, withinRoute bool) {
@@ -194,7 +190,7 @@ func recordStatistics(logger *logtail.Logger, nodeID tailcfg.StableNodeID, start
}
exitTraffic := make(map[netlogtype.Connection]netlogtype.Counts)
for conn, cnts := range tunStats {
for conn, cnts := range connstats {
srcIsTailscaleIP, srcWithinSubnet := classifyAddr(conn.Src.Addr())
dstIsTailscaleIP, dstWithinSubnet := classifyAddr(conn.Dst.Addr())
switch {

View File

@@ -1,66 +0,0 @@
// Copyright (c) 2022 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package netlog
import (
"context"
"net/http"
"testing"
qt "github.com/frankban/quicktest"
"tailscale.com/logtail"
"tailscale.com/tstest"
"tailscale.com/types/netlogtype"
"tailscale.com/util/must"
"tailscale.com/wgengine/router"
)
func init() {
testClient = &http.Client{Transport: &roundTripper}
}
var roundTripper roundTripperFunc
type roundTripperFunc struct {
F func(*http.Request) (*http.Response, error)
}
func (f roundTripperFunc) RoundTrip(r *http.Request) (*http.Response, error) {
return f.F(r)
}
type fakeDevice struct {
toggled int // even => disabled, odd => enabled
}
func (d *fakeDevice) SetStatisticsEnabled(enable bool) {
if enabled := d.toggled%2 == 1; enabled != enable {
d.toggled++
}
}
func (fakeDevice) ExtractStatistics() map[netlogtype.Connection]netlogtype.Counts {
// TODO(dsnet): Add a test that verifies that statistics are correctly
// extracted from the device and uploaded. Unfortunately,
// we can't reliably run this test until we fix http://go/oss/5856.
return nil
}
func TestResourceCheck(t *testing.T) {
roundTripper.F = func(r *http.Request) (*http.Response, error) {
return &http.Response{StatusCode: 200}, nil
}
c := qt.New(t)
tstest.ResourceCheck(t)
var l Logger
var d fakeDevice
for i := 0; i < 10; i++ {
must.Do(l.Startup("", logtail.PrivateID{}, logtail.PrivateID{}, &d, nil))
l.ReconfigRoutes(&router.Config{})
must.Do(l.Shutdown(context.Background()))
c.Assert(d.toggled, qt.Equals, 2*(i+1))
}
}