util/usermetrics: make usermetrics non-global

this commit changes usermetrics to be non-global, this is a building
block for correct metrics if a go process runs multiple tsnets or
in tests.

Updates #13420
Updates tailscale/corp#22075

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
This commit is contained in:
Kristoffer Dalby
2024-09-23 18:34:00 +02:00
committed by Kristoffer Dalby
parent e1bbe1bf45
commit 0e0e53d3b3
27 changed files with 372 additions and 129 deletions

View File

@@ -5,6 +5,7 @@ package tsnet
import (
"bufio"
"bytes"
"context"
"crypto/ecdsa"
"crypto/elliptic"
@@ -31,7 +32,8 @@ import (
"testing"
"time"
"github.com/google/go-cmp/cmp"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"golang.org/x/net/proxy"
"tailscale.com/cmd/testwrapper/flakytest"
"tailscale.com/health"
@@ -818,65 +820,166 @@ func TestUDPConn(t *testing.T) {
}
}
// testWarnable is a Warnable that is used within this package for testing purposes only.
var testWarnable = health.Register(&health.Warnable{
Code: "test-warnable-tsnet",
Title: "Test warnable",
Severity: health.SeverityLow,
Text: func(args health.Args) string {
return args[health.ArgError]
},
})
func parseMetrics(m []byte) (map[string]float64, error) {
metrics := make(map[string]float64)
var parser expfmt.TextParser
mf, err := parser.TextToMetricFamilies(bytes.NewReader(m))
if err != nil {
return nil, err
}
for _, f := range mf {
for _, ff := range f.Metric {
val := float64(0)
switch f.GetType() {
case dto.MetricType_COUNTER:
val = ff.GetCounter().GetValue()
case dto.MetricType_GAUGE:
val = ff.GetGauge().GetValue()
}
metrics[f.GetName()+promMetricLabelsStr(ff.GetLabel())] = val
}
}
return metrics, nil
}
func promMetricLabelsStr(labels []*dto.LabelPair) string {
if len(labels) == 0 {
return ""
}
var b strings.Builder
b.WriteString("{")
for i, l := range labels {
if i > 0 {
b.WriteString(",")
}
b.WriteString(fmt.Sprintf("%s=%q", l.GetName(), l.GetValue()))
}
b.WriteString("}")
return b.String()
}
func TestUserMetrics(t *testing.T) {
flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/13420")
tstest.ResourceCheck(t)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
defer cancel()
// testWarnable is a Warnable that is used within this package for testing purposes only.
var testWarnable = health.Register(&health.Warnable{
Code: "test-warnable-tsnet",
Title: "Test warnable",
Severity: health.SeverityLow,
Text: func(args health.Args) string {
return args[health.ArgError]
},
})
controlURL, c := startControl(t)
s1, _, s1PubKey := startServer(t, ctx, controlURL, "s1")
s1, s1ip, s1PubKey := startServer(t, ctx, controlURL, "s1")
s2, _, _ := startServer(t, ctx, controlURL, "s2")
s1.lb.EditPrefs(&ipn.MaskedPrefs{
Prefs: ipn.Prefs{
AdvertiseRoutes: []netip.Prefix{
netip.MustParsePrefix("192.0.2.0/24"),
netip.MustParsePrefix("192.0.3.0/24"),
netip.MustParsePrefix("192.0.5.1/32"),
netip.MustParsePrefix("0.0.0.0/0"),
},
},
AdvertiseRoutesSet: true,
})
c.SetSubnetRoutes(s1PubKey, []netip.Prefix{netip.MustParsePrefix("192.0.2.0/24")})
c.SetSubnetRoutes(s1PubKey, []netip.Prefix{
netip.MustParsePrefix("192.0.2.0/24"),
netip.MustParsePrefix("192.0.5.1/32"),
netip.MustParsePrefix("0.0.0.0/0"),
})
lc1, err := s1.LocalClient()
if err != nil {
t.Fatal(err)
}
ht := s1.lb.HealthTracker()
ht.SetUnhealthy(testWarnable, health.Args{"Text": "Hello world 1"})
metrics1, err := lc1.UserMetrics(ctx)
lc2, err := s2.LocalClient()
if err != nil {
t.Fatal(err)
}
// Note that this test will check for two warnings because the health
// tracker will have two warnings: one from the testWarnable, added in
// this test, and one because we are running the dev/unstable version
// of tailscale.
want := `# TYPE tailscaled_advertised_routes gauge
# HELP tailscaled_advertised_routes Number of advertised network routes (e.g. by a subnet router)
tailscaled_advertised_routes 2
# TYPE tailscaled_health_messages gauge
# HELP tailscaled_health_messages Number of health messages broken down by type.
tailscaled_health_messages{type="warning"} 2
# TYPE tailscaled_inbound_dropped_packets_total counter
# HELP tailscaled_inbound_dropped_packets_total Counts the number of dropped packets received by the node from other peers
# TYPE tailscaled_outbound_dropped_packets_total counter
# HELP tailscaled_outbound_dropped_packets_total Counts the number of packets dropped while being sent to other peers
`
// ping to make sure the connection is up.
res, err := lc2.Ping(ctx, s1ip, tailcfg.PingICMP)
if err != nil {
t.Fatalf("pinging: %s", err)
}
t.Logf("ping success: %#+v", res)
if diff := cmp.Diff(want, string(metrics1)); diff != "" {
t.Fatalf("unexpected metrics (-want +got):\n%s", diff)
ht := s1.lb.HealthTracker()
ht.SetUnhealthy(testWarnable, health.Args{"Text": "Hello world 1"})
// Force an update to the netmap to ensure that the metrics are up-to-date.
s1.lb.DebugForceNetmapUpdate()
s2.lb.DebugForceNetmapUpdate()
ctxLc, cancelLc := context.WithTimeout(context.Background(), 5*time.Second)
defer cancelLc()
metrics1, err := lc1.UserMetrics(ctxLc)
if err != nil {
t.Fatal(err)
}
status1, err := lc1.Status(ctxLc)
if err != nil {
t.Fatal(err)
}
parsedMetrics1, err := parseMetrics(metrics1)
if err != nil {
t.Fatal(err)
}
t.Logf("Metrics1:\n%s\n", metrics1)
// The node is advertising 4 routes:
// - 192.0.2.0/24
// - 192.0.3.0/24
// - 192.0.5.1/32
if got, want := parsedMetrics1["tailscaled_advertised_routes"], 3.0; got != want {
t.Errorf("metrics1, tailscaled_advertised_routes: got %v, want %v", got, want)
}
// Validate the health counter metric against the status of the node
if got, want := parsedMetrics1[`tailscaled_health_messages{type="warning"}`], float64(len(status1.Health)); got != want {
t.Errorf("metrics1, tailscaled_health_messages: got %v, want %v", got, want)
}
metrics2, err := lc2.UserMetrics(ctx)
if err != nil {
t.Fatal(err)
}
status2, err := lc2.Status(ctx)
if err != nil {
t.Fatal(err)
}
parsedMetrics2, err := parseMetrics(metrics2)
if err != nil {
t.Fatal(err)
}
t.Logf("Metrics2:\n%s\n", metrics2)
// The node is advertising 0 routes
if got, want := parsedMetrics2["tailscaled_advertised_routes"], 0.0; got != want {
t.Errorf("metrics2, tailscaled_advertised_routes: got %v, want %v", got, want)
}
// Validate the health counter metric against the status of the node
if got, want := parsedMetrics2[`tailscaled_health_messages{type="warning"}`], float64(len(status2.Health)); got != want {
t.Errorf("metrics2, tailscaled_health_messages: got %v, want %v", got, want)
}
}