cmd/stunstamp: add explicit metric to track timeout events (#12564)

Timeouts could already be identified as NaN values on
stunstamp_derp_stun_rtt_ns, but we can't use NaN effectively with
promql to visualize them. So, this commit adds a timeouts metric that
we can use with rate/delta/etc promql functions.

Updates tailscale/corp#20689

Signed-off-by: Jordan Whited <jordan@tailscale.com>
This commit is contained in:
Jordan Whited 2024-06-21 09:17:35 -07:00 committed by GitHub
parent dcb0f189cc
commit 0d6e71df70
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -89,13 +89,19 @@ func (t timestampSource) String() string {
} }
} }
type result struct { // resultKey contains the stable dimensions and their values for a given
at time.Time // timeseries, i.e. not time and not rtt/timeout.
type resultKey struct {
meta nodeMeta meta nodeMeta
timestampSource timestampSource timestampSource timestampSource
connStability connStability connStability connStability
dstPort int dstPort int
rtt *time.Duration // nil signifies failure, e.g. timeout }
type result struct {
key resultKey
at time.Time
rtt *time.Duration // nil signifies failure, e.g. timeout
} }
func measureRTT(conn io.ReadWriteCloser, dst *net.UDPAddr) (rtt time.Duration, err error) { func measureRTT(conn io.ReadWriteCloser, dst *net.UDPAddr) (rtt time.Duration, err error) {
@ -149,6 +155,10 @@ type nodeMeta struct {
type measureFn func(conn io.ReadWriteCloser, dst *net.UDPAddr) (rtt time.Duration, err error) type measureFn func(conn io.ReadWriteCloser, dst *net.UDPAddr) (rtt time.Duration, err error)
// probe measures STUN round trip time for the node described by meta over
// conn against dstPort. It may return a nil duration and nil error if the
// STUN request timed out. A non-nil error indicates an unrecoverable or
// non-temporary error.
func probe(meta nodeMeta, conn io.ReadWriteCloser, fn measureFn, dstPort int) (*time.Duration, error) { func probe(meta nodeMeta, conn io.ReadWriteCloser, fn measureFn, dstPort int) (*time.Duration, error) {
ua := &net.UDPAddr{ ua := &net.UDPAddr{
IP: net.IP(meta.addr.AsSlice()), IP: net.IP(meta.addr.AsSlice()),
@ -162,10 +172,15 @@ func probe(meta nodeMeta, conn io.ReadWriteCloser, fn measureFn, dstPort int) (*
log.Printf("temp error measuring RTT to %s(%s): %v", meta.hostname, ua.String(), err) log.Printf("temp error measuring RTT to %s(%s): %v", meta.hostname, ua.String(), err)
return nil, nil return nil, nil
} }
return nil, err
} }
return &rtt, nil return &rtt, nil
} }
// nodeMetaFromDERPMap parses the provided DERP map in order to update nodeMeta
// in the provided nodeMetaByAddr. It returns a slice of nodeMeta containing
// the nodes that are no longer seen in the DERP map, but were previously held
// in nodeMetaByAddr.
func nodeMetaFromDERPMap(dm *tailcfg.DERPMap, nodeMetaByAddr map[netip.Addr]nodeMeta, ipv6 bool) (stale []nodeMeta, err error) { func nodeMetaFromDERPMap(dm *tailcfg.DERPMap, nodeMetaByAddr map[netip.Addr]nodeMeta, ipv6 bool) (stale []nodeMeta, err error) {
// Parse the new derp map before making any state changes in nodeMetaByAddr. // Parse the new derp map before making any state changes in nodeMetaByAddr.
// If parse fails we just stick with the old state. // If parse fails we just stick with the old state.
@ -271,10 +286,12 @@ func probeNodes(nodeMetaByAddr map[netip.Addr]nodeMeta, stableConns map[netip.Ad
doProbe := func(conn io.ReadWriteCloser, meta nodeMeta, source timestampSource, dstPort int) { doProbe := func(conn io.ReadWriteCloser, meta nodeMeta, source timestampSource, dstPort int) {
defer wg.Done() defer wg.Done()
r := result{ r := result{
at: at, key: resultKey{
meta: meta, meta: meta,
timestampSource: source, timestampSource: source,
dstPort: dstPort, dstPort: dstPort,
},
at: at,
} }
if conn == nil { if conn == nil {
var err error var err error
@ -293,7 +310,7 @@ func probeNodes(nodeMetaByAddr map[netip.Addr]nodeMeta, stableConns map[netip.Ad
} }
defer conn.Close() defer conn.Close()
} else { } else {
r.connStability = stableConn r.key.connStability = stableConn
} }
fn := measureRTT fn := measureRTT
if source == timestampSourceKernel { if source == timestampSourceKernel {
@ -373,7 +390,12 @@ const (
stableConn connStability = true stableConn connStability = true
) )
func timeSeriesLabels(meta nodeMeta, instance string, source timestampSource, stability connStability, dstPort int) []prompb.Label { const (
rttMetricName = "stunstamp_derp_stun_rtt_ns"
timeoutsMetricName = "stunstamp_derp_stun_timeouts_total"
)
func timeSeriesLabels(metricName string, meta nodeMeta, instance string, source timestampSource, stability connStability, dstPort int) []prompb.Label {
addressFamily := "ipv4" addressFamily := "ipv4"
if meta.addr.Is6() { if meta.addr.Is6() {
addressFamily = "ipv6" addressFamily = "ipv6"
@ -409,7 +431,7 @@ func timeSeriesLabels(meta nodeMeta, instance string, source timestampSource, st
}) })
labels = append(labels, prompb.Label{ labels = append(labels, prompb.Label{
Name: "__name__", Name: "__name__",
Value: "stunstamp_derp_stun_rtt_ns", Value: metricName,
}) })
labels = append(labels, prompb.Label{ labels = append(labels, prompb.Label{
Name: "timestamp_source", Name: "timestamp_source",
@ -443,20 +465,36 @@ func staleMarkersFromNodeMeta(stale []nodeMeta, instance string, dstPorts []int)
}, },
} }
staleMarkers = append(staleMarkers, prompb.TimeSeries{ staleMarkers = append(staleMarkers, prompb.TimeSeries{
Labels: timeSeriesLabels(s, instance, timestampSourceUserspace, unstableConn, dstPort), Labels: timeSeriesLabels(rttMetricName, s, instance, timestampSourceUserspace, unstableConn, dstPort),
Samples: samples, Samples: samples,
}) })
staleMarkers = append(staleMarkers, prompb.TimeSeries{ staleMarkers = append(staleMarkers, prompb.TimeSeries{
Labels: timeSeriesLabels(s, instance, timestampSourceUserspace, stableConn, dstPort), Labels: timeSeriesLabels(rttMetricName, s, instance, timestampSourceUserspace, stableConn, dstPort),
Samples: samples,
})
staleMarkers = append(staleMarkers, prompb.TimeSeries{
Labels: timeSeriesLabels(timeoutsMetricName, s, instance, timestampSourceUserspace, unstableConn, dstPort),
Samples: samples,
})
staleMarkers = append(staleMarkers, prompb.TimeSeries{
Labels: timeSeriesLabels(timeoutsMetricName, s, instance, timestampSourceUserspace, stableConn, dstPort),
Samples: samples, Samples: samples,
}) })
if supportsKernelTS() { if supportsKernelTS() {
staleMarkers = append(staleMarkers, prompb.TimeSeries{ staleMarkers = append(staleMarkers, prompb.TimeSeries{
Labels: timeSeriesLabels(s, instance, timestampSourceKernel, unstableConn, dstPort), Labels: timeSeriesLabels(rttMetricName, s, instance, timestampSourceKernel, unstableConn, dstPort),
Samples: samples, Samples: samples,
}) })
staleMarkers = append(staleMarkers, prompb.TimeSeries{ staleMarkers = append(staleMarkers, prompb.TimeSeries{
Labels: timeSeriesLabels(s, instance, timestampSourceKernel, stableConn, dstPort), Labels: timeSeriesLabels(rttMetricName, s, instance, timestampSourceKernel, stableConn, dstPort),
Samples: samples,
})
staleMarkers = append(staleMarkers, prompb.TimeSeries{
Labels: timeSeriesLabels(timeoutsMetricName, s, instance, timestampSourceKernel, unstableConn, dstPort),
Samples: samples,
})
staleMarkers = append(staleMarkers, prompb.TimeSeries{
Labels: timeSeriesLabels(timeoutsMetricName, s, instance, timestampSourceKernel, stableConn, dstPort),
Samples: samples, Samples: samples,
}) })
} }
@ -465,25 +503,47 @@ func staleMarkersFromNodeMeta(stale []nodeMeta, instance string, dstPorts []int)
return staleMarkers return staleMarkers
} }
func resultToPromTimeSeries(r result, instance string) prompb.TimeSeries { // resultsToPromTimeSeries returns a slice of prometheus TimeSeries for the
labels := timeSeriesLabels(r.meta, instance, r.timestampSource, r.connStability, r.dstPort) // provided results and instance. timeouts is updated based on results, i.e.
samples := make([]prompb.Sample, 1) // all result.key's are added to timeouts if they do not exist, and removed
samples[0].Timestamp = r.at.UnixMilli() // from timeouts if they are not present in results.
if r.rtt != nil { func resultsToPromTimeSeries(results []result, instance string, timeouts map[resultKey]uint64) []prompb.TimeSeries {
samples[0].Value = float64(*r.rtt) all := make([]prompb.TimeSeries, 0, len(results)*2)
} else { seenKeys := make(map[resultKey]bool)
samples[0].Value = math.NaN() for _, r := range results {
// TODO: timeout counter timeoutsCount := timeouts[r.key] // a non-existent key will return a zero val
seenKeys[r.key] = true
rttLabels := timeSeriesLabels(rttMetricName, r.key.meta, instance, r.key.timestampSource, r.key.connStability, r.key.dstPort)
rttSamples := make([]prompb.Sample, 1)
rttSamples[0].Timestamp = r.at.UnixMilli()
if r.rtt != nil {
rttSamples[0].Value = float64(*r.rtt)
} else {
rttSamples[0].Value = math.NaN()
timeoutsCount++
}
rttTS := prompb.TimeSeries{
Labels: rttLabels,
Samples: rttSamples,
}
all = append(all, rttTS)
timeouts[r.key] = timeoutsCount
timeoutsLabels := timeSeriesLabels(timeoutsMetricName, r.key.meta, instance, r.key.timestampSource, r.key.connStability, r.key.dstPort)
timeoutsSamples := make([]prompb.Sample, 1)
timeoutsSamples[0].Timestamp = r.at.UnixMilli()
timeoutsSamples[0].Value = float64(timeoutsCount)
timeoutsTS := prompb.TimeSeries{
Labels: timeoutsLabels,
Samples: timeoutsSamples,
}
all = append(all, timeoutsTS)
} }
ts := prompb.TimeSeries{ for k := range timeouts {
Labels: labels, if !seenKeys[k] {
Samples: samples, delete(timeouts, k)
}
} }
slices.SortFunc(ts.Labels, func(a, b prompb.Label) int { return all
// prometheus remote-write spec requires lexicographically sorted label names
return cmp.Compare(a.Name, b.Name)
})
return ts
} }
type remoteWriteClient struct { type remoteWriteClient struct {
@ -719,6 +779,10 @@ CREATE TABLE IF NOT EXISTS rtt(at_unix INT, region_id INT, hostname TEXT, af INT
// comes into play. // comes into play.
stableConns := make(map[netip.Addr]map[int][2]io.ReadWriteCloser) stableConns := make(map[netip.Addr]map[int][2]io.ReadWriteCloser)
// timeouts holds counts of timeout events. Values are persisted for the
// lifetime of the related node in the DERP map.
timeouts := make(map[resultKey]uint64)
derpMapTicker := time.NewTicker(time.Minute * 5) derpMapTicker := time.NewTicker(time.Minute * 5)
defer derpMapTicker.Stop() defer derpMapTicker.Stop()
probeTicker := time.NewTicker(*flagInterval) probeTicker := time.NewTicker(*flagInterval)
@ -744,10 +808,7 @@ CREATE TABLE IF NOT EXISTS rtt(at_unix INT, region_id INT, hostname TEXT, af INT
shutdown() shutdown()
return return
} }
ts := make([]prompb.TimeSeries, 0, len(results)) ts := resultsToPromTimeSeries(results, *flagInstance, timeouts)
for _, r := range results {
ts = append(ts, resultToPromTimeSeries(r, *flagInstance))
}
select { select {
case tsCh <- ts: case tsCh <- ts:
default: default:
@ -766,11 +827,11 @@ CREATE TABLE IF NOT EXISTS rtt(at_unix INT, region_id INT, hostname TEXT, af INT
} }
for _, result := range results { for _, result := range results {
af := 4 af := 4
if result.meta.addr.Is6() { if result.key.meta.addr.Is6() {
af = 6 af = 6
} }
_, err = tx.Exec("INSERT INTO rtt(at_unix, region_id, hostname, af, address, timestamp_source, stable_conn, dst_port, rtt_ns) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)", _, err = tx.Exec("INSERT INTO rtt(at_unix, region_id, hostname, af, address, timestamp_source, stable_conn, dst_port, rtt_ns) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)",
result.at.Unix(), result.meta.regionID, result.meta.hostname, af, result.meta.addr.String(), result.timestampSource, result.connStability, result.dstPort, result.rtt) result.at.Unix(), result.key.meta.regionID, result.key.meta.hostname, af, result.key.meta.addr.String(), result.key.timestampSource, result.key.connStability, result.key.dstPort, result.rtt)
if err != nil { if err != nil {
tx.Rollback() tx.Rollback()
log.Printf("error adding result to tx: %v", err) log.Printf("error adding result to tx: %v", err)