derp: change packets_dropped metric to also have reason and kind labels (#14651)

Metrics currently exist for dropped packets by reason, and total
received packets by kind (e.g., `disco` or `other`), but relating these
two together to glean information about the drop rate for specific
reasons on a per-kind basis is not currently possible.

Change `derp_packets_dropped` to use a `metrics.MultiLabelMap` to
track both the `reason` and `kind` in the same metric to allow for this
desired level of granularity.

Drop metrics that this makes unnecessary (namely `packetsDroppedReason`
and `packetsDroppedType`).

Updates https://github.com/tailscale/corp/issues/25489

Signed-off-by: Mario Minardi <mario@tailscale.com>
This commit is contained in:
Mario Minardi 2025-01-16 12:21:33 -07:00 committed by GitHub
parent 7d73a38b40
commit de5683f7c6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 126 additions and 131 deletions

View File

@ -112,6 +112,14 @@ const (
disableFighters disableFighters
) )
// packetKind is the kind of packet being sent through DERP
type packetKind string
const (
packetKindDisco packetKind = "disco"
packetKindOther packetKind = "other"
)
type align64 [0]atomic.Int64 // for side effect of its 64-bit alignment type align64 [0]atomic.Int64 // for side effect of its 64-bit alignment
// Server is a DERP server. // Server is a DERP server.
@ -131,44 +139,37 @@ type Server struct {
debug bool debug bool
// Counters: // Counters:
packetsSent, bytesSent expvar.Int packetsSent, bytesSent expvar.Int
packetsRecv, bytesRecv expvar.Int packetsRecv, bytesRecv expvar.Int
packetsRecvByKind metrics.LabelMap packetsRecvByKind metrics.LabelMap
packetsRecvDisco *expvar.Int packetsRecvDisco *expvar.Int
packetsRecvOther *expvar.Int packetsRecvOther *expvar.Int
_ align64 _ align64
packetsDropped expvar.Int packetsForwardedOut expvar.Int
packetsDroppedReason metrics.LabelMap packetsForwardedIn expvar.Int
packetsDroppedReasonCounters []*expvar.Int // indexed by dropReason peerGoneDisconnectedFrames expvar.Int // number of peer disconnected frames sent
packetsDroppedType metrics.LabelMap peerGoneNotHereFrames expvar.Int // number of peer not here frames sent
packetsDroppedTypeDisco *expvar.Int gotPing expvar.Int // number of ping frames from client
packetsDroppedTypeOther *expvar.Int sentPong expvar.Int // number of pong frames enqueued to client
_ align64 accepts expvar.Int
packetsForwardedOut expvar.Int curClients expvar.Int
packetsForwardedIn expvar.Int curClientsNotIdeal expvar.Int
peerGoneDisconnectedFrames expvar.Int // number of peer disconnected frames sent curHomeClients expvar.Int // ones with preferred
peerGoneNotHereFrames expvar.Int // number of peer not here frames sent dupClientKeys expvar.Int // current number of public keys we have 2+ connections for
gotPing expvar.Int // number of ping frames from client dupClientConns expvar.Int // current number of connections sharing a public key
sentPong expvar.Int // number of pong frames enqueued to client dupClientConnTotal expvar.Int // total number of accepted connections when a dup key existed
accepts expvar.Int unknownFrames expvar.Int
curClients expvar.Int homeMovesIn expvar.Int // established clients announce home server moves in
curClientsNotIdeal expvar.Int homeMovesOut expvar.Int // established clients announce home server moves out
curHomeClients expvar.Int // ones with preferred multiForwarderCreated expvar.Int
dupClientKeys expvar.Int // current number of public keys we have 2+ connections for multiForwarderDeleted expvar.Int
dupClientConns expvar.Int // current number of connections sharing a public key removePktForwardOther expvar.Int
dupClientConnTotal expvar.Int // total number of accepted connections when a dup key existed sclientWriteTimeouts expvar.Int
unknownFrames expvar.Int avgQueueDuration *uint64 // In milliseconds; accessed atomically
homeMovesIn expvar.Int // established clients announce home server moves in tcpRtt metrics.LabelMap // histogram
homeMovesOut expvar.Int // established clients announce home server moves out meshUpdateBatchSize *metrics.Histogram
multiForwarderCreated expvar.Int meshUpdateLoopCount *metrics.Histogram
multiForwarderDeleted expvar.Int bufferedWriteFrames *metrics.Histogram // how many sendLoop frames (or groups of related frames) get written per flush
removePktForwardOther expvar.Int
sclientWriteTimeouts expvar.Int
avgQueueDuration *uint64 // In milliseconds; accessed atomically
tcpRtt metrics.LabelMap // histogram
meshUpdateBatchSize *metrics.Histogram
meshUpdateLoopCount *metrics.Histogram
bufferedWriteFrames *metrics.Histogram // how many sendLoop frames (or groups of related frames) get written per flush
// verifyClientsLocalTailscaled only accepts client connections to the DERP // verifyClientsLocalTailscaled only accepts client connections to the DERP
// server if the clientKey is a known peer in the network, as specified by a // server if the clientKey is a known peer in the network, as specified by a
@ -351,6 +352,11 @@ type Conn interface {
SetWriteDeadline(time.Time) error SetWriteDeadline(time.Time) error
} }
var packetsDropped = metrics.NewMultiLabelMap[dropReasonKindLabels](
"derp_packets_dropped",
"counter",
"DERP packets dropped by reason and by kind")
// NewServer returns a new DERP server. It doesn't listen on its own. // NewServer returns a new DERP server. It doesn't listen on its own.
// Connections are given to it via Server.Accept. // Connections are given to it via Server.Accept.
func NewServer(privateKey key.NodePrivate, logf logger.Logf) *Server { func NewServer(privateKey key.NodePrivate, logf logger.Logf) *Server {
@ -358,61 +364,81 @@ func NewServer(privateKey key.NodePrivate, logf logger.Logf) *Server {
runtime.ReadMemStats(&ms) runtime.ReadMemStats(&ms)
s := &Server{ s := &Server{
debug: envknob.Bool("DERP_DEBUG_LOGS"), debug: envknob.Bool("DERP_DEBUG_LOGS"),
privateKey: privateKey, privateKey: privateKey,
publicKey: privateKey.Public(), publicKey: privateKey.Public(),
logf: logf, logf: logf,
limitedLogf: logger.RateLimitedFn(logf, 30*time.Second, 5, 100), limitedLogf: logger.RateLimitedFn(logf, 30*time.Second, 5, 100),
packetsRecvByKind: metrics.LabelMap{Label: "kind"}, packetsRecvByKind: metrics.LabelMap{Label: "kind"},
packetsDroppedReason: metrics.LabelMap{Label: "reason"}, clients: map[key.NodePublic]*clientSet{},
packetsDroppedType: metrics.LabelMap{Label: "type"}, clientsMesh: map[key.NodePublic]PacketForwarder{},
clients: map[key.NodePublic]*clientSet{}, netConns: map[Conn]chan struct{}{},
clientsMesh: map[key.NodePublic]PacketForwarder{}, memSys0: ms.Sys,
netConns: map[Conn]chan struct{}{}, watchers: set.Set[*sclient]{},
memSys0: ms.Sys, peerGoneWatchers: map[key.NodePublic]set.HandleSet[func(key.NodePublic)]{},
watchers: set.Set[*sclient]{}, avgQueueDuration: new(uint64),
peerGoneWatchers: map[key.NodePublic]set.HandleSet[func(key.NodePublic)]{}, tcpRtt: metrics.LabelMap{Label: "le"},
avgQueueDuration: new(uint64), meshUpdateBatchSize: metrics.NewHistogram([]float64{0, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000}),
tcpRtt: metrics.LabelMap{Label: "le"}, meshUpdateLoopCount: metrics.NewHistogram([]float64{0, 1, 2, 5, 10, 20, 50, 100}),
meshUpdateBatchSize: metrics.NewHistogram([]float64{0, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000}), bufferedWriteFrames: metrics.NewHistogram([]float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 50, 100}),
meshUpdateLoopCount: metrics.NewHistogram([]float64{0, 1, 2, 5, 10, 20, 50, 100}), keyOfAddr: map[netip.AddrPort]key.NodePublic{},
bufferedWriteFrames: metrics.NewHistogram([]float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 50, 100}), clock: tstime.StdClock{},
keyOfAddr: map[netip.AddrPort]key.NodePublic{},
clock: tstime.StdClock{},
} }
s.initMetacert() s.initMetacert()
s.packetsRecvDisco = s.packetsRecvByKind.Get("disco") s.packetsRecvDisco = s.packetsRecvByKind.Get(string(packetKindDisco))
s.packetsRecvOther = s.packetsRecvByKind.Get("other") s.packetsRecvOther = s.packetsRecvByKind.Get(string(packetKindOther))
s.packetsDroppedReasonCounters = s.genPacketsDroppedReasonCounters() genPacketsDroppedCounters()
s.packetsDroppedTypeDisco = s.packetsDroppedType.Get("disco")
s.packetsDroppedTypeOther = s.packetsDroppedType.Get("other")
s.perClientSendQueueDepth = getPerClientSendQueueDepth() s.perClientSendQueueDepth = getPerClientSendQueueDepth()
return s return s
} }
func (s *Server) genPacketsDroppedReasonCounters() []*expvar.Int { func genPacketsDroppedCounters() {
getMetric := s.packetsDroppedReason.Get initMetrics := func(reason dropReason) {
ret := []*expvar.Int{ packetsDropped.Add(dropReasonKindLabels{
dropReasonUnknownDest: getMetric("unknown_dest"), Kind: string(packetKindDisco),
dropReasonUnknownDestOnFwd: getMetric("unknown_dest_on_fwd"), Reason: string(reason),
dropReasonGoneDisconnected: getMetric("gone_disconnected"), }, 0)
dropReasonQueueHead: getMetric("queue_head"), packetsDropped.Add(dropReasonKindLabels{
dropReasonQueueTail: getMetric("queue_tail"), Kind: string(packetKindOther),
dropReasonWriteError: getMetric("write_error"), Reason: string(reason),
dropReasonDupClient: getMetric("dup_client"), }, 0)
} }
if len(ret) != int(numDropReasons) { getMetrics := func(reason dropReason) []expvar.Var {
panic("dropReason metrics out of sync") return []expvar.Var{
packetsDropped.Get(dropReasonKindLabels{
Kind: string(packetKindDisco),
Reason: string(reason),
}),
packetsDropped.Get(dropReasonKindLabels{
Kind: string(packetKindOther),
Reason: string(reason),
}),
}
} }
for i := range numDropReasons {
if ret[i] == nil { dropReasons := []dropReason{
dropReasonUnknownDest,
dropReasonUnknownDestOnFwd,
dropReasonGoneDisconnected,
dropReasonQueueHead,
dropReasonQueueTail,
dropReasonWriteError,
dropReasonDupClient,
}
for _, dr := range dropReasons {
initMetrics(dr)
m := getMetrics(dr)
if len(m) != 2 {
panic("dropReason metrics out of sync")
}
if m[0] == nil || m[1] == nil {
panic("dropReason metrics out of sync") panic("dropReason metrics out of sync")
} }
} }
return ret
} }
// SetMesh sets the pre-shared key that regional DERP servers used to mesh // SetMesh sets the pre-shared key that regional DERP servers used to mesh
@ -1152,31 +1178,36 @@ func (c *sclient) debugLogf(format string, v ...any) {
} }
} }
// dropReason is why we dropped a DERP frame. type dropReasonKindLabels struct {
type dropReason int Reason string // metric label corresponding to a given dropReason
Kind string // either `disco` or `other`
}
//go:generate go run tailscale.com/cmd/addlicense -file dropreason_string.go go run golang.org/x/tools/cmd/stringer -type=dropReason -trimprefix=dropReason // dropReason is why we dropped a DERP frame.
type dropReason string
const ( const (
dropReasonUnknownDest dropReason = iota // unknown destination pubkey dropReasonUnknownDest dropReason = "unknown_dest" // unknown destination pubkey
dropReasonUnknownDestOnFwd // unknown destination pubkey on a derp-forwarded packet dropReasonUnknownDestOnFwd dropReason = "unknown_dest_on_fwd" // unknown destination pubkey on a derp-forwarded packet
dropReasonGoneDisconnected // destination tailscaled disconnected before we could send dropReasonGoneDisconnected dropReason = "gone_disconnected" // destination tailscaled disconnected before we could send
dropReasonQueueHead // destination queue is full, dropped packet at queue head dropReasonQueueHead dropReason = "queue_head" // destination queue is full, dropped packet at queue head
dropReasonQueueTail // destination queue is full, dropped packet at queue tail dropReasonQueueTail dropReason = "queue_tail" // destination queue is full, dropped packet at queue tail
dropReasonWriteError // OS write() failed dropReasonWriteError dropReason = "write_error" // OS write() failed
dropReasonDupClient // the public key is connected 2+ times (active/active, fighting) dropReasonDupClient dropReason = "dup_client" // the public key is connected 2+ times (active/active, fighting)
numDropReasons // unused; keep last
) )
func (s *Server) recordDrop(packetBytes []byte, srcKey, dstKey key.NodePublic, reason dropReason) { func (s *Server) recordDrop(packetBytes []byte, srcKey, dstKey key.NodePublic, reason dropReason) {
s.packetsDropped.Add(1) labels := dropReasonKindLabels{
s.packetsDroppedReasonCounters[reason].Add(1) Reason: string(reason),
}
looksDisco := disco.LooksLikeDiscoWrapper(packetBytes) looksDisco := disco.LooksLikeDiscoWrapper(packetBytes)
if looksDisco { if looksDisco {
s.packetsDroppedTypeDisco.Add(1) labels.Kind = string(packetKindDisco)
} else { } else {
s.packetsDroppedTypeOther.Add(1) labels.Kind = string(packetKindOther)
} }
packetsDropped.Add(labels, 1)
if verboseDropKeys[dstKey] { if verboseDropKeys[dstKey] {
// Preformat the log string prior to calling limitedLogf. The // Preformat the log string prior to calling limitedLogf. The
// limiter acts based on the format string, and we want to // limiter acts based on the format string, and we want to
@ -2095,9 +2126,6 @@ func (s *Server) ExpVar() expvar.Var {
m.Set("accepts", &s.accepts) m.Set("accepts", &s.accepts)
m.Set("bytes_received", &s.bytesRecv) m.Set("bytes_received", &s.bytesRecv)
m.Set("bytes_sent", &s.bytesSent) m.Set("bytes_sent", &s.bytesSent)
m.Set("packets_dropped", &s.packetsDropped)
m.Set("counter_packets_dropped_reason", &s.packetsDroppedReason)
m.Set("counter_packets_dropped_type", &s.packetsDroppedType)
m.Set("counter_packets_received_kind", &s.packetsRecvByKind) m.Set("counter_packets_received_kind", &s.packetsRecvByKind)
m.Set("packets_sent", &s.packetsSent) m.Set("packets_sent", &s.packetsSent)
m.Set("packets_received", &s.packetsRecv) m.Set("packets_received", &s.packetsRecv)

View File

@ -1,33 +0,0 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
// Code generated by "stringer -type=dropReason -trimprefix=dropReason"; DO NOT EDIT.
package derp
import "strconv"
// _ is never called. It exists only so that the compiler re-checks, on every
// build, that each dropReason constant still has the numeric value listed
// below.
func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
// x has exactly one element, so each constant index expression below only
// compiles when it evaluates to 0, i.e. when the named constant equals the
// literal subtracted from it.
var x [1]struct{}
_ = x[dropReasonUnknownDest-0]
_ = x[dropReasonUnknownDestOnFwd-1]
_ = x[dropReasonGoneDisconnected-2]
_ = x[dropReasonQueueHead-3]
_ = x[dropReasonQueueTail-4]
_ = x[dropReasonWriteError-5]
_ = x[dropReasonDupClient-6]
_ = x[numDropReasons-7]
}
const _dropReason_name = "UnknownDestUnknownDestOnFwdGoneDisconnectedQueueHeadQueueTailWriteErrorDupClientnumDropReasons"
var _dropReason_index = [...]uint8{0, 11, 27, 43, 52, 61, 71, 80, 94}
// String returns the textual name of i, taken from _dropReason_name using the
// pair of adjacent offsets stored in _dropReason_index. A value that falls
// outside the index table is rendered as "dropReason(N)", where N is its
// decimal integer value.
func (i dropReason) String() string {
	// The index table holds one more offset than there are names.
	if i >= 0 && i < dropReason(len(_dropReason_index)-1) {
		start, end := _dropReason_index[i], _dropReason_index[i+1]
		return _dropReason_name[start:end]
	}
	return "dropReason(" + strconv.FormatInt(int64(i), 10) + ")"
}