mirror of
https://github.com/tailscale/tailscale.git
synced 2025-05-06 15:46:53 +00:00
derp: change packets_dropped metric to also have reason and kind labels (#14651)
Metrics currently exist for dropped packets by reason, and total received packets by kind (e.g., `disco` or `other`), but relating these two together to glean information about the drop rate for specific reasons on a per-kind basis is not currently possible. Change `derp_packets_dropped` to use a `metrics.MultiLabelMap` to track both the `reason` and `kind` in the same metric to allow for this desired level of granularity. Drop metrics that this makes unnecessary (namely `packetsDroppedReason` and `packetsDroppedType`). Updates https://github.com/tailscale/corp/issues/25489 Signed-off-by: Mario Minardi <mario@tailscale.com>
This commit is contained in:
parent
7d73a38b40
commit
de5683f7c6
@ -112,6 +112,14 @@ const (
|
|||||||
disableFighters
|
disableFighters
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// packetKind is the kind of packet being sent through DERP
|
||||||
|
type packetKind string
|
||||||
|
|
||||||
|
const (
|
||||||
|
packetKindDisco packetKind = "disco"
|
||||||
|
packetKindOther packetKind = "other"
|
||||||
|
)
|
||||||
|
|
||||||
type align64 [0]atomic.Int64 // for side effect of its 64-bit alignment
|
type align64 [0]atomic.Int64 // for side effect of its 64-bit alignment
|
||||||
|
|
||||||
// Server is a DERP server.
|
// Server is a DERP server.
|
||||||
@ -131,44 +139,37 @@ type Server struct {
|
|||||||
debug bool
|
debug bool
|
||||||
|
|
||||||
// Counters:
|
// Counters:
|
||||||
packetsSent, bytesSent expvar.Int
|
packetsSent, bytesSent expvar.Int
|
||||||
packetsRecv, bytesRecv expvar.Int
|
packetsRecv, bytesRecv expvar.Int
|
||||||
packetsRecvByKind metrics.LabelMap
|
packetsRecvByKind metrics.LabelMap
|
||||||
packetsRecvDisco *expvar.Int
|
packetsRecvDisco *expvar.Int
|
||||||
packetsRecvOther *expvar.Int
|
packetsRecvOther *expvar.Int
|
||||||
_ align64
|
_ align64
|
||||||
packetsDropped expvar.Int
|
packetsForwardedOut expvar.Int
|
||||||
packetsDroppedReason metrics.LabelMap
|
packetsForwardedIn expvar.Int
|
||||||
packetsDroppedReasonCounters []*expvar.Int // indexed by dropReason
|
peerGoneDisconnectedFrames expvar.Int // number of peer disconnected frames sent
|
||||||
packetsDroppedType metrics.LabelMap
|
peerGoneNotHereFrames expvar.Int // number of peer not here frames sent
|
||||||
packetsDroppedTypeDisco *expvar.Int
|
gotPing expvar.Int // number of ping frames from client
|
||||||
packetsDroppedTypeOther *expvar.Int
|
sentPong expvar.Int // number of pong frames enqueued to client
|
||||||
_ align64
|
accepts expvar.Int
|
||||||
packetsForwardedOut expvar.Int
|
curClients expvar.Int
|
||||||
packetsForwardedIn expvar.Int
|
curClientsNotIdeal expvar.Int
|
||||||
peerGoneDisconnectedFrames expvar.Int // number of peer disconnected frames sent
|
curHomeClients expvar.Int // ones with preferred
|
||||||
peerGoneNotHereFrames expvar.Int // number of peer not here frames sent
|
dupClientKeys expvar.Int // current number of public keys we have 2+ connections for
|
||||||
gotPing expvar.Int // number of ping frames from client
|
dupClientConns expvar.Int // current number of connections sharing a public key
|
||||||
sentPong expvar.Int // number of pong frames enqueued to client
|
dupClientConnTotal expvar.Int // total number of accepted connections when a dup key existed
|
||||||
accepts expvar.Int
|
unknownFrames expvar.Int
|
||||||
curClients expvar.Int
|
homeMovesIn expvar.Int // established clients announce home server moves in
|
||||||
curClientsNotIdeal expvar.Int
|
homeMovesOut expvar.Int // established clients announce home server moves out
|
||||||
curHomeClients expvar.Int // ones with preferred
|
multiForwarderCreated expvar.Int
|
||||||
dupClientKeys expvar.Int // current number of public keys we have 2+ connections for
|
multiForwarderDeleted expvar.Int
|
||||||
dupClientConns expvar.Int // current number of connections sharing a public key
|
removePktForwardOther expvar.Int
|
||||||
dupClientConnTotal expvar.Int // total number of accepted connections when a dup key existed
|
sclientWriteTimeouts expvar.Int
|
||||||
unknownFrames expvar.Int
|
avgQueueDuration *uint64 // In milliseconds; accessed atomically
|
||||||
homeMovesIn expvar.Int // established clients announce home server moves in
|
tcpRtt metrics.LabelMap // histogram
|
||||||
homeMovesOut expvar.Int // established clients announce home server moves out
|
meshUpdateBatchSize *metrics.Histogram
|
||||||
multiForwarderCreated expvar.Int
|
meshUpdateLoopCount *metrics.Histogram
|
||||||
multiForwarderDeleted expvar.Int
|
bufferedWriteFrames *metrics.Histogram // how many sendLoop frames (or groups of related frames) get written per flush
|
||||||
removePktForwardOther expvar.Int
|
|
||||||
sclientWriteTimeouts expvar.Int
|
|
||||||
avgQueueDuration *uint64 // In milliseconds; accessed atomically
|
|
||||||
tcpRtt metrics.LabelMap // histogram
|
|
||||||
meshUpdateBatchSize *metrics.Histogram
|
|
||||||
meshUpdateLoopCount *metrics.Histogram
|
|
||||||
bufferedWriteFrames *metrics.Histogram // how many sendLoop frames (or groups of related frames) get written per flush
|
|
||||||
|
|
||||||
// verifyClientsLocalTailscaled only accepts client connections to the DERP
|
// verifyClientsLocalTailscaled only accepts client connections to the DERP
|
||||||
// server if the clientKey is a known peer in the network, as specified by a
|
// server if the clientKey is a known peer in the network, as specified by a
|
||||||
@ -351,6 +352,11 @@ type Conn interface {
|
|||||||
SetWriteDeadline(time.Time) error
|
SetWriteDeadline(time.Time) error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var packetsDropped = metrics.NewMultiLabelMap[dropReasonKindLabels](
|
||||||
|
"derp_packets_dropped",
|
||||||
|
"counter",
|
||||||
|
"DERP packets dropped by reason and by kind")
|
||||||
|
|
||||||
// NewServer returns a new DERP server. It doesn't listen on its own.
|
// NewServer returns a new DERP server. It doesn't listen on its own.
|
||||||
// Connections are given to it via Server.Accept.
|
// Connections are given to it via Server.Accept.
|
||||||
func NewServer(privateKey key.NodePrivate, logf logger.Logf) *Server {
|
func NewServer(privateKey key.NodePrivate, logf logger.Logf) *Server {
|
||||||
@ -358,61 +364,81 @@ func NewServer(privateKey key.NodePrivate, logf logger.Logf) *Server {
|
|||||||
runtime.ReadMemStats(&ms)
|
runtime.ReadMemStats(&ms)
|
||||||
|
|
||||||
s := &Server{
|
s := &Server{
|
||||||
debug: envknob.Bool("DERP_DEBUG_LOGS"),
|
debug: envknob.Bool("DERP_DEBUG_LOGS"),
|
||||||
privateKey: privateKey,
|
privateKey: privateKey,
|
||||||
publicKey: privateKey.Public(),
|
publicKey: privateKey.Public(),
|
||||||
logf: logf,
|
logf: logf,
|
||||||
limitedLogf: logger.RateLimitedFn(logf, 30*time.Second, 5, 100),
|
limitedLogf: logger.RateLimitedFn(logf, 30*time.Second, 5, 100),
|
||||||
packetsRecvByKind: metrics.LabelMap{Label: "kind"},
|
packetsRecvByKind: metrics.LabelMap{Label: "kind"},
|
||||||
packetsDroppedReason: metrics.LabelMap{Label: "reason"},
|
clients: map[key.NodePublic]*clientSet{},
|
||||||
packetsDroppedType: metrics.LabelMap{Label: "type"},
|
clientsMesh: map[key.NodePublic]PacketForwarder{},
|
||||||
clients: map[key.NodePublic]*clientSet{},
|
netConns: map[Conn]chan struct{}{},
|
||||||
clientsMesh: map[key.NodePublic]PacketForwarder{},
|
memSys0: ms.Sys,
|
||||||
netConns: map[Conn]chan struct{}{},
|
watchers: set.Set[*sclient]{},
|
||||||
memSys0: ms.Sys,
|
peerGoneWatchers: map[key.NodePublic]set.HandleSet[func(key.NodePublic)]{},
|
||||||
watchers: set.Set[*sclient]{},
|
avgQueueDuration: new(uint64),
|
||||||
peerGoneWatchers: map[key.NodePublic]set.HandleSet[func(key.NodePublic)]{},
|
tcpRtt: metrics.LabelMap{Label: "le"},
|
||||||
avgQueueDuration: new(uint64),
|
meshUpdateBatchSize: metrics.NewHistogram([]float64{0, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000}),
|
||||||
tcpRtt: metrics.LabelMap{Label: "le"},
|
meshUpdateLoopCount: metrics.NewHistogram([]float64{0, 1, 2, 5, 10, 20, 50, 100}),
|
||||||
meshUpdateBatchSize: metrics.NewHistogram([]float64{0, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000}),
|
bufferedWriteFrames: metrics.NewHistogram([]float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 50, 100}),
|
||||||
meshUpdateLoopCount: metrics.NewHistogram([]float64{0, 1, 2, 5, 10, 20, 50, 100}),
|
keyOfAddr: map[netip.AddrPort]key.NodePublic{},
|
||||||
bufferedWriteFrames: metrics.NewHistogram([]float64{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 50, 100}),
|
clock: tstime.StdClock{},
|
||||||
keyOfAddr: map[netip.AddrPort]key.NodePublic{},
|
|
||||||
clock: tstime.StdClock{},
|
|
||||||
}
|
}
|
||||||
s.initMetacert()
|
s.initMetacert()
|
||||||
s.packetsRecvDisco = s.packetsRecvByKind.Get("disco")
|
s.packetsRecvDisco = s.packetsRecvByKind.Get(string(packetKindDisco))
|
||||||
s.packetsRecvOther = s.packetsRecvByKind.Get("other")
|
s.packetsRecvOther = s.packetsRecvByKind.Get(string(packetKindOther))
|
||||||
|
|
||||||
s.packetsDroppedReasonCounters = s.genPacketsDroppedReasonCounters()
|
genPacketsDroppedCounters()
|
||||||
|
|
||||||
s.packetsDroppedTypeDisco = s.packetsDroppedType.Get("disco")
|
|
||||||
s.packetsDroppedTypeOther = s.packetsDroppedType.Get("other")
|
|
||||||
|
|
||||||
s.perClientSendQueueDepth = getPerClientSendQueueDepth()
|
s.perClientSendQueueDepth = getPerClientSendQueueDepth()
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) genPacketsDroppedReasonCounters() []*expvar.Int {
|
func genPacketsDroppedCounters() {
|
||||||
getMetric := s.packetsDroppedReason.Get
|
initMetrics := func(reason dropReason) {
|
||||||
ret := []*expvar.Int{
|
packetsDropped.Add(dropReasonKindLabels{
|
||||||
dropReasonUnknownDest: getMetric("unknown_dest"),
|
Kind: string(packetKindDisco),
|
||||||
dropReasonUnknownDestOnFwd: getMetric("unknown_dest_on_fwd"),
|
Reason: string(reason),
|
||||||
dropReasonGoneDisconnected: getMetric("gone_disconnected"),
|
}, 0)
|
||||||
dropReasonQueueHead: getMetric("queue_head"),
|
packetsDropped.Add(dropReasonKindLabels{
|
||||||
dropReasonQueueTail: getMetric("queue_tail"),
|
Kind: string(packetKindOther),
|
||||||
dropReasonWriteError: getMetric("write_error"),
|
Reason: string(reason),
|
||||||
dropReasonDupClient: getMetric("dup_client"),
|
}, 0)
|
||||||
}
|
}
|
||||||
if len(ret) != int(numDropReasons) {
|
getMetrics := func(reason dropReason) []expvar.Var {
|
||||||
panic("dropReason metrics out of sync")
|
return []expvar.Var{
|
||||||
|
packetsDropped.Get(dropReasonKindLabels{
|
||||||
|
Kind: string(packetKindDisco),
|
||||||
|
Reason: string(reason),
|
||||||
|
}),
|
||||||
|
packetsDropped.Get(dropReasonKindLabels{
|
||||||
|
Kind: string(packetKindOther),
|
||||||
|
Reason: string(reason),
|
||||||
|
}),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for i := range numDropReasons {
|
|
||||||
if ret[i] == nil {
|
dropReasons := []dropReason{
|
||||||
|
dropReasonUnknownDest,
|
||||||
|
dropReasonUnknownDestOnFwd,
|
||||||
|
dropReasonGoneDisconnected,
|
||||||
|
dropReasonQueueHead,
|
||||||
|
dropReasonQueueTail,
|
||||||
|
dropReasonWriteError,
|
||||||
|
dropReasonDupClient,
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, dr := range dropReasons {
|
||||||
|
initMetrics(dr)
|
||||||
|
m := getMetrics(dr)
|
||||||
|
if len(m) != 2 {
|
||||||
|
panic("dropReason metrics out of sync")
|
||||||
|
}
|
||||||
|
|
||||||
|
if m[0] == nil || m[1] == nil {
|
||||||
panic("dropReason metrics out of sync")
|
panic("dropReason metrics out of sync")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ret
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// SetMesh sets the pre-shared key that regional DERP servers used to mesh
|
// SetMesh sets the pre-shared key that regional DERP servers used to mesh
|
||||||
@ -1152,31 +1178,36 @@ func (c *sclient) debugLogf(format string, v ...any) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// dropReason is why we dropped a DERP frame.
|
type dropReasonKindLabels struct {
|
||||||
type dropReason int
|
Reason string // metric label corresponding to a given dropReason
|
||||||
|
Kind string // either `disco` or `other`
|
||||||
|
}
|
||||||
|
|
||||||
//go:generate go run tailscale.com/cmd/addlicense -file dropreason_string.go go run golang.org/x/tools/cmd/stringer -type=dropReason -trimprefix=dropReason
|
// dropReason is why we dropped a DERP frame.
|
||||||
|
type dropReason string
|
||||||
|
|
||||||
const (
|
const (
|
||||||
dropReasonUnknownDest dropReason = iota // unknown destination pubkey
|
dropReasonUnknownDest dropReason = "unknown_dest" // unknown destination pubkey
|
||||||
dropReasonUnknownDestOnFwd // unknown destination pubkey on a derp-forwarded packet
|
dropReasonUnknownDestOnFwd dropReason = "unknown_dest_on_fwd" // unknown destination pubkey on a derp-forwarded packet
|
||||||
dropReasonGoneDisconnected // destination tailscaled disconnected before we could send
|
dropReasonGoneDisconnected dropReason = "gone_disconnected" // destination tailscaled disconnected before we could send
|
||||||
dropReasonQueueHead // destination queue is full, dropped packet at queue head
|
dropReasonQueueHead dropReason = "queue_head" // destination queue is full, dropped packet at queue head
|
||||||
dropReasonQueueTail // destination queue is full, dropped packet at queue tail
|
dropReasonQueueTail dropReason = "queue_tail" // destination queue is full, dropped packet at queue tail
|
||||||
dropReasonWriteError // OS write() failed
|
dropReasonWriteError dropReason = "write_error" // OS write() failed
|
||||||
dropReasonDupClient // the public key is connected 2+ times (active/active, fighting)
|
dropReasonDupClient dropReason = "dup_client" // the public key is connected 2+ times (active/active, fighting)
|
||||||
numDropReasons // unused; keep last
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func (s *Server) recordDrop(packetBytes []byte, srcKey, dstKey key.NodePublic, reason dropReason) {
|
func (s *Server) recordDrop(packetBytes []byte, srcKey, dstKey key.NodePublic, reason dropReason) {
|
||||||
s.packetsDropped.Add(1)
|
labels := dropReasonKindLabels{
|
||||||
s.packetsDroppedReasonCounters[reason].Add(1)
|
Reason: string(reason),
|
||||||
|
}
|
||||||
looksDisco := disco.LooksLikeDiscoWrapper(packetBytes)
|
looksDisco := disco.LooksLikeDiscoWrapper(packetBytes)
|
||||||
if looksDisco {
|
if looksDisco {
|
||||||
s.packetsDroppedTypeDisco.Add(1)
|
labels.Kind = string(packetKindDisco)
|
||||||
} else {
|
} else {
|
||||||
s.packetsDroppedTypeOther.Add(1)
|
labels.Kind = string(packetKindOther)
|
||||||
}
|
}
|
||||||
|
packetsDropped.Add(labels, 1)
|
||||||
|
|
||||||
if verboseDropKeys[dstKey] {
|
if verboseDropKeys[dstKey] {
|
||||||
// Preformat the log string prior to calling limitedLogf. The
|
// Preformat the log string prior to calling limitedLogf. The
|
||||||
// limiter acts based on the format string, and we want to
|
// limiter acts based on the format string, and we want to
|
||||||
@ -2095,9 +2126,6 @@ func (s *Server) ExpVar() expvar.Var {
|
|||||||
m.Set("accepts", &s.accepts)
|
m.Set("accepts", &s.accepts)
|
||||||
m.Set("bytes_received", &s.bytesRecv)
|
m.Set("bytes_received", &s.bytesRecv)
|
||||||
m.Set("bytes_sent", &s.bytesSent)
|
m.Set("bytes_sent", &s.bytesSent)
|
||||||
m.Set("packets_dropped", &s.packetsDropped)
|
|
||||||
m.Set("counter_packets_dropped_reason", &s.packetsDroppedReason)
|
|
||||||
m.Set("counter_packets_dropped_type", &s.packetsDroppedType)
|
|
||||||
m.Set("counter_packets_received_kind", &s.packetsRecvByKind)
|
m.Set("counter_packets_received_kind", &s.packetsRecvByKind)
|
||||||
m.Set("packets_sent", &s.packetsSent)
|
m.Set("packets_sent", &s.packetsSent)
|
||||||
m.Set("packets_received", &s.packetsRecv)
|
m.Set("packets_received", &s.packetsRecv)
|
||||||
|
@ -1,33 +0,0 @@
|
|||||||
// Copyright (c) Tailscale Inc & AUTHORS
|
|
||||||
// SPDX-License-Identifier: BSD-3-Clause
|
|
||||||
|
|
||||||
// Code generated by "stringer -type=dropReason -trimprefix=dropReason"; DO NOT EDIT.
|
|
||||||
|
|
||||||
package derp
|
|
||||||
|
|
||||||
import "strconv"
|
|
||||||
|
|
||||||
func _() {
|
|
||||||
// An "invalid array index" compiler error signifies that the constant values have changed.
|
|
||||||
// Re-run the stringer command to generate them again.
|
|
||||||
var x [1]struct{}
|
|
||||||
_ = x[dropReasonUnknownDest-0]
|
|
||||||
_ = x[dropReasonUnknownDestOnFwd-1]
|
|
||||||
_ = x[dropReasonGoneDisconnected-2]
|
|
||||||
_ = x[dropReasonQueueHead-3]
|
|
||||||
_ = x[dropReasonQueueTail-4]
|
|
||||||
_ = x[dropReasonWriteError-5]
|
|
||||||
_ = x[dropReasonDupClient-6]
|
|
||||||
_ = x[numDropReasons-7]
|
|
||||||
}
|
|
||||||
|
|
||||||
const _dropReason_name = "UnknownDestUnknownDestOnFwdGoneDisconnectedQueueHeadQueueTailWriteErrorDupClientnumDropReasons"
|
|
||||||
|
|
||||||
var _dropReason_index = [...]uint8{0, 11, 27, 43, 52, 61, 71, 80, 94}
|
|
||||||
|
|
||||||
func (i dropReason) String() string {
|
|
||||||
if i < 0 || i >= dropReason(len(_dropReason_index)-1) {
|
|
||||||
return "dropReason(" + strconv.FormatInt(int64(i), 10) + ")"
|
|
||||||
}
|
|
||||||
return _dropReason_name[_dropReason_index[i]:_dropReason_index[i+1]]
|
|
||||||
}
|
|
Loading…
x
Reference in New Issue
Block a user