From a15bc415b9db9a1897525cca3e82e2be66e76ea5 Mon Sep 17 00:00:00 2001 From: Jonathan Nobels Date: Mon, 28 Apr 2025 15:21:34 -0400 Subject: [PATCH] health, ipn/ipnlocal: add metrics for various client events updates tailscale/corp#28092 Adds metrics for various client events: * Enabling an exit node * Enabling a mullvad exit node * Enabling a preferred exit node * Setting WantRunning to true/false * Requesting a bug report ID * Profile counts * Profile deletions * Captive portal detection Signed-off-by: Jonathan Nobels --- health/health.go | 12 ++++ ipn/ipnlocal/local.go | 22 ++++++++ ipn/ipnlocal/node_backend.go | 12 ++++ ipn/ipnlocal/prefs_metrics.go | 100 ++++++++++++++++++++++++++++++++++ ipn/ipnlocal/profiles.go | 6 +- ipn/localapi/localapi.go | 17 +++--- 6 files changed, 160 insertions(+), 9 deletions(-) create mode 100644 ipn/ipnlocal/prefs_metrics.go diff --git a/health/health.go b/health/health.go index 65d4402ae..1ec2bcc9b 100644 --- a/health/health.go +++ b/health/health.go @@ -362,6 +362,18 @@ func (t *Tracker) SetMetricsRegistry(reg *usermetric.Registry) { })) } +// IsUnhealthy reports whether the current state is unhealthy because the given +// warnable is set. +func (t *Tracker) IsUnhealthy(w *Warnable) bool { + if t.nil() { + return false + } + t.mu.Lock() + defer t.mu.Unlock() + _, exists := t.warnableVal[w] + return exists +} + // SetUnhealthy sets a warningState for the given Warnable with the provided Args, and should be // called when a Warnable becomes unhealthy, or its unhealthy status needs to be updated. // SetUnhealthy takes ownership of args. The args can be nil if no additional information is diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index b2998d11c..b8771aef5 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -167,6 +167,8 @@ type watchSession struct { cancel context.CancelFunc // to shut down the session } +var metricCaptivePortalDetected = clientmetric.NewCounter("captiveportal_detected") + // LocalBackend is the glue between the major pieces of the Tailscale // network software: the cloud control plane (via controlclient), the // network data plane (via wgengine), and the user-facing UIs and CLIs @@ -2776,6 +2778,9 @@ func (b *LocalBackend) performCaptiveDetection() { b.mu.Unlock() found := d.Detect(ctx, netMon, dm, preferredDERP) if found { + if !b.health.IsUnhealthy(captivePortalWarnable) { + metricCaptivePortalDetected.Add(1) + } b.health.SetUnhealthy(captivePortalWarnable, health.Args{}) } else { b.health.SetHealthy(captivePortalWarnable) @@ -4402,9 +4407,11 @@ func (b *LocalBackend) editPrefsLockedOnEntry(mp *ipn.MaskedPrefs, unlock unlock b.egg = true b.goTracker.Go(b.doSetHostinfoFilterServices) } + p0 := b.pm.CurrentPrefs() p1 := b.pm.CurrentPrefs().AsStruct() p1.ApplyEdits(mp) + if err := b.checkPrefsLocked(p1); err != nil { b.logf("EditPrefs check error: %v", err) return ipn.PrefsView{}, err @@ -4416,9 +4423,23 @@ func (b *LocalBackend) editPrefsLockedOnEntry(mp *ipn.MaskedPrefs, unlock unlock if p1.View().Equals(p0) { return stripKeysFromPrefs(p0), nil } + b.logf("EditPrefs: %v", mp.Pretty()) newPrefs := b.setPrefsLockedOnEntry(p1, unlock) + // This is recorded here in the EditPrefs path, not the setPrefs path on purpose. + // recordForEdit records metrics related to edits and changes, not the final state. + // If, in the future, we want to record gauge-metrics related to the state of prefs, + // that should be done in the setPrefs path. + e := prefsMetricsEditEvent{ + change: mp, + pNew: p1.View(), + pOld: p0, + node: b.currentNode(), + lastSuggestedExitNode: b.lastSuggestedExitNode, + } + e.record() + // Note: don't perform any actions for the new prefs here. Not // every prefs change goes through EditPrefs. Put your actions // in setPrefsLocksOnEntry instead. @@ -4490,6 +4511,7 @@ func (b *LocalBackend) setPrefsLockedOnEntry(newp *ipn.Prefs, unlock unlockOnce) applySysPolicy(newp, b.lastSuggestedExitNode, b.overrideAlwaysOn) // setExitNodeID does likewise. No-op if no exit node resolution is needed. setExitNodeID(newp, netMap) + // We do this to avoid holding the lock while doing everything else. oldHi := b.hostinfo diff --git a/ipn/ipnlocal/node_backend.go b/ipn/ipnlocal/node_backend.go index 415c32ccf..88a723365 100644 --- a/ipn/ipnlocal/node_backend.go +++ b/ipn/ipnlocal/node_backend.go @@ -172,6 +172,18 @@ func (nb *nodeBackend) PeerByID(id tailcfg.NodeID) (_ tailcfg.NodeView, ok bool) return n, ok } +func (nb *nodeBackend) PeerByStableID(id tailcfg.StableNodeID) (_ tailcfg.NodeView, ok bool) { + nb.mu.Lock() + peers := nb.peers + nb.mu.Unlock() + for _, n := range peers { + if n.StableID() == id { + return n, true + } + } + return tailcfg.NodeView{}, false +} + func (nb *nodeBackend) UserByID(id tailcfg.UserID) (_ tailcfg.UserProfileView, ok bool) { nb.mu.Lock() nm := nb.netMap diff --git a/ipn/ipnlocal/prefs_metrics.go b/ipn/ipnlocal/prefs_metrics.go new file mode 100644 index 000000000..0c83e2a26 --- /dev/null +++ b/ipn/ipnlocal/prefs_metrics.go @@ -0,0 +1,100 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +package ipnlocal + +import ( + "errors" + + "tailscale.com/ipn" + "tailscale.com/tailcfg" + "tailscale.com/util/clientmetric" +) + +// Counter metrics for edit/change events +var ( + // metricExitNodeEnabled is incremented when the user enables an exit node independent of the node's characteristics. + metricExitNodeEnabled = clientmetric.NewCounter("prefs_exit_node_enabled") + // metricExitNodeEnabledSuggested is incremented when the user enables the suggested exit node. + metricExitNodeEnabledSuggested = clientmetric.NewCounter("prefs_exit_node_enabled_suggested") + // metricExitNodeEnabledMullvad is incremented when the user enables a Mullvad exit node. + metricExitNodeEnabledMullvad = clientmetric.NewCounter("prefs_exit_node_enabled_mullvad") + // metricWantRunningEnabled is incremented when WantRunning transitions from false to true. + metricWantRunningEnabled = clientmetric.NewCounter("prefs_want_running_enabled") + // metricWantRunningDisabled is incremented when WantRunning transitions from true to false. + metricWantRunningDisabled = clientmetric.NewCounter("prefs_want_running_disabled") +) + +type exitNodeProperty string + +const ( + exitNodeTypePreferred exitNodeProperty = "suggested" // The exit node is the last suggested exit node + exitNodeTypeMullvad exitNodeProperty = "mullvad" // The exit node is a Mullvad exit node +) + +// prefsMetricsEditEvent encapsulates information needed to record metrics related +// to any changes to preferences. +type prefsMetricsEditEvent struct { + change *ipn.MaskedPrefs // the preference mask used to update the preferences + pNew ipn.PrefsView // new preferences (after ApplyUpdates) + pOld ipn.PrefsView // old preferences (before ApplyUpdates) + node *nodeBackend // the node the event is associated with + lastSuggestedExitNode tailcfg.StableNodeID // the last suggested exit node +} + +// record records changes to preferences as clientmetrics. +func (e *prefsMetricsEditEvent) record() error { + if e.change == nil || e.node == nil { + return errors.New("prefsMetricsEditEvent: missing required fields") + } + + // Record up/down events. + if e.change.WantRunningSet && (e.pNew.WantRunning() != e.pOld.WantRunning()) { + if e.pNew.WantRunning() { + metricWantRunningEnabled.Add(1) + } else { + metricWantRunningDisabled.Add(1) + } + } + + // Record any changes to exit node settings. + if e.change.ExitNodeIDSet || e.change.ExitNodeIPSet { + if exitNodeTypes, ok := e.exitNodeTypeLocked(e.pNew.ExitNodeID()); ok { + // We have switched to a valid exit node if ok is true. + metricExitNodeEnabled.Add(1) + + // We may have some additional characteristics we should also record. + for _, t := range exitNodeTypes { + switch t { + case exitNodeTypePreferred: + metricExitNodeEnabledSuggested.Add(1) + case exitNodeTypeMullvad: + metricExitNodeEnabledMullvad.Add(1) + } + } + } + } + return nil +} + +// exitNodeTypesLocked returns type of exit node for the given stable ID. +// An exit node may have multiple type (can be both mullvad and preferred +// simultaneously for example). +// +// This will return ok as true if the supplied stable ID resolves to a known peer, +// false otherwise. The caller is responsible for ensuring that the id belongs to +// an exit node. +func (e *prefsMetricsEditEvent) exitNodeTypeLocked(id tailcfg.StableNodeID) (_ []exitNodeProperty, isNode bool) { + var peer tailcfg.NodeView + var t []exitNodeProperty + + if peer, isNode = e.node.PeerByStableID(id); isNode { + if tailcfg.StableNodeID(id) == e.lastSuggestedExitNode { + t = append(t, exitNodeTypePreferred) + } + if peer.IsWireGuardOnly() { + t = append(t, exitNodeTypeMullvad) + } + } + return t, isNode +} diff --git a/ipn/ipnlocal/profiles.go b/ipn/ipnlocal/profiles.go index 5c1b17038..1d312cfa6 100644 --- a/ipn/ipnlocal/profiles.go +++ b/ipn/ipnlocal/profiles.go @@ -705,7 +705,6 @@ var errProfileAccessDenied = errors.New("profile access denied") // This is useful for deleting the last profile. In other cases, it is // recommended to call [profileManager.SwitchProfile] first. func (pm *profileManager) DeleteProfile(id ipn.ProfileID) error { - metricDeleteProfile.Add(1) if id == pm.currentProfile.ID() { return pm.deleteCurrentProfile() } @@ -741,6 +740,7 @@ func (pm *profileManager) deleteProfileNoPermCheck(profile ipn.LoginProfileView) return err } delete(pm.knownProfiles, profile.ID()) + metricDeleteProfile.Add(1) return pm.writeKnownProfiles() } @@ -781,6 +781,7 @@ func (pm *profileManager) writeKnownProfiles() error { if err != nil { return err } + metricProfileCount.Set(int64(len(pm.knownProfiles))) return pm.WriteState(ipn.KnownProfilesStateKey, b) } @@ -893,6 +894,8 @@ func newProfileManagerWithGOOS(store ipn.StateStore, logf logger.Logf, ht *healt return nil, err } + metricProfileCount.Set(int64(len(knownProfiles))) + pm := &profileManager{ goos: goos, store: store, @@ -961,6 +964,7 @@ var ( metricSwitchProfile = clientmetric.NewCounter("profiles_switch") metricDeleteProfile = clientmetric.NewCounter("profiles_delete") metricDeleteAllProfile = clientmetric.NewCounter("profiles_delete_all") + metricProfileCount = clientmetric.NewGauge("profiles_count") metricMigration = clientmetric.NewCounter("profiles_migration") metricMigrationError = clientmetric.NewCounter("profiles_migration_error") diff --git a/ipn/localapi/localapi.go b/ipn/localapi/localapi.go index 94f51d4f2..9c6c0a528 100644 --- a/ipn/localapi/localapi.go +++ b/ipn/localapi/localapi.go @@ -62,6 +62,13 @@ import ( "tailscale.com/wgengine/magicsock" ) +var ( + metricInvalidRequests = clientmetric.NewCounter("localapi_invalid_requests") + metricDebugMetricsCalls = clientmetric.NewCounter("localapi_debugmetric_requests") + metricUserMetricsCalls = clientmetric.NewCounter("localapi_usermetric_requests") + metricBugReportRequests = clientmetric.NewCounter("localapi_bugreport_requests") +) + type LocalAPIHandler func(*Handler, http.ResponseWriter, *http.Request) // handler is the set of LocalAPI handlers, keyed by the part of the @@ -424,6 +431,8 @@ func (h *Handler) serveBugReport(w http.ResponseWriter, r *http.Request) { // NOTE(andrew): if we have anything else we want to do while recording // a bugreport, we can add it here. + metricBugReportRequests.Add(1) + // Read from the client; this will also return when the client closes // the connection. var buf [1]byte @@ -2623,14 +2632,6 @@ func (h *Handler) serveShares(w http.ResponseWriter, r *http.Request) { } } -var ( - metricInvalidRequests = clientmetric.NewCounter("localapi_invalid_requests") - - // User-visible LocalAPI endpoints. - metricDebugMetricsCalls = clientmetric.NewCounter("localapi_debugmetric_requests") - metricUserMetricsCalls = clientmetric.NewCounter("localapi_usermetric_requests") -) - // serveSuggestExitNode serves a POST endpoint for returning a suggested exit node. func (h *Handler) serveSuggestExitNode(w http.ResponseWriter, r *http.Request) { if r.Method != "GET" {