Nick Khyl 2336c340c4 util/syspolicy: implement a syspolicy store that reads settings from environment variables
In this PR, we implement (but do not use yet, pending #13727 review) a syspolicy/source.Store
that reads policy settings from environment variables. It converts a CamelCase setting.Key,
such as AuthKey or ExitNodeID, to a SCREAMING_SNAKE_CASE, TS_-prefixed environment
variable name, such as TS_AUTH_KEY and TS_EXIT_NODE_ID. It then looks up the variable
and attempts to parse it according to the expected value type. If the environment variable
is not set, the policy setting is considered not configured in this store (the syspolicy package
will still read it from other sources). Similarly, if the environment variable has an invalid value
for the setting type, it won't be used (though the reported/logged error will differ).

Updates #13193
Updates #12687

Signed-off-by: Nick Khyl <nickk@tailscale.com>
2024-10-30 11:12:22 -05:00

321 lines
11 KiB
Go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
// Package metrics provides logging and reporting for policy settings and scopes.
package metrics
import (
"strings"
"sync"
xmaps "golang.org/x/exp/maps"
"tailscale.com/syncs"
"tailscale.com/types/lazy"
"tailscale.com/util/clientmetric"
"tailscale.com/util/mak"
"tailscale.com/util/slicesx"
"tailscale.com/util/syspolicy/internal"
"tailscale.com/util/syspolicy/internal/loggerx"
"tailscale.com/util/syspolicy/setting"
"tailscale.com/util/testenv"
)
// lazyReportMetrics holds the lazily computed answer returned by [ShouldReport].
// It is used as a test hook.
var lazyReportMetrics lazy.SyncValue[bool]

// ShouldReport reports whether metrics should be reported on the current environment.
func ShouldReport() bool {
	// macOS, iOS and tvOS create their own metrics,
	// and we don't have syspolicy on any other platforms.
	isReportingPlatform := func() bool {
		reporting := setting.PlatformList{"android", "windows"}
		return reporting.HasCurrent()
	}
	return lazyReportMetrics.Get(isReportingPlatform)
}
// Reset resets the scope-wide metrics of the policy scope that the specified
// origin belongs to.
func Reset(origin *setting.Origin) {
	scoped := scopeMetrics(origin)
	scoped.Reset()
}
// ReportConfigured records, in both metrics and logs, that the given setting
// is configured in origin with the specified value.
func ReportConfigured(origin *setting.Origin, setting *setting.Definition, value any) {
	perSetting := settingMetricsFor(setting)
	perSetting.ReportValue(origin, value)
}
// ReportError records, in both metrics and logs, that reading the given
// setting from origin resulted in err.
func ReportError(origin *setting.Origin, setting *setting.Definition, err error) {
	perSetting := settingMetricsFor(setting)
	perSetting.ReportError(origin, err)
}
// ReportNotConfigured records that the given setting is not configured in
// origin by resetting the setting's metrics for the origin's scope.
func ReportNotConfigured(origin *setting.Origin, setting *setting.Definition) {
	perSetting := settingMetricsFor(setting)
	perSetting.Reset(origin)
}
// metric is the small mutation surface this package needs from a metric.
// It is implemented by [clientmetric.Metric] and [funcMetric].
type metric interface {
	// Set replaces the metric's value with v.
	Set(v int64)
	// Add adds v to the metric's value.
	Add(v int64)
}
// policyScopeMetrics are metrics that apply to an entire policy scope rather
// than a specific policy setting.
//
// Its methods are safe to call on a nil *policyScopeMetrics, in which case
// they do nothing.
type policyScopeMetrics struct {
// hasAny is set to 1 when any policy setting is reported as configured in
// the scope, and back to 0 when the scope metrics are reset.
hasAny metric
// numErrored is incremented by 1 each time an errored policy setting is
// reported in the scope. It is never reset.
numErrored metric
}
// newScopeMetrics creates the scope-wide metrics (the "any" gauge and the
// "errors" counter) for the specified policy scope.
func newScopeMetrics(scope setting.Scope) *policyScopeMetrics {
	scopeName := metricScopeName(scope)

	// Gauge: {os}_syspolicy_{scope_unless_device}_any
	// Example: windows_syspolicy_any or windows_syspolicy_user_any.
	anyGauge := newMetric([]string{scopeName, "any"}, clientmetric.TypeGauge)

	// Counter: {os}_syspolicy_{scope_unless_device}_errors
	// Example: windows_syspolicy_errors or windows_syspolicy_user_errors.
	//
	// TODO(nickkhyl): consider turning `{os}_syspolicy_errors` into a gauge.
	// It has been a counter since before https://github.com/tailscale/tailscale/issues/12687,
	// and was kept that way. A gauge arguably fits better, though: a syspolicy error
	// means the value read from the underlying store (like the Windows Registry)
	// does not match the expected policy value type or format, so the very same
	// error is hit on every re-read until the user corrects the value, or until
	// the bug in the code or ADMX is fixed. Accumulating those over time is
	// probably not useful.
	//
	// Brief discussion: https://github.com/tailscale/tailscale/pull/13113#discussion_r1723475136
	errCounter := newMetric([]string{scopeName, "errors"}, clientmetric.TypeCounter)

	return &policyScopeMetrics{
		hasAny:     anyGauge,
		numErrored: errCounter,
	}
}
// ReportHasSettings marks the scope as having at least one configured policy
// setting. It does nothing when called on a nil receiver.
func (m *policyScopeMetrics) ReportHasSettings() {
	if m == nil {
		return
	}
	m.hasAny.Set(1)
}
// ReportError counts one more errored policy setting in the scope.
// It does nothing when called on a nil receiver.
func (m *policyScopeMetrics) ReportError() {
	if m == nil {
		return
	}
	m.numErrored.Add(1)
}
// Reset clears the scope-wide metrics, such as when the policy scope is about
// to be reloaded. It does nothing when called on a nil receiver.
func (m *policyScopeMetrics) Reset() {
	if m == nil {
		return
	}
	// Only the gauge is cleared: numErrored is a counter and cannot be (re-)set.
	m.hasAny.Set(0)
}
// settingMetrics are metrics for a single policy setting in one or more scopes.
//
// Its methods are safe to call on a nil *settingMetrics, in which case
// they do nothing.
type settingMetrics struct {
// definition is the policy setting these metrics belong to; its key is
// included in log messages.
definition *setting.Definition
// isSet has one metric per scope, indexed by the scope kind. An element is
// set to 1 when the setting is reported as configured in that scope and
// to 0 when it is reported as errored or reset.
isSet []metric // by scope
// hasErrors has one metric per scope, indexed by the scope kind. An element
// is set to 1 when the setting is reported as errored in that scope and
// to 0 when it is reported as configured or reset.
hasErrors []metric // by scope
}
// ReportValue records that the policy setting is configured with value v in
// the specified origin: it flags the setting as set (and not errored) in the
// origin's scope, marks that scope as having settings, and logs the value.
// It does nothing when called on a nil receiver.
func (m *settingMetrics) ReportValue(origin *setting.Origin, v any) {
	if m == nil {
		return
	}
	// Scopes that this setting has no per-scope metrics for are skipped.
	if idx := int(origin.Scope().Kind()); idx >= 0 && idx < len(m.isSet) {
		m.isSet[idx].Set(1)
		m.hasErrors[idx].Set(0)
	}
	scopeMetrics(origin).ReportHasSettings()
	loggerx.Verbosef("%v(%q) = %v", origin, m.definition.Key(), v)
}
// ReportError is called when there's an error with the policy setting in the
// specified source. It flags the setting as errored (and not set) in the
// origin's scope, reports the error to the scope-wide metrics, and logs err.
// It does nothing when called on a nil receiver.
func (m *settingMetrics) ReportError(origin *setting.Origin, err error) {
	if m == nil {
		return
	}
	// Apply the same two-sided bounds check as ReportValue and Reset: a
	// negative scope kind must never be used as a slice index, and scopes
	// without per-scope metrics are skipped. isSet and hasErrors are always
	// allocated with the same length, so checking one of them covers both.
	if scope := origin.Scope().Kind(); scope >= 0 && int(scope) < len(m.isSet) {
		m.isSet[scope].Set(0)
		m.hasErrors[scope].Set(1)
	}
	scopeMetrics(origin).ReportError()
	loggerx.Errorf("%v(%q): %v", origin, m.definition.Key(), err)
}
// Reset clears the policy setting's metrics for the origin's scope, such as
// when the policy setting does not exist or the source containing the policy
// is about to be reloaded. It does nothing when called on a nil receiver.
func (m *settingMetrics) Reset(origin *setting.Origin) {
	if m == nil {
		return
	}
	idx := int(origin.Scope().Kind())
	if idx < 0 || idx >= len(m.isSet) {
		// No per-scope metrics exist for this scope.
		return
	}
	m.isSet[idx].Set(0)
	m.hasErrors[idx].Set(0)
}
// metricFn is a function that adds or sets a metric value.
// It receives the metric's name and type along with the value v to add or set.
type metricFn func(name string, typ clientmetric.Type, v int64)
// funcMetric implements [metric] by calling the specified add and set functions.
// Used for testing, and with nil functions on platforms that do not support
// syspolicy, and on platforms that report policy metrics from the GUI.
type funcMetric struct {
// name and typ are passed to add and set on every call.
name string
typ clientmetric.Type
// add and set back the Add and Set methods respectively. A nil function
// makes the corresponding method a no-op.
add, set metricFn
}
// Add forwards v, together with the metric's name and type, to the add
// function. It is a no-op when no add function is set.
func (m funcMetric) Add(v int64) {
	if m.add == nil {
		return
	}
	m.add(m.name, m.typ, v)
}
// Set forwards v, together with the metric's name and type, to the set
// function. It is a no-op when no set function is set.
func (m funcMetric) Set(v int64) {
	if m.set == nil {
		return
	}
	m.set(m.name, m.typ, v)
}
// Lazily created scope-wide metrics, one value per policy scope kind.
var (
	lazyDeviceMetrics  lazy.SyncValue[*policyScopeMetrics]
	lazyProfileMetrics lazy.SyncValue[*policyScopeMetrics]
	lazyUserMetrics    lazy.SyncValue[*policyScopeMetrics]
)

// scopeMetrics returns the scope-wide metrics for the scope kind of the
// specified origin, creating them on first use.
// It panics if the scope kind is not one of the device, profile or user kinds.
func scopeMetrics(origin *setting.Origin) *policyScopeMetrics {
	kind := origin.Scope().Kind()
	create := func() *policyScopeMetrics {
		return newScopeMetrics(kind)
	}
	switch kind {
	case setting.DeviceSetting:
		return lazyDeviceMetrics.Get(create)
	case setting.ProfileSetting:
		return lazyProfileMetrics.Get(create)
	case setting.UserSetting:
		return lazyUserMetrics.Get(create)
	}
	panic("unreachable")
}
var (
	// settingMetricsMu guards settingMetricsMap.
	settingMetricsMu sync.RWMutex
	// settingMetricsMap holds the metrics created so far, by setting key.
	settingMetricsMap map[setting.Key]*settingMetrics
)

// settingMetricsFor returns the metrics for the specified policy setting.
// The common case, where the metrics already exist, only takes the read lock;
// otherwise the work is handed over to settingMetricsForSlow.
func settingMetricsFor(setting *setting.Definition) *settingMetrics {
	settingMetricsMu.RLock()
	cached, found := settingMetricsMap[setting.Key()]
	settingMetricsMu.RUnlock()
	if !found {
		return settingMetricsForSlow(setting)
	}
	return cached
}
// settingMetricsForSlow is the slow path of settingMetricsFor. It takes the
// write lock, re-checks the map under that lock, and, if the metrics for d
// still do not exist, creates and stores them.
func settingMetricsForSlow(d *setting.Definition) *settingMetrics {
	settingMetricsMu.Lock()
	defer settingMetricsMu.Unlock()

	key := d.Key()
	if existing, found := settingMetricsMap[key]; found {
		return existing
	}

	// One pair of metrics is created for every scope in which the policy setting
	// defined in 'd' can be configured. [setting.Definition.Scope] returns the
	// narrowest scope at which the setting may be configured, and more specific
	// scopes always have higher numeric values, that is,
	// [setting.UserSetting] > [setting.ProfileSetting] > [setting.DeviceSetting].
	// A setting can never be configured in a scope with a higher numeric value
	// than [setting.Definition.Scope] returns, so there are at most d.Scope()+1
	// scopes to cover. Slices of that length are therefore always sufficient for
	// [settingMetrics]: it never needs to index past their end, nor to grow them
	// later.
	numScopes := d.Scope() + 1
	setGauges := make([]metric, numScopes)
	errGauges := make([]metric, numScopes)
	for i := 0; i < len(setGauges); i++ {
		scope := setting.Scope(i)
		// {os}_syspolicy_{key}_{scope_unless_device}
		// Example: windows_syspolicy_AdminConsole or windows_syspolicy_AdminConsole_user.
		setGauges[i] = newSettingMetric(key, scope, "", clientmetric.TypeGauge)
		// {os}_syspolicy_{key}_{scope_unless_device}_error
		// Example: windows_syspolicy_AdminConsole_error or windows_syspolicy_TestSetting01_user_error.
		errGauges[i] = newSettingMetric(key, scope, "error", clientmetric.TypeGauge)
	}

	created := &settingMetrics{
		definition: d,
		isSet:      setGauges,
		hasErrors:  errGauges,
	}
	mak.Set(&settingMetricsMap, key, created)
	return created
}
// Hooks for testing: the functions that metrics created in tests call to add
// to or set a metric value.
var (
	addMetricTestHook syncs.AtomicValue[metricFn]
	setMetricTestHook syncs.AtomicValue[metricFn]
)

// SetHooksForTest sets the specified addMetric and setMetric functions
// as the metric functions for the duration of tb and all its subtests.
// The previous hooks, per-setting metrics and scope metrics are put back
// by cleanups registered on tb.
func SetHooksForTest(tb internal.TB, addMetric, setMetric metricFn) {
	// Swap the hooks in first: newMetric reads them when it creates a metric
	// in a test, so the scope metrics created below need to see the new ones.
	prevAdd := addMetricTestHook.Swap(addMetric)
	prevSet := setMetricTestHook.Swap(setMetric)
	restoreHooks := func() {
		addMetricTestHook.Store(prevAdd)
		setMetricTestHook.Store(prevSet)
	}
	tb.Cleanup(restoreHooks)

	// Set aside a copy of the per-setting metrics created so far and empty
	// the map, so that metrics are created anew while the hooks are in place.
	settingMetricsMu.Lock()
	savedSettingMetrics := xmaps.Clone(settingMetricsMap)
	clear(settingMetricsMap)
	settingMetricsMu.Unlock()
	restoreSettingMetrics := func() {
		settingMetricsMu.Lock()
		defer settingMetricsMu.Unlock()
		settingMetricsMap = savedSettingMetrics
	}
	tb.Cleanup(restoreSettingMetrics)

	// (Re-)set the scope metrics to use the test hooks for the duration of tb.
	lazyDeviceMetrics.SetForTest(tb, newScopeMetrics(setting.DeviceSetting), nil)
	lazyProfileMetrics.SetForTest(tb, newScopeMetrics(setting.ProfileSetting), nil)
	lazyUserMetrics.SetForTest(tb, newScopeMetrics(setting.UserSetting), nil)
}
// newSettingMetric creates a metric of the given type for the policy setting
// with the specified key in the specified scope. Key path separators in key
// are replaced with underscores; the metric name is then built by newMetric
// from the resulting key, the scope's metric name and suffix.
func newSettingMetric(key setting.Key, scope setting.Scope, suffix string, typ clientmetric.Type) metric {
	separator := string(setting.KeyPathSeparator)
	flatKey := strings.ReplaceAll(string(key), separator, "_")
	parts := []string{flatKey, metricScopeName(scope), suffix}
	return newMetric(parts, typ)
}
// newMetric creates a metric of the given type. Its name is the "_"-joined
// result of filtering nameParts with isNonEmpty onto the {os}, "syspolicy"
// prefix (presumably appending the non-empty parts after the prefix; see
// slicesx.Filter for the exact semantics).
//
// The kind of metric returned depends on the environment:
//   - where [ShouldReport] is false, a funcMetric with nil functions (a no-op);
//   - otherwise, in tests, a funcMetric bound to the current test hooks;
//   - otherwise, a clientmetric counter or gauge.
//
// It panics if typ is neither a counter nor a gauge in the last case.
func newMetric(nameParts []string, typ clientmetric.Type) metric {
	prefix := []string{internal.OS(), "syspolicy"}
	name := strings.Join(slicesx.Filter(prefix, nameParts, isNonEmpty), "_")

	if !ShouldReport() {
		return &funcMetric{name: name, typ: typ}
	}
	if testenv.InTest() {
		return &funcMetric{
			name: name,
			typ:  typ,
			add:  addMetricTestHook.Load(),
			set:  setMetricTestHook.Load(),
		}
	}
	switch typ {
	case clientmetric.TypeCounter:
		return clientmetric.NewCounter(name)
	case clientmetric.TypeGauge:
		return clientmetric.NewGauge(name)
	}
	panic("unreachable")
}
// isNonEmpty reports whether s contains at least one byte.
func isNonEmpty(s string) bool {
	return len(s) != 0
}
// metricScopeName returns the metric name component for the specified scope:
// an empty string for the device scope, "profile" for the profile scope and
// "user" for the user scope. It panics on any other scope.
func metricScopeName(scope setting.Scope) string {
	if scope == setting.DeviceSetting {
		return ""
	}
	if scope == setting.ProfileSetting {
		return "profile"
	}
	if scope == setting.UserSetting {
		return "user"
	}
	panic("unreachable")
}