tailscale/prober/prober.go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause

// Package prober implements a simple blackbox prober. Each probe runs
// in its own goroutine, and run results are recorded as Prometheus
// metrics.
package prober

import (
	"context"
	"errors"
	"fmt"
	"hash/fnv"
	"log"
	"maps"
	"math/rand"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// ProbeClass defines a probe of a specific type: a probing function that will
// be regularly ran, and metric labels that will be added automatically to all
// probes using this class.
type ProbeClass struct {
	// Probe is a function that probes something and reports whether the Probe
	// succeeded. The provided context's deadline must be obeyed for correct
	// Probe scheduling.
	Probe func(context.Context) error

	// Class defines a user-facing name of the probe class that will be used
	// in the `class` metric label.
	Class string

	// Labels defines a set of metric labels that will be added to all metrics
	// exposed by this probe class.
	Labels Labels

	// Metrics allows a probe class to export custom Metrics. Can be nil.
	Metrics func(prometheus.Labels) []prometheus.Metric
}

// FuncProbe wraps a simple probe function in a ProbeClass.
func FuncProbe(fn func(context.Context) error) ProbeClass {
	return ProbeClass{
		Probe: fn,
	}
}

// a Prober manages a set of probes and keeps track of their results.
type Prober struct {
	// Whether to spread probe execution over time by introducing a
	// random delay before the first probe run.
	spread bool

	// Whether to run all probes once instead of running them in a loop.
	once bool

	// Time-related functions that get faked out during tests.
	now       func() time.Time
	newTicker func(time.Duration) ticker

	mu     sync.Mutex // protects all following fields
	probes map[string]*Probe

	namespace string
	metrics   *prometheus.Registry
}

// New returns a new Prober.
func New() *Prober {
	return newForTest(time.Now, newRealTicker)
}

func newForTest(now func() time.Time, newTicker func(time.Duration) ticker) *Prober {
	p := &Prober{
		now:       now,
		newTicker: newTicker,
		probes:    map[string]*Probe{},
		metrics:   prometheus.NewRegistry(),
		namespace: "prober",
	}
	prometheus.DefaultRegisterer.MustRegister(p.metrics)
	return p
}

// Run executes probe class function every interval, and exports probe results under probeName.
//
// Registering a probe under an already-registered name panics.
func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc ProbeClass) *Probe {
	p.mu.Lock()
	defer p.mu.Unlock()
	if _, ok := p.probes[name]; ok {
		panic(fmt.Sprintf("probe named %q already registered", name))
	}

	l := prometheus.Labels{
		"name":  name,
		"class": pc.Class,
	}
	for k, v := range pc.Labels {
		l[k] = v
	}
	for k, v := range labels {
		l[k] = v
	}

	ctx, cancel := context.WithCancel(context.Background())
	probe := &Probe{
		prober:  p,
		ctx:     ctx,
		cancel:  cancel,
		stopped: make(chan struct{}),

		name:         name,
		probeClass:   pc,
		interval:     interval,
		initialDelay: initialDelay(name, interval),
		metrics:      prometheus.NewRegistry(),
		metricLabels: l,
		mInterval:    prometheus.NewDesc("interval_secs", "Probe interval in seconds", nil, l),
		mStartTime:   prometheus.NewDesc("start_secs", "Latest probe start time (seconds since epoch)", nil, l),
		mEndTime:     prometheus.NewDesc("end_secs", "Latest probe end time (seconds since epoch)", nil, l),
		mLatency:     prometheus.NewDesc("latency_millis", "Latest probe latency (ms)", nil, l),
		mResult:      prometheus.NewDesc("result", "Latest probe result (1 = success, 0 = failure)", nil, l),
		mAttempts: prometheus.NewCounterVec(prometheus.CounterOpts{
			Name: "attempts_total", Help: "Total number of probing attempts", ConstLabels: l,
		}, []string{"status"}),
		mSeconds: prometheus.NewCounterVec(prometheus.CounterOpts{
			Name: "seconds_total", Help: "Total amount of time spent executing the probe", ConstLabels: l,
		}, []string{"status"}),
	}

	prometheus.WrapRegistererWithPrefix(p.namespace+"_", p.metrics).MustRegister(probe.metrics)
	probe.metrics.MustRegister(probe)

	p.probes[name] = probe
	go probe.loop()
	return probe
}

func (p *Prober) unregister(probe *Probe) {
	p.mu.Lock()
	defer p.mu.Unlock()
	probe.metrics.Unregister(probe)
	p.metrics.Unregister(probe.metrics)
	name := probe.name
	delete(p.probes, name)
}

// WithSpread is used to enable random delay before the first run of
// each added probe.
func (p *Prober) WithSpread(s bool) *Prober {
	p.spread = s
	return p
}

// WithOnce mode can be used if you want to run all configured probes once
// rather than on a schedule.
func (p *Prober) WithOnce(s bool) *Prober {
	p.once = s
	return p
}

// WithMetricNamespace allows changing metric name prefix from the default `prober`.
func (p *Prober) WithMetricNamespace(n string) *Prober {
	p.namespace = n
	return p
}

// Wait blocks until all probes have finished execution. It should typically
// be used with the `once` mode to wait for probes to finish before collecting
// their results.
func (p *Prober) Wait() {
	for {
		chans := make([]chan struct{}, 0)
		p.mu.Lock()
		for _, p := range p.probes {
			chans = append(chans, p.stopped)
		}
		p.mu.Unlock()
		for _, c := range chans {
			<-c
		}

		// Since probes can add other probes, retry if the number of probes has changed.
		if p.activeProbes() != len(chans) {
			continue
		}
		return
	}
}

// Reports the number of registered probes.
func (p *Prober) activeProbes() int {
	p.mu.Lock()
	defer p.mu.Unlock()
	return len(p.probes)
}

// Probe is a probe that healthchecks something and updates Prometheus
// metrics with the results.
type Probe struct {
	prober  *Prober
	ctx     context.Context
	cancel  context.CancelFunc // run to initiate shutdown
	stopped chan struct{}      // closed when shutdown is complete

	name         string
	probeClass   ProbeClass
	interval     time.Duration
	initialDelay time.Duration
	tick         ticker

	// metrics is a Prometheus metrics registry for metrics exported by this probe.
	// Using a separate registry allows cleanly removing metrics exported by this
	// probe when it gets unregistered.
	metrics      *prometheus.Registry
	metricLabels prometheus.Labels
	mInterval    *prometheus.Desc
	mStartTime   *prometheus.Desc
	mEndTime     *prometheus.Desc
	mLatency     *prometheus.Desc
	mResult      *prometheus.Desc
	mAttempts    *prometheus.CounterVec
	mSeconds     *prometheus.CounterVec

	mu        sync.Mutex
	start     time.Time     // last time doProbe started
	end       time.Time     // last time doProbe returned
	latency   time.Duration // last successful probe latency
	succeeded bool          // whether the last doProbe call succeeded
	lastErr   error
}

// Close shuts down the Probe and unregisters it from its Prober.
// It is safe to Run a new probe of the same name after Close returns.
func (p *Probe) Close() error {
	p.cancel()
	<-p.stopped
	p.prober.unregister(p)
	return nil
}

// probeLoop invokes runProbe on fun every interval. The first probe
// is run after a random delay (if spreading is enabled) or immediately.
func (p *Probe) loop() {
	defer close(p.stopped)

	if p.prober.spread && p.initialDelay > 0 {
		t := p.prober.newTicker(p.initialDelay)
		select {
		case <-t.Chan():
			p.run()
		case <-p.ctx.Done():
			t.Stop()
			return
		}
		t.Stop()
	} else {
		p.run()
	}

	if p.prober.once {
		return
	}

	p.tick = p.prober.newTicker(p.interval)
	defer p.tick.Stop()
	for {
		select {
		case <-p.tick.Chan():
			p.run()
		case <-p.ctx.Done():
			return
		}
	}
}

// run invokes fun and records the results.
//
// fun is invoked with a timeout slightly less than interval, so that
// the probe either succeeds or fails before the next cycle is
// scheduled to start.
func (p *Probe) run() {
	start := p.recordStart()
	defer func() {
		// Prevent a panic within one probe function from killing the
		// entire prober, so that a single buggy probe doesn't destroy
		// our entire ability to monitor anything. A panic is recorded
		// as a probe failure, so panicking probes will trigger an
		// alert for debugging.
		if r := recover(); r != nil {
			log.Printf("probe %s panicked: %v", p.name, r)
			p.recordEnd(start, errors.New("panic"))
		}
	}()
	timeout := time.Duration(float64(p.interval) * 0.8)
	ctx, cancel := context.WithTimeout(p.ctx, timeout)
	defer cancel()

	err := p.probeClass.Probe(ctx)
	p.recordEnd(start, err)
	if err != nil {
		log.Printf("probe %s: %v", p.name, err)
	}
}

func (p *Probe) recordStart() time.Time {
	st := p.prober.now()
	p.mu.Lock()
	defer p.mu.Unlock()
	p.start = st
	return st
}

func (p *Probe) recordEnd(start time.Time, err error) {
	end := p.prober.now()
	p.mu.Lock()
	defer p.mu.Unlock()
	p.end = end
	p.succeeded = err == nil
	p.lastErr = err
	latency := end.Sub(p.start)
	if p.succeeded {
		p.latency = latency
		p.mAttempts.WithLabelValues("ok").Inc()
		p.mSeconds.WithLabelValues("ok").Add(latency.Seconds())
	} else {
		p.latency = 0
		p.mAttempts.WithLabelValues("fail").Inc()
		p.mSeconds.WithLabelValues("fail").Add(latency.Seconds())
	}
}

// ProbeInfo is the state of a Probe.
type ProbeInfo struct {
	Start   time.Time
	End     time.Time
	Latency string
	Result  bool
	Error   string
}

func (p *Prober) ProbeInfo() map[string]ProbeInfo {
	out := map[string]ProbeInfo{}

	p.mu.Lock()
	probes := make([]*Probe, 0, len(p.probes))
	for _, probe := range p.probes {
		probes = append(probes, probe)
	}
	p.mu.Unlock()

	for _, probe := range probes {
		probe.mu.Lock()
		inf := ProbeInfo{
			Start:  probe.start,
			End:    probe.end,
			Result: probe.succeeded,
		}
		if probe.lastErr != nil {
			inf.Error = probe.lastErr.Error()
		}
		if probe.latency > 0 {
			inf.Latency = probe.latency.String()
		}
		out[probe.name] = inf
		probe.mu.Unlock()
	}
	return out
}

// Describe implements prometheus.Collector.
func (p *Probe) Describe(ch chan<- *prometheus.Desc) {
	ch <- p.mInterval
	ch <- p.mStartTime
	ch <- p.mEndTime
	ch <- p.mResult
	ch <- p.mLatency
	p.mAttempts.Describe(ch)
	p.mSeconds.Describe(ch)
	if p.probeClass.Metrics != nil {
		for _, m := range p.probeClass.Metrics(p.metricLabels) {
			ch <- m.Desc()
		}
	}
}

// Collect implements prometheus.Collector.
func (p *Probe) Collect(ch chan<- prometheus.Metric) {
	p.mu.Lock()
	defer p.mu.Unlock()
	ch <- prometheus.MustNewConstMetric(p.mInterval, prometheus.GaugeValue, p.interval.Seconds())
	if !p.start.IsZero() {
		ch <- prometheus.MustNewConstMetric(p.mStartTime, prometheus.GaugeValue, float64(p.start.Unix()))
	}
	if p.end.IsZero() {
		return
	}
	ch <- prometheus.MustNewConstMetric(p.mEndTime, prometheus.GaugeValue, float64(p.end.Unix()))
	if p.succeeded {
		ch <- prometheus.MustNewConstMetric(p.mResult, prometheus.GaugeValue, 1)
	} else {
		ch <- prometheus.MustNewConstMetric(p.mResult, prometheus.GaugeValue, 0)
	}
	if p.latency > 0 {
		ch <- prometheus.MustNewConstMetric(p.mLatency, prometheus.GaugeValue, float64(p.latency.Milliseconds()))
	}
	p.mAttempts.Collect(ch)
	p.mSeconds.Collect(ch)
	if p.probeClass.Metrics != nil {
		for _, m := range p.probeClass.Metrics(p.metricLabels) {
			ch <- m
		}
	}
}

// ticker wraps a time.Ticker in a way that can be faked for tests.
type ticker interface {
	Chan() <-chan time.Time
	Stop()
}

type realTicker struct {
	*time.Ticker
}

func (t *realTicker) Chan() <-chan time.Time {
	return t.Ticker.C
}

func newRealTicker(d time.Duration) ticker {
	return &realTicker{time.NewTicker(d)}
}

// initialDelay returns a pseudorandom duration in [0, interval) that
// is based on the provided seed string.
func initialDelay(seed string, interval time.Duration) time.Duration {
	h := fnv.New64()
	fmt.Fprint(h, seed)
	r := rand.New(rand.NewSource(int64(h.Sum64()))).Float64()
	return time.Duration(float64(interval) * r)
}

// Labels is a set of metric labels used by a prober.
type Labels map[string]string

func (l Labels) With(k, v string) Labels {
	new := maps.Clone(l)
	new[k] = v
	return new
}