tailscale/prober/prober.go

Commit 19f61607b6 by David Anderson: "prober: run all probes once on initial registration."

Turns out, it's annoying to have to wait the entire interval
before getting any monitorable data, especially for very long
interval probes like hourly/daily checks.

Signed-off-by: David Anderson <danderson@tailscale.com>
2022-03-19 18:54:33 -07:00

// Copyright (c) 2022 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package prober implements a simple blackbox prober. Each probe runs
// in its own goroutine, and run results are recorded as Prometheus
// metrics.
package prober

import (
"context"
"errors"
"fmt"
"log"
"sync"
"time"
"tailscale.com/metrics"
)

// Probe is a function that probes something and reports whether the
// probe succeeded. The provided context must be used to ensure timely
// cancellation and timeout behavior.
type Probe func(context.Context) error
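
// As an illustration only (a hedged sketch, not part of the original file),
// exampleProbe shows the shape of a Probe: it simulates a check that takes a
// little while and honors ctx so the prober can cancel it or time it out.
// The 100ms of "work" is an arbitrary stand-in for a real check.
func exampleProbe(ctx context.Context) error {
	select {
	case <-time.After(100 * time.Millisecond):
		// The pretend check finished in time.
		return nil
	case <-ctx.Done():
		// The prober's deadline or cancellation fired first; give up promptly.
		return ctx.Err()
	}
}
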
// Prober manages a set of probes and keeps track of their results.
type Prober struct {
// Time-related functions that get faked out during tests.
now func() time.Time
newTicker func(time.Duration) ticker
// lastStart is the time, in seconds since epoch, of the last time
// each probe started a probe cycle.
lastStart metrics.LabelMap
// lastEnd is the time, in seconds since epoch, of the last time
// each probe finished a probe cycle.
lastEnd metrics.LabelMap
// lastResult records whether probes succeeded. A successful probe
// is recorded as 1, a failure as 0.
lastResult metrics.LabelMap
// lastLatency records how long the last probe cycle took for each
// probe, in milliseconds.
lastLatency metrics.LabelMap
// probeInterval records the time in seconds between successive
// runs of each probe.
//
// This is to help Prometheus figure out how long a probe should
// be failing before it fires an alert for it. To avoid random
// background noise, you want it to wait for more than 1
// datapoint, but you also can't use a fixed interval because some
// probes might run every few seconds, while e.g. TLS certificate
// expiry might only run once a day.
//
// So, for each probe, the prober tells Prometheus how often it
// runs, so that the alert can autotune itself to eliminate noise
// without being excessively delayed.
probeInterval metrics.LabelMap
mu sync.Mutex // protects all following fields
activeProbeCh map[string]chan struct{}
}

// New returns a new Prober.
func New() *Prober {
return newForTest(time.Now, newRealTicker)
}

func newForTest(now func() time.Time, newTicker func(time.Duration) ticker) *Prober {
return &Prober{
now: now,
newTicker: newTicker,
lastStart: metrics.LabelMap{Label: "probe"},
lastEnd: metrics.LabelMap{Label: "probe"},
lastResult: metrics.LabelMap{Label: "probe"},
lastLatency: metrics.LabelMap{Label: "probe"},
probeInterval: metrics.LabelMap{Label: "probe"},
activeProbeCh: map[string]chan struct{}{},
}
}

// Expvar returns the metrics for running probes.
func (p *Prober) Expvar() *metrics.Set {
ret := new(metrics.Set)
ret.Set("start_secs", &p.lastStart)
ret.Set("end_secs", &p.lastEnd)
ret.Set("result", &p.lastResult)
ret.Set("latency_millis", &p.lastLatency)
ret.Set("interval_secs", &p.probeInterval)
return ret
}
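
// A hedged usage sketch, not code from this package: the returned *metrics.Set
// behaves like an expvar variable, so a caller that also imports the standard
// expvar package could publish it and serve the per-probe metrics from the
// usual /debug/vars endpoint.
//
//	p := prober.New()
//	expvar.Publish("prober", p.Expvar())
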
// Run executes fun every interval, and exports probe results under the given name.
//
// fun is given a context.Context that, if obeyed, ensures that fun
// ends within interval. If fun disregards the context, it will not be
// run again until it does finish, and metrics will reflect that the
// probe function is stuck.
//
// Run returns a context.CancelFunc that stops the probe when
// invoked. Probe shutdown and removal happens-before the CancelFunc
// returns.
//
// Registering a probe under an already-registered name panics.
func (p *Prober) Run(name string, interval time.Duration, fun Probe) context.CancelFunc {
p.mu.Lock()
defer p.mu.Unlock()
ticker := p.registerLocked(name, interval)
ctx, cancel := context.WithCancel(context.Background())
go p.probeLoop(ctx, name, interval, ticker, fun)
return func() {
p.mu.Lock()
stopped := p.activeProbeCh[name]
p.mu.Unlock()
cancel()
<-stopped
}
}
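
// A hedged usage sketch with hypothetical names (not code from this package):
// register a probe that runs every 30 seconds, then stop it later with the
// returned CancelFunc, which blocks until the probe is shut down and
// unregistered.
//
//	p := prober.New()
//	stop := p.Run("web-healthz", 30*time.Second, func(ctx context.Context) error {
//		// ...perform the actual check here, honoring ctx...
//		return nil
//	})
//	// ...
//	stop()
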
// probeLoop invokes runProbe on fun every interval. The first probe
// is run immediately; subsequent probes run each time tick fires.
func (p *Prober) probeLoop(ctx context.Context, name string, interval time.Duration, tick ticker, fun Probe) {
defer func() {
p.unregister(name)
tick.Stop()
}()
// Do a first probe right away, so that the prober immediately exports results for everything.
p.runProbe(ctx, name, interval, fun)
for {
select {
case <-tick.Chan():
p.runProbe(ctx, name, interval, fun)
case <-ctx.Done():
return
}
}
}

// runProbe invokes fun and records the results.
//
// fun is invoked with a timeout slightly less than interval, so that
// the probe either succeeds or fails before the next cycle is
// scheduled to start.
func (p *Prober) runProbe(ctx context.Context, name string, interval time.Duration, fun Probe) {
start := p.start(name)
defer func() {
// Prevent a panic within one probe function from killing the
// entire prober, so that a single buggy probe doesn't destroy
// our entire ability to monitor anything. A panic is recorded
// as a probe failure, so panicking probes will trigger an
// alert for debugging.
if r := recover(); r != nil {
log.Printf("probe %s panicked: %v", name, r)
p.end(name, start, errors.New("panic"))
}
}()
timeout := time.Duration(float64(interval) * 0.8)
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
err := fun(ctx)
p.end(name, start, err)
if err != nil {
log.Printf("probe %s: %v", name, err)
}
}

func (p *Prober) registerLocked(name string, interval time.Duration) ticker {
if _, ok := p.activeProbeCh[name]; ok {
panic(fmt.Sprintf("probe named %q already registered", name))
}
stoppedCh := make(chan struct{})
p.activeProbeCh[name] = stoppedCh
p.probeInterval.Get(name).Set(int64(interval.Seconds()))
// Create and return a ticker from here, while Prober is
// locked. This ensures that our fake time in tests always sees
// the new fake ticker being created before seeing that a new
// probe is registered.
return p.newTicker(interval)
}

func (p *Prober) unregister(name string) {
p.mu.Lock()
defer p.mu.Unlock()
close(p.activeProbeCh[name])
delete(p.activeProbeCh, name)
p.lastStart.Delete(name)
p.lastEnd.Delete(name)
p.lastResult.Delete(name)
p.lastLatency.Delete(name)
p.probeInterval.Delete(name)
}

func (p *Prober) start(name string) time.Time {
st := p.now()
p.lastStart.Get(name).Set(st.Unix())
return st
}

func (p *Prober) end(name string, start time.Time, err error) {
end := p.now()
p.lastEnd.Get(name).Set(end.Unix())
p.lastLatency.Get(name).Set(end.Sub(start).Milliseconds())
v := int64(1)
if err != nil {
v = 0
}
p.lastResult.Get(name).Set(v)
}

// activeProbes reports the number of registered probes. For tests only.
func (p *Prober) activeProbes() int {
p.mu.Lock()
defer p.mu.Unlock()
return len(p.activeProbeCh)
}

// ticker wraps a time.Ticker in a way that can be faked for tests.
type ticker interface {
Chan() <-chan time.Time
Stop()
}

type realTicker struct {
*time.Ticker
}

func (t *realTicker) Chan() <-chan time.Time {
return t.Ticker.C
}

func newRealTicker(d time.Duration) ticker {
return &realTicker{time.NewTicker(d)}
}
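
// An illustrative sketch, not part of this file, of how the ticker interface
// might be faked in tests (hypothetical type and method names, which may
// differ from the package's actual test fake): ticks are delivered by the
// test rather than by the wall clock.
//
//	type fakeTicker struct {
//		c chan time.Time
//	}
//
//	func (t *fakeTicker) Chan() <-chan time.Time { return t.c }
//	func (t *fakeTicker) Stop()                  {}
//
//	// tick delivers one synthetic tick to whatever is selecting on Chan.
//	func (t *fakeTicker) tick(now time.Time) { t.c <- now }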