prober: optionally spread probes over time

By default all probes with the same probe interval that have been added together will run on a synchronized schedule, which results in spiky resource usage and potential throttling by third-party systems (for example, OCSP servers used by the TLS probes). To address this, prober can now run in "spread" mode that will introduce a random delay before the first run of each probe. Signed-off-by: Anton Tolchanov <anton@tailscale.com>
2025-12-13 10:31:57 +00:00 · 2022-10-20 23:36:02 +01:00
parent adec726fee
commit bd47e28638
2 changed files with 114 additions and 21 deletions
--- a/prober/prober.go
+++ b/prober/prober.go
@@ -13,8 +13,10 @@ import (
 	"errors"
 	"expvar"
 	"fmt"
+	"hash/fnv"
 	"io"
 	"log"
+	"math/rand"
 	"sort"
 	"strings"
 	"sync"
@@ -28,6 +30,10 @@ type ProbeFunc func(context.Context) error

 // a Prober manages a set of probes and keeps track of their results.
 type Prober struct {
+	// Whether to spread probe execution over time by introducing a
+	// random delay before the first probe run.
+	spread bool
+
 	// Time-related functions that get faked out during tests.
 	now       func() time.Time
 	newTicker func(time.Duration) ticker
@@ -65,7 +71,6 @@ func (p *Prober) Run(name string, interval time.Duration, labels map[string]stri
 	}

 	ctx, cancel := context.WithCancel(context.Background())
-	ticker := p.newTicker(interval)
 	probe := &Probe{
 		prober:  p,
 		ctx:     ctx,
@@ -75,7 +80,7 @@ func (p *Prober) Run(name string, interval time.Duration, labels map[string]stri
 		name:         name,
 		doProbe:      fun,
 		interval:     interval,
-		tick:     ticker,
+		initialDelay: initialDelay(name, interval),
 		labels:       labels,
 	}
 	p.probes[name] = probe
@@ -90,6 +95,13 @@ func (p *Prober) unregister(probe *Probe) {
 	delete(p.probes, name)
 }

+// WithSpread is used to enable random delay before the first run of
+// each added probe.
+func (p *Prober) WithSpread(s bool) *Prober {
+	p.spread = s
+	return p
+}
+
 // Reports the number of registered probes. For tests only.
 func (p *Prober) activeProbes() int {
 	p.mu.Lock()
@@ -108,6 +120,7 @@ type Probe struct {
 	name         string
 	doProbe      ProbeFunc
 	interval     time.Duration
+	initialDelay time.Duration
 	tick         ticker
 	labels       map[string]string

@@ -127,12 +140,26 @@ func (p *Probe) Close() error {
 }

 // probeLoop invokes runProbe on fun every interval. The first probe
-// is run after interval.
+// is run after a random delay (if spreading is enabled) or immediately.
 func (p *Probe) loop() {
 	defer close(p.stopped)

-	// Do a first probe right away, so that the prober immediately exports results for everything.
+	if p.prober.spread && p.initialDelay > 0 {
+		t := p.prober.newTicker(p.initialDelay)
+		select {
+		case <-t.Chan():
 			p.run()
+		case <-p.ctx.Done():
+			t.Stop()
+			return
+		}
+		t.Stop()
+	} else {
+		p.run()
+	}
+
+	p.tick = p.prober.newTicker(p.interval)
+	defer p.tick.Stop()
 	for {
 		select {
 		case <-p.tick.Chan():
@@ -310,3 +337,12 @@ func (t *realTicker) Chan() <-chan time.Time {
 func newRealTicker(d time.Duration) ticker {
 	return &realTicker{time.NewTicker(d)}
 }
+
+// initialDelay returns a pseudorandom duration in [0, interval) that
+// is based on the provided seed string.
+func initialDelay(seed string, interval time.Duration) time.Duration {
+	h := fnv.New64()
+	fmt.Fprint(h, seed)
+	r := rand.New(rand.NewSource(int64(h.Sum64()))).Float64()
+	return time.Duration(float64(interval) * r)
+}
--- a/prober/prober_test.go
+++ b/prober/prober_test.go
@@ -60,7 +60,7 @@ func TestProberTiming(t *testing.T) {
 		return nil
 	})

-	waitActiveProbes(t, p, 1)
+	waitActiveProbes(t, p, clk, 1)

 	called()
 	notCalled()
@@ -74,6 +74,49 @@ func TestProberTiming(t *testing.T) {
 	notCalled()
 }

+func TestProberTimingSpread(t *testing.T) {
+	clk := newFakeTime()
+	p := newForTest(clk.Now, clk.NewTicker).WithSpread(true)
+
+	invoked := make(chan struct{}, 1)
+
+	notCalled := func() {
+		t.Helper()
+		select {
+		case <-invoked:
+			t.Fatal("probe was invoked earlier than expected")
+		default:
+		}
+	}
+	called := func() {
+		t.Helper()
+		select {
+		case <-invoked:
+		case <-time.After(2 * time.Second):
+			t.Fatal("probe wasn't invoked as expected")
+		}
+	}
+
+	p.Run("test-spread-probe", probeInterval, nil, func(context.Context) error {
+		invoked <- struct{}{}
+		return nil
+	})
+
+	waitActiveProbes(t, p, clk, 1)
+
+	notCalled()
+	// Name of the probe (test-spread-probe) has been chosen to ensure that
+	// the initial delay is smaller than half of the probe interval.
+	clk.Advance(halfProbeInterval)
+	called()
+	notCalled()
+	clk.Advance(quarterProbeInterval)
+	notCalled()
+	clk.Advance(probeInterval)
+	called()
+	notCalled()
+}
+
 func TestProberRun(t *testing.T) {
 	clk := newFakeTime()
 	p := newForTest(clk.Now, clk.NewTicker)
@@ -111,7 +154,7 @@ func TestProberRun(t *testing.T) {
 		}
 	}

-	waitActiveProbes(t, p, startingProbes)
+	waitActiveProbes(t, p, clk, startingProbes)
 	checkCnt(startingProbes)
 	clk.Advance(probeInterval + halfProbeInterval)
 	checkCnt(startingProbes)
@@ -121,7 +164,7 @@ func TestProberRun(t *testing.T) {
 	for i := keep; i < startingProbes; i++ {
 		probes[i].Close()
 	}
-	waitActiveProbes(t, p, keep)
+	waitActiveProbes(t, p, clk, keep)

 	clk.Advance(probeInterval)
 	checkCnt(keep)
@@ -140,7 +183,7 @@ func TestExpvar(t *testing.T) {
 		return errors.New("failing, as instructed by test")
 	})

-	waitActiveProbes(t, p, 1)
+	waitActiveProbes(t, p, clk, 1)

 	check := func(name string, want probeInfo) {
 		t.Helper()
@@ -198,7 +241,7 @@ func TestPrometheus(t *testing.T) {
 		return errors.New("failing, as instructed by test")
 	})

-	waitActiveProbes(t, p, 1)
+	waitActiveProbes(t, p, clk, 1)

 	err := tstest.WaitFor(convergenceTimeout, func() error {
 		var b bytes.Buffer
@@ -326,6 +369,17 @@ func (t *fakeTime) Advance(d time.Duration) {
 	}
 }

+func (t *fakeTime) activeTickers() (count int) {
+	t.Lock()
+	defer t.Unlock()
+	for _, tick := range t.tickers {
+		if !tick.stopped {
+			count += 1
+		}
+	}
+	return
+}
+
 func probeExpvar(t *testing.T, p *Prober) map[string]*probeInfo {
 	t.Helper()
 	s := p.Expvar().String()
@@ -336,11 +390,14 @@ func probeExpvar(t *testing.T, p *Prober) map[string]*probeInfo {
 	return ret
 }

-func waitActiveProbes(t *testing.T, p *Prober, want int) {
+func waitActiveProbes(t *testing.T, p *Prober, clk *fakeTime, want int) {
 	t.Helper()
 	err := tstest.WaitFor(convergenceTimeout, func() error {
 		if got := p.activeProbes(); got != want {
-			return fmt.Errorf("active probe count is %d, want %d", got, want)
+			return fmt.Errorf("installed probe count is %d, want %d", got, want)
+		}
+		if got := clk.activeTickers(); got != want {
+			return fmt.Errorf("active ticker count is %d, want %d", got, want)
 		}
 		return nil
 	})