prober: optionally spread probes over time

By default all probes with the same probe interval that have been added together will run on a synchronized schedule, which results in spiky resource usage and potential throttling by third-party systems (for example, OCSP servers used by the TLS probes). To address this, prober can now run in "spread" mode that will introduce a random delay before the first run of each probe. Signed-off-by: Anton Tolchanov <anton@tailscale.com>
2025-10-09 16:11:23 +00:00 · 2022-10-20 23:36:02 +01:00
parent adec726fee
commit bd47e28638
2 changed files with 114 additions and 21 deletions
--- a/prober/prober_test.go
+++ b/prober/prober_test.go
@@ -60,7 +60,7 @@ func TestProberTiming(t *testing.T) {
 		return nil
 	})

-	waitActiveProbes(t, p, 1)
+	waitActiveProbes(t, p, clk, 1)

 	called()
 	notCalled()
@@ -74,6 +74,49 @@ func TestProberTiming(t *testing.T) {
 	notCalled()
 }

+func TestProberTimingSpread(t *testing.T) {
+	clk := newFakeTime()
+	p := newForTest(clk.Now, clk.NewTicker).WithSpread(true)
+
+	invoked := make(chan struct{}, 1)
+
+	notCalled := func() {
+		t.Helper()
+		select {
+		case <-invoked:
+			t.Fatal("probe was invoked earlier than expected")
+		default:
+		}
+	}
+	called := func() {
+		t.Helper()
+		select {
+		case <-invoked:
+		case <-time.After(2 * time.Second):
+			t.Fatal("probe wasn't invoked as expected")
+		}
+	}
+
+	p.Run("test-spread-probe", probeInterval, nil, func(context.Context) error {
+		invoked <- struct{}{}
+		return nil
+	})
+
+	waitActiveProbes(t, p, clk, 1)
+
+	notCalled()
+	// Name of the probe (test-spread-probe) has been chosen to ensure that
+	// the initial delay is smaller than half of the probe interval.
+	clk.Advance(halfProbeInterval)
+	called()
+	notCalled()
+	clk.Advance(quarterProbeInterval)
+	notCalled()
+	clk.Advance(probeInterval)
+	called()
+	notCalled()
+}
+
 func TestProberRun(t *testing.T) {
 	clk := newFakeTime()
 	p := newForTest(clk.Now, clk.NewTicker)
@@ -111,7 +154,7 @@ func TestProberRun(t *testing.T) {
 		}
 	}

-	waitActiveProbes(t, p, startingProbes)
+	waitActiveProbes(t, p, clk, startingProbes)
 	checkCnt(startingProbes)
 	clk.Advance(probeInterval + halfProbeInterval)
 	checkCnt(startingProbes)
@@ -121,7 +164,7 @@ func TestProberRun(t *testing.T) {
 	for i := keep; i < startingProbes; i++ {
 		probes[i].Close()
 	}
-	waitActiveProbes(t, p, keep)
+	waitActiveProbes(t, p, clk, keep)

 	clk.Advance(probeInterval)
 	checkCnt(keep)
@@ -140,7 +183,7 @@ func TestExpvar(t *testing.T) {
 		return errors.New("failing, as instructed by test")
 	})

-	waitActiveProbes(t, p, 1)
+	waitActiveProbes(t, p, clk, 1)

 	check := func(name string, want probeInfo) {
 		t.Helper()
@@ -198,7 +241,7 @@ func TestPrometheus(t *testing.T) {
 		return errors.New("failing, as instructed by test")
 	})

-	waitActiveProbes(t, p, 1)
+	waitActiveProbes(t, p, clk, 1)

 	err := tstest.WaitFor(convergenceTimeout, func() error {
 		var b bytes.Buffer
@@ -326,6 +369,17 @@ func (t *fakeTime) Advance(d time.Duration) {
 	}
 }

+func (t *fakeTime) activeTickers() (count int) {
+	t.Lock()
+	defer t.Unlock()
+	for _, tick := range t.tickers {
+		if !tick.stopped {
+			count += 1
+		}
+	}
+	return
+}
+
 func probeExpvar(t *testing.T, p *Prober) map[string]*probeInfo {
 	t.Helper()
 	s := p.Expvar().String()
@@ -336,11 +390,14 @@ func probeExpvar(t *testing.T, p *Prober) map[string]*probeInfo {
 	return ret
 }

-func waitActiveProbes(t *testing.T, p *Prober, want int) {
+func waitActiveProbes(t *testing.T, p *Prober, clk *fakeTime, want int) {
 	t.Helper()
 	err := tstest.WaitFor(convergenceTimeout, func() error {
 		if got := p.activeProbes(); got != want {
-			return fmt.Errorf("active probe count is %d, want %d", got, want)
+			return fmt.Errorf("installed probe count is %d, want %d", got, want)
+		}
+		if got := clk.activeTickers(); got != want {
+			return fmt.Errorf("active ticker count is %d, want %d", got, want)
 		}
 		return nil
 	})