prober: optionally spread probes over time

By default all probes with the same probe interval that have been added
together will run on a synchronized schedule, which results in spiky
resource usage and potential throttling by third-party systems (for
example, OCSP servers used by the TLS probes).

To address this, prober can now run in a "spread" mode that introduces
a random delay before the first run of each probe.

Signed-off-by: Anton Tolchanov <anton@tailscale.com>
This commit is contained in:
Anton Tolchanov
2022-10-20 23:36:02 +01:00
committed by Anton Tolchanov
parent adec726fee
commit bd47e28638
2 changed files with 114 additions and 21 deletions

View File

@@ -60,7 +60,7 @@ func TestProberTiming(t *testing.T) {
return nil
})
waitActiveProbes(t, p, 1)
waitActiveProbes(t, p, clk, 1)
called()
notCalled()
@@ -74,6 +74,49 @@ func TestProberTiming(t *testing.T) {
notCalled()
}
// TestProberTimingSpread checks the schedule of a probe registered on a prober
// created with WithSpread(true): the probe must not fire right after it is
// registered, must fire once the fake clock has advanced by half of the probe
// interval, must stay quiet after a further quarter interval, and must fire
// again after one more full interval.
func TestProberTimingSpread(t *testing.T) {
	clk := newFakeTime()
	p := newForTest(clk.Now, clk.NewTicker).WithSpread(true)

	// Every probe run sends one token on this channel.
	ran := make(chan struct{}, 1)

	// expectRun fails the test unless a probe run is observed within two
	// seconds of wall-clock time.
	expectRun := func() {
		t.Helper()
		select {
		case <-ran:
		case <-time.After(2 * time.Second):
			t.Fatal("probe wasn't invoked as expected")
		}
	}
	// expectNoRun fails the test if a probe run is already pending.
	expectNoRun := func() {
		t.Helper()
		select {
		case <-ran:
			t.Fatal("probe was invoked earlier than expected")
		default:
		}
	}

	probeFn := func(context.Context) error {
		ran <- struct{}{}
		return nil
	}
	p.Run("test-spread-probe", probeInterval, nil, probeFn)

	waitActiveProbes(t, p, clk, 1)

	expectNoRun()

	// Name of the probe (test-spread-probe) has been chosen to ensure that
	// the initial delay is smaller than half of the probe interval.
	clk.Advance(halfProbeInterval)
	expectRun()
	expectNoRun()

	// A quarter of an interval later the next run must not have happened yet.
	clk.Advance(quarterProbeInterval)
	expectNoRun()

	// Advancing by a full interval must produce exactly one more run.
	clk.Advance(probeInterval)
	expectRun()
	expectNoRun()
}
func TestProberRun(t *testing.T) {
clk := newFakeTime()
p := newForTest(clk.Now, clk.NewTicker)
@@ -111,7 +154,7 @@ func TestProberRun(t *testing.T) {
}
}
waitActiveProbes(t, p, startingProbes)
waitActiveProbes(t, p, clk, startingProbes)
checkCnt(startingProbes)
clk.Advance(probeInterval + halfProbeInterval)
checkCnt(startingProbes)
@@ -121,7 +164,7 @@ func TestProberRun(t *testing.T) {
for i := keep; i < startingProbes; i++ {
probes[i].Close()
}
waitActiveProbes(t, p, keep)
waitActiveProbes(t, p, clk, keep)
clk.Advance(probeInterval)
checkCnt(keep)
@@ -140,7 +183,7 @@ func TestExpvar(t *testing.T) {
return errors.New("failing, as instructed by test")
})
waitActiveProbes(t, p, 1)
waitActiveProbes(t, p, clk, 1)
check := func(name string, want probeInfo) {
t.Helper()
@@ -198,7 +241,7 @@ func TestPrometheus(t *testing.T) {
return errors.New("failing, as instructed by test")
})
waitActiveProbes(t, p, 1)
waitActiveProbes(t, p, clk, 1)
err := tstest.WaitFor(convergenceTimeout, func() error {
var b bytes.Buffer
@@ -326,6 +369,17 @@ func (t *fakeTime) Advance(d time.Duration) {
}
}
// activeTickers returns the number of tickers registered on the fake clock
// whose stopped flag is not set. The clock's lock is held for the duration of
// the scan.
func (t *fakeTime) activeTickers() (count int) {
	t.Lock()
	defer t.Unlock()

	for _, tk := range t.tickers {
		if tk.stopped {
			continue
		}
		count++
	}
	return count
}
func probeExpvar(t *testing.T, p *Prober) map[string]*probeInfo {
t.Helper()
s := p.Expvar().String()
@@ -336,11 +390,14 @@ func probeExpvar(t *testing.T, p *Prober) map[string]*probeInfo {
return ret
}
func waitActiveProbes(t *testing.T, p *Prober, want int) {
func waitActiveProbes(t *testing.T, p *Prober, clk *fakeTime, want int) {
t.Helper()
err := tstest.WaitFor(convergenceTimeout, func() error {
if got := p.activeProbes(); got != want {
return fmt.Errorf("active probe count is %d, want %d", got, want)
return fmt.Errorf("installed probe count is %d, want %d", got, want)
}
if got := clk.activeTickers(); got != want {
return fmt.Errorf("active ticker count is %d, want %d", got, want)
}
return nil
})