prober: add an HTTP endpoint for triggering a probe

- Keep track of the last 10 probe results and successful probe
  latencies;
- Add an HTTP handler that triggers a given probe by name and returns it
  result as a plaintext HTML page, showing recent probe results as a
  baseline

Updates tailscale/corp#20583

Signed-off-by: Anton Tolchanov <anton@tailscale.com>
This commit is contained in:
Anton Tolchanov
2024-07-30 23:06:39 +01:00
committed by Anton Tolchanov
parent 227509547f
commit 153a476957
2 changed files with 311 additions and 40 deletions

View File

@@ -13,6 +13,8 @@ import (
"testing"
"time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/prometheus/client_golang/prometheus/testutil"
"tailscale.com/tstest"
)
@@ -292,6 +294,173 @@ func TestOnceMode(t *testing.T) {
}
}
func TestProberProbeInfo(t *testing.T) {
clk := newFakeTime()
p := newForTest(clk.Now, clk.NewTicker).WithOnce(true)
p.Run("probe1", probeInterval, nil, FuncProbe(func(context.Context) error {
clk.Advance(500 * time.Millisecond)
return nil
}))
p.Run("probe2", probeInterval, nil, FuncProbe(func(context.Context) error { return fmt.Errorf("error2") }))
p.Wait()
info := p.ProbeInfo()
wantInfo := map[string]ProbeInfo{
"probe1": {
Name: "probe1",
Interval: probeInterval,
Labels: map[string]string{"class": "", "name": "probe1"},
Latency: 500 * time.Millisecond,
Result: true,
RecentResults: []bool{true},
RecentLatencies: []time.Duration{500 * time.Millisecond},
},
"probe2": {
Name: "probe2",
Interval: probeInterval,
Labels: map[string]string{"class": "", "name": "probe2"},
Error: "error2",
RecentResults: []bool{false},
RecentLatencies: nil, // no latency for failed probes
},
}
if diff := cmp.Diff(wantInfo, info, cmpopts.IgnoreFields(ProbeInfo{}, "Start", "End")); diff != "" {
t.Fatalf("unexpected ProbeInfo (-want +got):\n%s", diff)
}
}
func TestProbeInfoRecent(t *testing.T) {
type probeResult struct {
latency time.Duration
err error
}
tests := []struct {
name string
results []probeResult
wantProbeInfo ProbeInfo
wantRecentSuccessRatio float64
wantRecentMedianLatency time.Duration
}{
{
name: "no_runs",
wantProbeInfo: ProbeInfo{},
wantRecentSuccessRatio: 0,
wantRecentMedianLatency: 0,
},
{
name: "single_success",
results: []probeResult{{latency: 100 * time.Millisecond, err: nil}},
wantProbeInfo: ProbeInfo{
Latency: 100 * time.Millisecond,
Result: true,
RecentResults: []bool{true},
RecentLatencies: []time.Duration{100 * time.Millisecond},
},
wantRecentSuccessRatio: 1,
wantRecentMedianLatency: 100 * time.Millisecond,
},
{
name: "single_failure",
results: []probeResult{{latency: 100 * time.Millisecond, err: errors.New("error123")}},
wantProbeInfo: ProbeInfo{
Result: false,
RecentResults: []bool{false},
RecentLatencies: nil,
Error: "error123",
},
wantRecentSuccessRatio: 0,
wantRecentMedianLatency: 0,
},
{
name: "recent_mix",
results: []probeResult{
{latency: 10 * time.Millisecond, err: errors.New("error1")},
{latency: 20 * time.Millisecond, err: nil},
{latency: 30 * time.Millisecond, err: nil},
{latency: 40 * time.Millisecond, err: errors.New("error4")},
{latency: 50 * time.Millisecond, err: nil},
{latency: 60 * time.Millisecond, err: nil},
{latency: 70 * time.Millisecond, err: errors.New("error7")},
{latency: 80 * time.Millisecond, err: nil},
},
wantProbeInfo: ProbeInfo{
Result: true,
Latency: 80 * time.Millisecond,
RecentResults: []bool{false, true, true, false, true, true, false, true},
RecentLatencies: []time.Duration{
20 * time.Millisecond,
30 * time.Millisecond,
50 * time.Millisecond,
60 * time.Millisecond,
80 * time.Millisecond,
},
},
wantRecentSuccessRatio: 0.625,
wantRecentMedianLatency: 50 * time.Millisecond,
},
{
name: "only_last_10",
results: []probeResult{
{latency: 10 * time.Millisecond, err: errors.New("old_error")},
{latency: 20 * time.Millisecond, err: nil},
{latency: 30 * time.Millisecond, err: nil},
{latency: 40 * time.Millisecond, err: nil},
{latency: 50 * time.Millisecond, err: nil},
{latency: 60 * time.Millisecond, err: nil},
{latency: 70 * time.Millisecond, err: nil},
{latency: 80 * time.Millisecond, err: nil},
{latency: 90 * time.Millisecond, err: nil},
{latency: 100 * time.Millisecond, err: nil},
{latency: 110 * time.Millisecond, err: nil},
},
wantProbeInfo: ProbeInfo{
Result: true,
Latency: 110 * time.Millisecond,
RecentResults: []bool{true, true, true, true, true, true, true, true, true, true},
RecentLatencies: []time.Duration{
20 * time.Millisecond,
30 * time.Millisecond,
40 * time.Millisecond,
50 * time.Millisecond,
60 * time.Millisecond,
70 * time.Millisecond,
80 * time.Millisecond,
90 * time.Millisecond,
100 * time.Millisecond,
110 * time.Millisecond,
},
},
wantRecentSuccessRatio: 1,
wantRecentMedianLatency: 70 * time.Millisecond,
},
}
clk := newFakeTime()
p := newForTest(clk.Now, clk.NewTicker).WithOnce(true)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
probe := newProbe(p, "", probeInterval, nil, FuncProbe(func(context.Context) error { return nil }))
for _, r := range tt.results {
probe.recordStart()
clk.Advance(r.latency)
probe.recordEnd(r.err)
}
info := probe.probeInfoLocked()
if diff := cmp.Diff(tt.wantProbeInfo, info, cmpopts.IgnoreFields(ProbeInfo{}, "Start", "End", "Interval")); diff != "" {
t.Fatalf("unexpected ProbeInfo (-want +got):\n%s", diff)
}
if got := info.RecentSuccessRatio(); got != tt.wantRecentSuccessRatio {
t.Errorf("recentSuccessRatio() = %v, want %v", got, tt.wantRecentSuccessRatio)
}
if got := info.RecentMedianLatency(); got != tt.wantRecentMedianLatency {
t.Errorf("recentMedianLatency() = %v, want %v", got, tt.wantRecentMedianLatency)
}
})
}
}
type fakeTicker struct {
ch chan time.Time
interval time.Duration