prober: add an HTTP endpoint for triggering a probe

- Keep track of the last 10 probe results and successful probe latencies; - Add an HTTP handler that triggers a given probe by name and returns it result as a plaintext HTML page, showing recent probe results as a baseline Updates tailscale/corp#20583 Signed-off-by: Anton Tolchanov <anton@tailscale.com>
2025-08-14 23:17:29 +00:00 · 2024-07-30 23:06:39 +01:00
parent 227509547f
commit 153a476957
2 changed files with 311 additions and 40 deletions
--- a/prober/prober_test.go
+++ b/prober/prober_test.go
@@ -13,6 +13,8 @@ import (
 	"testing"
 	"time"

+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/prometheus/client_golang/prometheus/testutil"
 	"tailscale.com/tstest"
 )
@@ -292,6 +294,173 @@ func TestOnceMode(t *testing.T) {
 	}
 }

+func TestProberProbeInfo(t *testing.T) {
+	clk := newFakeTime()
+	p := newForTest(clk.Now, clk.NewTicker).WithOnce(true)
+
+	p.Run("probe1", probeInterval, nil, FuncProbe(func(context.Context) error {
+		clk.Advance(500 * time.Millisecond)
+		return nil
+	}))
+	p.Run("probe2", probeInterval, nil, FuncProbe(func(context.Context) error { return fmt.Errorf("error2") }))
+	p.Wait()
+
+	info := p.ProbeInfo()
+	wantInfo := map[string]ProbeInfo{
+		"probe1": {
+			Name:            "probe1",
+			Interval:        probeInterval,
+			Labels:          map[string]string{"class": "", "name": "probe1"},
+			Latency:         500 * time.Millisecond,
+			Result:          true,
+			RecentResults:   []bool{true},
+			RecentLatencies: []time.Duration{500 * time.Millisecond},
+		},
+		"probe2": {
+			Name:            "probe2",
+			Interval:        probeInterval,
+			Labels:          map[string]string{"class": "", "name": "probe2"},
+			Error:           "error2",
+			RecentResults:   []bool{false},
+			RecentLatencies: nil, // no latency for failed probes
+		},
+	}
+
+	if diff := cmp.Diff(wantInfo, info, cmpopts.IgnoreFields(ProbeInfo{}, "Start", "End")); diff != "" {
+		t.Fatalf("unexpected ProbeInfo (-want +got):\n%s", diff)
+	}
+}
+
+func TestProbeInfoRecent(t *testing.T) {
+	type probeResult struct {
+		latency time.Duration
+		err     error
+	}
+	tests := []struct {
+		name                    string
+		results                 []probeResult
+		wantProbeInfo           ProbeInfo
+		wantRecentSuccessRatio  float64
+		wantRecentMedianLatency time.Duration
+	}{
+		{
+			name:                    "no_runs",
+			wantProbeInfo:           ProbeInfo{},
+			wantRecentSuccessRatio:  0,
+			wantRecentMedianLatency: 0,
+		},
+		{
+			name:    "single_success",
+			results: []probeResult{{latency: 100 * time.Millisecond, err: nil}},
+			wantProbeInfo: ProbeInfo{
+				Latency:         100 * time.Millisecond,
+				Result:          true,
+				RecentResults:   []bool{true},
+				RecentLatencies: []time.Duration{100 * time.Millisecond},
+			},
+			wantRecentSuccessRatio:  1,
+			wantRecentMedianLatency: 100 * time.Millisecond,
+		},
+		{
+			name:    "single_failure",
+			results: []probeResult{{latency: 100 * time.Millisecond, err: errors.New("error123")}},
+			wantProbeInfo: ProbeInfo{
+				Result:          false,
+				RecentResults:   []bool{false},
+				RecentLatencies: nil,
+				Error:           "error123",
+			},
+			wantRecentSuccessRatio:  0,
+			wantRecentMedianLatency: 0,
+		},
+		{
+			name: "recent_mix",
+			results: []probeResult{
+				{latency: 10 * time.Millisecond, err: errors.New("error1")},
+				{latency: 20 * time.Millisecond, err: nil},
+				{latency: 30 * time.Millisecond, err: nil},
+				{latency: 40 * time.Millisecond, err: errors.New("error4")},
+				{latency: 50 * time.Millisecond, err: nil},
+				{latency: 60 * time.Millisecond, err: nil},
+				{latency: 70 * time.Millisecond, err: errors.New("error7")},
+				{latency: 80 * time.Millisecond, err: nil},
+			},
+			wantProbeInfo: ProbeInfo{
+				Result:        true,
+				Latency:       80 * time.Millisecond,
+				RecentResults: []bool{false, true, true, false, true, true, false, true},
+				RecentLatencies: []time.Duration{
+					20 * time.Millisecond,
+					30 * time.Millisecond,
+					50 * time.Millisecond,
+					60 * time.Millisecond,
+					80 * time.Millisecond,
+				},
+			},
+			wantRecentSuccessRatio:  0.625,
+			wantRecentMedianLatency: 50 * time.Millisecond,
+		},
+		{
+			name: "only_last_10",
+			results: []probeResult{
+				{latency: 10 * time.Millisecond, err: errors.New("old_error")},
+				{latency: 20 * time.Millisecond, err: nil},
+				{latency: 30 * time.Millisecond, err: nil},
+				{latency: 40 * time.Millisecond, err: nil},
+				{latency: 50 * time.Millisecond, err: nil},
+				{latency: 60 * time.Millisecond, err: nil},
+				{latency: 70 * time.Millisecond, err: nil},
+				{latency: 80 * time.Millisecond, err: nil},
+				{latency: 90 * time.Millisecond, err: nil},
+				{latency: 100 * time.Millisecond, err: nil},
+				{latency: 110 * time.Millisecond, err: nil},
+			},
+			wantProbeInfo: ProbeInfo{
+				Result:        true,
+				Latency:       110 * time.Millisecond,
+				RecentResults: []bool{true, true, true, true, true, true, true, true, true, true},
+				RecentLatencies: []time.Duration{
+					20 * time.Millisecond,
+					30 * time.Millisecond,
+					40 * time.Millisecond,
+					50 * time.Millisecond,
+					60 * time.Millisecond,
+					70 * time.Millisecond,
+					80 * time.Millisecond,
+					90 * time.Millisecond,
+					100 * time.Millisecond,
+					110 * time.Millisecond,
+				},
+			},
+			wantRecentSuccessRatio:  1,
+			wantRecentMedianLatency: 70 * time.Millisecond,
+		},
+	}
+
+	clk := newFakeTime()
+	p := newForTest(clk.Now, clk.NewTicker).WithOnce(true)
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			probe := newProbe(p, "", probeInterval, nil, FuncProbe(func(context.Context) error { return nil }))
+			for _, r := range tt.results {
+				probe.recordStart()
+				clk.Advance(r.latency)
+				probe.recordEnd(r.err)
+			}
+			info := probe.probeInfoLocked()
+			if diff := cmp.Diff(tt.wantProbeInfo, info, cmpopts.IgnoreFields(ProbeInfo{}, "Start", "End", "Interval")); diff != "" {
+				t.Fatalf("unexpected ProbeInfo (-want +got):\n%s", diff)
+			}
+			if got := info.RecentSuccessRatio(); got != tt.wantRecentSuccessRatio {
+				t.Errorf("recentSuccessRatio() = %v, want %v", got, tt.wantRecentSuccessRatio)
+			}
+			if got := info.RecentMedianLatency(); got != tt.wantRecentMedianLatency {
+				t.Errorf("recentMedianLatency() = %v, want %v", got, tt.wantRecentMedianLatency)
+			}
+		})
+	}
+}
+
 type fakeTicker struct {
 	ch       chan time.Time
 	interval time.Duration