From 00a4504cf1a1b150a8896283ccf5e01011112626 Mon Sep 17 00:00:00 2001 From: Percy Wegmann Date: Mon, 16 Dec 2024 23:05:46 -0600 Subject: [PATCH] cmd/derpprobe,prober: add ability to perform continuous queuing delay measurements against DERP servers This new type of probe sends DERP packets sized similarly to CallMeMaybe packets at a rate of 10 packets per second. It records the round-trip times in a Prometheus histogram. It also keeps track of how many packets are dropped. Packets that fail to arrive within 5 seconds are considered dropped. Updates tailscale/corp#24522 Signed-off-by: Percy Wegmann --- cmd/derpprobe/derpprobe.go | 31 +++--- prober/derp.go | 222 ++++++++++++++++++++++++++++++++++++- prober/histogram.go | 50 +++++++++ prober/histogram_test.go | 29 +++++ prober/prober.go | 64 +++++++++-- prober/prober_test.go | 17 +-- prober/status.go | 12 +- prober/status.html | 59 +++++++--- 8 files changed, 429 insertions(+), 55 deletions(-) create mode 100644 prober/histogram.go create mode 100644 prober/histogram_test.go diff --git a/cmd/derpprobe/derpprobe.go b/cmd/derpprobe/derpprobe.go index 620b96609..62b7d47a4 100644 --- a/cmd/derpprobe/derpprobe.go +++ b/cmd/derpprobe/derpprobe.go @@ -18,19 +18,21 @@ import ( ) var ( - derpMapURL = flag.String("derp-map", "https://login.tailscale.com/derpmap/default", "URL to DERP map (https:// or file://) or 'local' to use the local tailscaled's DERP map") - versionFlag = flag.Bool("version", false, "print version and exit") - listen = flag.String("listen", ":8030", "HTTP listen address") - probeOnce = flag.Bool("once", false, "probe once and print results, then exit; ignores the listen flag") - spread = flag.Bool("spread", true, "whether to spread probing over time") - interval = flag.Duration("interval", 15*time.Second, "probe interval") - meshInterval = flag.Duration("mesh-interval", 15*time.Second, "mesh probe interval") - stunInterval = flag.Duration("stun-interval", 15*time.Second, "STUN probe interval") - tlsInterval = flag.Duration("tls-interval", 15*time.Second, "TLS probe interval") - bwInterval = flag.Duration("bw-interval", 0, "bandwidth probe interval (0 = no bandwidth probing)") - bwSize = flag.Int64("bw-probe-size-bytes", 1_000_000, "bandwidth probe size") - bwTUNIPv4Address = flag.String("bw-tun-ipv4-addr", "", "if specified, bandwidth probes will be performed over a TUN device at this address in order to exercise TCP-in-TCP in similar fashion to TCP over Tailscale via DERP. We will use a /30 subnet including this IP address.") - regionCode = flag.String("region-code", "", "probe only this region (e.g. 
'lax'); if left blank, all regions will be probed") + derpMapURL = flag.String("derp-map", "https://login.tailscale.com/derpmap/default", "URL to DERP map (https:// or file://) or 'local' to use the local tailscaled's DERP map") + versionFlag = flag.Bool("version", false, "print version and exit") + listen = flag.String("listen", ":8030", "HTTP listen address") + probeOnce = flag.Bool("once", false, "probe once and print results, then exit; ignores the listen flag") + spread = flag.Bool("spread", true, "whether to spread probing over time") + interval = flag.Duration("interval", 15*time.Second, "probe interval") + meshInterval = flag.Duration("mesh-interval", 15*time.Second, "mesh probe interval") + stunInterval = flag.Duration("stun-interval", 15*time.Second, "STUN probe interval") + tlsInterval = flag.Duration("tls-interval", 15*time.Second, "TLS probe interval") + bwInterval = flag.Duration("bw-interval", 0, "bandwidth probe interval (0 = no bandwidth probing)") + bwSize = flag.Int64("bw-probe-size-bytes", 1_000_000, "bandwidth probe size") + bwTUNIPv4Address = flag.String("bw-tun-ipv4-addr", "", "if specified, bandwidth probes will be performed over a TUN device at this address in order to exercise TCP-in-TCP in similar fashion to TCP over Tailscale via DERP; we will use a /30 subnet including this IP address") + qdPacketsPerSecond = flag.Int("qd-packets-per-second", 0, "if greater than 0, queuing delay will be measured continuously using 260 byte packets (approximate size of a CallMeMaybe packet) sent at this rate per second") + qdPacketTimeout = flag.Duration("qd-packet-timeout", 5*time.Second, "queuing delay packets arriving after this period of time from being sent are treated like dropped packets and don't count toward queuing delay timings") + regionCode = flag.String("region-code", "", "probe only this region (e.g. 'lax'); if left blank, all regions will be probed") ) func main() { @@ -45,6 +47,7 @@ func main() { prober.WithMeshProbing(*meshInterval), prober.WithSTUNProbing(*stunInterval), prober.WithTLSProbing(*tlsInterval), + prober.WithQueuingDelayProbing(*qdPacketsPerSecond, *qdPacketTimeout), } if *bwInterval > 0 { opts = append(opts, prober.WithBandwidthProbing(*bwInterval, *bwSize, *bwTUNIPv4Address)) @@ -107,7 +110,7 @@ func getOverallStatus(p *prober.Prober) (o overallStatus) { // Do not show probes that have not finished yet. continue } - if i.Result { + if i.Status == prober.ProbeStatusSucceeded { o.addGoodf("%s: %s", p, i.Latency) } else { o.addBadf("%s: %s", p, i.Error) diff --git a/prober/derp.go b/prober/derp.go index 742e8a5f4..5adc0c0b4 100644 --- a/prober/derp.go +++ b/prober/derp.go @@ -8,6 +8,7 @@ import ( "cmp" "context" crand "crypto/rand" + "encoding/binary" "encoding/json" "errors" "expvar" @@ -17,6 +18,7 @@ import ( "net" "net/http" "net/netip" + "slices" "strconv" "strings" "sync" @@ -53,6 +55,10 @@ type derpProber struct { bwProbeSize int64 bwTUNIPv4Prefix *netip.Prefix // or nil to not use TUN + // Optional queuing delay probing. + qdPacketsPerSecond int // in packets per second + qdPacketTimeout time.Duration + // Optionally restrict probes to a single regionCode. 
regionCode string @@ -64,6 +70,7 @@ type derpProber struct { udpProbeFn func(string, int) ProbeClass meshProbeFn func(string, string) ProbeClass bwProbeFn func(string, string, int64) ProbeClass + qdProbeFn func(string, string, int, time.Duration) ProbeClass sync.Mutex lastDERPMap *tailcfg.DERPMap @@ -93,6 +100,16 @@ func WithBandwidthProbing(interval time.Duration, size int64, tunAddress string) } } +// WithQueuingDelayProbing enables/disables queuing delay probing. qdPacketsPerSecond +// is the number of packets sent per second. qdPacketTimeout is the amount of time +// after which a sent packet is considered to have timed out. +func WithQueuingDelayProbing(qdPacketsPerSecond int, qdPacketTimeout time.Duration) DERPOpt { + return func(d *derpProber) { + d.qdPacketsPerSecond = qdPacketsPerSecond + d.qdPacketTimeout = qdPacketTimeout + } +} + // WithMeshProbing enables mesh probing. When enabled, a small message will be // transferred through each DERP server and each pair of DERP servers. func WithMeshProbing(interval time.Duration) DERPOpt { @@ -147,6 +164,7 @@ func DERP(p *Prober, derpMapURL string, opts ...DERPOpt) (*derpProber, error) { d.udpProbeFn = d.ProbeUDP d.meshProbeFn = d.probeMesh d.bwProbeFn = d.probeBandwidth + d.qdProbeFn = d.probeQueuingDelay return d, nil } @@ -213,7 +231,7 @@ func (d *derpProber) probeMapFn(ctx context.Context) error { } } - if d.bwInterval > 0 && d.bwProbeSize > 0 { + if d.bwInterval != 0 && d.bwProbeSize > 0 { n := fmt.Sprintf("derp/%s/%s/%s/bw", region.RegionCode, server.Name, to.Name) wantProbes[n] = true if d.probes[n] == nil { @@ -225,6 +243,15 @@ func (d *derpProber) probeMapFn(ctx context.Context) error { d.probes[n] = d.p.Run(n, d.bwInterval, labels, d.bwProbeFn(server.Name, to.Name, d.bwProbeSize)) } } + + if d.qdPacketsPerSecond > 0 { + n := fmt.Sprintf("derp/%s/%s/%s/qd", region.RegionCode, server.Name, to.Name) + wantProbes[n] = true + if d.probes[n] == nil { + log.Printf("adding DERP queuing delay probe for %s->%s (%s)", server.Name, to.Name, region.RegionName) + d.probes[n] = d.p.Run(n, -10*time.Second, labels, d.qdProbeFn(server.Name, to.Name, d.qdPacketsPerSecond, d.qdPacketTimeout)) + } + } } } } @@ -240,7 +267,7 @@ func (d *derpProber) probeMapFn(ctx context.Context) error { return nil } -// probeMesh returs a probe class that sends a test packet through a pair of DERP +// probeMesh returns a probe class that sends a test packet through a pair of DERP // servers (or just one server, if 'from' and 'to' are the same). 'from' and 'to' // are expected to be names (DERPNode.Name) of two DERP servers in the same region. func (d *derpProber) probeMesh(from, to string) ProbeClass { @@ -263,7 +290,7 @@ func (d *derpProber) probeMesh(from, to string) ProbeClass { } } -// probeBandwidth returs a probe class that sends a payload of a given size +// probeBandwidth returns a probe class that sends a payload of a given size // through a pair of DERP servers (or just one server, if 'from' and 'to' are // the same). 'from' and 'to' are expected to be names (DERPNode.Name) of two // DERP servers in the same region. @@ -295,6 +322,193 @@ func (d *derpProber) probeBandwidth(from, to string, size int64) ProbeClass { } } +// probeQueuingDelay returns a probe class that continuously sends packets +// through a pair of DERP servers (or just one server, if 'from' and 'to' are +// the same) at a rate of `packetsPerSecond` packets per second in order to +// measure queuing delays.
Packets arriving after `packetTimeout` don't contribute +// to the queuing delay measurement and are recorded as dropped. 'from' and 'to' are +// expected to be names (DERPNode.Name) of two DERP servers in the same region, +// and may refer to the same server. +func (d *derpProber) probeQueuingDelay(from, to string, packetsPerSecond int, packetTimeout time.Duration) ProbeClass { + derpPath := "mesh" + if from == to { + derpPath = "single" + } + var packetsDropped expvar.Float + qdh := newHistogram([]float64{.005, .01, .025, .05, .1, .25, .5, 1}) + return ProbeClass{ + Probe: func(ctx context.Context) error { + fromN, toN, err := d.getNodePair(from, to) + if err != nil { + return err + } + return derpProbeQueuingDelay(ctx, d.lastDERPMap, fromN, toN, packetsPerSecond, packetTimeout, &packetsDropped, qdh) + }, + Class: "derp_qd", + Labels: Labels{"derp_path": derpPath}, + Metrics: func(l prometheus.Labels) []prometheus.Metric { + qdh.mx.Lock() + result := []prometheus.Metric{ + prometheus.MustNewConstMetric(prometheus.NewDesc("derp_qd_probe_dropped_packets", "Total packets dropped", nil, l), prometheus.CounterValue, float64(packetsDropped.Value())), + prometheus.MustNewConstHistogram(prometheus.NewDesc("derp_qd_probe_delays_seconds", "Distribution of queuing delays", nil, l), qdh.count, qdh.sum, qdh.bucketedCounts), + } + qdh.mx.Unlock() + return result + }, + } +} + +// derpProbeQueuingDelay continuously sends data between two local DERP clients +// connected to two DERP servers in order to measure queuing delays. From and to +// can be the same server. +func derpProbeQueuingDelay(ctx context.Context, dm *tailcfg.DERPMap, from, to *tailcfg.DERPNode, packetsPerSecond int, packetTimeout time.Duration, packetsDropped *expvar.Float, qdh *histogram) (err error) { + // This probe uses clients with isProber=false to avoid spamming the derper + // logs with every packet sent by the queuing delay probe. + fromc, err := newConn(ctx, dm, from, false) + if err != nil { + return err + } + defer fromc.Close() + toc, err := newConn(ctx, dm, to, false) + if err != nil { + return err + } + defer toc.Close() + + // Wait a bit for from's node to hear about to existing on the + // other node in the region, in the case where the two nodes + // are different. + if from.Name != to.Name { + time.Sleep(100 * time.Millisecond) // pretty arbitrary + } + + if err := runDerpProbeQueuingDelayContinously(ctx, from, to, fromc, toc, packetsPerSecond, packetTimeout, packetsDropped, qdh); err != nil { + // Record pubkeys on failed probes to aid investigation. + return fmt.Errorf("%s -> %s: %w", + fromc.SelfPublicKey().ShortString(), + toc.SelfPublicKey().ShortString(), err) + } + return nil +} + +func runDerpProbeQueuingDelayContinously(ctx context.Context, from, to *tailcfg.DERPNode, fromc, toc *derphttp.Client, packetsPerSecond int, packetTimeout time.Duration, packetsDropped *expvar.Float, qdh *histogram) error { + // Make sure all goroutines have finished. + var wg sync.WaitGroup + defer wg.Wait() + + // Close the clients to make sure goroutines that are reading/writing from them terminate. + defer fromc.Close() + defer toc.Close() + + type txRecord struct { + at time.Time + seq uint64 + } + // txRecords is sized to hold enough transmission records to keep timings + // for packets up to their timeout. As records age out of the front of this + // list, if the associated packet arrives, we won't have a txRecord for it + // and will consider it to have timed out. 
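+	// For example, at 10 packets per second (the rate described in the commit
+	// message) with the default 5 second packet timeout, this holds records for
+	// the 50 most recently sent packets.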
+ txRecords := make([]txRecord, 0, packetsPerSecond*int(packetTimeout.Seconds())) + var txRecordsMu sync.Mutex + + // Send the packets. + sendErrC := make(chan error, 1) + // TODO: construct a disco CallMeMaybe in the same fashion as magicsock, e.g. magic bytes, src pub, seal payload. + // DERP server handling of disco may vary from non-disco, and we may want to measure queue delay of both. + pkt := make([]byte, 260) // the same size as a CallMeMaybe packet observed on a Tailscale client. + crand.Read(pkt) + + wg.Add(1) + go func() { + defer wg.Done() + t := time.NewTicker(time.Second / time.Duration(packetsPerSecond)) + defer t.Stop() + + seq := uint64(0) + for { + select { + case <-ctx.Done(): + return + case <-t.C: + txRecordsMu.Lock() + if len(txRecords) == cap(txRecords) { + txRecords = slices.Delete(txRecords, 0, 1) + packetsDropped.Add(1) + } + txRecords = append(txRecords, txRecord{time.Now(), seq}) + txRecordsMu.Unlock() + binary.BigEndian.PutUint64(pkt, seq) + seq++ + if err := fromc.Send(toc.SelfPublicKey(), pkt); err != nil { + sendErrC <- fmt.Errorf("sending packet %w", err) + return + } + } + } + }() + + // Receive the packets. + recvFinishedC := make(chan error, 1) + wg.Add(1) + go func() { + defer wg.Done() + defer close(recvFinishedC) // to break out of 'select' below. + for { + m, err := toc.Recv() + if err != nil { + recvFinishedC <- err + return + } + switch v := m.(type) { + case derp.ReceivedPacket: + now := time.Now() + if v.Source != fromc.SelfPublicKey() { + recvFinishedC <- fmt.Errorf("got data packet from unexpected source, %v", v.Source) + return + } + seq := binary.BigEndian.Uint64(v.Data) + txRecordsMu.Lock() + findTxRecord: + for i, record := range txRecords { + switch { + case record.seq == seq: + rtt := now.Sub(record.at) + qdh.add(rtt.Seconds()) + txRecords = slices.Delete(txRecords, i, i+1) + break findTxRecord + case record.seq > seq: + // No sent time found, probably a late arrival already + // recorded as drop by sender when deleted. + break findTxRecord + case record.seq < seq: + continue + } + } + txRecordsMu.Unlock() + + case derp.KeepAliveMessage: + // Silently ignore. + + default: + log.Printf("%v: ignoring Recv frame type %T", to.Name, v) + // Loop. + } + } + }() + + select { + case <-ctx.Done(): + return fmt.Errorf("timeout: %w", ctx.Err()) + case err := <-sendErrC: + return fmt.Errorf("error sending via %q: %w", from.Name, err) + case err := <-recvFinishedC: + if err != nil { + return fmt.Errorf("error receiving from %q: %w", to.Name, err) + } + } + return nil +} + // getNodePair returns DERPNode objects for two DERP servers based on their // short names. func (d *derpProber) getNodePair(n1, n2 string) (ret1, ret2 *tailcfg.DERPNode, _ error) { @@ -573,6 +787,8 @@ func runDerpProbeNodePair(ctx context.Context, from, to *tailcfg.DERPNode, fromc recvc <- fmt.Errorf("got data packet %d from unexpected source, %v", idx, v.Source) return } + // This assumes that the packets are received reliably and in order. + // The DERP protocol does not guarantee this, but this probe assumes it. 
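+			// A reordered or corrupted packet will therefore fail the comparison
+			// below and cause the probe run to be reported as an error.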
if got, want := v.Data, pkts[idx]; !bytes.Equal(got, want) { recvc <- fmt.Errorf("unexpected data packet %d (out of %d)", idx, len(pkts)) return diff --git a/prober/histogram.go b/prober/histogram.go new file mode 100644 index 000000000..e9005b452 --- /dev/null +++ b/prober/histogram.go @@ -0,0 +1,50 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +package prober + +import ( + "slices" + "sync" +) + +// histogram serves as an adapter to the Prometheus histogram datatype. +// The prober framework passes labels at custom metric collection time that +// it expects to be coupled with the returned metrics. See ProbeClass.Metrics +// and its call sites. Native prometheus histograms cannot be collected while +// injecting more labels. Instead we use this type and pass observations + +// collection labels to prometheus.MustNewConstHistogram() at prometheus +// metric collection time. +type histogram struct { + count uint64 + sum float64 + buckets []float64 + bucketedCounts map[float64]uint64 + mx sync.Mutex +} + +// newHistogram constructs a histogram that buckets data based on the given +// slice of upper bounds. +func newHistogram(buckets []float64) *histogram { + slices.Sort(buckets) + return &histogram{ + buckets: buckets, + bucketedCounts: make(map[float64]uint64, len(buckets)), + } +} + +func (h *histogram) add(v float64) { + h.mx.Lock() + defer h.mx.Unlock() + + h.count++ + h.sum += v + + for _, b := range h.buckets { + if v > b { + continue + } + h.bucketedCounts[b] += 1 + break + } +} diff --git a/prober/histogram_test.go b/prober/histogram_test.go new file mode 100644 index 000000000..a569167e6 --- /dev/null +++ b/prober/histogram_test.go @@ -0,0 +1,29 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +package prober + +import ( + "testing" + + "github.com/google/go-cmp/cmp" +) + +func TestHistogram(t *testing.T) { + h := newHistogram([]float64{1, 2}) + h.add(0.5) + h.add(1) + h.add(1.5) + h.add(2) + h.add(2.5) + + if diff := cmp.Diff(h.count, uint64(5)); diff != "" { + t.Errorf("wrong count; (-got+want):%v", diff) + } + if diff := cmp.Diff(h.sum, 7.5); diff != "" { + t.Errorf("wrong sum; (-got+want):%v", diff) + } + if diff := cmp.Diff(h.bucketedCounts, map[float64]uint64{1: 2, 2: 2}); diff != "" { + t.Errorf("wrong bucketedCounts; (-got+want):%v", diff) + } +} diff --git a/prober/prober.go b/prober/prober.go index 2a43628bd..e3860e7b9 100644 --- a/prober/prober.go +++ b/prober/prober.go @@ -94,6 +94,9 @@ func newForTest(now func() time.Time, newTicker func(time.Duration) ticker) *Pro // Run executes probe class function every interval, and exports probe results under probeName. // +// If interval is negative, the probe will run continuously. If it encounters a failure while +// running continuously, it will pause for -1*interval and then retry. +// // Registering a probe under an already-registered name panics. func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc ProbeClass) *Probe { p.mu.Lock() @@ -256,6 +259,11 @@ type Probe struct { latencyHist *ring.Ring } +// IsContinuous indicates that this is a continuous probe. +func (p *Probe) IsContinuous() bool { + return p.interval < 0 +} + // Close shuts down the Probe and unregisters it from its Prober. // It is safe to Run a new probe of the same name after Close returns. func (p *Probe) Close() error { @@ -288,6 +296,22 @@ func (p *Probe) loop() { return } + if p.IsContinuous() { + // Probe function is going to run continuously. 
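+		// The loop below returns only when the probe's context is canceled;
+		// individual probe failures do not stop a continuous probe.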
+ for { + p.run() + // Wait and then retry if probe fails. We use the inverse of the + // configured negative interval as our sleep period. + // TODO(percy):implement exponential backoff, possibly using logtail/backoff. + select { + case <-time.After(-1 * p.interval): + p.run() + case <-p.ctx.Done(): + return + } + } + } + p.tick = p.prober.newTicker(p.interval) defer p.tick.Stop() for { @@ -323,9 +347,13 @@ func (p *Probe) run() (pi ProbeInfo, err error) { p.recordEnd(err) } }() - timeout := time.Duration(float64(p.interval) * 0.8) - ctx, cancel := context.WithTimeout(p.ctx, timeout) - defer cancel() + ctx := p.ctx + if !p.IsContinuous() { + timeout := time.Duration(float64(p.interval) * 0.8) + var cancel func() + ctx, cancel = context.WithTimeout(ctx, timeout) + defer cancel() + } err = p.probeClass.Probe(ctx) p.recordEnd(err) @@ -365,6 +393,16 @@ func (p *Probe) recordEnd(err error) { p.successHist = p.successHist.Next() } +// ProbeStatus indicates the status of a probe. +type ProbeStatus string + +const ( + ProbeStatusUnknown = "unknown" + ProbeStatusRunning = "running" + ProbeStatusFailed = "failed" + ProbeStatusSucceeded = "succeeded" +) + // ProbeInfo is a snapshot of the configuration and state of a Probe. type ProbeInfo struct { Name string @@ -374,7 +412,7 @@ type ProbeInfo struct { Start time.Time End time.Time Latency time.Duration - Result bool + Status ProbeStatus Error string RecentResults []bool RecentLatencies []time.Duration @@ -402,6 +440,10 @@ func (pb ProbeInfo) RecentMedianLatency() time.Duration { return pb.RecentLatencies[len(pb.RecentLatencies)/2] } +func (pb ProbeInfo) Continuous() bool { + return pb.Interval < 0 +} + // ProbeInfo returns the state of all probes. func (p *Prober) ProbeInfo() map[string]ProbeInfo { out := map[string]ProbeInfo{} @@ -429,9 +471,14 @@ func (probe *Probe) probeInfoLocked() ProbeInfo { Labels: probe.metricLabels, Start: probe.start, End: probe.end, - Result: probe.succeeded, } - if probe.lastErr != nil { + inf.Status = ProbeStatusUnknown + if probe.end.Before(probe.start) { + inf.Status = ProbeStatusRunning + } else if probe.succeeded { + inf.Status = ProbeStatusSucceeded + } else if probe.lastErr != nil { + inf.Status = ProbeStatusFailed inf.Error = probe.lastErr.Error() } if probe.latency > 0 { @@ -467,7 +514,7 @@ func (p *Prober) RunHandler(w http.ResponseWriter, r *http.Request) error { p.mu.Lock() probe, ok := p.probes[name] p.mu.Unlock() - if !ok { + if !ok || probe.IsContinuous() { return tsweb.Error(http.StatusNotFound, fmt.Sprintf("unknown probe %q", name), nil) } @@ -531,7 +578,8 @@ func (p *Probe) Collect(ch chan<- prometheus.Metric) { if !p.start.IsZero() { ch <- prometheus.MustNewConstMetric(p.mStartTime, prometheus.GaugeValue, float64(p.start.Unix())) } - if p.end.IsZero() { + // For periodic probes that haven't ended, don't collect probe metrics yet. 
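+	// Continuous probes are still collected so that metrics exported by their
+	// probe class (such as the queuing delay histogram) remain visible while
+	// the probe keeps running.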
+ if p.end.IsZero() && !p.IsContinuous() { return } ch <- prometheus.MustNewConstMetric(p.mEndTime, prometheus.GaugeValue, float64(p.end.Unix())) diff --git a/prober/prober_test.go b/prober/prober_test.go index 742a914b2..3905bfbc9 100644 --- a/prober/prober_test.go +++ b/prober/prober_test.go @@ -316,7 +316,7 @@ func TestProberProbeInfo(t *testing.T) { Interval: probeInterval, Labels: map[string]string{"class": "", "name": "probe1"}, Latency: 500 * time.Millisecond, - Result: true, + Status: ProbeStatusSucceeded, RecentResults: []bool{true}, RecentLatencies: []time.Duration{500 * time.Millisecond}, }, @@ -324,6 +324,7 @@ func TestProberProbeInfo(t *testing.T) { Name: "probe2", Interval: probeInterval, Labels: map[string]string{"class": "", "name": "probe2"}, + Status: ProbeStatusFailed, Error: "error2", RecentResults: []bool{false}, RecentLatencies: nil, // no latency for failed probes @@ -349,7 +350,7 @@ func TestProbeInfoRecent(t *testing.T) { }{ { name: "no_runs", - wantProbeInfo: ProbeInfo{}, + wantProbeInfo: ProbeInfo{Status: ProbeStatusUnknown}, wantRecentSuccessRatio: 0, wantRecentMedianLatency: 0, }, @@ -358,7 +359,7 @@ func TestProbeInfoRecent(t *testing.T) { results: []probeResult{{latency: 100 * time.Millisecond, err: nil}}, wantProbeInfo: ProbeInfo{ Latency: 100 * time.Millisecond, - Result: true, + Status: ProbeStatusSucceeded, RecentResults: []bool{true}, RecentLatencies: []time.Duration{100 * time.Millisecond}, }, @@ -369,7 +370,7 @@ func TestProbeInfoRecent(t *testing.T) { name: "single_failure", results: []probeResult{{latency: 100 * time.Millisecond, err: errors.New("error123")}}, wantProbeInfo: ProbeInfo{ - Result: false, + Status: ProbeStatusFailed, RecentResults: []bool{false}, RecentLatencies: nil, Error: "error123", @@ -390,7 +391,7 @@ func TestProbeInfoRecent(t *testing.T) { {latency: 80 * time.Millisecond, err: nil}, }, wantProbeInfo: ProbeInfo{ - Result: true, + Status: ProbeStatusSucceeded, Latency: 80 * time.Millisecond, RecentResults: []bool{false, true, true, false, true, true, false, true}, RecentLatencies: []time.Duration{ @@ -420,7 +421,7 @@ func TestProbeInfoRecent(t *testing.T) { {latency: 110 * time.Millisecond, err: nil}, }, wantProbeInfo: ProbeInfo{ - Result: true, + Status: ProbeStatusSucceeded, Latency: 110 * time.Millisecond, RecentResults: []bool{true, true, true, true, true, true, true, true, true, true}, RecentLatencies: []time.Duration{ @@ -483,7 +484,7 @@ func TestProberRunHandler(t *testing.T) { ProbeInfo: ProbeInfo{ Name: "success", Interval: probeInterval, - Result: true, + Status: ProbeStatusSucceeded, RecentResults: []bool{true, true}, }, PreviousSuccessRatio: 1, @@ -498,7 +499,7 @@ func TestProberRunHandler(t *testing.T) { ProbeInfo: ProbeInfo{ Name: "failure", Interval: probeInterval, - Result: false, + Status: ProbeStatusFailed, Error: "error123", RecentResults: []bool{false, false}, }, diff --git a/prober/status.go b/prober/status.go index aa9ef99d0..20fbeec58 100644 --- a/prober/status.go +++ b/prober/status.go @@ -62,8 +62,9 @@ func (p *Prober) StatusHandler(opts ...statusHandlerOpt) tsweb.ReturnHandlerFunc return func(w http.ResponseWriter, r *http.Request) error { type probeStatus struct { ProbeInfo - TimeSinceLast time.Duration - Links map[string]template.URL + TimeSinceLastStart time.Duration + TimeSinceLastEnd time.Duration + Links map[string]template.URL } vars := struct { Title string @@ -81,12 +82,15 @@ func (p *Prober) StatusHandler(opts ...statusHandlerOpt) tsweb.ReturnHandlerFunc for name, info := range p.ProbeInfo() { 
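+			// A probe counts as unhealthy only when its most recent completed run
+			// recorded an error; probes that are still mid-run are not flagged.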
vars.TotalProbes++ - if !info.Result { + if info.Error != "" { vars.UnhealthyProbes++ } s := probeStatus{ProbeInfo: info} + if !info.Start.IsZero() { + s.TimeSinceLastStart = time.Since(info.Start).Truncate(time.Second) + } if !info.End.IsZero() { - s.TimeSinceLast = time.Since(info.End).Truncate(time.Second) + s.TimeSinceLastEnd = time.Since(info.End).Truncate(time.Second) } for textTpl, urlTpl := range params.probeLinks { text, err := renderTemplate(textTpl, info) diff --git a/prober/status.html b/prober/status.html index ff0f06c13..d26588da1 100644 --- a/prober/status.html +++ b/prober/status.html @@ -73,8 +73,9 @@ Name Probe Class & Labels Interval - Last Attempt - Success + Last Finished + Last Started + Status Latency Last Error @@ -85,9 +86,11 @@ {{$name}} {{range $text, $url := $probeInfo.Links}}
- + {{if not $probeInfo.Continuous}} + + {{end}} {{end}} {{$probeInfo.Class}}
@@ -97,28 +100,48 @@ {{end}} - {{$probeInfo.Interval}} - - {{if $probeInfo.TimeSinceLast}} - {{$probeInfo.TimeSinceLast.String}} ago
+ + {{if $probeInfo.Continuous}} + Continuous + {{else}} + {{$probeInfo.Interval}} + {{end}} + + + {{if $probeInfo.TimeSinceLastEnd}} + {{$probeInfo.TimeSinceLastEnd.String}} ago
{{$probeInfo.End.Format "2006-01-02T15:04:05Z07:00"}} {{else}} Never {{end}} - - {{if $probeInfo.Result}} - {{$probeInfo.Result}} + + {{if $probeInfo.TimeSinceLastStart}} + {{$probeInfo.TimeSinceLastStart.String}} ago
+ {{$probeInfo.Start.Format "2006-01-02T15:04:05Z07:00"}} {{else}} - {{$probeInfo.Result}} + Never + {{end}} + + + {{if $probeInfo.Error}} + {{$probeInfo.Status}} + {{else}} + {{$probeInfo.Status}} {{end}}
-
Recent: {{$probeInfo.RecentResults}}
-
Mean: {{$probeInfo.RecentSuccessRatio}}
+ {{if not $probeInfo.Continuous}} +
Recent: {{$probeInfo.RecentResults}}
+
Mean: {{$probeInfo.RecentSuccessRatio}}
+ {{end}} - {{$probeInfo.Latency.String}} -
Recent: {{$probeInfo.RecentLatencies}}
-
Median: {{$probeInfo.RecentMedianLatency}}
+ {{if $probeInfo.Continuous}} + n/a + {{else}} + {{$probeInfo.Latency.String}} +
Recent: {{$probeInfo.RecentLatencies}}
+
Median: {{$probeInfo.RecentMedianLatency}}
+ {{end}} {{$probeInfo.Error}}
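A minimal usage sketch (not part of the patch itself) showing how the new option is wired into a prober from Go. Only WithQueuingDelayProbing comes from this change; the surrounding calls (prober.New, WithSpread, DERP, Run, dp.ProbeMap) follow the existing cmd/derpprobe/derpprobe.go and, along with the probe name and intervals, are assumptions here. The 10 packets per second and 5 second timeout mirror the values described in the commit message.

package main

import (
	"log"
	"time"

	"tailscale.com/prober"
)

func main() {
	p := prober.New().WithSpread(true)
	dp, err := prober.DERP(p, "https://login.tailscale.com/derpmap/default",
		prober.WithMeshProbing(15*time.Second),
		prober.WithSTUNProbing(15*time.Second),
		prober.WithTLSProbing(15*time.Second),
		// 260-byte packets at 10 packets/sec; packets older than 5s count as dropped.
		prober.WithQueuingDelayProbing(10, 5*time.Second),
	)
	if err != nil {
		log.Fatal(err)
	}
	// The map probe registers one continuous "derp/<region>/<from>/<to>/qd"
	// probe per server pair; each runs with a negative interval and stops only
	// when its context is canceled.
	p.Run("derpmap-probe", 60*time.Second, nil, dp.ProbeMap())
	select {} // block; a real binary would serve /debug and Prometheus metrics here
}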