cmd/derpprobe,prober: add ability to perform continuous queuing delay measurements against DERP servers

This new type of probe sends DERP packets sized similarly to CallMeMaybe packets
at a rate of 10 packets per second. It records the round-trip times in a Prometheus
histogram. It also keeps track of how many packets are dropped. Packets that fail to
arrive within 5 seconds are considered dropped.
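In terms of the new derpprobe flags introduced below, this behavior corresponds to running with --qd-packets-per-second=10 and the default --qd-packet-timeout of 5s.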

Updates tailscale/corp#24522

Signed-off-by: Percy Wegmann <percy@tailscale.com>
Author: Percy Wegmann <percy@tailscale.com>
Committed: 2024-12-16 23:05:46 -06:00 by Percy Wegmann
Commit: 00a4504cf1 (parent 6ae0287a57)
8 changed files with 429 additions and 55 deletions


@@ -29,7 +29,9 @@ var (
tlsInterval = flag.Duration("tls-interval", 15*time.Second, "TLS probe interval")
bwInterval = flag.Duration("bw-interval", 0, "bandwidth probe interval (0 = no bandwidth probing)")
bwSize = flag.Int64("bw-probe-size-bytes", 1_000_000, "bandwidth probe size")
bwTUNIPv4Address = flag.String("bw-tun-ipv4-addr", "", "if specified, bandwidth probes will be performed over a TUN device at this address in order to exercise TCP-in-TCP in similar fashion to TCP over Tailscale via DERP; we will use a /30 subnet including this IP address")
qdPacketsPerSecond = flag.Int("qd-packets-per-second", 0, "if greater than 0, queuing delay will be measured continuously using 260 byte packets (approximate size of a CallMeMaybe packet) sent at this rate per second")
qdPacketTimeout = flag.Duration("qd-packet-timeout", 5*time.Second, "queuing delay packets arriving after this period of time from being sent are treated like dropped packets and don't count toward queuing delay timings")
regionCode = flag.String("region-code", "", "probe only this region (e.g. 'lax'); if left blank, all regions will be probed")
)
@@ -45,6 +47,7 @@ func main() {
prober.WithMeshProbing(*meshInterval),
prober.WithSTUNProbing(*stunInterval),
prober.WithTLSProbing(*tlsInterval),
prober.WithQueuingDelayProbing(*qdPacketsPerSecond, *qdPacketTimeout),
}
if *bwInterval > 0 {
opts = append(opts, prober.WithBandwidthProbing(*bwInterval, *bwSize, *bwTUNIPv4Address))
@@ -107,7 +110,7 @@ func getOverallStatus(p *prober.Prober) (o overallStatus) {
// Do not show probes that have not finished yet.
continue
}
if i.Status == prober.ProbeStatusSucceeded {
o.addGoodf("%s: %s", p, i.Latency)
} else {
o.addBadf("%s: %s", p, i.Error)


@@ -8,6 +8,7 @@ import (
"cmp"
"context"
crand "crypto/rand"
"encoding/binary"
"encoding/json"
"errors"
"expvar"
@@ -17,6 +18,7 @@ import (
"net"
"net/http"
"net/netip"
"slices"
"strconv"
"strings"
"sync"
@@ -53,6 +55,10 @@ type derpProber struct {
bwProbeSize int64
bwTUNIPv4Prefix *netip.Prefix // or nil to not use TUN
// Optional queuing delay probing.
qdPacketsPerSecond int // in packets per second
qdPacketTimeout time.Duration
// Optionally restrict probes to a single regionCode.
regionCode string
@@ -64,6 +70,7 @@ type derpProber struct {
udpProbeFn func(string, int) ProbeClass
meshProbeFn func(string, string) ProbeClass
bwProbeFn func(string, string, int64) ProbeClass
qdProbeFn func(string, string, int, time.Duration) ProbeClass
sync.Mutex
lastDERPMap *tailcfg.DERPMap
@@ -93,6 +100,16 @@ func WithBandwidthProbing(interval time.Duration, size int64, tunAddress string)
}
}
// WithQueuingDelayProbing enables/disables queuing delay probing. qdPacketsPerSecond
// is the number of packets sent per second. qdPacketTimeout is the amount of time
// after which a sent packet is considered to have timed out.
func WithQueuingDelayProbing(qdPacketsPerSecond int, qdPacketTimeout time.Duration) DERPOpt {
return func(d *derpProber) {
d.qdPacketsPerSecond = qdPacketsPerSecond
d.qdPacketTimeout = qdPacketTimeout
}
}
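A minimal usage sketch (illustrative values; p is an existing *Prober and derpMapURL is whatever DERP map the prober should watch) showing how this option composes with the existing DERP options:

// Hypothetical wiring from inside the prober package, mirroring what cmd/derpprobe does above.
opts := []DERPOpt{
	WithMeshProbing(time.Minute),
	WithSTUNProbing(time.Minute),
	WithTLSProbing(time.Minute),
	WithQueuingDelayProbing(10, 5*time.Second), // 10 packets/sec, 5s timeout
}
dp, err := DERP(p, derpMapURL, opts...)
if err != nil {
	log.Fatal(err)
}
_ = dp // probeMapFn will register a derp/<region>/<from>/<to>/qd probe per node pair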
// WithMeshProbing enables mesh probing. When enabled, a small message will be
// transferred through each DERP server and each pair of DERP servers.
func WithMeshProbing(interval time.Duration) DERPOpt {
@@ -147,6 +164,7 @@ func DERP(p *Prober, derpMapURL string, opts ...DERPOpt) (*derpProber, error) {
d.udpProbeFn = d.ProbeUDP
d.meshProbeFn = d.probeMesh
d.bwProbeFn = d.probeBandwidth
d.qdProbeFn = d.probeQueuingDelay
return d, nil
}
@@ -213,7 +231,7 @@ func (d *derpProber) probeMapFn(ctx context.Context) error {
}
}
if d.bwInterval != 0 && d.bwProbeSize > 0 {
n := fmt.Sprintf("derp/%s/%s/%s/bw", region.RegionCode, server.Name, to.Name)
wantProbes[n] = true
if d.probes[n] == nil {
@@ -225,6 +243,15 @@ func (d *derpProber) probeMapFn(ctx context.Context) error {
d.probes[n] = d.p.Run(n, d.bwInterval, labels, d.bwProbeFn(server.Name, to.Name, d.bwProbeSize))
}
}
if d.qdPacketsPerSecond > 0 {
n := fmt.Sprintf("derp/%s/%s/%s/qd", region.RegionCode, server.Name, to.Name)
wantProbes[n] = true
if d.probes[n] == nil {
log.Printf("adding DERP queuing delay probe for %s->%s (%s)", server.Name, to.Name, region.RegionName)
d.probes[n] = d.p.Run(n, -10*time.Second, labels, d.qdProbeFn(server.Name, to.Name, d.qdPacketsPerSecond, d.qdPacketTimeout))
}
}
}
}
}
@@ -240,7 +267,7 @@ func (d *derpProber) probeMapFn(ctx context.Context) error {
return nil
}
// probeMesh returns a probe class that sends a test packet through a pair of DERP
// servers (or just one server, if 'from' and 'to' are the same). 'from' and 'to'
// are expected to be names (DERPNode.Name) of two DERP servers in the same region.
func (d *derpProber) probeMesh(from, to string) ProbeClass {
@@ -263,7 +290,7 @@ func (d *derpProber) probeMesh(from, to string) ProbeClass {
}
}
// probeBandwidth returns a probe class that sends a payload of a given size
// through a pair of DERP servers (or just one server, if 'from' and 'to' are
// the same). 'from' and 'to' are expected to be names (DERPNode.Name) of two
// DERP servers in the same region.
@@ -295,6 +322,193 @@ func (d *derpProber) probeBandwidth(from, to string, size int64) ProbeClass {
}
}
// probeQueuingDelay returns a probe class that continuously sends packets
// through a pair of DERP servers (or just one server, if 'from' and 'to' are
// the same) at a rate of `packetsPerSecond` packets per second in order to
// measure queuing delays. Packets arriving after `packetTimeout` don't contribute
// to the queuing delay measurement and are recorded as dropped. 'from' and 'to' are
// expected to be names (DERPNode.Name) of two DERP servers in the same region,
// and may refer to the same server.
func (d *derpProber) probeQueuingDelay(from, to string, packetsPerSecond int, packetTimeout time.Duration) ProbeClass {
derpPath := "mesh"
if from == to {
derpPath = "single"
}
var packetsDropped expvar.Float
qdh := newHistogram([]float64{.005, .01, .025, .05, .1, .25, .5, 1})
return ProbeClass{
Probe: func(ctx context.Context) error {
fromN, toN, err := d.getNodePair(from, to)
if err != nil {
return err
}
return derpProbeQueuingDelay(ctx, d.lastDERPMap, fromN, toN, packetsPerSecond, packetTimeout, &packetsDropped, qdh)
},
Class: "derp_qd",
Labels: Labels{"derp_path": derpPath},
Metrics: func(l prometheus.Labels) []prometheus.Metric {
qdh.mx.Lock()
result := []prometheus.Metric{
prometheus.MustNewConstMetric(prometheus.NewDesc("derp_qd_probe_dropped_packets", "Total packets dropped", nil, l), prometheus.CounterValue, float64(packetsDropped.Value())),
prometheus.MustNewConstHistogram(prometheus.NewDesc("derp_qd_probe_delays_seconds", "Distribution of queuing delays", nil, l), qdh.count, qdh.sum, qdh.bucketedCounts),
}
qdh.mx.Unlock()
return result
},
}
}
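In Prometheus terms, each registered queuing delay probe therefore exposes a derp_qd_probe_dropped_packets counter and a derp_qd_probe_delays_seconds histogram (surfaced as the usual _bucket/_sum/_count series), carrying the derp_path label plus whatever labels the prober passes in at collection time.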
// derpProbeQueuingDelay continuously sends data between two local DERP clients
// connected to two DERP servers in order to measure queuing delays. From and to
// can be the same server.
func derpProbeQueuingDelay(ctx context.Context, dm *tailcfg.DERPMap, from, to *tailcfg.DERPNode, packetsPerSecond int, packetTimeout time.Duration, packetsDropped *expvar.Float, qdh *histogram) (err error) {
// This probe uses clients with isProber=false to avoid spamming the derper
// logs with every packet sent by the queuing delay probe.
fromc, err := newConn(ctx, dm, from, false)
if err != nil {
return err
}
defer fromc.Close()
toc, err := newConn(ctx, dm, to, false)
if err != nil {
return err
}
defer toc.Close()
// Wait a bit for the 'from' node to hear about 'to' existing on
// the other node in the region, in the case where the two nodes
// are different.
if from.Name != to.Name {
time.Sleep(100 * time.Millisecond) // pretty arbitrary
}
if err := runDerpProbeQueuingDelayContinously(ctx, from, to, fromc, toc, packetsPerSecond, packetTimeout, packetsDropped, qdh); err != nil {
// Record pubkeys on failed probes to aid investigation.
return fmt.Errorf("%s -> %s: %w",
fromc.SelfPublicKey().ShortString(),
toc.SelfPublicKey().ShortString(), err)
}
return nil
}
func runDerpProbeQueuingDelayContinously(ctx context.Context, from, to *tailcfg.DERPNode, fromc, toc *derphttp.Client, packetsPerSecond int, packetTimeout time.Duration, packetsDropped *expvar.Float, qdh *histogram) error {
// Make sure all goroutines have finished.
var wg sync.WaitGroup
defer wg.Wait()
// Close the clients to make sure goroutines that are reading/writing from them terminate.
defer fromc.Close()
defer toc.Close()
type txRecord struct {
at time.Time
seq uint64
}
// txRecords is sized to hold enough transmission records to keep timings
// for packets up to their timeout. As records age out of the front of this
// list, if the associated packet arrives, we won't have a txRecord for it
// and will consider it to have timed out.
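// For example, at 10 packets per second and a 5 second timeout, this holds 50 records.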
txRecords := make([]txRecord, 0, packetsPerSecond*int(packetTimeout.Seconds()))
var txRecordsMu sync.Mutex
// Send the packets.
sendErrC := make(chan error, 1)
// TODO: construct a disco CallMeMaybe in the same fashion as magicsock, e.g. magic bytes, src pub, seal payload.
// DERP server handling of disco may vary from non-disco, and we may want to measure queue delay of both.
pkt := make([]byte, 260) // the same size as a CallMeMaybe packet observed on a Tailscale client.
crand.Read(pkt)
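// The first 8 bytes of pkt are overwritten below with the packet's big-endian sequence number before each send; the remaining bytes stay random.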
wg.Add(1)
go func() {
defer wg.Done()
t := time.NewTicker(time.Second / time.Duration(packetsPerSecond))
defer t.Stop()
seq := uint64(0)
for {
select {
case <-ctx.Done():
return
case <-t.C:
txRecordsMu.Lock()
if len(txRecords) == cap(txRecords) {
txRecords = slices.Delete(txRecords, 0, 1)
packetsDropped.Add(1)
}
txRecords = append(txRecords, txRecord{time.Now(), seq})
txRecordsMu.Unlock()
binary.BigEndian.PutUint64(pkt, seq)
seq++
if err := fromc.Send(toc.SelfPublicKey(), pkt); err != nil {
sendErrC <- fmt.Errorf("sending packet %w", err)
return
}
}
}
}()
// Receive the packets.
recvFinishedC := make(chan error, 1)
wg.Add(1)
go func() {
defer wg.Done()
defer close(recvFinishedC) // to break out of 'select' below.
for {
m, err := toc.Recv()
if err != nil {
recvFinishedC <- err
return
}
switch v := m.(type) {
case derp.ReceivedPacket:
now := time.Now()
if v.Source != fromc.SelfPublicKey() {
recvFinishedC <- fmt.Errorf("got data packet from unexpected source, %v", v.Source)
return
}
seq := binary.BigEndian.Uint64(v.Data)
txRecordsMu.Lock()
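// txRecords is ordered by ascending seq (records are appended in send order), so we can stop scanning once record.seq > seq.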
findTxRecord:
for i, record := range txRecords {
switch {
case record.seq == seq:
rtt := now.Sub(record.at)
qdh.add(rtt.Seconds())
txRecords = slices.Delete(txRecords, i, i+1)
break findTxRecord
case record.seq > seq:
// No send time found; probably a late arrival whose record already
// aged out and was counted as dropped by the sender.
break findTxRecord
case record.seq < seq:
continue
}
}
txRecordsMu.Unlock()
case derp.KeepAliveMessage:
// Silently ignore.
default:
log.Printf("%v: ignoring Recv frame type %T", to.Name, v)
// Loop.
}
}
}()
select {
case <-ctx.Done():
return fmt.Errorf("timeout: %w", ctx.Err())
case err := <-sendErrC:
return fmt.Errorf("error sending via %q: %w", from.Name, err)
case err := <-recvFinishedC:
if err != nil {
return fmt.Errorf("error receiving from %q: %w", to.Name, err)
}
}
return nil
}
// getNodePair returns DERPNode objects for two DERP servers based on their
// short names.
func (d *derpProber) getNodePair(n1, n2 string) (ret1, ret2 *tailcfg.DERPNode, _ error) {
@@ -573,6 +787,8 @@ func runDerpProbeNodePair(ctx context.Context, from, to *tailcfg.DERPNode, fromc
recvc <- fmt.Errorf("got data packet %d from unexpected source, %v", idx, v.Source)
return
}
// This assumes that the packets are received reliably and in order.
// The DERP protocol does not guarantee this, but this probe relies on it.
if got, want := v.Data, pkts[idx]; !bytes.Equal(got, want) {
recvc <- fmt.Errorf("unexpected data packet %d (out of %d)", idx, len(pkts))
return

prober/histogram.go (new file, 50 lines)

@@ -0,0 +1,50 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
package prober
import (
"slices"
"sync"
)
// histogram serves as an adapter to the Prometheus histogram datatype.
// The prober framework passes labels at custom metric collection time that
// it expects to be coupled with the returned metrics. See ProbeClass.Metrics
// and its call sites. Native prometheus histograms cannot be collected while
// injecting more labels. Instead we use this type and pass observations +
// collection labels to prometheus.MustNewConstHistogram() at prometheus
// metric collection time.
type histogram struct {
count uint64
sum float64
buckets []float64
bucketedCounts map[float64]uint64
mx sync.Mutex
}
// newHistogram constructs a histogram that buckets data based on the given
// slice of upper bounds.
func newHistogram(buckets []float64) *histogram {
slices.Sort(buckets)
return &histogram{
buckets: buckets,
bucketedCounts: make(map[float64]uint64, len(buckets)),
}
}
func (h *histogram) add(v float64) {
h.mx.Lock()
defer h.mx.Unlock()
h.count++
h.sum += v
for _, b := range h.buckets {
if v > b {
continue
}
h.bucketedCounts[b] += 1
break
}
}
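As a usage sketch (the labels value is hypothetical, standing in for the prometheus.Labels the prober supplies at collection time), a probe records observations on its hot path and then snapshots them under the lock when metrics are collected, as probeQueuingDelay does above:

h := newHistogram([]float64{.005, .01, .025, .05, .1, .25, .5, 1})
h.add(0.012) // e.g. a 12ms round-trip
// At collection time, convert the snapshot into a Prometheus const histogram.
h.mx.Lock()
m := prometheus.MustNewConstHistogram(
	prometheus.NewDesc("derp_qd_probe_delays_seconds", "Distribution of queuing delays", nil, labels),
	h.count, h.sum, h.bucketedCounts)
h.mx.Unlock()
_ = m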

prober/histogram_test.go (new file, 29 lines)

@@ -0,0 +1,29 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
package prober
import (
"testing"
"github.com/google/go-cmp/cmp"
)
func TestHistogram(t *testing.T) {
h := newHistogram([]float64{1, 2})
h.add(0.5)
h.add(1)
h.add(1.5)
h.add(2)
h.add(2.5)
if diff := cmp.Diff(h.count, uint64(5)); diff != "" {
t.Errorf("wrong count; (-got+want):%v", diff)
}
if diff := cmp.Diff(h.sum, 7.5); diff != "" {
t.Errorf("wrong sum; (-got+want):%v", diff)
}
if diff := cmp.Diff(h.bucketedCounts, map[float64]uint64{1: 2, 2: 2}); diff != "" {
t.Errorf("wrong bucketedCounts; (-got+want):%v", diff)
}
}


@@ -94,6 +94,9 @@ func newForTest(now func() time.Time, newTicker func(time.Duration) ticker) *Pro
// Run executes probe class function every interval, and exports probe results under probeName.
//
// If interval is negative, the probe will run continuously. If it encounters a failure while
// running continuously, it will pause for -1*interval and then retry.
//
// Registering a probe under an already-registered name panics.
func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc ProbeClass) *Probe {
p.mu.Lock()
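For illustration, registering a continuous probe looks like this (probe name, labels, and body are hypothetical; compare the -10*time.Second interval used for the DERP qd probes above):

p.Run("example/continuous", -10*time.Second, Labels{"kind": "demo"}, ProbeClass{
	Class: "demo",
	Probe: func(ctx context.Context) error {
		// Runs until ctx is canceled. If it returns an error instead, the
		// prober waits 10s (the negated interval) and runs it again.
		<-ctx.Done()
		return ctx.Err()
	},
})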
@@ -256,6 +259,11 @@ type Probe struct {
latencyHist *ring.Ring
}
// IsContinuous indicates that this is a continuous probe.
func (p *Probe) IsContinuous() bool {
return p.interval < 0
}
// Close shuts down the Probe and unregisters it from its Prober.
// It is safe to Run a new probe of the same name after Close returns.
func (p *Probe) Close() error {
@@ -288,6 +296,22 @@ func (p *Probe) loop() {
return
}
if p.IsContinuous() {
// Probe function is going to run continuously.
for {
p.run()
// Wait and then retry if probe fails. We use the inverse of the
// configured negative interval as our sleep period.
// TODO(percy): implement exponential backoff, possibly using logtail/backoff.
select {
case <-time.After(-1 * p.interval):
p.run()
case <-p.ctx.Done():
return
}
}
}
p.tick = p.prober.newTicker(p.interval)
defer p.tick.Stop()
for {
@@ -323,9 +347,13 @@ func (p *Probe) run() (pi ProbeInfo, err error) {
p.recordEnd(err)
}
}()
ctx := p.ctx
if !p.IsContinuous() {
timeout := time.Duration(float64(p.interval) * 0.8)
var cancel func()
ctx, cancel = context.WithTimeout(ctx, timeout)
defer cancel()
}
err = p.probeClass.Probe(ctx)
p.recordEnd(err)
@@ -365,6 +393,16 @@ func (p *Probe) recordEnd(err error) {
p.successHist = p.successHist.Next()
}
// ProbeStatus indicates the status of a probe.
type ProbeStatus string
const (
ProbeStatusUnknown = "unknown"
ProbeStatusRunning = "running"
ProbeStatusFailed = "failed"
ProbeStatusSucceeded = "succeeded"
)
// ProbeInfo is a snapshot of the configuration and state of a Probe.
type ProbeInfo struct {
Name string
@@ -374,7 +412,7 @@ type ProbeInfo struct {
Start time.Time
End time.Time
Latency time.Duration
Status ProbeStatus
Error string
RecentResults []bool
RecentLatencies []time.Duration
@@ -402,6 +440,10 @@ func (pb ProbeInfo) RecentMedianLatency() time.Duration {
return pb.RecentLatencies[len(pb.RecentLatencies)/2]
}
func (pb ProbeInfo) Continuous() bool {
return pb.Interval < 0
}
// ProbeInfo returns the state of all probes.
func (p *Prober) ProbeInfo() map[string]ProbeInfo {
out := map[string]ProbeInfo{}
@@ -429,9 +471,14 @@ func (probe *Probe) probeInfoLocked() ProbeInfo {
Labels: probe.metricLabels,
Start: probe.start,
End: probe.end,
}
inf.Status = ProbeStatusUnknown
if probe.end.Before(probe.start) {
inf.Status = ProbeStatusRunning
} else if probe.succeeded {
inf.Status = ProbeStatusSucceeded
} else if probe.lastErr != nil {
inf.Status = ProbeStatusFailed
inf.Error = probe.lastErr.Error()
}
if probe.latency > 0 {
@@ -467,7 +514,7 @@ func (p *Prober) RunHandler(w http.ResponseWriter, r *http.Request) error {
p.mu.Lock()
probe, ok := p.probes[name]
p.mu.Unlock()
if !ok || probe.IsContinuous() {
return tsweb.Error(http.StatusNotFound, fmt.Sprintf("unknown probe %q", name), nil)
}
@@ -531,7 +578,8 @@ func (p *Probe) Collect(ch chan<- prometheus.Metric) {
if !p.start.IsZero() {
ch <- prometheus.MustNewConstMetric(p.mStartTime, prometheus.GaugeValue, float64(p.start.Unix()))
}
// For periodic probes that haven't ended, don't collect probe metrics yet.
if p.end.IsZero() && !p.IsContinuous() {
return
}
ch <- prometheus.MustNewConstMetric(p.mEndTime, prometheus.GaugeValue, float64(p.end.Unix()))


@@ -316,7 +316,7 @@ func TestProberProbeInfo(t *testing.T) {
Interval: probeInterval,
Labels: map[string]string{"class": "", "name": "probe1"},
Latency: 500 * time.Millisecond,
Status: ProbeStatusSucceeded,
RecentResults: []bool{true},
RecentLatencies: []time.Duration{500 * time.Millisecond},
},
@@ -324,6 +324,7 @@
Name: "probe2",
Interval: probeInterval,
Labels: map[string]string{"class": "", "name": "probe2"},
Status: ProbeStatusFailed,
Error: "error2",
RecentResults: []bool{false},
RecentLatencies: nil, // no latency for failed probes
@@ -349,7 +350,7 @@ func TestProbeInfoRecent(t *testing.T) {
}{
{
name: "no_runs",
wantProbeInfo: ProbeInfo{Status: ProbeStatusUnknown},
wantRecentSuccessRatio: 0,
wantRecentMedianLatency: 0,
},
@@ -358,7 +359,7 @@
results: []probeResult{{latency: 100 * time.Millisecond, err: nil}},
wantProbeInfo: ProbeInfo{
Latency: 100 * time.Millisecond,
Status: ProbeStatusSucceeded,
RecentResults: []bool{true},
RecentLatencies: []time.Duration{100 * time.Millisecond},
},
@@ -369,7 +370,7 @@
name: "single_failure",
results: []probeResult{{latency: 100 * time.Millisecond, err: errors.New("error123")}},
wantProbeInfo: ProbeInfo{
Status: ProbeStatusFailed,
RecentResults: []bool{false},
RecentLatencies: nil,
Error: "error123",
@@ -390,7 +391,7 @@
{latency: 80 * time.Millisecond, err: nil},
},
wantProbeInfo: ProbeInfo{
Status: ProbeStatusSucceeded,
Latency: 80 * time.Millisecond,
RecentResults: []bool{false, true, true, false, true, true, false, true},
RecentLatencies: []time.Duration{
@@ -420,7 +421,7 @@
{latency: 110 * time.Millisecond, err: nil},
},
wantProbeInfo: ProbeInfo{
Status: ProbeStatusSucceeded,
Latency: 110 * time.Millisecond,
RecentResults: []bool{true, true, true, true, true, true, true, true, true, true},
RecentLatencies: []time.Duration{
@@ -483,7 +484,7 @@ func TestProberRunHandler(t *testing.T) {
ProbeInfo: ProbeInfo{
Name: "success",
Interval: probeInterval,
Status: ProbeStatusSucceeded,
RecentResults: []bool{true, true},
},
PreviousSuccessRatio: 1,
@@ -498,7 +499,7 @@
ProbeInfo: ProbeInfo{
Name: "failure",
Interval: probeInterval,
Status: ProbeStatusFailed,
Error: "error123",
RecentResults: []bool{false, false},
},


@@ -62,7 +62,8 @@ func (p *Prober) StatusHandler(opts ...statusHandlerOpt) tsweb.ReturnHandlerFunc
return func(w http.ResponseWriter, r *http.Request) error {
type probeStatus struct {
ProbeInfo
TimeSinceLastStart time.Duration
TimeSinceLastEnd time.Duration
Links map[string]template.URL
}
vars := struct {
@@ -81,12 +82,15 @@
for name, info := range p.ProbeInfo() {
vars.TotalProbes++
if info.Error != "" {
vars.UnhealthyProbes++
}
s := probeStatus{ProbeInfo: info}
if !info.Start.IsZero() {
s.TimeSinceLastStart = time.Since(info.Start).Truncate(time.Second)
}
if !info.End.IsZero() {
s.TimeSinceLastEnd = time.Since(info.End).Truncate(time.Second)
}
for textTpl, urlTpl := range params.probeLinks {
text, err := renderTemplate(textTpl, info)


@@ -73,8 +73,9 @@
<th>Name</th>
<th>Probe Class & Labels</th>
<th>Interval</th>
<th>Last Finished</th>
<th>Last Started</th>
<th>Status</th>
<th>Latency</th>
<th>Last Error</th>
</tr></thead>
@@ -85,10 +86,12 @@
{{$name}}
{{range $text, $url := $probeInfo.Links}}
<br/>
{{if not $probeInfo.Continuous}}
<button onclick="location.href='{{$url}}';" type="button">
{{$text}}
</button>
{{end}}
{{end}}
</td>
<td>{{$probeInfo.Class}}<br/>
<div class="small">
@@ -97,28 +100,48 @@
{{end}}
</div>
</td>
<td>
{{if $probeInfo.Continuous}}
Continuous
{{else}}
{{$probeInfo.Interval}}
{{end}}
</td>
<td data-sort="{{$probeInfo.TimeSinceLastEnd.Milliseconds}}">
{{if $probeInfo.TimeSinceLastEnd}}
{{$probeInfo.TimeSinceLastEnd.String}} ago<br/>
<span class="small">{{$probeInfo.End.Format "2006-01-02T15:04:05Z07:00"}}</span> <span class="small">{{$probeInfo.End.Format "2006-01-02T15:04:05Z07:00"}}</span>
{{else}} {{else}}
Never Never
{{end}} {{end}}
</td> </td>
<td> <td data-sort="{{$probeInfo.TimeSinceLastStart.Milliseconds}}">
{{if $probeInfo.Result}} {{if $probeInfo.TimeSinceLastStart}}
{{$probeInfo.Result}} {{$probeInfo.TimeSinceLastStart.String}} ago<br/>
<span class="small">{{$probeInfo.Start.Format "2006-01-02T15:04:05Z07:00"}}</span>
{{else}} {{else}}
<span class="error">{{$probeInfo.Result}}</span> Never
{{end}}
</td>
<td>
{{if $probeInfo.Error}}
<span class="error">{{$probeInfo.Status}}</span>
{{else}}
{{$probeInfo.Status}}
{{end}}<br/>
{{if not $probeInfo.Continuous}}
<div class="small">Recent: {{$probeInfo.RecentResults}}</div>
<div class="small">Mean: {{$probeInfo.RecentSuccessRatio}}</div>
{{end}}
</td>
<td data-sort="{{$probeInfo.Latency.Milliseconds}}">
{{if $probeInfo.Continuous}}
n/a
{{else}}
{{$probeInfo.Latency.String}}
<div class="small">Recent: {{$probeInfo.RecentLatencies}}</div>
<div class="small">Median: {{$probeInfo.RecentMedianLatency}}</div>
{{end}}
</td>
<td class="small">{{$probeInfo.Error}}</td>
</tr>