cmd/derpprobe,prober: add ability to perform continuous queuing delay measurements against DERP servers

This new type of probe sends DERP packets sized similarly to CallMeMaybe packets
at a rate of 10 packets per second. It records the round-trip times in a Prometheus
histogram. It also keeps track of how many packets are dropped. Packets that fail to
arrive within 5 seconds are considered dropped.

Updates tailscale/corp#24522

Signed-off-by: Percy Wegmann <percy@tailscale.com>
Percy Wegmann <percy@tailscale.com>, 2024-12-16 23:05:46 -06:00
commit 00a4504cf1 (parent 6ae0287a57)
8 changed files with 429 additions and 55 deletions
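
For orientation, a minimal sketch (not part of this commit) of how the new option slots into a derpprobe-style setup; prober.New, dp.ProbeMap, the DERP map URL, and the intervals are assumptions modeled on the existing derpprobe command:

package main

import (
	"log"
	"time"

	"tailscale.com/prober"
)

func main() {
	p := prober.New()
	opts := []prober.DERPOpt{
		prober.WithMeshProbing(15 * time.Second),
		// 10 packets/sec with a 5s timeout, the numbers described in the
		// commit message above.
		prober.WithQueuingDelayProbing(10, 5*time.Second),
	}
	dp, err := prober.DERP(p, "https://login.tailscale.com/derpmap/default", opts...)
	if err != nil {
		log.Fatal(err)
	}
	// Probing the DERP map registers the per-node probes, including the
	// new queuing delay probes whenever the packet rate is > 0.
	p.Run("derpmap-probe", 10*time.Second, nil, dp.ProbeMap())
	select {} // the real command also serves its status page and metrics
}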

cmd/derpprobe/derpprobe.go

@@ -29,7 +29,9 @@ var (
tlsInterval = flag.Duration("tls-interval", 15*time.Second, "TLS probe interval")
bwInterval = flag.Duration("bw-interval", 0, "bandwidth probe interval (0 = no bandwidth probing)")
bwSize = flag.Int64("bw-probe-size-bytes", 1_000_000, "bandwidth probe size")
bwTUNIPv4Address = flag.String("bw-tun-ipv4-addr", "", "if specified, bandwidth probes will be performed over a TUN device at this address in order to exercise TCP-in-TCP in similar fashion to TCP over Tailscale via DERP. We will use a /30 subnet including this IP address.")
bwTUNIPv4Address = flag.String("bw-tun-ipv4-addr", "", "if specified, bandwidth probes will be performed over a TUN device at this address in order to exercise TCP-in-TCP in similar fashion to TCP over Tailscale via DERP; we will use a /30 subnet including this IP address")
qdPacketsPerSecond = flag.Int("qd-packets-per-second", 0, "if greater than 0, queuing delay will be measured continuously using 260-byte packets (the approximate size of a CallMeMaybe packet) sent at this rate per second")
qdPacketTimeout = flag.Duration("qd-packet-timeout", 5*time.Second, "queuing delay packets that arrive more than this long after being sent are treated as dropped and don't count toward queuing delay timings")
regionCode = flag.String("region-code", "", "probe only this region (e.g. 'lax'); if left blank, all regions will be probed")
)
@@ -45,6 +47,7 @@ func main() {
prober.WithMeshProbing(*meshInterval),
prober.WithSTUNProbing(*stunInterval),
prober.WithTLSProbing(*tlsInterval),
prober.WithQueuingDelayProbing(*qdPacketsPerSecond, *qdPacketTimeout),
}
if *bwInterval > 0 {
opts = append(opts, prober.WithBandwidthProbing(*bwInterval, *bwSize, *bwTUNIPv4Address))
@@ -107,7 +110,7 @@ func getOverallStatus(p *prober.Prober) (o overallStatus) {
// Do not show probes that have not finished yet.
continue
}
if i.Result {
if i.Status == prober.ProbeStatusSucceeded {
o.addGoodf("%s: %s", p, i.Latency)
} else {
o.addBadf("%s: %s", p, i.Error)

prober/derp.go

@@ -8,6 +8,7 @@ import (
"cmp"
"context"
crand "crypto/rand"
"encoding/binary"
"encoding/json"
"errors"
"expvar"
@@ -17,6 +18,7 @@ import (
"net"
"net/http"
"net/netip"
"slices"
"strconv"
"strings"
"sync"
@@ -53,6 +55,10 @@ type derpProber struct {
bwProbeSize int64
bwTUNIPv4Prefix *netip.Prefix // or nil to not use TUN
// Optional queuing delay probing.
qdPacketsPerSecond int // in packets per second
qdPacketTimeout time.Duration
// Optionally restrict probes to a single regionCode.
regionCode string
@@ -64,6 +70,7 @@ type derpProber struct {
udpProbeFn func(string, int) ProbeClass
meshProbeFn func(string, string) ProbeClass
bwProbeFn func(string, string, int64) ProbeClass
qdProbeFn func(string, string, int, time.Duration) ProbeClass
sync.Mutex
lastDERPMap *tailcfg.DERPMap
@@ -93,6 +100,16 @@ func WithBandwidthProbing(interval time.Duration, size int64, tunAddress string)
}
}
// WithQueuingDelayProbing enables/disables queuing delay probing. qdSendRate
// is the number of packets sent per second. qdTimeout is the amount of time
// after which a sent packet is considered to have timed out.
func WithQueuingDelayProbing(qdPacketsPerSecond int, qdPacketTimeout time.Duration) DERPOpt {
return func(d *derpProber) {
d.qdPacketsPerSecond = qdPacketsPerSecond
d.qdPacketTimeout = qdPacketTimeout
}
}
// WithMeshProbing enables mesh probing. When enabled, a small message will be
// transferred through each DERP server and each pair of DERP servers.
func WithMeshProbing(interval time.Duration) DERPOpt {
@@ -147,6 +164,7 @@ func DERP(p *Prober, derpMapURL string, opts ...DERPOpt) (*derpProber, error) {
d.udpProbeFn = d.ProbeUDP
d.meshProbeFn = d.probeMesh
d.bwProbeFn = d.probeBandwidth
d.qdProbeFn = d.probeQueuingDelay
return d, nil
}
@@ -213,7 +231,7 @@ func (d *derpProber) probeMapFn(ctx context.Context) error {
}
}
if d.bwInterval > 0 && d.bwProbeSize > 0 {
if d.bwInterval != 0 && d.bwProbeSize > 0 {
n := fmt.Sprintf("derp/%s/%s/%s/bw", region.RegionCode, server.Name, to.Name)
wantProbes[n] = true
if d.probes[n] == nil {
@@ -225,6 +243,15 @@ func (d *derpProber) probeMapFn(ctx context.Context) error {
d.probes[n] = d.p.Run(n, d.bwInterval, labels, d.bwProbeFn(server.Name, to.Name, d.bwProbeSize))
}
}
if d.qdPacketsPerSecond > 0 {
n := fmt.Sprintf("derp/%s/%s/%s/qd", region.RegionCode, server.Name, to.Name)
wantProbes[n] = true
if d.probes[n] == nil {
log.Printf("adding DERP queuing delay probe for %s->%s (%s)", server.Name, to.Name, region.RegionName)
d.probes[n] = d.p.Run(n, -10*time.Second, labels, d.qdProbeFn(server.Name, to.Name, d.qdPacketsPerSecond, d.qdPacketTimeout))
}
}
}
}
}
@@ -240,7 +267,7 @@ func (d *derpProber) probeMapFn(ctx context.Context) error {
return nil
}
// probeMesh returs a probe class that sends a test packet through a pair of DERP
// probeMesh returns a probe class that sends a test packet through a pair of DERP
// servers (or just one server, if 'from' and 'to' are the same). 'from' and 'to'
// are expected to be names (DERPNode.Name) of two DERP servers in the same region.
func (d *derpProber) probeMesh(from, to string) ProbeClass {
@@ -263,7 +290,7 @@ func (d *derpProber) probeMesh(from, to string) ProbeClass {
}
}
// probeBandwidth returs a probe class that sends a payload of a given size
// probeBandwidth returns a probe class that sends a payload of a given size
// through a pair of DERP servers (or just one server, if 'from' and 'to' are
// the same). 'from' and 'to' are expected to be names (DERPNode.Name) of two
// DERP servers in the same region.
@@ -295,6 +322,193 @@ func (d *derpProber) probeBandwidth(from, to string, size int64) ProbeClass {
}
}
// probeQueuingDelay returns a probe class that continuously sends packets
// through a pair of DERP servers (or just one server, if 'from' and 'to' are
// the same) at a rate of `packetsPerSecond` packets per second in order to
// measure queuing delays. Packets arriving after `packetTimeout` don't contribute
// to the queuing delay measurement and are recorded as dropped. 'from' and 'to' are
// expected to be names (DERPNode.Name) of two DERP servers in the same region,
// and may refer to the same server.
func (d *derpProber) probeQueuingDelay(from, to string, packetsPerSecond int, packetTimeout time.Duration) ProbeClass {
derpPath := "mesh"
if from == to {
derpPath = "single"
}
var packetsDropped expvar.Float
qdh := newHistogram([]float64{.005, .01, .025, .05, .1, .25, .5, 1})
return ProbeClass{
Probe: func(ctx context.Context) error {
fromN, toN, err := d.getNodePair(from, to)
if err != nil {
return err
}
return derpProbeQueuingDelay(ctx, d.lastDERPMap, fromN, toN, packetsPerSecond, packetTimeout, &packetsDropped, qdh)
},
Class: "derp_qd",
Labels: Labels{"derp_path": derpPath},
Metrics: func(l prometheus.Labels) []prometheus.Metric {
qdh.mx.Lock()
result := []prometheus.Metric{
prometheus.MustNewConstMetric(prometheus.NewDesc("derp_qd_probe_dropped_packets", "Total packets dropped", nil, l), prometheus.CounterValue, float64(packetsDropped.Value())),
prometheus.MustNewConstHistogram(prometheus.NewDesc("derp_qd_probe_delays_seconds", "Distribution of queuing delays", nil, l), qdh.count, qdh.sum, qdh.bucketedCounts),
}
qdh.mx.Unlock()
return result
},
}
}
// derpProbeQueuingDelay continuously sends data between two local DERP clients
// connected to two DERP servers in order to measure queuing delays. From and to
// can be the same server.
func derpProbeQueuingDelay(ctx context.Context, dm *tailcfg.DERPMap, from, to *tailcfg.DERPNode, packetsPerSecond int, packetTimeout time.Duration, packetsDropped *expvar.Float, qdh *histogram) (err error) {
// This probe uses clients with isProber=false to avoid spamming the derper
// logs with every packet sent by the queuing delay probe.
fromc, err := newConn(ctx, dm, from, false)
if err != nil {
return err
}
defer fromc.Close()
toc, err := newConn(ctx, dm, to, false)
if err != nil {
return err
}
defer toc.Close()
// Wait a bit for 'from's node to hear about 'to' existing on
// the other node in the region, in the case where the two nodes
// are different.
if from.Name != to.Name {
time.Sleep(100 * time.Millisecond) // pretty arbitrary
}
if err := runDerpProbeQueuingDelayContinuously(ctx, from, to, fromc, toc, packetsPerSecond, packetTimeout, packetsDropped, qdh); err != nil {
// Record pubkeys on failed probes to aid investigation.
return fmt.Errorf("%s -> %s: %w",
fromc.SelfPublicKey().ShortString(),
toc.SelfPublicKey().ShortString(), err)
}
return nil
}
func runDerpProbeQueuingDelayContinuously(ctx context.Context, from, to *tailcfg.DERPNode, fromc, toc *derphttp.Client, packetsPerSecond int, packetTimeout time.Duration, packetsDropped *expvar.Float, qdh *histogram) error {
// Make sure all goroutines have finished.
var wg sync.WaitGroup
defer wg.Wait()
// Close the clients to make sure goroutines that are reading/writing from them terminate.
defer fromc.Close()
defer toc.Close()
type txRecord struct {
at time.Time
seq uint64
}
// txRecords is sized to hold enough transmission records to keep timings
// for packets up to their timeout. As records age out of the front of this
// list, if the associated packet arrives, we won't have a txRecord for it
// and will consider it to have timed out.
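// For example, at 10 packets/sec and the default 5s timeout, the list
// holds 50 records.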
txRecords := make([]txRecord, 0, packetsPerSecond*int(packetTimeout.Seconds()))
var txRecordsMu sync.Mutex
// Send the packets.
sendErrC := make(chan error, 1)
// TODO: construct a disco CallMeMaybe in the same fashion as magicsock, e.g. magic bytes, src pub, seal payload.
// DERP server handling of disco may vary from non-disco, and we may want to measure queue delay of both.
pkt := make([]byte, 260) // the same size as a CallMeMaybe packet observed on a Tailscale client.
crand.Read(pkt)
wg.Add(1)
go func() {
defer wg.Done()
t := time.NewTicker(time.Second / time.Duration(packetsPerSecond))
defer t.Stop()
seq := uint64(0)
for {
select {
case <-ctx.Done():
return
case <-t.C:
txRecordsMu.Lock()
if len(txRecords) == cap(txRecords) {
txRecords = slices.Delete(txRecords, 0, 1)
packetsDropped.Add(1)
}
txRecords = append(txRecords, txRecord{time.Now(), seq})
txRecordsMu.Unlock()
binary.BigEndian.PutUint64(pkt, seq)
seq++
if err := fromc.Send(toc.SelfPublicKey(), pkt); err != nil {
sendErrC <- fmt.Errorf("sending packet: %w", err)
return
}
}
}
}()
// Receive the packets.
recvFinishedC := make(chan error, 1)
wg.Add(1)
go func() {
defer wg.Done()
defer close(recvFinishedC) // to break out of 'select' below.
for {
m, err := toc.Recv()
if err != nil {
recvFinishedC <- err
return
}
switch v := m.(type) {
case derp.ReceivedPacket:
now := time.Now()
if v.Source != fromc.SelfPublicKey() {
recvFinishedC <- fmt.Errorf("got data packet from unexpected source, %v", v.Source)
return
}
seq := binary.BigEndian.Uint64(v.Data)
txRecordsMu.Lock()
findTxRecord:
for i, record := range txRecords {
switch {
case record.seq == seq:
rtt := now.Sub(record.at)
qdh.add(rtt.Seconds())
txRecords = slices.Delete(txRecords, i, i+1)
break findTxRecord
case record.seq > seq:
// No send-time record found: the record probably aged out of
// txRecords and was already counted as a drop by the sender.
break findTxRecord
case record.seq < seq:
continue
}
}
txRecordsMu.Unlock()
case derp.KeepAliveMessage:
// Silently ignore.
default:
log.Printf("%v: ignoring Recv frame type %T", to.Name, v)
// Loop.
}
}
}()
select {
case <-ctx.Done():
return fmt.Errorf("timeout: %w", ctx.Err())
case err := <-sendErrC:
return fmt.Errorf("error sending via %q: %w", from.Name, err)
case err := <-recvFinishedC:
if err != nil {
return fmt.Errorf("error receiving from %q: %w", to.Name, err)
}
}
return nil
}
// getNodePair returns DERPNode objects for two DERP servers based on their
// short names.
func (d *derpProber) getNodePair(n1, n2 string) (ret1, ret2 *tailcfg.DERPNode, _ error) {
@@ -573,6 +787,8 @@ func runDerpProbeNodePair(ctx context.Context, from, to *tailcfg.DERPNode, fromc
recvc <- fmt.Errorf("got data packet %d from unexpected source, %v", idx, v.Source)
return
}
// This assumes the packets are received reliably and in order,
// which the DERP protocol does not guarantee.
if got, want := v.Data, pkts[idx]; !bytes.Equal(got, want) {
recvc <- fmt.Errorf("unexpected data packet %d (out of %d)", idx, len(pkts))
return

prober/histogram.go (new file, 50 lines)

@@ -0,0 +1,50 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
package prober
import (
"slices"
"sync"
)
// histogram serves as an adapter to the Prometheus histogram datatype.
// The prober framework passes labels at custom metric collection time that
// it expects to be coupled with the returned metrics. See ProbeClass.Metrics
// and its call sites. Native prometheus histograms cannot be collected while
// injecting more labels. Instead we use this type and pass observations +
// collection labels to prometheus.MustNewConstHistogram() at prometheus
// metric collection time.
type histogram struct {
count uint64
sum float64
buckets []float64
bucketedCounts map[float64]uint64
mx sync.Mutex
}
// newHistogram constructs a histogram that buckets data based on the given
// slice of upper bounds.
func newHistogram(buckets []float64) *histogram {
slices.Sort(buckets)
return &histogram{
buckets: buckets,
bucketedCounts: make(map[float64]uint64, len(buckets)),
}
}
func (h *histogram) add(v float64) {
h.mx.Lock()
defer h.mx.Unlock()
h.count++
h.sum += v
// Prometheus histogram buckets are cumulative: an observation counts
// toward every bucket whose upper bound is >= v, which is what
// prometheus.MustNewConstHistogram expects at collection time.
for _, b := range h.buckets {
if v > b {
continue
}
h.bucketedCounts[b]++
}
}
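As a usage illustration, a hedged sketch of the collection-time pairing (exampleCollect and the metric name are hypothetical; it mirrors the derp_qd metrics exported from derp.go):

// Assumes: import "github.com/prometheus/client_golang/prometheus"
func exampleCollect(h *histogram, l prometheus.Labels) prometheus.Metric {
	h.mx.Lock()
	defer h.mx.Unlock()
	// MustNewConstHistogram takes cumulative per-bucket counts, which is
	// what bucketedCounts stores.
	return prometheus.MustNewConstHistogram(
		prometheus.NewDesc("example_delays_seconds", "Distribution of example delays", nil, l),
		h.count, h.sum, h.bucketedCounts)
}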

prober/histogram_test.go (new file, 29 lines)

@@ -0,0 +1,29 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
package prober
import (
"testing"
"github.com/google/go-cmp/cmp"
)
func TestHistogram(t *testing.T) {
h := newHistogram([]float64{1, 2})
h.add(0.5)
h.add(1)
h.add(1.5)
h.add(2)
h.add(2.5)
if diff := cmp.Diff(h.count, uint64(5)); diff != "" {
t.Errorf("wrong count; (-got+want):%v", diff)
}
if diff := cmp.Diff(h.sum, 7.5); diff != "" {
t.Errorf("wrong sum; (-got+want):%v", diff)
}
// Buckets are cumulative: le=1 sees {0.5, 1}; le=2 also sees {1.5, 2}.
if diff := cmp.Diff(h.bucketedCounts, map[float64]uint64{1: 2, 2: 4}); diff != "" {
t.Errorf("wrong bucketedCounts; (-got+want):%v", diff)
}
}

prober/prober.go

@@ -94,6 +94,9 @@ func newForTest(now func() time.Time, newTicker func(time.Duration) ticker) *Pro
// Run executes probe class function every interval, and exports probe results under probeName.
//
// If interval is negative, the probe will run continuously. If it encounters a failure while
// running continuously, it will pause for -1*interval and then retry.
//
// Registering a probe under an already-registered name panics.
func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc ProbeClass) *Probe {
p.mu.Lock()
@@ -256,6 +259,11 @@ type Probe struct {
latencyHist *ring.Ring
}
// IsContinuous reports whether this probe runs continuously (i.e. its interval is negative).
func (p *Probe) IsContinuous() bool {
return p.interval < 0
}
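A hedged usage sketch (the probe class, name, and failure are illustrative; p is an existing *Prober): passing a negative interval to Run registers a continuous probe, just as the queuing delay probes in derp.go do:

pc := ProbeClass{
	Class: "example",
	Probe: func(ctx context.Context) error {
		// Pretend to stream continuously until something breaks or the
		// probe is shut down.
		select {
		case <-time.After(time.Minute):
			return errors.New("stream broke")
		case <-ctx.Done():
			return ctx.Err()
		}
	},
}
// Runs pc.Probe continuously; after a failure, the prober waits 10s (the
// negation of the interval) before restarting it.
probe := p.Run("example/continuous", -10*time.Second, nil, pc)
defer probe.Close()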
// Close shuts down the Probe and unregisters it from its Prober.
// It is safe to Run a new probe of the same name after Close returns.
func (p *Probe) Close() error {
@@ -288,6 +296,22 @@ func (p *Probe) loop() {
return
}
if p.IsContinuous() {
// Probe function is going to run continuously.
for {
p.run()
// Wait and then retry when the probe fails. We use the inverse of
// the configured negative interval as our sleep period.
// TODO(percy): implement exponential backoff, possibly using logtail/backoff.
select {
case <-time.After(-1 * p.interval):
case <-p.ctx.Done():
return
}
}
}
p.tick = p.prober.newTicker(p.interval)
defer p.tick.Stop()
for {
@@ -323,9 +347,13 @@ func (p *Probe) run() (pi ProbeInfo, err error) {
p.recordEnd(err)
}
}()
ctx := p.ctx
if !p.IsContinuous() {
timeout := time.Duration(float64(p.interval) * 0.8)
ctx, cancel := context.WithTimeout(p.ctx, timeout)
var cancel func()
ctx, cancel = context.WithTimeout(ctx, timeout)
defer cancel()
}
err = p.probeClass.Probe(ctx)
p.recordEnd(err)
@@ -365,6 +393,16 @@ func (p *Probe) recordEnd(err error) {
p.successHist = p.successHist.Next()
}
// ProbeStatus indicates the status of a probe.
type ProbeStatus string
const (
ProbeStatusUnknown = "unknown"
ProbeStatusRunning = "running"
ProbeStatusFailed = "failed"
ProbeStatusSucceeded = "succeeded"
)
// ProbeInfo is a snapshot of the configuration and state of a Probe.
type ProbeInfo struct {
Name string
@@ -374,7 +412,7 @@ type ProbeInfo struct {
Start time.Time
End time.Time
Latency time.Duration
Result bool
Status ProbeStatus
Error string
RecentResults []bool
RecentLatencies []time.Duration
@@ -402,6 +440,10 @@ func (pb ProbeInfo) RecentMedianLatency() time.Duration {
return pb.RecentLatencies[len(pb.RecentLatencies)/2]
}
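// Continuous reports whether the probe runs continuously rather than on an interval.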
func (pb ProbeInfo) Continuous() bool {
return pb.Interval < 0
}
// ProbeInfo returns the state of all probes.
func (p *Prober) ProbeInfo() map[string]ProbeInfo {
out := map[string]ProbeInfo{}
@@ -429,9 +471,14 @@ func (probe *Probe) probeInfoLocked() ProbeInfo {
Labels: probe.metricLabels,
Start: probe.start,
End: probe.end,
Result: probe.succeeded,
}
if probe.lastErr != nil {
inf.Status = ProbeStatusUnknown
if probe.end.Before(probe.start) {
inf.Status = ProbeStatusRunning
} else if probe.succeeded {
inf.Status = ProbeStatusSucceeded
} else if probe.lastErr != nil {
inf.Status = ProbeStatusFailed
inf.Error = probe.lastErr.Error()
}
if probe.latency > 0 {
@@ -467,7 +514,7 @@ func (p *Prober) RunHandler(w http.ResponseWriter, r *http.Request) error {
p.mu.Lock()
probe, ok := p.probes[name]
p.mu.Unlock()
if !ok {
if !ok || probe.IsContinuous() {
return tsweb.Error(http.StatusNotFound, fmt.Sprintf("unknown probe %q", name), nil)
}
@@ -531,7 +578,8 @@ func (p *Probe) Collect(ch chan<- prometheus.Metric) {
if !p.start.IsZero() {
ch <- prometheus.MustNewConstMetric(p.mStartTime, prometheus.GaugeValue, float64(p.start.Unix()))
}
if p.end.IsZero() {
// For periodic probes that haven't ended, don't collect probe metrics yet.
if p.end.IsZero() && !p.IsContinuous() {
return
}
ch <- prometheus.MustNewConstMetric(p.mEndTime, prometheus.GaugeValue, float64(p.end.Unix()))

prober/prober_test.go

@@ -316,7 +316,7 @@ func TestProberProbeInfo(t *testing.T) {
Interval: probeInterval,
Labels: map[string]string{"class": "", "name": "probe1"},
Latency: 500 * time.Millisecond,
Result: true,
Status: ProbeStatusSucceeded,
RecentResults: []bool{true},
RecentLatencies: []time.Duration{500 * time.Millisecond},
},
@@ -324,6 +324,7 @@
Name: "probe2",
Interval: probeInterval,
Labels: map[string]string{"class": "", "name": "probe2"},
Status: ProbeStatusFailed,
Error: "error2",
RecentResults: []bool{false},
RecentLatencies: nil, // no latency for failed probes
@@ -349,7 +350,7 @@
}{
{
name: "no_runs",
wantProbeInfo: ProbeInfo{},
wantProbeInfo: ProbeInfo{Status: ProbeStatusUnknown},
wantRecentSuccessRatio: 0,
wantRecentMedianLatency: 0,
},
@@ -358,7 +359,7 @@
results: []probeResult{{latency: 100 * time.Millisecond, err: nil}},
wantProbeInfo: ProbeInfo{
Latency: 100 * time.Millisecond,
Result: true,
Status: ProbeStatusSucceeded,
RecentResults: []bool{true},
RecentLatencies: []time.Duration{100 * time.Millisecond},
},
@@ -369,7 +370,7 @@
name: "single_failure",
results: []probeResult{{latency: 100 * time.Millisecond, err: errors.New("error123")}},
wantProbeInfo: ProbeInfo{
Result: false,
Status: ProbeStatusFailed,
RecentResults: []bool{false},
RecentLatencies: nil,
Error: "error123",
@@ -390,7 +391,7 @@
{latency: 80 * time.Millisecond, err: nil},
},
wantProbeInfo: ProbeInfo{
Result: true,
Status: ProbeStatusSucceeded,
Latency: 80 * time.Millisecond,
RecentResults: []bool{false, true, true, false, true, true, false, true},
RecentLatencies: []time.Duration{
@@ -420,7 +421,7 @@
{latency: 110 * time.Millisecond, err: nil},
},
wantProbeInfo: ProbeInfo{
Result: true,
Status: ProbeStatusSucceeded,
Latency: 110 * time.Millisecond,
RecentResults: []bool{true, true, true, true, true, true, true, true, true, true},
RecentLatencies: []time.Duration{
@@ -483,7 +484,7 @@ func TestProberRunHandler(t *testing.T) {
ProbeInfo: ProbeInfo{
Name: "success",
Interval: probeInterval,
Result: true,
Status: ProbeStatusSucceeded,
RecentResults: []bool{true, true},
},
PreviousSuccessRatio: 1,
@@ -498,7 +499,7 @@
ProbeInfo: ProbeInfo{
Name: "failure",
Interval: probeInterval,
Result: false,
Status: ProbeStatusFailed,
Error: "error123",
RecentResults: []bool{false, false},
},

prober/status.go

@@ -62,7 +62,8 @@ func (p *Prober) StatusHandler(opts ...statusHandlerOpt) tsweb.ReturnHandlerFunc
return func(w http.ResponseWriter, r *http.Request) error {
type probeStatus struct {
ProbeInfo
TimeSinceLast time.Duration
TimeSinceLastStart time.Duration
TimeSinceLastEnd time.Duration
Links map[string]template.URL
}
vars := struct {
@@ -81,12 +82,15 @@
for name, info := range p.ProbeInfo() {
vars.TotalProbes++
if !info.Result {
if info.Error != "" {
vars.UnhealthyProbes++
}
s := probeStatus{ProbeInfo: info}
if !info.Start.IsZero() {
s.TimeSinceLastStart = time.Since(info.Start).Truncate(time.Second)
}
if !info.End.IsZero() {
s.TimeSinceLast = time.Since(info.End).Truncate(time.Second)
s.TimeSinceLastEnd = time.Since(info.End).Truncate(time.Second)
}
for textTpl, urlTpl := range params.probeLinks {
text, err := renderTemplate(textTpl, info)

prober/status.html

@@ -73,8 +73,9 @@
<th>Name</th>
<th>Probe Class & Labels</th>
<th>Interval</th>
<th>Last Attempt</th>
<th>Success</th>
<th>Last Finished</th>
<th>Last Started</th>
<th>Status</th>
<th>Latency</th>
<th>Last Error</th>
</tr></thead>
@@ -85,10 +86,12 @@
{{$name}}
{{range $text, $url := $probeInfo.Links}}
<br/>
{{if not $probeInfo.Continuous}}
<button onclick="location.href='{{$url}}';" type="button">
{{$text}}
</button>
{{end}}
{{end}}
</td>
<td>{{$probeInfo.Class}}<br/>
<div class="small">
@@ -97,28 +100,48 @@
{{end}}
</div>
</td>
<td>{{$probeInfo.Interval}}</td>
<td data-sort="{{$probeInfo.TimeSinceLast.Milliseconds}}">
{{if $probeInfo.TimeSinceLast}}
{{$probeInfo.TimeSinceLast.String}} ago<br/>
<td>
{{if $probeInfo.Continuous}}
Continuous
{{else}}
{{$probeInfo.Interval}}
{{end}}
</td>
<td data-sort="{{$probeInfo.TimeSinceLastEnd.Milliseconds}}">
{{if $probeInfo.TimeSinceLastEnd}}
{{$probeInfo.TimeSinceLastEnd.String}} ago<br/>
<span class="small">{{$probeInfo.End.Format "2006-01-02T15:04:05Z07:00"}}</span>
{{else}}
Never
{{end}}
</td>
<td>
{{if $probeInfo.Result}}
{{$probeInfo.Result}}
<td data-sort="{{$probeInfo.TimeSinceLastStart.Milliseconds}}">
{{if $probeInfo.TimeSinceLastStart}}
{{$probeInfo.TimeSinceLastStart.String}} ago<br/>
<span class="small">{{$probeInfo.Start.Format "2006-01-02T15:04:05Z07:00"}}</span>
{{else}}
<span class="error">{{$probeInfo.Result}}</span>
Never
{{end}}
</td>
<td>
{{if $probeInfo.Error}}
<span class="error">{{$probeInfo.Status}}</span>
{{else}}
{{$probeInfo.Status}}
{{end}}<br/>
{{if not $probeInfo.Continuous}}
<div class="small">Recent: {{$probeInfo.RecentResults}}</div>
<div class="small">Mean: {{$probeInfo.RecentSuccessRatio}}</div>
{{end}}
</td>
<td data-sort="{{$probeInfo.Latency.Milliseconds}}">
{{if $probeInfo.Continuous}}
n/a
{{else}}
{{$probeInfo.Latency.String}}
<div class="small">Recent: {{$probeInfo.RecentLatencies}}</div>
<div class="small">Median: {{$probeInfo.RecentMedianLatency}}</div>
{{end}}
</td>
<td class="small">{{$probeInfo.Error}}</td>
</tr>