From 9b77ac128ae2f33a0539e177d9247188dacdfdb7 Mon Sep 17 00:00:00 2001 From: Andrew Dunham Date: Sat, 27 Aug 2022 22:06:18 -0400 Subject: [PATCH] wgengine: print in-flight operations on watchdog trigger (#5447) In addition to printing goroutine stacks, explicitly track all in-flight operations and print them when the watchdog triggers (along with the time they were started at). This should make debugging watchdog failures easier, since we can look at the longest-running operation(s) first. Signed-off-by: Andrew Dunham Signed-off-by: Andrew Dunham --- wgengine/watchdog.go | 54 ++++++++++++++++++++++++++++++++++++--- wgengine/watchdog_test.go | 11 ++++++-- 2 files changed, 59 insertions(+), 6 deletions(-) diff --git a/wgengine/watchdog.go b/wgengine/watchdog.go index 432d869dd..46260221f 100644 --- a/wgengine/watchdog.go +++ b/wgengine/watchdog.go @@ -8,10 +8,12 @@ package wgengine import ( + "fmt" "log" "net/netip" "runtime/pprof" "strings" + "sync" "time" "tailscale.com/envknob" @@ -38,21 +40,49 @@ func NewWatchdog(e Engine) Engine { return e } return &watchdogEngine{ - wrap: e, - logf: log.Printf, - fatalf: log.Fatalf, - maxWait: 45 * time.Second, + wrap: e, + logf: log.Printf, + fatalf: log.Fatalf, + maxWait: 45 * time.Second, + inFlight: make(map[inFlightKey]time.Time), } } +type inFlightKey struct { + op string + ctr uint64 +} + type watchdogEngine struct { wrap Engine logf func(format string, args ...any) fatalf func(format string, args ...any) maxWait time.Duration + + // Track the start time(s) of in-flight operations + inFlightMu sync.Mutex + inFlight map[inFlightKey]time.Time + inFlightCtr uint64 } func (e *watchdogEngine) watchdogErr(name string, fn func() error) error { + // Track all in-flight operations so we can print more useful error + // messages on watchdog failure + e.inFlightMu.Lock() + key := inFlightKey{ + op: name, + ctr: e.inFlightCtr, + } + e.inFlightCtr++ + e.inFlight[key] = time.Now() + e.inFlightMu.Unlock() + + defer func() { + e.inFlightMu.Lock() + defer e.inFlightMu.Unlock() + delete(e.inFlight, key) + }() + errCh := make(chan error) go func() { errCh <- fn() @@ -66,6 +96,22 @@ func (e *watchdogEngine) watchdogErr(name string, fn func() error) error { buf := new(strings.Builder) pprof.Lookup("goroutine").WriteTo(buf, 1) e.logf("wgengine watchdog stacks:\n%s", buf.String()) + + // Collect the list of in-flight operations for debugging. + var ( + b []byte + now = time.Now() + ) + e.inFlightMu.Lock() + for k, t := range e.inFlight { + dur := now.Sub(t).Round(time.Millisecond) + b = fmt.Appendf(b, "in-flight[%d]: name=%s duration=%v start=%s\n", k.ctr, k.op, dur, t.Format(time.RFC3339Nano)) + } + e.inFlightMu.Unlock() + + // Print everything as a single string to avoid log + // rate limits. + e.logf("wgengine watchdog in-flight:\n%s", b) e.fatalf("wgengine: watchdog timeout on %s", name) return nil } diff --git a/wgengine/watchdog_test.go b/wgengine/watchdog_test.go index 277767837..cf950b866 100644 --- a/wgengine/watchdog_test.go +++ b/wgengine/watchdog_test.go @@ -6,6 +6,7 @@ package wgengine import ( "fmt" + "regexp" "runtime" "strings" "testing" @@ -68,9 +69,15 @@ func TestWatchdog(t *testing.T) { select { case <-fatalCalled: - if !strings.Contains(logBuf.String(), "goroutine profile: total ") { - t.Errorf("fatal called without watchdog stacks, got: %s", logBuf.String()) + logs := logBuf.String() + if !strings.Contains(logs, "goroutine profile: total ") { + t.Errorf("fatal called without watchdog stacks, got: %s", logs) } + re := regexp.MustCompile(`(?m)^\s*in-flight\[\d+\]: name=RequestStatus duration=.* start=.*$`) + if !re.MatchString(logs) { + t.Errorf("fatal called without in-flight list, got: %s", logs) + } + // expected case <-time.After(3 * time.Second): t.Fatalf("watchdog failed to fire")