diff --git a/wgengine/watchdog.go b/wgengine/watchdog.go index 432d869dd..46260221f 100644 --- a/wgengine/watchdog.go +++ b/wgengine/watchdog.go @@ -8,10 +8,12 @@ package wgengine import ( + "fmt" "log" "net/netip" "runtime/pprof" "strings" + "sync" "time" "tailscale.com/envknob" @@ -38,21 +40,49 @@ func NewWatchdog(e Engine) Engine { return e } return &watchdogEngine{ - wrap: e, - logf: log.Printf, - fatalf: log.Fatalf, - maxWait: 45 * time.Second, + wrap: e, + logf: log.Printf, + fatalf: log.Fatalf, + maxWait: 45 * time.Second, + inFlight: make(map[inFlightKey]time.Time), } } +type inFlightKey struct { + op string + ctr uint64 +} + type watchdogEngine struct { wrap Engine logf func(format string, args ...any) fatalf func(format string, args ...any) maxWait time.Duration + + // Track the start time(s) of in-flight operations + inFlightMu sync.Mutex + inFlight map[inFlightKey]time.Time + inFlightCtr uint64 } func (e *watchdogEngine) watchdogErr(name string, fn func() error) error { + // Track all in-flight operations so we can print more useful error + // messages on watchdog failure + e.inFlightMu.Lock() + key := inFlightKey{ + op: name, + ctr: e.inFlightCtr, + } + e.inFlightCtr++ + e.inFlight[key] = time.Now() + e.inFlightMu.Unlock() + + defer func() { + e.inFlightMu.Lock() + defer e.inFlightMu.Unlock() + delete(e.inFlight, key) + }() + errCh := make(chan error) go func() { errCh <- fn() @@ -66,6 +96,22 @@ func (e *watchdogEngine) watchdogErr(name string, fn func() error) error { buf := new(strings.Builder) pprof.Lookup("goroutine").WriteTo(buf, 1) e.logf("wgengine watchdog stacks:\n%s", buf.String()) + + // Collect the list of in-flight operations for debugging. + var ( + b []byte + now = time.Now() + ) + e.inFlightMu.Lock() + for k, t := range e.inFlight { + dur := now.Sub(t).Round(time.Millisecond) + b = fmt.Appendf(b, "in-flight[%d]: name=%s duration=%v start=%s\n", k.ctr, k.op, dur, t.Format(time.RFC3339Nano)) + } + e.inFlightMu.Unlock() + + // Print everything as a single string to avoid log + // rate limits. + e.logf("wgengine watchdog in-flight:\n%s", b) e.fatalf("wgengine: watchdog timeout on %s", name) return nil } diff --git a/wgengine/watchdog_test.go b/wgengine/watchdog_test.go index 277767837..cf950b866 100644 --- a/wgengine/watchdog_test.go +++ b/wgengine/watchdog_test.go @@ -6,6 +6,7 @@ package wgengine import ( "fmt" + "regexp" "runtime" "strings" "testing" @@ -68,9 +69,15 @@ func TestWatchdog(t *testing.T) { select { case <-fatalCalled: - if !strings.Contains(logBuf.String(), "goroutine profile: total ") { - t.Errorf("fatal called without watchdog stacks, got: %s", logBuf.String()) + logs := logBuf.String() + if !strings.Contains(logs, "goroutine profile: total ") { + t.Errorf("fatal called without watchdog stacks, got: %s", logs) } + re := regexp.MustCompile(`(?m)^\s*in-flight\[\d+\]: name=RequestStatus duration=.* start=.*$`) + if !re.MatchString(logs) { + t.Errorf("fatal called without in-flight list, got: %s", logs) + } + // expected case <-time.After(3 * time.Second): t.Fatalf("watchdog failed to fire")