diff --git a/health/health.go b/health/health.go
index 662114f2d..bd2178fdc 100644
--- a/health/health.go
+++ b/health/health.go
@@ -11,6 +11,7 @@
 	"fmt"
 	"sort"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/go-multierror/multierror"
@@ -216,6 +217,7 @@ func SetAnyInterfaceUp(up bool) {
 func timerSelfCheck() {
 	mu.Lock()
 	defer mu.Unlock()
+	checkReceiveFuncs()
 	selfCheckLocked()
 	if timer != nil {
 		timer.Reset(time.Minute)
@@ -263,6 +265,11 @@ func overallErrorLocked() error {
 	_ = lastMapRequestHeard
 
 	var errs []error
+	for _, recv := range receiveFuncs {
+		if recv.missing {
+			errs = append(errs, fmt.Errorf("%s is not running", recv.name))
+		}
+	}
 	for sys, err := range sysErr {
 		if err == nil || sys == SysOverall {
 			continue
@@ -275,3 +282,58 @@ func overallErrorLocked() error {
 	})
 	return multierror.New(errs)
 }
+
+var (
+	ReceiveIPv4 = ReceiveFuncStats{name: "ReceiveIPv4"}
+	// ReceiveIPv6 isn't guaranteed to be running, so skip it for now.
+	ReceiveDERP = ReceiveFuncStats{name: "ReceiveDERP"}
+
+	receiveFuncs = []*ReceiveFuncStats{&ReceiveIPv4, &ReceiveDERP}
+)
+
+// ReceiveFuncStats tracks the calls made to a wireguard-go receive func.
+type ReceiveFuncStats struct {
+	// name is the name of the receive func.
+	name string
+	// numCalls is the number of times the receive func has ever been called.
+	// It is required because it is possible for a receive func's wireguard-go goroutine
+	// to be active even though the receive func isn't.
+	// The wireguard-go goroutine alternates between calling the receive func and
+	// processing what the func returned.
+	numCalls uint64 // accessed atomically
+	// prevNumCalls is the value of numCalls last time the health check examined it.
+	prevNumCalls uint64
+	// inCall indicates whether the receive func is currently running.
+	inCall uint32 // bool, accessed atomically
+	// missing indicates whether the receive func is not running.
+	missing bool
+}
+
+func (s *ReceiveFuncStats) Enter() {
+	atomic.AddUint64(&s.numCalls, 1)
+	atomic.StoreUint32(&s.inCall, 1)
+}
+
+func (s *ReceiveFuncStats) Exit() {
+	atomic.StoreUint32(&s.inCall, 0)
+}
+
+func checkReceiveFuncs() {
+	for _, recv := range receiveFuncs {
+		recv.missing = false
+		prev := recv.prevNumCalls
+		numCalls := atomic.LoadUint64(&recv.numCalls)
+		recv.prevNumCalls = numCalls
+		if numCalls > prev {
+			// OK: the function has gotten called since last we checked
+			continue
+		}
+		if atomic.LoadUint32(&recv.inCall) == 1 {
+			// OK: the function is active, probably blocked due to inactivity
+			continue
+		}
+		// Not OK: The function is not active, and not accumulating new calls.
+		// It is probably MIA.
+		recv.missing = true
+	}
+}
diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go
index 51b5c0af1..d7230e651 100644
--- a/wgengine/magicsock/magicsock.go
+++ b/wgengine/magicsock/magicsock.go
@@ -1594,6 +1594,8 @@ func (c *Conn) receiveIPv6(b []byte) (int, conn.Endpoint, error) {
 
 // receiveIPv4 receives a UDP IPv4 packet. It is called by wireguard-go.
 func (c *Conn) receiveIPv4(b []byte) (n int, ep conn.Endpoint, err error) {
+	health.ReceiveIPv4.Enter()
+	defer health.ReceiveIPv4.Exit()
 	for {
 		n, ipp, err := c.pconn4.ReadFromNetaddr(b)
 		if err != nil {
@@ -1646,6 +1648,8 @@ func (c *Conn) receiveIP(b []byte, ipp netaddr.IPPort, cache *ippEndpointCache)
 // If the packet was a disco message or the peer endpoint wasn't
 // found, the returned error is errLoopAgain.
 func (c *connBind) receiveDERP(b []byte) (n int, ep conn.Endpoint, err error) {
+	health.ReceiveDERP.Enter()
+	defer health.ReceiveDERP.Exit()
 	for dm := range c.derpRecvCh {
 		if c.Closed() {
 			break