control/controlclient,health,ipn/ipnlocal,health: fix deadlock by deleting health reporting

A recent change (009d702adf) introduced a deadlock where the
/machine/update-health network request to report the client's health
status update to the control plane was moved to being synchronous
within the eventbus's pump machinery.

I started to instead make the health reporting be async, but then we
realized in the three years since we added that, it's barely been used
and doesn't pay for itself, for how many HTTP requests it makes.

Instead, delete it all and replace it with a c2n handler, which
provides much more helpful information.

Fixes tailscale/corp#32952

Change-Id: I9e8a5458269ebfdda1c752d7bbb8af2780d71b04
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
Brad Fitzpatrick
2025-10-02 12:01:59 -07:00
committed by Brad Fitzpatrick
parent a208cb9fd5
commit 24e38eb729
5 changed files with 18 additions and 73 deletions

View File

@@ -12,7 +12,6 @@ import (
"sync/atomic"
"time"
"tailscale.com/health"
"tailscale.com/net/sockstats"
"tailscale.com/tailcfg"
"tailscale.com/tstime"
@@ -23,7 +22,6 @@ import (
"tailscale.com/types/structs"
"tailscale.com/util/backoff"
"tailscale.com/util/clientmetric"
"tailscale.com/util/eventbus"
"tailscale.com/util/execqueue"
)
@@ -123,8 +121,6 @@ type Auto struct {
observerQueue execqueue.ExecQueue
shutdownFn func() // to be called prior to shutdown or nil
eventSubs eventbus.Monitor
mu sync.Mutex // mutex guards the following fields
wantLoggedIn bool // whether the user wants to be logged in per last method call
@@ -195,10 +191,6 @@ func NewNoStart(opts Options) (_ *Auto, err error) {
shutdownFn: opts.Shutdown,
}
// Set up eventbus client and subscriber
ec := opts.Bus.Client("controlClient.Auto")
c.eventSubs = ec.Monitor(c.consumeEventbusTopics(ec))
c.authCtx, c.authCancel = context.WithCancel(context.Background())
c.authCtx = sockstats.WithSockStats(c.authCtx, sockstats.LabelControlClientAuto, opts.Logf)
@@ -208,27 +200,6 @@ func NewNoStart(opts Options) (_ *Auto, err error) {
return c, nil
}
// consumeEventbusTopics consumes events from all relevant
// [eventbus.Subscriber]'s and passes them to their related handler. Events are
// always handled in the order they are received, i.e. the next event is not
// read until the previous event's handler has returned. It returns when the
// [eventbus.Client] is closed.
func (c *Auto) consumeEventbusTopics(ec *eventbus.Client) func(*eventbus.Client) {
healthChangeSub := eventbus.Subscribe[health.Change](ec)
return func(cli *eventbus.Client) {
for {
select {
case <-cli.Done():
return
case change := <-healthChangeSub.Events():
if change.WarnableChanged {
c.direct.ReportWarnableChange(change.Warnable, change.UnhealthyState)
}
}
}
}
}
// SetPaused controls whether HTTP activity should be paused.
//
// The client can be paused and unpaused repeatedly, unlike Start and Shutdown, which can only be used once.
@@ -782,8 +753,6 @@ func (c *Auto) UpdateEndpoints(endpoints []tailcfg.Endpoint) {
}
func (c *Auto) Shutdown() {
c.eventSubs.Close()
c.mu.Lock()
if c.closed {
c.mu.Unlock()

View File

@@ -1678,47 +1678,6 @@ func postPingResult(start time.Time, logf logger.Logf, c *http.Client, pr *tailc
return nil
}
// ReportWarnableChange reports to the control plane a change to this node's
// health. w must be non-nil. us can be nil to indicate a healthy state for w.
func (c *Direct) ReportWarnableChange(w *health.Warnable, us *health.UnhealthyState) {
if w == health.NetworkStatusWarnable || w == health.IPNStateWarnable || w == health.LoginStateWarnable {
// We don't report these. These include things like the network is down
// (in which case we can't report anyway) or the user wanted things
// stopped, as opposed to the more unexpected failure types in the other
// subsystems.
return
}
np, err := c.getNoiseClient()
if err != nil {
// Don't report errors to control if the server doesn't support noise.
return
}
nodeKey, ok := c.GetPersist().PublicNodeKeyOK()
if !ok {
return
}
if c.panicOnUse {
panic("tainted client")
}
// TODO(angott): at some point, update `Subsys` in the request to be `Warnable`
req := &tailcfg.HealthChangeRequest{
Subsys: string(w.Code),
NodeKey: nodeKey,
}
if us != nil {
req.Error = us.Text
}
// Best effort, no logging:
ctx, cancel := context.WithTimeout(c.closedCtx, 5*time.Second)
defer cancel()
res, err := np.Post(ctx, "/machine/update-health", nodeKey, req)
if err != nil {
return
}
res.Body.Close()
}
// SetDeviceAttrs does a synchronous call to the control plane to update
// the node's attributes.
//