control/controlclient: stop restarting map polls on health change

At some point we started restarting map polls on health change, but we don't remember why. Maybe it was a desperate workaround for something. I'm not sure it ever worked. Rather than have a haunted graveyard, remove it. In its place, though, and somewhat as a safety backup, send those updates over the HTTP/2 noise channel if we have one open. Then, if it turns out there was a reason that a map poll restart would help, we could do it server-side. But mostly we can gather error stats and show machine-level health info for debugging. Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
2025-12-22 16:46:29 +00:00 · 2022-06-21 07:44:11 -07:00
parent 6d04184325
commit fb4e23506f
3 changed files with 40 additions and 9 deletions
--- a/control/controlclient/auto.go
+++ b/control/controlclient/auto.go
@@ -114,19 +114,11 @@ func NewNoStart(opts Options) (*Auto, error) {
 	}
 	c.authCtx, c.authCancel = context.WithCancel(context.Background())
 	c.mapCtx, c.mapCancel = context.WithCancel(context.Background())
-	c.unregisterHealthWatch = health.RegisterWatcher(c.onHealthChange)
+	c.unregisterHealthWatch = health.RegisterWatcher(direct.ReportHealthChange)
 	return c, nil

 }

-func (c *Auto) onHealthChange(sys health.Subsystem, err error) {
-	if sys == health.SysOverall {
-		return
-	}
-	c.logf("controlclient: restarting map request for %q health change to new state: %v", sys, err)
-	c.cancelMapSafely()
-}
-
 // SetPaused controls whether HTTP activity should be paused.
 //
 // The client can be paused and unpaused repeatedly, unlike Start and Shutdown, which can only be used once.
--- a/control/controlclient/direct.go
+++ b/control/controlclient/direct.go
@@ -1586,6 +1586,38 @@ func postPingResult(start time.Time, logf logger.Logf, c *http.Client, pr *tailc
 	return nil
 }

+// ReportHealthChange reports a change in the health of subsystem sys to the
+// control plane. A nil sysErr is sent as an empty Error in the request.
+func (c *Direct) ReportHealthChange(sys health.Subsystem, sysErr error) {
+	if sys == health.SysOverall {
+		// We don't report these. These include things like the network being
+		// down (in which case we can't report anyway) or the user wanting things
+		// stopped, as opposed to the more unexpected failure types in the other
+		// subsystems.
+		return
+	}
+	np, err := c.getNoiseClient()
+	if err != nil {
+		// No noise client (e.g. the server doesn't support noise): skip the report.
+		return
+	}
+	req := &tailcfg.HealthChangeRequest{
+		Subsys: string(sys),
+	}
+	if sysErr != nil {
+		req.Error = sysErr.Error()
+	}
+
+	// Best effort: give up after 5 seconds and drop any failure without logging.
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	res, err := np.post(ctx, "/machine/update-health", req)
+	if err != nil {
+		return
+	}
+	res.Body.Close()
+}
+
 var (
 	metricMapRequestsActive = clientmetric.NewGauge("controlclient_map_requests_active")

--- a/tailcfg/tailcfg.go
+++ b/tailcfg/tailcfg.go
@@ -1684,6 +1684,13 @@ type SetDNSRequest struct {
 // SetDNSResponse is the response to a SetDNSRequest.
 type SetDNSResponse struct{}

+// HealthChangeRequest is the JSON request body type used to report node
+// health changes to the control plane via a POST to /machine/update-health.
+type HealthChangeRequest struct {
+	Subsys string // a health.Subsystem value in string form
+	Error  string // the subsystem's error text, or empty if cleared
+}
+
 // SSHPolicy is the policy for how to handle incoming SSH connections
 // over Tailscale.
 type SSHPolicy struct {