control/controlclient: stop restarting map polls on health change

At some point we started restarting map polls on health change, but we
don't remember why. Maybe it was a desperate workaround for something.
I'm not sure it ever worked.

Rather than have a haunted graveyard, remove it.

In its place, though, and somewhat as a safety backup, send those
updates over the HTTP/2 noise channel if we have one open. Then, if
there were a reason that a map poll restart would help, we could do it
server-side. But mostly we can gather error stats and show
machine-level health info for debugging.

Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
Author:    Brad Fitzpatrick
Date:      2022-06-21 07:44:11 -07:00
Committer: Brad Fitzpatrick
Parent:    6d04184325
Commit:    fb4e23506f
3 changed files with 40 additions and 9 deletions
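
To illustrate the approach described in the commit message, here is a
minimal sketch, not part of this commit, of the best-effort reporting
pattern: marshal a small JSON body and POST it to /machine/update-health
with a short timeout, dropping any errors. It uses a plain *http.Client
and a placeholder URL as stand-ins for the HTTP/2 noise client that the
real code obtains from getNoiseClient (see the diff below).

// Illustration only: the best-effort reporting pattern this commit adds,
// with a plain *http.Client and a placeholder URL standing in for the
// HTTP/2 noise client.
package main

import (
	"bytes"
	"context"
	"encoding/json"
	"net/http"
	"time"
)

type healthChangeRequest struct {
	Subsys string
	Error  string
}

func reportHealthChange(subsys, errMsg string) {
	body, err := json.Marshal(healthChangeRequest{Subsys: subsys, Error: errMsg})
	if err != nil {
		return
	}
	// Best effort: short timeout, no logging, errors dropped on the floor.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	req, err := http.NewRequestWithContext(ctx, "POST",
		"https://controlplane.example/machine/update-health", // placeholder URL
		bytes.NewReader(body))
	if err != nil {
		return
	}
	req.Header.Set("Content-Type", "application/json")
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return
	}
	res.Body.Close()
}

func main() {
	reportHealthChange("router", "example failure") // hypothetical subsystem name
}

The 5-second timeout and silent error handling mirror the "Best effort,
no logging" comment in the new ReportHealthChange below.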

control/controlclient/auto.go

@@ -114,19 +114,11 @@ func NewNoStart(opts Options) (*Auto, error) {
 	}
 	c.authCtx, c.authCancel = context.WithCancel(context.Background())
 	c.mapCtx, c.mapCancel = context.WithCancel(context.Background())
-	c.unregisterHealthWatch = health.RegisterWatcher(c.onHealthChange)
+	c.unregisterHealthWatch = health.RegisterWatcher(direct.ReportHealthChange)
 	return c, nil
 }
 
-func (c *Auto) onHealthChange(sys health.Subsystem, err error) {
-	if sys == health.SysOverall {
-		return
-	}
-	c.logf("controlclient: restarting map request for %q health change to new state: %v", sys, err)
-	c.cancelMapSafely()
-}
-
 // SetPaused controls whether HTTP activity should be paused.
 //
 // The client can be paused and unpaused repeatedly, unlike Start and Shutdown, which can only be used once.
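
For context on the one-line change above, a minimal sketch (not from
this commit) of the health-watcher hookup, assuming tailscale.com/health's
RegisterWatcher accepts a func(Subsystem, error) callback and returns an
unregister func, as its use in NewNoStart suggests:

// Sketch of the health-watcher hookup, assuming RegisterWatcher's
// signature matches its use in NewNoStart above.
package main

import (
	"log"

	"tailscale.com/health"
)

func main() {
	unregister := health.RegisterWatcher(func(sys health.Subsystem, err error) {
		// After this commit, controlclient registers direct.ReportHealthChange
		// here instead of a callback that restarted the map poll.
		log.Printf("health: subsystem %q changed: %v", sys, err)
	})
	defer unregister()

	// ... the callback fires whenever any subsystem's health changes ...
}

With this commit, the registered callback no longer cancels the map
poll; it only forwards the change to the control plane via
Direct.ReportHealthChange.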

control/controlclient/direct.go

@@ -1586,6 +1586,38 @@ func postPingResult(start time.Time, logf logger.Logf, c *http.Client, pr *tailc
 	return nil
 }
 
+// ReportHealthChange reports to the control plane a change to this node's
+// health.
+func (c *Direct) ReportHealthChange(sys health.Subsystem, sysErr error) {
+	if sys == health.SysOverall {
+		// We don't report these. These include things like the network is down
+		// (in which case we can't report anyway) or the user wanted things
+		// stopped, as opposed to the more unexpected failure types in the other
+		// subsystems.
+		return
+	}
+	np, err := c.getNoiseClient()
+	if err != nil {
+		// Don't report errors to control if the server doesn't support noise.
+		return
+	}
+	req := &tailcfg.HealthChangeRequest{
+		Subsys: string(sys),
+	}
+	if sysErr != nil {
+		req.Error = sysErr.Error()
+	}
+	// Best effort, no logging:
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	res, err := np.post(ctx, "/machine/update-health", req)
+	if err != nil {
+		return
+	}
+	res.Body.Close()
+}
+
 var (
 	metricMapRequestsActive = clientmetric.NewGauge("controlclient_map_requests_active")
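
The control plane itself is not part of this repository, so the server
side of /machine/update-health is not shown in this commit. Purely as a
hypothetical sketch, a handler that decodes the tailcfg.HealthChangeRequest
body and records it for error stats or machine-level health display might
look like:

// Hypothetical only: a server-side handler for the new endpoint,
// assuming plain JSON decoding of the request body.
package main

import (
	"encoding/json"
	"log"
	"net/http"

	"tailscale.com/tailcfg"
)

func updateHealth(w http.ResponseWriter, r *http.Request) {
	var req tailcfg.HealthChangeRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, "bad request", http.StatusBadRequest)
		return
	}
	if req.Error == "" {
		log.Printf("health: node subsystem %q cleared", req.Subsys)
	} else {
		log.Printf("health: node subsystem %q: %s", req.Subsys, req.Error)
	}
	// e.g. bump per-subsystem error counters, record for debugging views, etc.
	w.WriteHeader(http.StatusNoContent)
}

func main() {
	http.HandleFunc("/machine/update-health", updateHealth)
	log.Fatal(http.ListenAndServe("localhost:8080", nil))
}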

tailcfg/tailcfg.go

@@ -1684,6 +1684,13 @@ type SetDNSRequest struct {
 // SetDNSResponse is the response to a SetDNSRequest.
 type SetDNSResponse struct{}
 
+// HealthChangeRequest is the JSON request body type used to report
+// node health changes to https://<control>/machine/<mkey hex>/update-health.
+type HealthChangeRequest struct {
+	Subsys string // a health.Subsystem value in string form
+	Error  string // or empty if cleared
+}
+
 // SSHPolicy is the policy for how to handle incoming SSH connections
 // over Tailscale.
 type SSHPolicy struct {
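
As a small illustration of the wire format this type produces (the
subsystem name "router" is hypothetical, chosen only for the example):

// Illustration of the HealthChangeRequest wire format.
package main

import (
	"encoding/json"
	"fmt"

	"tailscale.com/tailcfg"
)

func main() {
	failing, _ := json.Marshal(tailcfg.HealthChangeRequest{
		Subsys: "router", // hypothetical subsystem name
		Error:  "some error text",
	})
	cleared, _ := json.Marshal(tailcfg.HealthChangeRequest{
		Subsys: "router", // Error left empty means the problem cleared
	})
	fmt.Println(string(failing)) // {"Subsys":"router","Error":"some error text"}
	fmt.Println(string(cleared)) // {"Subsys":"router","Error":""}
}

Because the struct has no omitempty tags, a recovered subsystem is
reported with an explicit empty Error field rather than omitting it.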