control/controlclient: move client watchdog to cover initial request

The initial control client request can get stuck in the event that a
connection is established but then lost part way through, without any
ICMP or RST. Ensure that the control client will be restarted by timing
out that initial request as well.

Fixes #11542

Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
James Tucker 2024-03-27 15:04:38 -07:00 committed by James Tucker
parent 9b5176c4d9
commit 9401b09028

View File

@ -941,6 +941,30 @@ func (c *Direct) sendMapRequest(ctx context.Context, isStreaming bool, nu Netmap
url = strings.Replace(url, "http:", "https:", 1)
}
// Create a watchdog timer that breaks the connection if we don't receive a
// MapResponse from the network at least once every two minutes. The
// watchdog timer is stopped every time we receive a MapResponse (so it
// doesn't run when we're processing a MapResponse message, including any
// long-running requested operations like Debug.Sleep) and is reset whenever
// we go back to blocking on network reads.
// The watchdog timer also covers the initial request (effectively the
// pre-body and initial-body read timeouts) as we do not have any other
// keep-alive mechanism for the initial request.
watchdogTimer, watchdogTimedOut := c.clock.NewTimer(watchdogTimeout)
defer watchdogTimer.Stop()
go func() {
select {
case <-ctx.Done():
vlogf("netmap: ending timeout goroutine")
return
case <-watchdogTimedOut:
c.logf("map response long-poll timed out!")
cancel()
return
}
}()
req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(bodyData))
if err != nil {
return err
@ -962,6 +986,7 @@ func (c *Direct) sendMapRequest(ctx context.Context, isStreaming bool, nu Netmap
defer res.Body.Close()
health.NoteMapRequestHeard(request)
watchdogTimer.Reset(watchdogTimeout)
if nu == nil {
io.Copy(io.Discard, res.Body)
@ -993,27 +1018,6 @@ func (c *Direct) sendMapRequest(ctx context.Context, isStreaming bool, nu Netmap
c.expiry = nm.Expiry
}
// Create a watchdog timer that breaks the connection if we don't receive a
// MapResponse from the network at least once every two minutes. The
// watchdog timer is stopped every time we receive a MapResponse (so it
// doesn't run when we're processing a MapResponse message, including any
// long-running requested operations like Debug.Sleep) and is reset whenever
// we go back to blocking on network reads.
watchdogTimer, watchdogTimedOut := c.clock.NewTimer(watchdogTimeout)
defer watchdogTimer.Stop()
go func() {
select {
case <-ctx.Done():
vlogf("netmap: ending timeout goroutine")
return
case <-watchdogTimedOut:
c.logf("map response long-poll timed out!")
cancel()
return
}
}()
// gotNonKeepAliveMessage is whether we've yet received a MapResponse message without
// KeepAlive set.
var gotNonKeepAliveMessage bool