mirror of
https://github.com/tailscale/tailscale.git
synced 2024-11-29 04:55:31 +00:00
control/controlclient: improve handling of concurrent lite map requests
Prior to this change, if we were in the middle of a lite map update we'd tear down the entire map session and restart it. With this change, we'll cancel an in-flight lite map request up to 10 times and restart before we tear down the streaming map request. We tear down everything after 10 retries to ensure that a steady stream of calls to sendNewMapRequest doesn't fail to make progress by repeatedly canceling and restarting. Signed-off-by: Andrew Dunham <andrew@du.nham.ca> Co-authored-by: Maisem Ali <maisem@tailscale.com> Change-Id: I9392bf8cf674e7a58ccd1e476039300a359ef3b1
This commit is contained in:
parent
b0cb39cda1
commit
48f6c1eba4
@ -59,15 +59,17 @@ type Auto struct {
|
||||
|
||||
mu sync.Mutex // mutex guards the following fields
|
||||
|
||||
paused bool // whether we should stop making HTTP requests
|
||||
unpauseWaiters []chan struct{}
|
||||
loggedIn bool // true if currently logged in
|
||||
loginGoal *LoginGoal // non-nil if some login activity is desired
|
||||
synced bool // true if our netmap is up-to-date
|
||||
inPollNetMap bool // true if currently running a PollNetMap
|
||||
inLiteMapUpdate bool // true if a lite (non-streaming) map request is outstanding
|
||||
inSendStatus int // number of sendStatus calls currently in progress
|
||||
state State
|
||||
paused bool // whether we should stop making HTTP requests
|
||||
unpauseWaiters []chan struct{}
|
||||
loggedIn bool // true if currently logged in
|
||||
loginGoal *LoginGoal // non-nil if some login activity is desired
|
||||
synced bool // true if our netmap is up-to-date
|
||||
inPollNetMap bool // true if currently running a PollNetMap
|
||||
inLiteMapUpdate bool // true if a lite (non-streaming) map request is outstanding
|
||||
liteMapUpdateCancel func() // cancels a lite map update
|
||||
liteMapUpdateCancels int // how many times we've canceled a lite map update
|
||||
inSendStatus int // number of sendStatus calls currently in progress
|
||||
state State
|
||||
|
||||
authCtx context.Context // context used for auth requests
|
||||
mapCtx context.Context // context used for netmap requests
|
||||
@ -168,28 +170,55 @@ func (c *Auto) Start() {
|
||||
func (c *Auto) sendNewMapRequest() {
|
||||
c.mu.Lock()
|
||||
|
||||
// If we're not already streaming a netmap, or if we're already stuck
|
||||
// in a lite update, then tear down everything and start a new stream
|
||||
// (which starts by sending a new map request)
|
||||
if !c.inPollNetMap || c.inLiteMapUpdate || !c.loggedIn {
|
||||
// If we're not already streaming a netmap, then tear down everything
|
||||
// and start a new stream (which starts by sending a new map request)
|
||||
if !c.inPollNetMap || !c.loggedIn {
|
||||
c.mu.Unlock()
|
||||
c.cancelMapSafely()
|
||||
return
|
||||
}
|
||||
|
||||
// If we are already in process of doing a LiteMapUpdate, cancel it and
|
||||
// try a new one. If this is the 10th time we have done this
|
||||
// cancelation, tear down everything and start again
|
||||
if c.inLiteMapUpdate {
|
||||
// Always cancel the in-flight lite map update, regardless of
|
||||
// whether we cancel the streaming map request or not.
|
||||
c.liteMapUpdateCancel()
|
||||
c.inLiteMapUpdate = false
|
||||
|
||||
if c.liteMapUpdateCancels > 10 {
|
||||
// Not making progress
|
||||
c.mu.Unlock()
|
||||
c.cancelMapSafely()
|
||||
return
|
||||
}
|
||||
|
||||
// Increment our cancel counter and continue below to start a
|
||||
// new lite update.
|
||||
c.liteMapUpdateCancels++
|
||||
}
|
||||
|
||||
// Otherwise, send a lite update that doesn't keep a
|
||||
// long-running stream response.
|
||||
defer c.mu.Unlock()
|
||||
c.inLiteMapUpdate = true
|
||||
ctx, cancel := context.WithTimeout(c.mapCtx, 10*time.Second)
|
||||
c.liteMapUpdateCancel = cancel
|
||||
go func() {
|
||||
defer cancel()
|
||||
t0 := time.Now()
|
||||
err := c.direct.SendLiteMapUpdate(ctx)
|
||||
d := time.Since(t0).Round(time.Millisecond)
|
||||
|
||||
c.mu.Lock()
|
||||
c.inLiteMapUpdate = false
|
||||
c.liteMapUpdateCancel = nil
|
||||
if err == nil {
|
||||
c.liteMapUpdateCancels = 0
|
||||
}
|
||||
c.mu.Unlock()
|
||||
|
||||
if err == nil {
|
||||
c.logf("[v1] successful lite map update in %v", d)
|
||||
return
|
||||
@ -197,10 +226,13 @@ func (c *Auto) sendNewMapRequest() {
|
||||
if ctx.Err() == nil {
|
||||
c.logf("lite map update after %v: %v", d, err)
|
||||
}
|
||||
// Fall back to restarting the long-polling map
|
||||
// request (the old heavy way) if the lite update
|
||||
// failed for any reason.
|
||||
c.cancelMapSafely()
|
||||
if !errors.Is(ctx.Err(), context.Canceled) {
|
||||
// Fall back to restarting the long-polling map
|
||||
// request (the old heavy way) if the lite update
|
||||
// failed for reasons other than the context being
|
||||
// canceled.
|
||||
c.cancelMapSafely()
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
@ -237,6 +269,12 @@ func (c *Auto) cancelMapSafely() {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
// Always reset our lite map cancels counter if we're canceling
|
||||
// everything, since we're about to restart with a new map update; this
|
||||
// allows future calls to sendNewMapRequest to retry sending lite
|
||||
// updates.
|
||||
c.liteMapUpdateCancels = 0
|
||||
|
||||
c.logf("[v1] cancelMapSafely: synced=%v", c.synced)
|
||||
|
||||
if c.inPollNetMap {
|
||||
|
Loading…
Reference in New Issue
Block a user