From 48f6c1eba4e29fdac9b0f807ee50dcefa387471d Mon Sep 17 00:00:00 2001 From: Andrew Dunham Date: Tue, 7 Mar 2023 16:54:11 -0500 Subject: [PATCH] control/controlclient: improve handling of concurrent lite map requests Prior to this change, if we were in the middle of a lite map update we'd tear down the entire map session and restart it. With this change, we'll cancel an in-flight lite map request up to 10 times and restart before we tear down the streaming map request. We tear down everything after 10 retries to ensure that a steady stream of calls to sendNewMapRequest doesn't fail to make progress by repeatedly canceling and restarting. Signed-off-by: Andrew Dunham Co-authored-by: Maisem Ali Change-Id: I9392bf8cf674e7a58ccd1e476039300a359ef3b1 --- control/controlclient/auto.go | 72 ++++++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 17 deletions(-) diff --git a/control/controlclient/auto.go b/control/controlclient/auto.go index 93598d15a..188e8024d 100644 --- a/control/controlclient/auto.go +++ b/control/controlclient/auto.go @@ -59,15 +59,17 @@ type Auto struct { mu sync.Mutex // mutex guards the following fields - paused bool // whether we should stop making HTTP requests - unpauseWaiters []chan struct{} - loggedIn bool // true if currently logged in - loginGoal *LoginGoal // non-nil if some login activity is desired - synced bool // true if our netmap is up-to-date - inPollNetMap bool // true if currently running a PollNetMap - inLiteMapUpdate bool // true if a lite (non-streaming) map request is outstanding - inSendStatus int // number of sendStatus calls currently in progress - state State + paused bool // whether we should stop making HTTP requests + unpauseWaiters []chan struct{} + loggedIn bool // true if currently logged in + loginGoal *LoginGoal // non-nil if some login activity is desired + synced bool // true if our netmap is up-to-date + inPollNetMap bool // true if currently running a PollNetMap + inLiteMapUpdate bool // true if a lite (non-streaming) map request is outstanding + liteMapUpdateCancel func() // cancels a lite map update + liteMapUpdateCancels int // how many times we've canceled a lite map update + inSendStatus int // number of sendStatus calls currently in progress + state State authCtx context.Context // context used for auth requests mapCtx context.Context // context used for netmap requests @@ -168,28 +170,55 @@ func (c *Auto) Start() { func (c *Auto) sendNewMapRequest() { c.mu.Lock() - // If we're not already streaming a netmap, or if we're already stuck - // in a lite update, then tear down everything and start a new stream - // (which starts by sending a new map request) - if !c.inPollNetMap || c.inLiteMapUpdate || !c.loggedIn { + // If we're not already streaming a netmap, then tear down everything + // and start a new stream (which starts by sending a new map request) + if !c.inPollNetMap || !c.loggedIn { c.mu.Unlock() c.cancelMapSafely() return } + // If we are already in process of doing a LiteMapUpdate, cancel it and + // try a new one. If this is the 10th time we have done this + // cancelation, tear down everything and start again + if c.inLiteMapUpdate { + // Always cancel the in-flight lite map update, regardless of + // whether we cancel the streaming map request or not. + c.liteMapUpdateCancel() + c.inLiteMapUpdate = false + + if c.liteMapUpdateCancels > 10 { + // Not making progress + c.mu.Unlock() + c.cancelMapSafely() + return + } + + // Increment our cancel counter and continue below to start a + // new lite update. + c.liteMapUpdateCancels++ + } + // Otherwise, send a lite update that doesn't keep a // long-running stream response. defer c.mu.Unlock() c.inLiteMapUpdate = true ctx, cancel := context.WithTimeout(c.mapCtx, 10*time.Second) + c.liteMapUpdateCancel = cancel go func() { defer cancel() t0 := time.Now() err := c.direct.SendLiteMapUpdate(ctx) d := time.Since(t0).Round(time.Millisecond) + c.mu.Lock() c.inLiteMapUpdate = false + c.liteMapUpdateCancel = nil + if err == nil { + c.liteMapUpdateCancels = 0 + } c.mu.Unlock() + if err == nil { c.logf("[v1] successful lite map update in %v", d) return @@ -197,10 +226,13 @@ func (c *Auto) sendNewMapRequest() { if ctx.Err() == nil { c.logf("lite map update after %v: %v", d, err) } - // Fall back to restarting the long-polling map - // request (the old heavy way) if the lite update - // failed for any reason. - c.cancelMapSafely() + if !errors.Is(ctx.Err(), context.Canceled) { + // Fall back to restarting the long-polling map + // request (the old heavy way) if the lite update + // failed for reasons other than the context being + // canceled. + c.cancelMapSafely() + } }() } @@ -237,6 +269,12 @@ func (c *Auto) cancelMapSafely() { c.mu.Lock() defer c.mu.Unlock() + // Always reset our lite map cancels counter if we're canceling + // everything, since we're about to restart with a new map update; this + // allows future calls to sendNewMapRequest to retry sending lite + // updates. + c.liteMapUpdateCancels = 0 + c.logf("[v1] cancelMapSafely: synced=%v", c.synced) if c.inPollNetMap {