control/controlclient: use lite map request handler to avoid aborting streams

Previously, any change to endpoints or hostinfo (or hostinfo's
netinfo) would result in the long-running map request HTTP stream
being torn down and restarted, losing all compression context along
with it.

This change makes us instead send a lite map request (OmitPeers: true,
Stream: false) that doesn't subscribe to anything, and then the
coordination server knows to not close other streams for that node
when it recives a lite request.

Fixes tailscale/corp#797

Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
Brad Fitzpatrick 2020-12-23 13:03:16 -08:00 committed by Brad Fitzpatrick
parent e692e3866b
commit 2f04f49376
2 changed files with 83 additions and 13 deletions

View File

@ -117,15 +117,16 @@ type Client struct {
mu sync.Mutex // mutex guards the following fields
statusFunc func(Status) // called to update Client status
paused bool // whether we should stop making HTTP requests
unpauseWaiters []chan struct{}
loggedIn bool // true if currently logged in
loginGoal *LoginGoal // non-nil if some login activity is desired
synced bool // true if our netmap is up-to-date
hostinfo *tailcfg.Hostinfo
inPollNetMap bool // true if currently running a PollNetMap
inSendStatus int // number of sendStatus calls currently in progress
state State
paused bool // whether we should stop making HTTP requests
unpauseWaiters []chan struct{}
loggedIn bool // true if currently logged in
loginGoal *LoginGoal // non-nil if some login activity is desired
synced bool // true if our netmap is up-to-date
hostinfo *tailcfg.Hostinfo
inPollNetMap bool // true if currently running a PollNetMap
inLiteMapUpdate bool // true if a lite (non-streaming) map request is outstanding
inSendStatus int // number of sendStatus calls currently in progress
state State
authCtx context.Context // context used for auth requests
mapCtx context.Context // context used for netmap requests
@ -201,6 +202,50 @@ func (c *Client) Start() {
go c.mapRoutine()
}
// sendNewMapRequest either sends a new OmitPeers, non-streaming map request
// (to just send Hostinfo/Netinfo/Endpoints info, while keeping an existing
// streaming response open), or start a new streaming one if necessary.
//
// It should be called whenever there's something new to tell the server.
func (c *Client) sendNewMapRequest() {
c.mu.Lock()
// If we're not already streaming a netmap, or if we're already stuck
// in a lite update, then tear down everything and start a new stream
// (which starts by sending a new map request)
if !c.inPollNetMap || c.inLiteMapUpdate {
c.mu.Unlock()
c.cancelMapSafely()
return
}
// Otherwise, send a lite update that doesn't keep a
// long-running stream response.
defer c.mu.Unlock()
c.inLiteMapUpdate = true
ctx, cancel := context.WithTimeout(c.mapCtx, 10*time.Second)
go func() {
defer cancel()
t0 := time.Now()
err := c.direct.SendLiteMapUpdate(ctx)
d := time.Since(t0).Round(time.Millisecond)
c.mu.Lock()
c.inLiteMapUpdate = false
c.mu.Unlock()
if err == nil {
c.logf("[v1] successful lite map update in %v", d)
return
}
if ctx.Err() == nil {
c.logf("lite map update after %v: %v", d, err)
}
// Fall back to restarting the long-polling map
// request (the old heavy way) if the lite update
// failed for any reason.
c.cancelMapSafely()
}()
}
func (c *Client) cancelAuth() {
c.mu.Lock()
if c.authCancel != nil {
@ -545,7 +590,7 @@ func (c *Client) SetHostinfo(hi *tailcfg.Hostinfo) {
c.logf("Hostinfo: %v", hi)
// Send new Hostinfo to server
c.cancelMapSafely()
c.sendNewMapRequest()
}
func (c *Client) SetNetInfo(ni *tailcfg.NetInfo) {
@ -553,13 +598,12 @@ func (c *Client) SetNetInfo(ni *tailcfg.NetInfo) {
panic("nil NetInfo")
}
if !c.direct.SetNetInfo(ni) {
c.logf("[unexpected] duplicate NetInfo: %v", ni)
return
}
c.logf("NetInfo: %v", ni)
// Send new Hostinfo (which includes NetInfo) to server
c.cancelMapSafely()
c.sendNewMapRequest()
}
func (c *Client) sendStatus(who string, err error, url string, nm *NetworkMap) {
@ -636,7 +680,7 @@ func (c *Client) Logout() {
func (c *Client) UpdateEndpoints(localPort uint16, endpoints []string) {
changed := c.direct.SetEndpoints(localPort, endpoints)
if changed {
c.cancelMapSafely()
c.sendNewMapRequest()
}
}

View File

@ -521,6 +521,18 @@ func inTest() bool { return flag.Lookup("test.v") != nil }
// maxPolls is how many network maps to download; common values are 1
// or -1 (to keep a long-poll query open to the server).
func (c *Direct) PollNetMap(ctx context.Context, maxPolls int, cb func(*NetworkMap)) error {
return c.sendMapRequest(ctx, maxPolls, cb)
}
// SendLiteMapUpdate makes a /map request to update the server of our latest state,
// but does not fetch anything. It returns an error if the server did not return a
// successful 200 OK response.
func (c *Direct) SendLiteMapUpdate(ctx context.Context) error {
return c.sendMapRequest(ctx, 1, nil)
}
// cb nil means to omit peers.
func (c *Direct) sendMapRequest(ctx context.Context, maxPolls int, cb func(*NetworkMap)) error {
c.mu.Lock()
persist := c.persist
serverURL := c.serverURL
@ -555,6 +567,7 @@ func (c *Direct) PollNetMap(ctx context.Context, maxPolls int, cb func(*NetworkM
Stream: allowStream,
Hostinfo: hostinfo,
DebugFlags: c.debugFlags,
OmitPeers: cb == nil,
}
if hostinfo != nil && ipForwardingBroken(hostinfo.RoutableIPs) {
old := request.DebugFlags
@ -607,6 +620,11 @@ func (c *Direct) PollNetMap(ctx context.Context, maxPolls int, cb func(*NetworkM
}
defer res.Body.Close()
if cb == nil {
io.Copy(ioutil.Discard, res.Body)
return nil
}
// If we go more than pollTimeout without hearing from the server,
// end the long poll. We should be receiving a keep alive ping
// every minute.
@ -724,6 +742,14 @@ func (c *Direct) PollNetMap(ctx context.Context, maxPolls int, cb func(*NetworkM
lastParsedPacketFilter = c.parsePacketFilter(pf)
}
// Get latest localPort. This might've changed if
// a lite map update occured meanwhile. This only affects
// the end-to-end test.
// TODO(bradfitz): remove the NetworkMap.LocalPort field entirely.
c.mu.Lock()
localPort = c.localPort
c.mu.Unlock()
nm := &NetworkMap{
NodeKey: tailcfg.NodeKey(persist.PrivateNodeKey.Public()),
PrivateKey: persist.PrivateNodeKey,