control/controlhttp: simplify, fix race dialing, remove priority concept

controlhttp has the responsibility of dialing a set of candidate control
endpoints in a way that minimizes user facing latency. If one control
endpoint is unavailable we promptly dial another, racing across the
dimensions of: IPv6, IPv4, port 80, and port 443, over multiple server
endpoints.

In the case that the top priority endpoint was not available, the prior
implementation would hang waiting for other results, so as to try to
return the highest priority successful connection to the rest of the
client code. This hang would take too long with a large dialplan and
sufficient client to endpoint latency as to cause the server to timeout
the connection due to inactivity in the intermediate state.

Instead of trying to prioritize non-ideal candidate connections, the
first successful connection is now used unconditionally, improving user
facing latency and avoiding any delays that would encroach on the
server-side timeout.

The tests are converted to memnet and synctest, running on all
platforms.

Fixes #8442
Fixes tailscale/corp#32534

Co-authored-by: James Tucker <james@tailscale.com>
Change-Id: I4eb57f046d8b40403220e40eb67a31c41adb3a38
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
Signed-off-by: James Tucker <james@tailscale.com>
This commit is contained in:
Brad Fitzpatrick
2025-09-20 16:48:18 -07:00
committed by James Tucker
parent 1b6bc37f28
commit db048e905d
4 changed files with 316 additions and 334 deletions

View File

@@ -27,14 +27,12 @@ import (
"errors"
"fmt"
"io"
"math"
"net"
"net/http"
"net/http/httptrace"
"net/netip"
"net/url"
"runtime"
"sort"
"sync/atomic"
"time"
@@ -53,7 +51,6 @@ import (
"tailscale.com/syncs"
"tailscale.com/tailcfg"
"tailscale.com/tstime"
"tailscale.com/util/multierr"
)
var stdDialer net.Dialer
@@ -110,18 +107,8 @@ func (a *Dialer) dial(ctx context.Context) (*ClientConn, error) {
}
candidates := a.DialPlan.Candidates
// Otherwise, we try dialing per the plan. Store the highest priority
// in the list, so that if we get a connection to one of those
// candidates we can return quickly.
var highestPriority int = math.MinInt
for _, c := range candidates {
if c.Priority > highestPriority {
highestPriority = c.Priority
}
}
// This context allows us to cancel in-flight connections if we get a
// highest-priority connection before we're all done.
// Create a context to be canceled as we return, so once we get a good connection,
// we can drop all the other ones.
ctx, cancel := context.WithCancel(ctx)
defer cancel()
@@ -129,142 +116,58 @@ func (a *Dialer) dial(ctx context.Context) (*ClientConn, error) {
type dialResult struct {
conn *ClientConn
err error
cand tailcfg.ControlIPCandidate
}
resultsCh := make(chan dialResult, len(candidates))
resultsCh := make(chan dialResult) // unbuffered, never closed
var pending atomic.Int32
pending.Store(int32(len(candidates)))
for _, c := range candidates {
go func(ctx context.Context, c tailcfg.ControlIPCandidate) {
var (
conn *ClientConn
err error
)
dialCand := func(cand tailcfg.ControlIPCandidate) (*ClientConn, error) {
if cand.ACEHost != "" {
a.logf("[v2] controlhttp: waited %.2f seconds, dialing %q via ACE %s (%s)", cand.DialStartDelaySec, a.Hostname, cand.ACEHost, cmp.Or(cand.IP.String(), "dns"))
} else {
a.logf("[v2] controlhttp: waited %.2f seconds, dialing %q @ %s", cand.DialStartDelaySec, a.Hostname, cand.IP.String())
}
// Always send results back to our channel.
defer func() {
resultsCh <- dialResult{conn, err, c}
if pending.Add(-1) == 0 {
close(resultsCh)
}
}()
// If non-zero, wait the configured start timeout
// before we do anything.
if c.DialStartDelaySec > 0 {
a.logf("[v2] controlhttp: waiting %.2f seconds before dialing %q @ %v", c.DialStartDelaySec, a.Hostname, c.IP)
tmr, tmrChannel := a.clock().NewTimer(time.Duration(c.DialStartDelaySec * float64(time.Second)))
defer tmr.Stop()
select {
case <-ctx.Done():
err = ctx.Err()
return
case <-tmrChannel:
}
}
// Now, create a sub-context with the given timeout and
// try dialing the provided host.
ctx, cancel := context.WithTimeout(ctx, time.Duration(c.DialTimeoutSec*float64(time.Second)))
defer cancel()
if c.IP.IsValid() {
a.logf("[v2] controlhttp: trying to dial %q @ %v", a.Hostname, c.IP)
} else if c.ACEHost != "" {
a.logf("[v2] controlhttp: trying to dial %q via ACE %q", a.Hostname, c.ACEHost)
}
// This will dial, and the defer above sends it back to our parent.
conn, err = a.dialHostOpt(ctx, c.IP, c.ACEHost)
}(ctx, c)
ctx, cancel := context.WithTimeout(ctx, time.Duration(cand.DialTimeoutSec*float64(time.Second)))
defer cancel()
return a.dialHostOpt(ctx, cand.IP, cand.ACEHost)
}
var results []dialResult
for res := range resultsCh {
// If we get a response that has the highest priority, we don't
// need to wait for any of the other connections to finish; we
// can just return this connection.
//
// TODO(andrew): we could make this better by keeping track of
// the highest remaining priority dynamically, instead of just
// checking for the highest total
if res.cand.Priority == highestPriority && res.conn != nil {
a.logf("[v1] controlhttp: high-priority success dialing %q @ %v from dial plan", a.Hostname, cmp.Or(res.cand.ACEHost, res.cand.IP.String()))
// Drain the channel and any existing connections in
// the background.
for _, cand := range candidates {
timer := time.AfterFunc(time.Duration(cand.DialStartDelaySec*float64(time.Second)), func() {
go func() {
for _, res := range results {
if res.conn != nil {
res.conn.Close()
conn, err := dialCand(cand)
select {
case resultsCh <- dialResult{conn, err}:
if err == nil {
a.logf("[v1] controlhttp: succeeded dialing %q @ %v from dial plan", a.Hostname, cmp.Or(cand.ACEHost, cand.IP.String()))
}
}
for res := range resultsCh {
if res.conn != nil {
res.conn.Close()
case <-ctx.Done():
if conn != nil {
conn.Close()
}
}
if a.drainFinished != nil {
close(a.drainFinished)
}
}()
return res.conn, nil
}
// This isn't a highest-priority result, so just store it until
// we're done.
results = append(results, res)
})
defer timer.Stop()
}
// After we finish this function, close any remaining open connections.
defer func() {
for _, result := range results {
// Note: below, we nil out the returned connection (if
// any) in the slice so we don't close it.
if result.conn != nil {
result.conn.Close()
var errs []error
for {
select {
case res := <-resultsCh:
if res.err == nil {
return res.conn, nil
}
errs = append(errs, res.err)
if len(errs) == len(candidates) {
// If we get here, then we didn't get anywhere with our dial plan; fall back to just using DNS.
a.logf("controlhttp: failed dialing using DialPlan, falling back to DNS; errs=%s", errors.Join(errs...))
return a.dialHost(ctx)
}
case <-ctx.Done():
a.logf("controlhttp: context aborted dialing")
return nil, ctx.Err()
}
// We don't drain asynchronously after this point, so notify our
// channel when we return.
if a.drainFinished != nil {
close(a.drainFinished)
}
}()
// Sort by priority, then take the first non-error response.
sort.Slice(results, func(i, j int) bool {
// NOTE: intentionally inverted so that the highest priority
// item comes first
return results[i].cand.Priority > results[j].cand.Priority
})
var (
conn *ClientConn
errs []error
)
for i, result := range results {
if result.err != nil {
errs = append(errs, result.err)
continue
}
a.logf("[v1] controlhttp: succeeded dialing %q @ %v from dial plan", a.Hostname, cmp.Or(result.cand.ACEHost, result.cand.IP.String()))
conn = result.conn
results[i].conn = nil // so we don't close it in the defer
return conn, nil
}
if ctx.Err() != nil {
a.logf("controlhttp: context aborted dialing")
return nil, ctx.Err()
}
merr := multierr.New(errs...)
// If we get here, then we didn't get anywhere with our dial plan; fall back to just using DNS.
a.logf("controlhttp: failed dialing using DialPlan, falling back to DNS; errs=%s", merr.Error())
return a.dialHost(ctx)
}
// The TS_FORCE_NOISE_443 envknob forces the controlclient noise dialer to
@@ -402,6 +305,9 @@ func (a *Dialer) dialHostOpt(ctx context.Context, optAddr netip.Addr, optACEHost
}
var err80, err443 error
if forceTLS {
err80 = errors.New("TLS forced: no port 80 dialed")
}
for {
select {
case <-ctx.Done():