mirror of
https://github.com/tailscale/tailscale.git
synced 2024-11-30 05:25:35 +00:00
a01b545441
1eaad7d3de
regressed some tests in another repo that were starting up a control server on `http://127.0.0.1:nnn`. Because there was no https running, and because of a bug in 1eaad7d3de
(which ended up checking the recently-dialed-control check twice in a single dial call), we ended up forcing only the use of TLS dials in a test that only had plaintext HTTP running. Instead, plumb down support for explicitly disabling TLS fallbacks and use it only when running in a test and using `http` scheme control plane URLs to 127.0.0.1 or localhost. This fixes the tests elsewhere. Updates #13597 Change-Id: I97212ded21daf0bd510891a278078daec3eebaa6 Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
407 lines
12 KiB
Go
407 lines
12 KiB
Go
// Copyright (c) Tailscale Inc & AUTHORS
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
package controlclient
|
|
|
|
import (
|
|
"bytes"
|
|
"cmp"
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"math"
|
|
"net/http"
|
|
"net/url"
|
|
"sync"
|
|
"time"
|
|
|
|
"golang.org/x/net/http2"
|
|
"tailscale.com/control/controlhttp"
|
|
"tailscale.com/envknob"
|
|
"tailscale.com/health"
|
|
"tailscale.com/internal/noiseconn"
|
|
"tailscale.com/net/dnscache"
|
|
"tailscale.com/net/netmon"
|
|
"tailscale.com/net/tsdial"
|
|
"tailscale.com/tailcfg"
|
|
"tailscale.com/tstime"
|
|
"tailscale.com/types/key"
|
|
"tailscale.com/types/logger"
|
|
"tailscale.com/util/mak"
|
|
"tailscale.com/util/multierr"
|
|
"tailscale.com/util/singleflight"
|
|
"tailscale.com/util/testenv"
|
|
)
|
|
|
|
// NoiseClient provides a http.Client to connect to tailcontrol over
|
|
// the ts2021 protocol.
|
|
type NoiseClient struct {
|
|
// Client is an HTTP client to talk to the coordination server.
|
|
// It automatically makes a new Noise connection as needed.
|
|
// It does not support node key proofs. To do that, call
|
|
// noiseClient.getConn instead to make a connection.
|
|
*http.Client
|
|
|
|
// h2t is the HTTP/2 transport we use a bit to create new
|
|
// *http2.ClientConns. We don't use its connection pool and we don't use its
|
|
// dialing. We use it for exactly one reason: its idle timeout that can only
|
|
// be configured via the HTTP/1 config. And then we call NewClientConn (with
|
|
// an existing Noise connection) on the http2.Transport which sets up an
|
|
// http2.ClientConn using that idle timeout from an http1.Transport.
|
|
h2t *http2.Transport
|
|
|
|
// sfDial ensures that two concurrent requests for a noise connection only
|
|
// produce one shared one between the two callers.
|
|
sfDial singleflight.Group[struct{}, *noiseconn.Conn]
|
|
|
|
dialer *tsdial.Dialer
|
|
dnsCache *dnscache.Resolver
|
|
privKey key.MachinePrivate
|
|
serverPubKey key.MachinePublic
|
|
host string // the host part of serverURL
|
|
httpPort string // the default port to dial
|
|
httpsPort string // the fallback Noise-over-https port or empty if none
|
|
|
|
// dialPlan optionally returns a ControlDialPlan previously received
|
|
// from the control server; either the function or the return value can
|
|
// be nil.
|
|
dialPlan func() *tailcfg.ControlDialPlan
|
|
|
|
logf logger.Logf
|
|
netMon *netmon.Monitor
|
|
health *health.Tracker
|
|
|
|
// mu only protects the following variables.
|
|
mu sync.Mutex
|
|
closed bool
|
|
last *noiseconn.Conn // or nil
|
|
nextID int
|
|
connPool map[int]*noiseconn.Conn // active connections not yet closed; see noiseconn.Conn.Close
|
|
}
|
|
|
|
// NoiseOpts contains options for the NewNoiseClient function. All fields are
|
|
// required unless otherwise specified.
|
|
type NoiseOpts struct {
|
|
// PrivKey is this node's private key.
|
|
PrivKey key.MachinePrivate
|
|
// ServerPubKey is the public key of the server.
|
|
ServerPubKey key.MachinePublic
|
|
// ServerURL is the URL of the server to connect to.
|
|
ServerURL string
|
|
// Dialer's SystemDial function is used to connect to the server.
|
|
Dialer *tsdial.Dialer
|
|
// DNSCache is the caching Resolver to use to connect to the server.
|
|
//
|
|
// This field can be nil.
|
|
DNSCache *dnscache.Resolver
|
|
// Logf is the log function to use. This field can be nil.
|
|
Logf logger.Logf
|
|
// NetMon is the network monitor that, if set, will be used to get the
|
|
// network interface state. This field can be nil; if so, the current
|
|
// state will be looked up dynamically.
|
|
NetMon *netmon.Monitor
|
|
// HealthTracker, if non-nil, is the health tracker to use.
|
|
HealthTracker *health.Tracker
|
|
// DialPlan, if set, is a function that should return an explicit plan
|
|
// on how to connect to the server.
|
|
DialPlan func() *tailcfg.ControlDialPlan
|
|
}
|
|
|
|
// controlIsPlaintext is whether we should assume that the controlplane is only accessible
|
|
// over plaintext HTTP (as the first hop, before the ts2021 encryption begins).
|
|
// This is used by some tests which don't have a real TLS certificate.
|
|
var controlIsPlaintext = envknob.RegisterBool("TS_CONTROL_IS_PLAINTEXT_HTTP")
|
|
|
|
// NewNoiseClient returns a new noiseClient for the provided server and machine key.
|
|
// serverURL is of the form https://<host>:<port> (no trailing slash).
|
|
//
|
|
// netMon may be nil, if non-nil it's used to do faster interface lookups.
|
|
// dialPlan may be nil
|
|
func NewNoiseClient(opts NoiseOpts) (*NoiseClient, error) {
|
|
u, err := url.Parse(opts.ServerURL)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var httpPort string
|
|
var httpsPort string
|
|
if port := u.Port(); port != "" {
|
|
// If there is an explicit port specified, trust the scheme and hope for the best
|
|
if u.Scheme == "http" {
|
|
httpPort = port
|
|
httpsPort = "443"
|
|
if (testenv.InTest() || controlIsPlaintext()) && (u.Hostname() == "127.0.0.1" || u.Hostname() == "localhost") {
|
|
httpsPort = ""
|
|
}
|
|
} else {
|
|
httpPort = "80"
|
|
httpsPort = port
|
|
}
|
|
} else {
|
|
// Otherwise, use the standard ports
|
|
httpPort = "80"
|
|
httpsPort = "443"
|
|
}
|
|
|
|
np := &NoiseClient{
|
|
serverPubKey: opts.ServerPubKey,
|
|
privKey: opts.PrivKey,
|
|
host: u.Hostname(),
|
|
httpPort: httpPort,
|
|
httpsPort: httpsPort,
|
|
dialer: opts.Dialer,
|
|
dnsCache: opts.DNSCache,
|
|
dialPlan: opts.DialPlan,
|
|
logf: opts.Logf,
|
|
netMon: opts.NetMon,
|
|
health: opts.HealthTracker,
|
|
}
|
|
|
|
// Create the HTTP/2 Transport using a net/http.Transport
|
|
// (which only does HTTP/1) because it's the only way to
|
|
// configure certain properties on the http2.Transport. But we
|
|
// never actually use the net/http.Transport for any HTTP/1
|
|
// requests.
|
|
h2Transport, err := http2.ConfigureTransports(&http.Transport{
|
|
IdleConnTimeout: time.Minute,
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
np.h2t = h2Transport
|
|
|
|
np.Client = &http.Client{Transport: np}
|
|
return np, nil
|
|
}
|
|
|
|
// GetSingleUseRoundTripper returns a RoundTripper that can be only be used once
|
|
// (and must be used once) to make a single HTTP request over the noise channel
|
|
// to the coordination server.
|
|
//
|
|
// In addition to the RoundTripper, it returns the HTTP/2 channel's early noise
|
|
// payload, if any.
|
|
func (nc *NoiseClient) GetSingleUseRoundTripper(ctx context.Context) (http.RoundTripper, *tailcfg.EarlyNoise, error) {
|
|
for tries := 0; tries < 3; tries++ {
|
|
conn, err := nc.getConn(ctx)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
ok, earlyPayloadMaybeNil, err := conn.ReserveNewRequest(ctx)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if ok {
|
|
return conn, earlyPayloadMaybeNil, nil
|
|
}
|
|
}
|
|
return nil, nil, errors.New("[unexpected] failed to reserve a request on a connection")
|
|
}
|
|
|
|
// contextErr wraps another error to mark it as having been caused by a
// context expiring or being canceled.
type contextErr struct {
	err error
}

// Error returns the message of the wrapped error unchanged.
func (ce contextErr) Error() string { return ce.err.Error() }

// Unwrap returns the wrapped error so that errors.Is and errors.As can see
// through the wrapper.
func (ce contextErr) Unwrap() error { return ce.err }
|
|
|
|
// getConn returns a noiseconn.Conn that can be used to make requests to the
|
|
// coordination server. It may return a cached connection or create a new one.
|
|
// Dials are singleflighted, so concurrent calls to getConn may only dial once.
|
|
// As such, context values may not be respected as there are no guarantees that
|
|
// the context passed to getConn is the same as the context passed to dial.
|
|
func (nc *NoiseClient) getConn(ctx context.Context) (*noiseconn.Conn, error) {
|
|
nc.mu.Lock()
|
|
if last := nc.last; last != nil && last.CanTakeNewRequest() {
|
|
nc.mu.Unlock()
|
|
return last, nil
|
|
}
|
|
nc.mu.Unlock()
|
|
|
|
for {
|
|
// We singeflight the dial to avoid making multiple connections, however
|
|
// that means that we can't simply cancel the dial if the context is
|
|
// canceled. Instead, we have to additionally check that the context
|
|
// which was canceled is our context and retry if our context is still
|
|
// valid.
|
|
conn, err, _ := nc.sfDial.Do(struct{}{}, func() (*noiseconn.Conn, error) {
|
|
c, err := nc.dial(ctx)
|
|
if err != nil {
|
|
if ctx.Err() != nil {
|
|
return nil, contextErr{ctx.Err()}
|
|
}
|
|
return nil, err
|
|
}
|
|
return c, nil
|
|
})
|
|
var ce contextErr
|
|
if err == nil || !errors.As(err, &ce) {
|
|
return conn, err
|
|
}
|
|
if ctx.Err() == nil {
|
|
// The dial failed because of a context error, but our context
|
|
// is still valid. Retry.
|
|
continue
|
|
}
|
|
// The dial failed because our context was canceled. Return the
|
|
// underlying error.
|
|
return nil, ce.Unwrap()
|
|
}
|
|
}
|
|
|
|
func (nc *NoiseClient) RoundTrip(req *http.Request) (*http.Response, error) {
|
|
ctx := req.Context()
|
|
conn, err := nc.getConn(ctx)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return conn.RoundTrip(req)
|
|
}
|
|
|
|
// connClosed removes the connection with the provided ID from the pool
|
|
// of active connections.
|
|
func (nc *NoiseClient) connClosed(id int) {
|
|
nc.mu.Lock()
|
|
defer nc.mu.Unlock()
|
|
conn := nc.connPool[id]
|
|
if conn != nil {
|
|
delete(nc.connPool, id)
|
|
if nc.last == conn {
|
|
nc.last = nil
|
|
}
|
|
}
|
|
}
|
|
|
|
// Close closes all the underlying noise connections.
|
|
// It is a no-op and returns nil if the connection is already closed.
|
|
func (nc *NoiseClient) Close() error {
|
|
nc.mu.Lock()
|
|
nc.closed = true
|
|
conns := nc.connPool
|
|
nc.connPool = nil
|
|
nc.mu.Unlock()
|
|
|
|
var errors []error
|
|
for _, c := range conns {
|
|
if err := c.Close(); err != nil {
|
|
errors = append(errors, err)
|
|
}
|
|
}
|
|
return multierr.New(errors...)
|
|
}
|
|
|
|
// dial opens a new connection to tailcontrol, fetching the server noise key
|
|
// if not cached.
|
|
func (nc *NoiseClient) dial(ctx context.Context) (*noiseconn.Conn, error) {
|
|
nc.mu.Lock()
|
|
connID := nc.nextID
|
|
nc.nextID++
|
|
nc.mu.Unlock()
|
|
|
|
if tailcfg.CurrentCapabilityVersion > math.MaxUint16 {
|
|
// Panic, because a test should have started failing several
|
|
// thousand version numbers before getting to this point.
|
|
panic("capability version is too high to fit in the wire protocol")
|
|
}
|
|
|
|
var dialPlan *tailcfg.ControlDialPlan
|
|
if nc.dialPlan != nil {
|
|
dialPlan = nc.dialPlan()
|
|
}
|
|
|
|
// If we have a dial plan, then set our timeout as slightly longer than
|
|
// the maximum amount of time contained therein; we assume that
|
|
// explicit instructions on timeouts are more useful than a single
|
|
// hard-coded timeout.
|
|
//
|
|
// The default value of 5 is chosen so that, when there's no dial plan,
|
|
// we retain the previous behaviour of 10 seconds end-to-end timeout.
|
|
timeoutSec := 5.0
|
|
if dialPlan != nil {
|
|
for _, c := range dialPlan.Candidates {
|
|
if v := c.DialStartDelaySec + c.DialTimeoutSec; v > timeoutSec {
|
|
timeoutSec = v
|
|
}
|
|
}
|
|
}
|
|
|
|
// After we establish a connection, we need some time to actually
|
|
// upgrade it into a Noise connection. With a ballpark worst-case RTT
|
|
// of 1000ms, give ourselves an extra 5 seconds to complete the
|
|
// handshake.
|
|
timeoutSec += 5
|
|
|
|
// Be extremely defensive and ensure that the timeout is in the range
|
|
// [5, 60] seconds (e.g. if we accidentally get a negative number).
|
|
if timeoutSec > 60 {
|
|
timeoutSec = 60
|
|
} else if timeoutSec < 5 {
|
|
timeoutSec = 5
|
|
}
|
|
|
|
timeout := time.Duration(timeoutSec * float64(time.Second))
|
|
ctx, cancel := context.WithTimeout(ctx, timeout)
|
|
defer cancel()
|
|
|
|
clientConn, err := (&controlhttp.Dialer{
|
|
Hostname: nc.host,
|
|
HTTPPort: nc.httpPort,
|
|
HTTPSPort: cmp.Or(nc.httpsPort, controlhttp.NoPort),
|
|
MachineKey: nc.privKey,
|
|
ControlKey: nc.serverPubKey,
|
|
ProtocolVersion: uint16(tailcfg.CurrentCapabilityVersion),
|
|
Dialer: nc.dialer.SystemDial,
|
|
DNSCache: nc.dnsCache,
|
|
DialPlan: dialPlan,
|
|
Logf: nc.logf,
|
|
NetMon: nc.netMon,
|
|
HealthTracker: nc.health,
|
|
Clock: tstime.StdClock{},
|
|
}).Dial(ctx)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
ncc, err := noiseconn.New(clientConn.Conn, nc.h2t, connID, nc.connClosed)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
nc.mu.Lock()
|
|
if nc.closed {
|
|
nc.mu.Unlock()
|
|
ncc.Close() // Needs to be called without holding the lock.
|
|
return nil, errors.New("noise client closed")
|
|
}
|
|
defer nc.mu.Unlock()
|
|
mak.Set(&nc.connPool, connID, ncc)
|
|
nc.last = ncc
|
|
return ncc, nil
|
|
}
|
|
|
|
// post does a POST to the control server at the given path, JSON-encoding body.
|
|
// The provided nodeKey is an optional load balancing hint.
|
|
func (nc *NoiseClient) post(ctx context.Context, path string, nodeKey key.NodePublic, body any) (*http.Response, error) {
|
|
jbody, err := json.Marshal(body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
req, err := http.NewRequestWithContext(ctx, "POST", "https://"+nc.host+path, bytes.NewReader(jbody))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
addLBHeader(req, nodeKey)
|
|
req.Header.Set("Content-Type", "application/json")
|
|
|
|
conn, err := nc.getConn(ctx)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return conn.RoundTrip(req)
|
|
}
|