mirror of
https://github.com/tailscale/tailscale.git
synced 2024-12-04 23:45:34 +00:00
a17c45fd6e
Updates #8587 Signed-off-by: Claire Wang <claire@tailscale.com>
533 lines
17 KiB
Go
533 lines
17 KiB
Go
// Copyright (c) Tailscale Inc & AUTHORS
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
//go:build !js
|
|
|
|
// Package controlhttp implements the Tailscale 2021 control protocol
|
|
// base transport over HTTP.
|
|
//
|
|
// This tunnels the protocol in control/controlbase over HTTP with a
|
|
// variety of compatibility fallbacks for handling picky or deep
|
|
// inspecting proxies.
|
|
//
|
|
// In the happy path, a client makes a single cleartext HTTP request
|
|
// to the server, the server responds with 101 Switching Protocols,
|
|
// and the control base protocol takes place over plain TCP.
|
|
//
|
|
// In the compatibility path, the client does the above over HTTPS,
|
|
// resulting in double encryption (once for the control transport, and
|
|
// once for the outer TLS layer).
|
|
package controlhttp
|
|
|
|
import (
|
|
"context"
|
|
"crypto/tls"
|
|
"encoding/base64"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"math"
|
|
"net"
|
|
"net/http"
|
|
"net/http/httptrace"
|
|
"net/netip"
|
|
"net/url"
|
|
"sort"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"tailscale.com/control/controlbase"
|
|
"tailscale.com/envknob"
|
|
"tailscale.com/net/dnscache"
|
|
"tailscale.com/net/dnsfallback"
|
|
"tailscale.com/net/netutil"
|
|
"tailscale.com/net/sockstats"
|
|
"tailscale.com/net/tlsdial"
|
|
"tailscale.com/net/tshttpproxy"
|
|
"tailscale.com/tailcfg"
|
|
"tailscale.com/tstime"
|
|
"tailscale.com/util/multierr"
|
|
)
|
|
|
|
var stdDialer net.Dialer
|
|
|
|
// Dial connects to the HTTP server at this Dialer's Host:HTTPPort, requests to
|
|
// switch to the Tailscale control protocol, and returns an established control
|
|
// protocol connection.
|
|
//
|
|
// If Dial fails to connect using HTTP, it also tries to tunnel over TLS to the
|
|
// Dialer's Host:HTTPSPort as a compatibility fallback.
|
|
//
|
|
// The provided ctx is only used for the initial connection, until
|
|
// Dial returns. It does not affect the connection once established.
|
|
func (a *Dialer) Dial(ctx context.Context) (*ClientConn, error) {
|
|
if a.Hostname == "" {
|
|
return nil, errors.New("required Dialer.Hostname empty")
|
|
}
|
|
return a.dial(ctx)
|
|
}
|
|
|
|
func (a *Dialer) logf(format string, args ...any) {
|
|
if a.Logf != nil {
|
|
a.Logf(format, args...)
|
|
}
|
|
}
|
|
|
|
func (a *Dialer) getProxyFunc() func(*http.Request) (*url.URL, error) {
|
|
if a.proxyFunc != nil {
|
|
return a.proxyFunc
|
|
}
|
|
return tshttpproxy.ProxyFromEnvironment
|
|
}
|
|
|
|
// httpsFallbackDelay is how long we'll wait for a.HTTPPort to work before
|
|
// starting to try a.HTTPSPort.
|
|
func (a *Dialer) httpsFallbackDelay() time.Duration {
|
|
if forceNoise443() {
|
|
return time.Nanosecond
|
|
}
|
|
if v := a.testFallbackDelay; v != 0 {
|
|
return v
|
|
}
|
|
return 500 * time.Millisecond
|
|
}
|
|
|
|
var _ = envknob.RegisterBool("TS_USE_CONTROL_DIAL_PLAN") // to record at init time whether it's in use
|
|
|
|
func (a *Dialer) dial(ctx context.Context) (*ClientConn, error) {
|
|
// If we don't have a dial plan, just fall back to dialing the single
|
|
// host we know about.
|
|
useDialPlan := envknob.BoolDefaultTrue("TS_USE_CONTROL_DIAL_PLAN")
|
|
if !useDialPlan || a.DialPlan == nil || len(a.DialPlan.Candidates) == 0 {
|
|
return a.dialHost(ctx, netip.Addr{})
|
|
}
|
|
candidates := a.DialPlan.Candidates
|
|
|
|
// Otherwise, we try dialing per the plan. Store the highest priority
|
|
// in the list, so that if we get a connection to one of those
|
|
// candidates we can return quickly.
|
|
var highestPriority int = math.MinInt
|
|
for _, c := range candidates {
|
|
if c.Priority > highestPriority {
|
|
highestPriority = c.Priority
|
|
}
|
|
}
|
|
|
|
// This context allows us to cancel in-flight connections if we get a
|
|
// highest-priority connection before we're all done.
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
defer cancel()
|
|
|
|
// Now, for each candidate, kick off a dial in parallel.
|
|
type dialResult struct {
|
|
conn *ClientConn
|
|
err error
|
|
addr netip.Addr
|
|
priority int
|
|
}
|
|
resultsCh := make(chan dialResult, len(candidates))
|
|
|
|
var pending atomic.Int32
|
|
pending.Store(int32(len(candidates)))
|
|
for _, c := range candidates {
|
|
go func(ctx context.Context, c tailcfg.ControlIPCandidate) {
|
|
var (
|
|
conn *ClientConn
|
|
err error
|
|
)
|
|
|
|
// Always send results back to our channel.
|
|
defer func() {
|
|
resultsCh <- dialResult{conn, err, c.IP, c.Priority}
|
|
if pending.Add(-1) == 0 {
|
|
close(resultsCh)
|
|
}
|
|
}()
|
|
|
|
// If non-zero, wait the configured start timeout
|
|
// before we do anything.
|
|
if c.DialStartDelaySec > 0 {
|
|
a.logf("[v2] controlhttp: waiting %.2f seconds before dialing %q @ %v", c.DialStartDelaySec, a.Hostname, c.IP)
|
|
if a.Clock == nil {
|
|
a.Clock = tstime.StdClock{}
|
|
}
|
|
tmr, tmrChannel := a.Clock.NewTimer(time.Duration(c.DialStartDelaySec * float64(time.Second)))
|
|
defer tmr.Stop()
|
|
select {
|
|
case <-ctx.Done():
|
|
err = ctx.Err()
|
|
return
|
|
case <-tmrChannel:
|
|
}
|
|
}
|
|
|
|
// Now, create a sub-context with the given timeout and
|
|
// try dialing the provided host.
|
|
ctx, cancel := context.WithTimeout(ctx, time.Duration(c.DialTimeoutSec*float64(time.Second)))
|
|
defer cancel()
|
|
|
|
// This will dial, and the defer above sends it back to our parent.
|
|
a.logf("[v2] controlhttp: trying to dial %q @ %v", a.Hostname, c.IP)
|
|
conn, err = a.dialHost(ctx, c.IP)
|
|
}(ctx, c)
|
|
}
|
|
|
|
var results []dialResult
|
|
for res := range resultsCh {
|
|
// If we get a response that has the highest priority, we don't
|
|
// need to wait for any of the other connections to finish; we
|
|
// can just return this connection.
|
|
//
|
|
// TODO(andrew): we could make this better by keeping track of
|
|
// the highest remaining priority dynamically, instead of just
|
|
// checking for the highest total
|
|
if res.priority == highestPriority && res.conn != nil {
|
|
a.logf("[v1] controlhttp: high-priority success dialing %q @ %v from dial plan", a.Hostname, res.addr)
|
|
|
|
// Drain the channel and any existing connections in
|
|
// the background.
|
|
go func() {
|
|
for _, res := range results {
|
|
if res.conn != nil {
|
|
res.conn.Close()
|
|
}
|
|
}
|
|
for res := range resultsCh {
|
|
if res.conn != nil {
|
|
res.conn.Close()
|
|
}
|
|
}
|
|
if a.drainFinished != nil {
|
|
close(a.drainFinished)
|
|
}
|
|
}()
|
|
return res.conn, nil
|
|
}
|
|
|
|
// This isn't a highest-priority result, so just store it until
|
|
// we're done.
|
|
results = append(results, res)
|
|
}
|
|
|
|
// After we finish this function, close any remaining open connections.
|
|
defer func() {
|
|
for _, result := range results {
|
|
// Note: below, we nil out the returned connection (if
|
|
// any) in the slice so we don't close it.
|
|
if result.conn != nil {
|
|
result.conn.Close()
|
|
}
|
|
}
|
|
|
|
// We don't drain asynchronously after this point, so notify our
|
|
// channel when we return.
|
|
if a.drainFinished != nil {
|
|
close(a.drainFinished)
|
|
}
|
|
}()
|
|
|
|
// Sort by priority, then take the first non-error response.
|
|
sort.Slice(results, func(i, j int) bool {
|
|
// NOTE: intentionally inverted so that the highest priority
|
|
// item comes first
|
|
return results[i].priority > results[j].priority
|
|
})
|
|
|
|
var (
|
|
conn *ClientConn
|
|
errs []error
|
|
)
|
|
for i, result := range results {
|
|
if result.err != nil {
|
|
errs = append(errs, result.err)
|
|
continue
|
|
}
|
|
|
|
a.logf("[v1] controlhttp: succeeded dialing %q @ %v from dial plan", a.Hostname, result.addr)
|
|
conn = result.conn
|
|
results[i].conn = nil // so we don't close it in the defer
|
|
return conn, nil
|
|
}
|
|
merr := multierr.New(errs...)
|
|
|
|
// If we get here, then we didn't get anywhere with our dial plan; fall back to just using DNS.
|
|
a.logf("controlhttp: failed dialing using DialPlan, falling back to DNS; errs=%s", merr.Error())
|
|
return a.dialHost(ctx, netip.Addr{})
|
|
}
|
|
|
|
// The TS_FORCE_NOISE_443 envknob forces the controlclient noise dialer to
|
|
// always use port 443 HTTPS connections to the controlplane and not try the
|
|
// port 80 HTTP fast path.
|
|
//
|
|
// This is currently (2023-01-17) needed for Docker Desktop's "VPNKit" proxy
|
|
// that breaks port 80 for us post-Noise-handshake, causing us to never try port
|
|
// 443. Until one of Docker's proxy and/or this package's port 443 fallback is
|
|
// fixed, this is a workaround. It might also be useful for future debugging.
|
|
var forceNoise443 = envknob.RegisterBool("TS_FORCE_NOISE_443")
|
|
|
|
var debugNoiseDial = envknob.RegisterBool("TS_DEBUG_NOISE_DIAL")
|
|
|
|
// dialHost connects to the configured Dialer.Hostname and upgrades the
|
|
// connection into a controlbase.Conn. If addr is valid, then no DNS is used
|
|
// and the connection will be made to the provided address.
|
|
func (a *Dialer) dialHost(ctx context.Context, addr netip.Addr) (*ClientConn, error) {
|
|
// Create one shared context used by both port 80 and port 443 dials.
|
|
// If port 80 is still in flight when 443 returns, this deferred cancel
|
|
// will stop the port 80 dial.
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
defer cancel()
|
|
|
|
ctx = sockstats.WithSockStats(ctx, sockstats.LabelControlClientDialer, a.logf)
|
|
|
|
// u80 and u443 are the URLs we'll try to hit over HTTP or HTTPS,
|
|
// respectively, in order to do the HTTP upgrade to a net.Conn over which
|
|
// we'll speak Noise.
|
|
u80 := &url.URL{
|
|
Scheme: "http",
|
|
Host: net.JoinHostPort(a.Hostname, strDef(a.HTTPPort, "80")),
|
|
Path: serverUpgradePath,
|
|
}
|
|
u443 := &url.URL{
|
|
Scheme: "https",
|
|
Host: net.JoinHostPort(a.Hostname, strDef(a.HTTPSPort, "443")),
|
|
Path: serverUpgradePath,
|
|
}
|
|
|
|
type tryURLRes struct {
|
|
u *url.URL // input (the URL conn+err are for/from)
|
|
conn *ClientConn // result (mutually exclusive with err)
|
|
err error
|
|
}
|
|
ch := make(chan tryURLRes) // must be unbuffered
|
|
try := func(u *url.URL) {
|
|
if debugNoiseDial() {
|
|
a.logf("trying noise dial (%v, %v) ...", u, addr)
|
|
}
|
|
cbConn, err := a.dialURL(ctx, u, addr)
|
|
if debugNoiseDial() {
|
|
a.logf("noise dial (%v, %v) = (%v, %v)", u, addr, cbConn, err)
|
|
}
|
|
select {
|
|
case ch <- tryURLRes{u, cbConn, err}:
|
|
case <-ctx.Done():
|
|
if cbConn != nil {
|
|
cbConn.Close()
|
|
}
|
|
}
|
|
}
|
|
|
|
// Start the plaintext HTTP attempt first, unless disabled by the envknob.
|
|
if !forceNoise443() {
|
|
go try(u80)
|
|
}
|
|
|
|
// In case outbound port 80 blocked or MITM'ed poorly, start a backup timer
|
|
// to dial port 443 if port 80 doesn't either succeed or fail quickly.
|
|
if a.Clock == nil {
|
|
a.Clock = tstime.StdClock{}
|
|
}
|
|
try443Timer := a.Clock.AfterFunc(a.httpsFallbackDelay(), func() { try(u443) })
|
|
defer try443Timer.Stop()
|
|
|
|
var err80, err443 error
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil, fmt.Errorf("connection attempts aborted by context: %w", ctx.Err())
|
|
case res := <-ch:
|
|
if res.err == nil {
|
|
return res.conn, nil
|
|
}
|
|
switch res.u {
|
|
case u80:
|
|
// Connecting over plain HTTP failed; assume it's an HTTP proxy
|
|
// being difficult and see if we can get through over HTTPS.
|
|
err80 = res.err
|
|
// Stop the fallback timer and run it immediately. We don't use
|
|
// Timer.Reset(0) here because on AfterFuncs, that can run it
|
|
// again.
|
|
if try443Timer.Stop() {
|
|
go try(u443)
|
|
} // else we lost the race and it started already which is what we want
|
|
case u443:
|
|
err443 = res.err
|
|
default:
|
|
panic("invalid")
|
|
}
|
|
if err80 != nil && err443 != nil {
|
|
return nil, fmt.Errorf("all connection attempts failed (HTTP: %v, HTTPS: %v)", err80, err443)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// dialURL attempts to connect to the given URL.
|
|
func (a *Dialer) dialURL(ctx context.Context, u *url.URL, addr netip.Addr) (*ClientConn, error) {
|
|
init, cont, err := controlbase.ClientDeferred(a.MachineKey, a.ControlKey, a.ProtocolVersion)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
netConn, err := a.tryURLUpgrade(ctx, u, addr, init)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
cbConn, err := cont(ctx, netConn)
|
|
if err != nil {
|
|
netConn.Close()
|
|
return nil, err
|
|
}
|
|
return &ClientConn{
|
|
Conn: cbConn,
|
|
}, nil
|
|
}
|
|
|
|
// resolver returns a.DNSCache if non-nil or a new *dnscache.Resolver
|
|
// otherwise.
|
|
func (a *Dialer) resolver() *dnscache.Resolver {
|
|
if a.DNSCache != nil {
|
|
return a.DNSCache
|
|
}
|
|
|
|
return &dnscache.Resolver{
|
|
Forward: dnscache.Get().Forward,
|
|
LookupIPFallback: dnsfallback.MakeLookupFunc(a.logf, a.NetMon),
|
|
UseLastGood: true,
|
|
Logf: a.Logf, // not a.logf method; we want to propagate nil-ness
|
|
NetMon: a.NetMon,
|
|
}
|
|
}
|
|
|
|
// tryURLUpgrade connects to u, and tries to upgrade it to a net.Conn. If addr
|
|
// is valid, then no DNS is used and the connection will be made to the
|
|
// provided address.
|
|
//
|
|
// Only the provided ctx is used, not a.ctx.
|
|
func (a *Dialer) tryURLUpgrade(ctx context.Context, u *url.URL, addr netip.Addr, init []byte) (net.Conn, error) {
|
|
var dns *dnscache.Resolver
|
|
|
|
// If we were provided an address to dial, then create a resolver that just
|
|
// returns that value; otherwise, fall back to DNS.
|
|
if addr.IsValid() {
|
|
dns = &dnscache.Resolver{
|
|
SingleHostStaticResult: []netip.Addr{addr},
|
|
SingleHost: u.Hostname(),
|
|
Logf: a.Logf, // not a.logf method; we want to propagate nil-ness
|
|
NetMon: a.NetMon,
|
|
}
|
|
} else {
|
|
dns = a.resolver()
|
|
}
|
|
|
|
var dialer dnscache.DialContextFunc
|
|
if a.Dialer != nil {
|
|
dialer = a.Dialer
|
|
} else {
|
|
dialer = stdDialer.DialContext
|
|
}
|
|
|
|
tr := http.DefaultTransport.(*http.Transport).Clone()
|
|
defer tr.CloseIdleConnections()
|
|
tr.Proxy = a.getProxyFunc()
|
|
tshttpproxy.SetTransportGetProxyConnectHeader(tr)
|
|
tr.DialContext = dnscache.Dialer(dialer, dns)
|
|
// Disable HTTP2, since h2 can't do protocol switching.
|
|
tr.TLSClientConfig.NextProtos = []string{}
|
|
tr.TLSNextProto = map[string]func(string, *tls.Conn) http.RoundTripper{}
|
|
tr.TLSClientConfig = tlsdial.Config(a.Hostname, tr.TLSClientConfig)
|
|
if !tr.TLSClientConfig.InsecureSkipVerify {
|
|
panic("unexpected") // should be set by tlsdial.Config
|
|
}
|
|
verify := tr.TLSClientConfig.VerifyConnection
|
|
if verify == nil {
|
|
panic("unexpected") // should be set by tlsdial.Config
|
|
}
|
|
// Demote all cert verification errors to log messages. We don't actually
|
|
// care about the TLS security (because we just do the Noise crypto atop whatever
|
|
// connection we get, including HTTP port 80 plaintext) so this permits
|
|
// middleboxes to MITM their users. All they'll see is some Noise.
|
|
tr.TLSClientConfig.VerifyConnection = func(cs tls.ConnectionState) error {
|
|
if err := verify(cs); err != nil && a.Logf != nil && !a.omitCertErrorLogging {
|
|
a.Logf("warning: TLS cert verificication for %q failed: %v", a.Hostname, err)
|
|
}
|
|
return nil // regardless
|
|
}
|
|
|
|
tr.DialTLSContext = dnscache.TLSDialer(dialer, dns, tr.TLSClientConfig)
|
|
tr.DisableCompression = true
|
|
|
|
// (mis)use httptrace to extract the underlying net.Conn from the
|
|
// transport. We make exactly 1 request using this transport, so
|
|
// there will be exactly 1 GotConn call. Additionally, the
|
|
// transport handles 101 Switching Protocols correctly, such that
|
|
// the Conn will not be reused or kept alive by the transport once
|
|
// the response has been handed back from RoundTrip.
|
|
//
|
|
// In theory, the machinery of net/http should make it such that
|
|
// the trace callback happens-before we get the response, but
|
|
// there's no promise of that. So, to make sure, we use a buffered
|
|
// channel as a synchronization step to avoid data races.
|
|
//
|
|
// Note that even though we're able to extract a net.Conn via this
|
|
// mechanism, we must still keep using the eventual resp.Body to
|
|
// read from, because it includes a buffer we can't get rid of. If
|
|
// the server never sends any data after sending the HTTP
|
|
// response, we could get away with it, but violating this
|
|
// assumption leads to very mysterious transport errors (lockups,
|
|
// unexpected EOFs...), and we're bound to forget someday and
|
|
// introduce a protocol optimization at a higher level that starts
|
|
// eagerly transmitting from the server.
|
|
connCh := make(chan net.Conn, 1)
|
|
trace := httptrace.ClientTrace{
|
|
GotConn: func(info httptrace.GotConnInfo) {
|
|
connCh <- info.Conn
|
|
},
|
|
}
|
|
ctx = httptrace.WithClientTrace(ctx, &trace)
|
|
req := &http.Request{
|
|
Method: "POST",
|
|
URL: u,
|
|
Header: http.Header{
|
|
"Upgrade": []string{upgradeHeaderValue},
|
|
"Connection": []string{"upgrade"},
|
|
handshakeHeaderName: []string{base64.StdEncoding.EncodeToString(init)},
|
|
},
|
|
}
|
|
req = req.WithContext(ctx)
|
|
|
|
resp, err := tr.RoundTrip(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusSwitchingProtocols {
|
|
return nil, fmt.Errorf("unexpected HTTP response: %s", resp.Status)
|
|
}
|
|
|
|
// From here on, the underlying net.Conn is ours to use, but there
|
|
// is still a read buffer attached to it within resp.Body. So, we
|
|
// must direct I/O through resp.Body, but we can still use the
|
|
// underlying net.Conn for stuff like deadlines.
|
|
var switchedConn net.Conn
|
|
select {
|
|
case switchedConn = <-connCh:
|
|
default:
|
|
}
|
|
if switchedConn == nil {
|
|
resp.Body.Close()
|
|
return nil, fmt.Errorf("httptrace didn't provide a connection")
|
|
}
|
|
|
|
if next := resp.Header.Get("Upgrade"); next != upgradeHeaderValue {
|
|
resp.Body.Close()
|
|
return nil, fmt.Errorf("server switched to unexpected protocol %q", next)
|
|
}
|
|
|
|
rwc, ok := resp.Body.(io.ReadWriteCloser)
|
|
if !ok {
|
|
resp.Body.Close()
|
|
return nil, errors.New("http Transport did not provide a writable body")
|
|
}
|
|
|
|
return netutil.NewAltReadWriteCloserConn(rwc, switchedConn), nil
|
|
}
|