mirror of
https://github.com/tailscale/tailscale.git
synced 2024-12-01 22:15:51 +00:00
7330aa593e
On some platforms (notably macOS and iOS) we look up the default interface to bind outgoing connections to. This is both duplicated work and results in logspam when the default interface is not available (i.e. when a phone has no connectivity, we log an error and thus cause more things that we will try to upload and fail). Fixed by passing around a netmon.Monitor to more places, so that we can use its cached interface state. Fixes #7850 Updates #7621 Signed-off-by: Mihai Parparita <mihai@tailscale.com>
516 lines
16 KiB
Go
516 lines
16 KiB
Go
// Copyright (c) Tailscale Inc & AUTHORS
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
//go:build !js
|
|
|
|
// Package controlhttp implements the Tailscale 2021 control protocol
|
|
// base transport over HTTP.
|
|
//
|
|
// This tunnels the protocol in control/controlbase over HTTP with a
|
|
// variety of compatibility fallbacks for handling picky or deep
|
|
// inspecting proxies.
|
|
//
|
|
// In the happy path, a client makes a single cleartext HTTP request
|
|
// to the server, the server responds with 101 Switching Protocols,
|
|
// and the control base protocol takes place over plain TCP.
|
|
//
|
|
// In the compatibility path, the client does the above over HTTPS,
|
|
// resulting in double encryption (once for the control transport, and
|
|
// once for the outer TLS layer).
|
|
package controlhttp
|
|
|
|
import (
|
|
"context"
|
|
"crypto/tls"
|
|
"encoding/base64"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"math"
|
|
"net"
|
|
"net/http"
|
|
"net/http/httptrace"
|
|
"net/netip"
|
|
"net/url"
|
|
"sort"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"tailscale.com/control/controlbase"
|
|
"tailscale.com/envknob"
|
|
"tailscale.com/net/dnscache"
|
|
"tailscale.com/net/dnsfallback"
|
|
"tailscale.com/net/netutil"
|
|
"tailscale.com/net/sockstats"
|
|
"tailscale.com/net/tlsdial"
|
|
"tailscale.com/net/tshttpproxy"
|
|
"tailscale.com/tailcfg"
|
|
"tailscale.com/util/multierr"
|
|
)
|
|
|
|
var stdDialer net.Dialer
|
|
|
|
// Dial connects to the HTTP server at this Dialer's Host:HTTPPort, requests to
|
|
// switch to the Tailscale control protocol, and returns an established control
|
|
// protocol connection.
|
|
//
|
|
// If Dial fails to connect using HTTP, it also tries to tunnel over TLS to the
|
|
// Dialer's Host:HTTPSPort as a compatibility fallback.
|
|
//
|
|
// The provided ctx is only used for the initial connection, until
|
|
// Dial returns. It does not affect the connection once established.
|
|
func (a *Dialer) Dial(ctx context.Context) (*ClientConn, error) {
|
|
if a.Hostname == "" {
|
|
return nil, errors.New("required Dialer.Hostname empty")
|
|
}
|
|
return a.dial(ctx)
|
|
}
|
|
|
|
func (a *Dialer) logf(format string, args ...any) {
|
|
if a.Logf != nil {
|
|
a.Logf(format, args...)
|
|
}
|
|
}
|
|
|
|
func (a *Dialer) getProxyFunc() func(*http.Request) (*url.URL, error) {
|
|
if a.proxyFunc != nil {
|
|
return a.proxyFunc
|
|
}
|
|
return tshttpproxy.ProxyFromEnvironment
|
|
}
|
|
|
|
// httpsFallbackDelay is how long we'll wait for a.HTTPPort to work before
|
|
// starting to try a.HTTPSPort.
|
|
func (a *Dialer) httpsFallbackDelay() time.Duration {
|
|
if forceNoise443() {
|
|
return time.Nanosecond
|
|
}
|
|
if v := a.testFallbackDelay; v != 0 {
|
|
return v
|
|
}
|
|
return 500 * time.Millisecond
|
|
}
|
|
|
|
var _ = envknob.RegisterBool("TS_USE_CONTROL_DIAL_PLAN") // to record at init time whether it's in use
|
|
|
|
func (a *Dialer) dial(ctx context.Context) (*ClientConn, error) {
|
|
// If we don't have a dial plan, just fall back to dialing the single
|
|
// host we know about.
|
|
useDialPlan := envknob.BoolDefaultTrue("TS_USE_CONTROL_DIAL_PLAN")
|
|
if !useDialPlan || a.DialPlan == nil || len(a.DialPlan.Candidates) == 0 {
|
|
return a.dialHost(ctx, netip.Addr{})
|
|
}
|
|
candidates := a.DialPlan.Candidates
|
|
|
|
// Otherwise, we try dialing per the plan. Store the highest priority
|
|
// in the list, so that if we get a connection to one of those
|
|
// candidates we can return quickly.
|
|
var highestPriority int = math.MinInt
|
|
for _, c := range candidates {
|
|
if c.Priority > highestPriority {
|
|
highestPriority = c.Priority
|
|
}
|
|
}
|
|
|
|
// This context allows us to cancel in-flight connections if we get a
|
|
// highest-priority connection before we're all done.
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
defer cancel()
|
|
|
|
// Now, for each candidate, kick off a dial in parallel.
|
|
type dialResult struct {
|
|
conn *ClientConn
|
|
err error
|
|
addr netip.Addr
|
|
priority int
|
|
}
|
|
resultsCh := make(chan dialResult, len(candidates))
|
|
|
|
var pending atomic.Int32
|
|
pending.Store(int32(len(candidates)))
|
|
for _, c := range candidates {
|
|
go func(ctx context.Context, c tailcfg.ControlIPCandidate) {
|
|
var (
|
|
conn *ClientConn
|
|
err error
|
|
)
|
|
|
|
// Always send results back to our channel.
|
|
defer func() {
|
|
resultsCh <- dialResult{conn, err, c.IP, c.Priority}
|
|
if pending.Add(-1) == 0 {
|
|
close(resultsCh)
|
|
}
|
|
}()
|
|
|
|
// If non-zero, wait the configured start timeout
|
|
// before we do anything.
|
|
if c.DialStartDelaySec > 0 {
|
|
a.logf("[v2] controlhttp: waiting %.2f seconds before dialing %q @ %v", c.DialStartDelaySec, a.Hostname, c.IP)
|
|
tmr := time.NewTimer(time.Duration(c.DialStartDelaySec * float64(time.Second)))
|
|
defer tmr.Stop()
|
|
select {
|
|
case <-ctx.Done():
|
|
err = ctx.Err()
|
|
return
|
|
case <-tmr.C:
|
|
}
|
|
}
|
|
|
|
// Now, create a sub-context with the given timeout and
|
|
// try dialing the provided host.
|
|
ctx, cancel := context.WithTimeout(ctx, time.Duration(c.DialTimeoutSec*float64(time.Second)))
|
|
defer cancel()
|
|
|
|
// This will dial, and the defer above sends it back to our parent.
|
|
a.logf("[v2] controlhttp: trying to dial %q @ %v", a.Hostname, c.IP)
|
|
conn, err = a.dialHost(ctx, c.IP)
|
|
}(ctx, c)
|
|
}
|
|
|
|
var results []dialResult
|
|
for res := range resultsCh {
|
|
// If we get a response that has the highest priority, we don't
|
|
// need to wait for any of the other connections to finish; we
|
|
// can just return this connection.
|
|
//
|
|
// TODO(andrew): we could make this better by keeping track of
|
|
// the highest remaining priority dynamically, instead of just
|
|
// checking for the highest total
|
|
if res.priority == highestPriority && res.conn != nil {
|
|
a.logf("[v1] controlhttp: high-priority success dialing %q @ %v from dial plan", a.Hostname, res.addr)
|
|
|
|
// Drain the channel and any existing connections in
|
|
// the background.
|
|
go func() {
|
|
for _, res := range results {
|
|
if res.conn != nil {
|
|
res.conn.Close()
|
|
}
|
|
}
|
|
for res := range resultsCh {
|
|
if res.conn != nil {
|
|
res.conn.Close()
|
|
}
|
|
}
|
|
if a.drainFinished != nil {
|
|
close(a.drainFinished)
|
|
}
|
|
}()
|
|
return res.conn, nil
|
|
}
|
|
|
|
// This isn't a highest-priority result, so just store it until
|
|
// we're done.
|
|
results = append(results, res)
|
|
}
|
|
|
|
// After we finish this function, close any remaining open connections.
|
|
defer func() {
|
|
for _, result := range results {
|
|
// Note: below, we nil out the returned connection (if
|
|
// any) in the slice so we don't close it.
|
|
if result.conn != nil {
|
|
result.conn.Close()
|
|
}
|
|
}
|
|
|
|
// We don't drain asynchronously after this point, so notify our
|
|
// channel when we return.
|
|
if a.drainFinished != nil {
|
|
close(a.drainFinished)
|
|
}
|
|
}()
|
|
|
|
// Sort by priority, then take the first non-error response.
|
|
sort.Slice(results, func(i, j int) bool {
|
|
// NOTE: intentionally inverted so that the highest priority
|
|
// item comes first
|
|
return results[i].priority > results[j].priority
|
|
})
|
|
|
|
var (
|
|
conn *ClientConn
|
|
errs []error
|
|
)
|
|
for i, result := range results {
|
|
if result.err != nil {
|
|
errs = append(errs, result.err)
|
|
continue
|
|
}
|
|
|
|
a.logf("[v1] controlhttp: succeeded dialing %q @ %v from dial plan", a.Hostname, result.addr)
|
|
conn = result.conn
|
|
results[i].conn = nil // so we don't close it in the defer
|
|
return conn, nil
|
|
}
|
|
merr := multierr.New(errs...)
|
|
|
|
// If we get here, then we didn't get anywhere with our dial plan; fall back to just using DNS.
|
|
a.logf("controlhttp: failed dialing using DialPlan, falling back to DNS; errs=%s", merr.Error())
|
|
return a.dialHost(ctx, netip.Addr{})
|
|
}
|
|
|
|
// The TS_FORCE_NOISE_443 envknob forces the controlclient noise dialer to
|
|
// always use port 443 HTTPS connections to the controlplane and not try the
|
|
// port 80 HTTP fast path.
|
|
//
|
|
// This is currently (2023-01-17) needed for Docker Desktop's "VPNKit" proxy
|
|
// that breaks port 80 for us post-Noise-handshake, causing us to never try port
|
|
// 443. Until one of Docker's proxy and/or this package's port 443 fallback is
|
|
// fixed, this is a workaround. It might also be useful for future debugging.
|
|
var forceNoise443 = envknob.RegisterBool("TS_FORCE_NOISE_443")
|
|
|
|
var debugNoiseDial = envknob.RegisterBool("TS_DEBUG_NOISE_DIAL")
|
|
|
|
// dialHost connects to the configured Dialer.Hostname and upgrades the
|
|
// connection into a controlbase.Conn. If addr is valid, then no DNS is used
|
|
// and the connection will be made to the provided address.
|
|
func (a *Dialer) dialHost(ctx context.Context, addr netip.Addr) (*ClientConn, error) {
|
|
// Create one shared context used by both port 80 and port 443 dials.
|
|
// If port 80 is still in flight when 443 returns, this deferred cancel
|
|
// will stop the port 80 dial.
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
defer cancel()
|
|
|
|
ctx = sockstats.WithSockStats(ctx, sockstats.LabelControlClientDialer, a.logf)
|
|
|
|
// u80 and u443 are the URLs we'll try to hit over HTTP or HTTPS,
|
|
// respectively, in order to do the HTTP upgrade to a net.Conn over which
|
|
// we'll speak Noise.
|
|
u80 := &url.URL{
|
|
Scheme: "http",
|
|
Host: net.JoinHostPort(a.Hostname, strDef(a.HTTPPort, "80")),
|
|
Path: serverUpgradePath,
|
|
}
|
|
u443 := &url.URL{
|
|
Scheme: "https",
|
|
Host: net.JoinHostPort(a.Hostname, strDef(a.HTTPSPort, "443")),
|
|
Path: serverUpgradePath,
|
|
}
|
|
|
|
type tryURLRes struct {
|
|
u *url.URL // input (the URL conn+err are for/from)
|
|
conn *ClientConn // result (mutually exclusive with err)
|
|
err error
|
|
}
|
|
ch := make(chan tryURLRes) // must be unbuffered
|
|
try := func(u *url.URL) {
|
|
if debugNoiseDial() {
|
|
a.logf("trying noise dial (%v, %v) ...", u, addr)
|
|
}
|
|
cbConn, err := a.dialURL(ctx, u, addr)
|
|
if debugNoiseDial() {
|
|
a.logf("noise dial (%v, %v) = (%v, %v)", u, addr, cbConn, err)
|
|
}
|
|
select {
|
|
case ch <- tryURLRes{u, cbConn, err}:
|
|
case <-ctx.Done():
|
|
if cbConn != nil {
|
|
cbConn.Close()
|
|
}
|
|
}
|
|
}
|
|
|
|
// Start the plaintext HTTP attempt first, unless disabled by the envknob.
|
|
if !forceNoise443() {
|
|
go try(u80)
|
|
}
|
|
|
|
// In case outbound port 80 blocked or MITM'ed poorly, start a backup timer
|
|
// to dial port 443 if port 80 doesn't either succeed or fail quickly.
|
|
try443Timer := time.AfterFunc(a.httpsFallbackDelay(), func() { try(u443) })
|
|
defer try443Timer.Stop()
|
|
|
|
var err80, err443 error
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil, fmt.Errorf("connection attempts aborted by context: %w", ctx.Err())
|
|
case res := <-ch:
|
|
if res.err == nil {
|
|
return res.conn, nil
|
|
}
|
|
switch res.u {
|
|
case u80:
|
|
// Connecting over plain HTTP failed; assume it's an HTTP proxy
|
|
// being difficult and see if we can get through over HTTPS.
|
|
err80 = res.err
|
|
// Stop the fallback timer and run it immediately. We don't use
|
|
// Timer.Reset(0) here because on AfterFuncs, that can run it
|
|
// again.
|
|
if try443Timer.Stop() {
|
|
go try(u443)
|
|
} // else we lost the race and it started already which is what we want
|
|
case u443:
|
|
err443 = res.err
|
|
default:
|
|
panic("invalid")
|
|
}
|
|
if err80 != nil && err443 != nil {
|
|
return nil, fmt.Errorf("all connection attempts failed (HTTP: %v, HTTPS: %v)", err80, err443)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// dialURL attempts to connect to the given URL.
|
|
func (a *Dialer) dialURL(ctx context.Context, u *url.URL, addr netip.Addr) (*ClientConn, error) {
|
|
init, cont, err := controlbase.ClientDeferred(a.MachineKey, a.ControlKey, a.ProtocolVersion)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
netConn, err := a.tryURLUpgrade(ctx, u, addr, init)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
cbConn, err := cont(ctx, netConn)
|
|
if err != nil {
|
|
netConn.Close()
|
|
return nil, err
|
|
}
|
|
return &ClientConn{
|
|
Conn: cbConn,
|
|
}, nil
|
|
}
|
|
|
|
// tryURLUpgrade connects to u, and tries to upgrade it to a net.Conn. If addr
|
|
// is valid, then no DNS is used and the connection will be made to the
|
|
// provided address.
|
|
//
|
|
// Only the provided ctx is used, not a.ctx.
|
|
func (a *Dialer) tryURLUpgrade(ctx context.Context, u *url.URL, addr netip.Addr, init []byte) (net.Conn, error) {
|
|
var dns *dnscache.Resolver
|
|
|
|
// If we were provided an address to dial, then create a resolver that just
|
|
// returns that value; otherwise, fall back to DNS.
|
|
if addr.IsValid() {
|
|
dns = &dnscache.Resolver{
|
|
SingleHostStaticResult: []netip.Addr{addr},
|
|
SingleHost: u.Hostname(),
|
|
Logf: a.Logf, // not a.logf method; we want to propagate nil-ness
|
|
NetMon: a.NetMon,
|
|
}
|
|
} else {
|
|
dns = &dnscache.Resolver{
|
|
Forward: dnscache.Get().Forward,
|
|
LookupIPFallback: dnsfallback.MakeLookupFunc(a.logf, a.NetMon),
|
|
UseLastGood: true,
|
|
Logf: a.Logf, // not a.logf method; we want to propagate nil-ness
|
|
NetMon: a.NetMon,
|
|
}
|
|
}
|
|
|
|
var dialer dnscache.DialContextFunc
|
|
if a.Dialer != nil {
|
|
dialer = a.Dialer
|
|
} else {
|
|
dialer = stdDialer.DialContext
|
|
}
|
|
|
|
tr := http.DefaultTransport.(*http.Transport).Clone()
|
|
defer tr.CloseIdleConnections()
|
|
tr.Proxy = a.getProxyFunc()
|
|
tshttpproxy.SetTransportGetProxyConnectHeader(tr)
|
|
tr.DialContext = dnscache.Dialer(dialer, dns)
|
|
// Disable HTTP2, since h2 can't do protocol switching.
|
|
tr.TLSClientConfig.NextProtos = []string{}
|
|
tr.TLSNextProto = map[string]func(string, *tls.Conn) http.RoundTripper{}
|
|
tr.TLSClientConfig = tlsdial.Config(a.Hostname, tr.TLSClientConfig)
|
|
if !tr.TLSClientConfig.InsecureSkipVerify {
|
|
panic("unexpected") // should be set by tlsdial.Config
|
|
}
|
|
verify := tr.TLSClientConfig.VerifyConnection
|
|
if verify == nil {
|
|
panic("unexpected") // should be set by tlsdial.Config
|
|
}
|
|
// Demote all cert verification errors to log messages. We don't actually
|
|
// care about the TLS security (because we just do the Noise crypto atop whatever
|
|
// connection we get, including HTTP port 80 plaintext) so this permits
|
|
// middleboxes to MITM their users. All they'll see is some Noise.
|
|
tr.TLSClientConfig.VerifyConnection = func(cs tls.ConnectionState) error {
|
|
if err := verify(cs); err != nil && a.Logf != nil && !a.omitCertErrorLogging {
|
|
a.Logf("warning: TLS cert verificication for %q failed: %v", a.Hostname, err)
|
|
}
|
|
return nil // regardless
|
|
}
|
|
|
|
tr.DialTLSContext = dnscache.TLSDialer(dialer, dns, tr.TLSClientConfig)
|
|
tr.DisableCompression = true
|
|
|
|
// (mis)use httptrace to extract the underlying net.Conn from the
|
|
// transport. We make exactly 1 request using this transport, so
|
|
// there will be exactly 1 GotConn call. Additionally, the
|
|
// transport handles 101 Switching Protocols correctly, such that
|
|
// the Conn will not be reused or kept alive by the transport once
|
|
// the response has been handed back from RoundTrip.
|
|
//
|
|
// In theory, the machinery of net/http should make it such that
|
|
// the trace callback happens-before we get the response, but
|
|
// there's no promise of that. So, to make sure, we use a buffered
|
|
// channel as a synchronization step to avoid data races.
|
|
//
|
|
// Note that even though we're able to extract a net.Conn via this
|
|
// mechanism, we must still keep using the eventual resp.Body to
|
|
// read from, because it includes a buffer we can't get rid of. If
|
|
// the server never sends any data after sending the HTTP
|
|
// response, we could get away with it, but violating this
|
|
// assumption leads to very mysterious transport errors (lockups,
|
|
// unexpected EOFs...), and we're bound to forget someday and
|
|
// introduce a protocol optimization at a higher level that starts
|
|
// eagerly transmitting from the server.
|
|
connCh := make(chan net.Conn, 1)
|
|
trace := httptrace.ClientTrace{
|
|
GotConn: func(info httptrace.GotConnInfo) {
|
|
connCh <- info.Conn
|
|
},
|
|
}
|
|
ctx = httptrace.WithClientTrace(ctx, &trace)
|
|
req := &http.Request{
|
|
Method: "POST",
|
|
URL: u,
|
|
Header: http.Header{
|
|
"Upgrade": []string{upgradeHeaderValue},
|
|
"Connection": []string{"upgrade"},
|
|
handshakeHeaderName: []string{base64.StdEncoding.EncodeToString(init)},
|
|
},
|
|
}
|
|
req = req.WithContext(ctx)
|
|
|
|
resp, err := tr.RoundTrip(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if resp.StatusCode != http.StatusSwitchingProtocols {
|
|
return nil, fmt.Errorf("unexpected HTTP response: %s", resp.Status)
|
|
}
|
|
|
|
// From here on, the underlying net.Conn is ours to use, but there
|
|
// is still a read buffer attached to it within resp.Body. So, we
|
|
// must direct I/O through resp.Body, but we can still use the
|
|
// underlying net.Conn for stuff like deadlines.
|
|
var switchedConn net.Conn
|
|
select {
|
|
case switchedConn = <-connCh:
|
|
default:
|
|
}
|
|
if switchedConn == nil {
|
|
resp.Body.Close()
|
|
return nil, fmt.Errorf("httptrace didn't provide a connection")
|
|
}
|
|
|
|
if next := resp.Header.Get("Upgrade"); next != upgradeHeaderValue {
|
|
resp.Body.Close()
|
|
return nil, fmt.Errorf("server switched to unexpected protocol %q", next)
|
|
}
|
|
|
|
rwc, ok := resp.Body.(io.ReadWriteCloser)
|
|
if !ok {
|
|
resp.Body.Close()
|
|
return nil, errors.New("http Transport did not provide a writable body")
|
|
}
|
|
|
|
return netutil.NewAltReadWriteCloserConn(rwc, switchedConn), nil
|
|
}
|