tailscale/derp/derphttp/derphttp_client.go
Brad Fitzpatrick e6b84f2159 all: make client use server-provided DERP map, add DERP region support
Instead of hard-coding the DERP map (except for cmd/tailscale netcheck
for now), get it from the control server at runtime.

And make the DERP map support multiple nodes per region with clients
picking the first one that's available. (The server will balance the
order presented to clients for load balancing)

This deletes the stunner package, merging it into the netcheck package
instead, to minimize all the config hooks that would've been
required.

Also fix some test flakes & races.

Fixes #387 (Don't hard-code the DERP map)
Updates #388 (Add DERP region support)
Fixes #399 (wgengine: flaky tests)

Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
2020-05-23 22:31:59 -07:00

524 lines
13 KiB
Go

// Copyright (c) 2020 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package derphttp implements DERP-over-HTTP.
//
// This makes DERP look exactly like WebSockets.
// A server can implement DERP over HTTPS and even if the TLS connection
// intercepted using a fake root CA, unless the interceptor knows how to
// detect DERP packets, it will look like a web socket.
package derphttp
import (
"bufio"
"context"
"crypto/tls"
"errors"
"fmt"
"io"
"io/ioutil"
"net"
"net/http"
"net/url"
"sync"
"time"
"inet.af/netaddr"
"tailscale.com/derp"
"tailscale.com/net/dnscache"
"tailscale.com/net/tlsdial"
"tailscale.com/tailcfg"
"tailscale.com/types/key"
"tailscale.com/types/logger"
)
// Client is a DERP-over-HTTP client.
//
// It automatically reconnects on error retry. That is, a failed Send or
// Recv will report the error and not retry, but subsequent calls to
// Send/Recv will completely re-establish the connection (unless Close
// has been called).
type Client struct {
TLSConfig *tls.Config // optional; nil means default
DNSCache *dnscache.Resolver // optional; nil means no caching
privateKey key.Private
logf logger.Logf
// Either url or getRegion is non-nil:
url *url.URL
getRegion func() *tailcfg.DERPRegion
ctx context.Context // closed via cancelCtx in Client.Close
cancelCtx context.CancelFunc
mu sync.Mutex
preferred bool
closed bool
netConn io.Closer
client *derp.Client
}
// NewRegionClient returns a new DERP-over-HTTP client. It connects lazily.
// To trigger a connection, use Connect.
func NewRegionClient(privateKey key.Private, logf logger.Logf, getRegion func() *tailcfg.DERPRegion) *Client {
ctx, cancel := context.WithCancel(context.Background())
c := &Client{
privateKey: privateKey,
logf: logf,
getRegion: getRegion,
ctx: ctx,
cancelCtx: cancel,
}
return c
}
// NewClient returns a new DERP-over-HTTP client. It connects lazily.
// To trigger a connection, use Connect.
func NewClient(privateKey key.Private, serverURL string, logf logger.Logf) (*Client, error) {
u, err := url.Parse(serverURL)
if err != nil {
return nil, fmt.Errorf("derphttp.NewClient: %v", err)
}
if urlPort(u) == "" {
return nil, fmt.Errorf("derphttp.NewClient: invalid URL scheme %q", u.Scheme)
}
ctx, cancel := context.WithCancel(context.Background())
c := &Client{
privateKey: privateKey,
logf: logf,
url: u,
ctx: ctx,
cancelCtx: cancel,
}
return c, nil
}
type dialer interface {
Dial(network, address string) (net.Conn, error)
DialContext(ctx context.Context, network, address string) (net.Conn, error)
}
// Connect connects or reconnects to the server, unless already connected.
// It returns nil if there was already a good connection, or if one was made.
func (c *Client) Connect(ctx context.Context) error {
_, err := c.connect(ctx, "derphttp.Client.Connect")
return err
}
func urlPort(u *url.URL) string {
if p := u.Port(); p != "" {
return p
}
switch u.Scheme {
case "https":
return "443"
case "http":
return "80"
}
return ""
}
func (c *Client) targetString(reg *tailcfg.DERPRegion) string {
if c.url != nil {
return c.url.String()
}
return fmt.Sprintf("region %d (%v)", reg.RegionID, reg.RegionCode)
}
func (c *Client) useHTTPS() bool {
if c.url != nil && c.url.Scheme == "http" {
return false
}
return true
}
func (c *Client) tlsServerName(node *tailcfg.DERPNode) string {
if c.url != nil {
return c.url.Host
}
if node.CertName != "" {
return node.CertName
}
return node.HostName
}
func (c *Client) urlString(node *tailcfg.DERPNode) string {
if c.url != nil {
return c.url.String()
}
return fmt.Sprintf("https://%s/derp", node.HostName)
}
func (c *Client) connect(ctx context.Context, caller string) (client *derp.Client, err error) {
c.mu.Lock()
defer c.mu.Unlock()
if c.closed {
return nil, ErrClientClosed
}
if c.client != nil {
return c.client, nil
}
// timeout is the fallback maximum time (if ctx doesn't limit
// it further) to do all of: DNS + TCP + TLS + HTTP Upgrade +
// DERP upgrade.
const timeout = 10 * time.Second
ctx, cancel := context.WithTimeout(ctx, timeout)
go func() {
select {
case <-ctx.Done():
// Either timeout fired (handled below), or
// we're returning via the defer cancel()
// below.
case <-c.ctx.Done():
// Propagate a Client.Close call into
// cancelling this context.
cancel()
}
}()
defer cancel()
var reg *tailcfg.DERPRegion // nil when using c.url to dial
if c.getRegion != nil {
reg = c.getRegion()
if reg == nil {
return nil, errors.New("DERP region not available")
}
}
var tcpConn net.Conn
defer func() {
if err != nil {
if ctx.Err() != nil {
err = fmt.Errorf("%v: %v", ctx.Err(), err)
}
err = fmt.Errorf("%s connect to %v: %v", caller, c.targetString(reg), err)
if tcpConn != nil {
go tcpConn.Close()
}
}
}()
var node *tailcfg.DERPNode // nil when using c.url to dial
if c.url != nil {
c.logf("%s: connecting to %v", caller, c.url)
tcpConn, err = c.dialURL(ctx)
} else {
c.logf("%s: connecting to derp-%d (%v)", caller, reg.RegionID, reg.RegionCode)
tcpConn, node, err = c.dialRegion(ctx, reg)
}
if err != nil {
return nil, err
}
// Now that we have a TCP connection, force close it if the
// TLS handshake + DERP setup takes too long.
done := make(chan struct{})
defer close(done)
go func() {
select {
case <-done:
// Normal path. Upgrade occurred in time.
case <-ctx.Done():
select {
case <-done:
// Normal path. Upgrade occurred in time.
// But the ctx.Done() is also done because
// the "defer cancel()" above scheduled
// before this goroutine.
default:
// The TLS or HTTP or DERP exchanges didn't complete
// in time. Force close the TCP connection to force
// them to fail quickly.
tcpConn.Close()
}
}
}()
var httpConn net.Conn // a TCP conn or a TLS conn; what we speak HTTP to
if c.useHTTPS() {
tlsConf := tlsdial.Config(c.tlsServerName(node), c.TLSConfig)
if node != nil && node.DERPTestPort != 0 {
tlsConf.InsecureSkipVerify = true
}
httpConn = tls.Client(tcpConn, tlsConf)
} else {
httpConn = tcpConn
}
brw := bufio.NewReadWriter(bufio.NewReader(httpConn), bufio.NewWriter(httpConn))
req, err := http.NewRequest("GET", c.urlString(node), nil)
if err != nil {
return nil, err
}
req.Header.Set("Upgrade", "DERP")
req.Header.Set("Connection", "Upgrade")
if err := req.Write(brw); err != nil {
return nil, err
}
if err := brw.Flush(); err != nil {
return nil, err
}
resp, err := http.ReadResponse(brw.Reader, req)
if err != nil {
return nil, err
}
if resp.StatusCode != http.StatusSwitchingProtocols {
b, _ := ioutil.ReadAll(resp.Body)
resp.Body.Close()
return nil, fmt.Errorf("GET failed: %v: %s", err, b)
}
derpClient, err := derp.NewClient(c.privateKey, httpConn, brw, c.logf)
if err != nil {
return nil, err
}
if c.preferred {
if err := derpClient.NotePreferred(true); err != nil {
go httpConn.Close()
return nil, err
}
}
c.client = derpClient
c.netConn = tcpConn
return c.client, nil
}
func (c *Client) dialURL(ctx context.Context) (net.Conn, error) {
host := c.url.Hostname()
hostOrIP := host
var stdDialer dialer = new(net.Dialer)
var dialer = stdDialer
if wrapDialer != nil {
dialer = wrapDialer(dialer)
}
if c.DNSCache != nil {
ip, err := c.DNSCache.LookupIP(ctx, host)
if err == nil {
hostOrIP = ip.String()
}
if err != nil && dialer == stdDialer {
// Return an error if we're not using a dial
// proxy that can do DNS lookups for us.
return nil, err
}
}
tcpConn, err := dialer.DialContext(ctx, "tcp", net.JoinHostPort(hostOrIP, urlPort(c.url)))
if err != nil {
return nil, fmt.Errorf("dial of %v: %v", host, err)
}
return tcpConn, nil
}
// dialRegion returns a TCP connection to the provided region, trying
// each node in order (with dialNode) until one connects or ctx is
// done.
func (c *Client) dialRegion(ctx context.Context, reg *tailcfg.DERPRegion) (net.Conn, *tailcfg.DERPNode, error) {
if len(reg.Nodes) == 0 {
return nil, nil, fmt.Errorf("no nodes for %s", c.targetString(reg))
}
var firstErr error
for _, n := range reg.Nodes {
if n.STUNOnly {
continue
}
c, err := c.dialNode(ctx, n)
if err == nil {
return c, n, nil
}
if firstErr == nil {
firstErr = err
}
}
return nil, nil, firstErr
}
func (c *Client) dialContext(ctx context.Context, proto, addr string) (net.Conn, error) {
var stdDialer dialer = new(net.Dialer)
var dialer = stdDialer
if wrapDialer != nil {
dialer = wrapDialer(dialer)
}
return dialer.DialContext(ctx, proto, addr)
}
// shouldDialProto reports whether an explicitly provided IPv4 or IPv6
// address (given in s) is valid. An empty value means to dial, but to
// use DNS. The predicate function reports whether the non-empty
// string s contained a valid IP address of the right family.
func shouldDialProto(s string, pred func(netaddr.IP) bool) bool {
if s == "" {
return true
}
ip, _ := netaddr.ParseIP(s)
return pred(ip)
}
const dialNodeTimeout = 1500 * time.Millisecond
// dialNode returns a TCP connection to node n, racing IPv4 and IPv6
// (both as applicable) against each other.
// A node is only given dialNodeTimeout to connect.
//
// TODO(bradfitz): longer if no options remain perhaps? ... Or longer
// overall but have dialRegion start overlapping races?
func (c *Client) dialNode(ctx context.Context, n *tailcfg.DERPNode) (net.Conn, error) {
type res struct {
c net.Conn
err error
}
resc := make(chan res) // must be unbuffered
ctx, cancel := context.WithTimeout(ctx, dialNodeTimeout)
defer cancel()
nwait := 0
startDial := func(dstPrimary, proto string) {
nwait++
go func() {
dst := dstPrimary
if dst == "" {
dst = n.HostName
}
port := "443"
if n.DERPTestPort != 0 {
port = fmt.Sprint(n.DERPTestPort)
}
c, err := c.dialContext(ctx, proto, net.JoinHostPort(dst, port))
select {
case resc <- res{c, err}:
case <-ctx.Done():
if c != nil {
c.Close()
}
}
}()
}
if shouldDialProto(n.IPv4, netaddr.IP.Is4) {
startDial(n.IPv4, "tcp4")
}
if shouldDialProto(n.IPv6, netaddr.IP.Is6) {
startDial(n.IPv6, "tcp6")
}
if nwait == 0 {
return nil, errors.New("both IPv4 and IPv6 are explicitly disabled for node")
}
var firstErr error
for {
select {
case res := <-resc:
nwait--
if res.err == nil {
return res.c, nil
}
if firstErr == nil {
firstErr = res.err
}
if nwait == 0 {
return nil, firstErr
}
case <-ctx.Done():
return nil, ctx.Err()
}
}
}
func (c *Client) Send(dstKey key.Public, b []byte) error {
client, err := c.connect(context.TODO(), "derphttp.Client.Send")
if err != nil {
return err
}
if err := client.Send(dstKey, b); err != nil {
c.closeForReconnect(client)
}
return err
}
// NotePreferred notes whether this Client is the caller's preferred
// (home) DERP node. It's only used for stats.
func (c *Client) NotePreferred(v bool) {
c.mu.Lock()
if c.preferred == v {
c.mu.Unlock()
return
}
c.preferred = v
client := c.client
c.mu.Unlock()
if client != nil {
if err := client.NotePreferred(v); err != nil {
c.closeForReconnect(client)
}
}
}
func (c *Client) Recv(b []byte) (derp.ReceivedMessage, error) {
client, err := c.connect(context.TODO(), "derphttp.Client.Recv")
if err != nil {
return nil, err
}
m, err := client.Recv(b)
if err != nil {
c.closeForReconnect(client)
}
return m, err
}
// Close closes the client. It will not automatically reconnect after
// being closed.
func (c *Client) Close() error {
c.cancelCtx() // not in lock, so it can cancel Connect, which holds mu
c.mu.Lock()
defer c.mu.Unlock()
if c.closed {
return ErrClientClosed
}
c.closed = true
if c.netConn != nil {
c.netConn.Close()
}
return nil
}
// closeForReconnect closes the underlying network connection and
// zeros out the client field so future calls to Connect will
// reconnect.
//
// The provided brokenClient is the client to forget. If current
// client is not brokenClient, closeForReconnect does nothing. (This
// prevents a send and receive goroutine from failing at the ~same
// time and both calling closeForReconnect and the caller goroutines
// forever calling closeForReconnect in lockstep endlessly;
// https://github.com/tailscale/tailscale/pull/264)
func (c *Client) closeForReconnect(brokenClient *derp.Client) {
c.mu.Lock()
defer c.mu.Unlock()
if c.client != brokenClient {
return
}
if c.netConn != nil {
c.netConn.Close()
c.netConn = nil
}
c.client = nil
}
var ErrClientClosed = errors.New("derphttp.Client closed")
// wrapDialer, if non-nil, specifies a function to wrap a dialer in a
// SOCKS-using dialer. It's set conditionally by socks.go.
var wrapDialer func(dialer) dialer