From 0fedf1e1061436e71b9ee12de0ae6634617b643e Mon Sep 17 00:00:00 2001 From: Mario Minardi Date: Tue, 3 Dec 2024 22:01:37 -0700 Subject: [PATCH 1/3] derp/derphttp: extract client TLS handshake logic into a separate method Extract the TLS handshake logic done in the connect method in derphttp_client.go into its own tlsHandshake method. This is a no-op for behaviour and is being done for future usage of this method. Updates https://github.com/tailscale/tailscale/issues/12724 Signed-off-by: Mario Minardi --- derp/derphttp/derphttp_client.go | 103 +++++++++++++++++++------------ 1 file changed, 63 insertions(+), 40 deletions(-) diff --git a/derp/derphttp/derphttp_client.go b/derp/derphttp/derphttp_client.go index c95d072b1..ed2ba7554 100644 --- a/derp/derphttp/derphttp_client.go +++ b/derp/derphttp/derphttp_client.go @@ -325,6 +325,59 @@ func useWebsockets() bool { return false } +type tlsHandshakeData struct { + httpConn net.Conn + tlsState tls.ConnectionState + serverPub key.NodePublic // zero if unknown (if not using TLS or TLS middlebox eats it) + serverProtoVersion int +} + +// tlsHandshake forces a TLS handshake and returns tlsHandshakeData +// containing the TLS connection, connection state, and server metadata +// on success. +func (c *Client) tlsHandshake(tcpConn net.Conn, node *tailcfg.DERPNode) (tlsHandshakeData, error) { + // Special case when using http for dev purposes. + if !c.useHTTPS() { + return tlsHandshakeData{ + httpConn: tcpConn, + }, nil + } + + tlsConn := c.tlsClient(tcpConn, node) + + // Force a handshake now (instead of waiting for it to + // be done implicitly on read/write) so we can check + // the ConnectionState. + if err := tlsConn.Handshake(); err != nil { + return tlsHandshakeData{}, err + } + + // We expect to be using TLS 1.3 to our own servers, and only + // starting at TLS 1.3 are the server's returned certificates + // encrypted, so only look for and use our "meta cert" if we're + // using TLS 1.3. If we're not using TLS 1.3, it might be a user + // running cmd/derper themselves with a different configuration, + // in which case we can avoid this fast-start optimization. + // (If a corporate proxy is MITM'ing TLS 1.3 connections with + // corp-mandated TLS root certs than all bets are off anyway.) + // Note that we're not specifically concerned about TLS downgrade + // attacks. TLS handles that fine: + // https://blog.gypsyengineer.com/en/security/how-does-tls-1-3-protect-against-downgrade-attacks.html + cs := tlsConn.ConnectionState() + var serverPub key.NodePublic + var serverProtoVersion int + if cs.Version >= tls.VersionTLS13 { + serverPub, serverProtoVersion = parseMetaCert(cs.PeerCertificates) + } + + return tlsHandshakeData{ + httpConn: tlsConn, + tlsState: cs, + serverPub: serverPub, + serverProtoVersion: serverProtoVersion, + }, nil +} + func (c *Client) connect(ctx context.Context, caller string) (client *derp.Client, connGen int, err error) { c.mu.Lock() defer c.mu.Unlock() @@ -455,42 +508,12 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien } }() - var httpConn net.Conn // a TCP conn or a TLS conn; what we speak HTTP to - var serverPub key.NodePublic // or zero if unknown (if not using TLS or TLS middlebox eats it) - var serverProtoVersion int - var tlsState *tls.ConnectionState - if c.useHTTPS() { - tlsConn := c.tlsClient(tcpConn, node) - httpConn = tlsConn - - // Force a handshake now (instead of waiting for it to - // be done implicitly on read/write) so we can check - // the ConnectionState. - if err := tlsConn.Handshake(); err != nil { - return nil, 0, err - } - - // We expect to be using TLS 1.3 to our own servers, and only - // starting at TLS 1.3 are the server's returned certificates - // encrypted, so only look for and use our "meta cert" if we're - // using TLS 1.3. If we're not using TLS 1.3, it might be a user - // running cmd/derper themselves with a different configuration, - // in which case we can avoid this fast-start optimization. - // (If a corporate proxy is MITM'ing TLS 1.3 connections with - // corp-mandated TLS root certs than all bets are off anyway.) - // Note that we're not specifically concerned about TLS downgrade - // attacks. TLS handles that fine: - // https://blog.gypsyengineer.com/en/security/how-does-tls-1-3-protect-against-downgrade-attacks.html - cs := tlsConn.ConnectionState() - tlsState = &cs - if cs.Version >= tls.VersionTLS13 { - serverPub, serverProtoVersion = parseMetaCert(cs.PeerCertificates) - } - } else { - httpConn = tcpConn + connData, err := c.tlsHandshake(tcpConn, node) + if err != nil { + return nil, 0, err } - brw := bufio.NewReadWriter(bufio.NewReader(httpConn), bufio.NewWriter(httpConn)) + brw := bufio.NewReadWriter(bufio.NewReader(connData.httpConn), bufio.NewWriter(connData.httpConn)) var derpClient *derp.Client req, err := http.NewRequest("GET", c.urlString(node), nil) @@ -512,7 +535,7 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien // https://github.com/tailscale/tailscale/issues/12724 } - if !serverPub.IsZero() && serverProtoVersion != 0 { + if !connData.serverPub.IsZero() && connData.serverProtoVersion != 0 { // parseMetaCert found the server's public key (no TLS // middlebox was in the way), so skip the HTTP upgrade // exchange. See https://github.com/tailscale/tailscale/issues/693 @@ -544,9 +567,9 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien return nil, 0, fmt.Errorf("GET failed: %v: %s", err, b) } } - derpClient, err = derp.NewClient(c.privateKey, httpConn, brw, c.logf, + derpClient, err = derp.NewClient(c.privateKey, connData.httpConn, brw, c.logf, derp.MeshKey(c.MeshKey), - derp.ServerPublicKey(serverPub), + derp.ServerPublicKey(connData.serverPub), derp.CanAckPings(c.canAckPings), derp.IsProber(c.IsProber), ) @@ -555,14 +578,14 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien } if c.preferred { if err := derpClient.NotePreferred(true); err != nil { - go httpConn.Close() + go connData.httpConn.Close() return nil, 0, err } } if c.WatchConnectionChanges { if err := derpClient.WatchConnectionChanges(); err != nil { - go httpConn.Close() + go connData.httpConn.Close() return nil, 0, err } } @@ -570,7 +593,7 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien c.serverPubKey = derpClient.ServerPublicKey() c.client = derpClient c.netConn = tcpConn - c.tlsState = tlsState + c.tlsState = &connData.tlsState c.connGen++ localAddr, _ := c.client.LocalAddr() From 7a093c383f63704e43043f1a69418004a680ba02 Mon Sep 17 00:00:00 2001 From: Mario Minardi Date: Tue, 3 Dec 2024 22:28:53 -0700 Subject: [PATCH 2/3] derp/derphttp: extract client connection upgrade into a separate method Extract the connection upgrade logic done in the connect method in derphttp_client.go into its own upgradeConnection method. This is a no-op for behaviour and is being done for future usage of this method. Updates https://github.com/tailscale/tailscale/issues/12724 Signed-off-by: Mario Minardi --- derp/derphttp/derphttp_client.go | 109 ++++++++++++++++--------------- 1 file changed, 58 insertions(+), 51 deletions(-) diff --git a/derp/derphttp/derphttp_client.go b/derp/derphttp/derphttp_client.go index ed2ba7554..423a4acea 100644 --- a/derp/derphttp/derphttp_client.go +++ b/derp/derphttp/derphttp_client.go @@ -378,6 +378,58 @@ func (c *Client) tlsHandshake(tcpConn net.Conn, node *tailcfg.DERPNode) (tlsHand }, nil } +// upgradeConnection upgrades the connection to the given DERPNode. +func (c *Client) upgradeConnection(connData tlsHandshakeData, node *tailcfg.DERPNode, idealNodeHeaderValue *string) (*bufio.ReadWriter, error) { + brw := bufio.NewReadWriter(bufio.NewReader(connData.httpConn), bufio.NewWriter(connData.httpConn)) + + req, err := http.NewRequest("GET", c.urlString(node), nil) + if err != nil { + return nil, err + } + + req.Header.Set("Upgrade", "DERP") + req.Header.Set("Connection", "Upgrade") + if idealNodeHeaderValue != nil { + // This is purely informative for now (2024-07-06) for stats: + req.Header.Set(derp.IdealNodeHeader, *idealNodeHeaderValue) + } + + if !connData.serverPub.IsZero() && connData.serverProtoVersion != 0 { + // parseMetaCert found the server's public key (no TLS + // middlebox was in the way), so skip the HTTP upgrade + // exchange. See https://github.com/tailscale/tailscale/issues/693 + // for an overview. We still send the HTTP request + // just to get routed into the server's HTTP Handler so it + // can Hijack the request, but we signal with a special header + // that we don't want to deal with its HTTP response. + req.Header.Set(fastStartHeader, "1") // suppresses the server's HTTP response + if err := req.Write(brw); err != nil { + return nil, err + } + // No need to flush the HTTP request. the derp.Client's initial + // client auth frame will flush it. + } else { + if err := req.Write(brw); err != nil { + return nil, err + } + if err := brw.Flush(); err != nil { + return nil, err + } + + resp, err := http.ReadResponse(brw.Reader, req) + if err != nil { + return nil, err + } + if resp.StatusCode != http.StatusSwitchingProtocols { + b, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return nil, fmt.Errorf("GET failed: %v: %s", err, b) + } + } + + return brw, nil +} + func (c *Client) connect(ctx context.Context, caller string) (client *derp.Client, connGen int, err error) { c.mu.Lock() defer c.mu.Unlock() @@ -513,61 +565,16 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien return nil, 0, err } - brw := bufio.NewReadWriter(bufio.NewReader(connData.httpConn), bufio.NewWriter(connData.httpConn)) - var derpClient *derp.Client - - req, err := http.NewRequest("GET", c.urlString(node), nil) + var idealNodeHeaderValue *string + if !idealNodeInRegion && reg != nil { + idealNodeHeaderValue = ®.Nodes[0].Name + } + brw, err := c.upgradeConnection(connData, node, idealNodeHeaderValue) if err != nil { return nil, 0, err } - req.Header.Set("Upgrade", "DERP") - req.Header.Set("Connection", "Upgrade") - if !idealNodeInRegion && reg != nil { - // This is purely informative for now (2024-07-06) for stats: - req.Header.Set(derp.IdealNodeHeader, reg.Nodes[0].Name) - // TODO(bradfitz,raggi): start a time.AfterFunc for 30m-1h or so to - // dialNode(reg.Nodes[0]) and see if we can even TCP connect to it. If - // so, TLS handshake it as well (which is mixed up in this massive - // connect method) and then if it all appears good, grab the mutex, bump - // connGen, finish the Upgrade, close the old one, and set a new field - // on Client that's like "here's the connect result and connGen for the - // next connect that comes in"). Tracking bug for all this is: - // https://github.com/tailscale/tailscale/issues/12724 - } - if !connData.serverPub.IsZero() && connData.serverProtoVersion != 0 { - // parseMetaCert found the server's public key (no TLS - // middlebox was in the way), so skip the HTTP upgrade - // exchange. See https://github.com/tailscale/tailscale/issues/693 - // for an overview. We still send the HTTP request - // just to get routed into the server's HTTP Handler so it - // can Hijack the request, but we signal with a special header - // that we don't want to deal with its HTTP response. - req.Header.Set(fastStartHeader, "1") // suppresses the server's HTTP response - if err := req.Write(brw); err != nil { - return nil, 0, err - } - // No need to flush the HTTP request. the derp.Client's initial - // client auth frame will flush it. - } else { - if err := req.Write(brw); err != nil { - return nil, 0, err - } - if err := brw.Flush(); err != nil { - return nil, 0, err - } - - resp, err := http.ReadResponse(brw.Reader, req) - if err != nil { - return nil, 0, err - } - if resp.StatusCode != http.StatusSwitchingProtocols { - b, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return nil, 0, fmt.Errorf("GET failed: %v: %s", err, b) - } - } - derpClient, err = derp.NewClient(c.privateKey, connData.httpConn, brw, c.logf, + derpClient, err := derp.NewClient(c.privateKey, connData.httpConn, brw, c.logf, derp.MeshKey(c.MeshKey), derp.ServerPublicKey(connData.serverPub), derp.CanAckPings(c.canAckPings), From 4719d9c0cb3da15aec3c158ed714b15054321208 Mon Sep 17 00:00:00 2001 From: Mario Minardi Date: Tue, 3 Dec 2024 22:56:03 -0700 Subject: [PATCH 3/3] derp/derphttp: automatically reconnect to ideal DERP node Add logic to attempt to connect to the client's ideal DERP node in a background go routine with an exponential backoff when the client is connected to a non-ideal derp. Updates https://github.com/tailscale/tailscale/issues/12724 Signed-off-by: Mario Minardi --- cmd/derper/depaware.txt | 1 + cmd/tailscale/depaware.txt | 1 + derp/derphttp/derphttp_client.go | 154 ++++++++++++++++++++++++++++--- 3 files changed, 143 insertions(+), 13 deletions(-) diff --git a/cmd/derper/depaware.txt b/cmd/derper/depaware.txt index 076074f25..ba881e1de 100644 --- a/cmd/derper/depaware.txt +++ b/cmd/derper/depaware.txt @@ -99,6 +99,7 @@ tailscale.com/cmd/derper dependencies: (generated by github.com/tailscale/depawa tailscale.com/ipn from tailscale.com/client/tailscale tailscale.com/ipn/ipnstate from tailscale.com/client/tailscale+ tailscale.com/kube/kubetypes from tailscale.com/envknob + tailscale.com/logtail/backoff from tailscale.com/derp/derphttp tailscale.com/metrics from tailscale.com/cmd/derper+ tailscale.com/net/dnscache from tailscale.com/derp/derphttp tailscale.com/net/ktimeout from tailscale.com/cmd/derper diff --git a/cmd/tailscale/depaware.txt b/cmd/tailscale/depaware.txt index d18d88873..02e0f70df 100644 --- a/cmd/tailscale/depaware.txt +++ b/cmd/tailscale/depaware.txt @@ -98,6 +98,7 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep tailscale.com/ipn/ipnstate from tailscale.com/client/tailscale+ tailscale.com/kube/kubetypes from tailscale.com/envknob tailscale.com/licenses from tailscale.com/client/web+ + tailscale.com/logtail/backoff from tailscale.com/derp/derphttp tailscale.com/metrics from tailscale.com/derp+ tailscale.com/net/captivedetection from tailscale.com/net/netcheck tailscale.com/net/dns/recursive from tailscale.com/net/dnsfallback diff --git a/derp/derphttp/derphttp_client.go b/derp/derphttp/derphttp_client.go index 423a4acea..c6ab86a09 100644 --- a/derp/derphttp/derphttp_client.go +++ b/derp/derphttp/derphttp_client.go @@ -32,6 +32,7 @@ import ( "tailscale.com/derp" "tailscale.com/envknob" "tailscale.com/health" + "tailscale.com/logtail/backoff" "tailscale.com/net/dnscache" "tailscale.com/net/netmon" "tailscale.com/net/netns" @@ -87,19 +88,20 @@ type Client struct { // Client.conn holds mu. addrFamSelAtomic syncs.AtomicValue[AddressFamilySelector] - mu sync.Mutex - atomicState syncs.AtomicValue[ConnectedState] // hold mu to write - started bool // true upon first connect, never transitions to false - preferred bool - canAckPings bool - closed bool - netConn io.Closer - client *derp.Client - connGen int // incremented once per new connection; valid values are >0 - serverPubKey key.NodePublic - tlsState *tls.ConnectionState - pingOut map[derp.PingMessage]chan<- bool // chan to send to on pong - clock tstime.Clock + mu sync.Mutex + atomicState syncs.AtomicValue[ConnectedState] // hold mu to write + started bool // true upon first connect, never transitions to false + preferred bool + canAckPings bool + closed bool + netConn io.Closer + client *derp.Client + connGen int // incremented once per new connection; valid values are >0 + serverPubKey key.NodePublic + tlsState *tls.ConnectionState + pingOut map[derp.PingMessage]chan<- bool // chan to send to on pong + clock tstime.Clock + attemptingIdealConnection bool } // ConnectedState describes the state of a derphttp Client. @@ -430,6 +432,110 @@ func (c *Client) upgradeConnection(connData tlsHandshakeData, node *tailcfg.DERP return brw, nil } +// attemptIdealConnection attempts to establish a connection to the passed +// in primary DERPNode and updates the connection data on the client if +// successful. +// +// A successful connection here means that we have both a TCP connection and +// that a TLS handshake has succeeded, after which we close the existing connection +// to the non-ideal DERP before proceeding with upgrading our ideal connection +// and updating the connection data on the client using the ideal connection. +// +// This method should only be called if we have a non-ideal connection already from +// the regular flow through connect. +func (c *Client) attemptIdealConnection(ctx context.Context, node *tailcfg.DERPNode, caller string) (err error) { + // timeout is the fallback maximum time (if ctx doesn't limit + // it further) to do all of: DNS + TCP + TLS + HTTP Upgrade + + // DERP upgrade. + const timeout = 10 * time.Second + ctx, cancel := context.WithTimeout(ctx, timeout) + go func() { + select { + case <-ctx.Done(): + // Either timeout fired (handled below), or + // we're returning via the defer cancel() + // below. + case <-c.ctx.Done(): + // Propagate a Client.Close call into + // cancelling this context. + cancel() + } + }() + defer cancel() + + var tcpConn net.Conn + + defer func() { + if err != nil { + if ctx.Err() != nil { + err = fmt.Errorf("%v: %v", ctx.Err(), err) + } + err = fmt.Errorf("%s connect to %v: %v", caller, node.HostName, err) + if tcpConn != nil { + go tcpConn.Close() + } + } + }() + + tcpConn, err = c.dialNode(ctx, node) + if err != nil { + return err + } + + connData, err := c.tlsHandshake(tcpConn, node) + if err != nil { + return err + } + + // We've established a TCP connection and successfully performed + // a TLS handshake with our ideal home DERP, close the existing + // non-ideal connection so that we can upgrade our new ideal + // connection and swap the client over to using this ideal connection. + c.closeForReconnect(c.client) + + c.mu.Lock() + defer c.mu.Unlock() + + brw, err := c.upgradeConnection(connData, node, nil) + if err != nil { + return err + } + + derpClient, err := derp.NewClient(c.privateKey, connData.httpConn, brw, c.logf, + derp.MeshKey(c.MeshKey), + derp.ServerPublicKey(connData.serverPub), + derp.CanAckPings(c.canAckPings), + derp.IsProber(c.IsProber), + ) + if err != nil { + return err + } + if err := derpClient.NotePreferred(true); err != nil { + go connData.httpConn.Close() + return err + } + + if c.WatchConnectionChanges { + if err := derpClient.WatchConnectionChanges(); err != nil { + go connData.httpConn.Close() + return err + } + } + c.preferred = true + c.serverPubKey = derpClient.ServerPublicKey() + c.client = derpClient + c.netConn = tcpConn + c.tlsState = &connData.tlsState + c.connGen++ + + localAddr, _ := c.client.LocalAddr() + c.atomicState.Store(ConnectedState{ + Connected: true, + LocalAddr: localAddr, + }) + return nil +} + func (c *Client) connect(ctx context.Context, caller string) (client *derp.Client, connGen int, err error) { c.mu.Lock() defer c.mu.Unlock() @@ -603,6 +709,28 @@ func (c *Client) connect(ctx context.Context, caller string) (client *derp.Clien c.tlsState = &connData.tlsState c.connGen++ + if !idealNodeInRegion && !c.attemptingIdealConnection && reg != nil { + c.attemptingIdealConnection = true + + go func() { + bo := backoff.NewBackoff("attempt-ideal-derp-connection", c.logf, 1*time.Hour) + + for { + newCtx := c.newContext() + + err := c.attemptIdealConnection(newCtx, reg.Nodes[0], "derphttp.Client.connect") + if err == nil { + break + } + bo.BackOff(newCtx, err) + } + + c.mu.Lock() + c.attemptingIdealConnection = false + c.mu.Unlock() + }() + } + localAddr, _ := c.client.LocalAddr() c.atomicState.Store(ConnectedState{ Connected: true,