mirror of
https://github.com/tailscale/tailscale.git
synced 2025-01-08 09:07:44 +00:00
8d6cf14456
If we've already connected to a certain name's IP in the past, don't assume the problem was DNS related. That just puts unnecessary load on our bootstrap DNS servers during regular restarts of Tailscale infrastructure components. Also, if we do do a bootstrap DNS lookup and it gives the same IP(s) that we already tried, don't try them again. Change-Id: I743e8991a7f957381b8e4c1508b8e9d0df1782fe Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
581 lines
15 KiB
Go
581 lines
15 KiB
Go
// Copyright (c) 2020 Tailscale Inc & AUTHORS All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// TODO(bradfitz): update this code to use netaddr more
|
|
|
|
// Package dnscache contains a minimal DNS cache that makes a bunch of
|
|
// assumptions that are only valid for us. Not recommended for general use.
|
|
package dnscache
|
|
|
|
import (
|
|
"context"
|
|
"crypto/tls"
|
|
"errors"
|
|
"fmt"
|
|
"log"
|
|
"net"
|
|
"runtime"
|
|
"sync"
|
|
"time"
|
|
|
|
"golang.org/x/sync/singleflight"
|
|
"inet.af/netaddr"
|
|
"tailscale.com/envknob"
|
|
)
|
|
|
|
var single = &Resolver{
|
|
Forward: &net.Resolver{PreferGo: preferGoResolver()},
|
|
}
|
|
|
|
// preferGoResolver reports whether the pure-Go DNS resolver should be
// preferred on the current operating system (runtime.GOOS).
func preferGoResolver() bool {
	switch runtime.GOOS {
	case "darwin", "ios":
		// There does not appear to be a local resolver running on iOS,
		// and NetworkExtension is good at isolating DNS, so the Go
		// resolver is not used on macOS/iOS.
		return false
	case "android":
		// The local resolver is not available on Android.
		return false
	default:
		// Everywhere else the Go resolver is fine and slightly
		// preferred: it is lighter, avoiding cgo calls and threads.
		return true
	}
}
|
|
|
|
// Get returns a caching Resolver singleton.
|
|
func Get() *Resolver { return single }
|
|
|
|
// Resolver is a minimal DNS caching resolver.
|
|
//
|
|
// The TTL is always fixed for now. It's not intended for general use.
|
|
// Cache entries are never cleaned up so it's intended that this is
|
|
// only used with a fixed set of hostnames.
|
|
type Resolver struct {
|
|
// Forward is the resolver to use to populate the cache.
|
|
// If nil, net.DefaultResolver is used.
|
|
Forward *net.Resolver
|
|
|
|
// LookupIPFallback optionally provides a backup DNS mechanism
|
|
// to use if Forward returns an error or no results.
|
|
LookupIPFallback func(ctx context.Context, host string) ([]netaddr.IP, error)
|
|
|
|
// TTL is how long to keep entries cached
|
|
//
|
|
// If zero, a default (currently 10 minutes) is used.
|
|
TTL time.Duration
|
|
|
|
// UseLastGood controls whether a cached entry older than TTL is used
|
|
// if a refresh fails.
|
|
UseLastGood bool
|
|
|
|
sf singleflight.Group
|
|
|
|
mu sync.Mutex
|
|
ipCache map[string]ipCacheEntry
|
|
}
|
|
|
|
// ipCacheEntry is one cached lookup result together with its expiry time.
type ipCacheEntry struct {
	// ip is the primary address; it may be IPv4 or IPv6.
	ip net.IP
	// ip6 is the IPv6 address when both families were found; nil when
	// there was no v4 or no v6.
	ip6 net.IP
	// allIPs holds every resolved address: one or more v4 and/or v6.
	allIPs []net.IPAddr
	// expires is the instant after which the entry is no longer fresh.
	expires time.Time
}
|
|
|
|
func (r *Resolver) fwd() *net.Resolver {
|
|
if r.Forward != nil {
|
|
return r.Forward
|
|
}
|
|
return net.DefaultResolver
|
|
}
|
|
|
|
func (r *Resolver) ttl() time.Duration {
|
|
if r.TTL > 0 {
|
|
return r.TTL
|
|
}
|
|
return 10 * time.Minute
|
|
}
|
|
|
|
// debug gates the log.Printf diagnostics emitted throughout this package.
// It is initialized from the TS_DEBUG_DNS_CACHE envknob.
var debug = envknob.Bool("TS_DEBUG_DNS_CACHE")
|
|
|
|
// LookupIP returns the host's primary IP address (either IPv4 or
|
|
// IPv6, but preferring IPv4) and optionally its IPv6 address, if
|
|
// there is both IPv4 and IPv6.
|
|
//
|
|
// If err is nil, ip will be non-nil. The v6 address may be nil even
|
|
// with a nil error.
|
|
func (r *Resolver) LookupIP(ctx context.Context, host string) (ip, v6 net.IP, allIPs []net.IPAddr, err error) {
|
|
if ip := net.ParseIP(host); ip != nil {
|
|
if ip4 := ip.To4(); ip4 != nil {
|
|
return ip4, nil, []net.IPAddr{{IP: ip4}}, nil
|
|
}
|
|
if debug {
|
|
log.Printf("dnscache: %q is an IP", host)
|
|
}
|
|
return ip, nil, []net.IPAddr{{IP: ip}}, nil
|
|
}
|
|
|
|
if ip, ip6, allIPs, ok := r.lookupIPCache(host); ok {
|
|
if debug {
|
|
log.Printf("dnscache: %q = %v (cached)", host, ip)
|
|
}
|
|
return ip, ip6, allIPs, nil
|
|
}
|
|
|
|
type ipRes struct {
|
|
ip, ip6 net.IP
|
|
allIPs []net.IPAddr
|
|
}
|
|
ch := r.sf.DoChan(host, func() (interface{}, error) {
|
|
ip, ip6, allIPs, err := r.lookupIP(host)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return ipRes{ip, ip6, allIPs}, nil
|
|
})
|
|
select {
|
|
case res := <-ch:
|
|
if res.Err != nil {
|
|
if r.UseLastGood {
|
|
if ip, ip6, allIPs, ok := r.lookupIPCacheExpired(host); ok {
|
|
if debug {
|
|
log.Printf("dnscache: %q using %v after error", host, ip)
|
|
}
|
|
return ip, ip6, allIPs, nil
|
|
}
|
|
}
|
|
if debug {
|
|
log.Printf("dnscache: error resolving %q: %v", host, res.Err)
|
|
}
|
|
return nil, nil, nil, res.Err
|
|
}
|
|
r := res.Val.(ipRes)
|
|
return r.ip, r.ip6, r.allIPs, nil
|
|
case <-ctx.Done():
|
|
if debug {
|
|
log.Printf("dnscache: context done while resolving %q: %v", host, ctx.Err())
|
|
}
|
|
return nil, nil, nil, ctx.Err()
|
|
}
|
|
}
|
|
|
|
func (r *Resolver) lookupIPCache(host string) (ip, ip6 net.IP, allIPs []net.IPAddr, ok bool) {
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
if ent, ok := r.ipCache[host]; ok && ent.expires.After(time.Now()) {
|
|
return ent.ip, ent.ip6, ent.allIPs, true
|
|
}
|
|
return nil, nil, nil, false
|
|
}
|
|
|
|
func (r *Resolver) lookupIPCacheExpired(host string) (ip, ip6 net.IP, allIPs []net.IPAddr, ok bool) {
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
if ent, ok := r.ipCache[host]; ok {
|
|
return ent.ip, ent.ip6, ent.allIPs, true
|
|
}
|
|
return nil, nil, nil, false
|
|
}
|
|
|
|
func (r *Resolver) lookupTimeoutForHost(host string) time.Duration {
|
|
if r.UseLastGood {
|
|
if _, _, _, ok := r.lookupIPCacheExpired(host); ok {
|
|
// If we have some previous good value for this host,
|
|
// don't give this DNS lookup much time. If we're in a
|
|
// situation where the user's DNS server is unreachable
|
|
// (e.g. their corp DNS server is behind a subnet router
|
|
// that can't come up due to Tailscale needing to
|
|
// connect to itself), then we want to fail fast and let
|
|
// our caller (who set UseLastGood) fall back to using
|
|
// the last-known-good IP address.
|
|
return 3 * time.Second
|
|
}
|
|
}
|
|
return 10 * time.Second
|
|
}
|
|
|
|
func (r *Resolver) lookupIP(host string) (ip, ip6 net.IP, allIPs []net.IPAddr, err error) {
|
|
if ip, ip6, allIPs, ok := r.lookupIPCache(host); ok {
|
|
if debug {
|
|
log.Printf("dnscache: %q found in cache as %v", host, ip)
|
|
}
|
|
return ip, ip6, allIPs, nil
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), r.lookupTimeoutForHost(host))
|
|
defer cancel()
|
|
ips, err := r.fwd().LookupIPAddr(ctx, host)
|
|
if (err != nil || len(ips) == 0) && r.LookupIPFallback != nil {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
|
defer cancel()
|
|
var fips []netaddr.IP
|
|
fips, err = r.LookupIPFallback(ctx, host)
|
|
if err == nil {
|
|
ips = nil
|
|
for _, fip := range fips {
|
|
ips = append(ips, *fip.IPAddr())
|
|
}
|
|
}
|
|
}
|
|
if err != nil {
|
|
return nil, nil, nil, err
|
|
}
|
|
if len(ips) == 0 {
|
|
return nil, nil, nil, fmt.Errorf("no IPs for %q found", host)
|
|
}
|
|
|
|
have4 := false
|
|
for _, ipa := range ips {
|
|
if ip4 := ipa.IP.To4(); ip4 != nil {
|
|
if !have4 {
|
|
ip6 = ip
|
|
ip = ip4
|
|
have4 = true
|
|
}
|
|
} else {
|
|
if have4 {
|
|
ip6 = ipa.IP
|
|
} else {
|
|
ip = ipa.IP
|
|
}
|
|
}
|
|
}
|
|
r.addIPCache(host, ip, ip6, ips, r.ttl())
|
|
return ip, ip6, ips, nil
|
|
}
|
|
|
|
func (r *Resolver) addIPCache(host string, ip, ip6 net.IP, allIPs []net.IPAddr, d time.Duration) {
|
|
if naIP, _ := netaddr.FromStdIP(ip); naIP.IsPrivate() {
|
|
// Don't cache obviously wrong entries from captive portals.
|
|
// TODO: use DoH or DoT for the forwarding resolver?
|
|
if debug {
|
|
log.Printf("dnscache: %q resolved to private IP %v; using but not caching", host, ip)
|
|
}
|
|
return
|
|
}
|
|
|
|
if debug {
|
|
log.Printf("dnscache: %q resolved to IP %v; caching", host, ip)
|
|
}
|
|
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
if r.ipCache == nil {
|
|
r.ipCache = make(map[string]ipCacheEntry)
|
|
}
|
|
r.ipCache[host] = ipCacheEntry{
|
|
ip: ip,
|
|
ip6: ip6,
|
|
allIPs: allIPs,
|
|
expires: time.Now().Add(d),
|
|
}
|
|
}
|
|
|
|
// DialContextFunc is the signature of a context-aware dial function: it
// connects to address on the named network and returns the connection.
// Dialer and TLSDialer both accept and return values of this type.
type DialContextFunc func(ctx context.Context, network, address string) (net.Conn, error)
|
|
|
|
// Dialer returns a wrapped DialContext func that uses the provided dnsCache.
|
|
func Dialer(fwd DialContextFunc, dnsCache *Resolver) DialContextFunc {
|
|
d := &dialer{
|
|
fwd: fwd,
|
|
dnsCache: dnsCache,
|
|
pastConnect: map[netaddr.IP]time.Time{},
|
|
}
|
|
return d.DialContext
|
|
}
|
|
|
|
// dialer is the config and accumulated state for a dial func returned by Dialer.
|
|
type dialer struct {
|
|
fwd DialContextFunc
|
|
dnsCache *Resolver
|
|
|
|
mu sync.Mutex
|
|
pastConnect map[netaddr.IP]time.Time
|
|
}
|
|
|
|
func (d *dialer) DialContext(ctx context.Context, network, address string) (retConn net.Conn, ret error) {
|
|
host, port, err := net.SplitHostPort(address)
|
|
if err != nil {
|
|
// Bogus. But just let the real dialer return an error rather than
|
|
// inventing a similar one.
|
|
return d.fwd(ctx, network, address)
|
|
}
|
|
dc := &dialCall{
|
|
d: d,
|
|
network: network,
|
|
address: address,
|
|
host: host,
|
|
port: port,
|
|
}
|
|
defer func() {
|
|
// On failure, consider that our DNS might be wrong and ask the DNS fallback mechanism for
|
|
// some other IPs to try.
|
|
if ret == nil || d.dnsCache.LookupIPFallback == nil || dc.dnsWasTrustworthy() {
|
|
return
|
|
}
|
|
ips, err := d.dnsCache.LookupIPFallback(ctx, host)
|
|
if err != nil {
|
|
// Return with original error
|
|
return
|
|
}
|
|
if c, err := dc.raceDial(ctx, ips); err == nil {
|
|
retConn = c
|
|
ret = nil
|
|
return
|
|
}
|
|
}()
|
|
|
|
ip, ip6, allIPs, err := d.dnsCache.LookupIP(ctx, host)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to resolve %q: %w", host, err)
|
|
}
|
|
i4s := v4addrs(allIPs)
|
|
if len(i4s) < 2 {
|
|
if debug {
|
|
log.Printf("dnscache: dialing %s, %s for %s", network, ip, address)
|
|
}
|
|
ipNA, ok := netaddr.FromStdIP(ip)
|
|
if !ok {
|
|
return nil, fmt.Errorf("invalid IP %q", ip)
|
|
}
|
|
c, err := dc.dialOne(ctx, ipNA)
|
|
if err == nil || ctx.Err() != nil {
|
|
return c, err
|
|
}
|
|
// Fall back to trying IPv6, if any.
|
|
ip6NA, ok := netaddr.FromStdIP(ip6)
|
|
if !ok {
|
|
return nil, err
|
|
}
|
|
return dc.dialOne(ctx, ip6NA)
|
|
}
|
|
|
|
// Multiple IPv4 candidates, and 0+ IPv6.
|
|
ipsToTry := append(i4s, v6addrs(allIPs)...)
|
|
return dc.raceDial(ctx, ipsToTry)
|
|
}
|
|
|
|
// dialCall is the state around a single call to dial.
|
|
type dialCall struct {
|
|
d *dialer
|
|
network, address, host, port string
|
|
|
|
mu sync.Mutex // lock ordering: dialer.mu, then dialCall.mu
|
|
fails map[netaddr.IP]error // set of IPs that failed to dial thus far
|
|
}
|
|
|
|
// dnsWasTrustworthy reports whether we think the IP address(es) we
|
|
// tried (and failed) to dial were probably the correct IPs. Currently
|
|
// the heuristic is whether they ever worked previously.
|
|
func (dc *dialCall) dnsWasTrustworthy() bool {
|
|
dc.d.mu.Lock()
|
|
defer dc.d.mu.Unlock()
|
|
dc.mu.Lock()
|
|
defer dc.mu.Unlock()
|
|
|
|
if len(dc.fails) == 0 {
|
|
// No information.
|
|
return false
|
|
}
|
|
|
|
// If any of the IPs we failed to dial worked previously in
|
|
// this dialer, assume the DNS is fine.
|
|
for ip := range dc.fails {
|
|
if _, ok := dc.d.pastConnect[ip]; ok {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (dc *dialCall) dialOne(ctx context.Context, ip netaddr.IP) (net.Conn, error) {
|
|
c, err := dc.d.fwd(ctx, dc.network, net.JoinHostPort(ip.String(), dc.port))
|
|
dc.noteDialResult(ip, err)
|
|
return c, err
|
|
}
|
|
|
|
// noteDialResult records that a dial to ip either succeeded or
|
|
// failed.
|
|
func (dc *dialCall) noteDialResult(ip netaddr.IP, err error) {
|
|
if err == nil {
|
|
d := dc.d
|
|
d.mu.Lock()
|
|
defer d.mu.Unlock()
|
|
d.pastConnect[ip] = time.Now()
|
|
return
|
|
}
|
|
dc.mu.Lock()
|
|
defer dc.mu.Unlock()
|
|
if dc.fails == nil {
|
|
dc.fails = map[netaddr.IP]error{}
|
|
}
|
|
dc.fails[ip] = err
|
|
}
|
|
|
|
// uniqueIPs returns a possibly-mutated subslice of ips, filtering out
|
|
// dups and ones that have already failed previously.
|
|
func (dc *dialCall) uniqueIPs(ips []netaddr.IP) (ret []netaddr.IP) {
|
|
dc.mu.Lock()
|
|
defer dc.mu.Unlock()
|
|
seen := map[netaddr.IP]bool{}
|
|
ret = ips[:0]
|
|
for _, ip := range ips {
|
|
if seen[ip] {
|
|
continue
|
|
}
|
|
seen[ip] = true
|
|
if dc.fails[ip] != nil {
|
|
continue
|
|
}
|
|
ret = append(ret, ip)
|
|
}
|
|
return ret
|
|
}
|
|
|
|
// fallbackDelay is the pause between starting successive dial attempts
// when several candidate addresses are available. 300ms matches the
// fallbackDelay value of Go's Happy Eyeballs implementation.
const fallbackDelay time.Duration = 300 * time.Millisecond
|
|
|
|
// raceDial tries to dial port on each ip in ips, starting a new race
|
|
// dial every fallbackDelay apart, returning whichever completes first.
|
|
func (dc *dialCall) raceDial(ctx context.Context, ips []netaddr.IP) (net.Conn, error) {
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
defer cancel()
|
|
|
|
type res struct {
|
|
c net.Conn
|
|
err error
|
|
}
|
|
resc := make(chan res) // must be unbuffered
|
|
failBoost := make(chan struct{}) // best effort send on dial failure
|
|
|
|
// Remove IPs that we tried & failed to dial previously
|
|
// (such as when we're being called after a dnsfallback lookup and get
|
|
// the same results)
|
|
ips = dc.uniqueIPs(ips)
|
|
if len(ips) == 0 {
|
|
return nil, errors.New("no IPs")
|
|
}
|
|
|
|
go func() {
|
|
for i, ip := range ips {
|
|
if i != 0 {
|
|
timer := time.NewTimer(fallbackDelay)
|
|
select {
|
|
case <-timer.C:
|
|
case <-failBoost:
|
|
timer.Stop()
|
|
case <-ctx.Done():
|
|
timer.Stop()
|
|
return
|
|
}
|
|
}
|
|
go func(ip netaddr.IP) {
|
|
c, err := dc.dialOne(ctx, ip)
|
|
if err != nil {
|
|
// Best effort wake-up a pending dial.
|
|
// e.g. IPv4 dials failing quickly on an IPv6-only system.
|
|
// In that case we don't want to wait 300ms per IPv4 before
|
|
// we get to the IPv6 addresses.
|
|
select {
|
|
case failBoost <- struct{}{}:
|
|
default:
|
|
}
|
|
}
|
|
select {
|
|
case resc <- res{c, err}:
|
|
case <-ctx.Done():
|
|
if c != nil {
|
|
c.Close()
|
|
}
|
|
}
|
|
}(ip)
|
|
}
|
|
}()
|
|
|
|
var firstErr error
|
|
var fails int
|
|
for {
|
|
select {
|
|
case r := <-resc:
|
|
if r.c != nil {
|
|
return r.c, nil
|
|
}
|
|
fails++
|
|
if firstErr == nil {
|
|
firstErr = r.err
|
|
}
|
|
if fails == len(ips) {
|
|
return nil, firstErr
|
|
}
|
|
case <-ctx.Done():
|
|
return nil, ctx.Err()
|
|
}
|
|
}
|
|
}
|
|
|
|
func v4addrs(aa []net.IPAddr) (ret []netaddr.IP) {
|
|
for _, a := range aa {
|
|
if ip, ok := netaddr.FromStdIP(a.IP); ok && ip.Is4() {
|
|
ret = append(ret, ip)
|
|
}
|
|
}
|
|
return ret
|
|
}
|
|
|
|
func v6addrs(aa []net.IPAddr) (ret []netaddr.IP) {
|
|
for _, a := range aa {
|
|
if ip, ok := netaddr.FromStdIP(a.IP); ok && ip.Is6() {
|
|
ret = append(ret, ip)
|
|
}
|
|
}
|
|
return ret
|
|
}
|
|
|
|
// errTLSHandshakeTimeout is a sentinel error for a TLS handshake that
// timed out.
// NOTE(review): in the code visible here it is only mentioned in a TODO
// inside TLSDialer and is never returned or compared; confirm whether it
// is still needed.
var errTLSHandshakeTimeout = errors.New("timeout doing TLS handshake")
|
|
|
|
// TLSDialer is like Dialer but returns a func suitable for using with net/http.Transport.DialTLSContext.
|
|
// It returns a *tls.Conn type on success.
|
|
// On TLS cert validation failure, it can invoke a backup DNS resolution strategy.
|
|
func TLSDialer(fwd DialContextFunc, dnsCache *Resolver, tlsConfigBase *tls.Config) DialContextFunc {
|
|
tcpDialer := Dialer(fwd, dnsCache)
|
|
return func(ctx context.Context, network, address string) (net.Conn, error) {
|
|
host, _, err := net.SplitHostPort(address)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
tcpConn, err := tcpDialer(ctx, network, address)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
cfg := cloneTLSConfig(tlsConfigBase)
|
|
if cfg.ServerName == "" {
|
|
cfg.ServerName = host
|
|
}
|
|
tlsConn := tls.Client(tcpConn, cfg)
|
|
|
|
handshakeCtx, handshakeTimeoutCancel := context.WithTimeout(ctx, 5*time.Second)
|
|
defer handshakeTimeoutCancel()
|
|
if err := tlsConn.HandshakeContext(handshakeCtx); err != nil {
|
|
tcpConn.Close()
|
|
// TODO: if err != errTLSHandshakeTimeout,
|
|
// assume it might be some captive portal or
|
|
// otherwise incorrect DNS and try the backup
|
|
// DNS mechanism.
|
|
return nil, err
|
|
}
|
|
return tlsConn, nil
|
|
}
|
|
}
|
|
|
|
// cloneTLSConfig returns a copy of cfg that the caller may modify freely.
// A nil cfg yields a new, empty tls.Config.
func cloneTLSConfig(cfg *tls.Config) *tls.Config {
	if cfg != nil {
		return cfg.Clone()
	}
	return new(tls.Config)
}
|