tailscale/net/dnscache/dnscache.go

581 lines
15 KiB
Go
Raw Normal View History

// Copyright (c) 2020 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// TODO(bradfitz): update this code to use netaddr more
// Package dnscache contains a minimal DNS cache that makes a bunch of
// assumptions that are only valid for us. Not recommended for general use.
package dnscache
import (
"context"
"crypto/tls"
"errors"
"fmt"
"log"
"net"
"runtime"
"sync"
"time"
"golang.org/x/sync/singleflight"
"inet.af/netaddr"
"tailscale.com/envknob"
)
var single = &Resolver{
Forward: &net.Resolver{PreferGo: preferGoResolver()},
}
func preferGoResolver() bool {
// There does not appear to be a local resolver running
// on iOS, and NetworkExtension is good at isolating DNS.
// So do not use the Go resolver on macOS/iOS.
if runtime.GOOS == "darwin" || runtime.GOOS == "ios" {
return false
}
// The local resolver is not available on Android.
if runtime.GOOS == "android" {
return false
}
// Otherwise, the Go resolver is fine and slightly preferred
// since it's lighter, not using cgo calls & threads.
return true
}
// Get returns a caching Resolver singleton.
func Get() *Resolver { return single }
// Resolver is a minimal DNS caching resolver.
//
// The TTL is always fixed for now. It's not intended for general use.
// Cache entries are never cleaned up so it's intended that this is
// only used with a fixed set of hostnames.
type Resolver struct {
// Forward is the resolver to use to populate the cache.
// If nil, net.DefaultResolver is used.
Forward *net.Resolver
// LookupIPFallback optionally provides a backup DNS mechanism
// to use if Forward returns an error or no results.
LookupIPFallback func(ctx context.Context, host string) ([]netaddr.IP, error)
// TTL is how long to keep entries cached
//
// If zero, a default (currently 10 minutes) is used.
TTL time.Duration
// UseLastGood controls whether a cached entry older than TTL is used
// if a refresh fails.
UseLastGood bool
sf singleflight.Group
mu sync.Mutex
ipCache map[string]ipCacheEntry
}
type ipCacheEntry struct {
ip net.IP // either v4 or v6
ip6 net.IP // nil if no v4 or no v6
allIPs []net.IPAddr // 1+ v4 and/or v6
expires time.Time
}
func (r *Resolver) fwd() *net.Resolver {
if r.Forward != nil {
return r.Forward
}
return net.DefaultResolver
}
func (r *Resolver) ttl() time.Duration {
if r.TTL > 0 {
return r.TTL
}
return 10 * time.Minute
}
var debug = envknob.Bool("TS_DEBUG_DNS_CACHE")
// LookupIP returns the host's primary IP address (either IPv4 or
// IPv6, but preferring IPv4) and optionally its IPv6 address, if
// there is both IPv4 and IPv6.
//
// If err is nil, ip will be non-nil. The v6 address may be nil even
// with a nil error.
func (r *Resolver) LookupIP(ctx context.Context, host string) (ip, v6 net.IP, allIPs []net.IPAddr, err error) {
if ip := net.ParseIP(host); ip != nil {
if ip4 := ip.To4(); ip4 != nil {
return ip4, nil, []net.IPAddr{{IP: ip4}}, nil
}
if debug {
log.Printf("dnscache: %q is an IP", host)
}
return ip, nil, []net.IPAddr{{IP: ip}}, nil
}
if ip, ip6, allIPs, ok := r.lookupIPCache(host); ok {
if debug {
log.Printf("dnscache: %q = %v (cached)", host, ip)
}
return ip, ip6, allIPs, nil
}
type ipRes struct {
ip, ip6 net.IP
allIPs []net.IPAddr
}
ch := r.sf.DoChan(host, func() (any, error) {
ip, ip6, allIPs, err := r.lookupIP(host)
if err != nil {
return nil, err
}
return ipRes{ip, ip6, allIPs}, nil
})
select {
case res := <-ch:
if res.Err != nil {
if r.UseLastGood {
if ip, ip6, allIPs, ok := r.lookupIPCacheExpired(host); ok {
if debug {
log.Printf("dnscache: %q using %v after error", host, ip)
}
return ip, ip6, allIPs, nil
}
}
if debug {
log.Printf("dnscache: error resolving %q: %v", host, res.Err)
}
return nil, nil, nil, res.Err
}
r := res.Val.(ipRes)
return r.ip, r.ip6, r.allIPs, nil
case <-ctx.Done():
if debug {
log.Printf("dnscache: context done while resolving %q: %v", host, ctx.Err())
}
return nil, nil, nil, ctx.Err()
}
}
func (r *Resolver) lookupIPCache(host string) (ip, ip6 net.IP, allIPs []net.IPAddr, ok bool) {
r.mu.Lock()
defer r.mu.Unlock()
if ent, ok := r.ipCache[host]; ok && ent.expires.After(time.Now()) {
return ent.ip, ent.ip6, ent.allIPs, true
}
return nil, nil, nil, false
}
func (r *Resolver) lookupIPCacheExpired(host string) (ip, ip6 net.IP, allIPs []net.IPAddr, ok bool) {
r.mu.Lock()
defer r.mu.Unlock()
if ent, ok := r.ipCache[host]; ok {
return ent.ip, ent.ip6, ent.allIPs, true
}
return nil, nil, nil, false
}
func (r *Resolver) lookupTimeoutForHost(host string) time.Duration {
if r.UseLastGood {
if _, _, _, ok := r.lookupIPCacheExpired(host); ok {
// If we have some previous good value for this host,
// don't give this DNS lookup much time. If we're in a
// situation where the user's DNS server is unreachable
// (e.g. their corp DNS server is behind a subnet router
// that can't come up due to Tailscale needing to
// connect to itself), then we want to fail fast and let
// our caller (who set UseLastGood) fall back to using
// the last-known-good IP address.
return 3 * time.Second
}
}
return 10 * time.Second
}
func (r *Resolver) lookupIP(host string) (ip, ip6 net.IP, allIPs []net.IPAddr, err error) {
if ip, ip6, allIPs, ok := r.lookupIPCache(host); ok {
if debug {
log.Printf("dnscache: %q found in cache as %v", host, ip)
}
return ip, ip6, allIPs, nil
}
ctx, cancel := context.WithTimeout(context.Background(), r.lookupTimeoutForHost(host))
defer cancel()
ips, err := r.fwd().LookupIPAddr(ctx, host)
if (err != nil || len(ips) == 0) && r.LookupIPFallback != nil {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
var fips []netaddr.IP
fips, err = r.LookupIPFallback(ctx, host)
if err == nil {
ips = nil
for _, fip := range fips {
ips = append(ips, *fip.IPAddr())
}
}
}
if err != nil {
return nil, nil, nil, err
}
if len(ips) == 0 {
return nil, nil, nil, fmt.Errorf("no IPs for %q found", host)
}
have4 := false
for _, ipa := range ips {
if ip4 := ipa.IP.To4(); ip4 != nil {
if !have4 {
ip6 = ip
ip = ip4
have4 = true
}
} else {
if have4 {
ip6 = ipa.IP
} else {
ip = ipa.IP
}
}
}
r.addIPCache(host, ip, ip6, ips, r.ttl())
return ip, ip6, ips, nil
}
func (r *Resolver) addIPCache(host string, ip, ip6 net.IP, allIPs []net.IPAddr, d time.Duration) {
if naIP, _ := netaddr.FromStdIP(ip); naIP.IsPrivate() {
// Don't cache obviously wrong entries from captive portals.
// TODO: use DoH or DoT for the forwarding resolver?
if debug {
log.Printf("dnscache: %q resolved to private IP %v; using but not caching", host, ip)
}
return
}
if debug {
log.Printf("dnscache: %q resolved to IP %v; caching", host, ip)
}
r.mu.Lock()
defer r.mu.Unlock()
if r.ipCache == nil {
r.ipCache = make(map[string]ipCacheEntry)
}
r.ipCache[host] = ipCacheEntry{
ip: ip,
ip6: ip6,
allIPs: allIPs,
expires: time.Now().Add(d),
}
}
type DialContextFunc func(ctx context.Context, network, address string) (net.Conn, error)
// Dialer returns a wrapped DialContext func that uses the provided dnsCache.
func Dialer(fwd DialContextFunc, dnsCache *Resolver) DialContextFunc {
d := &dialer{
fwd: fwd,
dnsCache: dnsCache,
pastConnect: map[netaddr.IP]time.Time{},
}
return d.DialContext
}
// dialer is the config and accumulated state for a dial func returned by Dialer.
type dialer struct {
fwd DialContextFunc
dnsCache *Resolver
mu sync.Mutex
pastConnect map[netaddr.IP]time.Time
}
func (d *dialer) DialContext(ctx context.Context, network, address string) (retConn net.Conn, ret error) {
host, port, err := net.SplitHostPort(address)
if err != nil {
// Bogus. But just let the real dialer return an error rather than
// inventing a similar one.
return d.fwd(ctx, network, address)
}
dc := &dialCall{
d: d,
network: network,
address: address,
host: host,
port: port,
}
defer func() {
// On failure, consider that our DNS might be wrong and ask the DNS fallback mechanism for
// some other IPs to try.
if ret == nil || ctx.Err() != nil || d.dnsCache.LookupIPFallback == nil || dc.dnsWasTrustworthy() {
return
}
ips, err := d.dnsCache.LookupIPFallback(ctx, host)
if err != nil {
// Return with original error
return
}
if c, err := dc.raceDial(ctx, ips); err == nil {
retConn = c
ret = nil
return
}
}()
ip, ip6, allIPs, err := d.dnsCache.LookupIP(ctx, host)
if err != nil {
return nil, fmt.Errorf("failed to resolve %q: %w", host, err)
}
i4s := v4addrs(allIPs)
if len(i4s) < 2 {
if debug {
log.Printf("dnscache: dialing %s, %s for %s", network, ip, address)
}
ipNA, ok := netaddr.FromStdIP(ip)
if !ok {
return nil, fmt.Errorf("invalid IP %q", ip)
}
c, err := dc.dialOne(ctx, ipNA)
if err == nil || ctx.Err() != nil {
return c, err
}
// Fall back to trying IPv6, if any.
ip6NA, ok := netaddr.FromStdIP(ip6)
if !ok {
return nil, err
}
return dc.dialOne(ctx, ip6NA)
}
// Multiple IPv4 candidates, and 0+ IPv6.
ipsToTry := append(i4s, v6addrs(allIPs)...)
return dc.raceDial(ctx, ipsToTry)
}
// dialCall is the state around a single call to dial.
type dialCall struct {
d *dialer
network, address, host, port string
mu sync.Mutex // lock ordering: dialer.mu, then dialCall.mu
fails map[netaddr.IP]error // set of IPs that failed to dial thus far
}
// dnsWasTrustworthy reports whether we think the IP address(es) we
// tried (and failed) to dial were probably the correct IPs. Currently
// the heuristic is whether they ever worked previously.
func (dc *dialCall) dnsWasTrustworthy() bool {
dc.d.mu.Lock()
defer dc.d.mu.Unlock()
dc.mu.Lock()
defer dc.mu.Unlock()
if len(dc.fails) == 0 {
// No information.
return false
}
// If any of the IPs we failed to dial worked previously in
// this dialer, assume the DNS is fine.
for ip := range dc.fails {
if _, ok := dc.d.pastConnect[ip]; ok {
return true
}
}
return false
}
func (dc *dialCall) dialOne(ctx context.Context, ip netaddr.IP) (net.Conn, error) {
c, err := dc.d.fwd(ctx, dc.network, net.JoinHostPort(ip.String(), dc.port))
dc.noteDialResult(ip, err)
return c, err
}
// noteDialResult records that a dial to ip either succeeded or
// failed.
func (dc *dialCall) noteDialResult(ip netaddr.IP, err error) {
if err == nil {
d := dc.d
d.mu.Lock()
defer d.mu.Unlock()
d.pastConnect[ip] = time.Now()
return
}
dc.mu.Lock()
defer dc.mu.Unlock()
if dc.fails == nil {
dc.fails = map[netaddr.IP]error{}
}
dc.fails[ip] = err
}
// uniqueIPs returns a possibly-mutated subslice of ips, filtering out
// dups and ones that have already failed previously.
func (dc *dialCall) uniqueIPs(ips []netaddr.IP) (ret []netaddr.IP) {
dc.mu.Lock()
defer dc.mu.Unlock()
seen := map[netaddr.IP]bool{}
ret = ips[:0]
for _, ip := range ips {
if seen[ip] {
continue
}
seen[ip] = true
if dc.fails[ip] != nil {
continue
}
ret = append(ret, ip)
}
return ret
}
// fallbackDelay is how long to wait between trying subsequent
// addresses when multiple options are available.
// 300ms is the same as Go's Happy Eyeballs fallbackDelay value.
const fallbackDelay = 300 * time.Millisecond
// raceDial tries to dial port on each ip in ips, starting a new race
// dial every fallbackDelay apart, returning whichever completes first.
func (dc *dialCall) raceDial(ctx context.Context, ips []netaddr.IP) (net.Conn, error) {
ctx, cancel := context.WithCancel(ctx)
defer cancel()
type res struct {
c net.Conn
err error
}
resc := make(chan res) // must be unbuffered
failBoost := make(chan struct{}) // best effort send on dial failure
// Remove IPs that we tried & failed to dial previously
// (such as when we're being called after a dnsfallback lookup and get
// the same results)
ips = dc.uniqueIPs(ips)
if len(ips) == 0 {
return nil, errors.New("no IPs")
}
go func() {
for i, ip := range ips {
if i != 0 {
timer := time.NewTimer(fallbackDelay)
select {
case <-timer.C:
case <-failBoost:
timer.Stop()
case <-ctx.Done():
timer.Stop()
return
}
}
go func(ip netaddr.IP) {
c, err := dc.dialOne(ctx, ip)
if err != nil {
// Best effort wake-up a pending dial.
// e.g. IPv4 dials failing quickly on an IPv6-only system.
// In that case we don't want to wait 300ms per IPv4 before
// we get to the IPv6 addresses.
select {
case failBoost <- struct{}{}:
default:
}
}
select {
case resc <- res{c, err}:
case <-ctx.Done():
if c != nil {
c.Close()
}
}
}(ip)
}
}()
var firstErr error
var fails int
for {
select {
case r := <-resc:
if r.c != nil {
return r.c, nil
}
fails++
if firstErr == nil {
firstErr = r.err
}
if fails == len(ips) {
return nil, firstErr
}
case <-ctx.Done():
return nil, ctx.Err()
}
}
}
func v4addrs(aa []net.IPAddr) (ret []netaddr.IP) {
for _, a := range aa {
if ip, ok := netaddr.FromStdIP(a.IP); ok && ip.Is4() {
ret = append(ret, ip)
}
}
return ret
}
func v6addrs(aa []net.IPAddr) (ret []netaddr.IP) {
for _, a := range aa {
if ip, ok := netaddr.FromStdIP(a.IP); ok && ip.Is6() {
ret = append(ret, ip)
}
}
return ret
}
var errTLSHandshakeTimeout = errors.New("timeout doing TLS handshake")
// TLSDialer is like Dialer but returns a func suitable for using with net/http.Transport.DialTLSContext.
// It returns a *tls.Conn type on success.
// On TLS cert validation failure, it can invoke a backup DNS resolution strategy.
func TLSDialer(fwd DialContextFunc, dnsCache *Resolver, tlsConfigBase *tls.Config) DialContextFunc {
tcpDialer := Dialer(fwd, dnsCache)
return func(ctx context.Context, network, address string) (net.Conn, error) {
host, _, err := net.SplitHostPort(address)
if err != nil {
return nil, err
}
tcpConn, err := tcpDialer(ctx, network, address)
if err != nil {
return nil, err
}
cfg := cloneTLSConfig(tlsConfigBase)
if cfg.ServerName == "" {
cfg.ServerName = host
}
tlsConn := tls.Client(tcpConn, cfg)
handshakeCtx, handshakeTimeoutCancel := context.WithTimeout(ctx, 5*time.Second)
defer handshakeTimeoutCancel()
if err := tlsConn.HandshakeContext(handshakeCtx); err != nil {
tcpConn.Close()
// TODO: if err != errTLSHandshakeTimeout,
// assume it might be some captive portal or
// otherwise incorrect DNS and try the backup
// DNS mechanism.
return nil, err
}
return tlsConn, nil
}
}
func cloneTLSConfig(cfg *tls.Config) *tls.Config {
if cfg == nil {
return &tls.Config{}
}
return cfg.Clone()
}