ipn/ipnlocal, net/dnscache: allow configuring dnscache logging via capability

This allows users to temporarily enable/disable dnscache logging via a
new node capability, to aid in debugging strange connectivity issues.

Signed-off-by: Andrew Dunham <andrew@du.nham.ca>
Change-Id: I46cf2596a8ae4c1913880a78d0033f8b668edc08
This commit is contained in:
Andrew Dunham 2023-01-24 14:31:20 -05:00
parent d8feeeee4c
commit 44d73ce932
4 changed files with 89 additions and 39 deletions

View File

@ -45,6 +45,7 @@
"tailscale.com/ipn/ipnstate" "tailscale.com/ipn/ipnstate"
"tailscale.com/ipn/policy" "tailscale.com/ipn/policy"
"tailscale.com/net/dns" "tailscale.com/net/dns"
"tailscale.com/net/dnscache"
"tailscale.com/net/dnsfallback" "tailscale.com/net/dnsfallback"
"tailscale.com/net/interfaces" "tailscale.com/net/interfaces"
"tailscale.com/net/netutil" "tailscale.com/net/netutil"
@ -3820,6 +3821,8 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
} }
b.capFileSharing = fs b.capFileSharing = fs
b.setDebugLogsByCapabilityLocked(nm)
b.setTCPPortsInterceptedFromNetmapAndPrefsLocked(b.pm.CurrentPrefs()) b.setTCPPortsInterceptedFromNetmapAndPrefsLocked(b.pm.CurrentPrefs())
if nm == nil { if nm == nil {
b.nodeByAddr = nil b.nodeByAddr = nil
@ -3855,6 +3858,18 @@ func (b *LocalBackend) setNetMapLocked(nm *netmap.NetworkMap) {
} }
} }
// setDebugLogsByCapabilityLocked sets debug logging based on the self node's
// capabilities in the provided NetMap.
func (b *LocalBackend) setDebugLogsByCapabilityLocked(nm *netmap.NetworkMap) {
// These are sufficiently cheap (atomic bools) that we don't need to
// store state and compare.
if hasCapability(nm, tailcfg.CapabilityDebugTSDNSResolution) {
dnscache.SetDebugLoggingEnabled(true)
} else {
dnscache.SetDebugLoggingEnabled(false)
}
}
func (b *LocalBackend) reloadServeConfigLocked(prefs ipn.PrefsView) { func (b *LocalBackend) reloadServeConfigLocked(prefs ipn.PrefsView) {
if b.netMap == nil || b.netMap.SelfNode == nil || !prefs.Valid() || b.pm.CurrentProfile().ID == "" { if b.netMap == nil || b.netMap.SelfNode == nil || !prefs.Valid() || b.pm.CurrentProfile().ID == "" {
// We're not logged in, so we don't have a profile. // We're not logged in, so we don't have a profile.

View File

@ -18,9 +18,11 @@
"net/netip" "net/netip"
"runtime" "runtime"
"sync" "sync"
"sync/atomic"
"time" "time"
"tailscale.com/envknob" "tailscale.com/envknob"
"tailscale.com/types/logger"
"tailscale.com/util/cloudenv" "tailscale.com/util/cloudenv"
"tailscale.com/util/singleflight" "tailscale.com/util/singleflight"
) )
@ -84,6 +86,11 @@ type Resolver struct {
// It is required when SingleHostStaticResult is present. // It is required when SingleHostStaticResult is present.
SingleHost string SingleHost string
// Logf optionally provides a log function to use for debug logs. If
// not present, log.Printf will be used. The prefix "dnscache: " will
// be added to all log messages printed with this logger.
Logf logger.Logf
sf singleflight.Group[string, ipRes] sf singleflight.Group[string, ipRes]
mu sync.Mutex mu sync.Mutex
@ -110,6 +117,20 @@ func (r *Resolver) fwd() *net.Resolver {
return net.DefaultResolver return net.DefaultResolver
} }
// dlogf logs a debug message if debug logging is enabled either globally via
// the TS_DEBUG_DNS_CACHE environment variable or via the per-Resolver
// configuration.
func (r *Resolver) dlogf(format string, args ...any) {
logf := r.Logf
if logf == nil {
logf = log.Printf
}
if debug() || debugLogging.Load() {
logf("dnscache: "+format, args...)
}
}
// cloudHostResolver returns a Resolver for the current cloud hosting environment. // cloudHostResolver returns a Resolver for the current cloud hosting environment.
// It currently only supports Google Cloud. // It currently only supports Google Cloud.
func (r *Resolver) cloudHostResolver() (v *net.Resolver, ok bool) { func (r *Resolver) cloudHostResolver() (v *net.Resolver, ok bool) {
@ -143,6 +164,27 @@ func (r *Resolver) ttl() time.Duration {
var debug = envknob.RegisterBool("TS_DEBUG_DNS_CACHE") var debug = envknob.RegisterBool("TS_DEBUG_DNS_CACHE")
// debugLogging allows enabling debug logging at runtime, via
// SetDebugLoggingEnabled.
//
// This is a global variable instead of a per-Resolver variable because we
// create new Resolvers throughout the lifetime of the program (e.g. on every
// new Direct client, etc.). When we enable debug logs, though, we want to do
// so for every single created Resolver; we'd need to plumb a bunch of new code
// through all of the intermediate packages to accomplish the same behaviour as
// just using a global variable.
var debugLogging atomic.Bool
// SetDebugLoggingEnabled controls whether debug logging is enabled for this
// package.
//
// These logs are also printed when the TS_DEBUG_DNS_CACHE envknob is set, but
// we allow configuring this manually as well so that it can be changed at
// runtime.
func SetDebugLoggingEnabled(v bool) {
debugLogging.Store(v)
}
// LookupIP returns the host's primary IP address (either IPv4 or // LookupIP returns the host's primary IP address (either IPv4 or
// IPv6, but preferring IPv4) and optionally its IPv6 address, if // IPv6, but preferring IPv4) and optionally its IPv6 address, if
// there is both IPv4 and IPv6. // there is both IPv4 and IPv6.
@ -163,20 +205,17 @@ func (r *Resolver) LookupIP(ctx context.Context, host string) (ip, v6 netip.Addr
} }
allIPs = append(allIPs, naIP) allIPs = append(allIPs, naIP)
} }
r.dlogf("returning %d static results", len(allIPs))
return return
} }
if ip, err := netip.ParseAddr(host); err == nil { if ip, err := netip.ParseAddr(host); err == nil {
ip = ip.Unmap() ip = ip.Unmap()
if debug() { r.dlogf("%q is an IP", host)
log.Printf("dnscache: %q is an IP", host)
}
return ip, zaddr, []netip.Addr{ip}, nil return ip, zaddr, []netip.Addr{ip}, nil
} }
if ip, ip6, allIPs, ok := r.lookupIPCache(host); ok { if ip, ip6, allIPs, ok := r.lookupIPCache(host); ok {
if debug() { r.dlogf("%q = %v (cached)", host, ip)
log.Printf("dnscache: %q = %v (cached)", host, ip)
}
return ip, ip6, allIPs, nil return ip, ip6, allIPs, nil
} }
@ -192,23 +231,17 @@ func (r *Resolver) LookupIP(ctx context.Context, host string) (ip, v6 netip.Addr
if res.Err != nil { if res.Err != nil {
if r.UseLastGood { if r.UseLastGood {
if ip, ip6, allIPs, ok := r.lookupIPCacheExpired(host); ok { if ip, ip6, allIPs, ok := r.lookupIPCacheExpired(host); ok {
if debug() { r.dlogf("%q using %v after error", host, ip)
log.Printf("dnscache: %q using %v after error", host, ip)
}
return ip, ip6, allIPs, nil return ip, ip6, allIPs, nil
} }
} }
if debug() { r.dlogf("error resolving %q: %v", host, res.Err)
log.Printf("dnscache: error resolving %q: %v", host, res.Err)
}
return zaddr, zaddr, nil, res.Err return zaddr, zaddr, nil, res.Err
} }
r := res.Val r := res.Val
return r.ip, r.ip6, r.allIPs, nil return r.ip, r.ip6, r.allIPs, nil
case <-ctx.Done(): case <-ctx.Done():
if debug() { r.dlogf("context done while resolving %q: %v", host, ctx.Err())
log.Printf("dnscache: context done while resolving %q: %v", host, ctx.Err())
}
return zaddr, zaddr, nil, ctx.Err() return zaddr, zaddr, nil, ctx.Err()
} }
} }
@ -250,9 +283,7 @@ func (r *Resolver) lookupTimeoutForHost(host string) time.Duration {
func (r *Resolver) lookupIP(host string) (ip, ip6 netip.Addr, allIPs []netip.Addr, err error) { func (r *Resolver) lookupIP(host string) (ip, ip6 netip.Addr, allIPs []netip.Addr, err error) {
if ip, ip6, allIPs, ok := r.lookupIPCache(host); ok { if ip, ip6, allIPs, ok := r.lookupIPCache(host); ok {
if debug() { r.dlogf("%q found in cache as %v", host, ip)
log.Printf("dnscache: %q found in cache as %v", host, ip)
}
return ip, ip6, allIPs, nil return ip, ip6, allIPs, nil
} }
@ -261,12 +292,18 @@ func (r *Resolver) lookupIP(host string) (ip, ip6 netip.Addr, allIPs []netip.Add
ips, err := r.fwd().LookupNetIP(ctx, "ip", host) ips, err := r.fwd().LookupNetIP(ctx, "ip", host)
if err != nil || len(ips) == 0 { if err != nil || len(ips) == 0 {
if resolver, ok := r.cloudHostResolver(); ok { if resolver, ok := r.cloudHostResolver(); ok {
r.dlogf("resolving %q via cloud resolver", host)
ips, err = resolver.LookupNetIP(ctx, "ip", host) ips, err = resolver.LookupNetIP(ctx, "ip", host)
} }
} }
if (err != nil || len(ips) == 0) && r.LookupIPFallback != nil { if (err != nil || len(ips) == 0) && r.LookupIPFallback != nil {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel() defer cancel()
if err != nil {
r.dlogf("resolving %q using fallback resolver due to error", host)
} else {
r.dlogf("resolving %q using fallback resolver due to no returned IPs", host)
}
ips, err = r.LookupIPFallback(ctx, host) ips, err = r.LookupIPFallback(ctx, host)
} }
if err != nil { if err != nil {
@ -305,15 +342,11 @@ func (r *Resolver) addIPCache(host string, ip, ip6 netip.Addr, allIPs []netip.Ad
if ip.IsPrivate() { if ip.IsPrivate() {
// Don't cache obviously wrong entries from captive portals. // Don't cache obviously wrong entries from captive portals.
// TODO: use DoH or DoT for the forwarding resolver? // TODO: use DoH or DoT for the forwarding resolver?
if debug() { r.dlogf("%q resolved to private IP %v; using but not caching", host, ip)
log.Printf("dnscache: %q resolved to private IP %v; using but not caching", host, ip)
}
return return
} }
if debug() { r.dlogf("%q resolved to IP %v; caching", host, ip)
log.Printf("dnscache: %q resolved to IP %v; caching", host, ip)
}
r.mu.Lock() r.mu.Lock()
defer r.mu.Unlock() defer r.mu.Unlock()
@ -387,9 +420,7 @@ func (d *dialer) DialContext(ctx context.Context, network, address string) (retC
} }
i4s := v4addrs(allIPs) i4s := v4addrs(allIPs)
if len(i4s) < 2 { if len(i4s) < 2 {
if debug() { d.dnsCache.dlogf("dialing %s, %s for %s", network, ip, address)
log.Printf("dnscache: dialing %s, %s for %s", network, ip, address)
}
c, err := dc.dialOne(ctx, ip.Unmap()) c, err := dc.dialOne(ctx, ip.Unmap())
if err == nil || ctx.Err() != nil { if err == nil || ctx.Err() != nil {
return c, err return c, err
@ -411,26 +442,20 @@ func (d *dialer) shouldTryBootstrap(ctx context.Context, err error, dc *dialCall
// Can't try bootstrap DNS if we don't have a fallback function // Can't try bootstrap DNS if we don't have a fallback function
if d.dnsCache.LookupIPFallback == nil { if d.dnsCache.LookupIPFallback == nil {
if debug() { d.dnsCache.dlogf("not using bootstrap DNS: no fallback")
log.Printf("dnscache: not using bootstrap DNS: no fallback")
}
return false return false
} }
// We can't retry if the context is canceled, since any further // We can't retry if the context is canceled, since any further
// operations with this context will fail. // operations with this context will fail.
if ctxErr := ctx.Err(); ctxErr != nil { if ctxErr := ctx.Err(); ctxErr != nil {
if debug() { d.dnsCache.dlogf("not using bootstrap DNS: context error: %v", ctxErr)
log.Printf("dnscache: not using bootstrap DNS: context error: %v", ctxErr)
}
return false return false
} }
wasTrustworthy := dc.dnsWasTrustworthy() wasTrustworthy := dc.dnsWasTrustworthy()
if wasTrustworthy { if wasTrustworthy {
if debug() { d.dnsCache.dlogf("not using bootstrap DNS: DNS was trustworthy")
log.Printf("dnscache: not using bootstrap DNS: DNS was trustworthy")
}
return false return false
} }

View File

@ -22,7 +22,7 @@ func TestDialer(t *testing.T) {
if *dialTest == "" { if *dialTest == "" {
t.Skip("skipping; --dial-test is blank") t.Skip("skipping; --dial-test is blank")
} }
r := new(Resolver) r := &Resolver{Logf: t.Logf}
var std net.Dialer var std net.Dialer
dialer := Dialer(std.DialContext, r) dialer := Dialer(std.DialContext, r)
t0 := time.Now() t0 := time.Now()
@ -113,6 +113,7 @@ func TestDialCall_uniqueIPs(t *testing.T) {
func TestResolverAllHostStaticResult(t *testing.T) { func TestResolverAllHostStaticResult(t *testing.T) {
r := &Resolver{ r := &Resolver{
Logf: t.Logf,
SingleHost: "foo.bar", SingleHost: "foo.bar",
SingleHostStaticResult: []netip.Addr{ SingleHostStaticResult: []netip.Addr{
netip.MustParseAddr("2001:4860:4860::8888"), netip.MustParseAddr("2001:4860:4860::8888"),
@ -185,11 +186,12 @@ type step struct {
errFailed := errors.New("some failure") errFailed := errors.New("some failure")
cacheWithFallback := &Resolver{ cacheWithFallback := &Resolver{
Logf: t.Logf,
LookupIPFallback: func(_ context.Context, _ string) ([]netip.Addr, error) { LookupIPFallback: func(_ context.Context, _ string) ([]netip.Addr, error) {
panic("unimplemented") panic("unimplemented")
}, },
} }
cacheNoFallback := &Resolver{} cacheNoFallback := &Resolver{Logf: t.Logf}
testCases := []struct { testCases := []struct {
name string name string

View File

@ -93,7 +93,8 @@
// - 53: 2023-01-18: client respects explicit Node.Expired + auto-sets based on Node.KeyExpiry // - 53: 2023-01-18: client respects explicit Node.Expired + auto-sets based on Node.KeyExpiry
// - 54: 2023-01-19: Node.Cap added, PeersChangedPatch.Cap, uses Node.Cap for ExitDNS before Hostinfo.Services fallback // - 54: 2023-01-19: Node.Cap added, PeersChangedPatch.Cap, uses Node.Cap for ExitDNS before Hostinfo.Services fallback
// - 55: 2023-01-23: start of c2n GET+POST /update handler // - 55: 2023-01-23: start of c2n GET+POST /update handler
const CurrentCapabilityVersion CapabilityVersion = 55 // - 56: 2023-01-24: Client understands CapabilityDebugTSDNSResolution
const CurrentCapabilityVersion CapabilityVersion = 56
type StableID string type StableID string
@ -1754,6 +1755,13 @@ type Oauth2Token struct {
// CapabilityWarnFunnelNoHTTPS indicates HTTPS has not been enabled for the tailnet. // CapabilityWarnFunnelNoHTTPS indicates HTTPS has not been enabled for the tailnet.
CapabilityWarnFunnelNoHTTPS = "https://tailscale.com/cap/warn-funnel-no-https" CapabilityWarnFunnelNoHTTPS = "https://tailscale.com/cap/warn-funnel-no-https"
// Debug logging capabilities
// CapabilityDebugTSDNSResolution enables verbose debug logging for DNS
// resolution for Tailscale-controlled domains (the control server, log
// server, DERP servers, etc.)
CapabilityDebugTSDNSResolution = "https://tailscale.com/cap/debug-ts-dns-resolution"
) )
const ( const (