From 19b0c8a0248830c8325f340439f00fd75e7ffdc5 Mon Sep 17 00:00:00 2001 From: Jonathan Nobels Date: Mon, 29 Jul 2024 13:48:46 -0400 Subject: [PATCH] net/dns, health: raise health warning for failing forwarded DNS queries (#12888) updates tailscale/corp#21823 Misconfigured, broken, or blocked DNS will often present as "internet is broken'" to the end user. This plumbs the health tracker into the dns manager and forwarder and adds a health warning with a 5 second delay that is raised on failures in the forwarder and lowered on successes. Signed-off-by: Jonathan Nobels --- health/args.go | 4 ++++ net/dns/manager.go | 2 +- net/dns/manager_tcp_test.go | 5 +++-- net/dns/resolver/forwarder.go | 32 +++++++++++++++++++++++++++++- net/dns/resolver/forwarder_test.go | 3 ++- net/dns/resolver/tsdns.go | 11 ++++++++-- net/dns/resolver/tsdns_test.go | 4 +++- 7 files changed, 53 insertions(+), 8 deletions(-) diff --git a/health/args.go b/health/args.go index ced42bba8..01a75aa2d 100644 --- a/health/args.go +++ b/health/args.go @@ -32,4 +32,8 @@ // ArgServerName provides a Warnable with the hostname of a server involved in the unhealthy state. ArgServerName Arg = "server-name" + + // ArgServerName provides a Warnable with comma delimited list of the hostname of the servers involved in the unhealthy state. + // If no nameservers were available to query, this will be an empty string. + ArgDNSServers Arg = "dns-servers" ) diff --git a/net/dns/manager.go b/net/dns/manager.go index eee2d5a7d..818748ac4 100644 --- a/net/dns/manager.go +++ b/net/dns/manager.go @@ -82,7 +82,7 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker, m := &Manager{ logf: logf, - resolver: resolver.New(logf, linkSel, dialer, knobs), + resolver: resolver.New(logf, linkSel, dialer, health, knobs), os: oscfg, health: health, knobs: knobs, diff --git a/net/dns/manager_tcp_test.go b/net/dns/manager_tcp_test.go index 2982c6129..f4c42791e 100644 --- a/net/dns/manager_tcp_test.go +++ b/net/dns/manager_tcp_test.go @@ -15,6 +15,7 @@ "github.com/google/go-cmp/cmp" dns "golang.org/x/net/dns/dnsmessage" + "tailscale.com/health" "tailscale.com/net/netmon" "tailscale.com/net/tsdial" "tailscale.com/tstest" @@ -88,7 +89,7 @@ func TestDNSOverTCP(t *testing.T) { SearchDomains: fqdns("coffee.shop"), }, } - m := NewManager(t.Logf, &f, nil, tsdial.NewDialer(netmon.NewStatic()), nil, nil, "") + m := NewManager(t.Logf, &f, new(health.Tracker), tsdial.NewDialer(netmon.NewStatic()), nil, nil, "") m.resolver.TestOnlySetHook(f.SetResolver) m.Set(Config{ Hosts: hosts( @@ -173,7 +174,7 @@ func TestDNSOverTCP_TooLarge(t *testing.T) { SearchDomains: fqdns("coffee.shop"), }, } - m := NewManager(log, &f, nil, tsdial.NewDialer(netmon.NewStatic()), nil, nil, "") + m := NewManager(log, &f, new(health.Tracker), tsdial.NewDialer(netmon.NewStatic()), nil, nil, "") m.resolver.TestOnlySetHook(f.SetResolver) m.Set(Config{ Hosts: hosts("andrew.ts.com.", "1.2.3.4"), diff --git a/net/dns/resolver/forwarder.go b/net/dns/resolver/forwarder.go index ca3227aab..e217e0785 100644 --- a/net/dns/resolver/forwarder.go +++ b/net/dns/resolver/forwarder.go @@ -23,6 +23,7 @@ dns "golang.org/x/net/dns/dnsmessage" "tailscale.com/control/controlknobs" "tailscale.com/envknob" + "tailscale.com/health" "tailscale.com/net/dns/publicdns" "tailscale.com/net/dnscache" "tailscale.com/net/neterror" @@ -164,6 +165,23 @@ func clampEDNSSize(packet []byte, maxSize uint16) { binary.BigEndian.PutUint16(opt[3:5], maxSize) } +// dnsForwarderFailing should be raised when the forwarder is unable to reach the +// upstream resolvers. This is a high severity warning as it results in "no internet". +// This warning must be cleared when the forwarder is working again. +// +// We allow for 5 second grace period to ensure this is not raised for spurious errors +// under the assumption that DNS queries are relatively frequent and a subsequent +// successful query will clear any one-off errors. +var dnsForwarderFailing = health.Register(&health.Warnable{ + Code: "dns-forward-failing", + Title: "DNS unavailable", + Severity: health.SeverityHigh, + DependsOn: []*health.Warnable{health.NetworkStatusWarnable}, + Text: health.StaticMessage("Tailscale can't reach the configured DNS servers. Internet connectivity may be affected."), + ImpactsConnectivity: true, + TimeToVisible: 5 * time.Second, +}) + type route struct { Suffix dnsname.FQDN Resolvers []resolverAndDelay @@ -188,6 +206,7 @@ type forwarder struct { netMon *netmon.Monitor // always non-nil linkSel ForwardLinkSelector // TODO(bradfitz): remove this when tsdial.Dialer absorbs it dialer *tsdial.Dialer + health *health.Tracker // always non-nil controlKnobs *controlknobs.Knobs // or nil @@ -219,7 +238,7 @@ type forwarder struct { missingUpstreamRecovery func() } -func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *forwarder { +func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *forwarder { if netMon == nil { panic("nil netMon") } @@ -228,6 +247,7 @@ func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkS netMon: netMon, linkSel: linkSel, dialer: dialer, + health: health, controlKnobs: knobs, missingUpstreamRecovery: func() {}, } @@ -887,6 +907,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo resolvers = f.resolvers(domain) if len(resolvers) == 0 { metricDNSFwdErrorNoUpstream.Add(1) + f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: ""}) f.logf("no upstream resolvers set, returning SERVFAIL") // Attempt to recompile the DNS configuration @@ -909,6 +930,8 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo case responseChan <- res: return nil } + } else { + f.health.SetHealthy(dnsForwarderFailing) } } @@ -960,6 +983,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo return fmt.Errorf("waiting to send response: %w", ctx.Err()) case responseChan <- packet{v, query.family, query.addr}: metricDNSFwdSuccess.Add(1) + f.health.SetHealthy(dnsForwarderFailing) return nil } case err := <-errc: @@ -979,6 +1003,11 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo case <-ctx.Done(): metricDNSFwdErrorContext.Add(1) metricDNSFwdErrorContextGotError.Add(1) + var resolverAddrs []string + for _, rr := range resolvers { + resolverAddrs = append(resolverAddrs, rr.name.Addr) + } + f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: strings.Join(resolverAddrs, ",")}) case responseChan <- res: } } @@ -999,6 +1028,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo for _, rr := range resolvers { resolverAddrs = append(resolverAddrs, rr.name.Addr) } + f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: strings.Join(resolverAddrs, ",")}) return fmt.Errorf("waiting for response or error from %v: %w", resolverAddrs, ctx.Err()) } } diff --git a/net/dns/resolver/forwarder_test.go b/net/dns/resolver/forwarder_test.go index e723af620..398989840 100644 --- a/net/dns/resolver/forwarder_test.go +++ b/net/dns/resolver/forwarder_test.go @@ -24,6 +24,7 @@ dns "golang.org/x/net/dns/dnsmessage" "tailscale.com/control/controlknobs" "tailscale.com/envknob" + "tailscale.com/health" "tailscale.com/net/netmon" "tailscale.com/net/tsdial" "tailscale.com/types/dnstype" @@ -457,7 +458,7 @@ func runTestQuery(tb testing.TB, port uint16, request []byte, modify func(*forwa var dialer tsdial.Dialer dialer.SetNetMon(netMon) - fwd := newForwarder(tb.Logf, netMon, nil, &dialer, nil) + fwd := newForwarder(tb.Logf, netMon, nil, &dialer, new(health.Tracker), nil) if modify != nil { modify(fwd) } diff --git a/net/dns/resolver/tsdns.go b/net/dns/resolver/tsdns.go index a3f3d7010..90e447020 100644 --- a/net/dns/resolver/tsdns.go +++ b/net/dns/resolver/tsdns.go @@ -25,6 +25,7 @@ dns "golang.org/x/net/dns/dnsmessage" "tailscale.com/control/controlknobs" "tailscale.com/envknob" + "tailscale.com/health" "tailscale.com/net/dns/resolvconffile" "tailscale.com/net/netaddr" "tailscale.com/net/netmon" @@ -202,6 +203,7 @@ type Resolver struct { logf logger.Logf netMon *netmon.Monitor // non-nil dialer *tsdial.Dialer // non-nil + health *health.Tracker // non-nil saveConfigForTests func(cfg Config) // used in tests to capture resolver config // forwarder forwards requests to upstream nameservers. forwarder *forwarder @@ -224,10 +226,14 @@ type ForwardLinkSelector interface { } // New returns a new resolver. -func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *Resolver { +// dialer and health must be non-nil. +func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *Resolver { if dialer == nil { panic("nil Dialer") } + if health == nil { + panic("nil health") + } netMon := dialer.NetMon() if netMon == nil { logf("nil netMon") @@ -239,8 +245,9 @@ func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, k hostToIP: map[dnsname.FQDN][]netip.Addr{}, ipToHost: map[netip.Addr]dnsname.FQDN{}, dialer: dialer, + health: health, } - r.forwarder = newForwarder(r.logf, netMon, linkSel, dialer, knobs) + r.forwarder = newForwarder(r.logf, netMon, linkSel, dialer, health, knobs) return r } diff --git a/net/dns/resolver/tsdns_test.go b/net/dns/resolver/tsdns_test.go index e1477e342..e2c4750b5 100644 --- a/net/dns/resolver/tsdns_test.go +++ b/net/dns/resolver/tsdns_test.go @@ -23,6 +23,7 @@ miekdns "github.com/miekg/dns" dns "golang.org/x/net/dns/dnsmessage" + "tailscale.com/health" "tailscale.com/net/netaddr" "tailscale.com/net/netmon" "tailscale.com/net/tsdial" @@ -354,6 +355,7 @@ func newResolver(t testing.TB) *Resolver { return New(t.Logf, nil, // no link selector tsdial.NewDialer(netmon.NewStatic()), + new(health.Tracker), nil, // no control knobs ) } @@ -1068,7 +1070,7 @@ func TestForwardLinkSelection(t *testing.T) { return "special" } return "" - }), new(tsdial.Dialer), nil /* no control knobs */) + }), new(tsdial.Dialer), new(health.Tracker), nil /* no control knobs */) // Test non-special IP. if got, err := fwd.packetListener(netip.Addr{}); err != nil {