From 78b0bd29570d1131c01c217ed83085b12f636ce3 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Fri, 26 Nov 2021 14:43:38 -0800 Subject: [PATCH] net/dns/resolver: add clientmetrics for DNS Fixes tailscale/corp#1811 Change-Id: I864d11e0332a177e8c5ff403591bff6fec548f5a Signed-off-by: Brad Fitzpatrick --- net/dns/resolver/forwarder.go | 29 +++++++++++++- net/dns/resolver/tsdns.go | 75 +++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/net/dns/resolver/forwarder.go b/net/dns/resolver/forwarder.go index 98c6e1e36..7776fc5da 100644 --- a/net/dns/resolver/forwarder.go +++ b/net/dns/resolver/forwarder.go @@ -385,6 +385,7 @@ func (f *forwarder) sendDoH(ctx context.Context, urlBase string, c *http.Client, } defer f.releaseDoHSem() + metricDNSFwdDoH.Add(1) req, err := http.NewRequestWithContext(ctx, "POST", urlBase, bytes.NewReader(packet)) if err != nil { return nil, err @@ -398,16 +399,23 @@ func (f *forwarder) sendDoH(ctx context.Context, urlBase string, c *http.Client, hres, err := c.Do(req) if err != nil { + metricDNSFwdDoHErrorTransport.Add(1) return nil, err } defer hres.Body.Close() if hres.StatusCode != 200 { + metricDNSFwdDoHErrorStatus.Add(1) return nil, errors.New(hres.Status) } if ct := hres.Header.Get("Content-Type"); ct != dohType { + metricDNSFwdDoHErrorCT.Add(1) return nil, fmt.Errorf("unexpected response Content-Type %q", ct) } - return ioutil.ReadAll(hres.Body) + res, err := ioutil.ReadAll(hres.Body) + if err != nil { + metricDNSFwdDoHErrorBody.Add(1) + } + return res, err } // send sends packet to dst. It is best effort. @@ -415,12 +423,15 @@ func (f *forwarder) sendDoH(ctx context.Context, urlBase string, c *http.Client, // send expects the reply to have the same txid as txidOut. func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDelay) ([]byte, error) { if strings.HasPrefix(rr.name.Addr, "http://") { + metricDNSFwdErrorType.Add(1) return nil, fmt.Errorf("http:// resolvers not supported yet") } if strings.HasPrefix(rr.name.Addr, "https://") { + metricDNSFwdErrorType.Add(1) return nil, fmt.Errorf("https:// resolvers not supported yet") } if strings.HasPrefix(rr.name.Addr, "tls://") { + metricDNSFwdErrorType.Add(1) return nil, fmt.Errorf("tls:// resolvers not supported yet") } ipp, err := netaddr.ParseIPPort(rr.name.Addr) @@ -438,6 +449,7 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe f.logf("DoH error from %v: %v", ipp.IP(), err) } + metricDNSFwdUDP.Add(1) ln, err := f.packetListener(ipp.IP()) if err != nil { return nil, err @@ -453,11 +465,13 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe defer fq.closeOnCtxDone.Remove(conn) if _, err := conn.WriteTo(fq.packet, ipp.UDPAddr()); err != nil { + metricDNSFwdUDPErrorWrite.Add(1) if err := ctx.Err(); err != nil { return nil, err } return nil, err } + metricDNSFwdUDPWrote.Add(1) // The 1 extra byte is to detect packet truncation. out := make([]byte, maxResponseBytes+1) @@ -469,6 +483,7 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe if packetWasTruncated(err) { err = nil } else { + metricDNSFwdUDPErrorRead.Add(1) return nil, err } } @@ -482,12 +497,14 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe out = out[:n] txid := getTxID(out) if txid != fq.txid { + metricDNSFwdUDPErrorTxID.Add(1) return nil, errors.New("txid doesn't match") } rcode := getRCode(out) // don't forward transient errors back to the client when the server fails if rcode == dns.RCodeServerFailure { f.logf("recv: response code indicating server failure: %d", rcode) + metricDNSFwdUDPErrorServer.Add(1) return nil, errors.New("response code indicates server issue") } @@ -505,7 +522,7 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe } clampEDNSSize(out, maxResponseBytes) - + metricDNSFwdUDPSuccess.Add(1) return out, nil } @@ -566,8 +583,10 @@ func (f *forwarder) forward(query packet) error { // If backupResolvers are specified, they're used in the case that no // upstreams are available. func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, responseChan chan<- packet, backupResolvers ...resolverAndDelay) error { + metricDNSFwd.Add(1) domain, err := nameFromQuery(query.bs) if err != nil { + metricDNSFwdErrorName.Add(1) return err } @@ -576,6 +595,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo // when browsing for LAN devices. But even when filtering this // out, playing on Sonos still works. if hasRDNSBonjourPrefix(domain) { + metricDNSFwdDropBonjour.Add(1) return nil } @@ -586,6 +606,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo resolvers = backupResolvers } if len(resolvers) == 0 { + metricDNSFwdErrorNoUpstream.Add(1) return errNoUpstreams } @@ -633,14 +654,18 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo case v := <-resc: select { case <-ctx.Done(): + metricDNSFwdErrorContext.Add(1) return ctx.Err() case responseChan <- packet{v, query.addr}: + metricDNSFwdSuccess.Add(1) return nil } case <-ctx.Done(): mu.Lock() defer mu.Unlock() + metricDNSFwdErrorContext.Add(1) if firstErr != nil { + metricDNSFwdErrorContextGotError.Add(1) return firstErr } return ctx.Err() diff --git a/net/dns/resolver/tsdns.go b/net/dns/resolver/tsdns.go index f68247af3..ee67995e4 100644 --- a/net/dns/resolver/tsdns.go +++ b/net/dns/resolver/tsdns.go @@ -24,6 +24,7 @@ "inet.af/netaddr" "tailscale.com/types/dnstype" "tailscale.com/types/logger" + "tailscale.com/util/clientmetric" "tailscale.com/util/dnsname" "tailscale.com/wgengine/monitor" ) @@ -273,13 +274,16 @@ func (r *Resolver) Close() { // It takes ownership of the payload and does not block. // If the queue is full, the request will be dropped and an error will be returned. func (r *Resolver) EnqueueRequest(bs []byte, from netaddr.IPPort) error { + metricDNSQueryLocal.Add(1) select { case <-r.closed: + metricDNSQueryErrorClosed.Add(1) return ErrClosed default: } if n := atomic.AddInt32(&r.activeQueriesAtomic, 1); n > maxActiveQueries() { atomic.AddInt32(&r.activeQueriesAtomic, -1) + metricDNSQueryErrorQueue.Add(1) return errFullQueue } go r.handleQuery(packet{bs, from}) @@ -303,6 +307,7 @@ func (r *Resolver) NextResponse() (packet []byte, to netaddr.IPPort, err error) // via the peerapi's DoH server. This is only used when the local // node is being an exit node. func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from netaddr.IPPort) (res []byte, err error) { + metricDNSQueryForPeer.Add(1) ch := make(chan packet, 1) err = r.forwarder.forwardWithDestChan(ctx, packet{q, from}, ch) @@ -338,6 +343,7 @@ func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from ne } } if err != nil { + metricDNSQueryForPeerError.Add(1) return nil, err } select { @@ -357,8 +363,10 @@ func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from ne // Returns dns.RCodeRefused to indicate that the local map is not // authoritative for domain. func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP, dns.RCode) { + metricDNSResolveLocal.Add(1) // Reject .onion domains per RFC 7686. if dnsname.HasSuffix(domain.WithoutTrailingDot(), ".onion") { + metricDNSResolveLocalErrorOnion.Add(1) return netaddr.IP{}, dns.RCodeNameError } @@ -372,6 +380,7 @@ func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP, for _, suffix := range localDomains { if suffix.Contains(domain) { // We are authoritative for the queried domain. + metricDNSResolveLocalErrorMissing.Add(1) return netaddr.IP{}, dns.RCodeNameError } } @@ -389,30 +398,37 @@ func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP, case dns.TypeA: for _, ip := range addrs { if ip.Is4() { + metricDNSResolveLocalOKA.Add(1) return ip, dns.RCodeSuccess } } + metricDNSResolveLocalNoA.Add(1) return netaddr.IP{}, dns.RCodeSuccess case dns.TypeAAAA: for _, ip := range addrs { if ip.Is6() { + metricDNSResolveLocalOKAAAA.Add(1) return ip, dns.RCodeSuccess } } + metricDNSResolveLocalNoAAAA.Add(1) return netaddr.IP{}, dns.RCodeSuccess case dns.TypeALL: // Answer with whatever we've got. // It could be IPv4, IPv6, or a zero addr. // TODO: Return all available resolutions (A and AAAA, if we have them). if len(addrs) == 0 { + metricDNSResolveLocalNoAll.Add(1) return netaddr.IP{}, dns.RCodeSuccess } + metricDNSResolveLocalOKAll.Add(1) return addrs[0], dns.RCodeSuccess // Leave some some record types explicitly unimplemented. // These types relate to recursive resolution or special // DNS semantics and might be implemented in the future. case dns.TypeNS, dns.TypeSOA, dns.TypeAXFR, dns.TypeHINFO: + metricDNSResolveNotImplType.Add(1) return netaddr.IP{}, dns.RCodeNotImplemented // For everything except for the few types above that are explicitly not implemented, return no records. @@ -422,6 +438,7 @@ func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP, // dig -t TYPE9824 example.com // and note that NOERROR is returned, despite that record type being made up. default: + metricDNSResolveNoRecordType.Add(1) // The name exists, but no records exist of the requested type. return netaddr.IP{}, dns.RCodeSuccess } @@ -753,6 +770,7 @@ func (r *Resolver) respondReverse(query []byte, name dnsname.FQDN, resp *respons return nil, errNotOurName } + metricDNSMagicDNSSuccessReverse.Add(1) return marshalResponse(resp) } @@ -769,8 +787,10 @@ func (r *Resolver) respond(query []byte) ([]byte, error) { // We will not return this error: it is the sender's fault. if err != nil { if errors.Is(err, dns.ErrSectionDone) { + metricDNSErrorParseNoQ.Add(1) r.logf("parseQuery(%02x): no DNS questions", query) } else { + metricDNSErrorParseQuery.Add(1) r.logf("parseQuery(%02x): %v", query, err) } resp := parser.response() @@ -780,6 +800,7 @@ func (r *Resolver) respond(query []byte) ([]byte, error) { rawName := parser.Question.Name.Data[:parser.Question.Name.Length] name, err := dnsname.ToFQDN(rawNameToLower(rawName)) if err != nil { + metricDNSErrorNotFQDN.Add(1) // DNS packet unexpectedly contains an invalid FQDN. resp := parser.response() resp.Header.RCode = dns.RCodeFormatError @@ -803,3 +824,57 @@ func (r *Resolver) respond(query []byte) ([]byte, error) { resp.IP = ip return marshalResponse(resp) } + +var ( + metricDNSQueryLocal = clientmetric.NewCounter("dns_query_local") + metricDNSQueryErrorClosed = clientmetric.NewCounter("dns_query_local_error_closed") + metricDNSQueryErrorQueue = clientmetric.NewCounter("dns_query_local_error_queue") + + metricDNSErrorParseNoQ = clientmetric.NewCounter("dns_query_respond_error_no_question") + metricDNSErrorParseQuery = clientmetric.NewCounter("dns_query_respond_error_parse") + metricDNSErrorNotFQDN = clientmetric.NewCounter("dns_query_respond_error_not_fqdn") + + metricDNSMagicDNSSuccessName = clientmetric.NewCounter("dns_query_magic_success_name") + metricDNSMagicDNSSuccessReverse = clientmetric.NewCounter("dns_query_magic_success_reverse") + + metricDNSQueryForPeer = clientmetric.NewCounter("dns_query_peerapi") + metricDNSQueryForPeerError = clientmetric.NewCounter("dns_query_peerapi_error") + + metricDNSFwd = clientmetric.NewCounter("dns_query_fwd") + metricDNSFwdDropBonjour = clientmetric.NewCounter("dns_query_fwd_drop_bonjour") + metricDNSFwdErrorName = clientmetric.NewCounter("dns_query_fwd_error_name") + metricDNSFwdErrorNoUpstream = clientmetric.NewCounter("dns_query_fwd_error_no_upstream") + metricDNSFwdSuccess = clientmetric.NewCounter("dns_query_fwd_success") + metricDNSFwdErrorContext = clientmetric.NewCounter("dns_query_fwd_error_context") + metricDNSFwdErrorContextGotError = clientmetric.NewCounter("dns_query_fwd_error_context_got_error") + + metricDNSFwdErrorType = clientmetric.NewCounter("dns_query_fwd_error_type") + metricDNSFwdErrorParseAddr = clientmetric.NewCounter("dns_query_fwd_error_parse_addr") + + metricDNSFwdUDP = clientmetric.NewCounter("dns_query_fwd_udp") // on entry + metricDNSFwdUDPWrote = clientmetric.NewCounter("dns_query_fwd_udp_wrote") // sent UDP packet + metricDNSFwdUDPErrorWrite = clientmetric.NewCounter("dns_query_fwd_udp_error_write") + metricDNSFwdUDPErrorServer = clientmetric.NewCounter("dns_query_fwd_udp_error_server") + metricDNSFwdUDPErrorTxID = clientmetric.NewCounter("dns_query_fwd_udp_error_txid") + metricDNSFwdUDPErrorRead = clientmetric.NewCounter("dns_query_fwd_udp_error_read") + metricDNSFwdUDPSuccess = clientmetric.NewCounter("dns_query_fwd_udp_success") + + metricDNSFwdDoH = clientmetric.NewCounter("dns_query_fwd_doh") + metricDNSFwdDoHErrorStatus = clientmetric.NewCounter("dns_query_fwd_doh_error_status") + metricDNSFwdDoHErrorCT = clientmetric.NewCounter("dns_query_fwd_doh_error_content_type") + metricDNSFwdDoHErrorTransport = clientmetric.NewCounter("dns_query_fwd_doh_error_transport") + metricDNSFwdDoHErrorBody = clientmetric.NewCounter("dns_query_fwd_doh_error_body") + + metricDNSResolveLocal = clientmetric.NewCounter("dns_resolve_local") + metricDNSResolveLocalErrorOnion = clientmetric.NewCounter("dns_resolve_local_error_onion") + metricDNSResolveLocalErrorMissing = clientmetric.NewCounter("dns_resolve_local_error_missing") + metricDNSResolveLocalErrorRefused = clientmetric.NewCounter("dns_resolve_local_error_refused") + metricDNSResolveLocalOKA = clientmetric.NewCounter("dns_resolve_local_ok_a") + metricDNSResolveLocalOKAAAA = clientmetric.NewCounter("dns_resolve_local_ok_aaaa") + metricDNSResolveLocalOKAll = clientmetric.NewCounter("dns_resolve_local_ok_all") + metricDNSResolveLocalNoA = clientmetric.NewCounter("dns_resolve_local_no_a") + metricDNSResolveLocalNoAAAA = clientmetric.NewCounter("dns_resolve_local_no_aaaa") + metricDNSResolveLocalNoAll = clientmetric.NewCounter("dns_resolve_local_no_all") + metricDNSResolveNotImplType = clientmetric.NewCounter("dns_resolve_local_not_impl_type") + metricDNSResolveNoRecordType = clientmetric.NewCounter("dns_resolve_local_no_record_type") +)