net/dns/resolver: add clientmetrics for DNS

Fixes tailscale/corp#1811

Change-Id: I864d11e0332a177e8c5ff403591bff6fec548f5a
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
Brad Fitzpatrick 2021-11-26 14:43:38 -08:00 committed by Brad Fitzpatrick
parent 097602b3ca
commit 78b0bd2957
2 changed files with 102 additions and 2 deletions

View File

@ -385,6 +385,7 @@ func (f *forwarder) sendDoH(ctx context.Context, urlBase string, c *http.Client,
}
defer f.releaseDoHSem()
metricDNSFwdDoH.Add(1)
req, err := http.NewRequestWithContext(ctx, "POST", urlBase, bytes.NewReader(packet))
if err != nil {
return nil, err
@ -398,16 +399,23 @@ func (f *forwarder) sendDoH(ctx context.Context, urlBase string, c *http.Client,
hres, err := c.Do(req)
if err != nil {
metricDNSFwdDoHErrorTransport.Add(1)
return nil, err
}
defer hres.Body.Close()
if hres.StatusCode != 200 {
metricDNSFwdDoHErrorStatus.Add(1)
return nil, errors.New(hres.Status)
}
if ct := hres.Header.Get("Content-Type"); ct != dohType {
metricDNSFwdDoHErrorCT.Add(1)
return nil, fmt.Errorf("unexpected response Content-Type %q", ct)
}
return ioutil.ReadAll(hres.Body)
res, err := ioutil.ReadAll(hres.Body)
if err != nil {
metricDNSFwdDoHErrorBody.Add(1)
}
return res, err
}
// send sends packet to dst. It is best effort.
@ -415,12 +423,15 @@ func (f *forwarder) sendDoH(ctx context.Context, urlBase string, c *http.Client,
// send expects the reply to have the same txid as txidOut.
func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDelay) ([]byte, error) {
if strings.HasPrefix(rr.name.Addr, "http://") {
metricDNSFwdErrorType.Add(1)
return nil, fmt.Errorf("http:// resolvers not supported yet")
}
if strings.HasPrefix(rr.name.Addr, "https://") {
metricDNSFwdErrorType.Add(1)
return nil, fmt.Errorf("https:// resolvers not supported yet")
}
if strings.HasPrefix(rr.name.Addr, "tls://") {
metricDNSFwdErrorType.Add(1)
return nil, fmt.Errorf("tls:// resolvers not supported yet")
}
ipp, err := netaddr.ParseIPPort(rr.name.Addr)
@ -438,6 +449,7 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
f.logf("DoH error from %v: %v", ipp.IP(), err)
}
metricDNSFwdUDP.Add(1)
ln, err := f.packetListener(ipp.IP())
if err != nil {
return nil, err
@ -453,11 +465,13 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
defer fq.closeOnCtxDone.Remove(conn)
if _, err := conn.WriteTo(fq.packet, ipp.UDPAddr()); err != nil {
metricDNSFwdUDPErrorWrite.Add(1)
if err := ctx.Err(); err != nil {
return nil, err
}
return nil, err
}
metricDNSFwdUDPWrote.Add(1)
// The 1 extra byte is to detect packet truncation.
out := make([]byte, maxResponseBytes+1)
@ -469,6 +483,7 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
if packetWasTruncated(err) {
err = nil
} else {
metricDNSFwdUDPErrorRead.Add(1)
return nil, err
}
}
@ -482,12 +497,14 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
out = out[:n]
txid := getTxID(out)
if txid != fq.txid {
metricDNSFwdUDPErrorTxID.Add(1)
return nil, errors.New("txid doesn't match")
}
rcode := getRCode(out)
// don't forward transient errors back to the client when the server fails
if rcode == dns.RCodeServerFailure {
f.logf("recv: response code indicating server failure: %d", rcode)
metricDNSFwdUDPErrorServer.Add(1)
return nil, errors.New("response code indicates server issue")
}
@ -505,7 +522,7 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
}
clampEDNSSize(out, maxResponseBytes)
metricDNSFwdUDPSuccess.Add(1)
return out, nil
}
@ -566,8 +583,10 @@ func (f *forwarder) forward(query packet) error {
// If backupResolvers are specified, they're used in the case that no
// upstreams are available.
func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, responseChan chan<- packet, backupResolvers ...resolverAndDelay) error {
metricDNSFwd.Add(1)
domain, err := nameFromQuery(query.bs)
if err != nil {
metricDNSFwdErrorName.Add(1)
return err
}
@ -576,6 +595,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
// when browsing for LAN devices. But even when filtering this
// out, playing on Sonos still works.
if hasRDNSBonjourPrefix(domain) {
metricDNSFwdDropBonjour.Add(1)
return nil
}
@ -586,6 +606,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
resolvers = backupResolvers
}
if len(resolvers) == 0 {
metricDNSFwdErrorNoUpstream.Add(1)
return errNoUpstreams
}
@ -633,14 +654,18 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
case v := <-resc:
select {
case <-ctx.Done():
metricDNSFwdErrorContext.Add(1)
return ctx.Err()
case responseChan <- packet{v, query.addr}:
metricDNSFwdSuccess.Add(1)
return nil
}
case <-ctx.Done():
mu.Lock()
defer mu.Unlock()
metricDNSFwdErrorContext.Add(1)
if firstErr != nil {
metricDNSFwdErrorContextGotError.Add(1)
return firstErr
}
return ctx.Err()

View File

@ -24,6 +24,7 @@
"inet.af/netaddr"
"tailscale.com/types/dnstype"
"tailscale.com/types/logger"
"tailscale.com/util/clientmetric"
"tailscale.com/util/dnsname"
"tailscale.com/wgengine/monitor"
)
@ -273,13 +274,16 @@ func (r *Resolver) Close() {
// It takes ownership of the payload and does not block.
// If the queue is full, the request will be dropped and an error will be returned.
func (r *Resolver) EnqueueRequest(bs []byte, from netaddr.IPPort) error {
metricDNSQueryLocal.Add(1)
select {
case <-r.closed:
metricDNSQueryErrorClosed.Add(1)
return ErrClosed
default:
}
if n := atomic.AddInt32(&r.activeQueriesAtomic, 1); n > maxActiveQueries() {
atomic.AddInt32(&r.activeQueriesAtomic, -1)
metricDNSQueryErrorQueue.Add(1)
return errFullQueue
}
go r.handleQuery(packet{bs, from})
@ -303,6 +307,7 @@ func (r *Resolver) NextResponse() (packet []byte, to netaddr.IPPort, err error)
// via the peerapi's DoH server. This is only used when the local
// node is being an exit node.
func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from netaddr.IPPort) (res []byte, err error) {
metricDNSQueryForPeer.Add(1)
ch := make(chan packet, 1)
err = r.forwarder.forwardWithDestChan(ctx, packet{q, from}, ch)
@ -338,6 +343,7 @@ func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from ne
}
}
if err != nil {
metricDNSQueryForPeerError.Add(1)
return nil, err
}
select {
@ -357,8 +363,10 @@ func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from ne
// Returns dns.RCodeRefused to indicate that the local map is not
// authoritative for domain.
func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP, dns.RCode) {
metricDNSResolveLocal.Add(1)
// Reject .onion domains per RFC 7686.
if dnsname.HasSuffix(domain.WithoutTrailingDot(), ".onion") {
metricDNSResolveLocalErrorOnion.Add(1)
return netaddr.IP{}, dns.RCodeNameError
}
@ -372,6 +380,7 @@ func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP,
for _, suffix := range localDomains {
if suffix.Contains(domain) {
// We are authoritative for the queried domain.
metricDNSResolveLocalErrorMissing.Add(1)
return netaddr.IP{}, dns.RCodeNameError
}
}
@ -389,30 +398,37 @@ func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP,
case dns.TypeA:
for _, ip := range addrs {
if ip.Is4() {
metricDNSResolveLocalOKA.Add(1)
return ip, dns.RCodeSuccess
}
}
metricDNSResolveLocalNoA.Add(1)
return netaddr.IP{}, dns.RCodeSuccess
case dns.TypeAAAA:
for _, ip := range addrs {
if ip.Is6() {
metricDNSResolveLocalOKAAAA.Add(1)
return ip, dns.RCodeSuccess
}
}
metricDNSResolveLocalNoAAAA.Add(1)
return netaddr.IP{}, dns.RCodeSuccess
case dns.TypeALL:
// Answer with whatever we've got.
// It could be IPv4, IPv6, or a zero addr.
// TODO: Return all available resolutions (A and AAAA, if we have them).
if len(addrs) == 0 {
metricDNSResolveLocalNoAll.Add(1)
return netaddr.IP{}, dns.RCodeSuccess
}
metricDNSResolveLocalOKAll.Add(1)
return addrs[0], dns.RCodeSuccess
// Leave some some record types explicitly unimplemented.
// These types relate to recursive resolution or special
// DNS semantics and might be implemented in the future.
case dns.TypeNS, dns.TypeSOA, dns.TypeAXFR, dns.TypeHINFO:
metricDNSResolveNotImplType.Add(1)
return netaddr.IP{}, dns.RCodeNotImplemented
// For everything except for the few types above that are explicitly not implemented, return no records.
@ -422,6 +438,7 @@ func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP,
// dig -t TYPE9824 example.com
// and note that NOERROR is returned, despite that record type being made up.
default:
metricDNSResolveNoRecordType.Add(1)
// The name exists, but no records exist of the requested type.
return netaddr.IP{}, dns.RCodeSuccess
}
@ -753,6 +770,7 @@ func (r *Resolver) respondReverse(query []byte, name dnsname.FQDN, resp *respons
return nil, errNotOurName
}
metricDNSMagicDNSSuccessReverse.Add(1)
return marshalResponse(resp)
}
@ -769,8 +787,10 @@ func (r *Resolver) respond(query []byte) ([]byte, error) {
// We will not return this error: it is the sender's fault.
if err != nil {
if errors.Is(err, dns.ErrSectionDone) {
metricDNSErrorParseNoQ.Add(1)
r.logf("parseQuery(%02x): no DNS questions", query)
} else {
metricDNSErrorParseQuery.Add(1)
r.logf("parseQuery(%02x): %v", query, err)
}
resp := parser.response()
@ -780,6 +800,7 @@ func (r *Resolver) respond(query []byte) ([]byte, error) {
rawName := parser.Question.Name.Data[:parser.Question.Name.Length]
name, err := dnsname.ToFQDN(rawNameToLower(rawName))
if err != nil {
metricDNSErrorNotFQDN.Add(1)
// DNS packet unexpectedly contains an invalid FQDN.
resp := parser.response()
resp.Header.RCode = dns.RCodeFormatError
@ -803,3 +824,57 @@ func (r *Resolver) respond(query []byte) ([]byte, error) {
resp.IP = ip
return marshalResponse(resp)
}
var (
metricDNSQueryLocal = clientmetric.NewCounter("dns_query_local")
metricDNSQueryErrorClosed = clientmetric.NewCounter("dns_query_local_error_closed")
metricDNSQueryErrorQueue = clientmetric.NewCounter("dns_query_local_error_queue")
metricDNSErrorParseNoQ = clientmetric.NewCounter("dns_query_respond_error_no_question")
metricDNSErrorParseQuery = clientmetric.NewCounter("dns_query_respond_error_parse")
metricDNSErrorNotFQDN = clientmetric.NewCounter("dns_query_respond_error_not_fqdn")
metricDNSMagicDNSSuccessName = clientmetric.NewCounter("dns_query_magic_success_name")
metricDNSMagicDNSSuccessReverse = clientmetric.NewCounter("dns_query_magic_success_reverse")
metricDNSQueryForPeer = clientmetric.NewCounter("dns_query_peerapi")
metricDNSQueryForPeerError = clientmetric.NewCounter("dns_query_peerapi_error")
metricDNSFwd = clientmetric.NewCounter("dns_query_fwd")
metricDNSFwdDropBonjour = clientmetric.NewCounter("dns_query_fwd_drop_bonjour")
metricDNSFwdErrorName = clientmetric.NewCounter("dns_query_fwd_error_name")
metricDNSFwdErrorNoUpstream = clientmetric.NewCounter("dns_query_fwd_error_no_upstream")
metricDNSFwdSuccess = clientmetric.NewCounter("dns_query_fwd_success")
metricDNSFwdErrorContext = clientmetric.NewCounter("dns_query_fwd_error_context")
metricDNSFwdErrorContextGotError = clientmetric.NewCounter("dns_query_fwd_error_context_got_error")
metricDNSFwdErrorType = clientmetric.NewCounter("dns_query_fwd_error_type")
metricDNSFwdErrorParseAddr = clientmetric.NewCounter("dns_query_fwd_error_parse_addr")
metricDNSFwdUDP = clientmetric.NewCounter("dns_query_fwd_udp") // on entry
metricDNSFwdUDPWrote = clientmetric.NewCounter("dns_query_fwd_udp_wrote") // sent UDP packet
metricDNSFwdUDPErrorWrite = clientmetric.NewCounter("dns_query_fwd_udp_error_write")
metricDNSFwdUDPErrorServer = clientmetric.NewCounter("dns_query_fwd_udp_error_server")
metricDNSFwdUDPErrorTxID = clientmetric.NewCounter("dns_query_fwd_udp_error_txid")
metricDNSFwdUDPErrorRead = clientmetric.NewCounter("dns_query_fwd_udp_error_read")
metricDNSFwdUDPSuccess = clientmetric.NewCounter("dns_query_fwd_udp_success")
metricDNSFwdDoH = clientmetric.NewCounter("dns_query_fwd_doh")
metricDNSFwdDoHErrorStatus = clientmetric.NewCounter("dns_query_fwd_doh_error_status")
metricDNSFwdDoHErrorCT = clientmetric.NewCounter("dns_query_fwd_doh_error_content_type")
metricDNSFwdDoHErrorTransport = clientmetric.NewCounter("dns_query_fwd_doh_error_transport")
metricDNSFwdDoHErrorBody = clientmetric.NewCounter("dns_query_fwd_doh_error_body")
metricDNSResolveLocal = clientmetric.NewCounter("dns_resolve_local")
metricDNSResolveLocalErrorOnion = clientmetric.NewCounter("dns_resolve_local_error_onion")
metricDNSResolveLocalErrorMissing = clientmetric.NewCounter("dns_resolve_local_error_missing")
metricDNSResolveLocalErrorRefused = clientmetric.NewCounter("dns_resolve_local_error_refused")
metricDNSResolveLocalOKA = clientmetric.NewCounter("dns_resolve_local_ok_a")
metricDNSResolveLocalOKAAAA = clientmetric.NewCounter("dns_resolve_local_ok_aaaa")
metricDNSResolveLocalOKAll = clientmetric.NewCounter("dns_resolve_local_ok_all")
metricDNSResolveLocalNoA = clientmetric.NewCounter("dns_resolve_local_no_a")
metricDNSResolveLocalNoAAAA = clientmetric.NewCounter("dns_resolve_local_no_aaaa")
metricDNSResolveLocalNoAll = clientmetric.NewCounter("dns_resolve_local_no_all")
metricDNSResolveNotImplType = clientmetric.NewCounter("dns_resolve_local_not_impl_type")
metricDNSResolveNoRecordType = clientmetric.NewCounter("dns_resolve_local_no_record_type")
)