mirror of
https://github.com/tailscale/tailscale.git
synced 2025-01-05 14:57:49 +00:00
net/dns/resolver: add clientmetrics for DNS
Fixes tailscale/corp#1811
Change-Id: I864d11e0332a177e8c5ff403591bff6fec548f5a
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
parent
097602b3ca
commit
78b0bd2957
@ -385,6 +385,7 @@ func (f *forwarder) sendDoH(ctx context.Context, urlBase string, c *http.Client,
|
||||
}
|
||||
defer f.releaseDoHSem()
|
||||
|
||||
metricDNSFwdDoH.Add(1)
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", urlBase, bytes.NewReader(packet))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -398,16 +399,23 @@ func (f *forwarder) sendDoH(ctx context.Context, urlBase string, c *http.Client,
|
||||
|
||||
hres, err := c.Do(req)
|
||||
if err != nil {
|
||||
metricDNSFwdDoHErrorTransport.Add(1)
|
||||
return nil, err
|
||||
}
|
||||
defer hres.Body.Close()
|
||||
if hres.StatusCode != 200 {
|
||||
metricDNSFwdDoHErrorStatus.Add(1)
|
||||
return nil, errors.New(hres.Status)
|
||||
}
|
||||
if ct := hres.Header.Get("Content-Type"); ct != dohType {
|
||||
metricDNSFwdDoHErrorCT.Add(1)
|
||||
return nil, fmt.Errorf("unexpected response Content-Type %q", ct)
|
||||
}
|
||||
return ioutil.ReadAll(hres.Body)
|
||||
res, err := ioutil.ReadAll(hres.Body)
|
||||
if err != nil {
|
||||
metricDNSFwdDoHErrorBody.Add(1)
|
||||
}
|
||||
return res, err
|
||||
}
|
||||
|
||||
// send sends packet to dst. It is best effort.
|
||||
@ -415,12 +423,15 @@ func (f *forwarder) sendDoH(ctx context.Context, urlBase string, c *http.Client,
|
||||
// send expects the reply to have the same txid as fq.txid.
|
||||
func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDelay) ([]byte, error) {
|
||||
if strings.HasPrefix(rr.name.Addr, "http://") {
|
||||
metricDNSFwdErrorType.Add(1)
|
||||
return nil, fmt.Errorf("http:// resolvers not supported yet")
|
||||
}
|
||||
if strings.HasPrefix(rr.name.Addr, "https://") {
|
||||
metricDNSFwdErrorType.Add(1)
|
||||
return nil, fmt.Errorf("https:// resolvers not supported yet")
|
||||
}
|
||||
if strings.HasPrefix(rr.name.Addr, "tls://") {
|
||||
metricDNSFwdErrorType.Add(1)
|
||||
return nil, fmt.Errorf("tls:// resolvers not supported yet")
|
||||
}
|
||||
ipp, err := netaddr.ParseIPPort(rr.name.Addr)
|
||||
@ -438,6 +449,7 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
|
||||
f.logf("DoH error from %v: %v", ipp.IP(), err)
|
||||
}
|
||||
|
||||
metricDNSFwdUDP.Add(1)
|
||||
ln, err := f.packetListener(ipp.IP())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -453,11 +465,13 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
|
||||
defer fq.closeOnCtxDone.Remove(conn)
|
||||
|
||||
if _, err := conn.WriteTo(fq.packet, ipp.UDPAddr()); err != nil {
|
||||
metricDNSFwdUDPErrorWrite.Add(1)
|
||||
if err := ctx.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
metricDNSFwdUDPWrote.Add(1)
|
||||
|
||||
// The 1 extra byte is to detect packet truncation.
|
||||
out := make([]byte, maxResponseBytes+1)
|
||||
@ -469,6 +483,7 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
|
||||
if packetWasTruncated(err) {
|
||||
err = nil
|
||||
} else {
|
||||
metricDNSFwdUDPErrorRead.Add(1)
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
@ -482,12 +497,14 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
|
||||
out = out[:n]
|
||||
txid := getTxID(out)
|
||||
if txid != fq.txid {
|
||||
metricDNSFwdUDPErrorTxID.Add(1)
|
||||
return nil, errors.New("txid doesn't match")
|
||||
}
|
||||
rcode := getRCode(out)
|
||||
// don't forward transient errors back to the client when the server fails
|
||||
if rcode == dns.RCodeServerFailure {
|
||||
f.logf("recv: response code indicating server failure: %d", rcode)
|
||||
metricDNSFwdUDPErrorServer.Add(1)
|
||||
return nil, errors.New("response code indicates server issue")
|
||||
}
|
||||
|
||||
@ -505,7 +522,7 @@ func (f *forwarder) send(ctx context.Context, fq *forwardQuery, rr resolverAndDe
|
||||
}
|
||||
|
||||
clampEDNSSize(out, maxResponseBytes)
|
||||
|
||||
metricDNSFwdUDPSuccess.Add(1)
|
||||
return out, nil
|
||||
}
|
||||
|
||||
@ -566,8 +583,10 @@ func (f *forwarder) forward(query packet) error {
|
||||
// If backupResolvers are specified, they're used in the case that no
|
||||
// upstreams are available.
|
||||
func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, responseChan chan<- packet, backupResolvers ...resolverAndDelay) error {
|
||||
metricDNSFwd.Add(1)
|
||||
domain, err := nameFromQuery(query.bs)
|
||||
if err != nil {
|
||||
metricDNSFwdErrorName.Add(1)
|
||||
return err
|
||||
}
|
||||
|
||||
@ -576,6 +595,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
|
||||
// when browsing for LAN devices. But even when filtering this
|
||||
// out, playing on Sonos still works.
|
||||
if hasRDNSBonjourPrefix(domain) {
|
||||
metricDNSFwdDropBonjour.Add(1)
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -586,6 +606,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
|
||||
resolvers = backupResolvers
|
||||
}
|
||||
if len(resolvers) == 0 {
|
||||
metricDNSFwdErrorNoUpstream.Add(1)
|
||||
return errNoUpstreams
|
||||
}
|
||||
|
||||
@ -633,14 +654,18 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
|
||||
case v := <-resc:
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
metricDNSFwdErrorContext.Add(1)
|
||||
return ctx.Err()
|
||||
case responseChan <- packet{v, query.addr}:
|
||||
metricDNSFwdSuccess.Add(1)
|
||||
return nil
|
||||
}
|
||||
case <-ctx.Done():
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
metricDNSFwdErrorContext.Add(1)
|
||||
if firstErr != nil {
|
||||
metricDNSFwdErrorContextGotError.Add(1)
|
||||
return firstErr
|
||||
}
|
||||
return ctx.Err()
|
||||
|
@ -24,6 +24,7 @@
|
||||
"inet.af/netaddr"
|
||||
"tailscale.com/types/dnstype"
|
||||
"tailscale.com/types/logger"
|
||||
"tailscale.com/util/clientmetric"
|
||||
"tailscale.com/util/dnsname"
|
||||
"tailscale.com/wgengine/monitor"
|
||||
)
|
||||
@ -273,13 +274,16 @@ func (r *Resolver) Close() {
|
||||
// It takes ownership of the payload and does not block.
|
||||
// If the queue is full, the request will be dropped and an error will be returned.
|
||||
func (r *Resolver) EnqueueRequest(bs []byte, from netaddr.IPPort) error {
|
||||
metricDNSQueryLocal.Add(1)
|
||||
select {
|
||||
case <-r.closed:
|
||||
metricDNSQueryErrorClosed.Add(1)
|
||||
return ErrClosed
|
||||
default:
|
||||
}
|
||||
if n := atomic.AddInt32(&r.activeQueriesAtomic, 1); n > maxActiveQueries() {
|
||||
atomic.AddInt32(&r.activeQueriesAtomic, -1)
|
||||
metricDNSQueryErrorQueue.Add(1)
|
||||
return errFullQueue
|
||||
}
|
||||
go r.handleQuery(packet{bs, from})
|
||||
@ -303,6 +307,7 @@ func (r *Resolver) NextResponse() (packet []byte, to netaddr.IPPort, err error)
|
||||
// via the peerapi's DoH server. This is only used when the local
|
||||
// node is being an exit node.
|
||||
func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from netaddr.IPPort) (res []byte, err error) {
|
||||
metricDNSQueryForPeer.Add(1)
|
||||
ch := make(chan packet, 1)
|
||||
|
||||
err = r.forwarder.forwardWithDestChan(ctx, packet{q, from}, ch)
|
||||
@ -338,6 +343,7 @@ func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from ne
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
metricDNSQueryForPeerError.Add(1)
|
||||
return nil, err
|
||||
}
|
||||
select {
|
||||
@ -357,8 +363,10 @@ func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from ne
|
||||
// Returns dns.RCodeRefused to indicate that the local map is not
|
||||
// authoritative for domain.
|
||||
func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP, dns.RCode) {
|
||||
metricDNSResolveLocal.Add(1)
|
||||
// Reject .onion domains per RFC 7686.
|
||||
if dnsname.HasSuffix(domain.WithoutTrailingDot(), ".onion") {
|
||||
metricDNSResolveLocalErrorOnion.Add(1)
|
||||
return netaddr.IP{}, dns.RCodeNameError
|
||||
}
|
||||
|
||||
@ -372,6 +380,7 @@ func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP,
|
||||
for _, suffix := range localDomains {
|
||||
if suffix.Contains(domain) {
|
||||
// We are authoritative for the queried domain.
|
||||
metricDNSResolveLocalErrorMissing.Add(1)
|
||||
return netaddr.IP{}, dns.RCodeNameError
|
||||
}
|
||||
}
|
||||
@ -389,30 +398,37 @@ func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP,
|
||||
case dns.TypeA:
|
||||
for _, ip := range addrs {
|
||||
if ip.Is4() {
|
||||
metricDNSResolveLocalOKA.Add(1)
|
||||
return ip, dns.RCodeSuccess
|
||||
}
|
||||
}
|
||||
metricDNSResolveLocalNoA.Add(1)
|
||||
return netaddr.IP{}, dns.RCodeSuccess
|
||||
case dns.TypeAAAA:
|
||||
for _, ip := range addrs {
|
||||
if ip.Is6() {
|
||||
metricDNSResolveLocalOKAAAA.Add(1)
|
||||
return ip, dns.RCodeSuccess
|
||||
}
|
||||
}
|
||||
metricDNSResolveLocalNoAAAA.Add(1)
|
||||
return netaddr.IP{}, dns.RCodeSuccess
|
||||
case dns.TypeALL:
|
||||
// Answer with whatever we've got.
|
||||
// It could be IPv4, IPv6, or a zero addr.
|
||||
// TODO: Return all available resolutions (A and AAAA, if we have them).
|
||||
if len(addrs) == 0 {
|
||||
metricDNSResolveLocalNoAll.Add(1)
|
||||
return netaddr.IP{}, dns.RCodeSuccess
|
||||
}
|
||||
metricDNSResolveLocalOKAll.Add(1)
|
||||
return addrs[0], dns.RCodeSuccess
|
||||
|
||||
// Leave some record types explicitly unimplemented.
|
||||
// These types relate to recursive resolution or special
|
||||
// DNS semantics and might be implemented in the future.
|
||||
case dns.TypeNS, dns.TypeSOA, dns.TypeAXFR, dns.TypeHINFO:
|
||||
metricDNSResolveNotImplType.Add(1)
|
||||
return netaddr.IP{}, dns.RCodeNotImplemented
|
||||
|
||||
// For everything except for the few types above that are explicitly not implemented, return no records.
|
||||
@ -422,6 +438,7 @@ func (r *Resolver) resolveLocal(domain dnsname.FQDN, typ dns.Type) (netaddr.IP,
|
||||
// dig -t TYPE9824 example.com
|
||||
// and note that NOERROR is returned, despite that record type being made up.
|
||||
default:
|
||||
metricDNSResolveNoRecordType.Add(1)
|
||||
// The name exists, but no records exist of the requested type.
|
||||
return netaddr.IP{}, dns.RCodeSuccess
|
||||
}
|
||||
@ -753,6 +770,7 @@ func (r *Resolver) respondReverse(query []byte, name dnsname.FQDN, resp *respons
|
||||
return nil, errNotOurName
|
||||
}
|
||||
|
||||
metricDNSMagicDNSSuccessReverse.Add(1)
|
||||
return marshalResponse(resp)
|
||||
}
|
||||
|
||||
@ -769,8 +787,10 @@ func (r *Resolver) respond(query []byte) ([]byte, error) {
|
||||
// We will not return this error: it is the sender's fault.
|
||||
if err != nil {
|
||||
if errors.Is(err, dns.ErrSectionDone) {
|
||||
metricDNSErrorParseNoQ.Add(1)
|
||||
r.logf("parseQuery(%02x): no DNS questions", query)
|
||||
} else {
|
||||
metricDNSErrorParseQuery.Add(1)
|
||||
r.logf("parseQuery(%02x): %v", query, err)
|
||||
}
|
||||
resp := parser.response()
|
||||
@ -780,6 +800,7 @@ func (r *Resolver) respond(query []byte) ([]byte, error) {
|
||||
rawName := parser.Question.Name.Data[:parser.Question.Name.Length]
|
||||
name, err := dnsname.ToFQDN(rawNameToLower(rawName))
|
||||
if err != nil {
|
||||
metricDNSErrorNotFQDN.Add(1)
|
||||
// DNS packet unexpectedly contains an invalid FQDN.
|
||||
resp := parser.response()
|
||||
resp.Header.RCode = dns.RCodeFormatError
|
||||
@ -803,3 +824,57 @@ func (r *Resolver) respond(query []byte) ([]byte, error) {
|
||||
resp.IP = ip
|
||||
return marshalResponse(resp)
|
||||
}
|
||||
|
||||
// Client metric counters for the DNS resolver and forwarder.
// Each counter is created with clientmetric.NewCounter under the given
// name and is bumped with Add(1) at the point described next to it.
var (
|
||||
metricDNSQueryLocal = clientmetric.NewCounter("dns_query_local") // every EnqueueRequest call
|
||||
metricDNSQueryErrorClosed = clientmetric.NewCounter("dns_query_local_error_closed") // EnqueueRequest: resolver already closed (ErrClosed)
|
||||
metricDNSQueryErrorQueue = clientmetric.NewCounter("dns_query_local_error_queue") // EnqueueRequest: active queries exceed maxActiveQueries (errFullQueue)
|
||||
|
||||
metricDNSErrorParseNoQ = clientmetric.NewCounter("dns_query_respond_error_no_question") // respond: parse error is dns.ErrSectionDone (no DNS questions)
|
||||
metricDNSErrorParseQuery = clientmetric.NewCounter("dns_query_respond_error_parse") // respond: any other query parse error
|
||||
metricDNSErrorNotFQDN = clientmetric.NewCounter("dns_query_respond_error_not_fqdn") // respond: question name is not a valid FQDN
|
||||
|
||||
metricDNSMagicDNSSuccessName = clientmetric.NewCounter("dns_query_magic_success_name") // NOTE(review): call site not visible here; presumably the forward-lookup success path in respond — confirm
|
||||
metricDNSMagicDNSSuccessReverse = clientmetric.NewCounter("dns_query_magic_success_reverse") // respondReverse: just before the response is marshaled
|
||||
|
||||
metricDNSQueryForPeer = clientmetric.NewCounter("dns_query_peerapi") // every HandleExitNodeDNSQuery call
|
||||
metricDNSQueryForPeerError = clientmetric.NewCounter("dns_query_peerapi_error") // HandleExitNodeDNSQuery: returning a non-nil error
|
||||
|
||||
metricDNSFwd = clientmetric.NewCounter("dns_query_fwd") // every forwardWithDestChan call
|
||||
metricDNSFwdDropBonjour = clientmetric.NewCounter("dns_query_fwd_drop_bonjour") // query name has the reverse-DNS Bonjour prefix; silently dropped
|
||||
metricDNSFwdErrorName = clientmetric.NewCounter("dns_query_fwd_error_name") // nameFromQuery failed
|
||||
metricDNSFwdErrorNoUpstream = clientmetric.NewCounter("dns_query_fwd_error_no_upstream") // no resolvers available (errNoUpstreams)
|
||||
metricDNSFwdSuccess = clientmetric.NewCounter("dns_query_fwd_success") // response handed to responseChan
|
||||
metricDNSFwdErrorContext = clientmetric.NewCounter("dns_query_fwd_error_context") // ctx was done before a response could be delivered
|
||||
metricDNSFwdErrorContextGotError = clientmetric.NewCounter("dns_query_fwd_error_context_got_error") // ctx done and an upstream error (firstErr) had been recorded
|
||||
|
||||
metricDNSFwdErrorType = clientmetric.NewCounter("dns_query_fwd_error_type") // send: resolver address uses an unsupported http://, https:// or tls:// scheme
|
||||
metricDNSFwdErrorParseAddr = clientmetric.NewCounter("dns_query_fwd_error_parse_addr") // NOTE(review): call site not visible here; presumably the ParseIPPort failure in send — confirm
|
||||
|
||||
metricDNSFwdUDP = clientmetric.NewCounter("dns_query_fwd_udp") // on entry
|
||||
metricDNSFwdUDPWrote = clientmetric.NewCounter("dns_query_fwd_udp_wrote") // sent UDP packet
|
||||
metricDNSFwdUDPErrorWrite = clientmetric.NewCounter("dns_query_fwd_udp_error_write") // conn.WriteTo failed
|
||||
metricDNSFwdUDPErrorServer = clientmetric.NewCounter("dns_query_fwd_udp_error_server") // reply rcode is dns.RCodeServerFailure
|
||||
metricDNSFwdUDPErrorTxID = clientmetric.NewCounter("dns_query_fwd_udp_error_txid") // reply txid differs from fq.txid
|
||||
metricDNSFwdUDPErrorRead = clientmetric.NewCounter("dns_query_fwd_udp_error_read") // read error other than packet truncation
|
||||
metricDNSFwdUDPSuccess = clientmetric.NewCounter("dns_query_fwd_udp_success") // send is returning a reply
|
||||
|
||||
metricDNSFwdDoH = clientmetric.NewCounter("dns_query_fwd_doh") // sendDoH: about to build the HTTP request
|
||||
metricDNSFwdDoHErrorStatus = clientmetric.NewCounter("dns_query_fwd_doh_error_status") // HTTP status code other than 200
|
||||
metricDNSFwdDoHErrorCT = clientmetric.NewCounter("dns_query_fwd_doh_error_content_type") // response Content-Type differs from dohType
|
||||
metricDNSFwdDoHErrorTransport = clientmetric.NewCounter("dns_query_fwd_doh_error_transport") // c.Do returned an error
|
||||
metricDNSFwdDoHErrorBody = clientmetric.NewCounter("dns_query_fwd_doh_error_body") // reading the response body failed
|
||||
|
||||
metricDNSResolveLocal = clientmetric.NewCounter("dns_resolve_local") // every resolveLocal call
|
||||
metricDNSResolveLocalErrorOnion = clientmetric.NewCounter("dns_resolve_local_error_onion") // .onion name rejected with RCodeNameError
|
||||
metricDNSResolveLocalErrorMissing = clientmetric.NewCounter("dns_resolve_local_error_missing") // name falls under a local domain suffix; answered with RCodeNameError
|
||||
metricDNSResolveLocalErrorRefused = clientmetric.NewCounter("dns_resolve_local_error_refused") // NOTE(review): call site not visible here; presumably the RCodeRefused return in resolveLocal — confirm
|
||||
metricDNSResolveLocalOKA = clientmetric.NewCounter("dns_resolve_local_ok_a") // TypeA query answered with an IPv4 address
|
||||
metricDNSResolveLocalOKAAAA = clientmetric.NewCounter("dns_resolve_local_ok_aaaa") // TypeAAAA query answered with an IPv6 address
|
||||
metricDNSResolveLocalOKAll = clientmetric.NewCounter("dns_resolve_local_ok_all") // TypeALL query answered with addrs[0]
|
||||
metricDNSResolveLocalNoA = clientmetric.NewCounter("dns_resolve_local_no_a") // TypeA query, name has no IPv4 address
|
||||
metricDNSResolveLocalNoAAAA = clientmetric.NewCounter("dns_resolve_local_no_aaaa") // TypeAAAA query, name has no IPv6 address
|
||||
metricDNSResolveLocalNoAll = clientmetric.NewCounter("dns_resolve_local_no_all") // TypeALL query, name has no addresses
|
||||
metricDNSResolveNotImplType = clientmetric.NewCounter("dns_resolve_local_not_impl_type") // NS, SOA, AXFR or HINFO query (RCodeNotImplemented)
|
||||
metricDNSResolveNoRecordType = clientmetric.NewCounter("dns_resolve_local_no_record_type") // any other record type; answered with success and no records
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user