net/dns: retry forwarder requests over TCP

We weren't correctly retrying truncated requests to an upstream DNS
server with TCP. Instead, we'd return a truncated request to the user,
even if the user was querying us over TCP and thus able to handle a
large response.

Also, add an envknob and controlknob to allow users/us to disable this
behaviour if it turns out to be buggy ( DNS ).

Updates #9264

Signed-off-by: Andrew Dunham <andrew@du.nham.ca>
Change-Id: Ifb04b563839a9614c0ba03e9c564e8924c1a2bfd
This commit is contained in:
Andrew Dunham
2023-09-07 16:27:50 -04:00
parent 098d110746
commit 530aaa52f1
13 changed files with 448 additions and 49 deletions

View File

@@ -23,6 +23,7 @@ import (
"time"
dns "golang.org/x/net/dns/dnsmessage"
"tailscale.com/control/controlknobs"
"tailscale.com/envknob"
"tailscale.com/net/dns/resolvconffile"
"tailscale.com/net/netaddr"
@@ -53,8 +54,9 @@ var (
)
type packet struct {
bs []byte
addr netip.AddrPort // src for a request, dst for a response
bs []byte
family string // either "tcp" or "udp"
addr netip.AddrPort // src for a request, dst for a response
}
// Config is a resolver configuration.
@@ -206,7 +208,7 @@ type ForwardLinkSelector interface {
// New returns a new resolver.
// netMon optionally specifies a network monitor to use for socket rebinding.
func New(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer) *Resolver {
func New(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *Resolver {
if dialer == nil {
panic("nil Dialer")
}
@@ -218,7 +220,7 @@ func New(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector,
ipToHost: map[netip.Addr]dnsname.FQDN{},
dialer: dialer,
}
r.forwarder = newForwarder(r.logf, netMon, linkSel, dialer)
r.forwarder = newForwarder(r.logf, netMon, linkSel, dialer, knobs)
return r
}
@@ -266,7 +268,7 @@ func (r *Resolver) Close() {
// bound on per-query resource usage.
const dnsQueryTimeout = 10 * time.Second
func (r *Resolver) Query(ctx context.Context, bs []byte, from netip.AddrPort) ([]byte, error) {
func (r *Resolver) Query(ctx context.Context, bs []byte, family string, from netip.AddrPort) ([]byte, error) {
metricDNSQueryLocal.Add(1)
select {
case <-r.closed:
@@ -281,7 +283,7 @@ func (r *Resolver) Query(ctx context.Context, bs []byte, from netip.AddrPort) ([
ctx, cancel := context.WithTimeout(ctx, dnsQueryTimeout)
defer close(responses)
defer cancel()
err = r.forwarder.forwardWithDestChan(ctx, packet{bs, from}, responses)
err = r.forwarder.forwardWithDestChan(ctx, packet{bs, family, from}, responses)
if err != nil {
select {
// Best effort: use any error response sent by forwardWithDestChan.
@@ -369,7 +371,7 @@ func (r *Resolver) HandleExitNodeDNSQuery(ctx context.Context, q []byte, from ne
}}
}
err = r.forwarder.forwardWithDestChan(ctx, packet{q, from}, ch, resolvers...)
err = r.forwarder.forwardWithDestChan(ctx, packet{q, "tcp", from}, ch, resolvers...)
if err != nil {
metricDNSExitProxyErrorForward.Add(1)
return nil, err
@@ -1306,6 +1308,14 @@ var (
metricDNSFwdUDPErrorRead = clientmetric.NewCounter("dns_query_fwd_udp_error_read")
metricDNSFwdUDPSuccess = clientmetric.NewCounter("dns_query_fwd_udp_success")
metricDNSFwdTCP = clientmetric.NewCounter("dns_query_fwd_tcp") // on entry
metricDNSFwdTCPWrote = clientmetric.NewCounter("dns_query_fwd_tcp_wrote") // sent TCP packet
metricDNSFwdTCPErrorWrite = clientmetric.NewCounter("dns_query_fwd_tcp_error_write")
metricDNSFwdTCPErrorServer = clientmetric.NewCounter("dns_query_fwd_tcp_error_server")
metricDNSFwdTCPErrorTxID = clientmetric.NewCounter("dns_query_fwd_tcp_error_txid")
metricDNSFwdTCPErrorRead = clientmetric.NewCounter("dns_query_fwd_tcp_error_read")
metricDNSFwdTCPSuccess = clientmetric.NewCounter("dns_query_fwd_tcp_success")
metricDNSFwdDoH = clientmetric.NewCounter("dns_query_fwd_doh")
metricDNSFwdDoHErrorStatus = clientmetric.NewCounter("dns_query_fwd_doh_error_status")
metricDNSFwdDoHErrorCT = clientmetric.NewCounter("dns_query_fwd_doh_error_content_type")