diff --git a/net/dns/osconfig.go b/net/dns/osconfig.go index 866760a8e..12bde1f31 100644 --- a/net/dns/osconfig.go +++ b/net/dns/osconfig.go @@ -17,6 +17,7 @@ type OSConfigurator interface { // If cfg is the zero value, all Tailscale-related DNS // configuration is removed. // SetDNS must not be called after Close. + // SetDNS takes ownership of cfg. SetDNS(cfg OSConfig) error // SupportsSplitDNS reports whether the configurator is capable of // installing a resolver only for specific DNS suffixes. If false, diff --git a/net/dns/resolved.go b/net/dns/resolved.go index b731c7594..5642cfd49 100644 --- a/net/dns/resolved.go +++ b/net/dns/resolved.go @@ -13,8 +13,9 @@ import ( "fmt" "net" "strings" - "sync" + "time" + "tailscale.com/logtail/backoff" "github.com/godbus/dbus/v5" "golang.org/x/sys/unix" "inet.af/netaddr" @@ -74,76 +75,173 @@ type resolvedLinkDomain struct { RoutingOnly bool } +// changeRequest tracks latest OSConfig and related error responses to update. +type changeRequest struct { + config OSConfig // configs OSConfigs, one per each SetDNS call + res chan <- error // response channel +} + // resolvedManager is an OSConfigurator which uses the systemd-resolved DBus API. type resolvedManager struct { + ctx context.Context + cancel func() // terminate the context, for close + logf logger.Logf ifidx int - cancelSyncer context.CancelFunc // run to shut down syncer goroutine - syncerDone chan struct{} // closed when syncer is stopped - resolved dbus.BusObject - - mu sync.Mutex // guards RPCs made by syncLocked, and the following - config OSConfig // last SetDNS config + configCR chan changeRequest // tracks OSConfigs changes and error responses } func newResolvedManager(logf logger.Logf, interfaceName string) (*resolvedManager, error) { - conn, err := dbus.SystemBus() - if err != nil { - return nil, err - } - iface, err := net.InterfaceByName(interfaceName) if err != nil { return nil, err } ctx, cancel := context.WithCancel(context.Background()) + logf = logger.WithPrefix(logf, "dns: ") - ret := &resolvedManager{ - logf: logf, - ifidx: iface.Index, - cancelSyncer: cancel, - syncerDone: make(chan struct{}), - resolved: conn.Object(dbusResolvedObject, dbus.ObjectPath(dbusResolvedPath)), + mgr := &resolvedManager{ + ctx: ctx, + cancel: cancel, + + logf: logf, + ifidx: iface.Index, + + configCR: make(chan changeRequest), } - signals := make(chan *dbus.Signal, 16) - go ret.resync(ctx, signals) - // Only receive the DBus signals we need to resync our config on - // resolved restart. Failure to set filters isn't a fatal error, - // we'll just receive all broadcast signals and have to ignore - // them on our end. - if err := conn.AddMatchSignal(dbus.WithMatchObjectPath(dbusPath), dbus.WithMatchInterface(dbusInterface), dbus.WithMatchMember(dbusOwnerSignal), dbus.WithMatchArg(0, dbusResolvedObject)); err != nil { - logf("[v1] Setting DBus signal filter failed: %v", err) - } - conn.Signal(signals) - return ret, nil + + go mgr.run(ctx) + + return mgr, nil } func (m *resolvedManager) SetDNS(config OSConfig) error { - m.mu.Lock() - defer m.mu.Unlock() + errc := make(chan error, 1) + defer close(errc) - m.config = config - return m.syncLocked(context.TODO()) // would be nice to plumb context through from SetDNS + select { + case <-m.ctx.Done(): + return m.ctx.Err() + case m.configCR <- changeRequest{config, errc}: + } + + select { + case <-m.ctx.Done(): + return m.ctx.Err() + case err := <-errc: + if err != nil { + m.logf("failed to configure resolved: %v", err) + } + return err + } } -func (m *resolvedManager) resync(ctx context.Context, signals chan *dbus.Signal) { - defer close(m.syncerDone) +func (m *resolvedManager) run(ctx context.Context) { + var ( + conn *dbus.Conn + signals chan *dbus.Signal + rManager dbus.BusObject // rManager is the Resolved DBus connection + ) + bo := backoff.NewBackoff("resolved-dbus", m.logf, 30*time.Second) + needsReconnect := make(chan bool, 1) + defer func() { + if conn != nil { + conn.Close() + } + }() + + // Reconnect the systemBus if disconnected. + reconnect := func() error { + var err error + signals = make(chan *dbus.Signal, 16) + conn, err = dbus.SystemBus() + if err != nil { + m.logf("dbus connection error: %v", err) + } else { + m.logf("[v1] dbus connected") + } + + if err != nil { + // Backoff increases time between reconnect attempts. + go func() { + bo.BackOff(ctx, err) + needsReconnect <- true + }() + return err + } + + rManager = conn.Object(dbusResolvedObject, dbus.ObjectPath(dbusResolvedPath)) + + // Only receive the DBus signals we need to resync our config on + // resolved restart. Failure to set filters isn't a fatal error, + // we'll just receive all broadcast signals and have to ignore + // them on our end. + if err = conn.AddMatchSignal(dbus.WithMatchObjectPath(dbusPath), dbus.WithMatchInterface(dbusInterface), dbus.WithMatchMember(dbusOwnerSignal), dbus.WithMatchArg(0, dbusResolvedObject)); err != nil { + m.logf("[v1] Setting DBus signal filter failed: %v", err) + } + conn.Signal(signals) + + // Reset backoff and SetNSOSHealth after successful on reconnect. + bo.BackOff(ctx, nil) + health.SetDNSOSHealth(nil) + return nil + } + + // Create initial systemBus connection. + reconnect() + + lastConfig := OSConfig{} + for { select { case <-ctx.Done(): + if rManager == nil { + return + } + // RevertLink resets all per-interface settings on systemd-resolved to defaults. + // When ctx goes away systemd-resolved auto reverts. + // Keeping for potential use in future refactor. + if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".RevertLink", 0, m.ifidx); call.Err != nil { + m.logf("[v1] RevertLink: %w", call.Err) + return + } return - case signal := <-signals: + case configCR := <-m.configCR: + // Track and update sync with latest config change. + lastConfig = configCR.config + + if rManager == nil { + configCR.res <- fmt.Errorf("resolved DBus does not have a connection") + continue + } + err := m.setConfigOverDBus(ctx, rManager, configCR.config) + configCR.res <- err + case <-needsReconnect: + if err := reconnect(); err != nil { + m.logf("[v1] SystemBus reconnect error %T", err) + } + continue + case signal, ok := <-signals: + // If signal ends and is nil then program tries to reconnect. + if !ok { + if err := reconnect(); err != nil { + m.logf("[v1] SystemBus reconnect error %T", err) + } + continue + } // In theory the signal was filtered by DBus, but if // AddMatchSignal in the constructor failed, we may be // getting other spam. if signal.Path != dbusPath || signal.Name != dbusInterface+"."+dbusOwnerSignal { continue } + if lastConfig.IsZero() { + continue + } // signal.Body is a []any of 3 strings: bus name, previous owner, new owner. if len(signal.Body) != 3 { - m.logf("[unexpectected] DBus NameOwnerChanged len(Body) = %d, want 3") + m.logf("[unexpected] DBus NameOwnerChanged len(Body) = %d, want 3") } if name, ok := signal.Body[0].(string); !ok || name != dbusResolvedObject { continue @@ -160,13 +258,11 @@ func (m *resolvedManager) resync(ctx context.Context, signals chan *dbus.Signal) // The resolved bus name has a new owner, meaning resolved // restarted. Reprogram current config. m.logf("systemd-resolved restarted, syncing DNS config") - m.mu.Lock() - err := m.syncLocked(ctx) + err := m.setConfigOverDBus(ctx, rManager, lastConfig) // Set health while holding the lock, because this will // graciously serialize the resync's health outcome with a // concurrent SetDNS call. health.SetDNSOSHealth(err) - m.mu.Unlock() if err != nil { m.logf("failed to configure systemd-resolved: %v", err) } @@ -174,12 +270,13 @@ func (m *resolvedManager) resync(ctx context.Context, signals chan *dbus.Signal) } } -func (m *resolvedManager) syncLocked(ctx context.Context) error { +// setConfigOverDBus updates resolved DBus config and is only called from the run goroutine. +func (m *resolvedManager) setConfigOverDBus(ctx context.Context, rManager dbus.BusObject, config OSConfig) error { ctx, cancel := context.WithTimeout(ctx, reconfigTimeout) defer cancel() - var linkNameservers = make([]resolvedLinkNameserver, len(m.config.Nameservers)) - for i, server := range m.config.Nameservers { + var linkNameservers = make([]resolvedLinkNameserver, len(config.Nameservers)) + for i, server := range config.Nameservers { ip := server.As16() if server.Is4() { linkNameservers[i] = resolvedLinkNameserver{ @@ -193,18 +290,16 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error { } } } - - err := m.resolved.CallWithContext( + err := rManager.CallWithContext( ctx, dbusResolvedInterface+".SetLinkDNS", 0, m.ifidx, linkNameservers, ).Store() if err != nil { return fmt.Errorf("setLinkDNS: %w", err) } - - linkDomains := make([]resolvedLinkDomain, 0, len(m.config.SearchDomains)+len(m.config.MatchDomains)) + linkDomains := make([]resolvedLinkDomain, 0, len(config.SearchDomains)+len(config.MatchDomains)) seenDomains := map[dnsname.FQDN]bool{} - for _, domain := range m.config.SearchDomains { + for _, domain := range config.SearchDomains { if seenDomains[domain] { continue } @@ -214,7 +309,7 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error { RoutingOnly: false, }) } - for _, domain := range m.config.MatchDomains { + for _, domain := range config.MatchDomains { if seenDomains[domain] { // Search domains act as both search and match in // resolved, so it's correct to skip. @@ -226,7 +321,7 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error { RoutingOnly: true, }) } - if len(m.config.MatchDomains) == 0 && len(m.config.Nameservers) > 0 { + if len(config.MatchDomains) == 0 && len(config.Nameservers) > 0 { // Caller requested full DNS interception, install a // routing-only root domain. linkDomains = append(linkDomains, resolvedLinkDomain{ @@ -235,14 +330,14 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error { }) } - err = m.resolved.CallWithContext( + err = rManager.CallWithContext( ctx, dbusResolvedInterface+".SetLinkDomains", 0, m.ifidx, linkDomains, ).Store() if err != nil && err.Error() == "Argument list too long" { // TODO: better error match // Issue 3188: older systemd-resolved had argument length limits. // Trim out the *.arpa. entries and try again. - err = m.resolved.CallWithContext( + err = rManager.CallWithContext( ctx, dbusResolvedInterface+".SetLinkDomains", 0, m.ifidx, linkDomainsWithoutReverseDNS(linkDomains), ).Store() @@ -251,7 +346,7 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error { return fmt.Errorf("setLinkDomains: %w", err) } - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDefaultRoute", 0, m.ifidx, len(m.config.MatchDomains) == 0); call.Err != nil { + if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDefaultRoute", 0, m.ifidx, len(config.MatchDomains) == 0); call.Err != nil { if dbusErr, ok := call.Err.(dbus.Error); ok && dbusErr.Name == dbus.ErrMsgUnknownMethod.Name { // on some older systems like Kubuntu 18.04.6 with systemd 237 method SetLinkDefaultRoute is absent, // but otherwise it's working good @@ -266,29 +361,28 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error { // or something). // Disable LLMNR, we don't do multicast. - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkLLMNR", 0, m.ifidx, "no"); call.Err != nil { + if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkLLMNR", 0, m.ifidx, "no"); call.Err != nil { m.logf("[v1] failed to disable LLMNR: %v", call.Err) } // Disable mdns. - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkMulticastDNS", 0, m.ifidx, "no"); call.Err != nil { + if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkMulticastDNS", 0, m.ifidx, "no"); call.Err != nil { m.logf("[v1] failed to disable mdns: %v", call.Err) } // We don't support dnssec consistently right now, force it off to // avoid partial failures when we split DNS internally. - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDNSSEC", 0, m.ifidx, "no"); call.Err != nil { + if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDNSSEC", 0, m.ifidx, "no"); call.Err != nil { m.logf("[v1] failed to disable DNSSEC: %v", call.Err) } - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDNSOverTLS", 0, m.ifidx, "no"); call.Err != nil { + if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDNSOverTLS", 0, m.ifidx, "no"); call.Err != nil { m.logf("[v1] failed to disable DoT: %v", call.Err) } - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".FlushCaches", 0); call.Err != nil { + if call := rManager.CallWithContext(ctx, dbusResolvedInterface+".FlushCaches", 0); call.Err != nil { m.logf("failed to flush resolved DNS cache: %v", call.Err) } - return nil } @@ -301,20 +395,7 @@ func (m *resolvedManager) GetBaseConfig() (OSConfig, error) { } func (m *resolvedManager) Close() error { - m.cancelSyncer() - - ctx, cancel := context.WithTimeout(context.Background(), reconfigTimeout) - defer cancel() - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".RevertLink", 0, m.ifidx); call.Err != nil { - return fmt.Errorf("RevertLink: %w", call.Err) - } - - select { - case <-m.syncerDone: - case <-ctx.Done(): - m.logf("timeout in systemd-resolved syncer shutdown") - } - + m.cancel() // stops the 'run' method goroutine return nil }