diff --git a/net/dns/resolved.go b/net/dns/resolved.go index b731c7594..40a2aaea2 100644 --- a/net/dns/resolved.go +++ b/net/dns/resolved.go @@ -14,6 +14,8 @@ "net" "strings" "sync" + "tailscale.com/logtail/backoff" + "time" "github.com/godbus/dbus/v5" "golang.org/x/sys/unix" @@ -79,102 +81,106 @@ type resolvedManager struct { logf logger.Logf ifidx int - cancelSyncer context.CancelFunc // run to shut down syncer goroutine - syncerDone chan struct{} // closed when syncer is stopped - resolved dbus.BusObject - mu sync.Mutex // guards RPCs made by syncLocked, and the following config OSConfig // last SetDNS config } func newResolvedManager(logf logger.Logf, interfaceName string) (*resolvedManager, error) { - conn, err := dbus.SystemBus() - if err != nil { - return nil, err - } - iface, err := net.InterfaceByName(interfaceName) if err != nil { return nil, err } - ctx, cancel := context.WithCancel(context.Background()) + return &resolvedManager{ + logf: logf, + ifidx: iface.Index, + }, nil +} - ret := &resolvedManager{ - logf: logf, - ifidx: iface.Index, - cancelSyncer: cancel, - syncerDone: make(chan struct{}), - resolved: conn.Object(dbusResolvedObject, dbus.ObjectPath(dbusResolvedPath)), +func (m *resolvedManager) SetDNS(config OSConfig) error { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + var err error + bo := backoff.NewBackoff("resolvedSetDNS", m.logf, 30*time.Second) + m.config = config + + for ctx.Err() == nil { + err = m.trySet(ctx) + if err == nil { + bo.BackOff(ctx, err) + } } + + return err +} + +func (m *resolvedManager) trySet(ctx context.Context) error { + + conn, err := dbus.SystemBus() + if err != nil { + return nil + } + r1Manager := conn.Object(dbusResolvedObject, dbus.ObjectPath(dbusResolvedPath)) signals := make(chan *dbus.Signal, 16) - go ret.resync(ctx, signals) + // Only receive the DBus signals we need to resync our config on // resolved restart. Failure to set filters isn't a fatal error, // we'll just receive all broadcast signals and have to ignore // them on our end. if err := conn.AddMatchSignal(dbus.WithMatchObjectPath(dbusPath), dbus.WithMatchInterface(dbusInterface), dbus.WithMatchMember(dbusOwnerSignal), dbus.WithMatchArg(0, dbusResolvedObject)); err != nil { - logf("[v1] Setting DBus signal filter failed: %v", err) + m.logf("[v1] Setting DBus signal filter failed: %v", err) } conn.Signal(signals) - return ret, nil -} -func (m *resolvedManager) SetDNS(config OSConfig) error { - m.mu.Lock() - defer m.mu.Unlock() - - m.config = config - return m.syncLocked(context.TODO()) // would be nice to plumb context through from SetDNS -} - -func (m *resolvedManager) resync(ctx context.Context, signals chan *dbus.Signal) { - defer close(m.syncerDone) - for { - select { - case <-ctx.Done(): - return - case signal := <-signals: - // In theory the signal was filtered by DBus, but if - // AddMatchSignal in the constructor failed, we may be - // getting other spam. - if signal.Path != dbusPath || signal.Name != dbusInterface+"."+dbusOwnerSignal { - continue - } - // signal.Body is a []any of 3 strings: bus name, previous owner, new owner. - if len(signal.Body) != 3 { - m.logf("[unexpectected] DBus NameOwnerChanged len(Body) = %d, want 3") - } - if name, ok := signal.Body[0].(string); !ok || name != dbusResolvedObject { - continue - } - newOwner, ok := signal.Body[2].(string) - if !ok { - m.logf("[unexpected] DBus NameOwnerChanged.new_owner is a %T, not a string", signal.Body[2]) - } - if newOwner == "" { - // systemd-resolved left the bus, no current owner, - // nothing to do. - continue - } - // The resolved bus name has a new owner, meaning resolved - // restarted. Reprogram current config. - m.logf("systemd-resolved restarted, syncing DNS config") - m.mu.Lock() - err := m.syncLocked(ctx) - // Set health while holding the lock, because this will - // graciously serialize the resync's health outcome with a - // concurrent SetDNS call. - health.SetDNSOSHealth(err) - m.mu.Unlock() - if err != nil { - m.logf("failed to configure systemd-resolved: %v", err) - } + for signal := range signals { + if signal == nil { + return fmt.Errorf("Empty signal returned.") + } + // In theory the signal was filtered by DBus, but if + // AddMatchSignal in the constructor failed, we may be + // getting other spam. + if signal.Path != dbusPath || signal.Name != dbusInterface+"."+dbusOwnerSignal { + continue + } + // signal.Body is a []any of 3 strings: bus name, previous owner, new owner. + if len(signal.Body) != 3 { + m.logf("[unexpectected] DBus NameOwnerChanged len(Body) = %d, want 3") + } + if name, ok := signal.Body[0].(string); !ok || name != dbusResolvedObject { + continue + } + newOwner, ok := signal.Body[2].(string) + if !ok { + m.logf("[unexpected] DBus NameOwnerChanged.new_owner is a %T, not a string", signal.Body[2]) + } + if newOwner == "" { + // systemd-resolved left the bus, no current owner, + // nothing to do. + continue + } + // The resolved bus name has a new owner, meaning resolved + // restarted. Reprogram current config. + m.logf("systemd-resolved restarted, syncing DNS config") + m.mu.Lock() + err := m.syncLocked(ctx, r1Manager) + // Set health while holding the lock, because this will + // graciously serialize the resync's health outcome with a + // concurrent SetDNS call. + health.SetDNSOSHealth(err) + m.mu.Unlock() + if err != nil { + m.logf("failed to configure systemd-resolved: %v", err) } } + // RevertLink on connection if signals are interrupted in order to create a new connection. + if call := r1Manager.CallWithContext(ctx, dbusResolvedInterface+".RevertLink", 0, m.ifidx); call.Err != nil { + return fmt.Errorf("RevertLink: %w", call.Err) + } + return nil } -func (m *resolvedManager) syncLocked(ctx context.Context) error { +func (m *resolvedManager) syncLocked(ctx context.Context, r1Manager dbus.BusObject) error { ctx, cancel := context.WithTimeout(ctx, reconfigTimeout) defer cancel() @@ -194,7 +200,7 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error { } } - err := m.resolved.CallWithContext( + err := r1Manager.CallWithContext( ctx, dbusResolvedInterface+".SetLinkDNS", 0, m.ifidx, linkNameservers, ).Store() @@ -235,14 +241,14 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error { }) } - err = m.resolved.CallWithContext( + err = r1Manager.CallWithContext( ctx, dbusResolvedInterface+".SetLinkDomains", 0, m.ifidx, linkDomains, ).Store() if err != nil && err.Error() == "Argument list too long" { // TODO: better error match // Issue 3188: older systemd-resolved had argument length limits. // Trim out the *.arpa. entries and try again. - err = m.resolved.CallWithContext( + err = r1Manager.CallWithContext( ctx, dbusResolvedInterface+".SetLinkDomains", 0, m.ifidx, linkDomainsWithoutReverseDNS(linkDomains), ).Store() @@ -251,7 +257,7 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error { return fmt.Errorf("setLinkDomains: %w", err) } - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDefaultRoute", 0, m.ifidx, len(m.config.MatchDomains) == 0); call.Err != nil { + if call := r1Manager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDefaultRoute", 0, m.ifidx, len(m.config.MatchDomains) == 0); call.Err != nil { if dbusErr, ok := call.Err.(dbus.Error); ok && dbusErr.Name == dbus.ErrMsgUnknownMethod.Name { // on some older systems like Kubuntu 18.04.6 with systemd 237 method SetLinkDefaultRoute is absent, // but otherwise it's working good @@ -266,26 +272,26 @@ func (m *resolvedManager) syncLocked(ctx context.Context) error { // or something). // Disable LLMNR, we don't do multicast. - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkLLMNR", 0, m.ifidx, "no"); call.Err != nil { + if call := r1Manager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkLLMNR", 0, m.ifidx, "no"); call.Err != nil { m.logf("[v1] failed to disable LLMNR: %v", call.Err) } // Disable mdns. - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkMulticastDNS", 0, m.ifidx, "no"); call.Err != nil { + if call := r1Manager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkMulticastDNS", 0, m.ifidx, "no"); call.Err != nil { m.logf("[v1] failed to disable mdns: %v", call.Err) } // We don't support dnssec consistently right now, force it off to // avoid partial failures when we split DNS internally. - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDNSSEC", 0, m.ifidx, "no"); call.Err != nil { + if call := r1Manager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDNSSEC", 0, m.ifidx, "no"); call.Err != nil { m.logf("[v1] failed to disable DNSSEC: %v", call.Err) } - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDNSOverTLS", 0, m.ifidx, "no"); call.Err != nil { + if call := r1Manager.CallWithContext(ctx, dbusResolvedInterface+".SetLinkDNSOverTLS", 0, m.ifidx, "no"); call.Err != nil { m.logf("[v1] failed to disable DoT: %v", call.Err) } - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".FlushCaches", 0); call.Err != nil { + if call := r1Manager.CallWithContext(ctx, dbusResolvedInterface+".FlushCaches", 0); call.Err != nil { m.logf("failed to flush resolved DNS cache: %v", call.Err) } @@ -301,20 +307,7 @@ func (m *resolvedManager) GetBaseConfig() (OSConfig, error) { } func (m *resolvedManager) Close() error { - m.cancelSyncer() - - ctx, cancel := context.WithTimeout(context.Background(), reconfigTimeout) - defer cancel() - if call := m.resolved.CallWithContext(ctx, dbusResolvedInterface+".RevertLink", 0, m.ifidx); call.Err != nil { - return fmt.Errorf("RevertLink: %w", call.Err) - } - - select { - case <-m.syncerDone: - case <-ctx.Done(): - m.logf("timeout in systemd-resolved syncer shutdown") - } - + // No need to do anything on close which is handled in trySet return nil }