From 969927c47c3d4de05e90f5b26a6d8d931c5ceed4 Mon Sep 17 00:00:00 2001 From: Tom Proctor Date: Thu, 3 Jul 2025 13:48:48 +0100 Subject: [PATCH] cmd/{containerboot,k8s-operator}: reissue auth keys for broken proxies Adds logic for containerboot to signal that it can't auth, so the operator can issue a new auth key. This only applies when running with a config file and with a kube state store. If the operator sees reissue_authkey in a state Secret, it will create a new auth key iff the config has no auth key or its auth key matches the value of reissue_authkey from the state Secret. This ensures we don't reissue auth keys in a tight loop if the proxy is slow to start or failing for some other reason. The reissue logic also uses a burstable rate limiter to ensure there's no way a terminally misconfigured or buggy operator can automatically generate new auth keys in a tight loop. Updates #14080 Change-Id: I6982f8e741932a6891f2f48a2936f7f6a455317f Signed-off-by: Tom Proctor --- cmd/containerboot/kube.go | 57 ++++--- cmd/containerboot/kube_test.go | 76 ++++++++- cmd/containerboot/main.go | 46 +++++- cmd/containerboot/main_test.go | 144 +++++++++++++--- cmd/k8s-operator/operator.go | 2 + cmd/k8s-operator/proxygroup.go | 178 ++++++++++++++------ cmd/k8s-operator/proxygroup_test.go | 244 +++++++++++++++++++++++++--- cmd/k8s-operator/sts.go | 14 +- cmd/k8s-operator/testutils_test.go | 4 +- cmd/k8s-operator/tsrecorder_test.go | 2 +- kube/kubetypes/types.go | 16 +- 11 files changed, 636 insertions(+), 147 deletions(-) diff --git a/cmd/containerboot/kube.go b/cmd/containerboot/kube.go index d4a974e6f..fa600866f 100644 --- a/cmd/containerboot/kube.go +++ b/cmd/containerboot/kube.go @@ -26,9 +26,10 @@ import ( "tailscale.com/logtail/backoff" "tailscale.com/tailcfg" "tailscale.com/types/logger" - "tailscale.com/util/set" ) +const fieldManager = "tailscale-container" + // kubeClient is a wrapper around Tailscale's internal kube client that knows how to talk to the kube API server. We use // this rather than any of the upstream Kubernetes client libraries to avoid extra imports.
type kubeClient struct { @@ -63,7 +64,7 @@ func (kc *kubeClient) storeDeviceID(ctx context.Context, deviceID tailcfg.Stable kubetypes.KeyDeviceID: []byte(deviceID), }, } - return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, "tailscale-container") + return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager) } // storeDeviceEndpoints writes device's tailnet IPs and MagicDNS name to fields 'device_ips', 'device_fqdn' of client's @@ -84,7 +85,7 @@ func (kc *kubeClient) storeDeviceEndpoints(ctx context.Context, fqdn string, add kubetypes.KeyDeviceIPs: deviceIPs, }, } - return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, "tailscale-container") + return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager) } // storeHTTPSEndpoint writes an HTTPS endpoint exposed by this device via 'tailscale serve' to the client's state @@ -96,7 +97,7 @@ func (kc *kubeClient) storeHTTPSEndpoint(ctx context.Context, ep string) error { kubetypes.KeyHTTPSEndpoint: []byte(ep), }, } - return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, "tailscale-container") + return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager) } // deleteAuthKey deletes the 'authkey' field of the given kube @@ -122,38 +123,44 @@ func (kc *kubeClient) deleteAuthKey(ctx context.Context) error { // resetContainerbootState resets state from previous runs of containerboot to // ensure the operator doesn't use stale state when a Pod is first recreated. -func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string) error { - existingSecret, err := kc.GetSecret(ctx, kc.stateSecret) - if err != nil { - return fmt.Errorf("failed to read state Secret %q to reset state: %w", kc.stateSecret, err) - } - +func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string, tailscaledConfigAuthkey string) error { s := &kubeapi.Secret{ Data: map[string][]byte{ kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion), + + // TODO(tomhjp): Perhaps shouldn't clear device ID and use a different signal, as this could leak tailnet devices. + kubetypes.KeyDeviceID: nil, + kubetypes.KeyDeviceFQDN: nil, + kubetypes.KeyDeviceIPs: nil, + kubetypes.KeyHTTPSEndpoint: nil, + egressservices.KeyEgressServices: nil, + ingressservices.IngressConfigKey: nil, }, } if podUID != "" { s.Data[kubetypes.KeyPodUID] = []byte(podUID) } - toClear := set.SetOf([]string{ - kubetypes.KeyDeviceID, - kubetypes.KeyDeviceFQDN, - kubetypes.KeyDeviceIPs, - kubetypes.KeyHTTPSEndpoint, - egressservices.KeyEgressServices, - ingressservices.IngressConfigKey, - }) - for key := range existingSecret.Data { - if toClear.Contains(key) { - // It's fine to leave the key in place as a debugging breadcrumb, - // it should get a new value soon. - s.Data[key] = nil - } + // Only clear reissue_authkey if the operator has actioned it. 
+ existingSecret, err := kc.GetSecret(ctx, kc.stateSecret) + if err != nil { + return fmt.Errorf("failed to read state Secret %q to reset state: %w", kc.stateSecret, err) + } + brokenAuthkey, ok := existingSecret.Data[kubetypes.KeyReissueAuthkey] + if ok && tailscaledConfigAuthkey != "" && string(brokenAuthkey) != tailscaledConfigAuthkey { + s.Data[kubetypes.KeyReissueAuthkey] = nil } - return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, "tailscale-container") + return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager) +} + +func (kc *kubeClient) setReissueAuthKey(ctx context.Context, authKey string) error { + s := &kubeapi.Secret{ + Data: map[string][]byte{ + kubetypes.KeyReissueAuthkey: []byte(authKey), // Empty string means no auth key. + }, + } + return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, fieldManager) } // waitForConsistentState waits for tailscaled to finish writing state if it diff --git a/cmd/containerboot/kube_test.go b/cmd/containerboot/kube_test.go index c33714ed1..fe4231704 100644 --- a/cmd/containerboot/kube_test.go +++ b/cmd/containerboot/kube_test.go @@ -248,25 +248,42 @@ func TestResetContainerbootState(t *testing.T) { capver := fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion) for name, tc := range map[string]struct { podUID string + authkey string initial map[string][]byte expected map[string][]byte }{ "empty_initial": { podUID: "1234", + authkey: "new-authkey", initial: map[string][]byte{}, expected: map[string][]byte{ kubetypes.KeyCapVer: capver, kubetypes.KeyPodUID: []byte("1234"), + // Cleared keys. + kubetypes.KeyDeviceID: nil, + kubetypes.KeyDeviceFQDN: nil, + kubetypes.KeyDeviceIPs: nil, + kubetypes.KeyHTTPSEndpoint: nil, + egressservices.KeyEgressServices: nil, + ingressservices.IngressConfigKey: nil, }, }, "empty_initial_no_pod_uid": { initial: map[string][]byte{}, expected: map[string][]byte{ kubetypes.KeyCapVer: capver, + // Cleared keys. + kubetypes.KeyDeviceID: nil, + kubetypes.KeyDeviceFQDN: nil, + kubetypes.KeyDeviceIPs: nil, + kubetypes.KeyHTTPSEndpoint: nil, + egressservices.KeyEgressServices: nil, + ingressservices.IngressConfigKey: nil, }, }, "only_relevant_keys_updated": { - podUID: "1234", + podUID: "1234", + authkey: "new-authkey", initial: map[string][]byte{ kubetypes.KeyCapVer: []byte("1"), kubetypes.KeyPodUID: []byte("5678"), @@ -295,6 +312,57 @@ func TestResetContainerbootState(t *testing.T) { // Tailscaled keys not included in patch. }, }, + "new_authkey_issued": { + initial: map[string][]byte{ + kubetypes.KeyReissueAuthkey: []byte("old-authkey"), + }, + authkey: "new-authkey", + expected: map[string][]byte{ + kubetypes.KeyCapVer: capver, + kubetypes.KeyReissueAuthkey: nil, + // Cleared keys. + kubetypes.KeyDeviceID: nil, + kubetypes.KeyDeviceFQDN: nil, + kubetypes.KeyDeviceIPs: nil, + kubetypes.KeyHTTPSEndpoint: nil, + egressservices.KeyEgressServices: nil, + ingressservices.IngressConfigKey: nil, + }, + }, + "authkey_not_yet_updated": { + initial: map[string][]byte{ + kubetypes.KeyReissueAuthkey: []byte("old-authkey"), + }, + authkey: "old-authkey", + expected: map[string][]byte{ + kubetypes.KeyCapVer: capver, + // reissue_authkey not cleared. + // Cleared keys. 
+ kubetypes.KeyDeviceID: nil, + kubetypes.KeyDeviceFQDN: nil, + kubetypes.KeyDeviceIPs: nil, + kubetypes.KeyHTTPSEndpoint: nil, + egressservices.KeyEgressServices: nil, + ingressservices.IngressConfigKey: nil, + }, + }, + "authkey_deleted_from_config": { + initial: map[string][]byte{ + kubetypes.KeyReissueAuthkey: []byte("old-authkey"), + }, + authkey: "", + expected: map[string][]byte{ + kubetypes.KeyCapVer: capver, + // reissue_authkey not cleared. + // Cleared keys. + kubetypes.KeyDeviceID: nil, + kubetypes.KeyDeviceFQDN: nil, + kubetypes.KeyDeviceIPs: nil, + kubetypes.KeyHTTPSEndpoint: nil, + egressservices.KeyEgressServices: nil, + ingressservices.IngressConfigKey: nil, + }, + }, } { t.Run(name, func(t *testing.T) { var actual map[string][]byte @@ -309,11 +377,11 @@ func TestResetContainerbootState(t *testing.T) { return nil }, }} - if err := kc.resetContainerbootState(context.Background(), tc.podUID); err != nil { + if err := kc.resetContainerbootState(context.Background(), tc.podUID, tc.authkey); err != nil { t.Fatalf("resetContainerbootState() error = %v", err) } - if diff := cmp.Diff(tc.expected, actual); diff != "" { - t.Errorf("resetContainerbootState() mismatch (-want +got):\n%s", diff) + if diff := cmp.Diff(actual, tc.expected); diff != "" { + t.Errorf("Merge patch mismatch (-got +want):\n%s", diff) } }) } diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go index 52b30b837..6bdb9a324 100644 --- a/cmd/containerboot/main.go +++ b/cmd/containerboot/main.go @@ -119,7 +119,9 @@ import ( "golang.org/x/sys/unix" "tailscale.com/client/tailscale" + "tailscale.com/health" "tailscale.com/ipn" + "tailscale.com/ipn/conffile" kubeutils "tailscale.com/k8s-operator" "tailscale.com/kube/kubetypes" "tailscale.com/tailcfg" @@ -179,6 +181,11 @@ func run() error { bootCtx, cancel := context.WithTimeout(ctx, 60*time.Second) defer cancel() + var tailscaledConfigAuthkey string + if isOneStepConfig(cfg) { + tailscaledConfigAuthkey = authkeyFromTailscaledConfig(cfg.TailscaledConfigFilePath) + } + var kc *kubeClient if cfg.InKubernetes { kc, err = newKubeClient(cfg.Root, cfg.KubeSecret) @@ -192,7 +199,7 @@ func run() error { // hasKubeStateStore because although we know we're in kube, that // doesn't guarantee the state store is properly configured. if hasKubeStateStore(cfg) { - if err := kc.resetContainerbootState(bootCtx, cfg.PodUID); err != nil { + if err := kc.resetContainerbootState(bootCtx, cfg.PodUID, tailscaledConfigAuthkey); err != nil { return fmt.Errorf("error clearing previous state from Secret: %w", err) } } @@ -311,7 +318,7 @@ func run() error { if err := tailscaleUp(bootCtx, cfg); err != nil { return fmt.Errorf("failed to auth tailscale: %w", err) } - w, err = client.WatchIPNBus(bootCtx, ipn.NotifyInitialNetMap|ipn.NotifyInitialState) + w, err = client.WatchIPNBus(bootCtx, ipn.NotifyInitialNetMap|ipn.NotifyInitialState|ipn.NotifyInitialHealthState) if err != nil { return fmt.Errorf("rewatching tailscaled for updates after auth: %w", err) } @@ -337,7 +344,15 @@ authLoop: if isOneStepConfig(cfg) { // This could happen if this is the first time tailscaled was run for this // device and the auth key was not passed via the configfile. 
- return fmt.Errorf("invalid state: tailscaled daemon started with a config file, but tailscale is not logged in: ensure you pass a valid auth key in the config file.") + if hasKubeStateStore(cfg) { + setErr := kc.setReissueAuthKey(bootCtx, tailscaledConfigAuthkey) + if setErr != nil { + return fmt.Errorf("failed to set reissue_authkey in kube Secret after NeedsLogin state change: %w", setErr) + } + + return fmt.Errorf("invalid state: tailscaled daemon started with a config file, but tailscale is not logged in; auth key reissue from operator requested") + } + return fmt.Errorf("invalid state: tailscaled daemon started with a config file, but tailscale is not logged in: ensure you pass a valid auth key in the config file") } if err := authTailscale(); err != nil { return fmt.Errorf("failed to auth tailscale: %w", err) @@ -356,6 +371,21 @@ authLoop: log.Printf("tailscaled in state %q, waiting", *n.State) } } + + if n.Health != nil { + // This can happen if the config has an auth key but its invalid, + // for example if it was single-use and already got used, but the + // device state was lost. + if _, ok := n.Health.Warnings[health.LoginStateWarnable.Code]; ok { + if isOneStepConfig(cfg) && hasKubeStateStore(cfg) { + err := kc.setReissueAuthKey(bootCtx, tailscaledConfigAuthkey) + if err != nil { + return fmt.Errorf("failed to set reissue_authkey in kube Secret after login state warning: %w", err) + } + return fmt.Errorf("tailscaled failed to log in with the auth key from its config file; auth key reissue from operator requested") + } + } + } } w.Close() @@ -889,3 +919,13 @@ func runHTTPServer(mux *http.ServeMux, addr string) (close func() error) { return errors.Join(err, ln.Close()) } } + +func authkeyFromTailscaledConfig(path string) string { + if cfg, err := conffile.Load(path); err != nil { + return "" + } else if cfg.Parsed.AuthKey != nil { + return *cfg.Parsed.AuthKey + } + + return "" +} diff --git a/cmd/containerboot/main_test.go b/cmd/containerboot/main_test.go index 96feef682..0c4fa882e 100644 --- a/cmd/containerboot/main_test.go +++ b/cmd/containerboot/main_test.go @@ -31,6 +31,7 @@ import ( "github.com/google/go-cmp/cmp" "golang.org/x/sys/unix" + "tailscale.com/health" "tailscale.com/ipn" "tailscale.com/kube/egressservices" "tailscale.com/kube/kubeclient" @@ -41,6 +42,8 @@ import ( "tailscale.com/types/ptr" ) +const configFileAuthKey = "some-auth-key" + func TestContainerBoot(t *testing.T) { boot := filepath.Join(t.TempDir(), "containerboot") if err := exec.Command("go", "build", "-ldflags", "-X main.testSleepDuration=1ms", "-o", boot, "tailscale.com/cmd/containerboot").Run(); err != nil { @@ -781,6 +784,101 @@ func TestContainerBoot(t *testing.T) { }, } }, + "sets_reissue_authkey_if_needs_login": func(env *testEnv) testCase { + return testCase{ + Env: map[string]string{ + "TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR": filepath.Join(env.d, "etc/tailscaled/"), + "KUBERNETES_SERVICE_HOST": env.kube.Host, + "KUBERNETES_SERVICE_PORT_HTTPS": env.kube.Port, + }, + Phases: []phase{ + { + WantCmds: []string{ + "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=kube:tailscale --statedir=/tmp --tun=userspace-networking --config=/etc/tailscaled/cap-95.hujson", + }, + WantKubeSecret: map[string]string{ + kubetypes.KeyCapVer: capver, + }, + }, { + Notify: &ipn.Notify{ + State: ptr.To(ipn.NeedsLogin), + }, + WantKubeSecret: map[string]string{ + kubetypes.KeyCapVer: capver, + kubetypes.KeyReissueAuthkey: configFileAuthKey, + }, + WantExitCode: ptr.To(1), + WantLog: "invalid state: tailscaled 
daemon started with a config file, but tailscale is not logged in; auth key reissue from operator requested", + }, + }, + } + }, + "sets_reissue_authkey_if_auth_fails": func(env *testEnv) testCase { + return testCase{ + Env: map[string]string{ + "TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR": filepath.Join(env.d, "etc/tailscaled/"), + "KUBERNETES_SERVICE_HOST": env.kube.Host, + "KUBERNETES_SERVICE_PORT_HTTPS": env.kube.Port, + }, + Phases: []phase{ + { + WantCmds: []string{ + "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=kube:tailscale --statedir=/tmp --tun=userspace-networking --config=/etc/tailscaled/cap-95.hujson", + }, + WantKubeSecret: map[string]string{ + kubetypes.KeyCapVer: capver, + }, + }, { + Notify: &ipn.Notify{ + Health: &health.State{ + Warnings: map[health.WarnableCode]health.UnhealthyState{ + health.LoginStateWarnable.Code: {}, + }, + }, + }, + WantKubeSecret: map[string]string{ + kubetypes.KeyCapVer: capver, + kubetypes.KeyReissueAuthkey: configFileAuthKey, + }, + WantExitCode: ptr.To(1), + WantLog: "tailscaled failed to log in with the auth key from its config file; auth key reissue from operator requested", + }, + }, + } + }, + "clears_reissue_authkey_on_change": func(env *testEnv) testCase { + return testCase{ + Env: map[string]string{ + "TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR": filepath.Join(env.d, "etc/tailscaled/"), + "KUBERNETES_SERVICE_HOST": env.kube.Host, + "KUBERNETES_SERVICE_PORT_HTTPS": env.kube.Port, + }, + KubeSecret: map[string]string{ + kubetypes.KeyReissueAuthkey: "some-older-authkey", + "foo": "bar", // Check not everything is cleared. + }, + Phases: []phase{ + { + WantCmds: []string{ + "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=kube:tailscale --statedir=/tmp --tun=userspace-networking --config=/etc/tailscaled/cap-95.hujson", + }, + WantKubeSecret: map[string]string{ + kubetypes.KeyCapVer: capver, + "foo": "bar", + }, + }, { + Notify: runningNotify, + WantKubeSecret: map[string]string{ + kubetypes.KeyCapVer: capver, + "foo": "bar", + kubetypes.KeyDeviceFQDN: "test-node.test.ts.net", + kubetypes.KeyDeviceID: "myID", + kubetypes.KeyDeviceIPs: `["100.64.0.1"]`, + }, + }, + }, + } + }, "metrics_enabled": func(env *testEnv) testCase { return testCase{ Env: map[string]string{ @@ -1113,21 +1211,6 @@ func TestContainerBoot(t *testing.T) { } } - if p.WantExitCode != nil { - state, err := cmd.Process.Wait() - if err != nil { - t.Fatal(err) - } - if state.ExitCode() != *p.WantExitCode { - t.Fatalf("phase %d: want exit code %d, got %d", i, *p.WantExitCode, state.ExitCode()) - } - - // Early test return, we don't expect the successful startup log message. - return - } - - wantCmds = append(wantCmds, p.WantCmds...) - waitArgs(t, 2*time.Second, env.d, env.argFile, strings.Join(wantCmds, "\n")) err := tstest.WaitFor(2*time.Second, func() error { if p.WantKubeSecret != nil { got := env.kube.Secret() @@ -1145,6 +1228,23 @@ func TestContainerBoot(t *testing.T) { if err != nil { t.Fatalf("phase %d: %v", i, err) } + + if p.WantExitCode != nil { + state, err := cmd.Process.Wait() + if err != nil { + t.Fatal(err) + } + if state.ExitCode() != *p.WantExitCode { + t.Fatalf("phase %d: want exit code %d, got %d", i, *p.WantExitCode, state.ExitCode()) + } + + // Early test return, we don't expect the successful startup log message. + return + } + + wantCmds = append(wantCmds, p.WantCmds...) 
+ waitArgs(t, 2*time.Second, env.d, env.argFile, strings.Join(wantCmds, "\n")) + err = tstest.WaitFor(2*time.Second, func() error { for path, want := range p.WantFiles { gotBs, err := os.ReadFile(filepath.Join(env.d, path)) @@ -1538,7 +1638,11 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) { panic(fmt.Sprintf("json decode failed: %v. Body:\n\n%s", err, string(bs))) } for key, val := range req.Data { - k.secret[key] = string(val) + if val == nil { + delete(k.secret, key) + } else { + k.secret[key] = string(val) + } } default: panic(fmt.Sprintf("unknown content type %q", r.Header.Get("Content-Type"))) @@ -1548,12 +1652,6 @@ func (k *kubeServer) serveSecret(w http.ResponseWriter, r *http.Request) { } } -func mustBase64(t *testing.T, v any) string { - b := mustJSON(t, v) - s := base64.StdEncoding.WithPadding('=').EncodeToString(b) - return s -} - func mustJSON(t *testing.T, v any) []byte { b, err := json.Marshal(v) if err != nil { @@ -1611,7 +1709,7 @@ func newTestEnv(t *testing.T) testEnv { kube.Start(t) t.Cleanup(kube.Close) - tailscaledConf := &ipn.ConfigVAlpha{AuthKey: ptr.To("foo"), Version: "alpha0"} + tailscaledConf := &ipn.ConfigVAlpha{AuthKey: ptr.To(configFileAuthKey), Version: "alpha0"} serveConf := ipn.ServeConfig{TCP: map[uint16]*ipn.TCPPortHandler{80: {HTTP: true}}} egressCfg := egressSvcConfig("foo", "foo.tailnetxyz.ts.net") diff --git a/cmd/k8s-operator/operator.go b/cmd/k8s-operator/operator.go index e5f7d932c..4e52e151c 100644 --- a/cmd/k8s-operator/operator.go +++ b/cmd/k8s-operator/operator.go @@ -20,6 +20,7 @@ import ( "github.com/go-logr/zapr" "go.uber.org/zap" "go.uber.org/zap/zapcore" + "golang.org/x/time/rate" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" discoveryv1 "k8s.io/api/discovery/v1" @@ -644,6 +645,7 @@ func runReconcilers(opts reconcilerOpts) { tsFirewallMode: opts.proxyFirewallMode, defaultProxyClass: opts.defaultProxyClass, loginServer: opts.tsServer.ControlURL, + authKeyRateLimits: make(map[string]*rate.Limiter), }) if err != nil { startlog.Fatalf("could not create ProxyGroup reconciler: %v", err) diff --git a/cmd/k8s-operator/proxygroup.go b/cmd/k8s-operator/proxygroup.go index 1b622c920..e917ad267 100644 --- a/cmd/k8s-operator/proxygroup.go +++ b/cmd/k8s-operator/proxygroup.go @@ -15,9 +15,11 @@ import ( "slices" "strings" "sync" + "time" "go.uber.org/zap" xslices "golang.org/x/exp/slices" + "golang.org/x/time/rate" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -87,9 +89,10 @@ type ProxyGroupReconciler struct { defaultProxyClass string loginServer string - mu sync.Mutex // protects following - egressProxyGroups set.Slice[types.UID] // for egress proxygroups gauge - ingressProxyGroups set.Slice[types.UID] // for ingress proxygroups gauge + mu sync.Mutex // protects following + authKeyRateLimits map[string]*rate.Limiter // per-ProxyGroup rate limiters for auth key re-issuance. 
+ egressProxyGroups set.Slice[types.UID] // for egress proxygroups gauge + ingressProxyGroups set.Slice[types.UID] // for ingress proxygroups gauge } func (r *ProxyGroupReconciler) logger(name string) *zap.SugaredLogger { @@ -274,7 +277,7 @@ func validateProxyClassForPG(logger *zap.SugaredLogger, pg *tsapi.ProxyGroup, pc func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass) (isProvisioned bool, err error) { logger := r.logger(pg.Name) r.mu.Lock() - r.ensureAddedToGaugeForProxyGroup(pg) + r.ensureStateAddedForProxyGroup(pg) r.mu.Unlock() svcToNodePorts := make(map[string]uint16) @@ -541,13 +544,13 @@ func (r *ProxyGroupReconciler) cleanupDanglingResources(ctx context.Context, pg } for _, m := range metadata { - if m.ordinal+1 <= int(pgReplicas(pg)) { + if m.ordinal+1 <= pgReplicas(pg) { continue } // Dangling resource, delete the config + state Secrets, as well as // deleting the device from the tailnet. - if err := r.deleteTailnetDevice(ctx, m.tsID, logger); err != nil { + if err := r.ensureDeviceDeleted(ctx, m.tsID, logger); err != nil { return err } if err := r.Delete(ctx, m.stateSecret); err != nil && !apierrors.IsNotFound(err) { @@ -599,7 +602,7 @@ func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, pg *tsapi.Proxy } for _, m := range metadata { - if err := r.deleteTailnetDevice(ctx, m.tsID, logger); err != nil { + if err := r.ensureDeviceDeleted(ctx, m.tsID, logger); err != nil { return false, err } } @@ -615,12 +618,13 @@ func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, pg *tsapi.Proxy logger.Infof("cleaned up ProxyGroup resources") r.mu.Lock() - r.ensureRemovedFromGaugeForProxyGroup(pg) + r.ensureStateRemovedForProxyGroup(pg) r.mu.Unlock() return true, nil } -func (r *ProxyGroupReconciler) deleteTailnetDevice(ctx context.Context, id tailcfg.StableNodeID, logger *zap.SugaredLogger) error { +// ensureDeviceDeleted ensures the device is deleted. Returns nil if not found. +func (r *ProxyGroupReconciler) ensureDeviceDeleted(ctx context.Context, id tailcfg.StableNodeID, logger *zap.SugaredLogger) error { logger.Debugf("deleting device %s from control", string(id)) if err := r.tsClient.DeleteDevice(ctx, string(id)); err != nil { errResp := &tailscale.ErrResponse{} @@ -640,6 +644,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p logger := r.logger(pg.Name) endpoints = make(map[string][]netip.AddrPort, pgReplicas(pg)) for i := range pgReplicas(pg) { + logger = logger.With("Pod", fmt.Sprintf("%s-%d", pg.Name, i)) cfgSecret := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Name: pgConfigSecretName(pg.Name, i), @@ -657,38 +662,9 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p return nil, err } - var authKey *string - if existingCfgSecret == nil { - logger.Debugf("Creating authkey for new ProxyGroup proxy") - tags := pg.Spec.Tags.Stringify() - if len(tags) == 0 { - tags = r.defaultTags - } - key, err := newAuthKey(ctx, r.tsClient, tags) - if err != nil { - return nil, err - } - authKey = &key - } - - if authKey == nil { - // Get state Secret to check if it's already authed. 
- stateSecret := &corev1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: pgStateSecretName(pg.Name, i), - Namespace: r.tsNamespace, - }, - } - if err := r.Get(ctx, client.ObjectKeyFromObject(stateSecret), stateSecret); err != nil && !apierrors.IsNotFound(err) { - return nil, err - } - - if shouldRetainAuthKey(stateSecret) && existingCfgSecret != nil { - authKey, err = authKeyFromSecret(existingCfgSecret) - if err != nil { - return nil, fmt.Errorf("error retrieving auth key from existing config Secret: %w", err) - } - } + authKey, err := r.getAuthKey(ctx, pg, existingCfgSecret, i, logger) + if err != nil { + return nil, fmt.Errorf("failed to get auth key: %w", err) } replicaName := pgNodePortServiceName(pg.Name, i) @@ -742,6 +718,104 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p return endpoints, nil } +// getAuthKey looks at the proxy's state and config Secrets, and may return: +// * a newly created auth key, +// * an existing auth key from the config Secret, +// * or nil if the device is authed. +// +// It will create a new auth key if the config Secret is not yet created, +// or if the proxy has set reissue_authkey in its state Secret. +func (r *ProxyGroupReconciler) getAuthKey(ctx context.Context, pg *tsapi.ProxyGroup, existingCfgSecret *corev1.Secret, ordinal int32, logger *zap.SugaredLogger) (*string, error) { + // Get state Secret to check if it's already authed or has requested + // a fresh auth key. + stateSecret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: pgStateSecretName(pg.Name, ordinal), + Namespace: r.tsNamespace, + }, + } + if err := r.Get(ctx, client.ObjectKeyFromObject(stateSecret), stateSecret); err != nil && !apierrors.IsNotFound(err) { + return nil, err + } + + var createAuthKey bool + var cfgAuthKey *string + if existingCfgSecret == nil { + createAuthKey = true + } else { + var err error + cfgAuthKey, err = authKeyFromSecret(existingCfgSecret) + if err != nil { + return nil, fmt.Errorf("error retrieving auth key from existing config Secret: %w", err) + } + } + + if shouldReissueAuthKey(stateSecret, cfgAuthKey) { + logger.Infof("Proxy is failing to auth; will attempt to clean up the old device if any found and issue a new auth key") + + // If the device is already authed, we want to delete it from the tailnet. + if tsID, ok := stateSecret.Data[kubetypes.KeyDeviceID]; ok && len(tsID) > 0 { + if err := r.ensureDeviceDeleted(ctx, tailcfg.StableNodeID(tsID), logger); err != nil { + return nil, err + } + } + + if lim := r.authKeyRateLimits[pg.Name]; lim.Allow() { + createAuthKey = true + } else { + logger.Debugf("auth key re-issuance rate limit exceeded, limit: %.2f, burst: %d, tokens: %.2f", lim.Limit(), lim.Burst(), lim.Tokens()) + return nil, fmt.Errorf("auth key re-issuance rate limit exceeded for ProxyGroup %q, will retry with backoff", pg.Name) + } + } + + var authKey *string + if createAuthKey { + logger.Debugf("Creating auth key for ProxyGroup proxy") + + tags := pg.Spec.Tags.Stringify() + if len(tags) == 0 { + tags = r.defaultTags + } + key, err := newAuthKey(ctx, r.tsClient, tags) + if err != nil { + return nil, err + } + authKey = &key + } else if !deviceAuthed(stateSecret) { + // Retain auth key from existing config. + authKey = cfgAuthKey + } + + return authKey, nil +} + +// shouldReissueAuthKey extracts the value of reissue_authkey from the proxy's +// state Secret, and returns true if a new auth key is needed. 
The proxy will + set the value of reissue_authkey to the auth key with which it failed to + auth, or empty if it didn't have an auth key in its config file. +func shouldReissueAuthKey(s *corev1.Secret, authKeyInConfig *string) bool { + // If the key exists but the value is empty, that means a previous reissue + // request got cleared. + brokenAuthkey, reissueRequested := s.Data[kubetypes.KeyReissueAuthkey] + if !reissueRequested { + return false + } + + // Reissue requested and no auth key in config, definitely reissue. + if authKeyInConfig == nil || *authKeyInConfig == "" { + return true + } + + // The auth key we were going to use is already reported broken, reissue. + if *authKeyInConfig == string(brokenAuthkey) { + return true + } + + // Make sure we don't reissue again if we happened to reconcile again before + // the proxy got a chance to auth with a reissued auth key. + return false +} + type FindStaticEndpointErr struct { msg string } @@ -835,9 +909,9 @@ func getStaticEndpointAddress(a *corev1.NodeAddress, port uint16) *netip.AddrPor return ptr.To(netip.AddrPortFrom(addr, port)) } -// ensureAddedToGaugeForProxyGroup ensures the gauge metric for the ProxyGroup resource is updated when the ProxyGroup -// is created. r.mu must be held. -func (r *ProxyGroupReconciler) ensureAddedToGaugeForProxyGroup(pg *tsapi.ProxyGroup) { +// ensureStateAddedForProxyGroup ensures the gauge metric for the ProxyGroup resource is updated when the ProxyGroup +// is created, and initialises per-ProxyGroup rate limits on re-issuing auth keys. r.mu must be held. +func (r *ProxyGroupReconciler) ensureStateAddedForProxyGroup(pg *tsapi.ProxyGroup) { switch pg.Spec.Type { case tsapi.ProxyGroupTypeEgress: r.egressProxyGroups.Add(pg.UID) @@ -846,11 +920,16 @@ func (r *ProxyGroupReconciler) ensureAddedToGaugeForProxyGr } gaugeEgressProxyGroupResources.Set(int64(r.egressProxyGroups.Len())) gaugeIngressProxyGroupResources.Set(int64(r.ingressProxyGroups.Len())) + if _, ok := r.authKeyRateLimits[pg.Name]; !ok { + // Allow every replica to have its auth key re-issued quickly the first + // time, but with an overall limit of 1 every 30s after a burst. + r.authKeyRateLimits[pg.Name] = rate.NewLimiter(rate.Every(30*time.Second), int(pgReplicas(pg))) + } } -// ensureRemovedFromGaugeForProxyGroup ensures the gauge metric for the ProxyGroup resource type is updated when the -// ProxyGroup is deleted. r.mu must be held. -func (r *ProxyGroupReconciler) ensureRemovedFromGaugeForProxyGroup(pg *tsapi.ProxyGroup) { +// ensureStateRemovedForProxyGroup ensures the gauge metric for the ProxyGroup resource type is updated when the +// ProxyGroup is deleted, and deletes the per-ProxyGroup rate limiter to free memory. r.mu must be held. 
+func (r *ProxyGroupReconciler) ensureStateRemovedForProxyGroup(pg *tsapi.ProxyGroup) { switch pg.Spec.Type { case tsapi.ProxyGroupTypeEgress: r.egressProxyGroups.Remove(pg.UID) @@ -859,6 +938,7 @@ func (r *ProxyGroupReconciler) ensureRemovedFromGaugeForProxyGroup(pg *tsapi.Pro } gaugeEgressProxyGroupResources.Set(int64(r.egressProxyGroups.Len())) gaugeIngressProxyGroupResources.Set(int64(r.ingressProxyGroups.Len())) + delete(r.authKeyRateLimits, pg.Name) } func pgTailscaledConfig(pg *tsapi.ProxyGroup, pc *tsapi.ProxyClass, idx int32, authKey *string, staticEndpoints []netip.AddrPort, oldAdvertiseServices []string, loginServer string) (tailscaledConfigs, error) { @@ -925,7 +1005,7 @@ func (r *ProxyGroupReconciler) getNodeMetadata(ctx context.Context, pg *tsapi.Pr return nil, fmt.Errorf("failed to list state Secrets: %w", err) } for _, secret := range secrets.Items { - var ordinal int + var ordinal int32 if _, err := fmt.Sscanf(secret.Name, pg.Name+"-%d", &ordinal); err != nil { return nil, fmt.Errorf("unexpected secret %s was labelled as owned by the ProxyGroup %s: %w", secret.Name, pg.Name, err) } @@ -997,7 +1077,7 @@ func (r *ProxyGroupReconciler) getDeviceInfo(ctx context.Context, staticEndpoint } type nodeMetadata struct { - ordinal int + ordinal int32 stateSecret *corev1.Secret // podUID is the UID of the current Pod or empty if the Pod does not exist. podUID string diff --git a/cmd/k8s-operator/proxygroup_test.go b/cmd/k8s-operator/proxygroup_test.go index 87b04a434..5378a23e5 100644 --- a/cmd/k8s-operator/proxygroup_test.go +++ b/cmd/k8s-operator/proxygroup_test.go @@ -10,12 +10,15 @@ import ( "encoding/json" "fmt" "net/netip" + "reflect" "slices" + "strings" "testing" "time" "github.com/google/go-cmp/cmp" "go.uber.org/zap" + "golang.org/x/time/rate" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -634,10 +637,11 @@ func TestProxyGroupWithStaticEndpoints(t *testing.T) { tsFirewallMode: "auto", defaultProxyClass: "default-pc", - Client: fc, - tsClient: tsClient, - recorder: fr, - clock: cl, + Client: fc, + tsClient: tsClient, + recorder: fr, + clock: cl, + authKeyRateLimits: make(map[string]*rate.Limiter), } for i, r := range tt.reconciles { @@ -777,11 +781,12 @@ func TestProxyGroupWithStaticEndpoints(t *testing.T) { tsFirewallMode: "auto", defaultProxyClass: "default-pc", - Client: fc, - tsClient: tsClient, - recorder: fr, - l: zl.Sugar().With("TestName", tt.name).With("Reconcile", "cleanup"), - clock: cl, + Client: fc, + tsClient: tsClient, + recorder: fr, + l: zl.Sugar().With("TestName", tt.name).With("Reconcile", "cleanup"), + clock: cl, + authKeyRateLimits: make(map[string]*rate.Limiter), } if err := fc.Delete(context.Background(), pg); err != nil { @@ -837,11 +842,12 @@ func TestProxyGroup(t *testing.T) { tsFirewallMode: "auto", defaultProxyClass: "default-pc", - Client: fc, - tsClient: tsClient, - recorder: fr, - l: zl.Sugar(), - clock: cl, + Client: fc, + tsClient: tsClient, + recorder: fr, + l: zl.Sugar(), + clock: cl, + authKeyRateLimits: make(map[string]*rate.Limiter), } crd := &apiextensionsv1.CustomResourceDefinition{ObjectMeta: metav1.ObjectMeta{Name: serviceMonitorCRD}} opts := configOpts{ @@ -1024,12 +1030,13 @@ func TestProxyGroupTypes(t *testing.T) { zl, _ := zap.NewDevelopment() reconciler := &ProxyGroupReconciler{ - tsNamespace: tsNamespace, - proxyImage: testProxyImage, - Client: fc, - l: zl.Sugar(), - tsClient: &fakeTSClient{}, - clock: tstest.NewClock(tstest.ClockOpts{}), + tsNamespace: tsNamespace, + proxyImage: 
testProxyImage, + Client: fc, + l: zl.Sugar(), + tsClient: &fakeTSClient{}, + clock: tstest.NewClock(tstest.ClockOpts{}), + authKeyRateLimits: make(map[string]*rate.Limiter), } t.Run("egress_type", func(t *testing.T) { @@ -1205,12 +1212,13 @@ func TestIngressAdvertiseServicesConfigPreserved(t *testing.T) { WithStatusSubresource(&tsapi.ProxyGroup{}). Build() reconciler := &ProxyGroupReconciler{ - tsNamespace: tsNamespace, - proxyImage: testProxyImage, - Client: fc, - l: zap.Must(zap.NewDevelopment()).Sugar(), - tsClient: &fakeTSClient{}, - clock: tstest.NewClock(tstest.ClockOpts{}), + tsNamespace: tsNamespace, + proxyImage: testProxyImage, + Client: fc, + l: zap.Must(zap.NewDevelopment()).Sugar(), + tsClient: &fakeTSClient{}, + clock: tstest.NewClock(tstest.ClockOpts{}), + authKeyRateLimits: make(map[string]*rate.Limiter), } existingServices := []string{"svc1", "svc2"} @@ -1271,6 +1279,189 @@ func TestIngressAdvertiseServicesConfigPreserved(t *testing.T) { }) } +func TestProxyGroupGetAuthKey(t *testing.T) { + pg := &tsapi.ProxyGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Finalizers: []string{"tailscale.com/finalizer"}, + }, + Spec: tsapi.ProxyGroupSpec{ + Type: tsapi.ProxyGroupTypeEgress, + Replicas: ptr.To[int32](1), + }, + } + tsClient := &fakeTSClient{} + + // Variables to reference in test cases. + existingAuthKey := ptr.To("existing-auth-key") + newAuthKey := ptr.To("new-authkey") + configWith := func(authKey *string) map[string][]byte { + value := []byte("{}") + if authKey != nil { + value = fmt.Appendf(nil, `{"AuthKey": "%s"}`, *authKey) + } + return map[string][]byte{ + tsoperator.TailscaledConfigFileName(pgMinCapabilityVersion): value, + } + } + + initTest := func() (*ProxyGroupReconciler, client.WithWatch) { + fc := fake.NewClientBuilder(). + WithScheme(tsapi.GlobalScheme). + WithObjects(pg). + WithStatusSubresource(pg). + Build() + zl, _ := zap.NewDevelopment() + fr := record.NewFakeRecorder(1) + cl := tstest.NewClock(tstest.ClockOpts{}) + reconciler := &ProxyGroupReconciler{ + tsNamespace: tsNamespace, + proxyImage: testProxyImage, + defaultTags: []string{"tag:test-tag"}, + tsFirewallMode: "auto", + + Client: fc, + tsClient: tsClient, + recorder: fr, + l: zl.Sugar(), + clock: cl, + authKeyRateLimits: make(map[string]*rate.Limiter), + } + reconciler.ensureStateAddedForProxyGroup(pg) + + return reconciler, fc + } + + // Config Secret: exists or not, has key or not. + // State Secret: has device ID or not, requested reissue or not. + for name, tc := range map[string]struct { + configData map[string][]byte + stateData map[string][]byte + expectedAuthKey *string + expectReissue bool + }{ + "no_secrets_needs_new": { + expectedAuthKey: newAuthKey, // New ProxyGroup or manually cleared Pod. + }, + "no_config_secret_state_authed_ok": { + stateData: map[string][]byte{ + kubetypes.KeyDeviceID: []byte("nodeid-0"), + }, + expectedAuthKey: newAuthKey, // Always create an auth key if we're creating the config Secret. + }, + "config_secret_without_key_state_authed_with_reissue_needs_new": { + configData: configWith(nil), + stateData: map[string][]byte{ + kubetypes.KeyDeviceID: []byte("nodeid-0"), + kubetypes.KeyReissueAuthkey: []byte(""), + }, + expectedAuthKey: newAuthKey, + expectReissue: true, // Device is authed but reissue was requested. 
+ }, + "config_secret_with_key_state_with_reissue_stale_ok": { + configData: configWith(existingAuthKey), + stateData: map[string][]byte{ + kubetypes.KeyReissueAuthkey: []byte("some-older-authkey"), + }, + expectedAuthKey: existingAuthKey, // Config's auth key is different from the one marked for reissue. + }, + "config_secret_with_key_state_with_reissue_existing_key_needs_new": { + configData: configWith(existingAuthKey), + stateData: map[string][]byte{ + kubetypes.KeyDeviceID: []byte("nodeid-0"), + kubetypes.KeyReissueAuthkey: []byte(*existingAuthKey), + }, + expectedAuthKey: newAuthKey, + expectReissue: true, // Current config's auth key is marked for reissue. + }, + "config_secret_without_key_no_state_ok": { + configData: configWith(nil), + expectedAuthKey: nil, // Proxy will set reissue_authkey and then next reconcile will reissue. + }, + "config_secret_without_key_state_authed_ok": { + configData: configWith(nil), + stateData: map[string][]byte{ + kubetypes.KeyDeviceID: []byte("nodeid-0"), + }, + expectedAuthKey: nil, // Device is already authed. + }, + "config_secret_with_key_state_authed_ok": { + configData: configWith(existingAuthKey), + stateData: map[string][]byte{ + kubetypes.KeyDeviceID: []byte("nodeid-0"), + }, + expectedAuthKey: nil, // Auth key getting removed because device is authed. + }, + "config_secret_with_key_no_state_keeps_existing": { + configData: configWith(existingAuthKey), + expectedAuthKey: existingAuthKey, // No state, waiting for containerboot to try the auth key. + }, + } { + t.Run(name, func(t *testing.T) { + tsClient.deleted = tsClient.deleted[:0] // Reset deleted devices for each test case. + reconciler, fc := initTest() + var cfgSecret *corev1.Secret + if tc.configData != nil { + cfgSecret = &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: pgConfigSecretName(pg.Name, 0), + Namespace: tsNamespace, + }, + Data: tc.configData, + } + } + if tc.stateData != nil { + mustCreate(t, fc, &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: pgStateSecretName(pg.Name, 0), + Namespace: tsNamespace, + }, + Data: tc.stateData, + }) + } + + authKey, err := reconciler.getAuthKey(context.Background(), pg, cfgSecret, 0, reconciler.l.With("TestName", t.Name())) + if err != nil { + t.Fatalf("unexpected error getting auth key: %v", err) + } + if !reflect.DeepEqual(authKey, tc.expectedAuthKey) { + deref := func(s *string) string { + if s == nil { + return "" + } + return *s + } + t.Errorf("expected auth key %v, got %v", deref(tc.expectedAuthKey), deref(authKey)) + } + + // Use the device deletion as a proxy for the fact the new auth key + // was due to a reissue. + switch { + case tc.expectReissue && len(tsClient.deleted) != 1: + t.Errorf("expected 1 deleted device, got %v", tsClient.deleted) + case !tc.expectReissue && len(tsClient.deleted) != 0: + t.Errorf("expected no deleted devices, got %v", tsClient.deleted) + } + + if tc.expectReissue { + // Trigger the rate limit in a tight loop. Up to 100 iterations + // to allow for CI that is extremely slow, but should happen on + // first try for any reasonable machine. + for range 100 { + _, err := reconciler.getAuthKey(context.Background(), pg, cfgSecret, 0, reconciler.l.With("TestName", t.Name())) + if err != nil { + if !strings.Contains(err.Error(), "rate limit exceeded") { + t.Fatalf("unexpected error getting auth key: %v", err) + } + return // Expected rate limit error. 
+ } + } + t.Fatal("expected rate limit error, but got none") + } + }) + } +} + func proxyClassesForLEStagingTest() (*tsapi.ProxyClass, *tsapi.ProxyClass, *tsapi.ProxyClass) { pcLEStaging := &tsapi.ProxyClass{ ObjectMeta: metav1.ObjectMeta{ @@ -1505,6 +1696,7 @@ func TestProxyGroupLetsEncryptStaging(t *testing.T) { tsClient: &fakeTSClient{}, l: zl.Sugar(), clock: cl, + authKeyRateLimits: make(map[string]*rate.Limiter), } expectReconciled(t, reconciler, "", pg.Name) diff --git a/cmd/k8s-operator/sts.go b/cmd/k8s-operator/sts.go index 193acad87..a0b5ae9e7 100644 --- a/cmd/k8s-operator/sts.go +++ b/cmd/k8s-operator/sts.go @@ -935,7 +935,7 @@ func tailscaledConfig(stsC *tailscaleSTSConfig, newAuthkey string, oldSecret *co if newAuthkey != "" { conf.AuthKey = &newAuthkey - } else if shouldRetainAuthKey(oldSecret) { + } else if !deviceAuthed(oldSecret) { key, err := authKeyFromSecret(oldSecret) if err != nil { return nil, fmt.Errorf("error retrieving auth key from Secret: %w", err) @@ -984,6 +984,8 @@ func latestConfigFromSecret(s *corev1.Secret) (*ipn.ConfigVAlpha, error) { return conf, nil } +// authKeyFromSecret returns the auth key from the latest config version if +// found, or else nil. func authKeyFromSecret(s *corev1.Secret) (key *string, err error) { conf, err := latestConfigFromSecret(s) if err != nil { @@ -1000,13 +1002,13 @@ func authKeyFromSecret(s *corev1.Secret) (key *string, err error) { return key, nil } -// shouldRetainAuthKey returns true if the state stored in a proxy's state Secret suggests that auth key should be -// retained (because the proxy has not yet successfully authenticated). -func shouldRetainAuthKey(s *corev1.Secret) bool { +// deviceAuthed returns true if the state stored in a proxy's state Secret +// suggests that the proxy has successfully authenticated. +func deviceAuthed(s *corev1.Secret) bool { if s == nil { - return false // nothing to retain here + return false // No state Secret means no device state. 
} - return len(s.Data["device_id"]) == 0 // proxy has not authed yet + return len(s.Data["device_id"]) > 0 } func shouldAcceptRoutes(pc *tsapi.ProxyClass) bool { diff --git a/cmd/k8s-operator/testutils_test.go b/cmd/k8s-operator/testutils_test.go index 56542700d..acfe6f275 100644 --- a/cmd/k8s-operator/testutils_test.go +++ b/cmd/k8s-operator/testutils_test.go @@ -494,7 +494,7 @@ func expectedSecret(t *testing.T, cl client.Client, opts configOpts) *corev1.Sec AcceptDNS: "false", Hostname: &opts.hostname, Locked: "false", - AuthKey: ptr.To("secret-authkey"), + AuthKey: ptr.To("new-authkey"), AcceptRoutes: "false", AppConnector: &ipn.AppConnectorPrefs{Advertise: false}, NoStatefulFiltering: "true", @@ -801,7 +801,7 @@ func (c *fakeTSClient) CreateKey(ctx context.Context, caps tailscale.KeyCapabili Created: time.Now(), Capabilities: caps, } - return "secret-authkey", k, nil + return "new-authkey", k, nil } func (c *fakeTSClient) Device(ctx context.Context, deviceID string, fields *tailscale.DeviceFieldsOpts) (*tailscale.Device, error) { diff --git a/cmd/k8s-operator/tsrecorder_test.go b/cmd/k8s-operator/tsrecorder_test.go index e6d56ef2f..495c81c7e 100644 --- a/cmd/k8s-operator/tsrecorder_test.go +++ b/cmd/k8s-operator/tsrecorder_test.go @@ -229,7 +229,7 @@ func TestRecorder(t *testing.T) { func expectRecorderResources(t *testing.T, fc client.WithWatch, tsr *tsapi.Recorder, shouldExist bool) { t.Helper() - auth := tsrAuthSecret(tsr, tsNamespace, "secret-authkey") + auth := tsrAuthSecret(tsr, tsNamespace, "new-authkey") state := tsrStateSecret(tsr, tsNamespace) role := tsrRole(tsr, tsNamespace) roleBinding := tsrRoleBinding(tsr, tsNamespace) diff --git a/kube/kubetypes/types.go b/kube/kubetypes/types.go index 6f96875dd..a21b5f309 100644 --- a/kube/kubetypes/types.go +++ b/kube/kubetypes/types.go @@ -33,17 +33,17 @@ const ( // Keys that containerboot writes to state file that can be used to determine its state. // fields set in Tailscale state Secret. These are mostly used by the Tailscale Kubernetes operator to determine // the state of this tailscale device. - KeyDeviceID string = "device_id" // node stable ID of the device - KeyDeviceFQDN string = "device_fqdn" // device's tailnet hostname - KeyDeviceIPs string = "device_ips" // device's tailnet IPs - KeyPodUID string = "pod_uid" // Pod UID - // KeyCapVer contains Tailscale capability version of this proxy instance. - KeyCapVer string = "tailscale_capver" + KeyDeviceID = "device_id" // node stable ID of the device + KeyDeviceFQDN = "device_fqdn" // device's tailnet hostname + KeyDeviceIPs = "device_ips" // device's tailnet IPs + KeyPodUID = "pod_uid" // Pod UID + KeyCapVer = "tailscale_capver" // tailcfg.CurrentCapabilityVersion of this proxy instance. + KeyReissueAuthkey = "reissue_authkey" // Proxies set this to the auth key that failed, or empty if the config had no auth key, when they can't log in. // KeyHTTPSEndpoint is a name of a field that can be set to the value of any HTTPS endpoint currently exposed by // this device to the tailnet. This is used by the Kubernetes operator Ingress proxy to communicate to the operator // that cluster workloads behind the Ingress can now be accessed via the given DNS name over HTTPS. - KeyHTTPSEndpoint string = "https_endpoint" - ValueNoHTTPS string = "no-https" + KeyHTTPSEndpoint = "https_endpoint" + ValueNoHTTPS = "no-https" // Pod's IPv4 address header key as returned by containerboot health check endpoint. PodIPv4Header string = "Pod-IPv4"
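
For review purposes, here is a minimal standalone sketch of the operator-side reissue decision implemented by shouldReissueAuthKey in this patch. The shouldReissue function name and the plain map standing in for the state Secret's Data field are illustrative only; the three cases mirror the doc comment in cmd/k8s-operator/proxygroup.go.

package main

import "fmt"

// shouldReissue restates the decision logic of shouldReissueAuthKey over
// plain values: stateData stands in for the state Secret's Data map, and
// authKeyInConfig for the auth key (if any) in the existing config Secret.
func shouldReissue(stateData map[string][]byte, authKeyInConfig *string) bool {
	// No reissue_authkey key at all: the proxy never asked for a reissue.
	broken, requested := stateData["reissue_authkey"]
	if !requested {
		return false
	}
	// Reissue requested and the config carries no auth key: reissue.
	if authKeyInConfig == nil || *authKeyInConfig == "" {
		return true
	}
	// The key we would hand out is the one reported broken: reissue.
	if *authKeyInConfig == string(broken) {
		return true
	}
	// The config already holds a newer key the proxy hasn't tried yet,
	// so don't mint another one.
	return false
}

func main() {
	state := map[string][]byte{"reissue_authkey": []byte("tskey-old")}
	oldKey, newKey := "tskey-old", "tskey-new"
	fmt.Println(shouldReissue(state, &oldKey)) // true: config still has the broken key
	fmt.Println(shouldReissue(state, &newKey)) // false: proxy hasn't tried the new key yet
	fmt.Println(shouldReissue(nil, &oldKey))   // false: no reissue requested
}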
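The per-ProxyGroup limiter built in ensureStateAddedForProxyGroup uses golang.org/x/time/rate. A small sketch of how the burst-then-refill behaviour plays out, with an arbitrary replica count of 3 for illustration:

package main

import (
	"fmt"
	"time"

	"golang.org/x/time/rate"
)

func main() {
	// Mirrors the limiter from ensureStateAddedForProxyGroup: a burst sized
	// to the replica count so each replica can have one key reissued
	// immediately, then a refill of one token per 30 seconds.
	replicas := 3 // example value; the patch uses pgReplicas(pg)
	lim := rate.NewLimiter(rate.Every(30*time.Second), replicas)

	// The initial burst is allowed once per replica...
	for i := 0; i < replicas; i++ {
		fmt.Printf("reissue %d allowed: %v\n", i, lim.Allow())
	}
	// ...after which Allow reports false until a token refills. In
	// getAuthKey this surfaces as the "rate limit exceeded" error, and
	// the reconciler retries with backoff.
	fmt.Println("extra reissue allowed:", lim.Allow())
	fmt.Printf("tokens remaining: %.2f\n", lim.Tokens())
}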
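One design note on the Secret patching: resetContainerbootState clears fields by setting them to nil, and the updated test kubeServer now deletes keys whose patched value is nil. This relies on merge-patch semantics, where a null value deletes a key rather than storing an empty string. A sketch of the patch body shape, assuming standard encoding/json behaviour (a nil []byte marshals to JSON null):

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Shape of the merge-patch body resetContainerbootState produces: nil
	// values render as JSON null, which the API server (and the patched
	// test kubeServer) interprets as "delete this key".
	patch := map[string]map[string][]byte{
		"data": {
			"device_fqdn":    nil, // delete
			"device_ips":     nil, // delete
			"https_endpoint": nil, // delete
		},
	}
	b, _ := json.MarshalIndent(patch, "", "  ")
	fmt.Println(string(b)) // each cleared key renders as null
}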