cmd/{containerboot,k8s-operator}: use state Secret for checking device auth (#16328)

Previously, the operator checked the ProxyGroup status fields for
information on how many of the proxies had successfully authed. Use
their state Secrets instead as a more reliable source of truth.

containerboot has written device_fqdn and device_ips keys to the
state Secret since inception, and pod_uid since 1.78.0, so there's
no need to use the API for that data. Read it from the state Secret
for consistency. However, to ensure we don't read data from a
previous run of containerboot, make sure we reset containerboot's
state keys on startup.

One knock-on effect of resetting state on startup is that ProxyGroups
can briefly be marked not Ready while a Pod is restarting. Introduce a
new ProxyGroupAvailable condition to more accurately reflect
when downstream controllers can implement flows that rely on a
ProxyGroup having at least 1 proxy Pod running.

Fixes #16327

Change-Id: I026c18e9d23e87109a471a87b8e4fb6271716a66

Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
This commit is contained in:
Tom Proctor
2025-06-27 18:10:04 +01:00
committed by GitHub
parent f81baa2d56
commit 711698f5a9
19 changed files with 373 additions and 202 deletions

View File

@@ -18,12 +18,15 @@ import (
"time"
"tailscale.com/ipn"
"tailscale.com/kube/egressservices"
"tailscale.com/kube/ingressservices"
"tailscale.com/kube/kubeapi"
"tailscale.com/kube/kubeclient"
"tailscale.com/kube/kubetypes"
"tailscale.com/logtail/backoff"
"tailscale.com/tailcfg"
"tailscale.com/types/logger"
"tailscale.com/util/set"
)
// kubeClient is a wrapper around Tailscale's internal kube client that knows how to talk to the kube API server. We use
@@ -117,20 +120,39 @@ func (kc *kubeClient) deleteAuthKey(ctx context.Context) error {
return nil
}
// storeCapVerUID stores the current capability version of tailscale and, if provided, UID of the Pod in the tailscale
// state Secret.
// These two fields are used by the Kubernetes Operator to observe the current capability version of tailscaled running in this container.
func (kc *kubeClient) storeCapVerUID(ctx context.Context, podUID string) error {
capVerS := fmt.Sprintf("%d", tailcfg.CurrentCapabilityVersion)
d := map[string][]byte{
kubetypes.KeyCapVer: []byte(capVerS),
// resetContainerbootState resets state from previous runs of containerboot to
// ensure the operator doesn't use stale state when a Pod is first recreated.
//
// It reads the existing state Secret, then patches it with the current
// capability version, the Pod UID (only when podUID is non-empty), and a nil
// value for each per-run key that is already present in the Secret. An error
// is returned if the Secret cannot be read or if the patch call fails.
func (kc *kubeClient) resetContainerbootState(ctx context.Context, podUID string) error {
// Fetch the current Secret so that only keys which already exist get reset.
existingSecret, err := kc.GetSecret(ctx, kc.stateSecret)
if err != nil {
return fmt.Errorf("failed to read state Secret %q to reset state: %w", kc.stateSecret, err)
}
// The patch always carries the capability version of this build.
s := &kubeapi.Secret{
Data: map[string][]byte{
kubetypes.KeyCapVer: fmt.Appendf(nil, "%d", tailcfg.CurrentCapabilityVersion),
},
}
// The Pod UID is optional; it is only written when the caller supplied one.
if podUID != "" {
// NOTE(review): the next line references d, which is not declared in this
// function — it looks like residue of the removed storeCapVerUID body in
// this diff rendering. Confirm against the actual file.
d[kubetypes.KeyPodUID] = []byte(podUID)
s.Data[kubetypes.KeyPodUID] = []byte(podUID)
}
// NOTE(review): the following two lines also appear to be residue of the
// removed implementation (s is already declared above) — confirm.
s := &kubeapi.Secret{
Data: d,
// Keys written during a containerboot run that must not carry over from a
// previous run.
toClear := set.SetOf([]string{
kubetypes.KeyDeviceID,
kubetypes.KeyDeviceFQDN,
kubetypes.KeyDeviceIPs,
kubetypes.KeyHTTPSEndpoint,
egressservices.KeyEgressServices,
ingressservices.IngressConfigKey,
})
// Only keys already present in the Secret are added to the patch; keys that
// were never written are left out of it.
for key := range existingSecret.Data {
if toClear.Contains(key) {
// It's fine to leave the key in place as a debugging breadcrumb;
// it should get a new value soon.
s.Data[key] = nil
}
}
// Apply the accumulated changes to the state Secret as a single patch.
return kc.StrategicMergePatchSecret(ctx, kc.stateSecret, s, "tailscale-container")
}