mirror of
https://github.com/tailscale/tailscale.git
synced 2025-12-24 01:26:39 +00:00
cmd/{containerboot,k8s-operator}: use state Secret for checking device auth (#16328)
Previously, the operator checked the ProxyGroup status fields for information on how many of the proxies had successfully authed. Use their state Secrets instead as a more reliable source of truth. containerboot has written device_fqdn and device_ips keys to the state Secret since inception, and pod_uid since 1.78.0, so there's no need to use the API for that data. Read it from the state Secret for consistency. However, to ensure we don't read data from a previous run of containerboot, make sure we reset containerboot's state keys on startup. One other knock-on effect of that is ProxyGroups can briefly be marked not Ready while a Pod is restarting. Introduce a new ProxyGroupAvailable condition to more accurately reflect when downstream controllers can implement flows that rely on a ProxyGroup having at least 1 proxy Pod running. Fixes #16327 Change-Id: I026c18e9d23e87109a471a87b8e4fb6271716a66 Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
This commit is contained in:
@@ -52,6 +52,17 @@ const (
|
||||
// Copied from k8s.io/apiserver/pkg/registry/generic/registry/store.go@cccad306d649184bf2a0e319ba830c53f65c445c
|
||||
optimisticLockErrorMsg = "the object has been modified; please apply your changes to the latest version and try again"
|
||||
staticEndpointsMaxAddrs = 2
|
||||
|
||||
// The minimum tailcfg.CapabilityVersion that deployed clients are expected
|
||||
// to support to be compatible with the current ProxyGroup controller.
|
||||
// If the controller needs to depend on newer client behaviour, it should
|
||||
// maintain backwards compatible logic for older capability versions for 3
|
||||
// stable releases, as per documentation on supported version drift:
|
||||
// https://tailscale.com/kb/1236/kubernetes-operator#supported-versions
|
||||
//
|
||||
// tailcfg.CurrentCapabilityVersion was 106 when the ProxyGroup controller was
|
||||
// first introduced.
|
||||
pgMinCapabilityVersion = 106
|
||||
)
|
||||
|
||||
var (
|
||||
@@ -204,14 +215,27 @@ func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Requ
|
||||
}
|
||||
|
||||
desiredReplicas := int(pgReplicas(pg))
|
||||
|
||||
// Set ProxyGroupAvailable condition.
|
||||
status := metav1.ConditionFalse
|
||||
reason := reasonProxyGroupCreating
|
||||
message := fmt.Sprintf("%d/%d ProxyGroup pods running", len(pg.Status.Devices), desiredReplicas)
|
||||
if len(pg.Status.Devices) > 0 {
|
||||
status = metav1.ConditionTrue
|
||||
if len(pg.Status.Devices) == desiredReplicas {
|
||||
reason = reasonProxyGroupReady
|
||||
}
|
||||
}
|
||||
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupAvailable, status, reason, message, pg.Generation, r.clock, logger)
|
||||
|
||||
// Set ProxyGroupReady condition.
|
||||
if len(pg.Status.Devices) < desiredReplicas {
|
||||
message := fmt.Sprintf("%d/%d ProxyGroup pods running", len(pg.Status.Devices), desiredReplicas)
|
||||
logger.Debug(message)
|
||||
return setStatusReady(pg, metav1.ConditionFalse, reasonProxyGroupCreating, message)
|
||||
}
|
||||
|
||||
if len(pg.Status.Devices) > desiredReplicas {
|
||||
message := fmt.Sprintf("waiting for %d ProxyGroup pods to shut down", len(pg.Status.Devices)-desiredReplicas)
|
||||
message = fmt.Sprintf("waiting for %d ProxyGroup pods to shut down", len(pg.Status.Devices)-desiredReplicas)
|
||||
logger.Debug(message)
|
||||
return setStatusReady(pg, metav1.ConditionFalse, reasonProxyGroupCreating, message)
|
||||
}
|
||||
@@ -524,17 +548,13 @@ func (r *ProxyGroupReconciler) cleanupDanglingResources(ctx context.Context, pg
|
||||
if err := r.deleteTailnetDevice(ctx, m.tsID, logger); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := r.Delete(ctx, m.stateSecret); err != nil {
|
||||
if !apierrors.IsNotFound(err) {
|
||||
return fmt.Errorf("error deleting state Secret %s: %w", m.stateSecret.Name, err)
|
||||
}
|
||||
if err := r.Delete(ctx, m.stateSecret); err != nil && !apierrors.IsNotFound(err) {
|
||||
return fmt.Errorf("error deleting state Secret %q: %w", m.stateSecret.Name, err)
|
||||
}
|
||||
configSecret := m.stateSecret.DeepCopy()
|
||||
configSecret.Name += "-config"
|
||||
if err := r.Delete(ctx, configSecret); err != nil {
|
||||
if !apierrors.IsNotFound(err) {
|
||||
return fmt.Errorf("error deleting config Secret %s: %w", configSecret.Name, err)
|
||||
}
|
||||
if err := r.Delete(ctx, configSecret); err != nil && !apierrors.IsNotFound(err) {
|
||||
return fmt.Errorf("error deleting config Secret %q: %w", configSecret.Name, err)
|
||||
}
|
||||
// NOTE(ChaosInTheCRD): we shouldn't need to get the service first, checking for a not found error should be enough
|
||||
svc := &corev1.Service{
|
||||
@@ -635,17 +655,38 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var authKey string
|
||||
var authKey *string
|
||||
if existingCfgSecret == nil {
|
||||
logger.Debugf("Creating authkey for new ProxyGroup proxy")
|
||||
tags := pg.Spec.Tags.Stringify()
|
||||
if len(tags) == 0 {
|
||||
tags = r.defaultTags
|
||||
}
|
||||
authKey, err = newAuthKey(ctx, r.tsClient, tags)
|
||||
key, err := newAuthKey(ctx, r.tsClient, tags)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
authKey = &key
|
||||
}
|
||||
|
||||
if authKey == nil {
|
||||
// Get state Secret to check if it's already authed.
|
||||
stateSecret := &corev1.Secret{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: pgStateSecretName(pg.Name, i),
|
||||
Namespace: r.tsNamespace,
|
||||
},
|
||||
}
|
||||
if err := r.Get(ctx, client.ObjectKeyFromObject(stateSecret), stateSecret); err != nil && !apierrors.IsNotFound(err) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if shouldRetainAuthKey(stateSecret) && existingCfgSecret != nil {
|
||||
authKey, err = authKeyFromSecret(existingCfgSecret)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error retrieving auth key from existing config Secret: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
replicaName := pgNodePortServiceName(pg.Name, i)
|
||||
@@ -661,7 +702,14 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
|
||||
}
|
||||
}
|
||||
|
||||
configs, err := pgTailscaledConfig(pg, proxyClass, i, authKey, existingCfgSecret, endpoints[replicaName])
|
||||
// AdvertiseServices config is set by ingress-pg-reconciler, so make sure we
|
||||
// don't overwrite it if already set.
|
||||
existingAdvertiseServices, err := extractAdvertiseServicesConfig(existingCfgSecret)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
configs, err := pgTailscaledConfig(pg, proxyClass, i, authKey, endpoints[replicaName], existingAdvertiseServices)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error creating tailscaled config: %w", err)
|
||||
}
|
||||
@@ -811,20 +859,22 @@ func (r *ProxyGroupReconciler) ensureRemovedFromGaugeForProxyGroup(pg *tsapi.Pro
|
||||
gaugeIngressProxyGroupResources.Set(int64(r.ingressProxyGroups.Len()))
|
||||
}
|
||||
|
||||
func pgTailscaledConfig(pg *tsapi.ProxyGroup, class *tsapi.ProxyClass, idx int32, authKey string, oldSecret *corev1.Secret, staticEndpoints []netip.AddrPort) (tailscaledConfigs, error) {
|
||||
func pgTailscaledConfig(pg *tsapi.ProxyGroup, pc *tsapi.ProxyClass, idx int32, authKey *string, staticEndpoints []netip.AddrPort, oldAdvertiseServices []string) (tailscaledConfigs, error) {
|
||||
conf := &ipn.ConfigVAlpha{
|
||||
Version: "alpha0",
|
||||
AcceptDNS: "false",
|
||||
AcceptRoutes: "false", // AcceptRoutes defaults to true
|
||||
Locked: "false",
|
||||
Hostname: ptr.To(fmt.Sprintf("%s-%d", pg.Name, idx)),
|
||||
Version: "alpha0",
|
||||
AcceptDNS: "false",
|
||||
AcceptRoutes: "false", // AcceptRoutes defaults to true
|
||||
Locked: "false",
|
||||
Hostname: ptr.To(fmt.Sprintf("%s-%d", pg.Name, idx)),
|
||||
AdvertiseServices: oldAdvertiseServices,
|
||||
AuthKey: authKey,
|
||||
}
|
||||
|
||||
if pg.Spec.HostnamePrefix != "" {
|
||||
conf.Hostname = ptr.To(fmt.Sprintf("%s-%d", pg.Spec.HostnamePrefix, idx))
|
||||
}
|
||||
|
||||
if shouldAcceptRoutes(class) {
|
||||
if shouldAcceptRoutes(pc) {
|
||||
conf.AcceptRoutes = "true"
|
||||
}
|
||||
|
||||
@@ -832,51 +882,26 @@ func pgTailscaledConfig(pg *tsapi.ProxyGroup, class *tsapi.ProxyClass, idx int32
|
||||
conf.StaticEndpoints = staticEndpoints
|
||||
}
|
||||
|
||||
deviceAuthed := false
|
||||
for _, d := range pg.Status.Devices {
|
||||
if d.Hostname == *conf.Hostname {
|
||||
deviceAuthed = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if authKey != "" {
|
||||
conf.AuthKey = &authKey
|
||||
} else if !deviceAuthed {
|
||||
key, err := authKeyFromSecret(oldSecret)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error retrieving auth key from Secret: %w", err)
|
||||
}
|
||||
conf.AuthKey = key
|
||||
}
|
||||
capVerConfigs := make(map[tailcfg.CapabilityVersion]ipn.ConfigVAlpha)
|
||||
|
||||
// AdvertiseServices config is set by ingress-pg-reconciler, so make sure we
|
||||
// don't overwrite it here.
|
||||
if err := copyAdvertiseServicesConfig(conf, oldSecret, 106); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
capVerConfigs[106] = *conf
|
||||
return capVerConfigs, nil
|
||||
return map[tailcfg.CapabilityVersion]ipn.ConfigVAlpha{
|
||||
pgMinCapabilityVersion: *conf,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func copyAdvertiseServicesConfig(conf *ipn.ConfigVAlpha, oldSecret *corev1.Secret, capVer tailcfg.CapabilityVersion) error {
|
||||
if oldSecret == nil {
|
||||
return nil
|
||||
func extractAdvertiseServicesConfig(cfgSecret *corev1.Secret) ([]string, error) {
|
||||
if cfgSecret == nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
oldConfB := oldSecret.Data[tsoperator.TailscaledConfigFileName(capVer)]
|
||||
if len(oldConfB) == 0 {
|
||||
return nil
|
||||
conf, err := latestConfigFromSecret(cfgSecret)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var oldConf ipn.ConfigVAlpha
|
||||
if err := json.Unmarshal(oldConfB, &oldConf); err != nil {
|
||||
return fmt.Errorf("error unmarshalling existing config: %w", err)
|
||||
if conf == nil {
|
||||
return nil, nil
|
||||
}
|
||||
conf.AdvertiseServices = oldConf.AdvertiseServices
|
||||
|
||||
return nil
|
||||
return conf.AdvertiseServices, nil
|
||||
}
|
||||
|
||||
func (r *ProxyGroupReconciler) validate(_ *tsapi.ProxyGroup) error {
|
||||
@@ -914,7 +939,7 @@ func (r *ProxyGroupReconciler) getNodeMetadata(ctx context.Context, pg *tsapi.Pr
|
||||
dnsName: prefs.Config.UserProfile.LoginName,
|
||||
}
|
||||
pod := &corev1.Pod{}
|
||||
if err := r.Get(ctx, client.ObjectKey{Namespace: r.tsNamespace, Name: secret.Name}, pod); err != nil && !apierrors.IsNotFound(err) {
|
||||
if err := r.Get(ctx, client.ObjectKey{Namespace: r.tsNamespace, Name: fmt.Sprintf("%s-%d", pg.Name, ordinal)}, pod); err != nil && !apierrors.IsNotFound(err) {
|
||||
return nil, err
|
||||
} else if err == nil {
|
||||
nm.podUID = string(pod.UID)
|
||||
@@ -932,17 +957,23 @@ func (r *ProxyGroupReconciler) getDeviceInfo(ctx context.Context, staticEndpoint
|
||||
}
|
||||
|
||||
for _, m := range metadata {
|
||||
device, ok, err := getDeviceInfo(ctx, r.tsClient, m.stateSecret)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !ok {
|
||||
if !strings.EqualFold(string(m.stateSecret.Data[kubetypes.KeyPodUID]), m.podUID) {
|
||||
// Current Pod has not yet written its UID to the state Secret, data may
|
||||
// be stale.
|
||||
continue
|
||||
}
|
||||
|
||||
dev := tsapi.TailnetDevice{
|
||||
Hostname: device.Hostname,
|
||||
TailnetIPs: device.TailnetIPs,
|
||||
device := tsapi.TailnetDevice{}
|
||||
if ipsB := m.stateSecret.Data[kubetypes.KeyDeviceIPs]; len(ipsB) > 0 {
|
||||
ips := []string{}
|
||||
if err := json.Unmarshal(ipsB, &ips); err != nil {
|
||||
return nil, fmt.Errorf("failed to extract device IPs from state Secret %q: %w", m.stateSecret.Name, err)
|
||||
}
|
||||
device.TailnetIPs = ips
|
||||
}
|
||||
|
||||
if hostname, _, ok := strings.Cut(string(m.stateSecret.Data[kubetypes.KeyDeviceFQDN]), "."); ok {
|
||||
device.Hostname = hostname
|
||||
}
|
||||
|
||||
if ep, ok := staticEndpoints[device.Hostname]; ok && len(ep) > 0 {
|
||||
@@ -950,10 +981,10 @@ func (r *ProxyGroupReconciler) getDeviceInfo(ctx context.Context, staticEndpoint
|
||||
for _, e := range ep {
|
||||
eps = append(eps, e.String())
|
||||
}
|
||||
dev.StaticEndpoints = eps
|
||||
device.StaticEndpoints = eps
|
||||
}
|
||||
|
||||
devices = append(devices, dev)
|
||||
devices = append(devices, device)
|
||||
}
|
||||
|
||||
return devices, nil
|
||||
|
||||
Reference in New Issue
Block a user