tailscale/cmd/k8s-operator/proxygroup.go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause

//go:build !plan9

package main

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"net/http"
	"net/netip"
	"slices"
	"strings"
	"sync"

	"go.uber.org/zap"
	xslices "golang.org/x/exp/slices"
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	rbacv1 "k8s.io/api/rbac/v1"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/client-go/tools/record"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
	"tailscale.com/client/tailscale"
	"tailscale.com/ipn"
	tsoperator "tailscale.com/k8s-operator"
	tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
	"tailscale.com/kube/egressservices"
	"tailscale.com/kube/kubetypes"
	"tailscale.com/tailcfg"
	"tailscale.com/tstime"
	"tailscale.com/types/ptr"
	"tailscale.com/util/clientmetric"
	"tailscale.com/util/mak"
	"tailscale.com/util/set"
)

const (
	// Reason strings used when setting conditions and recording events for
	// a ProxyGroup.
	reasonProxyGroupCreationFailed = "ProxyGroupCreationFailed"
	reasonProxyGroupReady          = "ProxyGroupReady"
	reasonProxyGroupCreating       = "ProxyGroupCreating"
	reasonProxyGroupInvalid        = "ProxyGroupInvalid"

	// optimisticLockErrorMsg is matched as a substring against provisioning
	// errors to detect update conflicts that should be retried quietly.
	// Copied from k8s.io/apiserver/pkg/registry/generic/registry/store.go@cccad306d649184bf2a0e319ba830c53f65c445c
	optimisticLockErrorMsg  = "the object has been modified; please apply your changes to the latest version and try again"
	staticEndpointsMaxAddrs = 2 // NOTE(review): presumably the per-replica cap on static endpoint addresses; not part of the copied upstream value above.

	// The minimum tailcfg.CapabilityVersion that deployed clients are expected
	// to support to be compatible with the current ProxyGroup controller.
	// If the controller needs to depend on newer client behaviour, it should
	// maintain backwards compatible logic for older capability versions for 3
	// stable releases, as per documentation on supported version drift:
	// https://tailscale.com/kb/1236/kubernetes-operator#supported-versions
	//
	// tailcfg.CurrentCapabilityVersion was 106 when the ProxyGroup controller was
	// first introduced.
	pgMinCapabilityVersion = 106
)

// Gauges tracking how many egress and ingress ProxyGroups this reconciler
// currently knows about. They are set from the sizes of the reconciler's
// egressProxyGroups and ingressProxyGroups sets.
var (
	gaugeEgressProxyGroupResources  = clientmetric.NewGauge(kubetypes.MetricProxyGroupEgressCount)
	gaugeIngressProxyGroupResources = clientmetric.NewGauge(kubetypes.MetricProxyGroupIngressCount)
)

// ProxyGroupReconciler ensures cluster resources for a ProxyGroup definition.
//
// It embeds a controller-runtime client for talking to the Kubernetes API and
// uses tsClient for auth key creation and device deletion. The two UID sets
// at the bottom back the ProxyGroup gauges and must only be touched while mu
// is held.
type ProxyGroupReconciler struct {
	client.Client
	l        *zap.SugaredLogger
	recorder record.EventRecorder
	clock    tstime.Clock
	tsClient tsClient

	// User-specified defaults from the helm installation.
	tsNamespace       string
	proxyImage        string
	defaultTags       []string
	tsFirewallMode    string
	defaultProxyClass string

	mu                 sync.Mutex           // protects following
	egressProxyGroups  set.Slice[types.UID] // for egress proxygroups gauge
	ingressProxyGroups set.Slice[types.UID] // for ingress proxygroups gauge
}

// logger returns the reconciler's logger annotated with the name of the
// ProxyGroup being processed.
func (r *ProxyGroupReconciler) logger(name string) *zap.SugaredLogger {
	scoped := r.l.With("ProxyGroup", name)
	return scoped
}

// Reconcile brings the cluster state for the ProxyGroup named in req in line
// with its spec and records the outcome in the ProxyGroup's status
// conditions.
//
// The flow is:
//   - a ProxyGroup that no longer exists is ignored;
//   - a ProxyGroup marked for deletion is cleaned up and has its finalizer
//     removed once cleanup reports done (otherwise it is requeued after
//     shortRequeue);
//   - otherwise the finalizer is ensured, the spec is validated, the
//     effective ProxyClass (spec value or the operator default) is looked up
//     and must be ready, resources are provisioned, and the
//     ProxyGroupAvailable / ProxyGroupReady conditions are set from the
//     number of devices recorded in status versus the desired replica count.
//
// Any non-nil error returned makes controller-runtime retry the request.
func (r *ProxyGroupReconciler) Reconcile(ctx context.Context, req reconcile.Request) (_ reconcile.Result, err error) {
	logger := r.logger(req.Name)
	logger.Debugf("starting reconcile")
	defer logger.Debugf("reconcile finished")

	pg := new(tsapi.ProxyGroup)
	err = r.Get(ctx, req.NamespacedName, pg)
	if apierrors.IsNotFound(err) {
		logger.Debugf("ProxyGroup not found, assuming it was deleted")
		return reconcile.Result{}, nil
	} else if err != nil {
		return reconcile.Result{}, fmt.Errorf("failed to get tailscale.com ProxyGroup: %w", err)
	}
	if markedForDeletion(pg) {
		logger.Debugf("ProxyGroup is being deleted, cleaning up resources")
		ix := xslices.Index(pg.Finalizers, FinalizerName)
		if ix < 0 {
			logger.Debugf("no finalizer, nothing to do")
			return reconcile.Result{}, nil
		}

		if done, err := r.maybeCleanup(ctx, pg); err != nil {
			return reconcile.Result{}, err
		} else if !done {
			logger.Debugf("ProxyGroup resource cleanup not yet finished, will retry...")
			return reconcile.Result{RequeueAfter: shortRequeue}, nil
		}

		pg.Finalizers = slices.Delete(pg.Finalizers, ix, ix+1)
		if err := r.Update(ctx, pg); err != nil {
			return reconcile.Result{}, err
		}
		return reconcile.Result{}, nil
	}

	// setStatusReady sets the ProxyGroupReady condition, writes the status
	// back if it changed, and returns the function-level err (joined with any
	// status update error). Callers must therefore assign to the named result
	// err, never to a shadowing local, for their error to be propagated.
	oldPGStatus := pg.Status.DeepCopy()
	setStatusReady := func(pg *tsapi.ProxyGroup, status metav1.ConditionStatus, reason, message string) (reconcile.Result, error) {
		tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, status, reason, message, pg.Generation, r.clock, logger)
		if !apiequality.Semantic.DeepEqual(oldPGStatus, &pg.Status) {
			// An error encountered here should get returned by the Reconcile function.
			if updateErr := r.Client.Status().Update(ctx, pg); updateErr != nil {
				err = errors.Join(err, updateErr)
			}
		}
		return reconcile.Result{}, err
	}

	if !slices.Contains(pg.Finalizers, FinalizerName) {
		// This log line is printed exactly once during initial provisioning,
		// because once the finalizer is in place this block gets skipped. So,
		// this is a nice place to log that the high level, multi-reconcile
		// operation is underway.
		logger.Infof("ensuring ProxyGroup is set up")
		pg.Finalizers = append(pg.Finalizers, FinalizerName)
		if err = r.Update(ctx, pg); err != nil {
			err = fmt.Errorf("error adding finalizer: %w", err)
			return setStatusReady(pg, metav1.ConditionFalse, reasonProxyGroupCreationFailed, reasonProxyGroupCreationFailed)
		}
	}

	if err = r.validate(pg); err != nil {
		message := fmt.Sprintf("ProxyGroup is invalid: %s", err)
		// The message is already formatted; use Event rather than Eventf so
		// that any '%' in it is not interpreted as a formatting verb.
		r.recorder.Event(pg, corev1.EventTypeWarning, reasonProxyGroupInvalid, message)
		return setStatusReady(pg, metav1.ConditionFalse, reasonProxyGroupInvalid, message)
	}

	proxyClassName := r.defaultProxyClass
	if pg.Spec.ProxyClass != "" {
		proxyClassName = pg.Spec.ProxyClass
	}

	var proxyClass *tsapi.ProxyClass
	if proxyClassName != "" {
		proxyClass = new(tsapi.ProxyClass)
		// Assign to the named result (no ':=') so that setStatusReady
		// returns the lookup failure and the request is retried.
		err = r.Get(ctx, types.NamespacedName{Name: proxyClassName}, proxyClass)
		if apierrors.IsNotFound(err) {
			// A missing ProxyClass is an expected, transient state rather
			// than a reconcile failure.
			err = nil
			message := fmt.Sprintf("the ProxyGroup's ProxyClass %s does not (yet) exist", proxyClassName)
			logger.Info(message)
			return setStatusReady(pg, metav1.ConditionFalse, reasonProxyGroupCreating, message)
		}
		if err != nil {
			err = fmt.Errorf("error getting ProxyGroup's ProxyClass %s: %w", proxyClassName, err)
			r.recorder.Event(pg, corev1.EventTypeWarning, reasonProxyGroupCreationFailed, err.Error())
			return setStatusReady(pg, metav1.ConditionFalse, reasonProxyGroupCreationFailed, err.Error())
		}
		validateProxyClassForPG(logger, pg, proxyClass)
		if !tsoperator.ProxyClassIsReady(proxyClass) {
			message := fmt.Sprintf("the ProxyGroup's ProxyClass %s is not yet in a ready state, waiting...", proxyClassName)
			logger.Info(message)
			return setStatusReady(pg, metav1.ConditionFalse, reasonProxyGroupCreating, message)
		}
	}

	isProvisioned, err := r.maybeProvision(ctx, pg, proxyClass)
	if err != nil {
		reason := reasonProxyGroupCreationFailed
		msg := fmt.Sprintf("error provisioning ProxyGroup resources: %s", err)
		if strings.Contains(err.Error(), optimisticLockErrorMsg) {
			// Update conflicts are routine; report them as "creating" and
			// do not surface them as a reconcile error or warning event.
			reason = reasonProxyGroupCreating
			msg = fmt.Sprintf("optimistic lock error, retrying: %s", err)
			err = nil
			logger.Info(msg)
		} else {
			r.recorder.Event(pg, corev1.EventTypeWarning, reason, msg)
		}

		return setStatusReady(pg, metav1.ConditionFalse, reason, msg)
	}

	if !isProvisioned {
		// maybeProvision declined to finish without returning an error; only
		// persist whatever status changes were made along the way.
		if !apiequality.Semantic.DeepEqual(oldPGStatus, &pg.Status) {
			// An error encountered here should get returned by the Reconcile function.
			if updateErr := r.Client.Status().Update(ctx, pg); updateErr != nil {
				return reconcile.Result{}, errors.Join(err, updateErr)
			}
		}
		return reconcile.Result{}, nil
	}

	desiredReplicas := int(pgReplicas(pg))

	// Set ProxyGroupAvailable condition.
	status := metav1.ConditionFalse
	reason := reasonProxyGroupCreating
	message := fmt.Sprintf("%d/%d ProxyGroup pods running", len(pg.Status.Devices), desiredReplicas)
	if len(pg.Status.Devices) > 0 {
		status = metav1.ConditionTrue
		if len(pg.Status.Devices) == desiredReplicas {
			reason = reasonProxyGroupReady
		}
	}
	tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupAvailable, status, reason, message, pg.Generation, r.clock, logger)

	// Set ProxyGroupReady condition.
	if len(pg.Status.Devices) < desiredReplicas {
		logger.Debug(message)
		return setStatusReady(pg, metav1.ConditionFalse, reasonProxyGroupCreating, message)
	}

	if len(pg.Status.Devices) > desiredReplicas {
		message = fmt.Sprintf("waiting for %d ProxyGroup pods to shut down", len(pg.Status.Devices)-desiredReplicas)
		logger.Debug(message)
		return setStatusReady(pg, metav1.ConditionFalse, reasonProxyGroupCreating, message)
	}

	logger.Info("ProxyGroup resources synced")
	return setStatusReady(pg, metav1.ConditionTrue, reasonProxyGroupReady, reasonProxyGroupReady)
}

// validateProxyClassForPG applies custom validation logic for ProxyClass applied to ProxyGroup.
//
// It never rejects the ProxyClass; the only effect is a warning logged via
// logger when a non-ingress ProxyGroup uses a ProxyClass for which
// hasLocalAddrPortSet reports true.
func validateProxyClassForPG(logger *zap.SugaredLogger, pg *tsapi.ProxyGroup, pc *tsapi.ProxyClass) {
	if pg.Spec.Type == tsapi.ProxyGroupTypeIngress {
		return
	}
	// Our custom logic for ensuring minimum downtime ProxyGroup update rollouts relies on the local health check
	// being accessible on the replica Pod IP:9002. This address can also be modified by users, via
	// TS_LOCAL_ADDR_PORT env var.
	//
	// Currently TS_LOCAL_ADDR_PORT controls Pod's health check and metrics address. _Probably_ there is no need for
	// users to set this to a custom value. Users who want to consume metrics, should integrate with the metrics
	// Service and/or ServiceMonitor, rather than Pods directly. The health check is likely not useful to integrate
	// directly with for operator proxies (and we should aim for unified lifecycle logic in the operator, users
	// shouldn't need to set their own).
	//
	// TODO(irbekrm): maybe disallow configuring this env var in future (in Tailscale 1.84 or later).
	if hasLocalAddrPortSet(pc) {
		// Each fragment ends with a space so the concatenated sentences do
		// not run into each other in the log output.
		msg := fmt.Sprintf("ProxyClass %s applied to an egress ProxyGroup has TS_LOCAL_ADDR_PORT env var set to a custom value. "+
			"This will disable the ProxyGroup graceful failover mechanism, so you might experience downtime when ProxyGroup pods are restarted. "+
			"In future we will remove the ability to set custom TS_LOCAL_ADDR_PORT for egress ProxyGroups. "+
			"Please raise an issue if you expect that this will cause issues for your workflow.", pc.Name)
		logger.Warn(msg)
	}
}

// maybeProvision creates or updates every Kubernetes resource backing pg and
// then refreshes pg.Status.Devices.
//
// Resources are handled in this order: NodePort Services (only when the
// ProxyClass configures static endpoints), config Secrets, state Secrets,
// ServiceAccount, Role, RoleBinding, the egress or ingress ConfigMap, the
// StatefulSet, metrics resources, and finally removal of dangling resources.
//
// It returns (true, nil) once everything was applied and device info was
// read. It returns (false, nil) when port allocation or static endpoint
// discovery failed with their dedicated error types; in that case the
// failure is handed to r.setStatusReady instead of being returned. Any other
// failure is returned as a wrapped error.
func (r *ProxyGroupReconciler) maybeProvision(ctx context.Context, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass) (isProvisioned bool, err error) {
	logger := r.logger(pg.Name)
	r.mu.Lock()
	r.ensureAddedToGaugeForProxyGroup(pg)
	r.mu.Unlock()

	// NodePort Services are only needed when static endpoints are configured.
	svcToNodePorts := make(map[string]uint16)
	var tailscaledPort *uint16
	if proxyClass != nil && proxyClass.Spec.StaticEndpoints != nil {
		svcToNodePorts, tailscaledPort, err = r.ensureNodePortServiceCreated(ctx, pg, proxyClass)
		if err != nil {
			wrappedErr := fmt.Errorf("error provisioning NodePort Services for static endpoints: %w", err)
			var allocatePortErr *allocatePortsErr
			if errors.As(err, &allocatePortErr) {
				// Port allocation failures are reported through
				// r.setStatusReady (defined outside this view) rather than
				// returned as an error.
				reason := reasonProxyGroupCreationFailed
				msg := fmt.Sprintf("error provisioning ProxyGroup resources: %s", wrappedErr)
				r.setStatusReady(pg, metav1.ConditionFalse, reason, msg, logger)
				return false, nil
			}
			return false, wrappedErr
		}
	}

	staticEndpoints, err := r.ensureConfigSecretsCreated(ctx, pg, proxyClass, svcToNodePorts)
	if err != nil {
		wrappedErr := fmt.Errorf("error provisioning config Secrets: %w", err)
		var selectorErr *FindStaticEndpointErr
		if errors.As(err, &selectorErr) {
			// Same treatment as port allocation failures above: surface via
			// status, do not return an error.
			reason := reasonProxyGroupCreationFailed
			msg := fmt.Sprintf("error provisioning ProxyGroup resources: %s", wrappedErr)
			r.setStatusReady(pg, metav1.ConditionFalse, reason, msg, logger)
			return false, nil
		}
		return false, wrappedErr
	}

	// State secrets are precreated so we can use the ProxyGroup CR as their owner ref.
	stateSecrets := pgStateSecrets(pg, r.tsNamespace)
	for _, sec := range stateSecrets {
		if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, sec, func(s *corev1.Secret) {
			s.ObjectMeta.Labels = sec.ObjectMeta.Labels
			s.ObjectMeta.Annotations = sec.ObjectMeta.Annotations
			s.ObjectMeta.OwnerReferences = sec.ObjectMeta.OwnerReferences
		}); err != nil {
			return false, fmt.Errorf("error provisioning state Secrets: %w", err)
		}
	}
	sa := pgServiceAccount(pg, r.tsNamespace)
	if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, sa, func(s *corev1.ServiceAccount) {
		s.ObjectMeta.Labels = sa.ObjectMeta.Labels
		s.ObjectMeta.Annotations = sa.ObjectMeta.Annotations
		s.ObjectMeta.OwnerReferences = sa.ObjectMeta.OwnerReferences
	}); err != nil {
		return false, fmt.Errorf("error provisioning ServiceAccount: %w", err)
	}
	role := pgRole(pg, r.tsNamespace)
	// Note: inside the two closures below, r is the existing Role/RoleBinding
	// and shadows the reconciler receiver.
	if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, role, func(r *rbacv1.Role) {
		r.ObjectMeta.Labels = role.ObjectMeta.Labels
		r.ObjectMeta.Annotations = role.ObjectMeta.Annotations
		r.ObjectMeta.OwnerReferences = role.ObjectMeta.OwnerReferences
		r.Rules = role.Rules
	}); err != nil {
		return false, fmt.Errorf("error provisioning Role: %w", err)
	}
	roleBinding := pgRoleBinding(pg, r.tsNamespace)
	if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, roleBinding, func(r *rbacv1.RoleBinding) {
		r.ObjectMeta.Labels = roleBinding.ObjectMeta.Labels
		r.ObjectMeta.Annotations = roleBinding.ObjectMeta.Annotations
		r.ObjectMeta.OwnerReferences = roleBinding.ObjectMeta.OwnerReferences
		r.RoleRef = roleBinding.RoleRef
		r.Subjects = roleBinding.Subjects
	}); err != nil {
		return false, fmt.Errorf("error provisioning RoleBinding: %w", err)
	}
	if pg.Spec.Type == tsapi.ProxyGroupTypeEgress {
		cm, hp := pgEgressCM(pg, r.tsNamespace)
		if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, cm, func(existing *corev1.ConfigMap) {
			existing.ObjectMeta.Labels = cm.ObjectMeta.Labels
			existing.ObjectMeta.OwnerReferences = cm.ObjectMeta.OwnerReferences
			// Only this one key is written on update; other BinaryData
			// entries of the existing ConfigMap are left as they are.
			mak.Set(&existing.BinaryData, egressservices.KeyHEPPings, hp)
		}); err != nil {
			return false, fmt.Errorf("error provisioning egress ConfigMap %q: %w", cm.Name, err)
		}
	}
	if pg.Spec.Type == tsapi.ProxyGroupTypeIngress {
		cm := pgIngressCM(pg, r.tsNamespace)
		if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, cm, func(existing *corev1.ConfigMap) {
			existing.ObjectMeta.Labels = cm.ObjectMeta.Labels
			existing.ObjectMeta.OwnerReferences = cm.ObjectMeta.OwnerReferences
		}); err != nil {
			return false, fmt.Errorf("error provisioning ingress ConfigMap %q: %w", cm.Name, err)
		}
	}
	ss, err := pgStatefulSet(pg, r.tsNamespace, r.proxyImage, r.tsFirewallMode, tailscaledPort, proxyClass)
	if err != nil {
		return false, fmt.Errorf("error generating StatefulSet spec: %w", err)
	}
	cfg := &tailscaleSTSConfig{
		proxyType: string(pg.Spec.Type),
	}
	ss = applyProxyClassToStatefulSet(proxyClass, ss, cfg, logger)

	// On update, the whole spec plus labels, annotations and owner
	// references of the existing StatefulSet are replaced.
	updateSS := func(s *appsv1.StatefulSet) {
		s.Spec = ss.Spec

		s.ObjectMeta.Labels = ss.ObjectMeta.Labels
		s.ObjectMeta.Annotations = ss.ObjectMeta.Annotations
		s.ObjectMeta.OwnerReferences = ss.ObjectMeta.OwnerReferences
	}
	if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, ss, updateSS); err != nil {
		return false, fmt.Errorf("error provisioning StatefulSet: %w", err)
	}
	mo := &metricsOpts{
		tsNamespace:  r.tsNamespace,
		proxyStsName: pg.Name,
		proxyLabels:  pgLabels(pg.Name, nil),
		proxyType:    "proxygroup",
	}
	if err := reconcileMetricsResources(ctx, logger, mo, proxyClass, r.Client); err != nil {
		return false, fmt.Errorf("error reconciling metrics resources: %w", err)
	}

	if err := r.cleanupDanglingResources(ctx, pg, proxyClass); err != nil {
		return false, fmt.Errorf("error cleaning up dangling resources: %w", err)
	}

	devices, err := r.getDeviceInfo(ctx, staticEndpoints, pg)
	if err != nil {
		return false, fmt.Errorf("failed to get device info: %w", err)
	}

	// Status is only mutated in memory here; the caller persists it.
	pg.Status.Devices = devices

	return true, nil
}

// getServicePortsForProxyGroups lists the Services in namespace labelled as
// belonging to a ProxyGroup and returns two views of their NodePorts: a map
// from Service name to NodePort, and the set of those ports for quick
// occupancy checks. Only Services with exactly one port, a non-zero NodePort,
// and a NodePort inside portRanges are included.
func getServicePortsForProxyGroups(ctx context.Context, c client.Client, namespace string, portRanges tsapi.PortRanges) (map[string]uint16, set.Set[uint16], error) {
	var svcList corev1.ServiceList
	byParentType := client.MatchingLabels(map[string]string{
		LabelParentType: "proxygroup",
	})
	if err := c.List(ctx, &svcList, byParentType, client.InNamespace(namespace)); err != nil {
		return nil, nil, fmt.Errorf("failed to list ProxyGroup Services: %w", err)
	}

	portsByService := map[string]uint16{}
	occupied := set.Set[uint16]{}
	for _, svc := range svcList.Items {
		if len(svc.Spec.Ports) != 1 {
			continue
		}
		nodePort := svc.Spec.Ports[0].NodePort
		if nodePort == 0 {
			continue
		}
		p := uint16(nodePort)
		if !portRanges.Contains(p) {
			continue
		}
		portsByService[svc.Name] = p
		occupied.Add(p)
	}

	return portsByService, occupied, nil
}

// allocatePortsErr is the error type produced by allocatePorts. Callers use
// errors.As on it to tell port allocation failures apart from other errors.
type allocatePortsErr struct {
	msg string // human-readable description of the failure
}

// Compile-time check that *allocatePortsErr satisfies error.
var _ error = (*allocatePortsErr)(nil)

// Error returns the stored message.
func (e *allocatePortsErr) Error() string {
	return e.msg
}

// allocatePorts assigns a NodePort from portRanges to every replica of pg
// that does not already have a NodePort Service with a port in range.
//
// The returned map is keyed by Service name. Besides pg's own replicas it
// also contains the entries for every other ProxyGroup NodePort Service
// found in the operator namespace, as reported by
// getServicePortsForProxyGroups. Ports are handed out in replica order,
// taking the first port of portRanges.All() that is not yet in use.
//
// All failures are returned as *allocatePortsErr: either the existing
// Services could not be listed, or portRanges has too few free ports for the
// replica count. proxyClassName is only used in the latter error message.
func (r *ProxyGroupReconciler) allocatePorts(ctx context.Context, pg *tsapi.ProxyGroup, proxyClassName string, portRanges tsapi.PortRanges) (map[string]uint16, error) {
	replicaCount := int(pgReplicas(pg))
	svcToNodePorts, usedPorts, err := getServicePortsForProxyGroups(ctx, r.Client, r.tsNamespace, portRanges)
	if err != nil {
		return nil, &allocatePortsErr{msg: fmt.Sprintf("failed to find ports for existing ProxyGroup NodePort Services: %s", err.Error())}
	}

	replicasAllocated := 0
	for i := range pgReplicas(pg) {
		svcName := pgNodePortServiceName(pg.Name, i)
		if _, ok := svcToNodePorts[svcName]; ok {
			// Replica already has a NodePort within the configured ranges.
			replicasAllocated++
			continue
		}

		// Placeholder; stays 0 if the ranges are exhausted.
		svcToNodePorts[svcName] = 0
		for p := range portRanges.All() {
			if usedPorts.Contains(p) {
				continue
			}
			svcToNodePorts[svcName] = p
			usedPorts.Add(p)
			replicasAllocated++
			break
		}
	}

	if replicasAllocated < replicaCount {
		// Report the number of this ProxyGroup's replicas that got a port,
		// not usedPorts.Len(), which also counts other ProxyGroups' ports.
		return nil, &allocatePortsErr{msg: fmt.Sprintf("not enough available ports to allocate all replicas (needed %d, got %d). Field 'spec.staticEndpoints.nodePort.ports' on ProxyClass %q must have bigger range allocated", replicaCount, replicasAllocated, proxyClassName)}
	}

	return svcToNodePorts, nil
}

// ensureNodePortServiceCreated makes sure one NodePort Service exists per
// replica of pg, all sharing the same Port/TargetPort. It returns the
// Service-name-to-NodePort map produced by allocatePorts and a pointer to the
// shared tailscaled port.
//
// The shared port starts out as a fresh random port and is replaced by the
// port of any already existing single-port replica Service that has a
// non-zero Port, so that an earlier choice is kept where possible.
func (r *ProxyGroupReconciler) ensureNodePortServiceCreated(ctx context.Context, pg *tsapi.ProxyGroup, pc *tsapi.ProxyClass) (map[string]uint16, *uint16, error) {
	// NOTE: (ChaosInTheCRD) we want the same TargetPort for every static endpoint NodePort Service for the ProxyGroup
	sharedPort := getRandomPort()
	var replicaSvcs []*corev1.Service
	for i := range pgReplicas(pg) {
		svcName := pgNodePortServiceName(pg.Name, i)
		key := types.NamespacedName{Name: svcName, Namespace: r.tsNamespace}

		existing := &corev1.Service{}
		err := r.Get(ctx, key, existing)
		switch {
		case err == nil:
			// NOTE: if we can we want to recover the random port used for tailscaled,
			// as well as the NodePort previously used for that Service
			if len(existing.Spec.Ports) == 1 && existing.Spec.Ports[0].Port != 0 {
				sharedPort = uint16(existing.Spec.Ports[0].Port)
			}
			replicaSvcs = append(replicaSvcs, existing)
		case apierrors.IsNotFound(err):
			replicaSvcs = append(replicaSvcs, pgNodePortService(pg, svcName, r.tsNamespace))
		default:
			return nil, nil, fmt.Errorf("error getting Kubernetes Service %q: %w", svcName, err)
		}
	}

	svcToNodePorts, err := r.allocatePorts(ctx, pg, pc.Name, pc.Spec.StaticEndpoints.NodePort.Ports)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to allocate NodePorts to ProxyGroup Services: %w", err)
	}

	for _, svc := range replicaSvcs {
		// NOTE: we know that every service is going to have 1 port here
		first := &svc.Spec.Ports[0]
		first.Port = int32(sharedPort)
		first.TargetPort = intstr.FromInt(int(sharedPort))
		first.NodePort = int32(svcToNodePorts[svc.Name])

		applyDesired := func(s *corev1.Service) {
			s.ObjectMeta.Labels = svc.ObjectMeta.Labels
			s.ObjectMeta.Annotations = svc.ObjectMeta.Annotations
			s.ObjectMeta.OwnerReferences = svc.ObjectMeta.OwnerReferences
			s.Spec.Selector = svc.Spec.Selector
			s.Spec.Ports = svc.Spec.Ports
		}
		if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, svc, applyDesired); err != nil {
			return nil, nil, fmt.Errorf("error creating/updating Kubernetes NodePort Service %q: %w", svc.Name, err)
		}
	}

	return svcToNodePorts, ptr.To(sharedPort), nil
}

// cleanupDanglingResources removes what is left behind by replicas that are
// no longer wanted after the replica count was reduced: the tailnet device,
// the state Secret, the config Secret and the static endpoints NodePort
// Service of every node whose ordinal is at or above the desired replica
// count. Objects that are already gone are not treated as errors.
//
// In addition, when a ProxyClass is given but has no StaticEndpoints config,
// all managed Services labelled for this ProxyGroup are deleted.
func (r *ProxyGroupReconciler) cleanupDanglingResources(ctx context.Context, pg *tsapi.ProxyGroup, pc *tsapi.ProxyClass) error {
	logger := r.logger(pg.Name)
	nodes, err := r.getNodeMetadata(ctx, pg)
	if err != nil {
		return err
	}

	for _, m := range nodes {
		if m.ordinal < int(pgReplicas(pg)) {
			// Replica is still wanted.
			continue
		}

		// Dangling resource, delete the config + state Secrets, as well as
		// deleting the device from the tailnet.
		if err := r.deleteTailnetDevice(ctx, m.tsID, logger); err != nil {
			return err
		}

		err := r.Delete(ctx, m.stateSecret)
		if err != nil && !apierrors.IsNotFound(err) {
			return fmt.Errorf("error deleting state Secret %q: %w", m.stateSecret.Name, err)
		}

		cfgSecret := m.stateSecret.DeepCopy()
		cfgSecret.Name += "-config"
		err = r.Delete(ctx, cfgSecret)
		if err != nil && !apierrors.IsNotFound(err) {
			return fmt.Errorf("error deleting config Secret %q: %w", cfgSecret.Name, err)
		}

		// NOTE(ChaosInTheCRD): we shouldn't need to get the service first, checking for a not found error should be enough
		nodePortSvc := &corev1.Service{}
		nodePortSvc.Name = fmt.Sprintf("%s-nodeport", m.stateSecret.Name)
		nodePortSvc.Namespace = m.stateSecret.Namespace
		err = r.Delete(ctx, nodePortSvc)
		if err != nil && !apierrors.IsNotFound(err) {
			return fmt.Errorf("error deleting static endpoints Kubernetes Service %q: %w", nodePortSvc.Name, err)
		}
	}

	// If the ProxyClass has its StaticEndpoints config removed, we want to remove all of the NodePort Services
	if pc == nil || pc.Spec.StaticEndpoints != nil {
		return nil
	}
	selector := client.MatchingLabels(map[string]string{
		kubetypes.LabelManaged: "true",
		LabelParentType:        proxyTypeProxyGroup,
		LabelParentName:        pg.Name,
	})
	if err := r.DeleteAllOf(ctx, &corev1.Service{}, client.InNamespace(r.tsNamespace), selector); err != nil {
		return fmt.Errorf("error deleting Kubernetes Services for static endpoints: %w", err)
	}

	return nil
}

// maybeCleanup removes everything that owner references will not clean up
// when a ProxyGroup is deleted: the tailnet devices of its nodes and its
// metrics resources. The remaining Kubernetes resources linked to the
// ProxyGroup live in the same namespace and are garbage collected through
// their owner references. On success the ProxyGroup is also dropped from the
// gauge bookkeeping and (true, nil) is returned.
func (r *ProxyGroupReconciler) maybeCleanup(ctx context.Context, pg *tsapi.ProxyGroup) (bool, error) {
	logger := r.logger(pg.Name)

	nodes, err := r.getNodeMetadata(ctx, pg)
	if err != nil {
		return false, err
	}
	for i := range nodes {
		if err := r.deleteTailnetDevice(ctx, nodes[i].tsID, logger); err != nil {
			return false, err
		}
	}

	metrics := &metricsOpts{
		proxyLabels: pgLabels(pg.Name, nil),
		tsNamespace: r.tsNamespace,
		proxyType:   "proxygroup",
	}
	err = maybeCleanupMetricsResources(ctx, metrics, r.Client)
	if err != nil {
		return false, fmt.Errorf("error cleaning up metrics resources: %w", err)
	}

	logger.Infof("cleaned up ProxyGroup resources")

	r.mu.Lock()
	r.ensureRemovedFromGaugeForProxyGroup(pg)
	r.mu.Unlock()

	return true, nil
}

// deleteTailnetDevice asks control to delete the device with the given
// stable node ID. A "not found" response from the API is treated as success,
// since it most likely means the device was already deleted. Any other
// failure is returned wrapped.
func (r *ProxyGroupReconciler) deleteTailnetDevice(ctx context.Context, id tailcfg.StableNodeID, logger *zap.SugaredLogger) error {
	logger.Debugf("deleting device %s from control", string(id))

	err := r.tsClient.DeleteDevice(ctx, string(id))
	if err == nil {
		logger.Debugf("device %s deleted from control", string(id))
		return nil
	}

	errResp := &tailscale.ErrResponse{}
	if !errors.As(err, errResp) || errResp.Status != http.StatusNotFound {
		return fmt.Errorf("error deleting device: %w", err)
	}

	logger.Debugf("device %s not found, likely because it has already been deleted from control", string(id))
	return nil
}

// ensureConfigSecretsCreated creates or updates the tailscaled config Secret
// of every replica of pg and returns the static endpoints chosen per replica,
// keyed by the replica's NodePort Service name.
//
// For each replica it:
//   - loads the existing config Secret, if any;
//   - mints a new auth key when no config Secret exists yet, or otherwise
//     carries over the existing key when shouldRetainAuthKey says so for the
//     replica's state Secret;
//   - resolves static endpoints when svcToNodePorts is non-empty;
//   - preserves any AdvertiseServices value already present in the Secret;
//   - writes the rendered config(s) into the Secret, updating it only when
//     the result differs from what is stored.
//
// Static endpoint lookup failures are wrapped with %w, so callers can detect
// *FindStaticEndpointErr with errors.As.
func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass, svcToNodePorts map[string]uint16) (endpoints map[string][]netip.AddrPort, err error) {
	logger := r.logger(pg.Name)
	endpoints = make(map[string][]netip.AddrPort, pgReplicas(pg))
	for i := range pgReplicas(pg) {
		cfgSecret := &corev1.Secret{
			ObjectMeta: metav1.ObjectMeta{
				Name:            pgConfigSecretName(pg.Name, i),
				Namespace:       r.tsNamespace,
				Labels:          pgSecretLabels(pg.Name, "config"),
				OwnerReferences: pgOwnerReference(pg),
			},
		}

		// On success Get overwrites cfgSecret with the stored object; a copy
		// is kept so the later comparison can tell whether anything changed.
		var existingCfgSecret *corev1.Secret // unmodified copy of secret
		if err := r.Get(ctx, client.ObjectKeyFromObject(cfgSecret), cfgSecret); err == nil {
			logger.Debugf("Secret %s/%s already exists", cfgSecret.GetNamespace(), cfgSecret.GetName())
			existingCfgSecret = cfgSecret.DeepCopy()
		} else if !apierrors.IsNotFound(err) {
			return nil, err
		}

		// No config Secret yet means a brand new replica: mint an auth key,
		// using the ProxyGroup's tags or the operator defaults.
		var authKey *string
		if existingCfgSecret == nil {
			logger.Debugf("Creating authkey for new ProxyGroup proxy")
			tags := pg.Spec.Tags.Stringify()
			if len(tags) == 0 {
				tags = r.defaultTags
			}
			key, err := newAuthKey(ctx, r.tsClient, tags)
			if err != nil {
				return nil, err
			}
			authKey = &key
		}

		if authKey == nil {
			// Get state Secret to check if it's already authed.
			stateSecret := &corev1.Secret{
				ObjectMeta: metav1.ObjectMeta{
					Name:      pgStateSecretName(pg.Name, i),
					Namespace: r.tsNamespace,
				},
			}
			// A missing state Secret is tolerated; stateSecret then only
			// carries the name and namespace set above.
			if err := r.Get(ctx, client.ObjectKeyFromObject(stateSecret), stateSecret); err != nil && !apierrors.IsNotFound(err) {
				return nil, err
			}

			if shouldRetainAuthKey(stateSecret) && existingCfgSecret != nil {
				authKey, err = authKeyFromSecret(existingCfgSecret)
				if err != nil {
					return nil, fmt.Errorf("error retrieving auth key from existing config Secret: %w", err)
				}
			}
		}

		// Static endpoints are only resolved when NodePorts were allocated;
		// in that case every replica must have an entry.
		replicaName := pgNodePortServiceName(pg.Name, i)
		if len(svcToNodePorts) > 0 {
			port, ok := svcToNodePorts[replicaName]
			if !ok {
				return nil, fmt.Errorf("could not find configured NodePort for ProxyGroup replica %q", replicaName)
			}

			endpoints[replicaName], err = r.findStaticEndpoints(ctx, existingCfgSecret, proxyClass, port, logger)
			if err != nil {
				return nil, fmt.Errorf("could not find static endpoints for replica %q: %w", replicaName, err)
			}
		}

		// AdvertiseServices config is set by ingress-pg-reconciler, so make sure we
		// don't overwrite it if already set.
		existingAdvertiseServices, err := extractAdvertiseServicesConfig(existingCfgSecret)
		if err != nil {
			return nil, err
		}

		configs, err := pgTailscaledConfig(pg, proxyClass, i, authKey, endpoints[replicaName], existingAdvertiseServices)
		if err != nil {
			return nil, fmt.Errorf("error creating tailscaled config: %w", err)
		}

		// One Secret data key per capability version returned above.
		for cap, cfg := range configs {
			cfgJSON, err := json.Marshal(cfg)
			if err != nil {
				return nil, fmt.Errorf("error marshalling tailscaled config: %w", err)
			}
			mak.Set(&cfgSecret.Data, tsoperator.TailscaledConfigFileName(cap), cfgJSON)
		}

		if existingCfgSecret != nil {
			// Skip the API call when the rendered Secret is unchanged.
			if !apiequality.Semantic.DeepEqual(existingCfgSecret, cfgSecret) {
				logger.Debugf("Updating the existing ProxyGroup config Secret %s", cfgSecret.Name)
				if err := r.Update(ctx, cfgSecret); err != nil {
					return nil, err
				}
			}
		} else {
			logger.Debugf("Creating a new config Secret %s for the ProxyGroup", cfgSecret.Name)
			if err := r.Create(ctx, cfgSecret); err != nil {
				return nil, err
			}
		}
	}

	return endpoints, nil
}

// FindStaticEndpointErr is the error type produced by findStaticEndpoints
// when no usable static endpoint could be derived from the cluster's Nodes.
// Callers use errors.As on it to tell this case apart from other errors.
type FindStaticEndpointErr struct {
	msg string // human-readable description of the failure
}

// Compile-time check that *FindStaticEndpointErr satisfies error.
var _ error = (*FindStaticEndpointErr)(nil)

// Error returns the stored message.
func (e *FindStaticEndpointErr) Error() string {
	return e.msg
}

// findStaticEndpoints returns up to staticEndpointsMaxAddrs `netip.AddrPort` entries, derived from the ExternalIPs of
// Nodes that match the `proxyClass`'s selector within the StaticEndpoints configuration. The port is set to the
// replica's NodePort Service Port.
//
// Addresses that are already listed in the replica's existing tailscaled
// config (read from existingCfgSecret, if given) are preferred over new ones,
// so a replica keeps its endpoints for as long as they remain valid.
//
// A *FindStaticEndpointErr is returned when the selector matches no Nodes or
// when none of the matched Nodes has a parseable ExternalIP address. A
// failure to list Nodes is returned as a plain wrapped error.
func (r *ProxyGroupReconciler) findStaticEndpoints(ctx context.Context, existingCfgSecret *corev1.Secret, proxyClass *tsapi.ProxyClass, port uint16, logger *zap.SugaredLogger) ([]netip.AddrPort, error) {
	// Best-effort read of the endpoints currently in use; failures here are
	// only logged because fresh endpoints can still be chosen.
	var currAddrs []netip.AddrPort
	if existingCfgSecret != nil {
		oldConfB := existingCfgSecret.Data[tsoperator.TailscaledConfigFileName(pgMinCapabilityVersion)]
		if len(oldConfB) > 0 {
			var oldConf ipn.ConfigVAlpha
			if err := json.Unmarshal(oldConfB, &oldConf); err == nil {
				currAddrs = oldConf.StaticEndpoints
			} else {
				logger.Debugf("failed to unmarshal tailscaled config from secret %q: %v", existingCfgSecret.Name, err)
			}
		} else {
			logger.Debugf("failed to get tailscaled config from secret %q: empty data", existingCfgSecret.Name)
		}
	}

	nodes := new(corev1.NodeList)
	selectors := client.MatchingLabels(proxyClass.Spec.StaticEndpoints.NodePort.Selector)

	err := r.List(ctx, nodes, selectors)
	if err != nil {
		return nil, fmt.Errorf("failed to list nodes: %w", err)
	}

	if len(nodes.Items) == 0 {
		return nil, &FindStaticEndpointErr{msg: fmt.Sprintf("failed to match nodes to configured Selectors on `spec.staticEndpoints.nodePort.selectors` field for ProxyClass %q", proxyClass.Name)}
	}

	// NOTE(ChaosInTheCRD): Setting a hard limit of two static endpoints.
	endpoints := make([]netip.AddrPort, 0, staticEndpointsMaxAddrs)
	var newAddrs []netip.AddrPort
nodeLoop:
	for _, n := range nodes.Items {
		for _, a := range n.Status.Addresses {
			if a.Type != corev1.NodeExternalIP {
				continue
			}
			addr := getStaticEndpointAddress(&a, port)
			if addr == nil {
				logger.Debugf("failed to parse %q address on node %q: %q", corev1.NodeExternalIP, n.Name, a.Address)
				continue
			}

			// we want to add the currently used IPs first before
			// adding new ones.
			if !slices.Contains(currAddrs, *addr) {
				newAddrs = append(newAddrs, *addr)
				continue
			}
			endpoints = append(endpoints, *addr)
			if len(endpoints) == staticEndpointsMaxAddrs {
				// Stop scanning all Nodes, not just this Node's addresses,
				// so the limit can never be exceeded.
				break nodeLoop
			}
		}
	}

	// if the endpoints limit hasn't been reached, we
	// can start adding newIPs.
	for _, a := range newAddrs {
		if len(endpoints) == staticEndpointsMaxAddrs {
			break
		}
		endpoints = append(endpoints, a)
	}

	if len(endpoints) == 0 {
		return nil, &FindStaticEndpointErr{msg: fmt.Sprintf("failed to find any `status.addresses` of type %q on nodes using configured Selectors on `spec.staticEndpoints.nodePort.selectors` for ProxyClass %q", corev1.NodeExternalIP, proxyClass.Name)}
	}

	return endpoints, nil
}

// getStaticEndpointAddress parses the IP in a and pairs it with port. It
// returns nil when the address is not a valid IP.
func getStaticEndpointAddress(a *corev1.NodeAddress, port uint16) *netip.AddrPort {
	ip, err := netip.ParseAddr(a.Address)
	if err != nil {
		return nil
	}

	addrPort := netip.AddrPortFrom(ip, port)
	return &addrPort
}

// ensureAddedToGaugeForProxyGroup records pg's UID in the set matching its
// type (egress or ingress) and then republishes both gauges from the sizes of
// the two sets. ProxyGroups of any other type only trigger the republish.
// r.mu must be held.
func (r *ProxyGroupReconciler) ensureAddedToGaugeForProxyGroup(pg *tsapi.ProxyGroup) {
	if pgType := pg.Spec.Type; pgType == tsapi.ProxyGroupTypeEgress {
		r.egressProxyGroups.Add(pg.UID)
	} else if pgType == tsapi.ProxyGroupTypeIngress {
		r.ingressProxyGroups.Add(pg.UID)
	}

	egressCount := int64(r.egressProxyGroups.Len())
	gaugeEgressProxyGroupResources.Set(egressCount)
	ingressCount := int64(r.ingressProxyGroups.Len())
	gaugeIngressProxyGroupResources.Set(ingressCount)
}

// ensureRemovedFromGaugeForProxyGroup drops pg's UID from the set matching
// its type (egress or ingress) and then republishes both gauges from the
// sizes of the two sets. ProxyGroups of any other type only trigger the
// republish. r.mu must be held.
func (r *ProxyGroupReconciler) ensureRemovedFromGaugeForProxyGroup(pg *tsapi.ProxyGroup) {
	if pgType := pg.Spec.Type; pgType == tsapi.ProxyGroupTypeEgress {
		r.egressProxyGroups.Remove(pg.UID)
	} else if pgType == tsapi.ProxyGroupTypeIngress {
		r.ingressProxyGroups.Remove(pg.UID)
	}

	egressCount := int64(r.egressProxyGroups.Len())
	gaugeEgressProxyGroupResources.Set(egressCount)
	ingressCount := int64(r.ingressProxyGroups.Len())
	gaugeIngressProxyGroupResources.Set(ingressCount)
}

// pgTailscaledConfig builds the tailscaled configuration for the replica with
// index idx of the given ProxyGroup.
//
// The hostname is "<prefix>-<idx>", where the prefix is pg.Spec.HostnamePrefix
// if set and pg.Name otherwise. Routes are accepted only if shouldAcceptRoutes
// reports true for pc. authKey and oldAdvertiseServices are copied into the
// config as-is, and staticEndpoints are only set when non-empty.
//
// The result contains a single entry keyed by pgMinCapabilityVersion. The
// returned error is always nil.
func pgTailscaledConfig(pg *tsapi.ProxyGroup, pc *tsapi.ProxyClass, idx int32, authKey *string, staticEndpoints []netip.AddrPort, oldAdvertiseServices []string) (tailscaledConfigs, error) {
	// A configured hostname prefix takes precedence over the ProxyGroup name.
	hostname := fmt.Sprintf("%s-%d", pg.Name, idx)
	if prefix := pg.Spec.HostnamePrefix; prefix != "" {
		hostname = fmt.Sprintf("%s-%d", prefix, idx)
	}

	cfg := ipn.ConfigVAlpha{
		Version:           "alpha0",
		AcceptDNS:         "false",
		AcceptRoutes:      "false", // AcceptRoutes defaults to true
		Locked:            "false",
		Hostname:          ptr.To(hostname),
		AdvertiseServices: oldAdvertiseServices,
		AuthKey:           authKey,
	}
	if shouldAcceptRoutes(pc) {
		cfg.AcceptRoutes = "true"
	}
	if len(staticEndpoints) > 0 {
		cfg.StaticEndpoints = staticEndpoints
	}

	return map[tailcfg.CapabilityVersion]ipn.ConfigVAlpha{
		pgMinCapabilityVersion: cfg,
	}, nil
}

// extractAdvertiseServicesConfig returns the AdvertiseServices value from the
// config that latestConfigFromSecret reads out of cfgSecret. It returns nil
// with no error when cfgSecret is nil or when no config is found, and passes
// through any error from latestConfigFromSecret.
func extractAdvertiseServicesConfig(cfgSecret *corev1.Secret) ([]string, error) {
	if cfgSecret == nil {
		return nil, nil
	}

	switch latest, err := latestConfigFromSecret(cfgSecret); {
	case err != nil:
		return nil, err
	case latest == nil:
		return nil, nil
	default:
		return latest.AdvertiseServices, nil
	}
}

// validate is the validation hook for a ProxyGroup. It currently performs no
// checks: the argument is ignored and nil is always returned.
func (r *ProxyGroupReconciler) validate(_ *tsapi.ProxyGroup) error {
	return nil
}

// getNodeMetadata collects metadata for the pods owned by this ProxyGroup by
// reading their state Secrets. The number of items returned may differ from
// the replica count in the ProxyGroup spec, e.g. while it is being scaled up
// or down, or when some pods have not written state yet.
//
// State Secrets are listed in the operator namespace by label, and each one
// must be named "<ProxyGroup name>-<ordinal>"; any other name is an error.
// Secrets for which getDevicePrefs reports no prefs are skipped. The Pod with
// the same name is looked up to fill in podUID, which is left empty if the Pod
// is not found.
func (r *ProxyGroupReconciler) getNodeMetadata(ctx context.Context, pg *tsapi.ProxyGroup) (metadata []nodeMetadata, _ error) {
	// List all state secrets owned by this ProxyGroup.
	var stateSecrets corev1.SecretList
	listOpts := []client.ListOption{
		client.InNamespace(r.tsNamespace),
		client.MatchingLabels(pgSecretLabels(pg.Name, "state")),
	}
	if err := r.List(ctx, &stateSecrets, listOpts...); err != nil {
		return nil, fmt.Errorf("failed to list state Secrets: %w", err)
	}

	for _, sec := range stateSecrets.Items {
		var ordinal int
		if _, err := fmt.Sscanf(sec.Name, pg.Name+"-%d", &ordinal); err != nil {
			return nil, fmt.Errorf("unexpected secret %s was labelled as owned by the ProxyGroup %s: %w", sec.Name, pg.Name, err)
		}

		prefs, found, err := getDevicePrefs(&sec)
		if err != nil {
			return nil, err
		}
		if !found {
			continue
		}

		node := nodeMetadata{
			ordinal:     ordinal,
			stateSecret: &sec,
			tsID:        prefs.Config.NodeID,
			dnsName:     prefs.Config.UserProfile.LoginName,
		}

		// A missing Pod is tolerated and leaves podUID empty; any other lookup
		// failure aborts.
		var pod corev1.Pod
		podKey := client.ObjectKey{Namespace: r.tsNamespace, Name: fmt.Sprintf("%s-%d", pg.Name, ordinal)}
		switch err := r.Get(ctx, podKey, &pod); {
		case err == nil:
			node.podUID = string(pod.UID)
		case !apierrors.IsNotFound(err):
			return nil, err
		}

		metadata = append(metadata, node)
	}

	return metadata, nil
}

// getDeviceInfo builds a tsapi.TailnetDevice for each entry returned by
// getNodeMetadata whose state Secret carries the UID of the current Pod.
//
// For each such entry the tailnet IPs are decoded from the Secret's device IPs
// key (a JSON list of strings), and the hostname is the part of the Secret's
// device FQDN before the first ".", left empty if the FQDN contains no ".".
// Any non-empty entry in staticEndpoints under that hostname is rendered to
// strings and attached to the device.
func (r *ProxyGroupReconciler) getDeviceInfo(ctx context.Context, staticEndpoints map[string][]netip.AddrPort, pg *tsapi.ProxyGroup) (devices []tsapi.TailnetDevice, _ error) {
	nodes, err := r.getNodeMetadata(ctx, pg)
	if err != nil {
		return nil, err
	}

	for _, node := range nodes {
		state := node.stateSecret.Data

		// Skip Secrets the current Pod has not yet stamped with its UID, as
		// their contents may be stale.
		if recordedUID := string(state[kubetypes.KeyPodUID]); !strings.EqualFold(recordedUID, node.podUID) {
			continue
		}

		var dev tsapi.TailnetDevice

		if rawIPs := state[kubetypes.KeyDeviceIPs]; len(rawIPs) > 0 {
			tailnetIPs := []string{}
			if err := json.Unmarshal(rawIPs, &tailnetIPs); err != nil {
				return nil, fmt.Errorf("failed to extract device IPs from state Secret %q: %w", node.stateSecret.Name, err)
			}
			dev.TailnetIPs = tailnetIPs
		}

		fqdn := string(state[kubetypes.KeyDeviceFQDN])
		if host, _, hasDot := strings.Cut(fqdn, "."); hasDot {
			dev.Hostname = host
		}

		if addrs := staticEndpoints[dev.Hostname]; len(addrs) > 0 {
			rendered := make([]string, len(addrs))
			for i, ap := range addrs {
				rendered[i] = ap.String()
			}
			dev.StaticEndpoints = rendered
		}

		devices = append(devices, dev)
	}

	return devices, nil
}

// nodeMetadata is the per-replica information assembled by getNodeMetadata
// from a ProxyGroup state Secret and, if it exists, the Pod of the same name:
//   - ordinal is the number parsed from the "<ProxyGroup name>-<ordinal>"
//     Secret name.
//   - stateSecret is the state Secret the metadata was read from.
//   - tsID and dnsName are copied from the device prefs stored in that Secret
//     (prefs.Config.NodeID and prefs.Config.UserProfile.LoginName
//     respectively).
type nodeMetadata struct {
	ordinal     int
	stateSecret *corev1.Secret
	// podUID is the UID of the current Pod or empty if the Pod does not exist.
	podUID  string
	tsID    tailcfg.StableNodeID
	dnsName string
}

// setStatusReady records a Kubernetes event on pg carrying reason and msg, and
// sets pg's ProxyGroupReady condition to status with the same reason and
// message, using pg.Generation and the reconciler's clock.
//
// msg is passed verbatim via Event rather than Eventf, so any '%' characters
// it contains are not interpreted as formatting verbs.
//
// NOTE(review): the event is always emitted with type Warning, regardless of
// status; confirm whether that is intended.
func (r *ProxyGroupReconciler) setStatusReady(pg *tsapi.ProxyGroup, status metav1.ConditionStatus, reason string, msg string, logger *zap.SugaredLogger) {
	r.recorder.Event(pg, corev1.EventTypeWarning, reason, msg)
	tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, status, reason, msg, pg.Generation, r.clock, logger)
}