tailscale/cmd/k8s-operator/egress-services-readiness.go
Irbe Krumina 3f39211f98
cmd/k8s-operator: check that cluster traffic is routed to egress ProxyGroup Pod before marking it as ready (#14792)
This change builds on top of #14436 to ensure minimum downtime during egress ProxyGroup update rollouts:

- adds a readiness gate for ProxyGroup replicas that prevents kubelet from marking
the replica Pod as ready before a corresponding readiness condition has been added
to the Pod

- adds a reconciler that reconciles egress ProxyGroup Pods and, for each that is not ready,
if cluster traffic for relevant egress endpoints is routed via this Pod- if so add the
readiness condition to allow kubelet to mark the Pod as ready.

During the sequenced StatefulSet update rollouts kubelet does not restart
a Pod before the previous replica has been updated and marked as ready, so
ensuring that a replica is not marked as ready allows to avoid a temporary
post-update situation where all replicas have been restarted, but none of the
new ones are yet set up as an endpoint for the egress service, so cluster traffic is dropped.

Updates tailscale/tailscale#14326

Signed-off-by: Irbe Krumina <irbe@tailscale.com>
2025-01-30 08:47:45 +00:00

181 lines
5.8 KiB
Go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
//go:build !plan9
package main
import (
"context"
"errors"
"fmt"
"strings"
"go.uber.org/zap"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
discoveryv1 "k8s.io/api/discovery/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
tsoperator "tailscale.com/k8s-operator"
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
"tailscale.com/tstime"
)
const (
reasonReadinessCheckFailed = "ReadinessCheckFailed"
reasonClusterResourcesNotReady = "ClusterResourcesNotReady"
reasonNoProxies = "NoProxiesConfigured"
reasonNotReady = "NotReadyToRouteTraffic"
reasonReady = "ReadyToRouteTraffic"
reasonPartiallyReady = "PartiallyReadyToRouteTraffic"
msgReadyToRouteTemplate = "%d out of %d replicas are ready to route traffic"
)
type egressSvcsReadinessReconciler struct {
client.Client
logger *zap.SugaredLogger
clock tstime.Clock
tsNamespace string
}
// Reconcile reconciles an ExternalName Service that defines a tailnet target to be exposed on a ProxyGroup and sets the
// EgressSvcReady condition on it. The condition gets set to true if at least one of the proxies is currently ready to
// route traffic to the target. It compares proxy Pod IPs with the endpoints set on the EndpointSlice for the egress
// service to determine how many replicas are currently able to route traffic.
func (esrr *egressSvcsReadinessReconciler) Reconcile(ctx context.Context, req reconcile.Request) (res reconcile.Result, err error) {
l := esrr.logger.With("Service", req.NamespacedName)
l.Debugf("starting reconcile")
defer l.Debugf("reconcile finished")
svc := new(corev1.Service)
if err = esrr.Get(ctx, req.NamespacedName, svc); apierrors.IsNotFound(err) {
l.Debugf("Service not found")
return res, nil
} else if err != nil {
return res, fmt.Errorf("failed to get Service: %w", err)
}
var (
reason, msg string
st metav1.ConditionStatus = metav1.ConditionUnknown
)
oldStatus := svc.Status.DeepCopy()
defer func() {
tsoperator.SetServiceCondition(svc, tsapi.EgressSvcReady, st, reason, msg, esrr.clock, l)
if !apiequality.Semantic.DeepEqual(oldStatus, &svc.Status) {
err = errors.Join(err, esrr.Status().Update(ctx, svc))
}
}()
crl := egressSvcChildResourceLabels(svc)
eps, err := getSingleObject[discoveryv1.EndpointSlice](ctx, esrr.Client, esrr.tsNamespace, crl)
if err != nil {
err = fmt.Errorf("error getting EndpointSlice: %w", err)
reason = reasonReadinessCheckFailed
msg = err.Error()
return res, err
}
if eps == nil {
l.Infof("EndpointSlice for Service does not yet exist, waiting...")
reason, msg = reasonClusterResourcesNotReady, reasonClusterResourcesNotReady
st = metav1.ConditionFalse
return res, nil
}
pg := &tsapi.ProxyGroup{
ObjectMeta: metav1.ObjectMeta{
Name: svc.Annotations[AnnotationProxyGroup],
},
}
err = esrr.Get(ctx, client.ObjectKeyFromObject(pg), pg)
if apierrors.IsNotFound(err) {
l.Infof("ProxyGroup for Service does not exist, waiting...")
reason, msg = reasonClusterResourcesNotReady, reasonClusterResourcesNotReady
st = metav1.ConditionFalse
return res, nil
}
if err != nil {
err = fmt.Errorf("error retrieving ProxyGroup: %w", err)
reason = reasonReadinessCheckFailed
msg = err.Error()
return res, err
}
if !tsoperator.ProxyGroupIsReady(pg) {
l.Infof("ProxyGroup for Service is not ready, waiting...")
reason, msg = reasonClusterResourcesNotReady, reasonClusterResourcesNotReady
st = metav1.ConditionFalse
return res, nil
}
replicas := pgReplicas(pg)
if replicas == 0 {
l.Infof("ProxyGroup replicas set to 0")
reason, msg = reasonNoProxies, reasonNoProxies
st = metav1.ConditionFalse
return res, nil
}
podLabels := pgLabels(pg.Name, nil)
var readyReplicas int32
for i := range replicas {
podLabels[appsv1.PodIndexLabel] = fmt.Sprintf("%d", i)
pod, err := getSingleObject[corev1.Pod](ctx, esrr.Client, esrr.tsNamespace, podLabels)
if err != nil {
err = fmt.Errorf("error retrieving ProxyGroup Pod: %w", err)
reason = reasonReadinessCheckFailed
msg = err.Error()
return res, err
}
if pod == nil {
l.Warnf("[unexpected] ProxyGroup is ready, but replica %d was not found", i)
reason, msg = reasonClusterResourcesNotReady, reasonClusterResourcesNotReady
return res, nil
}
l.Debugf("looking at Pod with IPs %v", pod.Status.PodIPs)
ready := false
for _, ep := range eps.Endpoints {
l.Debugf("looking at endpoint with addresses %v", ep.Addresses)
if endpointReadyForPod(&ep, pod, l) {
l.Debugf("endpoint is ready for Pod")
ready = true
break
}
}
if ready {
readyReplicas++
}
}
msg = fmt.Sprintf(msgReadyToRouteTemplate, readyReplicas, replicas)
if readyReplicas == 0 {
reason = reasonNotReady
st = metav1.ConditionFalse
return res, nil
}
st = metav1.ConditionTrue
if readyReplicas < replicas {
reason = reasonPartiallyReady
} else {
reason = reasonReady
}
return res, nil
}
// endpointReadyForPod returns true if the endpoint is for the Pod's IPv4 address and is ready to serve traffic.
// Endpoint must not be nil.
func endpointReadyForPod(ep *discoveryv1.Endpoint, pod *corev1.Pod, l *zap.SugaredLogger) bool {
podIP, err := podIPv4(pod)
if err != nil {
l.Warnf("[unexpected] error retrieving Pod's IPv4 address: %v", err)
return false
}
// Currently we only ever set a single address on and Endpoint and nothing else is meant to modify this.
if len(ep.Addresses) != 1 {
return false
}
return strings.EqualFold(ep.Addresses[0], podIP) &&
*ep.Conditions.Ready &&
*ep.Conditions.Serving &&
!*ep.Conditions.Terminating
}