tailscale/cmd/k8s-operator/egress-pod-readiness.go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause

//go:build !plan9

package main

import (
	"context"
	"errors"
	"fmt"
	"net/http"
	"slices"
	"strings"
	"sync/atomic"
	"time"

	"go.uber.org/zap"
	xslices "golang.org/x/exp/slices"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
	"tailscale.com/kube/kubetypes"
	"tailscale.com/logtail/backoff"
	"tailscale.com/tstime"
	"tailscale.com/util/httpm"
)
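
// tsEgressReadinessGate is the Pod condition type used as a readiness gate on egress ProxyGroup Pods; kubelet will
// not mark a Pod as ready until this condition has been set.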
const tsEgressReadinessGate = "tailscale.com/egress-services"

// egressPodsReconciler is responsible for setting the tailscale.com/egress-services condition on egress ProxyGroup
// Pods. The condition is used as a readiness gate for the Pod, meaning that kubelet will not mark the Pod as ready
// before the condition is set. The ProxyGroup StatefulSet updates are rolled out in such a way that no Pod is
// restarted before the previous Pod is marked as ready, so ensuring that a Pod is not marked as ready while it is
// unable to route traffic for egress services prevents downtime during restarts, which would otherwise occur when
// every Pod has been recreated but not yet added to the Service endpoints.
// https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-readiness-gate
type egressPodsReconciler struct {
	client.Client
	logger      *zap.SugaredLogger
	tsNamespace string
	clock       tstime.Clock
	httpClient  doer          // HTTP client that can be set to a mock client in tests
	maxBackoff  time.Duration // max backoff period between health check calls
}

// Reconcile reconciles egress ProxyGroup Pods on changes to those Pods and to the ProxyGroup EndpointSlices. It
// ensures that each Pod that is ready to route traffic to all egress services for the ProxyGroup has the
// tailscale.com/egress-services condition set, so that kubelet will mark the Pod as ready.
//
// For the Pod to be ready to route traffic to an egress service, kube-proxy needs to have set up the Pod's IP as an
// endpoint for the ClusterIP Service corresponding to the egress service.
//
// Note that the endpoints for the ClusterIP Service are configured by the operator itself using custom
// EndpointSlices (egress-eps-reconciler), so the routing is not blocked on the Pod's readiness.
//
// Each egress service has a corresponding ClusterIP Service that exposes all user-configured tailnet ports, as well
// as a health check port for the proxy.
//
// The reconciler calls the health check endpoint of each Service up to N times, where N is the number of replicas
// for the ProxyGroup x 3, and checks whether a healthy response is received from the Pod being reconciled.
//
// The health check response contains a header with the Pod's IP address; this is used to determine whether the
// response came from this Pod.
//
// If the Pod does not appear to be serving the health check endpoint (pre-v1.80 proxies), the reconciler just sets
// the readiness condition for backwards compatibility reasons.
func (er *egressPodsReconciler) Reconcile(ctx context.Context, req reconcile.Request) (res reconcile.Result, err error) {
	l := er.logger.With("Pod", req.NamespacedName)
	l.Debugf("starting reconcile")
	defer l.Debugf("reconcile finished")

	pod := new(corev1.Pod)
	err = er.Get(ctx, req.NamespacedName, pod)
	if apierrors.IsNotFound(err) {
		return reconcile.Result{}, nil
	}
	if err != nil {
		return reconcile.Result{}, fmt.Errorf("failed to get Pod: %w", err)
	}
	if !pod.DeletionTimestamp.IsZero() {
		l.Debugf("Pod is being deleted, do nothing")
		return res, nil
	}
	if pod.Labels[LabelParentType] != proxyTypeProxyGroup {
		l.Infof("[unexpected] reconciler called for a Pod that is not a ProxyGroup Pod")
		return res, nil
	}

	// If the Pod does not have the readiness gate set, there is no need to add the readiness condition. In
	// practice this will happen if the user has configured a custom TS_LOCAL_ADDR_PORT, thus disabling graceful
	// failover.
	if !slices.ContainsFunc(pod.Spec.ReadinessGates, func(r corev1.PodReadinessGate) bool {
		return r.ConditionType == tsEgressReadinessGate
	}) {
		l.Debug("Pod does not have egress readiness gate set, skipping")
		return res, nil
	}

	proxyGroupName := pod.Labels[LabelParentName]
	pg := new(tsapi.ProxyGroup)
	if err := er.Get(ctx, types.NamespacedName{Name: proxyGroupName}, pg); err != nil {
		return res, fmt.Errorf("error getting ProxyGroup %q: %w", proxyGroupName, err)
	}
	if pg.Spec.Type != typeEgress {
		l.Infof("[unexpected] reconciler called for %q ProxyGroup Pod", pg.Spec.Type)
		return res, nil
	}

	// Get all ClusterIP Services for all egress targets exposed to the cluster via this ProxyGroup.
	lbls := map[string]string{
		LabelManaged:    "true",
		labelProxyGroup: proxyGroupName,
		labelSvcType:    typeEgress,
	}
	svcs := &corev1.ServiceList{}
	if err := er.List(ctx, svcs, client.InNamespace(er.tsNamespace), client.MatchingLabels(lbls)); err != nil {
		return res, fmt.Errorf("error listing ClusterIP Services: %w", err)
	}
	idx := xslices.IndexFunc(pod.Status.Conditions, func(c corev1.PodCondition) bool {
		return c.Type == tsEgressReadinessGate
	})
	if idx != -1 {
		l.Debugf("Pod is already ready, do nothing")
		return res, nil
	}
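
	// Probe each egress Service's health check endpoint in its own goroutine. Each goroutine sends exactly one
	// value on errChan, so the collection loop below can wait for all of them to finish.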
	var routesMissing atomic.Bool
	errChan := make(chan error, len(svcs.Items))
	for _, svc := range svcs.Items {
		s := svc
		go func() {
			ll := l.With("service_name", s.Name)
			d := retrieveClusterDomain(er.tsNamespace, ll)
			healthCheckAddr := healthCheckForSvc(&s, d)
			if healthCheckAddr == "" {
				ll.Debugf("ClusterIP Service does not expose a health check endpoint, unable to verify if routing is set up")
				errChan <- nil
				return
			}

			var routesSetup bool
			bo := backoff.NewBackoff(s.Name, ll.Infof, er.maxBackoff)
			for range numCalls(pgReplicas(pg)) {
				if ctx.Err() != nil {
					errChan <- nil
					return
				}
				state, err := er.lookupPodRouteViaSvc(ctx, pod, healthCheckAddr, ll)
				if err != nil {
					errChan <- fmt.Errorf("error validating if routing has been set up for Pod: %w", err)
					return
				}
				if state == healthy || state == cannotVerify {
					routesSetup = true
					break
				}
				if state == unreachable || state == unhealthy || state == podNotReady {
					bo.BackOff(ctx, errors.New("backoff"))
				}
			}
			if !routesSetup {
				ll.Debugf("Pod is not yet configured as Service endpoint")
				routesMissing.Store(true)
			}
			errChan <- nil
		}()
	}
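	// Wait for all health check goroutines to finish and collect any errors they hit.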
	for range len(svcs.Items) {
		e := <-errChan
		err = errors.Join(err, e)
	}
	if err != nil {
		return res, fmt.Errorf("error verifying connectivity: %w", err)
	}
	if rm := routesMissing.Load(); rm {
		l.Info("Pod is not yet added as an endpoint for all egress targets, waiting...")
		return reconcile.Result{RequeueAfter: shortRequeue}, nil
	}
	if err := er.setPodReady(ctx, pod, l); err != nil {
		return res, fmt.Errorf("error setting Pod as ready: %w", err)
	}
	return res, nil
}

func (er *egressPodsReconciler) setPodReady(ctx context.Context, pod *corev1.Pod, l *zap.SugaredLogger) error {
	if slices.ContainsFunc(pod.Status.Conditions, func(c corev1.PodCondition) bool {
		return c.Type == tsEgressReadinessGate
	}) {
		return nil
	}
	l.Infof("Pod is ready to route traffic to all egress targets")
	pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{
		Type:               tsEgressReadinessGate,
		Status:             corev1.ConditionTrue,
		LastTransitionTime: metav1.Time{Time: er.clock.Now()},
	})
	return er.Status().Update(ctx, pod)
}

// healthCheckState is the result of a single request to an egress Service health check endpoint with the goal of
// hitting a specific backend Pod.
type healthCheckState int8

const (
	cannotVerify healthCheckState = iota // not verifiable for this setup (i.e. an earlier proxy version)
	unreachable                          // no backends or another network error
	notFound                             // hit another backend
	unhealthy                            // not 200
	podNotReady                          // Pod is not ready, i.e. does not have an IP address yet
	healthy                              // 200
)

// lookupPodRouteViaSvc attempts to reach a Pod via a health check endpoint served by a Service and returns the state
// of the health check.
func (er *egressPodsReconciler) lookupPodRouteViaSvc(ctx context.Context, pod *corev1.Pod, healthCheckAddr string, l *zap.SugaredLogger) (healthCheckState, error) {
	if !slices.ContainsFunc(pod.Spec.Containers[0].Env, func(e corev1.EnvVar) bool {
		return e.Name == "TS_ENABLE_HEALTH_CHECK" && e.Value == "true"
	}) {
		l.Debugf("Pod does not have health check enabled, unable to verify if it is currently routable via Service")
		return cannotVerify, nil
	}
	wantsIP, err := podIPv4(pod)
	if err != nil {
		return -1, fmt.Errorf("error determining Pod's IP address: %w", err)
	}
	if wantsIP == "" {
		return podNotReady, nil
	}
	ctx, cancel := context.WithTimeout(ctx, time.Second*3)
	defer cancel()
	req, err := http.NewRequestWithContext(ctx, httpm.GET, healthCheckAddr, nil)
	if err != nil {
		return -1, fmt.Errorf("error creating new HTTP request: %w", err)
	}
	// Do not reuse the same connection for the next request, to maximize the chance of hitting all backends
	// equally.
	req.Close = true
	resp, err := er.httpClient.Do(req)
	if err != nil {
		// This is most likely because this is the first Pod and it has not yet been added to the Service
		// endpoints. Other error types are possible, but checking for those would likely make the system too
		// fragile.
		return unreachable, nil
	}
	defer resp.Body.Close()
	gotIP := resp.Header.Get(kubetypes.PodIPv4Header)
	if gotIP == "" {
		l.Debugf("Health check response does not contain the Pod's IP header, unable to verify if the Pod is currently routable via Service")
		return cannotVerify, nil
	}
	if !strings.EqualFold(wantsIP, gotIP) {
		return notFound, nil
	}
	if resp.StatusCode != http.StatusOK {
		return unhealthy, nil
	}
	return healthy, nil
}

// numCalls returns the number of times an endpoint on a ProxyGroup Service should be called before it can safely be
// assumed that, if none of the responses came back from a specific Pod, traffic for the Service is currently not
// being routed to that Pod. This assumes that traffic for the Service is routed round-robin, so
// InternalTrafficPolicy must be 'Cluster' and session affinity must be None.
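// For example, with 3 replicas each health check endpoint is called up to 9 times, so under round-robin routing each
// backend is expected to receive roughly three of those calls.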
func numCalls(replicas int32) int32 {
	return replicas * 3
}

// doer is an interface for an HTTP client that can be set to a mock client in tests.
type doer interface {
	Do(*http.Request) (*http.Response, error)
}