mirror of
https://github.com/tailscale/tailscale.git
synced 2025-02-28 19:27:41 +00:00
275 lines
10 KiB
Go
275 lines
10 KiB
Go
![]() |
// Copyright (c) Tailscale Inc & AUTHORS
|
||
|
// SPDX-License-Identifier: BSD-3-Clause
|
||
|
|
||
|
//go:build !plan9
|
||
|
|
||
|
package main
|
||
|
|
||
|
import (
|
||
|
"context"
|
||
|
"errors"
|
||
|
"fmt"
|
||
|
"net/http"
|
||
|
"slices"
|
||
|
"strings"
|
||
|
"sync/atomic"
|
||
|
"time"
|
||
|
|
||
|
"go.uber.org/zap"
|
||
|
xslices "golang.org/x/exp/slices"
|
||
|
corev1 "k8s.io/api/core/v1"
|
||
|
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||
|
"k8s.io/apimachinery/pkg/types"
|
||
|
"sigs.k8s.io/controller-runtime/pkg/client"
|
||
|
"sigs.k8s.io/controller-runtime/pkg/reconcile"
|
||
|
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
|
||
|
"tailscale.com/kube/kubetypes"
|
||
|
"tailscale.com/logtail/backoff"
|
||
|
"tailscale.com/tstime"
|
||
|
"tailscale.com/util/httpm"
|
||
|
)
|
||
|
|
||
|
// tsEgressReadinessGate is the Pod condition type used as a readiness gate on
// egress ProxyGroup Pods; kubelet will not mark such a Pod Ready until this
// condition is set (see egressPodsReconciler).
const tsEgressReadinessGate = "tailscale.com/egress-services"
|
||
|
|
||
|
// egressPodsReconciler is responsible for setting tailscale.com/egress-services condition on egress ProxyGroup Pods.
// The condition is used as a readiness gate for the Pod, meaning that kubelet will not mark the Pod as ready before the
// condition is set. The ProxyGroup StatefulSet updates are rolled out in such a way that no Pod is restarted, before
// the previous Pod is marked as ready, so ensuring that the Pod does not get marked as ready when it is not yet able to
// route traffic for egress service prevents downtime during restarts caused by no available endpoints left because
// every Pod has been recreated and is not yet added to endpoints.
// https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-readiness-gate
type egressPodsReconciler struct {
	client.Client
	logger      *zap.SugaredLogger
	tsNamespace string       // namespace in which the egress ClusterIP Services are listed
	clock       tstime.Clock // clock used for condition transition timestamps; injectable for tests
	httpClient  doer         // http client that can be set to a mock client in tests
	maxBackoff  time.Duration // max backoff period between health check calls
}
|
||
|
|
||
|
// Reconcile reconciles an egress ProxyGroup Pods on changes to those Pods and ProxyGroup EndpointSlices. It ensures
|
||
|
// that for each Pod who is ready to route traffic to all egress services for the ProxyGroup, the Pod has a
|
||
|
// tailscale.com/egress-services condition to set, so that kubelet will mark the Pod as ready.
|
||
|
//
|
||
|
// For the Pod to be ready
|
||
|
// to route traffic to the egress service, the kube proxy needs to have set up the Pod's IP as an endpoint for the
|
||
|
// ClusterIP Service corresponding to the egress service.
|
||
|
//
|
||
|
// Note that the endpoints for the ClusterIP Service are configured by the operator itself using custom
|
||
|
// EndpointSlices(egress-eps-reconciler), so the routing is not blocked on Pod's readiness.
|
||
|
//
|
||
|
// Each egress service has a corresponding ClusterIP Service, that exposes all user configured
|
||
|
// tailnet ports, as well as a health check port for the proxy.
|
||
|
//
|
||
|
// The reconciler calls the health check endpoint of each Service up to N number of times, where N is the number of
|
||
|
// replicas for the ProxyGroup x 3, and checks if the received response is healthy response from the Pod being reconciled.
|
||
|
//
|
||
|
// The health check response contains a header with the
|
||
|
// Pod's IP address- this is used to determine whether the response is received from this Pod.
|
||
|
//
|
||
|
// If the Pod does not appear to be serving the health check endpoint (pre-v1.80 proxies), the reconciler just sets the
|
||
|
// readiness condition for backwards compatibility reasons.
|
||
|
func (er *egressPodsReconciler) Reconcile(ctx context.Context, req reconcile.Request) (res reconcile.Result, err error) {
|
||
|
l := er.logger.With("Pod", req.NamespacedName)
|
||
|
l.Debugf("starting reconcile")
|
||
|
defer l.Debugf("reconcile finished")
|
||
|
|
||
|
pod := new(corev1.Pod)
|
||
|
err = er.Get(ctx, req.NamespacedName, pod)
|
||
|
if apierrors.IsNotFound(err) {
|
||
|
return reconcile.Result{}, nil
|
||
|
}
|
||
|
if err != nil {
|
||
|
return reconcile.Result{}, fmt.Errorf("failed to get Pod: %w", err)
|
||
|
}
|
||
|
if !pod.DeletionTimestamp.IsZero() {
|
||
|
l.Debugf("Pod is being deleted, do nothing")
|
||
|
return res, nil
|
||
|
}
|
||
|
if pod.Labels[LabelParentType] != proxyTypeProxyGroup {
|
||
|
l.Infof("[unexpected] reconciler called for a Pod that is not a ProxyGroup Pod")
|
||
|
return res, nil
|
||
|
}
|
||
|
|
||
|
// If the Pod does not have the readiness gate set, there is no need to add the readiness condition. In practice
|
||
|
// this will happen if the user has configured custom TS_LOCAL_ADDR_PORT, thus disabling the graceful failover.
|
||
|
if !slices.ContainsFunc(pod.Spec.ReadinessGates, func(r corev1.PodReadinessGate) bool {
|
||
|
return r.ConditionType == tsEgressReadinessGate
|
||
|
}) {
|
||
|
l.Debug("Pod does not have egress readiness gate set, skipping")
|
||
|
return res, nil
|
||
|
}
|
||
|
|
||
|
proxyGroupName := pod.Labels[LabelParentName]
|
||
|
pg := new(tsapi.ProxyGroup)
|
||
|
if err := er.Get(ctx, types.NamespacedName{Name: proxyGroupName}, pg); err != nil {
|
||
|
return res, fmt.Errorf("error getting ProxyGroup %q: %w", proxyGroupName, err)
|
||
|
}
|
||
|
if pg.Spec.Type != typeEgress {
|
||
|
l.Infof("[unexpected] reconciler called for %q ProxyGroup Pod", pg.Spec.Type)
|
||
|
return res, nil
|
||
|
}
|
||
|
// Get all ClusterIP Services for all egress targets exposed to cluster via this ProxyGroup.
|
||
|
lbls := map[string]string{
|
||
|
LabelManaged: "true",
|
||
|
labelProxyGroup: proxyGroupName,
|
||
|
labelSvcType: typeEgress,
|
||
|
}
|
||
|
svcs := &corev1.ServiceList{}
|
||
|
if err := er.List(ctx, svcs, client.InNamespace(er.tsNamespace), client.MatchingLabels(lbls)); err != nil {
|
||
|
return res, fmt.Errorf("error listing ClusterIP Services")
|
||
|
}
|
||
|
|
||
|
idx := xslices.IndexFunc(pod.Status.Conditions, func(c corev1.PodCondition) bool {
|
||
|
return c.Type == tsEgressReadinessGate
|
||
|
})
|
||
|
if idx != -1 {
|
||
|
l.Debugf("Pod is already ready, do nothing")
|
||
|
return res, nil
|
||
|
}
|
||
|
|
||
|
var routesMissing atomic.Bool
|
||
|
errChan := make(chan error, len(svcs.Items))
|
||
|
for _, svc := range svcs.Items {
|
||
|
s := svc
|
||
|
go func() {
|
||
|
ll := l.With("service_name", s.Name)
|
||
|
d := retrieveClusterDomain(er.tsNamespace, ll)
|
||
|
healthCheckAddr := healthCheckForSvc(&s, d)
|
||
|
if healthCheckAddr == "" {
|
||
|
ll.Debugf("ClusterIP Service does not expose a health check endpoint, unable to verify if routing is set up")
|
||
|
errChan <- nil
|
||
|
return
|
||
|
}
|
||
|
|
||
|
var routesSetup bool
|
||
|
bo := backoff.NewBackoff(s.Name, ll.Infof, er.maxBackoff)
|
||
|
for range numCalls(pgReplicas(pg)) {
|
||
|
if ctx.Err() != nil {
|
||
|
errChan <- nil
|
||
|
return
|
||
|
}
|
||
|
state, err := er.lookupPodRouteViaSvc(ctx, pod, healthCheckAddr, ll)
|
||
|
if err != nil {
|
||
|
errChan <- fmt.Errorf("error validating if routing has been set up for Pod: %w", err)
|
||
|
return
|
||
|
}
|
||
|
if state == healthy || state == cannotVerify {
|
||
|
routesSetup = true
|
||
|
break
|
||
|
}
|
||
|
if state == unreachable || state == unhealthy || state == podNotReady {
|
||
|
bo.BackOff(ctx, errors.New("backoff"))
|
||
|
}
|
||
|
}
|
||
|
if !routesSetup {
|
||
|
ll.Debugf("Pod is not yet configured as Service endpoint")
|
||
|
routesMissing.Store(true)
|
||
|
}
|
||
|
errChan <- nil
|
||
|
}()
|
||
|
}
|
||
|
for range len(svcs.Items) {
|
||
|
e := <-errChan
|
||
|
err = errors.Join(err, e)
|
||
|
}
|
||
|
if err != nil {
|
||
|
return res, fmt.Errorf("error verifying conectivity: %w", err)
|
||
|
}
|
||
|
if rm := routesMissing.Load(); rm {
|
||
|
l.Info("Pod is not yet added as an endpoint for all egress targets, waiting...")
|
||
|
return reconcile.Result{RequeueAfter: shortRequeue}, nil
|
||
|
}
|
||
|
if err := er.setPodReady(ctx, pod, l); err != nil {
|
||
|
return res, fmt.Errorf("error setting Pod as ready: %w", err)
|
||
|
}
|
||
|
return res, nil
|
||
|
}
|
||
|
|
||
|
func (er *egressPodsReconciler) setPodReady(ctx context.Context, pod *corev1.Pod, l *zap.SugaredLogger) error {
|
||
|
if slices.ContainsFunc(pod.Status.Conditions, func(c corev1.PodCondition) bool {
|
||
|
return c.Type == tsEgressReadinessGate
|
||
|
}) {
|
||
|
return nil
|
||
|
}
|
||
|
l.Infof("Pod is ready to route traffic to all egress targets")
|
||
|
pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{
|
||
|
Type: tsEgressReadinessGate,
|
||
|
Status: corev1.ConditionTrue,
|
||
|
LastTransitionTime: metav1.Time{Time: er.clock.Now()},
|
||
|
})
|
||
|
return er.Status().Update(ctx, pod)
|
||
|
}
|
||
|
|
||
|
// healthCheckState is the result of a single request to an egress Service health check endpoint with a goal to hit a
// specific backend Pod.
type healthCheckState int8

const (
	cannotVerify healthCheckState = iota // not verifiable for this setup (i.e earlier proxy version)
	unreachable                          // no backends or another network error
	notFound                             // hit another backend
	unhealthy                            // not 200
	podNotReady                          // Pod is not ready, i.e does not have an IP address yet
	healthy                              // 200
)
|
||
|
|
||
|
// lookupPodRouteViaSvc attempts to reach a Pod using a health check endpoint served by a Service and returns the state of the health check.
|
||
|
func (er *egressPodsReconciler) lookupPodRouteViaSvc(ctx context.Context, pod *corev1.Pod, healthCheckAddr string, l *zap.SugaredLogger) (healthCheckState, error) {
|
||
|
if !slices.ContainsFunc(pod.Spec.Containers[0].Env, func(e corev1.EnvVar) bool {
|
||
|
return e.Name == "TS_ENABLE_HEALTH_CHECK" && e.Value == "true"
|
||
|
}) {
|
||
|
l.Debugf("Pod does not have health check enabled, unable to verify if it is currently routable via Service")
|
||
|
return cannotVerify, nil
|
||
|
}
|
||
|
wantsIP, err := podIPv4(pod)
|
||
|
if err != nil {
|
||
|
return -1, fmt.Errorf("error determining Pod's IP address: %w", err)
|
||
|
}
|
||
|
if wantsIP == "" {
|
||
|
return podNotReady, nil
|
||
|
}
|
||
|
|
||
|
ctx, cancel := context.WithTimeout(ctx, time.Second*3)
|
||
|
defer cancel()
|
||
|
req, err := http.NewRequestWithContext(ctx, httpm.GET, healthCheckAddr, nil)
|
||
|
if err != nil {
|
||
|
return -1, fmt.Errorf("error creating new HTTP request: %w", err)
|
||
|
}
|
||
|
// Do not re-use the same connection for the next request so to maximize the chance of hitting all backends equally.
|
||
|
req.Close = true
|
||
|
resp, err := er.httpClient.Do(req)
|
||
|
if err != nil {
|
||
|
// This is most likely because this is the first Pod and is not yet added to Service endoints. Other
|
||
|
// error types are possible, but checking for those would likely make the system too fragile.
|
||
|
return unreachable, nil
|
||
|
}
|
||
|
defer resp.Body.Close()
|
||
|
gotIP := resp.Header.Get(kubetypes.PodIPv4Header)
|
||
|
if gotIP == "" {
|
||
|
l.Debugf("Health check does not return Pod's IP header, unable to verify if Pod is currently routable via Service")
|
||
|
return cannotVerify, nil
|
||
|
}
|
||
|
if !strings.EqualFold(wantsIP, gotIP) {
|
||
|
return notFound, nil
|
||
|
}
|
||
|
if resp.StatusCode != http.StatusOK {
|
||
|
return unhealthy, nil
|
||
|
}
|
||
|
return healthy, nil
|
||
|
}
|
||
|
|
||
|
// numCalls returns the number of times an endpoint on a ProxyGroup Service should be called till it can be safely
// assumed that, if none of the responses came back from a specific Pod then traffic for the Service is currently not
// being routed to that Pod. This assumes that traffic for the Service is routed via round robin, so
// InternalTrafficPolicy must be 'Cluster' and session affinity must be None.
func numCalls(replicas int32) int32 {
	const attemptsPerReplica = 3
	return attemptsPerReplica * replicas
}
|
||
|
|
||
|
// doer is an interface for HTTP client that can be set to a mock client in tests.
// *http.Client satisfies it.
type doer interface {
	// Do executes a single HTTP request and returns the response or an error.
	Do(*http.Request) (*http.Response, error)
}
|