mirror of
https://github.com/tailscale/tailscale.git
synced 2025-04-24 09:51:41 +00:00

Ensure no services are advertised as part of shutting down tailscaled. Prefs are only edited if services are currently advertised, and they're edited we wait for control's ~15s (+ buffer) delay to failover. Note that editing prefs will trigger a synchronous write to the state Secret, so it may fail to persist state if the ProxyGroup is getting scaled down and therefore has its RBAC deleted at the same time, but that failure doesn't stop prefs being updated within the local backend, doesn't affect connectivity to control, and the state Secret is about to get deleted anyway, so the only negative side effect is a harmless error log during shutdown. Control still learns that the node is no longer advertising the service and triggers the failover. Note that the first version of this used a PreStop lifecycle hook, but that only supports GET methods and we need the shutdown to trigger side effects (updating prefs) so it didn't seem appropriate to expose that functionality on a GET endpoint that's accessible on the k8s network. Updates tailscale/corp#24795 Change-Id: I0a9a4fe7a5395ca76135ceead05cbc3ee32b3d3c Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
404 lines
12 KiB
Go
404 lines
12 KiB
Go
// Copyright (c) Tailscale Inc & AUTHORS
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
//go:build !plan9
|
|
|
|
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"slices"
|
|
"strconv"
|
|
|
|
appsv1 "k8s.io/api/apps/v1"
|
|
corev1 "k8s.io/api/core/v1"
|
|
rbacv1 "k8s.io/api/rbac/v1"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/util/intstr"
|
|
"sigs.k8s.io/yaml"
|
|
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
|
|
"tailscale.com/kube/egressservices"
|
|
"tailscale.com/kube/kubetypes"
|
|
"tailscale.com/types/ptr"
|
|
)
|
|
|
|
// deletionGracePeriodSeconds is set to 6 minutes to ensure that the pre-stop hook of these proxies have enough chance to terminate gracefully.
|
|
const deletionGracePeriodSeconds int64 = 360
|
|
|
|
// Returns the base StatefulSet definition for a ProxyGroup. A ProxyClass may be
|
|
// applied over the top after.
|
|
func pgStatefulSet(pg *tsapi.ProxyGroup, namespace, image, tsFirewallMode string, proxyClass *tsapi.ProxyClass) (*appsv1.StatefulSet, error) {
|
|
ss := new(appsv1.StatefulSet)
|
|
if err := yaml.Unmarshal(proxyYaml, &ss); err != nil {
|
|
return nil, fmt.Errorf("failed to unmarshal proxy spec: %w", err)
|
|
}
|
|
// Validate some base assumptions.
|
|
if len(ss.Spec.Template.Spec.InitContainers) != 1 {
|
|
return nil, fmt.Errorf("[unexpected] base proxy config had %d init containers instead of 1", len(ss.Spec.Template.Spec.InitContainers))
|
|
}
|
|
if len(ss.Spec.Template.Spec.Containers) != 1 {
|
|
return nil, fmt.Errorf("[unexpected] base proxy config had %d containers instead of 1", len(ss.Spec.Template.Spec.Containers))
|
|
}
|
|
|
|
// StatefulSet config.
|
|
ss.ObjectMeta = metav1.ObjectMeta{
|
|
Name: pg.Name,
|
|
Namespace: namespace,
|
|
Labels: pgLabels(pg.Name, nil),
|
|
OwnerReferences: pgOwnerReference(pg),
|
|
}
|
|
ss.Spec.Replicas = ptr.To(pgReplicas(pg))
|
|
ss.Spec.Selector = &metav1.LabelSelector{
|
|
MatchLabels: pgLabels(pg.Name, nil),
|
|
}
|
|
|
|
// Template config.
|
|
tmpl := &ss.Spec.Template
|
|
tmpl.ObjectMeta = metav1.ObjectMeta{
|
|
Name: pg.Name,
|
|
Namespace: namespace,
|
|
Labels: pgLabels(pg.Name, nil),
|
|
DeletionGracePeriodSeconds: ptr.To[int64](10),
|
|
}
|
|
tmpl.Spec.ServiceAccountName = pg.Name
|
|
tmpl.Spec.InitContainers[0].Image = image
|
|
proxyConfigVolName := pgEgressCMName(pg.Name)
|
|
if pg.Spec.Type == tsapi.ProxyGroupTypeIngress {
|
|
proxyConfigVolName = pgIngressCMName(pg.Name)
|
|
}
|
|
tmpl.Spec.Volumes = func() []corev1.Volume {
|
|
var volumes []corev1.Volume
|
|
for i := range pgReplicas(pg) {
|
|
volumes = append(volumes, corev1.Volume{
|
|
Name: fmt.Sprintf("tailscaledconfig-%d", i),
|
|
VolumeSource: corev1.VolumeSource{
|
|
Secret: &corev1.SecretVolumeSource{
|
|
SecretName: pgConfigSecretName(pg.Name, i),
|
|
},
|
|
},
|
|
})
|
|
}
|
|
|
|
volumes = append(volumes, corev1.Volume{
|
|
Name: proxyConfigVolName,
|
|
VolumeSource: corev1.VolumeSource{
|
|
ConfigMap: &corev1.ConfigMapVolumeSource{
|
|
LocalObjectReference: corev1.LocalObjectReference{
|
|
Name: proxyConfigVolName,
|
|
},
|
|
},
|
|
},
|
|
})
|
|
|
|
return volumes
|
|
}()
|
|
|
|
// Main container config.
|
|
c := &ss.Spec.Template.Spec.Containers[0]
|
|
c.Image = image
|
|
c.VolumeMounts = func() []corev1.VolumeMount {
|
|
var mounts []corev1.VolumeMount
|
|
|
|
// TODO(tomhjp): Read config directly from the secret instead. The
|
|
// mounts change on scaling up/down which causes unnecessary restarts
|
|
// for pods that haven't meaningfully changed.
|
|
for i := range pgReplicas(pg) {
|
|
mounts = append(mounts, corev1.VolumeMount{
|
|
Name: fmt.Sprintf("tailscaledconfig-%d", i),
|
|
ReadOnly: true,
|
|
MountPath: fmt.Sprintf("/etc/tsconfig/%s-%d", pg.Name, i),
|
|
})
|
|
}
|
|
|
|
mounts = append(mounts, corev1.VolumeMount{
|
|
Name: proxyConfigVolName,
|
|
MountPath: "/etc/proxies",
|
|
ReadOnly: true,
|
|
})
|
|
|
|
return mounts
|
|
}()
|
|
c.Env = func() []corev1.EnvVar {
|
|
envs := []corev1.EnvVar{
|
|
{
|
|
// TODO(irbekrm): verify that .status.podIPs are always set, else read in .status.podIP as well.
|
|
Name: "POD_IPS", // this will be a comma separate list i.e 10.136.0.6,2600:1900:4011:161:0:e:0:6
|
|
ValueFrom: &corev1.EnvVarSource{
|
|
FieldRef: &corev1.ObjectFieldSelector{
|
|
FieldPath: "status.podIPs",
|
|
},
|
|
},
|
|
},
|
|
{
|
|
Name: "TS_KUBE_SECRET",
|
|
Value: "$(POD_NAME)",
|
|
},
|
|
{
|
|
Name: "TS_STATE",
|
|
Value: "kube:$(POD_NAME)",
|
|
},
|
|
{
|
|
Name: "TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR",
|
|
Value: "/etc/tsconfig/$(POD_NAME)",
|
|
},
|
|
}
|
|
|
|
if tsFirewallMode != "" {
|
|
envs = append(envs, corev1.EnvVar{
|
|
Name: "TS_DEBUG_FIREWALL_MODE",
|
|
Value: tsFirewallMode,
|
|
})
|
|
}
|
|
|
|
if pg.Spec.Type == tsapi.ProxyGroupTypeEgress {
|
|
envs = append(envs,
|
|
// TODO(irbekrm): in 1.80 we deprecated TS_EGRESS_SERVICES_CONFIG_PATH in favour of
|
|
// TS_EGRESS_PROXIES_CONFIG_PATH. Remove it in 1.84.
|
|
corev1.EnvVar{
|
|
Name: "TS_EGRESS_SERVICES_CONFIG_PATH",
|
|
Value: fmt.Sprintf("/etc/proxies/%s", egressservices.KeyEgressServices),
|
|
},
|
|
corev1.EnvVar{
|
|
Name: "TS_EGRESS_PROXIES_CONFIG_PATH",
|
|
Value: "/etc/proxies",
|
|
},
|
|
corev1.EnvVar{
|
|
Name: "TS_INTERNAL_APP",
|
|
Value: kubetypes.AppProxyGroupEgress,
|
|
},
|
|
corev1.EnvVar{
|
|
Name: "TS_ENABLE_HEALTH_CHECK",
|
|
Value: "true",
|
|
})
|
|
} else { // ingress
|
|
envs = append(envs, corev1.EnvVar{
|
|
Name: "TS_INTERNAL_APP",
|
|
Value: kubetypes.AppProxyGroupIngress,
|
|
},
|
|
corev1.EnvVar{
|
|
Name: "TS_SERVE_CONFIG",
|
|
Value: fmt.Sprintf("/etc/proxies/%s", serveConfigKey),
|
|
},
|
|
corev1.EnvVar{
|
|
// Run proxies in cert share mode to
|
|
// ensure that only one TLS cert is
|
|
// issued for an HA Ingress.
|
|
Name: "TS_EXPERIMENTAL_CERT_SHARE",
|
|
Value: "true",
|
|
},
|
|
)
|
|
}
|
|
return append(c.Env, envs...)
|
|
}()
|
|
|
|
// The pre-stop hook is used to ensure that a replica does not get terminated while cluster traffic for egress
|
|
// services is still being routed to it.
|
|
//
|
|
// This mechanism currently (2025-01-26) rely on the local health check being accessible on the Pod's
|
|
// IP, so they are not supported for ProxyGroups where users have configured TS_LOCAL_ADDR_PORT to a custom
|
|
// value.
|
|
//
|
|
// NB: For _Ingress_ ProxyGroups, we run shutdown logic within containerboot
|
|
// in reaction to a SIGTERM signal instead of using a pre-stop hook. This is
|
|
// because Ingress pods need to unadvertise services, and it's preferable to
|
|
// avoid triggering those side-effects from a GET request that would be
|
|
// accessible to the whole cluster network (in the absence of NetworkPolicy
|
|
// rules).
|
|
//
|
|
// TODO(tomhjp): add a readiness probe or gate to Ingress Pods. There is a
|
|
// small window where the Pod is marked ready but routing can still fail.
|
|
if pg.Spec.Type == tsapi.ProxyGroupTypeEgress && !hasLocalAddrPortSet(proxyClass) {
|
|
c.Lifecycle = &corev1.Lifecycle{
|
|
PreStop: &corev1.LifecycleHandler{
|
|
HTTPGet: &corev1.HTTPGetAction{
|
|
Path: kubetypes.EgessServicesPreshutdownEP,
|
|
Port: intstr.FromInt(defaultLocalAddrPort),
|
|
},
|
|
},
|
|
}
|
|
// Set the deletion grace period to 6 minutes to ensure that the pre-stop hook has enough time to terminate
|
|
// gracefully.
|
|
ss.Spec.Template.DeletionGracePeriodSeconds = ptr.To(deletionGracePeriodSeconds)
|
|
}
|
|
return ss, nil
|
|
}
|
|
|
|
func pgServiceAccount(pg *tsapi.ProxyGroup, namespace string) *corev1.ServiceAccount {
|
|
return &corev1.ServiceAccount{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: pg.Name,
|
|
Namespace: namespace,
|
|
Labels: pgLabels(pg.Name, nil),
|
|
OwnerReferences: pgOwnerReference(pg),
|
|
},
|
|
}
|
|
}
|
|
|
|
func pgRole(pg *tsapi.ProxyGroup, namespace string) *rbacv1.Role {
|
|
return &rbacv1.Role{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: pg.Name,
|
|
Namespace: namespace,
|
|
Labels: pgLabels(pg.Name, nil),
|
|
OwnerReferences: pgOwnerReference(pg),
|
|
},
|
|
Rules: []rbacv1.PolicyRule{
|
|
{
|
|
APIGroups: []string{""},
|
|
Resources: []string{"secrets"},
|
|
Verbs: []string{
|
|
"list",
|
|
},
|
|
},
|
|
{
|
|
APIGroups: []string{""},
|
|
Resources: []string{"secrets"},
|
|
Verbs: []string{
|
|
"get",
|
|
"patch",
|
|
"update",
|
|
},
|
|
ResourceNames: func() (secrets []string) {
|
|
for i := range pgReplicas(pg) {
|
|
secrets = append(secrets,
|
|
pgConfigSecretName(pg.Name, i), // Config with auth key.
|
|
fmt.Sprintf("%s-%d", pg.Name, i), // State.
|
|
)
|
|
}
|
|
return secrets
|
|
}(),
|
|
},
|
|
{
|
|
APIGroups: []string{""},
|
|
Resources: []string{"events"},
|
|
Verbs: []string{
|
|
"create",
|
|
"patch",
|
|
"get",
|
|
},
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func pgRoleBinding(pg *tsapi.ProxyGroup, namespace string) *rbacv1.RoleBinding {
|
|
return &rbacv1.RoleBinding{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: pg.Name,
|
|
Namespace: namespace,
|
|
Labels: pgLabels(pg.Name, nil),
|
|
OwnerReferences: pgOwnerReference(pg),
|
|
},
|
|
Subjects: []rbacv1.Subject{
|
|
{
|
|
Kind: "ServiceAccount",
|
|
Name: pg.Name,
|
|
Namespace: namespace,
|
|
},
|
|
},
|
|
RoleRef: rbacv1.RoleRef{
|
|
Kind: "Role",
|
|
Name: pg.Name,
|
|
},
|
|
}
|
|
}
|
|
|
|
func pgStateSecrets(pg *tsapi.ProxyGroup, namespace string) (secrets []*corev1.Secret) {
|
|
for i := range pgReplicas(pg) {
|
|
secrets = append(secrets, &corev1.Secret{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: fmt.Sprintf("%s-%d", pg.Name, i),
|
|
Namespace: namespace,
|
|
Labels: pgSecretLabels(pg.Name, "state"),
|
|
OwnerReferences: pgOwnerReference(pg),
|
|
},
|
|
})
|
|
}
|
|
|
|
return secrets
|
|
}
|
|
|
|
func pgEgressCM(pg *tsapi.ProxyGroup, namespace string) (*corev1.ConfigMap, []byte) {
|
|
hp := hepPings(pg)
|
|
hpBs := []byte(strconv.Itoa(hp))
|
|
return &corev1.ConfigMap{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: pgEgressCMName(pg.Name),
|
|
Namespace: namespace,
|
|
Labels: pgLabels(pg.Name, nil),
|
|
OwnerReferences: pgOwnerReference(pg),
|
|
},
|
|
BinaryData: map[string][]byte{egressservices.KeyHEPPings: hpBs},
|
|
}, hpBs
|
|
}
|
|
|
|
func pgIngressCM(pg *tsapi.ProxyGroup, namespace string) *corev1.ConfigMap {
|
|
return &corev1.ConfigMap{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: pgIngressCMName(pg.Name),
|
|
Namespace: namespace,
|
|
Labels: pgLabels(pg.Name, nil),
|
|
OwnerReferences: pgOwnerReference(pg),
|
|
},
|
|
}
|
|
}
|
|
|
|
func pgSecretLabels(pgName, secretType string) map[string]string {
|
|
return pgLabels(pgName, map[string]string{
|
|
kubetypes.LabelSecretType: secretType, // "config" or "state".
|
|
})
|
|
}
|
|
|
|
func pgLabels(pgName string, customLabels map[string]string) map[string]string {
|
|
l := make(map[string]string, len(customLabels)+3)
|
|
for k, v := range customLabels {
|
|
l[k] = v
|
|
}
|
|
|
|
l[kubetypes.LabelManaged] = "true"
|
|
l[LabelParentType] = "proxygroup"
|
|
l[LabelParentName] = pgName
|
|
|
|
return l
|
|
}
|
|
|
|
func pgOwnerReference(owner *tsapi.ProxyGroup) []metav1.OwnerReference {
|
|
return []metav1.OwnerReference{*metav1.NewControllerRef(owner, tsapi.SchemeGroupVersion.WithKind("ProxyGroup"))}
|
|
}
|
|
|
|
func pgReplicas(pg *tsapi.ProxyGroup) int32 {
|
|
if pg.Spec.Replicas != nil {
|
|
return *pg.Spec.Replicas
|
|
}
|
|
|
|
return 2
|
|
}
|
|
|
|
func pgConfigSecretName(pgName string, i int32) string {
|
|
return fmt.Sprintf("%s-%d-config", pgName, i)
|
|
}
|
|
|
|
func pgEgressCMName(pg string) string {
|
|
return fmt.Sprintf("%s-egress-config", pg)
|
|
}
|
|
|
|
// hasLocalAddrPortSet returns true if the proxyclass has the TS_LOCAL_ADDR_PORT env var set. For egress ProxyGroups,
|
|
// currently (2025-01-26) this means that the ProxyGroup does not support graceful failover.
|
|
func hasLocalAddrPortSet(proxyClass *tsapi.ProxyClass) bool {
|
|
if proxyClass == nil || proxyClass.Spec.StatefulSet == nil || proxyClass.Spec.StatefulSet.Pod == nil || proxyClass.Spec.StatefulSet.Pod.TailscaleContainer == nil {
|
|
return false
|
|
}
|
|
return slices.ContainsFunc(proxyClass.Spec.StatefulSet.Pod.TailscaleContainer.Env, func(env tsapi.Env) bool {
|
|
return env.Name == envVarTSLocalAddrPort
|
|
})
|
|
}
|
|
|
|
// hepPings returns the number of times a health check endpoint exposed by a Service fronting ProxyGroup replicas should
|
|
// be pinged to ensure that all currently configured backend replicas are hit.
|
|
func hepPings(pg *tsapi.ProxyGroup) int {
|
|
rc := pgReplicas(pg)
|
|
// Assuming a Service implemented using round robin load balancing, number-of-replica-times should be enough, but in
|
|
// practice, we cannot assume that the requests will be load balanced perfectly.
|
|
return int(rc) * 3
|
|
}
|