cmd/k8s-operator: restart ProxyGroup pods less (#14045)

We currently annotate pods with a hash of the tailscaled config so that
we can trigger pod restarts whenever it changes. However, the hash
updates more frequently than is necessary causing more restarts than is
necessary. This commit removes two causes; scaling up/down and removing
the auth key after pods have initially authed to control. However, note
that pods will still restart on scale-up/down because of the updated set
of volumes mounted into each pod. Hopefully we can fix that in a planned
follow-up PR.

Updates #13406

Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
This commit is contained in:
Tom Proctor 2024-11-12 14:18:19 +00:00 committed by GitHub
parent 4e0fc037e6
commit d8a3683fdf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 66 additions and 26 deletions

View File

@ -353,7 +353,7 @@ func (r *ProxyGroupReconciler) deleteTailnetDevice(ctx context.Context, id tailc
func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass) (hash string, err error) { func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass) (hash string, err error) {
logger := r.logger(pg.Name) logger := r.logger(pg.Name)
var allConfigs []tailscaledConfigs var configSHA256Sum string
for i := range pgReplicas(pg) { for i := range pgReplicas(pg) {
cfgSecret := &corev1.Secret{ cfgSecret := &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
@ -389,7 +389,6 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
if err != nil { if err != nil {
return "", fmt.Errorf("error creating tailscaled config: %w", err) return "", fmt.Errorf("error creating tailscaled config: %w", err)
} }
allConfigs = append(allConfigs, configs)
for cap, cfg := range configs { for cap, cfg := range configs {
cfgJSON, err := json.Marshal(cfg) cfgJSON, err := json.Marshal(cfg)
@ -399,6 +398,32 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
mak.Set(&cfgSecret.StringData, tsoperator.TailscaledConfigFileName(cap), string(cfgJSON)) mak.Set(&cfgSecret.StringData, tsoperator.TailscaledConfigFileName(cap), string(cfgJSON))
} }
// The config sha256 sum is a value for a hash annotation used to trigger
// pod restarts when tailscaled config changes. Any config changes apply
// to all replicas, so it is sufficient to only hash the config for the
// first replica.
//
// In future, we're aiming to eliminate restarts altogether and have
// pods dynamically reload their config when it changes.
if i == 0 {
sum := sha256.New()
for _, cfg := range configs {
// Zero out the auth key so it doesn't affect the sha256 hash when we
// remove it from the config after the pods have all authed. Otherwise
// all the pods will need to restart immediately after authing.
cfg.AuthKey = nil
b, err := json.Marshal(cfg)
if err != nil {
return "", err
}
if _, err := sum.Write(b); err != nil {
return "", err
}
}
configSHA256Sum = fmt.Sprintf("%x", sum.Sum(nil))
}
if existingCfgSecret != nil { if existingCfgSecret != nil {
logger.Debugf("patching the existing ProxyGroup config Secret %s", cfgSecret.Name) logger.Debugf("patching the existing ProxyGroup config Secret %s", cfgSecret.Name)
if err := r.Patch(ctx, cfgSecret, client.MergeFrom(existingCfgSecret)); err != nil { if err := r.Patch(ctx, cfgSecret, client.MergeFrom(existingCfgSecret)); err != nil {
@ -412,16 +437,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
} }
} }
sum := sha256.New() return configSHA256Sum, nil
b, err := json.Marshal(allConfigs)
if err != nil {
return "", err
}
if _, err := sum.Write(b); err != nil {
return "", err
}
return fmt.Sprintf("%x", sum.Sum(nil)), nil
} }
func pgTailscaledConfig(pg *tsapi.ProxyGroup, class *tsapi.ProxyClass, idx int32, authKey string, oldSecret *corev1.Secret) (tailscaledConfigs, error) { func pgTailscaledConfig(pg *tsapi.ProxyGroup, class *tsapi.ProxyClass, idx int32, authKey string, oldSecret *corev1.Secret) (tailscaledConfigs, error) {

View File

@ -93,6 +93,10 @@ func pgStatefulSet(pg *tsapi.ProxyGroup, namespace, image, tsFirewallMode, cfgHa
c.Image = image c.Image = image
c.VolumeMounts = func() []corev1.VolumeMount { c.VolumeMounts = func() []corev1.VolumeMount {
var mounts []corev1.VolumeMount var mounts []corev1.VolumeMount
// TODO(tomhjp): Read config directly from the secret instead. The
// mounts change on scaling up/down which causes unnecessary restarts
// for pods that haven't meaningfully changed.
for i := range pgReplicas(pg) { for i := range pgReplicas(pg) {
mounts = append(mounts, corev1.VolumeMount{ mounts = append(mounts, corev1.VolumeMount{
Name: fmt.Sprintf("tailscaledconfig-%d", i), Name: fmt.Sprintf("tailscaledconfig-%d", i),

View File

@ -35,6 +35,8 @@
} }
func TestProxyGroup(t *testing.T) { func TestProxyGroup(t *testing.T) {
const initialCfgHash = "6632726be70cf224049580deb4d317bba065915b5fd415461d60ed621c91b196"
pc := &tsapi.ProxyClass{ pc := &tsapi.ProxyClass{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: "default-pc", Name: "default-pc",
@ -80,6 +82,7 @@ func TestProxyGroup(t *testing.T) {
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "the ProxyGroup's ProxyClass default-pc is not yet in a ready state, waiting...", 0, cl, zl.Sugar()) tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "the ProxyGroup's ProxyClass default-pc is not yet in a ready state, waiting...", 0, cl, zl.Sugar())
expectEqual(t, fc, pg, nil) expectEqual(t, fc, pg, nil)
expectProxyGroupResources(t, fc, pg, false, "")
}) })
t.Run("observe_ProxyGroupCreating_status_reason", func(t *testing.T) { t.Run("observe_ProxyGroupCreating_status_reason", func(t *testing.T) {
@ -100,10 +103,11 @@ func TestProxyGroup(t *testing.T) {
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "0/2 ProxyGroup pods running", 0, cl, zl.Sugar()) tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "0/2 ProxyGroup pods running", 0, cl, zl.Sugar())
expectEqual(t, fc, pg, nil) expectEqual(t, fc, pg, nil)
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
if expected := 1; reconciler.proxyGroups.Len() != expected { if expected := 1; reconciler.proxyGroups.Len() != expected {
t.Fatalf("expected %d recorders, got %d", expected, reconciler.proxyGroups.Len()) t.Fatalf("expected %d recorders, got %d", expected, reconciler.proxyGroups.Len())
} }
expectProxyGroupResources(t, fc, pg, true) expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
keyReq := tailscale.KeyCapabilities{ keyReq := tailscale.KeyCapabilities{
Devices: tailscale.KeyDeviceCapabilities{ Devices: tailscale.KeyDeviceCapabilities{
Create: tailscale.KeyDeviceCreateCapabilities{ Create: tailscale.KeyDeviceCreateCapabilities{
@ -135,7 +139,7 @@ func TestProxyGroup(t *testing.T) {
} }
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionTrue, reasonProxyGroupReady, reasonProxyGroupReady, 0, cl, zl.Sugar()) tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionTrue, reasonProxyGroupReady, reasonProxyGroupReady, 0, cl, zl.Sugar())
expectEqual(t, fc, pg, nil) expectEqual(t, fc, pg, nil)
expectProxyGroupResources(t, fc, pg, true) expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
}) })
t.Run("scale_up_to_3", func(t *testing.T) { t.Run("scale_up_to_3", func(t *testing.T) {
@ -146,6 +150,7 @@ func TestProxyGroup(t *testing.T) {
expectReconciled(t, reconciler, "", pg.Name) expectReconciled(t, reconciler, "", pg.Name)
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "2/3 ProxyGroup pods running", 0, cl, zl.Sugar()) tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "2/3 ProxyGroup pods running", 0, cl, zl.Sugar())
expectEqual(t, fc, pg, nil) expectEqual(t, fc, pg, nil)
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
addNodeIDToStateSecrets(t, fc, pg) addNodeIDToStateSecrets(t, fc, pg)
expectReconciled(t, reconciler, "", pg.Name) expectReconciled(t, reconciler, "", pg.Name)
@ -155,7 +160,7 @@ func TestProxyGroup(t *testing.T) {
TailnetIPs: []string{"1.2.3.4", "::1"}, TailnetIPs: []string{"1.2.3.4", "::1"},
}) })
expectEqual(t, fc, pg, nil) expectEqual(t, fc, pg, nil)
expectProxyGroupResources(t, fc, pg, true) expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
}) })
t.Run("scale_down_to_1", func(t *testing.T) { t.Run("scale_down_to_1", func(t *testing.T) {
@ -163,11 +168,26 @@ func TestProxyGroup(t *testing.T) {
mustUpdate(t, fc, "", pg.Name, func(p *tsapi.ProxyGroup) { mustUpdate(t, fc, "", pg.Name, func(p *tsapi.ProxyGroup) {
p.Spec = pg.Spec p.Spec = pg.Spec
}) })
expectReconciled(t, reconciler, "", pg.Name) expectReconciled(t, reconciler, "", pg.Name)
pg.Status.Devices = pg.Status.Devices[:1] // truncate to only the first device. pg.Status.Devices = pg.Status.Devices[:1] // truncate to only the first device.
expectEqual(t, fc, pg, nil) expectEqual(t, fc, pg, nil)
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
})
expectProxyGroupResources(t, fc, pg, true) t.Run("trigger_config_change_and_observe_new_config_hash", func(t *testing.T) {
pc.Spec.TailscaleConfig = &tsapi.TailscaleConfig{
AcceptRoutes: true,
}
mustUpdate(t, fc, "", pc.Name, func(p *tsapi.ProxyClass) {
p.Spec = pc.Spec
})
expectReconciled(t, reconciler, "", pg.Name)
expectEqual(t, fc, pg, nil)
expectProxyGroupResources(t, fc, pg, true, "518a86e9fae64f270f8e0ec2a2ea6ca06c10f725035d3d6caca132cd61e42a74")
}) })
t.Run("delete_and_cleanup", func(t *testing.T) { t.Run("delete_and_cleanup", func(t *testing.T) {
@ -191,13 +211,13 @@ func TestProxyGroup(t *testing.T) {
}) })
} }
func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.ProxyGroup, shouldExist bool) { func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.ProxyGroup, shouldExist bool, cfgHash string) {
t.Helper() t.Helper()
role := pgRole(pg, tsNamespace) role := pgRole(pg, tsNamespace)
roleBinding := pgRoleBinding(pg, tsNamespace) roleBinding := pgRoleBinding(pg, tsNamespace)
serviceAccount := pgServiceAccount(pg, tsNamespace) serviceAccount := pgServiceAccount(pg, tsNamespace)
statefulSet, err := pgStatefulSet(pg, tsNamespace, testProxyImage, "auto", "") statefulSet, err := pgStatefulSet(pg, tsNamespace, testProxyImage, "auto", cfgHash)
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -207,9 +227,7 @@ func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.Prox
expectEqual(t, fc, role, nil) expectEqual(t, fc, role, nil)
expectEqual(t, fc, roleBinding, nil) expectEqual(t, fc, roleBinding, nil)
expectEqual(t, fc, serviceAccount, nil) expectEqual(t, fc, serviceAccount, nil)
expectEqual(t, fc, statefulSet, func(ss *appsv1.StatefulSet) { expectEqual(t, fc, statefulSet, nil)
ss.Spec.Template.Annotations[podAnnotationLastSetConfigFileHash] = ""
})
} else { } else {
expectMissing[rbacv1.Role](t, fc, role.Namespace, role.Name) expectMissing[rbacv1.Role](t, fc, role.Namespace, role.Name)
expectMissing[rbacv1.RoleBinding](t, fc, roleBinding.Namespace, roleBinding.Name) expectMissing[rbacv1.RoleBinding](t, fc, roleBinding.Namespace, roleBinding.Name)
@ -218,11 +236,13 @@ func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.Prox
} }
var expectedSecrets []string var expectedSecrets []string
for i := range pgReplicas(pg) { if shouldExist {
expectedSecrets = append(expectedSecrets, for i := range pgReplicas(pg) {
fmt.Sprintf("%s-%d", pg.Name, i), expectedSecrets = append(expectedSecrets,
fmt.Sprintf("%s-%d-config", pg.Name, i), fmt.Sprintf("%s-%d", pg.Name, i),
) fmt.Sprintf("%s-%d-config", pg.Name, i),
)
}
} }
expectSecrets(t, fc, expectedSecrets) expectSecrets(t, fc, expectedSecrets)
} }