mirror of
https://github.com/tailscale/tailscale.git
synced 2024-11-25 11:05:45 +00:00
cmd/k8s-operator: restart ProxyGroup pods less (#14045)
We currently annotate pods with a hash of the tailscaled config so that we can trigger pod restarts whenever it changes. However, the hash updates more frequently than is necessary causing more restarts than is necessary. This commit removes two causes; scaling up/down and removing the auth key after pods have initially authed to control. However, note that pods will still restart on scale-up/down because of the updated set of volumes mounted into each pod. Hopefully we can fix that in a planned follow-up PR. Updates #13406 Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
This commit is contained in:
parent
4e0fc037e6
commit
d8a3683fdf
@ -353,7 +353,7 @@ func (r *ProxyGroupReconciler) deleteTailnetDevice(ctx context.Context, id tailc
|
||||
|
||||
func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, pg *tsapi.ProxyGroup, proxyClass *tsapi.ProxyClass) (hash string, err error) {
|
||||
logger := r.logger(pg.Name)
|
||||
var allConfigs []tailscaledConfigs
|
||||
var configSHA256Sum string
|
||||
for i := range pgReplicas(pg) {
|
||||
cfgSecret := &corev1.Secret{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
@ -389,7 +389,6 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error creating tailscaled config: %w", err)
|
||||
}
|
||||
allConfigs = append(allConfigs, configs)
|
||||
|
||||
for cap, cfg := range configs {
|
||||
cfgJSON, err := json.Marshal(cfg)
|
||||
@ -399,6 +398,32 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
|
||||
mak.Set(&cfgSecret.StringData, tsoperator.TailscaledConfigFileName(cap), string(cfgJSON))
|
||||
}
|
||||
|
||||
// The config sha256 sum is a value for a hash annotation used to trigger
|
||||
// pod restarts when tailscaled config changes. Any config changes apply
|
||||
// to all replicas, so it is sufficient to only hash the config for the
|
||||
// first replica.
|
||||
//
|
||||
// In future, we're aiming to eliminate restarts altogether and have
|
||||
// pods dynamically reload their config when it changes.
|
||||
if i == 0 {
|
||||
sum := sha256.New()
|
||||
for _, cfg := range configs {
|
||||
// Zero out the auth key so it doesn't affect the sha256 hash when we
|
||||
// remove it from the config after the pods have all authed. Otherwise
|
||||
// all the pods will need to restart immediately after authing.
|
||||
cfg.AuthKey = nil
|
||||
b, err := json.Marshal(cfg)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if _, err := sum.Write(b); err != nil {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
|
||||
configSHA256Sum = fmt.Sprintf("%x", sum.Sum(nil))
|
||||
}
|
||||
|
||||
if existingCfgSecret != nil {
|
||||
logger.Debugf("patching the existing ProxyGroup config Secret %s", cfgSecret.Name)
|
||||
if err := r.Patch(ctx, cfgSecret, client.MergeFrom(existingCfgSecret)); err != nil {
|
||||
@ -412,16 +437,7 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
|
||||
}
|
||||
}
|
||||
|
||||
sum := sha256.New()
|
||||
b, err := json.Marshal(allConfigs)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if _, err := sum.Write(b); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%x", sum.Sum(nil)), nil
|
||||
return configSHA256Sum, nil
|
||||
}
|
||||
|
||||
func pgTailscaledConfig(pg *tsapi.ProxyGroup, class *tsapi.ProxyClass, idx int32, authKey string, oldSecret *corev1.Secret) (tailscaledConfigs, error) {
|
||||
|
@ -93,6 +93,10 @@ func pgStatefulSet(pg *tsapi.ProxyGroup, namespace, image, tsFirewallMode, cfgHa
|
||||
c.Image = image
|
||||
c.VolumeMounts = func() []corev1.VolumeMount {
|
||||
var mounts []corev1.VolumeMount
|
||||
|
||||
// TODO(tomhjp): Read config directly from the secret instead. The
|
||||
// mounts change on scaling up/down which causes unnecessary restarts
|
||||
// for pods that haven't meaningfully changed.
|
||||
for i := range pgReplicas(pg) {
|
||||
mounts = append(mounts, corev1.VolumeMount{
|
||||
Name: fmt.Sprintf("tailscaledconfig-%d", i),
|
||||
|
@ -35,6 +35,8 @@
|
||||
}
|
||||
|
||||
func TestProxyGroup(t *testing.T) {
|
||||
const initialCfgHash = "6632726be70cf224049580deb4d317bba065915b5fd415461d60ed621c91b196"
|
||||
|
||||
pc := &tsapi.ProxyClass{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "default-pc",
|
||||
@ -80,6 +82,7 @@ func TestProxyGroup(t *testing.T) {
|
||||
|
||||
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "the ProxyGroup's ProxyClass default-pc is not yet in a ready state, waiting...", 0, cl, zl.Sugar())
|
||||
expectEqual(t, fc, pg, nil)
|
||||
expectProxyGroupResources(t, fc, pg, false, "")
|
||||
})
|
||||
|
||||
t.Run("observe_ProxyGroupCreating_status_reason", func(t *testing.T) {
|
||||
@ -100,10 +103,11 @@ func TestProxyGroup(t *testing.T) {
|
||||
|
||||
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "0/2 ProxyGroup pods running", 0, cl, zl.Sugar())
|
||||
expectEqual(t, fc, pg, nil)
|
||||
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
|
||||
if expected := 1; reconciler.proxyGroups.Len() != expected {
|
||||
t.Fatalf("expected %d recorders, got %d", expected, reconciler.proxyGroups.Len())
|
||||
}
|
||||
expectProxyGroupResources(t, fc, pg, true)
|
||||
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
|
||||
keyReq := tailscale.KeyCapabilities{
|
||||
Devices: tailscale.KeyDeviceCapabilities{
|
||||
Create: tailscale.KeyDeviceCreateCapabilities{
|
||||
@ -135,7 +139,7 @@ func TestProxyGroup(t *testing.T) {
|
||||
}
|
||||
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionTrue, reasonProxyGroupReady, reasonProxyGroupReady, 0, cl, zl.Sugar())
|
||||
expectEqual(t, fc, pg, nil)
|
||||
expectProxyGroupResources(t, fc, pg, true)
|
||||
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
|
||||
})
|
||||
|
||||
t.Run("scale_up_to_3", func(t *testing.T) {
|
||||
@ -146,6 +150,7 @@ func TestProxyGroup(t *testing.T) {
|
||||
expectReconciled(t, reconciler, "", pg.Name)
|
||||
tsoperator.SetProxyGroupCondition(pg, tsapi.ProxyGroupReady, metav1.ConditionFalse, reasonProxyGroupCreating, "2/3 ProxyGroup pods running", 0, cl, zl.Sugar())
|
||||
expectEqual(t, fc, pg, nil)
|
||||
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
|
||||
|
||||
addNodeIDToStateSecrets(t, fc, pg)
|
||||
expectReconciled(t, reconciler, "", pg.Name)
|
||||
@ -155,7 +160,7 @@ func TestProxyGroup(t *testing.T) {
|
||||
TailnetIPs: []string{"1.2.3.4", "::1"},
|
||||
})
|
||||
expectEqual(t, fc, pg, nil)
|
||||
expectProxyGroupResources(t, fc, pg, true)
|
||||
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
|
||||
})
|
||||
|
||||
t.Run("scale_down_to_1", func(t *testing.T) {
|
||||
@ -163,11 +168,26 @@ func TestProxyGroup(t *testing.T) {
|
||||
mustUpdate(t, fc, "", pg.Name, func(p *tsapi.ProxyGroup) {
|
||||
p.Spec = pg.Spec
|
||||
})
|
||||
|
||||
expectReconciled(t, reconciler, "", pg.Name)
|
||||
|
||||
pg.Status.Devices = pg.Status.Devices[:1] // truncate to only the first device.
|
||||
expectEqual(t, fc, pg, nil)
|
||||
expectProxyGroupResources(t, fc, pg, true, initialCfgHash)
|
||||
})
|
||||
|
||||
expectProxyGroupResources(t, fc, pg, true)
|
||||
t.Run("trigger_config_change_and_observe_new_config_hash", func(t *testing.T) {
|
||||
pc.Spec.TailscaleConfig = &tsapi.TailscaleConfig{
|
||||
AcceptRoutes: true,
|
||||
}
|
||||
mustUpdate(t, fc, "", pc.Name, func(p *tsapi.ProxyClass) {
|
||||
p.Spec = pc.Spec
|
||||
})
|
||||
|
||||
expectReconciled(t, reconciler, "", pg.Name)
|
||||
|
||||
expectEqual(t, fc, pg, nil)
|
||||
expectProxyGroupResources(t, fc, pg, true, "518a86e9fae64f270f8e0ec2a2ea6ca06c10f725035d3d6caca132cd61e42a74")
|
||||
})
|
||||
|
||||
t.Run("delete_and_cleanup", func(t *testing.T) {
|
||||
@ -191,13 +211,13 @@ func TestProxyGroup(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.ProxyGroup, shouldExist bool) {
|
||||
func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.ProxyGroup, shouldExist bool, cfgHash string) {
|
||||
t.Helper()
|
||||
|
||||
role := pgRole(pg, tsNamespace)
|
||||
roleBinding := pgRoleBinding(pg, tsNamespace)
|
||||
serviceAccount := pgServiceAccount(pg, tsNamespace)
|
||||
statefulSet, err := pgStatefulSet(pg, tsNamespace, testProxyImage, "auto", "")
|
||||
statefulSet, err := pgStatefulSet(pg, tsNamespace, testProxyImage, "auto", cfgHash)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@ -207,9 +227,7 @@ func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.Prox
|
||||
expectEqual(t, fc, role, nil)
|
||||
expectEqual(t, fc, roleBinding, nil)
|
||||
expectEqual(t, fc, serviceAccount, nil)
|
||||
expectEqual(t, fc, statefulSet, func(ss *appsv1.StatefulSet) {
|
||||
ss.Spec.Template.Annotations[podAnnotationLastSetConfigFileHash] = ""
|
||||
})
|
||||
expectEqual(t, fc, statefulSet, nil)
|
||||
} else {
|
||||
expectMissing[rbacv1.Role](t, fc, role.Namespace, role.Name)
|
||||
expectMissing[rbacv1.RoleBinding](t, fc, roleBinding.Namespace, roleBinding.Name)
|
||||
@ -218,11 +236,13 @@ func expectProxyGroupResources(t *testing.T, fc client.WithWatch, pg *tsapi.Prox
|
||||
}
|
||||
|
||||
var expectedSecrets []string
|
||||
for i := range pgReplicas(pg) {
|
||||
expectedSecrets = append(expectedSecrets,
|
||||
fmt.Sprintf("%s-%d", pg.Name, i),
|
||||
fmt.Sprintf("%s-%d-config", pg.Name, i),
|
||||
)
|
||||
if shouldExist {
|
||||
for i := range pgReplicas(pg) {
|
||||
expectedSecrets = append(expectedSecrets,
|
||||
fmt.Sprintf("%s-%d", pg.Name, i),
|
||||
fmt.Sprintf("%s-%d-config", pg.Name, i),
|
||||
)
|
||||
}
|
||||
}
|
||||
expectSecrets(t, fc, expectedSecrets)
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user