From b0095a5da4a0f10e85d9c6a0c5c8005a3d7ea3a1 Mon Sep 17 00:00:00 2001 From: Tom Proctor Date: Wed, 19 Mar 2025 01:53:15 -0700 Subject: [PATCH] cmd/k8s-operator: wait for VIPService before updating HA Ingress status (#15343) Update the HA Ingress controller to wait until it sees AdvertisedServices config propagated into at least 1 Pod's prefs before it updates the status on the Ingress, to ensure the ProxyGroup Pods are ready to serve traffic before indicating that the Ingress is ready Updates tailscale/corp#24795 Change-Id: I1b8ce23c9e312d08f9d02e48d70bdebd9e1a4757 Signed-off-by: Tom Proctor --- cmd/k8s-operator/ingress-for-pg.go | 91 ++++++++++++++++++------- cmd/k8s-operator/ingress-for-pg_test.go | 25 +++++++ cmd/k8s-operator/operator.go | 42 +++++++++++- cmd/k8s-operator/proxygroup.go | 6 +- cmd/k8s-operator/proxygroup_specs.go | 4 +- cmd/k8s-operator/tsrecorder.go | 41 ++++++----- 6 files changed, 158 insertions(+), 51 deletions(-) diff --git a/cmd/k8s-operator/ingress-for-pg.go b/cmd/k8s-operator/ingress-for-pg.go index cdbfecb35..fe85509ad 100644 --- a/cmd/k8s-operator/ingress-for-pg.go +++ b/cmd/k8s-operator/ingress-for-pg.go @@ -154,13 +154,13 @@ func (r *HAIngressReconciler) maybeProvision(ctx context.Context, hostname strin pg := &tsapi.ProxyGroup{} if err := r.Get(ctx, client.ObjectKey{Name: pgName}, pg); err != nil { if apierrors.IsNotFound(err) { - logger.Infof("ProxyGroup %q does not exist", pgName) + logger.Infof("ProxyGroup does not exist") return false, nil } return false, fmt.Errorf("getting ProxyGroup %q: %w", pgName, err) } if !tsoperator.ProxyGroupIsReady(pg) { - logger.Infof("ProxyGroup %q is not (yet) ready", pgName) + logger.Infof("ProxyGroup is not (yet) ready") return false, nil } @@ -175,8 +175,6 @@ func (r *HAIngressReconciler) maybeProvision(ctx context.Context, hostname strin r.recorder.Event(ing, corev1.EventTypeWarning, "HTTPSNotEnabled", "HTTPS is not enabled on the tailnet; ingress may not work") } - logger = logger.With("proxy-group", pg.Name) - if !slices.Contains(ing.Finalizers, FinalizerNamePG) { // This log line is printed exactly once during initial provisioning, // because once the finalizer is in place this block gets skipped. So, @@ -326,7 +324,7 @@ func (r *HAIngressReconciler) maybeProvision(ctx context.Context, hostname strin !reflect.DeepEqual(vipSvc.Tags, existingVIPSvc.Tags) || !reflect.DeepEqual(vipSvc.Ports, existingVIPSvc.Ports) || !strings.EqualFold(vipSvc.Comment, existingVIPSvc.Comment) { - logger.Infof("Ensuring VIPService %q exists and is up to date", hostname) + logger.Infof("Ensuring VIPService exists and is up to date") if err := r.tsClient.CreateOrUpdateVIPService(ctx, vipSvc); err != nil { return false, fmt.Errorf("error creating VIPService: %w", err) } @@ -338,31 +336,48 @@ func (r *HAIngressReconciler) maybeProvision(ctx context.Context, hostname strin return false, fmt.Errorf("failed to update tailscaled config: %w", err) } - // TODO(irbekrm): check that the replicas are ready to route traffic for the VIPService before updating Ingress - // status. - // 6. Update Ingress status + // 6. Update Ingress status if ProxyGroup Pods are ready. + count, err := r.numberPodsAdvertising(ctx, pg.Name, serviceName) + if err != nil { + return false, fmt.Errorf("failed to check if any Pods are configured: %w", err) + } + oldStatus := ing.Status.DeepCopy() - ports := []networkingv1.IngressPortStatus{ - { - Protocol: "TCP", - Port: 443, - }, + + switch count { + case 0: + ing.Status.LoadBalancer.Ingress = nil + default: + ports := []networkingv1.IngressPortStatus{ + { + Protocol: "TCP", + Port: 443, + }, + } + if isHTTPEndpointEnabled(ing) { + ports = append(ports, networkingv1.IngressPortStatus{ + Protocol: "TCP", + Port: 80, + }) + } + ing.Status.LoadBalancer.Ingress = []networkingv1.IngressLoadBalancerIngress{ + { + Hostname: dnsName, + Ports: ports, + }, + } } - if isHTTPEndpointEnabled(ing) { - ports = append(ports, networkingv1.IngressPortStatus{ - Protocol: "TCP", - Port: 80, - }) - } - ing.Status.LoadBalancer.Ingress = []networkingv1.IngressLoadBalancerIngress{ - { - Hostname: dnsName, - Ports: ports, - }, - } - if apiequality.Semantic.DeepEqual(oldStatus, ing.Status) { + if apiequality.Semantic.DeepEqual(oldStatus, &ing.Status) { return svcsChanged, nil } + + const prefix = "Updating Ingress status" + if count == 0 { + logger.Infof("%s. No Pods are advertising VIPService yet", prefix) + } else { + logger.Infof("%s. %d Pod(s) advertising VIPService", prefix, count) + } + if err := r.Status().Update(ctx, ing); err != nil { return false, fmt.Errorf("failed to update Ingress status: %w", err) } @@ -726,6 +741,30 @@ func (a *HAIngressReconciler) maybeUpdateAdvertiseServicesConfig(ctx context.Con return nil } +func (a *HAIngressReconciler) numberPodsAdvertising(ctx context.Context, pgName string, serviceName tailcfg.ServiceName) (int, error) { + // Get all state Secrets for this ProxyGroup. + secrets := &corev1.SecretList{} + if err := a.List(ctx, secrets, client.InNamespace(a.tsNamespace), client.MatchingLabels(pgSecretLabels(pgName, "state"))); err != nil { + return 0, fmt.Errorf("failed to list ProxyGroup %q state Secrets: %w", pgName, err) + } + + var count int + for _, secret := range secrets.Items { + prefs, ok, err := getDevicePrefs(&secret) + if err != nil { + return 0, fmt.Errorf("error getting node metadata: %w", err) + } + if !ok { + continue + } + if slices.Contains(prefs.AdvertiseServices, serviceName.String()) { + count++ + } + } + + return count, nil +} + // OwnerRef is an owner reference that uniquely identifies a Tailscale // Kubernetes operator instance. type OwnerRef struct { diff --git a/cmd/k8s-operator/ingress-for-pg_test.go b/cmd/k8s-operator/ingress-for-pg_test.go index 2f675337e..0e90ec980 100644 --- a/cmd/k8s-operator/ingress-for-pg_test.go +++ b/cmd/k8s-operator/ingress-for-pg_test.go @@ -461,6 +461,31 @@ func TestIngressPGReconciler_HTTPEndpoint(t *testing.T) { t.Fatal(err) } + // Status will be empty until the VIPService shows up in prefs. + if !reflect.DeepEqual(ing.Status.LoadBalancer.Ingress, []networkingv1.IngressLoadBalancerIngress(nil)) { + t.Errorf("incorrect Ingress status: got %v, want empty", + ing.Status.LoadBalancer.Ingress) + } + + // Add the VIPService to prefs to have the Ingress recognised as ready. + mustCreate(t, fc, &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pg-0", + Namespace: "operator-ns", + Labels: pgSecretLabels("test-pg", "state"), + }, + Data: map[string][]byte{ + "_current-profile": []byte("profile-foo"), + "profile-foo": []byte(`{"AdvertiseServices":["svc:my-svc"],"Config":{"NodeID":"node-foo"}}`), + }, + }) + + // Reconcile and re-fetch Ingress. + expectReconciled(t, ingPGR, "default", "test-ingress") + if err := fc.Get(context.Background(), client.ObjectKeyFromObject(ing), ing); err != nil { + t.Fatal(err) + } + wantStatus := []networkingv1.IngressPortStatus{ {Port: 443, Protocol: "TCP"}, {Port: 80, Protocol: "TCP"}, diff --git a/cmd/k8s-operator/operator.go b/cmd/k8s-operator/operator.go index 1dcd130fb..ff2a959bd 100644 --- a/cmd/k8s-operator/operator.go +++ b/cmd/k8s-operator/operator.go @@ -347,6 +347,7 @@ func runReconcilers(opts reconcilerOpts) { For(&networkingv1.Ingress{}). Named("ingress-pg-reconciler"). Watches(&corev1.Service{}, handler.EnqueueRequestsFromMapFunc(serviceHandlerForIngressPG(mgr.GetClient(), startlog))). + Watches(&corev1.Secret{}, handler.EnqueueRequestsFromMapFunc(ingressesFromPGStateSecret(mgr.GetClient(), startlog))). Watches(&tsapi.ProxyGroup{}, ingressProxyGroupFilter). Complete(&HAIngressReconciler{ recorder: eventRecorder, @@ -978,8 +979,6 @@ func egressEpsFromPGStateSecrets(cl client.Client, ns string) handler.MapFunc { if v, ok := o.GetLabels()[LabelManaged]; !ok || v != "true" { return nil } - // TODO(irbekrm): for now this is good enough as all ProxyGroups are egress. Add a type check once we - // have ingress ProxyGroups. if parentType := o.GetLabels()[LabelParentType]; parentType != "proxygroup" { return nil } @@ -1040,6 +1039,45 @@ func reconcileRequestsForPG(pg string, cl client.Client, ns string) []reconcile. return reqs } +func ingressesFromPGStateSecret(cl client.Client, logger *zap.SugaredLogger) handler.MapFunc { + return func(ctx context.Context, o client.Object) []reconcile.Request { + secret, ok := o.(*corev1.Secret) + if !ok { + logger.Infof("[unexpected] ProxyGroup handler triggered for an object that is not a ProxyGroup") + return nil + } + if secret.ObjectMeta.Labels[LabelManaged] != "true" { + return nil + } + if secret.ObjectMeta.Labels[LabelParentType] != "proxygroup" { + return nil + } + if secret.ObjectMeta.Labels[labelSecretType] != "state" { + return nil + } + pgName, ok := secret.ObjectMeta.Labels[LabelParentName] + if !ok { + return nil + } + + ingList := &networkingv1.IngressList{} + if err := cl.List(ctx, ingList, client.MatchingFields{indexIngressProxyGroup: pgName}); err != nil { + logger.Infof("error listing Ingresses, skipping a reconcile for event on Secret %s: %v", secret.Name, err) + return nil + } + reqs := make([]reconcile.Request, 0) + for _, ing := range ingList.Items { + reqs = append(reqs, reconcile.Request{ + NamespacedName: types.NamespacedName{ + Namespace: ing.Namespace, + Name: ing.Name, + }, + }) + } + return reqs + } +} + // egressSvcsFromEgressProxyGroup is an event handler for egress ProxyGroups. It returns reconcile requests for all // user-created ExternalName Services that should be exposed on this ProxyGroup. func egressSvcsFromEgressProxyGroup(cl client.Client, logger *zap.SugaredLogger) handler.MapFunc { diff --git a/cmd/k8s-operator/proxygroup.go b/cmd/k8s-operator/proxygroup.go index 463d29249..c961c0471 100644 --- a/cmd/k8s-operator/proxygroup.go +++ b/cmd/k8s-operator/proxygroup.go @@ -645,7 +645,7 @@ func (r *ProxyGroupReconciler) getNodeMetadata(ctx context.Context, pg *tsapi.Pr return nil, fmt.Errorf("unexpected secret %s was labelled as owned by the ProxyGroup %s: %w", secret.Name, pg.Name, err) } - id, dnsName, ok, err := getNodeMetadata(ctx, &secret) + prefs, ok, err := getDevicePrefs(&secret) if err != nil { return nil, err } @@ -656,8 +656,8 @@ func (r *ProxyGroupReconciler) getNodeMetadata(ctx context.Context, pg *tsapi.Pr nm := nodeMetadata{ ordinal: ordinal, stateSecret: &secret, - tsID: id, - dnsName: dnsName, + tsID: prefs.Config.NodeID, + dnsName: prefs.Config.UserProfile.LoginName, } pod := &corev1.Pod{} if err := r.Get(ctx, client.ObjectKey{Namespace: r.tsNamespace, Name: secret.Name}, pod); err != nil && !apierrors.IsNotFound(err) { diff --git a/cmd/k8s-operator/proxygroup_specs.go b/cmd/k8s-operator/proxygroup_specs.go index 40bbaec17..8c17c7b6b 100644 --- a/cmd/k8s-operator/proxygroup_specs.go +++ b/cmd/k8s-operator/proxygroup_specs.go @@ -318,9 +318,9 @@ func pgIngressCM(pg *tsapi.ProxyGroup, namespace string) *corev1.ConfigMap { } } -func pgSecretLabels(pgName, typ string) map[string]string { +func pgSecretLabels(pgName, secretType string) map[string]string { return pgLabels(pgName, map[string]string{ - labelSecretType: typ, // "config" or "state". + labelSecretType: secretType, // "config" or "state". }) } diff --git a/cmd/k8s-operator/tsrecorder.go b/cmd/k8s-operator/tsrecorder.go index 44ce731fe..e9e6b2c6c 100644 --- a/cmd/k8s-operator/tsrecorder.go +++ b/cmd/k8s-operator/tsrecorder.go @@ -230,7 +230,7 @@ func (r *RecorderReconciler) maybeProvision(ctx context.Context, tsr *tsapi.Reco func (r *RecorderReconciler) maybeCleanup(ctx context.Context, tsr *tsapi.Recorder) (bool, error) { logger := r.logger(tsr.Name) - id, _, ok, err := r.getNodeMetadata(ctx, tsr.Name) + prefs, ok, err := r.getDevicePrefs(ctx, tsr.Name) if err != nil { return false, err } @@ -243,6 +243,7 @@ func (r *RecorderReconciler) maybeCleanup(ctx context.Context, tsr *tsapi.Record return true, nil } + id := string(prefs.Config.NodeID) logger.Debugf("deleting device %s from control", string(id)) if err := r.tsClient.DeleteDevice(ctx, string(id)); err != nil { errResp := &tailscale.ErrResponse{} @@ -327,34 +328,33 @@ func (r *RecorderReconciler) getStateSecret(ctx context.Context, tsrName string) return secret, nil } -func (r *RecorderReconciler) getNodeMetadata(ctx context.Context, tsrName string) (id tailcfg.StableNodeID, dnsName string, ok bool, err error) { +func (r *RecorderReconciler) getDevicePrefs(ctx context.Context, tsrName string) (prefs prefs, ok bool, err error) { secret, err := r.getStateSecret(ctx, tsrName) if err != nil || secret == nil { - return "", "", false, err + return prefs, false, err } - return getNodeMetadata(ctx, secret) + return getDevicePrefs(secret) } -// getNodeMetadata returns 'ok == true' iff the node ID is found. The dnsName +// getDevicePrefs returns 'ok == true' iff the node ID is found. The dnsName // is expected to always be non-empty if the node ID is, but not required. -func getNodeMetadata(ctx context.Context, secret *corev1.Secret) (id tailcfg.StableNodeID, dnsName string, ok bool, err error) { +func getDevicePrefs(secret *corev1.Secret) (prefs prefs, ok bool, err error) { // TODO(tomhjp): Should maybe use ipn to parse the following info instead. currentProfile, ok := secret.Data[currentProfileKey] if !ok { - return "", "", false, nil + return prefs, false, nil } profileBytes, ok := secret.Data[string(currentProfile)] if !ok { - return "", "", false, nil + return prefs, false, nil } - var profile profile - if err := json.Unmarshal(profileBytes, &profile); err != nil { - return "", "", false, fmt.Errorf("failed to extract node profile info from state Secret %s: %w", secret.Name, err) + if err := json.Unmarshal(profileBytes, &prefs); err != nil { + return prefs, false, fmt.Errorf("failed to extract node profile info from state Secret %s: %w", secret.Name, err) } - ok = profile.Config.NodeID != "" - return tailcfg.StableNodeID(profile.Config.NodeID), profile.Config.UserProfile.LoginName, ok, nil + ok = prefs.Config.NodeID != "" + return prefs, ok, nil } func (r *RecorderReconciler) getDeviceInfo(ctx context.Context, tsrName string) (d tsapi.RecorderTailnetDevice, ok bool, err error) { @@ -367,14 +367,14 @@ func (r *RecorderReconciler) getDeviceInfo(ctx context.Context, tsrName string) } func getDeviceInfo(ctx context.Context, tsClient tsClient, secret *corev1.Secret) (d tsapi.RecorderTailnetDevice, ok bool, err error) { - nodeID, dnsName, ok, err := getNodeMetadata(ctx, secret) + prefs, ok, err := getDevicePrefs(secret) if !ok || err != nil { return tsapi.RecorderTailnetDevice{}, false, err } // TODO(tomhjp): The profile info doesn't include addresses, which is why we // need the API. Should we instead update the profile to include addresses? - device, err := tsClient.Device(ctx, string(nodeID), nil) + device, err := tsClient.Device(ctx, string(prefs.Config.NodeID), nil) if err != nil { return tsapi.RecorderTailnetDevice{}, false, fmt.Errorf("failed to get device info from API: %w", err) } @@ -383,20 +383,25 @@ func getDeviceInfo(ctx context.Context, tsClient tsClient, secret *corev1.Secret Hostname: device.Hostname, TailnetIPs: device.Addresses, } - if dnsName != "" { + if dnsName := prefs.Config.UserProfile.LoginName; dnsName != "" { d.URL = fmt.Sprintf("https://%s", dnsName) } return d, true, nil } -type profile struct { +// [prefs] is a subset of the ipn.Prefs struct used for extracting information +// from the state Secret of Tailscale devices. +type prefs struct { Config struct { - NodeID string `json:"NodeID"` + NodeID tailcfg.StableNodeID `json:"NodeID"` UserProfile struct { + // LoginName is the MagicDNS name of the device, e.g. foo.tail-scale.ts.net. LoginName string `json:"LoginName"` } `json:"UserProfile"` } `json:"Config"` + + AdvertiseServices []string `json:"AdvertiseServices"` } func markedForDeletion(obj metav1.Object) bool {