Raj Singh 6ed86a0251 cmd/k8s-operator: add IDP CRD for OpenID Connect identity provider
Adds a new IDP (Identity Provider) Custom Resource Definition to the Tailscale Kubernetes operator. This allows users to deploy and manage   tsidp instances as Kubernetes resources.

Updates #16666

Signed-off-by: Raj Singh <raj@tailscale.com>
2025-07-27 12:19:02 -05:00

541 lines
18 KiB
Go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
//go:build !plan9
package main
import (
"context"
"errors"
"fmt"
"net/http"
"regexp"
"slices"
"strings"
"sync"
"go.uber.org/zap"
xslices "golang.org/x/exp/slices"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"tailscale.com/client/tailscale"
tsoperator "tailscale.com/k8s-operator"
tsapi "tailscale.com/k8s-operator/apis/v1alpha1"
"tailscale.com/kube/kubetypes"
"tailscale.com/tailcfg"
"tailscale.com/tstime"
"tailscale.com/util/clientmetric"
"tailscale.com/util/set"
)
const (
reasonIDPCreationFailed = "IDPCreationFailed"
reasonIDPCreating = "IDPCreating"
reasonIDPCreated = "IDPCreated"
reasonIDPInvalid = "IDPInvalid"
// emptyJSONObject is the initial value for funnel clients secret data
emptyJSONObject = "{}"
// Network constants
minPort = 1
maxPort = 65535
)
var (
// dnsLabelRegex validates DNS labels according to RFC 1123
dnsLabelRegex = regexp.MustCompile(`^[a-z0-9]([-a-z0-9]*[a-z0-9])?$`)
)
var gaugeIDPResources = clientmetric.NewGauge(kubetypes.MetricIDPCount)
// IDPReconciler syncs IDP statefulsets with their definition in
// IDP CRs.
type IDPReconciler struct {
client.Client
l *zap.SugaredLogger
recorder record.EventRecorder
clock tstime.Clock
tsNamespace string
tsClient tsClient
loginServer string // optional URL of the control server
mu sync.Mutex // protects following
idps set.Slice[types.UID] // for idps gauge
}
func (r *IDPReconciler) logger(name string) *zap.SugaredLogger {
return r.l.With("IDP", name)
}
func (r *IDPReconciler) Reconcile(ctx context.Context, req reconcile.Request) (res reconcile.Result, err error) {
logger := r.logger(req.Name)
logger.Debugf("starting reconcile")
defer func() {
if err != nil {
logger.Errorf("reconcile finished with error: %v", err)
} else {
logger.Debugf("reconcile finished")
}
}()
idp := new(tsapi.IDP)
err = r.Get(ctx, req.NamespacedName, idp)
if apierrors.IsNotFound(err) {
logger.Debugf("IDP not found, assuming it was deleted")
return reconcile.Result{}, nil
} else if err != nil {
return reconcile.Result{}, fmt.Errorf("failed to get tailscale.com IDP: %w", err)
}
if markedForDeletion(idp) {
logger.Debugf("IDP is being deleted, cleaning up resources")
ix := xslices.Index(idp.Finalizers, FinalizerName)
if ix < 0 {
logger.Debugf("no finalizer, nothing to do")
return reconcile.Result{}, nil
}
if done, err := r.maybeCleanup(ctx, idp); err != nil {
if strings.Contains(err.Error(), optimisticLockErrorMsg) {
logger.Debugf("optimistic lock error during cleanup, retrying: %v", err)
return reconcile.Result{RequeueAfter: shortRequeue}, nil
}
return reconcile.Result{}, err
} else if !done {
logger.Debugf("IDP resource cleanup not yet finished, will retry...")
return reconcile.Result{RequeueAfter: shortRequeue}, nil
}
idp.Finalizers = slices.Delete(idp.Finalizers, ix, ix+1)
if err := r.Update(ctx, idp); err != nil {
return reconcile.Result{}, err
}
return reconcile.Result{}, nil
}
oldIDPStatus := idp.Status.DeepCopy()
setStatusReady := func(idp *tsapi.IDP, status metav1.ConditionStatus, reason, message string) (reconcile.Result, error) {
tsoperator.SetIDPCondition(idp, tsapi.IDPReady, status, reason, message, idp.Generation, r.clock, logger)
if !apiequality.Semantic.DeepEqual(oldIDPStatus, &idp.Status) {
// An error encountered here should get returned by the Reconcile function.
if updateErr := r.Client.Status().Update(ctx, idp); updateErr != nil {
err = errors.Join(err, updateErr)
}
}
return reconcile.Result{}, err
}
if !slices.Contains(idp.Finalizers, FinalizerName) {
// Log once during initial provisioning when finalizer is added.
logger.Infof("ensuring IDP is set up")
idp.Finalizers = append(idp.Finalizers, FinalizerName)
if err := r.Update(ctx, idp); err != nil {
return setStatusReady(idp, metav1.ConditionFalse, reasonIDPCreationFailed, fmt.Sprintf("failed to add finalizer: %v", err))
}
}
if err := r.validate(ctx, idp); err != nil {
message := fmt.Sprintf("IDP is invalid: %s", err)
r.recorder.Eventf(idp, corev1.EventTypeWarning, reasonIDPInvalid, message)
return setStatusReady(idp, metav1.ConditionFalse, reasonIDPInvalid, message)
}
if err = r.maybeProvision(ctx, idp); err != nil {
reason := reasonIDPCreationFailed
message := fmt.Sprintf("failed creating IDP: %s", err)
if strings.Contains(err.Error(), optimisticLockErrorMsg) {
reason = reasonIDPCreating
message = fmt.Sprintf("optimistic lock error, retrying: %s", err)
err = nil
logger.Info(message)
} else {
r.recorder.Eventf(idp, corev1.EventTypeWarning, reasonIDPCreationFailed, message)
}
return setStatusReady(idp, metav1.ConditionFalse, reason, message)
}
logger.Info("IDP resources synced")
// Update status with device information, similar to how Recorder does it
if err = r.updateStatus(ctx, idp); err != nil {
return setStatusReady(idp, metav1.ConditionFalse, reasonIDPCreationFailed, fmt.Sprintf("failed updating status: %s", err))
}
// Update the status after successful provisioning.
// Note: oldIDPStatus was captured before maybeProvision, so any status
// updates made during provisioning will be included in the update.
return setStatusReady(idp, metav1.ConditionTrue, reasonIDPCreated, reasonIDPCreated)
}
// validate validates the IDP spec.
func (r *IDPReconciler) validate(_ context.Context, idp *tsapi.IDP) error {
// Validate tags using the standard CheckTag function
for _, tag := range idp.Spec.Tags {
if err := tailcfg.CheckTag(string(tag)); err != nil {
return fmt.Errorf("invalid tag %q: %w", tag, err)
}
}
// Validate hostname
if idp.Spec.Hostname != "" {
if len(idp.Spec.Hostname) > 63 {
return fmt.Errorf("hostname %q must be 63 characters or less", idp.Spec.Hostname)
}
// Validate hostname format (DNS label)
if !isValidDNSLabel(idp.Spec.Hostname) {
return fmt.Errorf("hostname %q must be a valid DNS label (lowercase letters, numbers, and hyphens only; cannot start or end with hyphen)", idp.Spec.Hostname)
}
}
// Validate port
if idp.Spec.Port != 0 {
if idp.Spec.Port < minPort || idp.Spec.Port > maxPort {
return fmt.Errorf("port %d is out of valid range (%d-%d)", idp.Spec.Port, minPort, maxPort)
}
}
// Validate local port
if idp.Spec.LocalPort != nil {
if *idp.Spec.LocalPort < minPort || *idp.Spec.LocalPort > maxPort {
return fmt.Errorf("localPort %d is out of valid range (%d-%d)", *idp.Spec.LocalPort, minPort, maxPort)
}
}
// Validate funnel with port
if idp.Spec.EnableFunnel && idp.Spec.Port != 0 && idp.Spec.Port != 443 {
return fmt.Errorf("when enableFunnel is true, port must be 443 or unset")
}
return nil
}
// maybeProvision ensures that all IDP resources are created as needed.
func (r *IDPReconciler) maybeProvision(ctx context.Context, idp *tsapi.IDP) error {
logger := r.logger(idp.Name)
// Ensure ServiceAccount exists
logger.Debugf("ensuring ServiceAccount %s exists", idp.Name)
sa := idpServiceAccount(idp, r.tsNamespace)
if _, err := createOrMaybeUpdate(ctx, r.Client, r.tsNamespace, sa, func(existing *corev1.ServiceAccount) error {
// Check that we don't clobber a pre-existing ServiceAccount not owned by this IDP
if sa.Name != idp.Name && !apiequality.Semantic.DeepEqual(existing.OwnerReferences, idpOwnerReference(idp)) {
return fmt.Errorf("custom ServiceAccount name %q specified but conflicts with a pre-existing ServiceAccount in the %s namespace", sa.Name, sa.Namespace)
}
existing.Annotations = sa.Annotations
existing.Labels = sa.Labels
return nil
}); err != nil {
return fmt.Errorf("failed to create or update ServiceAccount: %w", err)
}
// Clean up any old ServiceAccounts if the name changed
if err := r.maybeCleanupServiceAccounts(ctx, idp, sa.Name); err != nil {
return fmt.Errorf("failed to cleanup old ServiceAccounts: %w", err)
}
logger.Debugf("ServiceAccount synced")
// Ensure Role exists
role := idpRole(idp, r.tsNamespace)
if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, role, func(existing *rbacv1.Role) {
existing.Rules = role.Rules
existing.Labels = role.Labels
existing.Annotations = role.Annotations
}); err != nil {
return fmt.Errorf("failed to create or update Role: %w", err)
}
logger.Debugf("Role synced")
// Ensure RoleBinding exists
roleBinding := idpRoleBinding(idp, r.tsNamespace)
if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, roleBinding, func(existing *rbacv1.RoleBinding) {
existing.RoleRef = roleBinding.RoleRef
existing.Subjects = roleBinding.Subjects
existing.Labels = roleBinding.Labels
existing.Annotations = roleBinding.Annotations
}); err != nil {
return fmt.Errorf("failed to create or update RoleBinding: %w", err)
}
logger.Debugf("RoleBinding synced")
// Create auth secret
logger.Debugf("ensuring auth secret exists")
authSecret, err := r.authSecret(ctx, idp)
if err != nil {
return fmt.Errorf("failed to create auth secret: %w", err)
}
if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, authSecret, func(existing *corev1.Secret) {
existing.StringData = authSecret.StringData
}); err != nil {
return fmt.Errorf("failed to create or update auth Secret: %w", err)
}
logger.Debugf("Auth Secret synced")
// State Secret is precreated so we can use the IDP CR as its owner ref.
// This follows the same pattern as the Recorder reconciler.
stateSecret := idpStateSecret(idp, r.tsNamespace)
if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, stateSecret, func(s *corev1.Secret) {
s.ObjectMeta.Labels = stateSecret.ObjectMeta.Labels
s.ObjectMeta.Annotations = stateSecret.ObjectMeta.Annotations
}); err != nil {
return fmt.Errorf("error creating state Secret: %w", err)
}
logger.Debugf("State Secret synced")
// Ensure funnel clients secret exists with proper owner reference.
// This secret stores state for the IDP when running with funnel enabled.
funnelClientsSecret := &corev1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("%s-funnel-clients", idp.Name),
Namespace: r.tsNamespace,
Labels: map[string]string{"app": "idp", "idp": idp.Name},
OwnerReferences: idpOwnerReference(idp),
},
Data: map[string][]byte{
"funnel-clients": []byte(emptyJSONObject),
},
}
if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, funnelClientsSecret, func(existing *corev1.Secret) {
existing.Labels = funnelClientsSecret.Labels
// Initialize data if it doesn't exist, but don't overwrite existing data
if existing.Data == nil {
existing.Data = map[string][]byte{}
}
if _, exists := existing.Data["funnel-clients"]; !exists {
existing.Data["funnel-clients"] = []byte(emptyJSONObject)
}
}); err != nil {
return fmt.Errorf("failed to create or update funnel clients Secret: %w", err)
}
logger.Debugf("Funnel clients Secret synced")
// Ensure StatefulSet exists
sts := idpStatefulSet(idp, r.tsNamespace, r.loginServer)
if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, sts, func(existing *appsv1.StatefulSet) {
existing.Spec.Replicas = sts.Spec.Replicas
existing.Spec.Template = sts.Spec.Template
existing.Labels = sts.Labels
existing.Annotations = sts.Annotations
}); err != nil {
return fmt.Errorf("failed to create or update StatefulSet: %w", err)
}
logger.Debugf("StatefulSet synced")
// Create Service for OIDC endpoints
svc := idpService(idp, r.tsNamespace)
if _, err := createOrUpdate(ctx, r.Client, r.tsNamespace, svc, func(existing *corev1.Service) {
existing.Spec.Selector = svc.Spec.Selector
existing.Spec.Ports = svc.Spec.Ports
existing.Spec.Type = svc.Spec.Type
}); err != nil {
return fmt.Errorf("failed to create or update Service: %w", err)
}
logger.Debugf("Service synced")
// Update gauge metrics
r.mu.Lock()
r.idps.Add(idp.UID)
gaugeIDPResources.Set(int64(r.idps.Len()))
r.mu.Unlock()
logger.Debugf("updated metrics, total IDPs: %d", r.idps.Len())
// Don't update status here - it will be updated in the main reconcile loop
// after provisioning is complete, similar to how Recorder works
return nil
}
// updateStatus updates the IDP status with current device information.
func (r *IDPReconciler) updateStatus(ctx context.Context, idp *tsapi.IDP) error {
logger := r.logger(idp.Name)
// Update basic status fields
idp.Status.ObservedGeneration = idp.Generation
// Set hostname
if idp.Spec.Hostname != "" {
idp.Status.Hostname = idp.Spec.Hostname
} else {
idp.Status.Hostname = "idp"
}
// Check kubestore state secret for device info.
stateSecretName := fmt.Sprintf("%s-state", idp.Name)
stateSecret := &corev1.Secret{}
if err := r.Get(ctx, client.ObjectKey{
Name: stateSecretName,
Namespace: r.tsNamespace,
}, stateSecret); err != nil {
// Device not ready yet, don't set URL
logger.Debugf("state secret not found yet, device may still be initializing")
return nil
}
// Extract device info from kubestore state
prefs, ok, err := getDevicePrefs(stateSecret)
if err != nil {
return fmt.Errorf("error parsing state secret: %w", err)
}
if !ok || prefs.Config == nil || prefs.Config.NodeID == "" {
// Device not fully registered yet
logger.Debugf("device not fully registered yet")
return nil
}
// Get device details from API
device, err := r.tsClient.Device(ctx, string(prefs.Config.NodeID), nil)
if err != nil {
logger.Debugf("failed to get device info: %v", err)
// Don't fail on API errors, device exists but we can't get details
return nil
}
// Update status with actual device information
if device.Hostname != "" {
idp.Status.Hostname = device.Hostname
}
if len(device.Addresses) > 0 {
idp.Status.TailnetIPs = device.Addresses
}
// Set URL based on LoginName from prefs (MagicDNS name)
if dnsName := prefs.Config.UserProfile.LoginName; dnsName != "" {
idp.Status.URL = fmt.Sprintf("https://%s", dnsName)
}
logger.Debugf("updated status with device info from API")
return nil
}
// maybeCleanupServiceAccounts deletes any dangling ServiceAccounts owned by the IDP
// if the ServiceAccount name has been changed. This is a no-op if the name hasn't changed.
func (r *IDPReconciler) maybeCleanupServiceAccounts(ctx context.Context, idp *tsapi.IDP, currentName string) error {
logger := r.logger(idp.Name)
// List all ServiceAccounts owned by this IDP
sas := &corev1.ServiceAccountList{}
if err := r.List(ctx, sas, client.InNamespace(r.tsNamespace), client.MatchingLabels(map[string]string{
"app": "idp",
"idp": idp.Name,
})); err != nil {
return fmt.Errorf("error listing ServiceAccounts for cleanup: %w", err)
}
for _, sa := range sas.Items {
if sa.Name == currentName {
continue
}
if err := r.Delete(ctx, &sa); err != nil {
if apierrors.IsNotFound(err) {
logger.Debugf("ServiceAccount %s not found, likely already deleted", sa.Name)
} else {
return fmt.Errorf("error deleting ServiceAccount %s: %w", sa.Name, err)
}
} else {
logger.Debugf("deleted old ServiceAccount %s", sa.Name)
}
}
return nil
}
// maybeCleanup just deletes the device from the tailnet. All the kubernetes
// resources linked to an IDP will get cleaned up via owner references
// (which we can use because they are all in the same namespace).
func (r *IDPReconciler) maybeCleanup(ctx context.Context, idp *tsapi.IDP) (bool, error) {
logger := r.logger(idp.Name)
// Get the state secret
stateSecretName := fmt.Sprintf("%s-state", idp.Name)
stateSecret := &corev1.Secret{}
err := r.Get(ctx, client.ObjectKey{
Name: stateSecretName,
Namespace: r.tsNamespace,
}, stateSecret)
if apierrors.IsNotFound(err) {
logger.Debugf("state Secret %s not found, device may not have been registered, continuing cleanup", stateSecretName)
r.mu.Lock()
r.idps.Remove(idp.UID)
gaugeIDPResources.Set(int64(r.idps.Len()))
r.mu.Unlock()
return true, nil
}
if err != nil {
return false, fmt.Errorf("error getting state Secret: %w", err)
}
// Extract device info from kubestore state secret
prefs, ok, err := getDevicePrefs(stateSecret)
if err != nil {
return false, fmt.Errorf("error parsing state Secret: %w", err)
}
if !ok || prefs.Config == nil {
logger.Debugf("state Secret %s does not contain node ID, continuing cleanup", stateSecretName)
r.mu.Lock()
r.idps.Remove(idp.UID)
gaugeIDPResources.Set(int64(r.idps.Len()))
r.mu.Unlock()
return true, nil
}
// Delete device from tailnet
nodeID := string(prefs.Config.NodeID)
logger.Debugf("deleting device %s from control", nodeID)
if err := r.tsClient.DeleteDevice(ctx, nodeID); err != nil {
errResp := &tailscale.ErrResponse{}
if ok := errors.As(err, errResp); ok && errResp.Status == http.StatusNotFound {
logger.Debugf("device %s not found, likely because it has already been deleted from control", nodeID)
} else {
return false, fmt.Errorf("error deleting device: %w", err)
}
} else {
logger.Debugf("device %s deleted from control", nodeID)
}
// Log final cleanup completion before removing finalizer.
logger.Infof("cleaned up IDP resources")
r.mu.Lock()
r.idps.Remove(idp.UID)
gaugeIDPResources.Set(int64(r.idps.Len()))
r.mu.Unlock()
return true, nil
}
// authSecret creates a secret containing the auth key for the IDP.
func (r *IDPReconciler) authSecret(ctx context.Context, idp *tsapi.IDP) (*corev1.Secret, error) {
logger := r.logger(idp.Name)
tags := idp.Spec.Tags
if len(tags) == 0 {
tags = tsapi.Tags{"tag:k8s"}
}
tagsSlice := make([]string, len(tags))
for i, tag := range tags {
tagsSlice[i] = string(tag)
}
authKey, err := newAuthKey(ctx, r.tsClient, tagsSlice)
if err != nil {
return nil, fmt.Errorf("failed to create auth key: %w", err)
}
logger.Debugf("created auth key for tags %v", tags)
return idpAuthSecret(idp, r.tsNamespace, authKey), nil
}
// isValidDNSLabel checks if a string is a valid DNS label according to RFC 1123
func isValidDNSLabel(label string) bool {
return dnsLabelRegex.MatchString(label)
}