all-kube: create Tailscale Service for HA kube-apiserver ProxyGroup (#16572)

Adds a new reconciler for ProxyGroups of type kube-apiserver that will provision a Tailscale Service for each replica to advertise. Adds two new condition types to the ProxyGroup, TailscaleServiceValid and TailscaleServiceConfigured, to post updates on the state of that reconciler in a way that's consistent with the service-pg reconciler. The created Tailscale Service name is configurable via a new ProxyGroup field spec.kubeAPIServer.ServiceName, which expects a string of the form "svc:<dns-label>". Lots of supporting changes were needed to implement this in a way that's consistent with other operator workflows, including: * Pulled containerboot's ensureServicesUnadvertised and certManager into kube/ libraries to be shared with k8s-proxy. Use those in k8s-proxy to aid Service cert sharing between replicas and graceful Service shutdown. * For certManager, add an initial wait to the cert loop to wait until the domain appears in the device's netmap to avoid a guaranteed error on the first issue attempt when it's quick to start. * Made several methods in ingress-for-pg.go and svc-for-pg.go into functions to share with the new reconciler. * Added a Resource struct to the owner refs stored in Tailscale Service annotations to be able to distinguish between Ingress- and ProxyGroup-based Services that need cleaning up in the Tailscale API. * Added a ListVIPServices method to the internal tailscale client to aid cleaning up orphaned Services. * Support for reading config from a kube Secret, and partial support for config reloading, to prevent us having to force Pod restarts when config changes. * Fixed up the zap logger so it's possible to set debug log level. Updates #13358 Change-Id: Ia9607441157dd91fb9b6ecbc318eecbef446e116 Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
2025-08-21 02:17:36 +00:00 · 2025-07-21 11:03:21 +01:00
parent 5adde9e3f3
commit f421907c38
39 changed files with 2551 additions and 397 deletions
--- a/cmd/containerboot/certs.go
+++ b/cmd/containerboot/certs.go
@@ -1,156 +0,0 @@
-// Copyright (c) Tailscale Inc & AUTHORS
-// SPDX-License-Identifier: BSD-3-Clause
-
-//go:build linux
-
-package main
-
-import (
-	"context"
-	"fmt"
-	"log"
-	"net"
-	"sync"
-	"time"
-
-	"tailscale.com/ipn"
-	"tailscale.com/util/goroutines"
-	"tailscale.com/util/mak"
-)
-
-// certManager is responsible for issuing certificates for known domains and for
-// maintaining a loop that re-attempts issuance daily.
-// Currently cert manager logic is only run on ingress ProxyGroup replicas that are responsible for managing certs for
-// HA Ingress HTTPS endpoints ('write' replicas).
-type certManager struct {
-	lc      localClient
-	tracker goroutines.Tracker // tracks running goroutines
-	mu      sync.Mutex         // guards the following
-	// certLoops contains a map of DNS names, for which we currently need to
-	// manage certs to cancel functions that allow stopping a goroutine when
-	// we no longer need to manage certs for the DNS name.
-	certLoops map[string]context.CancelFunc
-}
-
-// ensureCertLoops ensures that, for all currently managed Service HTTPS
-// endpoints, there is a cert loop responsible for issuing and ensuring the
-// renewal of the TLS certs.
-// ServeConfig must not be nil.
-func (cm *certManager) ensureCertLoops(ctx context.Context, sc *ipn.ServeConfig) error {
-	if sc == nil {
-		return fmt.Errorf("[unexpected] ensureCertLoops called with nil ServeConfig")
-	}
-	currentDomains := make(map[string]bool)
-	const httpsPort = "443"
-	for _, service := range sc.Services {
-		for hostPort := range service.Web {
-			domain, port, err := net.SplitHostPort(string(hostPort))
-			if err != nil {
-				return fmt.Errorf("[unexpected] unable to parse HostPort %s", hostPort)
-			}
-			if port != httpsPort { // HA Ingress' HTTP endpoint
-				continue
-			}
-			currentDomains[domain] = true
-		}
-	}
-	cm.mu.Lock()
-	defer cm.mu.Unlock()
-	for domain := range currentDomains {
-		if _, exists := cm.certLoops[domain]; !exists {
-			cancelCtx, cancel := context.WithCancel(ctx)
-			mak.Set(&cm.certLoops, domain, cancel)
-			// Note that most of the issuance anyway happens
-			// serially because the cert client has a shared lock
-			// that's held during any issuance.
-			cm.tracker.Go(func() { cm.runCertLoop(cancelCtx, domain) })
-		}
-	}
-
-	// Stop goroutines for domain names that are no longer in the config.
-	for domain, cancel := range cm.certLoops {
-		if !currentDomains[domain] {
-			cancel()
-			delete(cm.certLoops, domain)
-		}
-	}
-	return nil
-}
-
-// runCertLoop:
-// - calls localAPI certificate endpoint to ensure that certs are issued for the
-// given domain name
-// - calls localAPI certificate endpoint daily to ensure that certs are renewed
-// - if certificate issuance failed retries after an exponential backoff period
-// starting at 1 minute and capped at 24 hours. Reset the backoff once issuance succeeds.
-// Note that renewal check also happens when the node receives an HTTPS request and it is possible that certs get
-// renewed at that point. Renewal here is needed to prevent the shared certs from expiry in edge cases where the 'write'
-// replica does not get any HTTPS requests.
-// https://letsencrypt.org/docs/integration-guide/#retrying-failures
-func (cm *certManager) runCertLoop(ctx context.Context, domain string) {
-	const (
-		normalInterval   = 24 * time.Hour  // regular renewal check
-		initialRetry     = 1 * time.Minute // initial backoff after a failure
-		maxRetryInterval = 24 * time.Hour  // max backoff period
-	)
-	timer := time.NewTimer(0) // fire off timer immediately
-	defer timer.Stop()
-	retryCount := 0
-	for {
-		select {
-		case <-ctx.Done():
-			return
-		case <-timer.C:
-			// We call the certificate endpoint, but don't do anything
-			// with the returned certs here.
-			// The call to the certificate endpoint will ensure that
-			// certs are issued/renewed as needed and stored in the
-			// relevant state store. For example, for HA Ingress
-			// 'write' replica, the cert and key will be stored in a
-			// Kubernetes Secret named after the domain for which we
-			// are issuing.
-			// Note that renewals triggered by the call to the
-			// certificates endpoint here and by renewal check
-			// triggered during a call to node's HTTPS endpoint
-			// share the same state/renewal lock mechanism, so we
-			// should not run into redundant issuances during
-			// concurrent renewal checks.
-			// TODO(irbekrm): maybe it is worth adding a new
-			// issuance endpoint that explicitly only triggers
-			// issuance and stores certs in the relevant store, but
-			// does not return certs to the caller?
-
-			// An issuance holds a shared lock, so we need to avoid
-			// a situation where other services cannot issue certs
-			// because a single one is holding the lock.
-			ctxT, cancel := context.WithTimeout(ctx, time.Second*300)
-			defer cancel()
-			_, _, err := cm.lc.CertPair(ctxT, domain)
-			if err != nil {
-				log.Printf("error refreshing certificate for %s: %v", domain, err)
-			}
-			var nextInterval time.Duration
-			// TODO(irbekrm): distinguish between LE rate limit
-			// errors and other error types like transient network
-			// errors.
-			if err == nil {
-				retryCount = 0
-				nextInterval = normalInterval
-			} else {
-				retryCount++
-				// Calculate backoff: initialRetry * 2^(retryCount-1)
-				// For retryCount=1: 1min * 2^0 = 1min
-				// For retryCount=2: 1min * 2^1 = 2min
-				// For retryCount=3: 1min * 2^2 = 4min
-				backoff := initialRetry * time.Duration(1<<(retryCount-1))
-				if backoff > maxRetryInterval {
-					backoff = maxRetryInterval
-				}
-				nextInterval = backoff
-				log.Printf("Error refreshing certificate for %s (retry %d): %v. Will retry in %v\n",
-					domain, retryCount, err, nextInterval)
-			}
-			timer.Reset(nextInterval)
-		}
-	}
-}
--- a/cmd/containerboot/certs_test.go
+++ b/cmd/containerboot/certs_test.go
@@ -1,229 +0,0 @@
-// Copyright (c) Tailscale Inc & AUTHORS
-// SPDX-License-Identifier: BSD-3-Clause
-
-//go:build linux
-
-package main
-
-import (
-	"context"
-	"testing"
-	"time"
-
-	"tailscale.com/ipn"
-	"tailscale.com/tailcfg"
-)
-
-// TestEnsureCertLoops tests that the certManager correctly starts and stops
-// update loops for certs when the serve config changes. It tracks goroutine
-// count and uses that as a validator that the expected number of cert loops are
-// running.
-func TestEnsureCertLoops(t *testing.T) {
-	tests := []struct {
-		name              string
-		initialConfig     *ipn.ServeConfig
-		updatedConfig     *ipn.ServeConfig
-		initialGoroutines int64 // after initial serve config is applied
-		updatedGoroutines int64 // after updated serve config is applied
-		wantErr           bool
-	}{
-		{
-			name:              "empty_serve_config",
-			initialConfig:     &ipn.ServeConfig{},
-			initialGoroutines: 0,
-		},
-		{
-			name:              "nil_serve_config",
-			initialConfig:     nil,
-			initialGoroutines: 0,
-			wantErr:           true,
-		},
-		{
-			name:          "empty_to_one_service",
-			initialConfig: &ipn.ServeConfig{},
-			updatedConfig: &ipn.ServeConfig{
-				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
-					"svc:my-app": {
-						Web: map[ipn.HostPort]*ipn.WebServerConfig{
-							"my-app.tailnetxyz.ts.net:443": {},
-						},
-					},
-				},
-			},
-			initialGoroutines: 0,
-			updatedGoroutines: 1,
-		},
-		{
-			name: "single_service",
-			initialConfig: &ipn.ServeConfig{
-				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
-					"svc:my-app": {
-						Web: map[ipn.HostPort]*ipn.WebServerConfig{
-							"my-app.tailnetxyz.ts.net:443": {},
-						},
-					},
-				},
-			},
-			initialGoroutines: 1,
-		},
-		{
-			name: "multiple_services",
-			initialConfig: &ipn.ServeConfig{
-				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
-					"svc:my-app": {
-						Web: map[ipn.HostPort]*ipn.WebServerConfig{
-							"my-app.tailnetxyz.ts.net:443": {},
-						},
-					},
-					"svc:my-other-app": {
-						Web: map[ipn.HostPort]*ipn.WebServerConfig{
-							"my-other-app.tailnetxyz.ts.net:443": {},
-						},
-					},
-				},
-			},
-			initialGoroutines: 2, // one loop per domain across all services
-		},
-		{
-			name: "ignore_non_https_ports",
-			initialConfig: &ipn.ServeConfig{
-				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
-					"svc:my-app": {
-						Web: map[ipn.HostPort]*ipn.WebServerConfig{
-							"my-app.tailnetxyz.ts.net:443": {},
-							"my-app.tailnetxyz.ts.net:80":  {},
-						},
-					},
-				},
-			},
-			initialGoroutines: 1, // only one loop for the 443 endpoint
-		},
-		{
-			name: "remove_domain",
-			initialConfig: &ipn.ServeConfig{
-				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
-					"svc:my-app": {
-						Web: map[ipn.HostPort]*ipn.WebServerConfig{
-							"my-app.tailnetxyz.ts.net:443": {},
-						},
-					},
-					"svc:my-other-app": {
-						Web: map[ipn.HostPort]*ipn.WebServerConfig{
-							"my-other-app.tailnetxyz.ts.net:443": {},
-						},
-					},
-				},
-			},
-			updatedConfig: &ipn.ServeConfig{
-				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
-					"svc:my-app": {
-						Web: map[ipn.HostPort]*ipn.WebServerConfig{
-							"my-app.tailnetxyz.ts.net:443": {},
-						},
-					},
-				},
-			},
-			initialGoroutines: 2, // initially two loops (one per service)
-			updatedGoroutines: 1, // one loop after removing service2
-		},
-		{
-			name: "add_domain",
-			initialConfig: &ipn.ServeConfig{
-				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
-					"svc:my-app": {
-						Web: map[ipn.HostPort]*ipn.WebServerConfig{
-							"my-app.tailnetxyz.ts.net:443": {},
-						},
-					},
-				},
-			},
-			updatedConfig: &ipn.ServeConfig{
-				Services: map[tailcfg.ServiceName]*ipn.ServiceConfig{
-					"svc:my-app": {
-						Web: map[ipn.HostPort]*ipn.WebServerConfig{
-							"my-app.tailnetxyz.ts.net:443": {},
-						},
-					},
-					"svc:my-other-app": {
-						Web: map[ipn.HostPort]*ipn.WebServerConfig{
-							"my-other-app.tailnetxyz.ts.net:443": {},
-						},
-					},
-				},
-			},
-			initialGoroutines: 1,
-			updatedGoroutines: 2,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			ctx, cancel := context.WithCancel(context.Background())
-			defer cancel()
-
-			cm := &certManager{
-				lc:        &fakeLocalClient{},
-				certLoops: make(map[string]context.CancelFunc),
-			}
-
-			allDone := make(chan bool, 1)
-			defer cm.tracker.AddDoneCallback(func() {
-				cm.mu.Lock()
-				defer cm.mu.Unlock()
-				if cm.tracker.RunningGoroutines() > 0 {
-					return
-				}
-				select {
-				case allDone <- true:
-				default:
-				}
-			})()
-
-			err := cm.ensureCertLoops(ctx, tt.initialConfig)
-			if (err != nil) != tt.wantErr {
-				t.Fatalf("ensureCertLoops() error = %v", err)
-			}
-
-			if got := cm.tracker.RunningGoroutines(); got != tt.initialGoroutines {
-				t.Errorf("after initial config: got %d running goroutines, want %d", got, tt.initialGoroutines)
-			}
-
-			if tt.updatedConfig != nil {
-				if err := cm.ensureCertLoops(ctx, tt.updatedConfig); err != nil {
-					t.Fatalf("ensureCertLoops() error on update = %v", err)
-				}
-
-				// Although starting goroutines and cancelling
-				// the context happens in the main goroutine, it
-				// the actual goroutine exit when a context is
-				// cancelled does not- so wait for a bit for the
-				// running goroutine count to reach the expected
-				// number.
-				deadline := time.After(5 * time.Second)
-				for {
-					if got := cm.tracker.RunningGoroutines(); got == tt.updatedGoroutines {
-						break
-					}
-					select {
-					case <-deadline:
-						t.Fatalf("timed out waiting for goroutine count to reach %d, currently at %d",
-							tt.updatedGoroutines, cm.tracker.RunningGoroutines())
-					case <-time.After(10 * time.Millisecond):
-						continue
-					}
-				}
-			}
-
-			if tt.updatedGoroutines == 0 {
-				return // no goroutines to wait for
-			}
-			// cancel context to make goroutines exit
-			cancel()
-			select {
-			case <-time.After(5 * time.Second):
-				t.Fatal("timed out waiting for goroutine to finish")
-			case <-allDone:
-			}
-		})
-	}
-}
--- a/cmd/containerboot/main.go
+++ b/cmd/containerboot/main.go
@@ -122,6 +122,7 @@ import (
 	"tailscale.com/ipn"
 	kubeutils "tailscale.com/k8s-operator"
 	"tailscale.com/kube/kubetypes"
+	"tailscale.com/kube/services"
 	"tailscale.com/tailcfg"
 	"tailscale.com/types/logger"
 	"tailscale.com/types/ptr"
@@ -210,7 +211,7 @@ func run() error {
 		ctx, cancel := context.WithTimeout(context.Background(), 25*time.Second)
 		defer cancel()

-		if err := ensureServicesNotAdvertised(ctx, client); err != nil {
+		if err := services.EnsureServicesNotAdvertised(ctx, client, log.Printf); err != nil {
 			log.Printf("Error ensuring services are not advertised: %v", err)
 		}

--- a/cmd/containerboot/serve.go
+++ b/cmd/containerboot/serve.go
@@ -19,7 +19,9 @@ import (
 	"github.com/fsnotify/fsnotify"
 	"tailscale.com/client/local"
 	"tailscale.com/ipn"
+	"tailscale.com/kube/certs"
 	"tailscale.com/kube/kubetypes"
+	klc "tailscale.com/kube/localclient"
 	"tailscale.com/types/netmap"
 )

@@ -52,11 +54,9 @@ func watchServeConfigChanges(ctx context.Context, cdChanged <-chan bool, certDom

 	var certDomain string
 	var prevServeConfig *ipn.ServeConfig
-	var cm certManager
+	var cm *certs.CertManager
 	if cfg.CertShareMode == "rw" {
-		cm = certManager{
-			lc: lc,
-		}
+		cm = certs.NewCertManager(klc.New(lc), log.Printf)
 	}
 	for {
 		select {
@@ -93,7 +93,7 @@ func watchServeConfigChanges(ctx context.Context, cdChanged <-chan bool, certDom
 		if cfg.CertShareMode != "rw" {
 			continue
 		}
-		if err := cm.ensureCertLoops(ctx, sc); err != nil {
+		if err := cm.EnsureCertLoops(ctx, sc); err != nil {
 			log.Fatalf("serve proxy: error ensuring cert loops: %v", err)
 		}
 	}
--- a/cmd/containerboot/services.go
+++ b/cmd/containerboot/services.go
@@ -1,63 +0,0 @@
-// Copyright (c) Tailscale Inc & AUTHORS
-// SPDX-License-Identifier: BSD-3-Clause
-
-//go:build linux
-
-package main
-
-import (
-	"context"
-	"fmt"
-	"log"
-	"time"
-
-	"tailscale.com/client/local"
-	"tailscale.com/ipn"
-)
-
-// ensureServicesNotAdvertised is a function that gets called on containerboot
-// termination and ensures that any currently advertised VIPServices get
-// unadvertised to give clients time to switch to another node before this one
-// is shut down.
-func ensureServicesNotAdvertised(ctx context.Context, lc *local.Client) error {
-	prefs, err := lc.GetPrefs(ctx)
-	if err != nil {
-		return fmt.Errorf("error getting prefs: %w", err)
-	}
-	if len(prefs.AdvertiseServices) == 0 {
-		return nil
-	}
-
-	log.Printf("unadvertising services: %v", prefs.AdvertiseServices)
-	if _, err := lc.EditPrefs(ctx, &ipn.MaskedPrefs{
-		AdvertiseServicesSet: true,
-		Prefs: ipn.Prefs{
-			AdvertiseServices: nil,
-		},
-	}); err != nil {
-		// EditPrefs only returns an error if it fails _set_ its local prefs.
-		// If it fails to _persist_ the prefs in state, we don't get an error
-		// and we continue waiting below, as control will failover as usual.
-		return fmt.Errorf("error setting prefs AdvertiseServices: %w", err)
-	}
-
-	// Services use the same (failover XOR regional routing) mechanism that
-	// HA subnet routers use. Unfortunately we don't yet get a reliable signal
-	// from control that it's responded to our unadvertisement, so the best we
-	// can do is wait for 20 seconds, where 15s is the approximate maximum time
-	// it should take for control to choose a new primary, and 5s is for buffer.
-	//
-	// Note: There is no guarantee that clients have been _informed_ of the new
-	// primary no matter how long we wait. We would need a mechanism to await
-	// netmap updates for peers to know for sure.
-	//
-	// See https://tailscale.com/kb/1115/high-availability for more details.
-	// TODO(tomhjp): Wait for a netmap update instead of sleeping when control
-	// supports that.
-	select {
-	case <-ctx.Done():
-		return nil
-	case <-time.After(20 * time.Second):
-		return nil
-	}
-}