From 44947054967e3eda476c92206e0a14fd1ffc4ec0 Mon Sep 17 00:00:00 2001 From: David Bond Date: Tue, 22 Jul 2025 17:07:51 +0100 Subject: [PATCH] cmd/{k8s-proxy,containerboot,k8s-operator},kube: add health check and metrics endpoints for k8s-proxy (#16540) * Modifies the k8s-proxy to expose health check and metrics endpoints on the Pod's IP. * Moves cmd/containerboot/healthz.go and cmd/containerboot/metrics.go to /kube to be shared with /k8s-proxy. Updates #13358 Signed-off-by: David Bond --- cmd/containerboot/healthz.go | 57 ------------- cmd/containerboot/main.go | 16 ++-- cmd/k8s-operator/proxygroup.go | 8 +- cmd/k8s-operator/proxygroup_test.go | 2 + cmd/k8s-proxy/k8s-proxy.go | 67 +++++++++++++-- kube/health/healthz.go | 84 +++++++++++++++++++ kube/k8s-proxy/conf/conf.go | 36 ++++++-- .../containerboot => kube/metrics}/metrics.go | 8 +- 8 files changed, 196 insertions(+), 82 deletions(-) delete mode 100644 cmd/containerboot/healthz.go create mode 100644 kube/health/healthz.go rename {cmd/containerboot => kube/metrics}/metrics.go (90%) diff --git a/cmd/containerboot/healthz.go b/cmd/containerboot/healthz.go deleted file mode 100644 index d6a64a37c..000000000 --- a/cmd/containerboot/healthz.go +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) Tailscale Inc & AUTHORS -// SPDX-License-Identifier: BSD-3-Clause - -//go:build linux - -package main - -import ( - "fmt" - "log" - "net/http" - "sync" - - "tailscale.com/kube/kubetypes" -) - -// healthz is a simple health check server, if enabled it returns 200 OK if -// this tailscale node currently has at least one tailnet IP address else -// returns 503. -type healthz struct { - sync.Mutex - hasAddrs bool - podIPv4 string -} - -func (h *healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) { - h.Lock() - defer h.Unlock() - - if h.hasAddrs { - w.Header().Add(kubetypes.PodIPv4Header, h.podIPv4) - if _, err := w.Write([]byte("ok")); err != nil { - http.Error(w, fmt.Sprintf("error writing status: %v", err), http.StatusInternalServerError) - } - } else { - http.Error(w, "node currently has no tailscale IPs", http.StatusServiceUnavailable) - } -} - -func (h *healthz) update(healthy bool) { - h.Lock() - defer h.Unlock() - - if h.hasAddrs != healthy { - log.Println("Setting healthy", healthy) - } - h.hasAddrs = healthy -} - -// registerHealthHandlers registers a simple health handler at /healthz. -// A containerized tailscale instance is considered healthy if -// it has at least one tailnet IP address. -func registerHealthHandlers(mux *http.ServeMux, podIPv4 string) *healthz { - h := &healthz{podIPv4: podIPv4} - mux.Handle("GET /healthz", h) - return h -} diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go index 49c8a473a..f056d26f3 100644 --- a/cmd/containerboot/main.go +++ b/cmd/containerboot/main.go @@ -121,7 +121,9 @@ import ( "tailscale.com/client/tailscale" "tailscale.com/ipn" kubeutils "tailscale.com/k8s-operator" + healthz "tailscale.com/kube/health" "tailscale.com/kube/kubetypes" + "tailscale.com/kube/metrics" "tailscale.com/kube/services" "tailscale.com/tailcfg" "tailscale.com/types/logger" @@ -232,13 +234,13 @@ func run() error { } defer killTailscaled() - var healthCheck *healthz + var healthCheck *healthz.Healthz ep := &egressProxy{} if cfg.HealthCheckAddrPort != "" { mux := http.NewServeMux() log.Printf("Running healthcheck endpoint at %s/healthz", cfg.HealthCheckAddrPort) - healthCheck = registerHealthHandlers(mux, cfg.PodIPv4) + healthCheck = healthz.RegisterHealthHandlers(mux, cfg.PodIPv4, log.Printf) close := runHTTPServer(mux, cfg.HealthCheckAddrPort) defer close() @@ -249,12 +251,12 @@ func run() error { if cfg.localMetricsEnabled() { log.Printf("Running metrics endpoint at %s/metrics", cfg.LocalAddrPort) - registerMetricsHandlers(mux, client, cfg.DebugAddrPort) + metrics.RegisterMetricsHandlers(mux, client, cfg.DebugAddrPort) } if cfg.localHealthEnabled() { log.Printf("Running healthcheck endpoint at %s/healthz", cfg.LocalAddrPort) - healthCheck = registerHealthHandlers(mux, cfg.PodIPv4) + healthCheck = healthz.RegisterHealthHandlers(mux, cfg.PodIPv4, log.Printf) } if cfg.egressSvcsTerminateEPEnabled() { @@ -438,8 +440,8 @@ authLoop: ) // egressSvcsErrorChan will get an error sent to it if this containerboot instance is configured to expose 1+ // egress services in HA mode and errored. - var egressSvcsErrorChan = make(chan error) - var ingressSvcsErrorChan = make(chan error) + egressSvcsErrorChan := make(chan error) + ingressSvcsErrorChan := make(chan error) defer t.Stop() // resetTimer resets timer for when to next attempt to resolve the DNS // name for the proxy configured with TS_EXPERIMENTAL_DEST_DNS_NAME. The @@ -644,7 +646,7 @@ runLoop: } if healthCheck != nil { - healthCheck.update(len(addrs) != 0) + healthCheck.Update(len(addrs) != 0) } if cfg.ServeConfigPath != "" { diff --git a/cmd/k8s-operator/proxygroup.go b/cmd/k8s-operator/proxygroup.go index f9c12797d..debeb5c6b 100644 --- a/cmd/k8s-operator/proxygroup.go +++ b/cmd/k8s-operator/proxygroup.go @@ -826,6 +826,8 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p // as containerboot does for ingress-pg-reconciler. IssueCerts: opt.NewBool(i == 0), }, + LocalPort: ptr.To(uint16(9002)), + HealthCheckEnabled: opt.NewBool(true), }, } @@ -849,7 +851,11 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p } if proxyClass != nil && proxyClass.Spec.TailscaleConfig != nil { - cfg.AcceptRoutes = &proxyClass.Spec.TailscaleConfig.AcceptRoutes + cfg.AcceptRoutes = opt.NewBool(proxyClass.Spec.TailscaleConfig.AcceptRoutes) + } + + if proxyClass != nil && proxyClass.Spec.Metrics != nil { + cfg.MetricsEnabled = opt.NewBool(proxyClass.Spec.Metrics.Enable) } if len(endpoints[nodePortSvcName]) > 0 { diff --git a/cmd/k8s-operator/proxygroup_test.go b/cmd/k8s-operator/proxygroup_test.go index 0dc791b04..d763cf922 100644 --- a/cmd/k8s-operator/proxygroup_test.go +++ b/cmd/k8s-operator/proxygroup_test.go @@ -1379,6 +1379,8 @@ func TestKubeAPIServerType_DoesNotOverwriteServicesConfig(t *testing.T) { Mode: ptr.To(kubetypes.APIServerProxyModeNoAuth), IssueCerts: opt.NewBool(true), }, + LocalPort: ptr.To(uint16(9002)), + HealthCheckEnabled: opt.NewBool(true), }, } cfgB, err := json.Marshal(cfg) diff --git a/cmd/k8s-proxy/k8s-proxy.go b/cmd/k8s-proxy/k8s-proxy.go index b56ceaab0..448bbe397 100644 --- a/cmd/k8s-proxy/k8s-proxy.go +++ b/cmd/k8s-proxy/k8s-proxy.go @@ -12,9 +12,12 @@ import ( "context" "errors" "fmt" + "net" + "net/http" "os" "os/signal" "reflect" + "strconv" "strings" "syscall" "time" @@ -33,9 +36,11 @@ import ( "tailscale.com/ipn/store" apiproxy "tailscale.com/k8s-operator/api-proxy" "tailscale.com/kube/certs" + healthz "tailscale.com/kube/health" "tailscale.com/kube/k8s-proxy/conf" "tailscale.com/kube/kubetypes" klc "tailscale.com/kube/localclient" + "tailscale.com/kube/metrics" "tailscale.com/kube/services" "tailscale.com/kube/state" "tailscale.com/tailcfg" @@ -63,6 +68,7 @@ func run(logger *zap.SugaredLogger) error { var ( configPath = os.Getenv("TS_K8S_PROXY_CONFIG") podUID = os.Getenv("POD_UID") + podIP = os.Getenv("POD_IP") ) if configPath == "" { return errors.New("TS_K8S_PROXY_CONFIG unset") @@ -201,10 +207,57 @@ func run(logger *zap.SugaredLogger) error { }) } - if cfg.Parsed.AcceptRoutes != nil { + if cfg.Parsed.HealthCheckEnabled.EqualBool(true) || cfg.Parsed.MetricsEnabled.EqualBool(true) { + addr := podIP + if addr == "" { + addr = cfg.GetLocalAddr() + } + + addrPort := getLocalAddrPort(addr, cfg.GetLocalPort()) + mux := http.NewServeMux() + localSrv := &http.Server{Addr: addrPort, Handler: mux} + + if cfg.Parsed.MetricsEnabled.EqualBool(true) { + logger.Infof("Running metrics endpoint at %s/metrics", addrPort) + metrics.RegisterMetricsHandlers(mux, lc, "") + } + + if cfg.Parsed.HealthCheckEnabled.EqualBool(true) { + ipV4, _ := ts.TailscaleIPs() + hz := healthz.RegisterHealthHandlers(mux, ipV4.String(), logger.Infof) + group.Go(func() error { + err := hz.MonitorHealth(ctx, lc) + if err == nil || errors.Is(err, context.Canceled) { + return nil + } + return err + }) + } + + group.Go(func() error { + errChan := make(chan error) + go func() { + if err := localSrv.ListenAndServe(); err != nil { + errChan <- err + } + close(errChan) + }() + + select { + case <-ctx.Done(): + sCtx, scancel := context.WithTimeout(serveCtx, 10*time.Second) + defer scancel() + return localSrv.Shutdown(sCtx) + case err := <-errChan: + return err + } + }) + } + + if v, ok := cfg.Parsed.AcceptRoutes.Get(); ok { _, err = lc.EditPrefs(ctx, &ipn.MaskedPrefs{ RouteAllSet: true, - Prefs: ipn.Prefs{RouteAll: *cfg.Parsed.AcceptRoutes}, + Prefs: ipn.Prefs{RouteAll: v}, }) if err != nil { return fmt.Errorf("error editing prefs: %w", err) @@ -285,10 +338,10 @@ func run(logger *zap.SugaredLogger) error { prefs.HostnameSet = true prefs.Hostname = *cfg.Parsed.Hostname } - if cfg.Parsed.AcceptRoutes != nil && *cfg.Parsed.AcceptRoutes != currentPrefs.RouteAll { - cfgLogger = cfgLogger.With("AcceptRoutes", fmt.Sprintf("%v -> %v", currentPrefs.RouteAll, *cfg.Parsed.AcceptRoutes)) + if v, ok := cfg.Parsed.AcceptRoutes.Get(); ok && v != currentPrefs.RouteAll { + cfgLogger = cfgLogger.With("AcceptRoutes", fmt.Sprintf("%v -> %v", currentPrefs.RouteAll, v)) prefs.RouteAllSet = true - prefs.Prefs.RouteAll = *cfg.Parsed.AcceptRoutes + prefs.Prefs.RouteAll = v } if !prefs.IsEmpty() { if _, err := lc.EditPrefs(ctx, &prefs); err != nil { @@ -304,6 +357,10 @@ func run(logger *zap.SugaredLogger) error { } } +func getLocalAddrPort(addr string, port uint16) string { + return net.JoinHostPort(addr, strconv.FormatUint(uint64(port), 10)) +} + func getStateStore(path *string, logger *zap.SugaredLogger) (ipn.StateStore, error) { p := "mem:" if path != nil { diff --git a/kube/health/healthz.go b/kube/health/healthz.go new file mode 100644 index 000000000..c8cfcc7ec --- /dev/null +++ b/kube/health/healthz.go @@ -0,0 +1,84 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +//go:build !plan9 + +// Package health contains shared types and underlying methods for serving +// a `/healthz` endpoint for containerboot and k8s-proxy. +package health + +import ( + "context" + "fmt" + "net/http" + "sync" + + "tailscale.com/client/local" + "tailscale.com/ipn" + "tailscale.com/kube/kubetypes" + "tailscale.com/types/logger" +) + +// Healthz is a simple health check server, if enabled it returns 200 OK if +// this tailscale node currently has at least one tailnet IP address else +// returns 503. +type Healthz struct { + sync.Mutex + hasAddrs bool + podIPv4 string + logger logger.Logf +} + +func (h *Healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) { + h.Lock() + defer h.Unlock() + + if h.hasAddrs { + w.Header().Add(kubetypes.PodIPv4Header, h.podIPv4) + if _, err := w.Write([]byte("ok")); err != nil { + http.Error(w, fmt.Sprintf("error writing status: %v", err), http.StatusInternalServerError) + } + } else { + http.Error(w, "node currently has no tailscale IPs", http.StatusServiceUnavailable) + } +} + +func (h *Healthz) Update(healthy bool) { + h.Lock() + defer h.Unlock() + + if h.hasAddrs != healthy { + h.logger("Setting healthy %v", healthy) + } + h.hasAddrs = healthy +} + +func (h *Healthz) MonitorHealth(ctx context.Context, lc *local.Client) error { + w, err := lc.WatchIPNBus(ctx, ipn.NotifyInitialNetMap) + if err != nil { + return fmt.Errorf("failed to watch IPN bus: %w", err) + } + + for { + n, err := w.Next() + if err != nil { + return err + } + + if n.NetMap != nil { + h.Update(n.NetMap.SelfNode.Addresses().Len() != 0) + } + } +} + +// RegisterHealthHandlers registers a simple health handler at /healthz. +// A containerized tailscale instance is considered healthy if +// it has at least one tailnet IP address. +func RegisterHealthHandlers(mux *http.ServeMux, podIPv4 string, logger logger.Logf) *Healthz { + h := &Healthz{ + podIPv4: podIPv4, + logger: logger, + } + mux.Handle("GET /healthz", h) + return h +} diff --git a/kube/k8s-proxy/conf/conf.go b/kube/k8s-proxy/conf/conf.go index fdb6301ac..529495243 100644 --- a/kube/k8s-proxy/conf/conf.go +++ b/kube/k8s-proxy/conf/conf.go @@ -49,21 +49,23 @@ type VersionedConfig struct { } type ConfigV1Alpha1 struct { - AuthKey *string `json:",omitempty"` // Tailscale auth key to use. - State *string `json:",omitempty"` // Path to the Tailscale state. - LogLevel *string `json:",omitempty"` // "debug", "info". Defaults to "info". - App *string `json:",omitempty"` // e.g. kubetypes.AppProxyGroupKubeAPIServer - ServerURL *string `json:",omitempty"` // URL of the Tailscale coordination server. - // StaticEndpoints are additional, user-defined endpoints that this node - // should advertise amongst its wireguard endpoints. - StaticEndpoints []netip.AddrPort `json:",omitempty"` + AuthKey *string `json:",omitempty"` // Tailscale auth key to use. + State *string `json:",omitempty"` // Path to the Tailscale state. + LogLevel *string `json:",omitempty"` // "debug", "info". Defaults to "info". + App *string `json:",omitempty"` // e.g. kubetypes.AppProxyGroupKubeAPIServer + ServerURL *string `json:",omitempty"` // URL of the Tailscale coordination server. + LocalAddr *string `json:",omitempty"` // The address to use for serving HTTP health checks and metrics (defaults to all interfaces). + LocalPort *uint16 `json:",omitempty"` // The port to use for serving HTTP health checks and metrics (defaults to 9002). + MetricsEnabled opt.Bool `json:",omitempty"` // Serve metrics on :/metrics. + HealthCheckEnabled opt.Bool `json:",omitempty"` // Serve health check on :/metrics. // TODO(tomhjp): The remaining fields should all be reloadable during // runtime, but currently missing most of the APIServerProxy fields. Hostname *string `json:",omitempty"` // Tailscale device hostname. - AcceptRoutes *bool `json:",omitempty"` // Accepts routes advertised by other Tailscale nodes. + AcceptRoutes opt.Bool `json:",omitempty"` // Accepts routes advertised by other Tailscale nodes. AdvertiseServices []string `json:",omitempty"` // Tailscale Services to advertise. APIServerProxy *APIServerProxyConfig `json:",omitempty"` // Config specific to the API Server proxy. + StaticEndpoints []netip.AddrPort `json:",omitempty"` // StaticEndpoints are additional, user-defined endpoints that this node should advertise amongst its wireguard endpoints. } type APIServerProxyConfig struct { @@ -108,3 +110,19 @@ func Load(raw []byte) (c Config, err error) { return c, nil } + +func (c *Config) GetLocalAddr() string { + if c.Parsed.LocalAddr == nil { + return "[::]" + } + + return *c.Parsed.LocalAddr +} + +func (c *Config) GetLocalPort() uint16 { + if c.Parsed.LocalPort == nil { + return uint16(9002) + } + + return *c.Parsed.LocalPort +} diff --git a/cmd/containerboot/metrics.go b/kube/metrics/metrics.go similarity index 90% rename from cmd/containerboot/metrics.go rename to kube/metrics/metrics.go index bbd050de6..0db683008 100644 --- a/cmd/containerboot/metrics.go +++ b/kube/metrics/metrics.go @@ -1,9 +1,11 @@ // Copyright (c) Tailscale Inc & AUTHORS // SPDX-License-Identifier: BSD-3-Clause -//go:build linux +//go:build !plan9 -package main +// Package metrics contains shared types and underlying methods for serving +// localapi metrics. This is primarily consumed by containerboot and k8s-proxy. +package metrics import ( "fmt" @@ -68,7 +70,7 @@ func (m *metrics) handleDebug(w http.ResponseWriter, r *http.Request) { // In 1.78.x and 1.80.x, it also proxies debug paths to tailscaled's debug // endpoint if configured to ease migration for a breaking change serving user // metrics instead of debug metrics on the "metrics" port. -func registerMetricsHandlers(mux *http.ServeMux, lc *local.Client, debugAddrPort string) { +func RegisterMetricsHandlers(mux *http.ServeMux, lc *local.Client, debugAddrPort string) { m := &metrics{ lc: lc, debugEndpoint: debugAddrPort,