cmd/{k8s-proxy,containerboot,k8s-operator},kube: add health check and metrics endpoints for k8s-proxy (#16540)

* Modifies the k8s-proxy to expose health check and metrics
endpoints on the Pod's IP.

* Moves cmd/containerboot/healthz.go and cmd/containerboot/metrics.go to
  /kube to be shared with /k8s-proxy.

Updates #13358

Signed-off-by: David Bond <davidsbond93@gmail.com>
This commit is contained in:
David Bond 2025-07-22 17:07:51 +01:00 committed by GitHub
parent 22a8e0ac50
commit 4494705496
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 196 additions and 82 deletions

View File

@ -1,57 +0,0 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
//go:build linux
package main
import (
"fmt"
"log"
"net/http"
"sync"
"tailscale.com/kube/kubetypes"
)
// healthz backs the /healthz endpoint. The endpoint answers 200 OK while
// this tailscale node is known to have at least one tailnet IP address and
// 503 otherwise.
type healthz struct {
	// Mutex is held by ServeHTTP and update while they access hasAddrs.
	sync.Mutex

	// hasAddrs is the most recent health state recorded by update.
	hasAddrs bool
	// podIPv4 is sent back to the caller in the kubetypes.PodIPv4Header
	// response header on healthy responses.
	podIPv4 string
}
// ServeHTTP answers a health probe. While the node is marked healthy it
// replies "ok" and sets the kubetypes.PodIPv4Header response header to the
// configured Pod IPv4 address; otherwise it replies 503 Service Unavailable.
func (h *healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	h.Lock()
	defer h.Unlock()

	// Unhealthy: bail out early with a 503.
	if !h.hasAddrs {
		http.Error(w, "node currently has no tailscale IPs", http.StatusServiceUnavailable)
		return
	}

	w.Header().Add(kubetypes.PodIPv4Header, h.podIPv4)
	_, err := w.Write([]byte("ok"))
	if err == nil {
		return
	}
	http.Error(w, fmt.Sprintf("error writing status: %v", err), http.StatusInternalServerError)
}
// update records the latest health state and logs a line whenever the state
// differs from the previously recorded one.
func (h *healthz) update(healthy bool) {
	h.Lock()
	defer h.Unlock()

	changed := h.hasAddrs != healthy
	h.hasAddrs = healthy
	if changed {
		log.Println("Setting healthy", healthy)
	}
}
// registerHealthHandlers installs the health check handler on mux under
// "GET /healthz" and returns it so that the caller can feed it state changes
// via update. A containerized tailscale instance counts as healthy when it
// has at least one tailnet IP address.
func registerHealthHandlers(mux *http.ServeMux, podIPv4 string) *healthz {
	handler := new(healthz)
	handler.podIPv4 = podIPv4
	mux.Handle("GET /healthz", handler)
	return handler
}

View File

@ -121,7 +121,9 @@ import (
"tailscale.com/client/tailscale" "tailscale.com/client/tailscale"
"tailscale.com/ipn" "tailscale.com/ipn"
kubeutils "tailscale.com/k8s-operator" kubeutils "tailscale.com/k8s-operator"
healthz "tailscale.com/kube/health"
"tailscale.com/kube/kubetypes" "tailscale.com/kube/kubetypes"
"tailscale.com/kube/metrics"
"tailscale.com/kube/services" "tailscale.com/kube/services"
"tailscale.com/tailcfg" "tailscale.com/tailcfg"
"tailscale.com/types/logger" "tailscale.com/types/logger"
@ -232,13 +234,13 @@ func run() error {
} }
defer killTailscaled() defer killTailscaled()
var healthCheck *healthz var healthCheck *healthz.Healthz
ep := &egressProxy{} ep := &egressProxy{}
if cfg.HealthCheckAddrPort != "" { if cfg.HealthCheckAddrPort != "" {
mux := http.NewServeMux() mux := http.NewServeMux()
log.Printf("Running healthcheck endpoint at %s/healthz", cfg.HealthCheckAddrPort) log.Printf("Running healthcheck endpoint at %s/healthz", cfg.HealthCheckAddrPort)
healthCheck = registerHealthHandlers(mux, cfg.PodIPv4) healthCheck = healthz.RegisterHealthHandlers(mux, cfg.PodIPv4, log.Printf)
close := runHTTPServer(mux, cfg.HealthCheckAddrPort) close := runHTTPServer(mux, cfg.HealthCheckAddrPort)
defer close() defer close()
@ -249,12 +251,12 @@ func run() error {
if cfg.localMetricsEnabled() { if cfg.localMetricsEnabled() {
log.Printf("Running metrics endpoint at %s/metrics", cfg.LocalAddrPort) log.Printf("Running metrics endpoint at %s/metrics", cfg.LocalAddrPort)
registerMetricsHandlers(mux, client, cfg.DebugAddrPort) metrics.RegisterMetricsHandlers(mux, client, cfg.DebugAddrPort)
} }
if cfg.localHealthEnabled() { if cfg.localHealthEnabled() {
log.Printf("Running healthcheck endpoint at %s/healthz", cfg.LocalAddrPort) log.Printf("Running healthcheck endpoint at %s/healthz", cfg.LocalAddrPort)
healthCheck = registerHealthHandlers(mux, cfg.PodIPv4) healthCheck = healthz.RegisterHealthHandlers(mux, cfg.PodIPv4, log.Printf)
} }
if cfg.egressSvcsTerminateEPEnabled() { if cfg.egressSvcsTerminateEPEnabled() {
@ -438,8 +440,8 @@ authLoop:
) )
// egressSvcsErrorChan will get an error sent to it if this containerboot instance is configured to expose 1+ // egressSvcsErrorChan will get an error sent to it if this containerboot instance is configured to expose 1+
// egress services in HA mode and errored. // egress services in HA mode and errored.
var egressSvcsErrorChan = make(chan error) egressSvcsErrorChan := make(chan error)
var ingressSvcsErrorChan = make(chan error) ingressSvcsErrorChan := make(chan error)
defer t.Stop() defer t.Stop()
// resetTimer resets timer for when to next attempt to resolve the DNS // resetTimer resets timer for when to next attempt to resolve the DNS
// name for the proxy configured with TS_EXPERIMENTAL_DEST_DNS_NAME. The // name for the proxy configured with TS_EXPERIMENTAL_DEST_DNS_NAME. The
@ -644,7 +646,7 @@ runLoop:
} }
if healthCheck != nil { if healthCheck != nil {
healthCheck.update(len(addrs) != 0) healthCheck.Update(len(addrs) != 0)
} }
if cfg.ServeConfigPath != "" { if cfg.ServeConfigPath != "" {

View File

@ -826,6 +826,8 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
// as containerboot does for ingress-pg-reconciler. // as containerboot does for ingress-pg-reconciler.
IssueCerts: opt.NewBool(i == 0), IssueCerts: opt.NewBool(i == 0),
}, },
LocalPort: ptr.To(uint16(9002)),
HealthCheckEnabled: opt.NewBool(true),
}, },
} }
@ -849,7 +851,11 @@ func (r *ProxyGroupReconciler) ensureConfigSecretsCreated(ctx context.Context, p
} }
if proxyClass != nil && proxyClass.Spec.TailscaleConfig != nil { if proxyClass != nil && proxyClass.Spec.TailscaleConfig != nil {
cfg.AcceptRoutes = &proxyClass.Spec.TailscaleConfig.AcceptRoutes cfg.AcceptRoutes = opt.NewBool(proxyClass.Spec.TailscaleConfig.AcceptRoutes)
}
if proxyClass != nil && proxyClass.Spec.Metrics != nil {
cfg.MetricsEnabled = opt.NewBool(proxyClass.Spec.Metrics.Enable)
} }
if len(endpoints[nodePortSvcName]) > 0 { if len(endpoints[nodePortSvcName]) > 0 {

View File

@ -1379,6 +1379,8 @@ func TestKubeAPIServerType_DoesNotOverwriteServicesConfig(t *testing.T) {
Mode: ptr.To(kubetypes.APIServerProxyModeNoAuth), Mode: ptr.To(kubetypes.APIServerProxyModeNoAuth),
IssueCerts: opt.NewBool(true), IssueCerts: opt.NewBool(true),
}, },
LocalPort: ptr.To(uint16(9002)),
HealthCheckEnabled: opt.NewBool(true),
}, },
} }
cfgB, err := json.Marshal(cfg) cfgB, err := json.Marshal(cfg)

View File

@ -12,9 +12,12 @@ import (
"context" "context"
"errors" "errors"
"fmt" "fmt"
"net"
"net/http"
"os" "os"
"os/signal" "os/signal"
"reflect" "reflect"
"strconv"
"strings" "strings"
"syscall" "syscall"
"time" "time"
@ -33,9 +36,11 @@ import (
"tailscale.com/ipn/store" "tailscale.com/ipn/store"
apiproxy "tailscale.com/k8s-operator/api-proxy" apiproxy "tailscale.com/k8s-operator/api-proxy"
"tailscale.com/kube/certs" "tailscale.com/kube/certs"
healthz "tailscale.com/kube/health"
"tailscale.com/kube/k8s-proxy/conf" "tailscale.com/kube/k8s-proxy/conf"
"tailscale.com/kube/kubetypes" "tailscale.com/kube/kubetypes"
klc "tailscale.com/kube/localclient" klc "tailscale.com/kube/localclient"
"tailscale.com/kube/metrics"
"tailscale.com/kube/services" "tailscale.com/kube/services"
"tailscale.com/kube/state" "tailscale.com/kube/state"
"tailscale.com/tailcfg" "tailscale.com/tailcfg"
@ -63,6 +68,7 @@ func run(logger *zap.SugaredLogger) error {
var ( var (
configPath = os.Getenv("TS_K8S_PROXY_CONFIG") configPath = os.Getenv("TS_K8S_PROXY_CONFIG")
podUID = os.Getenv("POD_UID") podUID = os.Getenv("POD_UID")
podIP = os.Getenv("POD_IP")
) )
if configPath == "" { if configPath == "" {
return errors.New("TS_K8S_PROXY_CONFIG unset") return errors.New("TS_K8S_PROXY_CONFIG unset")
@ -201,10 +207,57 @@ func run(logger *zap.SugaredLogger) error {
}) })
} }
if cfg.Parsed.AcceptRoutes != nil { if cfg.Parsed.HealthCheckEnabled.EqualBool(true) || cfg.Parsed.MetricsEnabled.EqualBool(true) {
addr := podIP
if addr == "" {
addr = cfg.GetLocalAddr()
}
addrPort := getLocalAddrPort(addr, cfg.GetLocalPort())
mux := http.NewServeMux()
localSrv := &http.Server{Addr: addrPort, Handler: mux}
if cfg.Parsed.MetricsEnabled.EqualBool(true) {
logger.Infof("Running metrics endpoint at %s/metrics", addrPort)
metrics.RegisterMetricsHandlers(mux, lc, "")
}
if cfg.Parsed.HealthCheckEnabled.EqualBool(true) {
ipV4, _ := ts.TailscaleIPs()
hz := healthz.RegisterHealthHandlers(mux, ipV4.String(), logger.Infof)
group.Go(func() error {
err := hz.MonitorHealth(ctx, lc)
if err == nil || errors.Is(err, context.Canceled) {
return nil
}
return err
})
}
group.Go(func() error {
errChan := make(chan error)
go func() {
if err := localSrv.ListenAndServe(); err != nil {
errChan <- err
}
close(errChan)
}()
select {
case <-ctx.Done():
sCtx, scancel := context.WithTimeout(serveCtx, 10*time.Second)
defer scancel()
return localSrv.Shutdown(sCtx)
case err := <-errChan:
return err
}
})
}
if v, ok := cfg.Parsed.AcceptRoutes.Get(); ok {
_, err = lc.EditPrefs(ctx, &ipn.MaskedPrefs{ _, err = lc.EditPrefs(ctx, &ipn.MaskedPrefs{
RouteAllSet: true, RouteAllSet: true,
Prefs: ipn.Prefs{RouteAll: *cfg.Parsed.AcceptRoutes}, Prefs: ipn.Prefs{RouteAll: v},
}) })
if err != nil { if err != nil {
return fmt.Errorf("error editing prefs: %w", err) return fmt.Errorf("error editing prefs: %w", err)
@ -285,10 +338,10 @@ func run(logger *zap.SugaredLogger) error {
prefs.HostnameSet = true prefs.HostnameSet = true
prefs.Hostname = *cfg.Parsed.Hostname prefs.Hostname = *cfg.Parsed.Hostname
} }
if cfg.Parsed.AcceptRoutes != nil && *cfg.Parsed.AcceptRoutes != currentPrefs.RouteAll { if v, ok := cfg.Parsed.AcceptRoutes.Get(); ok && v != currentPrefs.RouteAll {
cfgLogger = cfgLogger.With("AcceptRoutes", fmt.Sprintf("%v -> %v", currentPrefs.RouteAll, *cfg.Parsed.AcceptRoutes)) cfgLogger = cfgLogger.With("AcceptRoutes", fmt.Sprintf("%v -> %v", currentPrefs.RouteAll, v))
prefs.RouteAllSet = true prefs.RouteAllSet = true
prefs.Prefs.RouteAll = *cfg.Parsed.AcceptRoutes prefs.Prefs.RouteAll = v
} }
if !prefs.IsEmpty() { if !prefs.IsEmpty() {
if _, err := lc.EditPrefs(ctx, &prefs); err != nil { if _, err := lc.EditPrefs(ctx, &prefs); err != nil {
@ -304,6 +357,10 @@ func run(logger *zap.SugaredLogger) error {
} }
} }
// getLocalAddrPort joins addr and port into a "host:port" string suitable
// for http.Server.Addr.
//
// addr may be a hostname, an IPv4 address, a bare IPv6 address ("::") or an
// IPv6 address that is already wrapped in square brackets ("[::]").
// net.JoinHostPort adds brackets to any host containing a colon, so an
// already-bracketed address is unwrapped first; otherwise "[::]" would turn
// into the unparsable "[[::]]:<port>".
func getLocalAddrPort(addr string, port uint16) string {
	if strings.HasPrefix(addr, "[") && strings.HasSuffix(addr, "]") {
		addr = addr[1 : len(addr)-1]
	}
	return net.JoinHostPort(addr, strconv.FormatUint(uint64(port), 10))
}
func getStateStore(path *string, logger *zap.SugaredLogger) (ipn.StateStore, error) { func getStateStore(path *string, logger *zap.SugaredLogger) (ipn.StateStore, error) {
p := "mem:" p := "mem:"
if path != nil { if path != nil {

84
kube/health/healthz.go Normal file
View File

@ -0,0 +1,84 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
//go:build !plan9
// Package health contains shared types and underlying methods for serving
// a `/healthz` endpoint for containerboot and k8s-proxy.
package health
import (
"context"
"fmt"
"net/http"
"sync"
"tailscale.com/client/local"
"tailscale.com/ipn"
"tailscale.com/kube/kubetypes"
"tailscale.com/types/logger"
)
// Healthz backs the /healthz endpoint. The endpoint answers 200 OK while
// this tailscale node is known to have at least one tailnet IP address and
// 503 otherwise.
type Healthz struct {
	// Mutex is held by ServeHTTP and Update while they access hasAddrs.
	sync.Mutex

	// hasAddrs is the most recent health state recorded by Update.
	hasAddrs bool
	// podIPv4 is sent back to the caller in the kubetypes.PodIPv4Header
	// response header on healthy responses.
	podIPv4 string
	// logger receives a message whenever Update changes the health state.
	logger logger.Logf
}
// ServeHTTP answers a health probe. While the node is marked healthy it
// replies "ok" and sets the kubetypes.PodIPv4Header response header to the
// configured Pod IPv4 address; otherwise it replies 503 Service Unavailable.
func (h *Healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	// Snapshot the state and release the lock before doing any network I/O,
	// so that a slow or stalled client cannot block Update.
	h.Lock()
	healthy, podIPv4 := h.hasAddrs, h.podIPv4
	h.Unlock()

	if !healthy {
		http.Error(w, "node currently has no tailscale IPs", http.StatusServiceUnavailable)
		return
	}

	w.Header().Add(kubetypes.PodIPv4Header, podIPv4)
	if _, err := w.Write([]byte("ok")); err != nil {
		// Write has already committed the 200 status line, so the response
		// can no longer be turned into an error response; the failure can
		// only be reported locally.
		if h.logger != nil {
			h.logger("error writing status: %v", err)
		}
	}
}
// Update records the latest health state and logs a message whenever the
// state differs from the previously recorded one.
func (h *Healthz) Update(healthy bool) {
	h.Lock()
	defer h.Unlock()

	changed := h.hasAddrs != healthy
	h.hasAddrs = healthy
	if changed {
		h.logger("Setting healthy %v", healthy)
	}
}
// MonitorHealth subscribes to the IPN bus through lc and keeps the health
// state up to date: every notification that carries a netmap marks the node
// healthy if its self node has at least one address and unhealthy otherwise.
//
// It blocks until the subscription cannot be established or the watcher
// returns an error, and returns that error. The error from the watcher is
// returned unwrapped so callers can test it with errors.Is.
func (h *Healthz) MonitorHealth(ctx context.Context, lc *local.Client) error {
	w, err := lc.WatchIPNBus(ctx, ipn.NotifyInitialNetMap)
	if err != nil {
		return fmt.Errorf("failed to watch IPN bus: %w", err)
	}
	// The watcher must be closed once we stop reading from it, otherwise
	// its resources are leaked on every return path below.
	defer w.Close()

	for {
		n, err := w.Next()
		if err != nil {
			return err
		}
		// Notifications without a netmap carry no address information.
		if n.NetMap != nil {
			h.Update(n.NetMap.SelfNode.Addresses().Len() != 0)
		}
	}
}
// RegisterHealthHandlers installs the health check handler on mux under
// "GET /healthz" and returns it so that the caller can feed it state changes
// via Update or MonitorHealth. A containerized tailscale instance counts as
// healthy when it has at least one tailnet IP address. logger receives a
// message each time the health state changes.
func RegisterHealthHandlers(mux *http.ServeMux, podIPv4 string, logger logger.Logf) *Healthz {
	handler := new(Healthz)
	handler.podIPv4 = podIPv4
	handler.logger = logger
	mux.Handle("GET /healthz", handler)
	return handler
}

View File

@ -49,21 +49,23 @@ type VersionedConfig struct {
} }
type ConfigV1Alpha1 struct { type ConfigV1Alpha1 struct {
AuthKey *string `json:",omitempty"` // Tailscale auth key to use. AuthKey *string `json:",omitempty"` // Tailscale auth key to use.
State *string `json:",omitempty"` // Path to the Tailscale state. State *string `json:",omitempty"` // Path to the Tailscale state.
LogLevel *string `json:",omitempty"` // "debug", "info". Defaults to "info". LogLevel *string `json:",omitempty"` // "debug", "info". Defaults to "info".
App *string `json:",omitempty"` // e.g. kubetypes.AppProxyGroupKubeAPIServer App *string `json:",omitempty"` // e.g. kubetypes.AppProxyGroupKubeAPIServer
ServerURL *string `json:",omitempty"` // URL of the Tailscale coordination server. ServerURL *string `json:",omitempty"` // URL of the Tailscale coordination server.
// StaticEndpoints are additional, user-defined endpoints that this node LocalAddr *string `json:",omitempty"` // The address to use for serving HTTP health checks and metrics (defaults to all interfaces).
// should advertise amongst its wireguard endpoints. LocalPort *uint16 `json:",omitempty"` // The port to use for serving HTTP health checks and metrics (defaults to 9002).
StaticEndpoints []netip.AddrPort `json:",omitempty"` MetricsEnabled opt.Bool `json:",omitempty"` // Serve metrics on <LocalAddr>:<LocalPort>/metrics.
HealthCheckEnabled opt.Bool `json:",omitempty"` // Serve health check on <LocalAddr>:<LocalPort>/healthz.
// TODO(tomhjp): The remaining fields should all be reloadable during // TODO(tomhjp): The remaining fields should all be reloadable during
// runtime, but currently missing most of the APIServerProxy fields. // runtime, but currently missing most of the APIServerProxy fields.
Hostname *string `json:",omitempty"` // Tailscale device hostname. Hostname *string `json:",omitempty"` // Tailscale device hostname.
AcceptRoutes *bool `json:",omitempty"` // Accepts routes advertised by other Tailscale nodes. AcceptRoutes opt.Bool `json:",omitempty"` // Accepts routes advertised by other Tailscale nodes.
AdvertiseServices []string `json:",omitempty"` // Tailscale Services to advertise. AdvertiseServices []string `json:",omitempty"` // Tailscale Services to advertise.
APIServerProxy *APIServerProxyConfig `json:",omitempty"` // Config specific to the API Server proxy. APIServerProxy *APIServerProxyConfig `json:",omitempty"` // Config specific to the API Server proxy.
StaticEndpoints []netip.AddrPort `json:",omitempty"` // StaticEndpoints are additional, user-defined endpoints that this node should advertise amongst its wireguard endpoints.
} }
type APIServerProxyConfig struct { type APIServerProxyConfig struct {
@ -108,3 +110,19 @@ func Load(raw []byte) (c Config, err error) {
return c, nil return c, nil
} }
// GetLocalAddr returns the address configured in LocalAddr, or "[::]" when
// none is set.
func (c *Config) GetLocalAddr() string {
	if addr := c.Parsed.LocalAddr; addr != nil {
		return *addr
	}
	return "[::]"
}
// GetLocalPort returns the port configured in LocalPort, or 9002 when none
// is set.
func (c *Config) GetLocalPort() uint16 {
	if port := c.Parsed.LocalPort; port != nil {
		return *port
	}
	return 9002
}

View File

@ -1,9 +1,11 @@
// Copyright (c) Tailscale Inc & AUTHORS // Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause // SPDX-License-Identifier: BSD-3-Clause
//go:build linux //go:build !plan9
package main // Package metrics contains shared types and underlying methods for serving
// localapi metrics. This is primarily consumed by containerboot and k8s-proxy.
package metrics
import ( import (
"fmt" "fmt"
@ -68,7 +70,7 @@ func (m *metrics) handleDebug(w http.ResponseWriter, r *http.Request) {
// In 1.78.x and 1.80.x, it also proxies debug paths to tailscaled's debug // In 1.78.x and 1.80.x, it also proxies debug paths to tailscaled's debug
// endpoint if configured to ease migration for a breaking change serving user // endpoint if configured to ease migration for a breaking change serving user
// metrics instead of debug metrics on the "metrics" port. // metrics instead of debug metrics on the "metrics" port.
func registerMetricsHandlers(mux *http.ServeMux, lc *local.Client, debugAddrPort string) { func RegisterMetricsHandlers(mux *http.ServeMux, lc *local.Client, debugAddrPort string) {
m := &metrics{ m := &metrics{
lc: lc, lc: lc,
debugEndpoint: debugAddrPort, debugEndpoint: debugAddrPort,