diff --git a/cmd/containerboot/healthz.go b/cmd/containerboot/healthz.go index 12e7ee9f8..895290733 100644 --- a/cmd/containerboot/healthz.go +++ b/cmd/containerboot/healthz.go @@ -7,7 +7,6 @@ import ( "log" - "net" "net/http" "sync" ) @@ -23,29 +22,29 @@ type healthz struct { func (h *healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) { h.Lock() defer h.Unlock() + if h.hasAddrs { w.Write([]byte("ok")) } else { - http.Error(w, "node currently has no tailscale IPs", http.StatusInternalServerError) + http.Error(w, "node currently has no tailscale IPs", http.StatusServiceUnavailable) } } -// runHealthz runs a simple HTTP health endpoint on /healthz, listening on the -// provided address. A containerized tailscale instance is considered healthy if +func (h *healthz) update(healthy bool) { + h.Lock() + defer h.Unlock() + + if h.hasAddrs != healthy { + log.Println("Setting healthy", healthy) + } + h.hasAddrs = healthy +} + +// healthHandlers registers a simple health handler at /healthz. +// A containerized tailscale instance is considered healthy if // it has at least one tailnet IP address. -func runHealthz(addr string, h *healthz) { - lis, err := net.Listen("tcp", addr) - if err != nil { - log.Fatalf("error listening on the provided health endpoint address %q: %v", addr, err) - } - mux := http.NewServeMux() +func healthHandlers(mux *http.ServeMux) *healthz { + h := &healthz{} mux.Handle("GET /healthz", h) - log.Printf("Running healthcheck endpoint at %s/healthz", addr) - hs := &http.Server{Handler: mux} - - go func() { - if err := hs.Serve(lis); err != nil { - log.Fatalf("failed running health endpoint: %v", err) - } - }() + return h } diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go index 313e8deb0..0af9062a5 100644 --- a/cmd/containerboot/main.go +++ b/cmd/containerboot/main.go @@ -52,11 +52,17 @@ // ${TS_CERT_DOMAIN}, it will be replaced with the value of the available FQDN. // It cannot be used in conjunction with TS_DEST_IP. 
The file is watched for changes, // and will be re-applied when it changes. -// - TS_HEALTHCHECK_ADDR_PORT: if specified, an HTTP health endpoint will be // served at /healthz at the provided address, which should be in form [<addr>]:<port>.
-// If not set, no health check will be run. If set to :<port>, addr will default to 0.0.0.0 -// The health endpoint will return 200 OK if this node has at least one tailnet IP address, -// otherwise returns 503. +// - TS_HEALTHCHECK_ADDR_PORT: deprecated, use TS_ENABLE_HEALTH_CHECK instead and optionally +// set TS_LOCAL_ADDR_PORT. Will be removed in 1.82.0. +// - TS_LOCAL_ADDR_PORT: the address and port to serve local metrics and health +// check endpoints if enabled via TS_ENABLE_METRICS and/or TS_ENABLE_HEALTH_CHECK. +// Defaults to [::]:9002, serving on all available interfaces. +// - TS_ENABLE_METRICS: if true, a metrics endpoint will be served at /metrics on +// the address specified by TS_LOCAL_ADDR_PORT. See https://tailscale.com/kb/1482/client-metrics +// for more information on the metrics exposed. +// - TS_ENABLE_HEALTH_CHECK: if true, a health check endpoint will be served at /healthz on +// the address specified by TS_LOCAL_ADDR_PORT. The health endpoint will return 200 +// OK if this node has at least one tailnet IP address, otherwise returns 503. // NB: the health criteria might change in the future. // - TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR: if specified, a path to a // directory that containers tailscaled config in file. 
The config file needs to be @@ -99,6 +105,7 @@ "log" "math" "net" + "net/http" "net/netip" "os" "os/signal" @@ -178,12 +185,32 @@ func main() { } defer killTailscaled() - if cfg.LocalAddrPort != "" && cfg.MetricsEnabled { - m := &metrics{ - lc: client, - debugEndpoint: cfg.DebugAddrPort, + var healthCheck *healthz + if cfg.HealthCheckAddrPort != "" { + mux := http.NewServeMux() + + log.Printf("Running healthcheck endpoint at %s/healthz", cfg.HealthCheckAddrPort) + healthCheck = healthHandlers(mux) + + close := runHTTPServer(mux, cfg.HealthCheckAddrPort) + defer close() + } + + if cfg.localMetricsEnabled() || cfg.localHealthEnabled() { + mux := http.NewServeMux() + + if cfg.localMetricsEnabled() { + log.Printf("Running metrics endpoint at %s/metrics", cfg.LocalAddrPort) + metricsHandlers(mux, client, cfg.DebugAddrPort) } - runMetrics(cfg.LocalAddrPort, m) + + if cfg.localHealthEnabled() { + log.Printf("Running healthcheck endpoint at %s/healthz", cfg.LocalAddrPort) + healthCheck = healthHandlers(mux) + } + + close := runHTTPServer(mux, cfg.LocalAddrPort) + defer close() } if cfg.EnableForwardingOptimizations { @@ -328,9 +355,6 @@ func main() { certDomain = new(atomic.Pointer[string]) certDomainChanged = make(chan bool, 1) - - h = &healthz{} // http server for the healthz endpoint - healthzRunner = sync.OnceFunc(func() { runHealthz(cfg.HealthCheckAddrPort, h) }) ) if cfg.ServeConfigPath != "" { go watchServeConfigChanges(ctx, cfg.ServeConfigPath, certDomainChanged, certDomain, client) @@ -556,11 +580,8 @@ func main() { } } - if cfg.HealthCheckAddrPort != "" { - h.Lock() - h.hasAddrs = len(addrs) != 0 - h.Unlock() - healthzRunner() + if healthCheck != nil { + healthCheck.update(len(addrs) != 0) } if egressSvcsNotify != nil { egressSvcsNotify <- n @@ -751,3 +772,22 @@ func tailscaledConfigFilePath() string { log.Printf("Using tailscaled config file %q to match current capability version %d", filePath, tailcfg.CurrentCapabilityVersion) return filePath } + +func 
runHTTPServer(mux *http.ServeMux, addr string) (close func() error) { + ln, err := net.Listen("tcp", addr) + if err != nil { + log.Fatalf("failed to listen on addr %q: %v", addr, err) + } + srv := &http.Server{Handler: mux} + // Serve returns http.ErrServerClosed once the returned close func calls Shutdown; only other errors are fatal. + go func() { + if err := srv.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) { + log.Fatalf("failed running server: %v", err) + } + }() + + return func() error { + // Shutdown closes the server's listeners (including ln), so a second ln.Close would only report a spurious error. + return srv.Shutdown(context.Background()) + } +} diff --git a/cmd/containerboot/main_test.go b/cmd/containerboot/main_test.go index 5c92787ce..47d7c19cf 100644 --- a/cmd/containerboot/main_test.go +++ b/cmd/containerboot/main_test.go @@ -101,6 +101,24 @@ func TestContainerBoot(t *testing.T) { argFile := filepath.Join(d, "args") runningSockPath := filepath.Join(d, "tmp/tailscaled.sock") + var localAddrPort, healthAddrPort int + for _, p := range []*int{&localAddrPort, &healthAddrPort} { + ln, err := net.Listen("tcp", ":0") + if err != nil { + t.Fatalf("Failed to open listener: %v", err) + } + if err := ln.Close(); err != nil { + t.Fatalf("Failed to close listener: %v", err) + } + port := ln.Addr().(*net.TCPAddr).Port + *p = port + } + metricsURL := func(port int) string { + return fmt.Sprintf("http://127.0.0.1:%d/metrics", port) + } + healthURL := func(port int) string { + return fmt.Sprintf("http://127.0.0.1:%d/healthz", port) + } type phase struct { // If non-nil, send this IPN bus notification (and remember it as the @@ -119,6 +137,8 @@ type phase struct { // WantFatalLog is the fatal log message we expect from containerboot. // If set for a phase, the test will finish on that phase. WantFatalLog string + + EndpointStatuses map[string]int } runningNotify := &ipn.Notify{ State: ptr.To(ipn.Running), @@ -147,6 +167,11 @@ type phase struct { "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking", "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false", }, + // No metrics or health by default. 
+ EndpointStatuses: map[string]int{ + metricsURL(9002): -1, + healthURL(9002): -1, + }, }, { Notify: runningNotify, @@ -700,6 +725,104 @@ type phase struct { }, }, }, + { + Name: "metrics_enabled", + Env: map[string]string{ + "TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort), + "TS_ENABLE_METRICS": "true", + }, + Phases: []phase{ + { + WantCmds: []string{ + "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking", + "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false", + }, + EndpointStatuses: map[string]int{ + metricsURL(localAddrPort): 200, + healthURL(localAddrPort): -1, + }, + }, { + Notify: runningNotify, + }, + }, + }, + { + Name: "health_enabled", + Env: map[string]string{ + "TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort), + "TS_ENABLE_HEALTH_CHECK": "true", + }, + Phases: []phase{ + { + WantCmds: []string{ + "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking", + "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false", + }, + EndpointStatuses: map[string]int{ + metricsURL(localAddrPort): -1, + healthURL(localAddrPort): 503, // Doesn't start passing until the next phase. 
+ }, + }, { + Notify: runningNotify, + EndpointStatuses: map[string]int{ + metricsURL(localAddrPort): -1, + healthURL(localAddrPort): 200, + }, + }, + }, + }, + { + Name: "metrics_and_health_on_same_port", + Env: map[string]string{ + "TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort), + "TS_ENABLE_METRICS": "true", + "TS_ENABLE_HEALTH_CHECK": "true", + }, + Phases: []phase{ + { + WantCmds: []string{ + "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking", + "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false", + }, + EndpointStatuses: map[string]int{ + metricsURL(localAddrPort): 200, + healthURL(localAddrPort): 503, // Doesn't start passing until the next phase. + }, + }, { + Notify: runningNotify, + EndpointStatuses: map[string]int{ + metricsURL(localAddrPort): 200, + healthURL(localAddrPort): 200, + }, + }, + }, + }, + { + Name: "local_metrics_and_deprecated_health", + Env: map[string]string{ + "TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort), + "TS_ENABLE_METRICS": "true", + "TS_HEALTHCHECK_ADDR_PORT": fmt.Sprintf("[::]:%d", healthAddrPort), + }, + Phases: []phase{ + { + WantCmds: []string{ + "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking", + "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false", + }, + EndpointStatuses: map[string]int{ + metricsURL(localAddrPort): 200, + healthURL(healthAddrPort): 503, // Doesn't start passing until the next phase. 
+ }, + }, { + Notify: runningNotify, + EndpointStatuses: map[string]int{ + metricsURL(localAddrPort): 200, + healthURL(healthAddrPort): 200, + }, + }, + }, + }, } for _, test := range tests { @@ -796,7 +919,31 @@ type phase struct { return nil }) if err != nil { - t.Fatal(err) + t.Fatalf("phase %d: %v", i, err) + } + + for url, want := range p.EndpointStatuses { + err := tstest.WaitFor(2*time.Second, func() error { + resp, err := http.Get(url) + if err != nil { + if want == -1 { + // -1 marks an endpoint whose status is not asserted, so a failed request is acceptable. + return nil + } + return fmt.Errorf("GET %s: %v", url, err) + } + // Always close the body, including on the success and -1 paths, so polling does not leak connections. + defer resp.Body.Close() + if want > 0 && resp.StatusCode != want { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("GET %s, want %d, got %d\n%s", url, want, resp.StatusCode, string(body)) + } + + return nil + }) + if err != nil { + t.Fatalf("phase %d: %v", i, err) + } } } waitLogLine(t, 2*time.Second, cbOut, "Startup complete, waiting for shutdown signal") @@ -955,6 +1102,12 @@ func (l *localAPI) ServeHTTP(w http.ResponseWriter, r *http.Request) { if r.Method != "GET" { panic(fmt.Sprintf("unsupported method %q", r.Method)) } + case "/localapi/v0/usermetrics": + if r.Method != "GET" { + panic(fmt.Sprintf("unsupported method %q", r.Method)) + } + w.Write([]byte("fake metrics")) + return default: panic(fmt.Sprintf("unsupported path %q", r.URL.Path)) } diff --git a/cmd/containerboot/metrics.go b/cmd/containerboot/metrics.go index 874774d7a..a8b9222a5 100644 --- a/cmd/containerboot/metrics.go +++ b/cmd/containerboot/metrics.go @@ -8,8 +8,6 @@ import ( "fmt" "io" - "log" - "net" "net/http" "tailscale.com/client/tailscale" @@ -64,28 +62,18 @@ func (m *metrics) handleDebug(w http.ResponseWriter, r *http.Request) { proxy(w, r, debugURL, http.DefaultClient.Do) } -// runMetrics runs a simple HTTP metrics endpoint at /metrics, forwarding +// metricsHandlers registers a simple HTTP metrics handler at /metrics, forwarding // requests to tailscaled's /localapi/v0/usermetrics API. 
// // In 1.78.x and 1.80.x, it also proxies debug paths to tailscaled's debug // endpoint if configured to ease migration for a breaking change serving user // metrics instead of debug metrics on the "metrics" port. -func runMetrics(addr string, m *metrics) { - ln, err := net.Listen("tcp", addr) - if err != nil { - log.Fatalf("error listening on the provided metrics endpoint address %q: %v", addr, err) +func metricsHandlers(mux *http.ServeMux, lc *tailscale.LocalClient, debugAddrPort string) { + m := &metrics{ + lc: lc, + debugEndpoint: debugAddrPort, } - mux := http.NewServeMux() mux.HandleFunc("GET /metrics", m.handleMetrics) mux.HandleFunc("/debug/", m.handleDebug) // TODO(tomhjp): Remove for 1.82.0 release. - - log.Printf("Running metrics endpoint at %s/metrics", addr) - ms := &http.Server{Handler: mux} - - go func() { - if err := ms.Serve(ln); err != nil { - log.Fatalf("failed running metrics endpoint: %v", err) - } - }() } diff --git a/cmd/containerboot/settings.go b/cmd/containerboot/settings.go index c877682b9..1262a0e18 100644 --- a/cmd/containerboot/settings.go +++ b/cmd/containerboot/settings.go @@ -67,18 +67,15 @@ type settings struct { PodIP string PodIPv4 string PodIPv6 string - HealthCheckAddrPort string // TODO(tomhjp): use the local addr/port instead. 
+ HealthCheckAddrPort string LocalAddrPort string MetricsEnabled bool + HealthCheckEnabled bool DebugAddrPort string EgressSvcsCfgPath string } func configFromEnv() (*settings, error) { - defaultLocalAddrPort := "" - if v, ok := os.LookupEnv("POD_IP"); ok && v != "" { - defaultLocalAddrPort = fmt.Sprintf("%s:9002", v) - } cfg := &settings{ AuthKey: defaultEnvs([]string{"TS_AUTHKEY", "TS_AUTH_KEY"}, ""), Hostname: defaultEnv("TS_HOSTNAME", ""), @@ -105,8 +102,9 @@ func configFromEnv() (*settings, error) { PodIP: defaultEnv("POD_IP", ""), EnableForwardingOptimizations: defaultBool("TS_EXPERIMENTAL_ENABLE_FORWARDING_OPTIMIZATIONS", false), HealthCheckAddrPort: defaultEnv("TS_HEALTHCHECK_ADDR_PORT", ""), - LocalAddrPort: defaultEnv("TS_LOCAL_ADDR_PORT", defaultLocalAddrPort), - MetricsEnabled: defaultBool("TS_METRICS_ENABLED", false), + LocalAddrPort: defaultEnv("TS_LOCAL_ADDR_PORT", "[::]:9002"), + MetricsEnabled: defaultBool("TS_ENABLE_METRICS", false), + HealthCheckEnabled: defaultBool("TS_ENABLE_HEALTH_CHECK", false), DebugAddrPort: defaultEnv("TS_DEBUG_ADDR_PORT", ""), EgressSvcsCfgPath: defaultEnv("TS_EGRESS_SERVICES_CONFIG_PATH", ""), } @@ -181,11 +179,12 @@ func (s *settings) validate() error { return errors.New("TS_EXPERIMENTAL_ENABLE_FORWARDING_OPTIMIZATIONS is not supported in userspace mode") } if s.HealthCheckAddrPort != "" { + log.Printf("[warning] TS_HEALTHCHECK_ADDR_PORT is deprecated and will be removed in 1.82.0. 
Please use TS_ENABLE_HEALTH_CHECK and optionally TS_LOCAL_ADDR_PORT instead.") if _, err := netip.ParseAddrPort(s.HealthCheckAddrPort); err != nil { - return fmt.Errorf("error parsing TS_HEALTH_CHECK_ADDR_PORT value %q: %w", s.HealthCheckAddrPort, err) + return fmt.Errorf("error parsing TS_HEALTHCHECK_ADDR_PORT value %q: %w", s.HealthCheckAddrPort, err) } } - if s.LocalAddrPort != "" { + if s.localMetricsEnabled() || s.localHealthEnabled() { if _, err := netip.ParseAddrPort(s.LocalAddrPort); err != nil { return fmt.Errorf("error parsing TS_LOCAL_ADDR_PORT value %q: %w", s.LocalAddrPort, err) } @@ -195,6 +194,9 @@ func (s *settings) validate() error { return fmt.Errorf("error parsing TS_DEBUG_ADDR_PORT value %q: %w", s.DebugAddrPort, err) } } + if s.HealthCheckEnabled && s.HealthCheckAddrPort != "" { + return errors.New("TS_HEALTHCHECK_ADDR_PORT is deprecated and will be removed in 1.82.0, use TS_ENABLE_HEALTH_CHECK and optionally TS_LOCAL_ADDR_PORT") + } return nil } @@ -292,6 +294,14 @@ func hasKubeStateStore(cfg *settings) bool { return cfg.InKubernetes && cfg.KubernetesCanPatch && cfg.KubeSecret != "" } +func (cfg *settings) localMetricsEnabled() bool { + return cfg.LocalAddrPort != "" && cfg.MetricsEnabled +} + +func (cfg *settings) localHealthEnabled() bool { + return cfg.LocalAddrPort != "" && cfg.HealthCheckEnabled +} + // defaultEnv returns the value of the given envvar name, or defVal if // unset. 
func defaultEnv(name, defVal string) string { diff --git a/cmd/k8s-operator/sts.go b/cmd/k8s-operator/sts.go index b12b1cdd0..73c54a93d 100644 --- a/cmd/k8s-operator/sts.go +++ b/cmd/k8s-operator/sts.go @@ -818,7 +818,7 @@ func enableEndpoints(ss *appsv1.StatefulSet, metrics, debug bool) { Value: "$(POD_IP):9002", }, corev1.EnvVar{ - Name: "TS_METRICS_ENABLED", + Name: "TS_ENABLE_METRICS", Value: "true", }, ) diff --git a/cmd/k8s-operator/sts_test.go b/cmd/k8s-operator/sts_test.go index 7986d1b91..05aafaee6 100644 --- a/cmd/k8s-operator/sts_test.go +++ b/cmd/k8s-operator/sts_test.go @@ -258,7 +258,7 @@ func Test_applyProxyClassToStatefulSet(t *testing.T) { corev1.EnvVar{Name: "TS_DEBUG_ADDR_PORT", Value: "$(POD_IP):9001"}, corev1.EnvVar{Name: "TS_TAILSCALED_EXTRA_ARGS", Value: "--debug=$(TS_DEBUG_ADDR_PORT)"}, corev1.EnvVar{Name: "TS_LOCAL_ADDR_PORT", Value: "$(POD_IP):9002"}, - corev1.EnvVar{Name: "TS_METRICS_ENABLED", Value: "true"}, + corev1.EnvVar{Name: "TS_ENABLE_METRICS", Value: "true"}, ) wantSS.Spec.Template.Spec.Containers[0].Ports = []corev1.ContainerPort{ {Name: "debug", Protocol: "TCP", ContainerPort: 9001}, @@ -273,7 +273,7 @@ func Test_applyProxyClassToStatefulSet(t *testing.T) { wantSS = nonUserspaceProxySS.DeepCopy() wantSS.Spec.Template.Spec.Containers[0].Env = append(wantSS.Spec.Template.Spec.Containers[0].Env, corev1.EnvVar{Name: "TS_LOCAL_ADDR_PORT", Value: "$(POD_IP):9002"}, - corev1.EnvVar{Name: "TS_METRICS_ENABLED", Value: "true"}, + corev1.EnvVar{Name: "TS_ENABLE_METRICS", Value: "true"}, ) wantSS.Spec.Template.Spec.Containers[0].Ports = []corev1.ContainerPort{{Name: "metrics", Protocol: "TCP", ContainerPort: 9002}} gotSS = applyProxyClassToStatefulSet(proxyClassWithMetricsDebug(true, ptr.To(false)), nonUserspaceProxySS.DeepCopy(), new(tailscaleSTSConfig), zl.Sugar())