diff --git a/cmd/containerboot/healthz.go b/cmd/containerboot/healthz.go
index 12e7ee9f8..895290733 100644
--- a/cmd/containerboot/healthz.go
+++ b/cmd/containerboot/healthz.go
@@ -7,7 +7,6 @@
import (
"log"
- "net"
"net/http"
"sync"
)
@@ -23,29 +22,29 @@ type healthz struct {
func (h *healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) {
h.Lock()
defer h.Unlock()
+
if h.hasAddrs {
w.Write([]byte("ok"))
} else {
- http.Error(w, "node currently has no tailscale IPs", http.StatusInternalServerError)
+ http.Error(w, "node currently has no tailscale IPs", http.StatusServiceUnavailable)
}
}
-// runHealthz runs a simple HTTP health endpoint on /healthz, listening on the
-// provided address. A containerized tailscale instance is considered healthy if
+// update records the latest health state, logging only when it changes.
+func (h *healthz) update(healthy bool) {
+	h.Lock()
+	defer h.Unlock()
+	if prev := h.hasAddrs; prev != healthy {
+		log.Println("Setting healthy", healthy)
+	}
+	h.hasAddrs = healthy
+}
+
+// healthHandlers registers a simple health handler at /healthz.
+// A containerized tailscale instance is considered healthy if
// it has at least one tailnet IP address.
-func runHealthz(addr string, h *healthz) {
- lis, err := net.Listen("tcp", addr)
- if err != nil {
- log.Fatalf("error listening on the provided health endpoint address %q: %v", addr, err)
- }
- mux := http.NewServeMux()
+func healthHandlers(mux *http.ServeMux) *healthz {
+ h := &healthz{}
mux.Handle("GET /healthz", h)
- log.Printf("Running healthcheck endpoint at %s/healthz", addr)
- hs := &http.Server{Handler: mux}
-
- go func() {
- if err := hs.Serve(lis); err != nil {
- log.Fatalf("failed running health endpoint: %v", err)
- }
- }()
+ return h
}
diff --git a/cmd/containerboot/main.go b/cmd/containerboot/main.go
index 313e8deb0..0af9062a5 100644
--- a/cmd/containerboot/main.go
+++ b/cmd/containerboot/main.go
@@ -52,11 +52,17 @@
// ${TS_CERT_DOMAIN}, it will be replaced with the value of the available FQDN.
// It cannot be used in conjunction with TS_DEST_IP. The file is watched for changes,
// and will be re-applied when it changes.
-// - TS_HEALTHCHECK_ADDR_PORT: if specified, an HTTP health endpoint will be
-// served at /healthz at the provided address, which should be in form [<addr>]:<port>.
-// If not set, no health check will be run. If set to :<port>, addr will default to 0.0.0.0
-// The health endpoint will return 200 OK if this node has at least one tailnet IP address,
-// otherwise returns 503.
+// - TS_HEALTHCHECK_ADDR_PORT: deprecated, use TS_ENABLE_HEALTH_CHECK instead and optionally
+// set TS_LOCAL_ADDR_PORT. Will be removed in 1.82.0.
+// - TS_LOCAL_ADDR_PORT: the address and port to serve local metrics and health
+// check endpoints if enabled via TS_ENABLE_METRICS and/or TS_ENABLE_HEALTH_CHECK.
+// Defaults to [::]:9002, serving on all available interfaces.
+// - TS_ENABLE_METRICS: if true, a metrics endpoint will be served at /metrics on
+// the address specified by TS_LOCAL_ADDR_PORT. See https://tailscale.com/kb/1482/client-metrics
+// for more information on the metrics exposed.
+// - TS_ENABLE_HEALTH_CHECK: if true, a health check endpoint will be served at /healthz on
+// the address specified by TS_LOCAL_ADDR_PORT. The health endpoint will return 200
+// OK if this node has at least one tailnet IP address, otherwise returns 503.
// NB: the health criteria might change in the future.
// - TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR: if specified, a path to a
// directory that containers tailscaled config in file. The config file needs to be
@@ -99,6 +105,7 @@
"log"
"math"
"net"
+ "net/http"
"net/netip"
"os"
"os/signal"
@@ -178,12 +185,32 @@ func main() {
}
defer killTailscaled()
- if cfg.LocalAddrPort != "" && cfg.MetricsEnabled {
- m := &metrics{
- lc: client,
- debugEndpoint: cfg.DebugAddrPort,
+ var healthCheck *healthz
+ if cfg.HealthCheckAddrPort != "" {
+ mux := http.NewServeMux()
+
+ log.Printf("Running healthcheck endpoint at %s/healthz", cfg.HealthCheckAddrPort)
+ healthCheck = healthHandlers(mux)
+
+ close := runHTTPServer(mux, cfg.HealthCheckAddrPort)
+ defer close()
+ }
+
+ if cfg.localMetricsEnabled() || cfg.localHealthEnabled() {
+ mux := http.NewServeMux()
+
+ if cfg.localMetricsEnabled() {
+ log.Printf("Running metrics endpoint at %s/metrics", cfg.LocalAddrPort)
+ metricsHandlers(mux, client, cfg.DebugAddrPort)
}
- runMetrics(cfg.LocalAddrPort, m)
+
+ if cfg.localHealthEnabled() {
+ log.Printf("Running healthcheck endpoint at %s/healthz", cfg.LocalAddrPort)
+ healthCheck = healthHandlers(mux)
+ }
+
+ close := runHTTPServer(mux, cfg.LocalAddrPort)
+ defer close()
}
if cfg.EnableForwardingOptimizations {
@@ -328,9 +355,6 @@ func main() {
certDomain = new(atomic.Pointer[string])
certDomainChanged = make(chan bool, 1)
-
- h = &healthz{} // http server for the healthz endpoint
- healthzRunner = sync.OnceFunc(func() { runHealthz(cfg.HealthCheckAddrPort, h) })
)
if cfg.ServeConfigPath != "" {
go watchServeConfigChanges(ctx, cfg.ServeConfigPath, certDomainChanged, certDomain, client)
@@ -556,11 +580,8 @@ func main() {
}
}
- if cfg.HealthCheckAddrPort != "" {
- h.Lock()
- h.hasAddrs = len(addrs) != 0
- h.Unlock()
- healthzRunner()
+ if healthCheck != nil {
+ healthCheck.update(len(addrs) != 0)
}
if egressSvcsNotify != nil {
egressSvcsNotify <- n
@@ -751,3 +772,36 @@ func tailscaledConfigFilePath() string {
log.Printf("Using tailscaled config file %q to match current capability version %d", filePath, tailcfg.CurrentCapabilityVersion)
return filePath
}
+
+// runHTTPServer listens on the TCP address addr and serves mux from a
+// background goroutine. It terminates the process via log.Fatalf if the
+// listener cannot be opened, or if serving fails for any reason other than
+// a shutdown requested through the returned close function.
+//
+// close gracefully shuts the server down and closes the listener, returning
+// any resulting errors joined together.
+func runHTTPServer(mux *http.ServeMux, addr string) (close func() error) {
+	ln, err := net.Listen("tcp", addr)
+	if err != nil {
+		log.Fatalf("failed to listen on addr %q: %v", addr, err)
+	}
+	srv := &http.Server{Handler: mux}
+
+	go func() {
+		// Serve always returns a non-nil error. http.ErrServerClosed is the
+		// expected result of close calling Shutdown, so it must not be fatal.
+		if err := srv.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) {
+			log.Fatalf("failed running server: %v", err)
+		}
+	}()
+
+	return func() error {
+		err := srv.Shutdown(context.Background())
+		// Shutdown normally closes ln already; ignore the resulting
+		// "already closed" error so that a clean shutdown reports nil.
+		if cerr := ln.Close(); !errors.Is(cerr, net.ErrClosed) {
+			err = errors.Join(err, cerr)
+		}
+		return err
+	}
+}
diff --git a/cmd/containerboot/main_test.go b/cmd/containerboot/main_test.go
index 5c92787ce..47d7c19cf 100644
--- a/cmd/containerboot/main_test.go
+++ b/cmd/containerboot/main_test.go
@@ -101,6 +101,24 @@ func TestContainerBoot(t *testing.T) {
argFile := filepath.Join(d, "args")
runningSockPath := filepath.Join(d, "tmp/tailscaled.sock")
+ var localAddrPort, healthAddrPort int
+ for _, p := range []*int{&localAddrPort, &healthAddrPort} {
+ ln, err := net.Listen("tcp", ":0")
+ if err != nil {
+ t.Fatalf("Failed to open listener: %v", err)
+ }
+ if err := ln.Close(); err != nil {
+ t.Fatalf("Failed to close listener: %v", err)
+ }
+ port := ln.Addr().(*net.TCPAddr).Port
+ *p = port
+ }
+ metricsURL := func(port int) string {
+ return fmt.Sprintf("http://127.0.0.1:%d/metrics", port)
+ }
+ healthURL := func(port int) string {
+ return fmt.Sprintf("http://127.0.0.1:%d/healthz", port)
+ }
type phase struct {
// If non-nil, send this IPN bus notification (and remember it as the
@@ -119,6 +137,8 @@ type phase struct {
// WantFatalLog is the fatal log message we expect from containerboot.
// If set for a phase, the test will finish on that phase.
WantFatalLog string
+
+ EndpointStatuses map[string]int
}
runningNotify := &ipn.Notify{
State: ptr.To(ipn.Running),
@@ -147,6 +167,11 @@ type phase struct {
"/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking",
"/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false",
},
+ // No metrics or health by default.
+ EndpointStatuses: map[string]int{
+ metricsURL(9002): -1,
+ healthURL(9002): -1,
+ },
},
{
Notify: runningNotify,
@@ -700,6 +725,104 @@ type phase struct {
},
},
},
+ {
+ Name: "metrics_enabled",
+ Env: map[string]string{
+ "TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort),
+ "TS_ENABLE_METRICS": "true",
+ },
+ Phases: []phase{
+ {
+ WantCmds: []string{
+ "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking",
+ "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false",
+ },
+ EndpointStatuses: map[string]int{
+ metricsURL(localAddrPort): 200,
+ healthURL(localAddrPort): -1,
+ },
+ }, {
+ Notify: runningNotify,
+ },
+ },
+ },
+ {
+ Name: "health_enabled",
+ Env: map[string]string{
+ "TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort),
+ "TS_ENABLE_HEALTH_CHECK": "true",
+ },
+ Phases: []phase{
+ {
+ WantCmds: []string{
+ "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking",
+ "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false",
+ },
+ EndpointStatuses: map[string]int{
+ metricsURL(localAddrPort): -1,
+ healthURL(localAddrPort): 503, // Doesn't start passing until the next phase.
+ },
+ }, {
+ Notify: runningNotify,
+ EndpointStatuses: map[string]int{
+ metricsURL(localAddrPort): -1,
+ healthURL(localAddrPort): 200,
+ },
+ },
+ },
+ },
+ {
+ Name: "metrics_and_health_on_same_port",
+ Env: map[string]string{
+ "TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort),
+ "TS_ENABLE_METRICS": "true",
+ "TS_ENABLE_HEALTH_CHECK": "true",
+ },
+ Phases: []phase{
+ {
+ WantCmds: []string{
+ "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking",
+ "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false",
+ },
+ EndpointStatuses: map[string]int{
+ metricsURL(localAddrPort): 200,
+ healthURL(localAddrPort): 503, // Doesn't start passing until the next phase.
+ },
+ }, {
+ Notify: runningNotify,
+ EndpointStatuses: map[string]int{
+ metricsURL(localAddrPort): 200,
+ healthURL(localAddrPort): 200,
+ },
+ },
+ },
+ },
+ {
+ Name: "local_metrics_and_deprecated_health",
+ Env: map[string]string{
+ "TS_LOCAL_ADDR_PORT": fmt.Sprintf("[::]:%d", localAddrPort),
+ "TS_ENABLE_METRICS": "true",
+ "TS_HEALTHCHECK_ADDR_PORT": fmt.Sprintf("[::]:%d", healthAddrPort),
+ },
+ Phases: []phase{
+ {
+ WantCmds: []string{
+ "/usr/bin/tailscaled --socket=/tmp/tailscaled.sock --state=mem: --statedir=/tmp --tun=userspace-networking",
+ "/usr/bin/tailscale --socket=/tmp/tailscaled.sock up --accept-dns=false",
+ },
+ EndpointStatuses: map[string]int{
+ metricsURL(localAddrPort): 200,
+ healthURL(healthAddrPort): 503, // Doesn't start passing until the next phase.
+ },
+ }, {
+ Notify: runningNotify,
+ EndpointStatuses: map[string]int{
+ metricsURL(localAddrPort): 200,
+ healthURL(healthAddrPort): 200,
+ },
+ },
+ },
+ },
}
for _, test := range tests {
@@ -796,7 +919,26 @@ type phase struct {
return nil
})
if err != nil {
- t.Fatal(err)
+ t.Fatalf("phase %d: %v", i, err)
+ }
+
+ for url, want := range p.EndpointStatuses {
+ err := tstest.WaitFor(2*time.Second, func() error {
+ resp, err := http.Get(url)
+ if err != nil && want != -1 {
+ return fmt.Errorf("GET %s: %v", url, err)
+ }
+ if want > 0 && resp.StatusCode != want {
+ defer resp.Body.Close()
+ body, _ := io.ReadAll(resp.Body)
+ return fmt.Errorf("GET %s, want %d, got %d\n%s", url, want, resp.StatusCode, string(body))
+ }
+
+ return nil
+ })
+ if err != nil {
+ t.Fatalf("phase %d: %v", i, err)
+ }
}
}
waitLogLine(t, 2*time.Second, cbOut, "Startup complete, waiting for shutdown signal")
@@ -955,6 +1097,12 @@ func (l *localAPI) ServeHTTP(w http.ResponseWriter, r *http.Request) {
if r.Method != "GET" {
panic(fmt.Sprintf("unsupported method %q", r.Method))
}
+ case "/localapi/v0/usermetrics":
+ if r.Method != "GET" {
+ panic(fmt.Sprintf("unsupported method %q", r.Method))
+ }
+ w.Write([]byte("fake metrics"))
+ return
default:
panic(fmt.Sprintf("unsupported path %q", r.URL.Path))
}
diff --git a/cmd/containerboot/metrics.go b/cmd/containerboot/metrics.go
index 874774d7a..a8b9222a5 100644
--- a/cmd/containerboot/metrics.go
+++ b/cmd/containerboot/metrics.go
@@ -8,8 +8,6 @@
import (
"fmt"
"io"
- "log"
- "net"
"net/http"
"tailscale.com/client/tailscale"
@@ -64,28 +62,18 @@ func (m *metrics) handleDebug(w http.ResponseWriter, r *http.Request) {
proxy(w, r, debugURL, http.DefaultClient.Do)
}
-// runMetrics runs a simple HTTP metrics endpoint at /metrics, forwarding
+// metricsHandlers registers a simple HTTP metrics handler at /metrics, forwarding
// requests to tailscaled's /localapi/v0/usermetrics API.
//
// In 1.78.x and 1.80.x, it also proxies debug paths to tailscaled's debug
// endpoint if configured to ease migration for a breaking change serving user
// metrics instead of debug metrics on the "metrics" port.
-func runMetrics(addr string, m *metrics) {
- ln, err := net.Listen("tcp", addr)
- if err != nil {
- log.Fatalf("error listening on the provided metrics endpoint address %q: %v", addr, err)
+func metricsHandlers(mux *http.ServeMux, lc *tailscale.LocalClient, debugAddrPort string) {
+ m := &metrics{
+ lc: lc,
+ debugEndpoint: debugAddrPort,
}
- mux := http.NewServeMux()
mux.HandleFunc("GET /metrics", m.handleMetrics)
mux.HandleFunc("/debug/", m.handleDebug) // TODO(tomhjp): Remove for 1.82.0 release.
-
- log.Printf("Running metrics endpoint at %s/metrics", addr)
- ms := &http.Server{Handler: mux}
-
- go func() {
- if err := ms.Serve(ln); err != nil {
- log.Fatalf("failed running metrics endpoint: %v", err)
- }
- }()
}
diff --git a/cmd/containerboot/settings.go b/cmd/containerboot/settings.go
index c877682b9..1262a0e18 100644
--- a/cmd/containerboot/settings.go
+++ b/cmd/containerboot/settings.go
@@ -67,18 +67,15 @@ type settings struct {
PodIP string
PodIPv4 string
PodIPv6 string
- HealthCheckAddrPort string // TODO(tomhjp): use the local addr/port instead.
+ HealthCheckAddrPort string
LocalAddrPort string
MetricsEnabled bool
+ HealthCheckEnabled bool
DebugAddrPort string
EgressSvcsCfgPath string
}
func configFromEnv() (*settings, error) {
- defaultLocalAddrPort := ""
- if v, ok := os.LookupEnv("POD_IP"); ok && v != "" {
- defaultLocalAddrPort = fmt.Sprintf("%s:9002", v)
- }
cfg := &settings{
AuthKey: defaultEnvs([]string{"TS_AUTHKEY", "TS_AUTH_KEY"}, ""),
Hostname: defaultEnv("TS_HOSTNAME", ""),
@@ -105,8 +102,9 @@ func configFromEnv() (*settings, error) {
PodIP: defaultEnv("POD_IP", ""),
EnableForwardingOptimizations: defaultBool("TS_EXPERIMENTAL_ENABLE_FORWARDING_OPTIMIZATIONS", false),
HealthCheckAddrPort: defaultEnv("TS_HEALTHCHECK_ADDR_PORT", ""),
- LocalAddrPort: defaultEnv("TS_LOCAL_ADDR_PORT", defaultLocalAddrPort),
- MetricsEnabled: defaultBool("TS_METRICS_ENABLED", false),
+ LocalAddrPort: defaultEnv("TS_LOCAL_ADDR_PORT", "[::]:9002"),
+ MetricsEnabled: defaultBool("TS_ENABLE_METRICS", false),
+ HealthCheckEnabled: defaultBool("TS_ENABLE_HEALTH_CHECK", false),
DebugAddrPort: defaultEnv("TS_DEBUG_ADDR_PORT", ""),
EgressSvcsCfgPath: defaultEnv("TS_EGRESS_SERVICES_CONFIG_PATH", ""),
}
@@ -181,11 +179,12 @@ func (s *settings) validate() error {
return errors.New("TS_EXPERIMENTAL_ENABLE_FORWARDING_OPTIMIZATIONS is not supported in userspace mode")
}
if s.HealthCheckAddrPort != "" {
+ log.Printf("[warning] TS_HEALTHCHECK_ADDR_PORT is deprecated and will be removed in 1.82.0. Please use TS_ENABLE_HEALTH_CHECK and optionally TS_LOCAL_ADDR_PORT instead.")
if _, err := netip.ParseAddrPort(s.HealthCheckAddrPort); err != nil {
- return fmt.Errorf("error parsing TS_HEALTH_CHECK_ADDR_PORT value %q: %w", s.HealthCheckAddrPort, err)
+ return fmt.Errorf("error parsing TS_HEALTHCHECK_ADDR_PORT value %q: %w", s.HealthCheckAddrPort, err)
}
}
- if s.LocalAddrPort != "" {
+ if s.localMetricsEnabled() || s.localHealthEnabled() {
if _, err := netip.ParseAddrPort(s.LocalAddrPort); err != nil {
return fmt.Errorf("error parsing TS_LOCAL_ADDR_PORT value %q: %w", s.LocalAddrPort, err)
}
@@ -195,6 +194,9 @@ func (s *settings) validate() error {
return fmt.Errorf("error parsing TS_DEBUG_ADDR_PORT value %q: %w", s.DebugAddrPort, err)
}
}
+ if s.HealthCheckEnabled && s.HealthCheckAddrPort != "" {
+ return errors.New("TS_HEALTHCHECK_ADDR_PORT is deprecated and will be removed in 1.82.0, use TS_ENABLE_HEALTH_CHECK and optionally TS_LOCAL_ADDR_PORT")
+ }
return nil
}
@@ -292,6 +294,18 @@ func hasKubeStateStore(cfg *settings) bool {
return cfg.InKubernetes && cfg.KubernetesCanPatch && cfg.KubeSecret != ""
}
+// localMetricsEnabled reports whether the metrics endpoint should be served
+// on LocalAddrPort: metrics are enabled and a local address is configured.
+func (s *settings) localMetricsEnabled() bool {
+	return s.LocalAddrPort != "" && s.MetricsEnabled
+}
+
+// localHealthEnabled reports whether the health check endpoint should be served
+// on LocalAddrPort: health checks are enabled and a local address is configured.
+func (s *settings) localHealthEnabled() bool {
+	return s.LocalAddrPort != "" && s.HealthCheckEnabled
+}
+
// defaultEnv returns the value of the given envvar name, or defVal if
// unset.
func defaultEnv(name, defVal string) string {
diff --git a/cmd/k8s-operator/sts.go b/cmd/k8s-operator/sts.go
index b12b1cdd0..73c54a93d 100644
--- a/cmd/k8s-operator/sts.go
+++ b/cmd/k8s-operator/sts.go
@@ -818,7 +818,7 @@ func enableEndpoints(ss *appsv1.StatefulSet, metrics, debug bool) {
Value: "$(POD_IP):9002",
},
corev1.EnvVar{
- Name: "TS_METRICS_ENABLED",
+ Name: "TS_ENABLE_METRICS",
Value: "true",
},
)
diff --git a/cmd/k8s-operator/sts_test.go b/cmd/k8s-operator/sts_test.go
index 7986d1b91..05aafaee6 100644
--- a/cmd/k8s-operator/sts_test.go
+++ b/cmd/k8s-operator/sts_test.go
@@ -258,7 +258,7 @@ func Test_applyProxyClassToStatefulSet(t *testing.T) {
corev1.EnvVar{Name: "TS_DEBUG_ADDR_PORT", Value: "$(POD_IP):9001"},
corev1.EnvVar{Name: "TS_TAILSCALED_EXTRA_ARGS", Value: "--debug=$(TS_DEBUG_ADDR_PORT)"},
corev1.EnvVar{Name: "TS_LOCAL_ADDR_PORT", Value: "$(POD_IP):9002"},
- corev1.EnvVar{Name: "TS_METRICS_ENABLED", Value: "true"},
+ corev1.EnvVar{Name: "TS_ENABLE_METRICS", Value: "true"},
)
wantSS.Spec.Template.Spec.Containers[0].Ports = []corev1.ContainerPort{
{Name: "debug", Protocol: "TCP", ContainerPort: 9001},
@@ -273,7 +273,7 @@ func Test_applyProxyClassToStatefulSet(t *testing.T) {
wantSS = nonUserspaceProxySS.DeepCopy()
wantSS.Spec.Template.Spec.Containers[0].Env = append(wantSS.Spec.Template.Spec.Containers[0].Env,
corev1.EnvVar{Name: "TS_LOCAL_ADDR_PORT", Value: "$(POD_IP):9002"},
- corev1.EnvVar{Name: "TS_METRICS_ENABLED", Value: "true"},
+ corev1.EnvVar{Name: "TS_ENABLE_METRICS", Value: "true"},
)
wantSS.Spec.Template.Spec.Containers[0].Ports = []corev1.ContainerPort{{Name: "metrics", Protocol: "TCP", ContainerPort: 9002}}
gotSS = applyProxyClassToStatefulSet(proxyClassWithMetricsDebug(true, ptr.To(false)), nonUserspaceProxySS.DeepCopy(), new(tailscaleSTSConfig), zl.Sugar())