2023-01-27 13:37:20 -08:00
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
2022-10-25 13:12:54 -07:00
//go:build linux
2022-12-16 14:09:46 -08:00
// The containerboot binary is a wrapper for starting tailscaled in a container.
// It handles reading the desired mode of operation out of environment
// variables, bringing up and authenticating Tailscale, and any other
// kubernetes-specific side jobs.
2022-10-25 13:12:54 -07:00
//
2022-12-16 14:09:46 -08:00
// As with most container things, configuration is passed through environment
// variables. All configuration is optional.
2022-10-25 13:12:54 -07:00
//
2023-01-03 15:17:44 -08:00
// - TS_AUTHKEY: the authkey to use for login.
2023-01-25 10:16:59 -08:00
// - TS_HOSTNAME: the hostname to request for the node.
2024-01-04 09:17:04 +00:00
// - TS_ROUTES: subnet routes to advertise. Explicitly setting it to an empty
// value will cause containerboot to stop acting as a subnet router for any
// previously advertised routes. To accept routes, use TS_EXTRA_ARGS to pass
// in --accept-routes.
2022-11-16 23:04:07 +05:00
// - TS_DEST_IP: proxy all incoming Tailscale traffic to the given
2024-04-23 17:30:00 +01:00
// destination defined by an IP address.
// - TS_EXPERIMENTAL_DEST_DNS_NAME: proxy all incoming Tailscale traffic to the given
// destination defined by a DNS name. The DNS name will be periodically resolved and firewall rules updated accordingly.
// This is currently intended to be used by the Kubernetes operator (ExternalName Services).
// This is an experimental env var and will likely change in the future.
2023-08-30 08:31:37 +01:00
// - TS_TAILNET_TARGET_IP: proxy all incoming non-Tailscale traffic to the given
2023-11-24 16:24:48 +00:00
// destination defined by an IP.
// - TS_TAILNET_TARGET_FQDN: proxy all incoming non-Tailscale traffic to the given
// destination defined by a MagicDNS name.
2022-11-16 23:04:07 +05:00
// - TS_TAILSCALED_EXTRA_ARGS: extra arguments to 'tailscaled'.
2023-10-15 18:41:28 -07:00
// - TS_EXTRA_ARGS: extra arguments to 'tailscale up'.
2022-11-16 23:04:07 +05:00
// - TS_USERSPACE: run with userspace networking (the default)
// instead of kernel networking.
// - TS_STATE_DIR: the directory in which to store tailscaled
// state. The data should persist across container
// restarts.
// - TS_ACCEPT_DNS: whether to use the tailnet's DNS configuration.
// - TS_KUBE_SECRET: the name of the Kubernetes secret in which to
// store tailscaled state.
// - TS_SOCKS5_SERVER: the address on which to listen for SOCKS5
// proxying into the tailnet.
// - TS_OUTBOUND_HTTP_PROXY_LISTEN: the address on which to listen
// for HTTP proxying into the tailnet.
// - TS_SOCKET: the path where the tailscaled LocalAPI socket should
// be created.
// - TS_AUTH_ONCE: if true, only attempt to log in if not already
2023-10-15 18:41:28 -07:00
// logged in. If false (the default, for backwards
// compatibility), forcibly log in every time the
// container starts.
2023-08-24 12:08:50 -04:00
// - TS_SERVE_CONFIG: if specified, is the file path where the ipn.ServeConfig is located.
// It will be applied once tailscaled is up and running. If the file contains
// ${TS_CERT_DOMAIN}, it will be replaced with the value of the available FQDN.
2023-08-25 16:26:04 -04:00
// It cannot be used in conjunction with TS_DEST_IP. The file is watched for changes,
// and will be re-applied when it changes.
2024-12-02 12:18:09 +00:00
// - TS_HEALTHCHECK_ADDR_PORT: deprecated, use TS_ENABLE_HEALTH_CHECK instead and optionally
// set TS_LOCAL_ADDR_PORT. Will be removed in 1.82.0.
// - TS_LOCAL_ADDR_PORT: the address and port to serve local metrics and health
// check endpoints if enabled via TS_ENABLE_METRICS and/or TS_ENABLE_HEALTH_CHECK.
// Defaults to [::]:9002, serving on all available interfaces.
// - TS_ENABLE_METRICS: if true, a metrics endpoint will be served at /metrics on
// the address specified by TS_LOCAL_ADDR_PORT. See https://tailscale.com/kb/1482/client-metrics
// for more information on the metrics exposed.
// - TS_ENABLE_HEALTH_CHECK: if true, a health check endpoint will be served at /healthz on
// the address specified by TS_LOCAL_ADDR_PORT. The health endpoint will return 200
// OK if this node has at least one tailnet IP address, otherwise returns 503.
2024-08-14 07:28:29 +03:00
// NB: the health criteria might change in the future.
2024-05-10 16:32:37 +01:00
// - TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR: if specified, a path to a
// directory that contains the tailscaled config file. The config file needs to be
// named cap-<current-tailscaled-cap>.hujson. If this is set, TS_HOSTNAME,
// TS_EXTRA_ARGS, TS_AUTHKEY,
2024-01-08 16:14:06 +00:00
// TS_ROUTES, TS_ACCEPT_DNS env vars must not be set. If this is set,
// containerboot only runs `tailscaled --config <path-to-this-configfile>`
// and not `tailscale up` or `tailscale set`.
// The config file contents are currently read once on container start.
// NB: This env var is currently experimental and the logic will likely change!
2024-06-10 19:19:03 +01:00
// - TS_EXPERIMENTAL_ENABLE_FORWARDING_OPTIMIZATIONS: set to true to
// autoconfigure the default network interface for optimal performance for
// Tailscale subnet router/exit node.
// https://tailscale.com/kb/1320/performance-best-practices#linux-optimizations-for-subnet-routers-and-exit-nodes
// NB: This env var is currently experimental and the logic will likely change!
2024-02-08 06:45:42 +00:00
// - EXPERIMENTAL_ALLOW_PROXYING_CLUSTER_TRAFFIC_VIA_INGRESS: if set to true
// and if this containerboot instance is an L7 ingress proxy (created by
// the Kubernetes operator), set up rules to allow proxying cluster traffic,
// received on the Pod IP of this node, to the ingress target in the cluster.
// This, in conjunction with MagicDNS name resolution in cluster, can be
// useful for cases where a cluster workload needs to access a target in
// cluster using the same hostname (in this case, the MagicDNS name of the ingress proxy)
// as a non-cluster workload on tailnet.
// This is only meant to be configured by the Kubernetes operator.
2022-10-25 13:12:54 -07:00
//
2022-12-16 14:09:46 -08:00
// When running on Kubernetes, containerboot defaults to storing state in the
// "tailscale" kube secret. To store state on local disk instead, set
// TS_KUBE_SECRET="" and TS_STATE_DIR=/path/to/storage/dir. The state dir should
// be persistent storage.
//
2023-01-03 15:17:44 -08:00
// Additionally, if TS_AUTHKEY is not set and the TS_KUBE_SECRET contains an
2022-12-16 14:09:46 -08:00
// "authkey" field, that key is used as the tailscale authkey.
2022-10-25 13:12:54 -07:00
package main
import (
"context"
"errors"
"fmt"
"io/fs"
"log"
2024-04-23 17:30:00 +01:00
"math"
"net"
2024-12-02 12:18:09 +00:00
"net/http"
2022-10-25 13:12:54 -07:00
"net/netip"
"os"
"os/signal"
2022-11-09 22:01:34 -08:00
"path/filepath"
2024-04-23 17:30:00 +01:00
"slices"
2022-10-25 13:12:54 -07:00
"strings"
2023-11-16 20:23:18 +01:00
"sync"
2023-08-24 12:08:50 -04:00
"sync/atomic"
2022-10-25 13:12:54 -07:00
"syscall"
"time"
"golang.org/x/sys/unix"
"tailscale.com/client/tailscale"
2022-12-07 12:29:45 -08:00
"tailscale.com/ipn"
2024-05-10 16:32:37 +01:00
kubeutils "tailscale.com/k8s-operator"
2023-11-24 16:24:48 +00:00
"tailscale.com/tailcfg"
2023-10-11 07:26:40 -07:00
"tailscale.com/types/logger"
2023-08-24 12:08:50 -04:00
"tailscale.com/types/ptr"
2022-12-07 12:29:45 -08:00
"tailscale.com/util/deephash"
2023-10-11 07:26:40 -07:00
"tailscale.com/util/linuxfw"
2022-10-25 13:12:54 -07:00
)
2023-10-11 07:26:40 -07:00
func newNetfilterRunner ( logf logger . Logf ) ( linuxfw . NetfilterRunner , error ) {
if defaultBool ( "TS_TEST_FAKE_NETFILTER" , false ) {
return linuxfw . NewFakeIPTablesRunner ( ) , nil
}
2023-12-04 12:08:56 -05:00
return linuxfw . New ( logf , "" )
2023-10-11 07:26:40 -07:00
}
2022-10-25 13:12:54 -07:00
func main ( ) {
log . SetPrefix ( "boot: " )
tailscale . I_Acknowledge_This_API_Is_Unstable = true
2024-02-08 06:45:42 +00:00
2024-10-08 18:35:23 +01:00
cfg , err := configFromEnv ( )
if err != nil {
2024-01-08 16:14:06 +00:00
log . Fatalf ( "invalid configuration: %v" , err )
2023-11-24 16:24:48 +00:00
}
2022-10-25 13:12:54 -07:00
if ! cfg . UserspaceMode {
2022-11-09 22:01:34 -08:00
if err := ensureTunFile ( cfg . Root ) ; err != nil {
2022-10-25 13:12:54 -07:00
log . Fatalf ( "Unable to create tuntap device file: %v" , err )
}
2024-04-23 17:30:00 +01:00
if cfg . ProxyTargetIP != "" || cfg . ProxyTargetDNSName != "" || cfg . Routes != nil || cfg . TailnetTargetIP != "" || cfg . TailnetTargetFQDN != "" {
if err := ensureIPForwarding ( cfg . Root , cfg . ProxyTargetIP , cfg . TailnetTargetIP , cfg . TailnetTargetFQDN , cfg . Routes ) ; err != nil {
2022-11-07 15:34:08 -08:00
log . Printf ( "Failed to enable IP forwarding: %v" , err )
log . Printf ( "To run tailscale as a proxy or router container, IP forwarding must be enabled." )
if cfg . InKubernetes {
log . Fatalf ( "You can either set the sysctls as a privileged initContainer, or run the tailscale container with privileged=true." )
} else {
log . Fatalf ( "You can fix this by running the container with privileged=true, or the equivalent in your container runtime that permits access to sysctls." )
}
2022-10-25 13:12:54 -07:00
}
}
}
// Context is used for all setup stuff until we're in steady
// state, so that if something is hanging we eventually time out
// and crashloop the container.
2023-08-24 12:08:50 -04:00
bootCtx , cancel := context . WithTimeout ( context . Background ( ) , 60 * time . Second )
2022-10-25 13:12:54 -07:00
defer cancel ( )
2024-04-29 17:03:48 +01:00
if cfg . InKubernetes {
initKubeClient ( cfg . Root )
if err := cfg . setupKube ( bootCtx ) ; err != nil {
log . Fatalf ( "error setting up for running on Kubernetes: %v" , err )
2022-10-25 13:12:54 -07:00
}
}
2023-11-16 20:23:18 +01:00
client , daemonProcess , err := startTailscaled ( bootCtx , cfg )
2022-10-25 13:12:54 -07:00
if err != nil {
log . Fatalf ( "failed to bring up tailscale: %v" , err )
}
2023-11-16 20:23:18 +01:00
killTailscaled := func ( ) {
if err := daemonProcess . Signal ( unix . SIGTERM ) ; err != nil {
log . Fatalf ( "error shutting tailscaled down: %v" , err )
}
}
defer killTailscaled ( )
2022-10-25 13:12:54 -07:00
2024-12-02 12:18:09 +00:00
var healthCheck * healthz
if cfg . HealthCheckAddrPort != "" {
mux := http . NewServeMux ( )
log . Printf ( "Running healthcheck endpoint at %s/healthz" , cfg . HealthCheckAddrPort )
healthCheck = healthHandlers ( mux )
close := runHTTPServer ( mux , cfg . HealthCheckAddrPort )
defer close ( )
}
if cfg . localMetricsEnabled ( ) || cfg . localHealthEnabled ( ) {
mux := http . NewServeMux ( )
if cfg . localMetricsEnabled ( ) {
log . Printf ( "Running metrics endpoint at %s/metrics" , cfg . LocalAddrPort )
metricsHandlers ( mux , client , cfg . DebugAddrPort )
cmd/{containerboot,k8s-operator},k8s-operator: new options to expose user metrics (#14035)
containerboot:
Adds 3 new environment variables for containerboot, `TS_LOCAL_ADDR_PORT` (default
`"${POD_IP}:9002"`), `TS_METRICS_ENABLED` (default `false`), and `TS_DEBUG_ADDR_PORT`
(default `""`), to configure metrics and debug endpoints. In a follow-up PR, the
health check endpoint will be updated to use the `TS_LOCAL_ADDR_PORT` if
`TS_HEALTHCHECK_ADDR_PORT` hasn't been set.
Users previously only had access to internal debug metrics (which are unstable
and not recommended) via passing the `--debug` flag to tailscaled, but can now
set `TS_METRICS_ENABLED=true` to expose the stable metrics documented at
https://tailscale.com/kb/1482/client-metrics at `/metrics` on the addr/port
specified by `TS_LOCAL_ADDR_PORT`.
Users can also now configure a debug endpoint more directly via the
`TS_DEBUG_ADDR_PORT` environment variable. This is not recommended for production
use, but exposes an internal set of debug metrics and pprof endpoints.
operator:
The `ProxyClass` CRD's `.spec.metrics.enable` field now enables serving the
stable user metrics documented at https://tailscale.com/kb/1482/client-metrics
at `/metrics` on the same "metrics" container port that debug metrics were
previously served on. To smooth the transition for anyone relying on the way the
operator previously consumed this field, we also _temporarily_ serve tailscaled's
internal debug metrics on the same `/debug/metrics` path as before, until 1.82.0
when debug metrics will be turned off by default even if `.spec.metrics.enable`
is set. At that point, anyone who wishes to continue using the internal debug
metrics (not recommended) will need to set the new `ProxyClass` field
`.spec.statefulSet.pod.tailscaleContainer.debug.enable`.
Users who wish to opt out of the transitional behaviour, where enabling
`.spec.metrics.enable` also enables debug metrics, can set
`.spec.statefulSet.pod.tailscaleContainer.debug.enable` to false (recommended).
Separately but related, the operator will no longer specify a host port for the
"metrics" container port definition. This caused scheduling conflicts when k8s
needs to schedule more than one proxy per node, and was not necessary for allowing
the pod's port to be exposed to prometheus scrapers.
Updates #11292
---------
Co-authored-by: Kristoffer Dalby <kristoffer@tailscale.com>
Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
2024-11-22 15:41:07 +00:00
}
2024-12-02 12:18:09 +00:00
if cfg . localHealthEnabled ( ) {
log . Printf ( "Running healthcheck endpoint at %s/healthz" , cfg . LocalAddrPort )
healthCheck = healthHandlers ( mux )
}
close := runHTTPServer ( mux , cfg . LocalAddrPort )
defer close ( )
cmd/{containerboot,k8s-operator},k8s-operator: new options to expose user metrics (#14035)
containerboot:
Adds 3 new environment variables for containerboot, `TS_LOCAL_ADDR_PORT` (default
`"${POD_IP}:9002"`), `TS_METRICS_ENABLED` (default `false`), and `TS_DEBUG_ADDR_PORT`
(default `""`), to configure metrics and debug endpoints. In a follow-up PR, the
health check endpoint will be updated to use the `TS_LOCAL_ADDR_PORT` if
`TS_HEALTHCHECK_ADDR_PORT` hasn't been set.
Users previously only had access to internal debug metrics (which are unstable
and not recommended) via passing the `--debug` flag to tailscaled, but can now
set `TS_METRICS_ENABLED=true` to expose the stable metrics documented at
https://tailscale.com/kb/1482/client-metrics at `/metrics` on the addr/port
specified by `TS_LOCAL_ADDR_PORT`.
Users can also now configure a debug endpoint more directly via the
`TS_DEBUG_ADDR_PORT` environment variable. This is not recommended for production
use, but exposes an internal set of debug metrics and pprof endpoints.
operator:
The `ProxyClass` CRD's `.spec.metrics.enable` field now enables serving the
stable user metrics documented at https://tailscale.com/kb/1482/client-metrics
at `/metrics` on the same "metrics" container port that debug metrics were
previously served on. To smooth the transition for anyone relying on the way the
operator previously consumed this field, we also _temporarily_ serve tailscaled's
internal debug metrics on the same `/debug/metrics` path as before, until 1.82.0
when debug metrics will be turned off by default even if `.spec.metrics.enable`
is set. At that point, anyone who wishes to continue using the internal debug
metrics (not recommended) will need to set the new `ProxyClass` field
`.spec.statefulSet.pod.tailscaleContainer.debug.enable`.
Users who wish to opt out of the transitional behaviour, where enabling
`.spec.metrics.enable` also enables debug metrics, can set
`.spec.statefulSet.pod.tailscaleContainer.debug.enable` to false (recommended).
Separately but related, the operator will no longer specify a host port for the
"metrics" container port definition. This caused scheduling conflicts when k8s
needs to schedule more than one proxy per node, and was not necessary for allowing
the pod's port to be exposed to prometheus scrapers.
Updates #11292
---------
Co-authored-by: Kristoffer Dalby <kristoffer@tailscale.com>
Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
2024-11-22 15:41:07 +00:00
}
2024-06-10 19:19:03 +01:00
if cfg . EnableForwardingOptimizations {
if err := client . SetUDPGROForwarding ( bootCtx ) ; err != nil {
log . Printf ( "[unexpected] error enabling UDP GRO forwarding: %v" , err )
}
}
2023-08-24 12:08:50 -04:00
w , err := client . WatchIPNBus ( bootCtx , ipn . NotifyInitialNetMap | ipn . NotifyInitialPrefs | ipn . NotifyInitialState )
2022-12-05 15:38:50 -08:00
if err != nil {
2022-12-07 12:29:45 -08:00
log . Fatalf ( "failed to watch tailscaled for updates: %v" , err )
2022-12-05 15:38:50 -08:00
}
2023-12-08 09:46:32 -08:00
// Now that we've started tailscaled, we can symlink the socket to the
// default location if needed.
const defaultTailscaledSocketPath = "/var/run/tailscale/tailscaled.sock"
if cfg . Socket != "" && cfg . Socket != defaultTailscaledSocketPath {
// If we were given a socket path, symlink it to the default location so
// that the CLI can find it without any extra flags.
// See #6849.
dir := filepath . Dir ( defaultTailscaledSocketPath )
err := os . MkdirAll ( dir , 0700 )
if err == nil {
err = syscall . Symlink ( cfg . Socket , defaultTailscaledSocketPath )
}
if err != nil {
log . Printf ( "[warning] failed to symlink socket: %v\n\tTo interact with the Tailscale CLI please use `tailscale --socket=%q`" , err , cfg . Socket )
}
}
2022-12-07 12:29:45 -08:00
// Because we're still shelling out to `tailscale up` to get access to its
// flag parser, we have to stop watching the IPN bus so that we can block on
// the subcommand without stalling anything. Then once it's done, we resume
// watching the bus.
//
// Depending on the requested mode of operation, this auth step happens at
// different points in containerboot's lifecycle, hence the helper function.
didLogin := false
authTailscale := func ( ) error {
if didLogin {
return nil
}
didLogin = true
w . Close ( )
2023-10-15 18:41:28 -07:00
if err := tailscaleUp ( bootCtx , cfg ) ; err != nil {
2022-12-07 12:29:45 -08:00
return fmt . Errorf ( "failed to auth tailscale: %v" , err )
2022-10-25 13:12:54 -07:00
}
2023-08-24 12:08:50 -04:00
w , err = client . WatchIPNBus ( bootCtx , ipn . NotifyInitialNetMap | ipn . NotifyInitialState )
2022-12-07 12:29:45 -08:00
if err != nil {
return fmt . Errorf ( "rewatching tailscaled for updates after auth: %v" , err )
}
return nil
2022-10-25 13:12:54 -07:00
}
2022-12-07 12:29:45 -08:00
2024-01-08 16:14:06 +00:00
if isTwoStepConfigAlwaysAuth ( cfg ) {
2022-12-07 12:29:45 -08:00
if err := authTailscale ( ) ; err != nil {
log . Fatalf ( "failed to auth tailscale: %v" , err )
2022-10-25 13:12:54 -07:00
}
2022-12-07 12:29:45 -08:00
}
authLoop :
for {
n , err := w . Next ( )
if err != nil {
log . Fatalf ( "failed to read from tailscaled: %v" , err )
}
if n . State != nil {
switch * n . State {
case ipn . NeedsLogin :
2024-01-08 16:14:06 +00:00
if isOneStepConfig ( cfg ) {
cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets (#13531)
* cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets
This commit is first part of the work to allow running multiple
replicas of the Kubernetes operator egress proxies per tailnet service +
to allow exposing multiple tailnet services via each proxy replica.
This expands the existing iptables/nftables-based proxy configuration
mechanism.
A proxy can now be configured to route to one or more tailnet targets
via a (mounted) config file that, for each tailnet target, specifies:
- the target's tailnet IP or FQDN
- mappings of container ports to which cluster workloads will send traffic to
tailnet target ports where the traffic should be forwarded.
Example configfile contents:
{
"some-svc": {"tailnetTarget":{"fqdn":"foo.tailnetxyz.ts.net","ports"{"tcp:4006:80":{"protocol":"tcp","matchPort":4006,"targetPort":80},"tcp:4007:443":{"protocol":"tcp","matchPort":4007,"targetPort":443}}}}
}
A proxy that is configured with this config file will configure firewall rules
to route cluster traffic to the tailnet targets. It will then watch the config file
for updates as well as monitor relevant netmap updates and reconfigure firewall
as needed.
This adds a bunch of new iptables/nftables functionality to make it easier to dynamically update
the firewall rules without needing to restart the proxy Pod as well as to make
it easier to debug/understand the rules:
- for iptables, each portmapping is a DNAT rule with a comment pointing
at the 'service',i.e:
-A PREROUTING ! -i tailscale0 -p tcp -m tcp --dport 4006 -m comment --comment "some-svc:tcp:4006 -> tcp:80" -j DNAT --to-destination 100.64.1.18:80
Additionally there is a SNAT rule for each tailnet target, to mask the source address.
- for nftables, a separate prerouting chain is created for each tailnet target
and all the portmapping rules are placed in that chain. This makes it easier
to look up rules and delete services when no longer needed.
(nftables allows hooking a custom chain to a prerouting hook, so no extra work
is needed to ensure that the rules in the service chains are evaluated).
The next steps will be to get the Kubernetes Operator to generate
the configfile and ensure it is mounted to the relevant proxy nodes.
Updates tailscale/tailscale#13406
Signed-off-by: Irbe Krumina <irbe@tailscale.com>
2024-09-29 16:30:53 +01:00
// This could happen if this is the first time tailscaled was run for this
// device and the auth key was not passed via the configfile.
2024-01-08 16:14:06 +00:00
log . Fatalf ( "invalid state: tailscaled daemon started with a config file, but tailscale is not logged in: ensure you pass a valid auth key in the config file." )
}
2022-12-07 12:29:45 -08:00
if err := authTailscale ( ) ; err != nil {
log . Fatalf ( "failed to auth tailscale: %v" , err )
}
case ipn . NeedsMachineAuth :
log . Printf ( "machine authorization required, please visit the admin panel" )
case ipn . Running :
// Technically, all we want is to keep monitoring the bus for
// netmap updates. However, in order to make the container crash
// if tailscale doesn't initially come up, the watch has a
// startup deadline on it. So, we have to break out of this
// watch loop, cancel the watch, and watch again with no
// deadline to continue monitoring for changes.
break authLoop
default :
log . Printf ( "tailscaled in state %q, waiting" , * n . State )
2022-10-25 13:12:54 -07:00
}
}
}
2022-12-07 12:29:45 -08:00
w . Close ( )
2023-11-16 20:23:18 +01:00
ctx , cancel := contextWithExitSignalWatch ( )
2023-08-24 12:08:50 -04:00
defer cancel ( )
2024-01-08 16:14:06 +00:00
if isTwoStepConfigAuthOnce ( cfg ) {
2023-10-15 18:41:28 -07:00
// Now that we are authenticated, we can set/reset any of the
// settings that we need to.
if err := tailscaleSet ( ctx , cfg ) ; err != nil {
log . Fatalf ( "failed to auth tailscale: %v" , err )
}
2023-08-24 12:08:50 -04:00
}
2023-09-29 09:08:49 -07:00
if cfg . ServeConfigPath != "" {
// Remove any serve config that may have been set by a previous run of
// containerboot, but only if we're providing a new one.
if err := client . SetServeConfig ( ctx , new ( ipn . ServeConfig ) ) ; err != nil {
log . Fatalf ( "failed to unset serve config: %v" , err )
}
2023-08-24 12:08:50 -04:00
}
2024-06-17 18:50:50 +01:00
if hasKubeStateStore ( cfg ) && isTwoStepConfigAuthOnce ( cfg ) {
2022-12-07 12:29:45 -08:00
// We were told to only auth once, so any secret-bound
// authkey is no longer needed. We don't strictly need to
// wipe it, but it's good hygiene.
log . Printf ( "Deleting authkey from kube secret" )
if err := deleteAuthKey ( ctx , cfg . KubeSecret ) ; err != nil {
log . Fatalf ( "deleting authkey from kube secret: %v" , err )
2022-10-25 13:12:54 -07:00
}
2022-12-07 12:29:45 -08:00
}
2023-08-24 12:08:50 -04:00
w , err = client . WatchIPNBus ( ctx , ipn . NotifyInitialNetMap | ipn . NotifyInitialState )
2022-12-07 12:29:45 -08:00
if err != nil {
log . Fatalf ( "rewatching tailscaled for updates after auth: %v" , err )
}
var (
2024-06-17 18:50:50 +01:00
startupTasksDone = false
currentIPs deephash . Sum // tailscale IPs assigned to device
currentDeviceID deephash . Sum // device ID
currentDeviceEndpoints deephash . Sum // device FQDN and IPs
2023-08-24 12:08:50 -04:00
2023-11-24 16:24:48 +00:00
currentEgressIPs deephash . Sum
2024-04-23 17:30:00 +01:00
addrs [ ] netip . Prefix
backendAddrs [ ] net . IP
2023-08-24 12:08:50 -04:00
certDomain = new ( atomic . Pointer [ string ] )
certDomainChanged = make ( chan bool , 1 )
2022-12-07 12:29:45 -08:00
)
2023-08-24 12:08:50 -04:00
if cfg . ServeConfigPath != "" {
go watchServeConfigChanges ( ctx , cfg . ServeConfigPath , certDomainChanged , certDomain , client )
}
2023-10-11 07:26:40 -07:00
var nfr linuxfw . NetfilterRunner
2024-06-17 18:50:50 +01:00
if isL3Proxy ( cfg ) {
2023-10-11 07:26:40 -07:00
nfr , err = newNetfilterRunner ( log . Printf )
if err != nil {
log . Fatalf ( "error creating new netfilter runner: %v" , err )
}
}
2024-04-23 17:30:00 +01:00
// Setup for proxies that are configured to proxy to a target specified
// by a DNS name (TS_EXPERIMENTAL_DEST_DNS_NAME).
const defaultCheckPeriod = time . Minute * 10 // how often to check what IPs the DNS name resolves to
var (
tc = make ( chan string , 1 )
failedResolveAttempts int
t * time . Timer = time . AfterFunc ( defaultCheckPeriod , func ( ) {
if cfg . ProxyTargetDNSName != "" {
tc <- "recheck"
}
} )
)
cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets (#13531)
* cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets
This commit is first part of the work to allow running multiple
replicas of the Kubernetes operator egress proxies per tailnet service +
to allow exposing multiple tailnet services via each proxy replica.
This expands the existing iptables/nftables-based proxy configuration
mechanism.
A proxy can now be configured to route to one or more tailnet targets
via a (mounted) config file that, for each tailnet target, specifies:
- the target's tailnet IP or FQDN
- mappings of container ports to which cluster workloads will send traffic to
tailnet target ports where the traffic should be forwarded.
Example configfile contents:
{
"some-svc": {"tailnetTarget":{"fqdn":"foo.tailnetxyz.ts.net","ports"{"tcp:4006:80":{"protocol":"tcp","matchPort":4006,"targetPort":80},"tcp:4007:443":{"protocol":"tcp","matchPort":4007,"targetPort":443}}}}
}
A proxy that is configured with this config file will configure firewall rules
to route cluster traffic to the tailnet targets. It will then watch the config file
for updates as well as monitor relevant netmap updates and reconfigure firewall
as needed.
This adds a bunch of new iptables/nftables functionality to make it easier to dynamically update
the firewall rules without needing to restart the proxy Pod as well as to make
it easier to debug/understand the rules:
- for iptables, each portmapping is a DNAT rule with a comment pointing
at the 'service',i.e:
-A PREROUTING ! -i tailscale0 -p tcp -m tcp --dport 4006 -m comment --comment "some-svc:tcp:4006 -> tcp:80" -j DNAT --to-destination 100.64.1.18:80
Additionally there is a SNAT rule for each tailnet target, to mask the source address.
- for nftables, a separate prerouting chain is created for each tailnet target
and all the portmapping rules are placed in that chain. This makes it easier
to look up rules and delete services when no longer needed.
(nftables allows hooking a custom chain to a prerouting hook, so no extra work
is needed to ensure that the rules in the service chains are evaluated).
The next steps will be to get the Kubernetes Operator to generate
the configfile and ensure it is mounted to the relevant proxy nodes.
Updates tailscale/tailscale#13406
Signed-off-by: Irbe Krumina <irbe@tailscale.com>
2024-09-29 16:30:53 +01:00
// egressSvcsErrorChan will get an error sent to it if this containerboot instance is configured to expose 1+
// egress services in HA mode and errored.
var egressSvcsErrorChan = make ( chan error )
2024-04-23 17:30:00 +01:00
defer t . Stop ( )
// resetTimer resets timer for when to next attempt to resolve the DNS
// name for the proxy configured with TS_EXPERIMENTAL_DEST_DNS_NAME. The
// timer gets reset to 10 minutes from now unless the last resolution
// attempt failed. If one or more consecutive previous resolution
// attempts failed, the next resolution attempt will happen after the smallest
// of (10 minutes, 2 ^ number-of-consecutive-failed-resolution-attempts
// seconds) i.e 2s, 4s, 8s ... 10 minutes.
resetTimer := func ( lastResolveFailed bool ) {
if ! lastResolveFailed {
log . Printf ( "reconfigureTimer: next DNS resolution attempt in %s" , defaultCheckPeriod )
t . Reset ( defaultCheckPeriod )
failedResolveAttempts = 0
return
}
minDelay := 2 // 2 seconds
nextTick := time . Second * time . Duration ( math . Pow ( float64 ( minDelay ) , float64 ( failedResolveAttempts ) ) )
if nextTick > defaultCheckPeriod {
nextTick = defaultCheckPeriod // cap at 10 minutes
}
log . Printf ( "reconfigureTimer: last DNS resolution attempt failed, next DNS resolution attempt in %v" , nextTick )
t . Reset ( nextTick )
failedResolveAttempts ++
}
cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets (#13531)
* cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets
This commit is first part of the work to allow running multiple
replicas of the Kubernetes operator egress proxies per tailnet service +
to allow exposing multiple tailnet services via each proxy replica.
This expands the existing iptables/nftables-based proxy configuration
mechanism.
A proxy can now be configured to route to one or more tailnet targets
via a (mounted) config file that, for each tailnet target, specifies:
- the target's tailnet IP or FQDN
- mappings of container ports to which cluster workloads will send traffic to
tailnet target ports where the traffic should be forwarded.
Example configfile contents:
{
"some-svc": {"tailnetTarget":{"fqdn":"foo.tailnetxyz.ts.net","ports"{"tcp:4006:80":{"protocol":"tcp","matchPort":4006,"targetPort":80},"tcp:4007:443":{"protocol":"tcp","matchPort":4007,"targetPort":443}}}}
}
A proxy that is configured with this config file will configure firewall rules
to route cluster traffic to the tailnet targets. It will then watch the config file
for updates as well as monitor relevant netmap updates and reconfigure firewall
as needed.
This adds a bunch of new iptables/nftables functionality to make it easier to dynamically update
the firewall rules without needing to restart the proxy Pod as well as to make
it easier to debug/understand the rules:
- for iptables, each portmapping is a DNAT rule with a comment pointing
at the 'service',i.e:
-A PREROUTING ! -i tailscale0 -p tcp -m tcp --dport 4006 -m comment --comment "some-svc:tcp:4006 -> tcp:80" -j DNAT --to-destination 100.64.1.18:80
Additionally there is a SNAT rule for each tailnet target, to mask the source address.
- for nftables, a separate prerouting chain is created for each tailnet target
and all the portmapping rules are placed in that chain. This makes it easier
to look up rules and delete services when no longer needed.
(nftables allows hooking a custom chain to a prerouting hook, so no extra work
is needed to ensure that the rules in the service chains are evaluated).
The next steps will be to get the Kubernetes Operator to generate
the configfile and ensure it is mounted to the relevant proxy nodes.
Updates tailscale/tailscale#13406
Signed-off-by: Irbe Krumina <irbe@tailscale.com>
2024-09-29 16:30:53 +01:00
var egressSvcsNotify chan ipn . Notify
2023-11-16 20:23:18 +01:00
notifyChan := make ( chan ipn . Notify )
errChan := make ( chan error )
go func ( ) {
for {
n , err := w . Next ( )
if err != nil {
errChan <- err
break
} else {
notifyChan <- n
}
}
} ( )
var wg sync . WaitGroup
2024-02-08 06:45:42 +00:00
2023-11-16 20:23:18 +01:00
runLoop :
2022-12-07 12:29:45 -08:00
for {
2023-11-16 20:23:18 +01:00
select {
case <- ctx . Done ( ) :
// Although killTailscaled() is deferred earlier, if we
// have started the reaper defined below, we need to
// kill tailscaled and let reaper clean up child
// processes.
killTailscaled ( )
break runLoop
case err := <- errChan :
2022-12-07 12:29:45 -08:00
log . Fatalf ( "failed to read from tailscaled: %v" , err )
2023-11-16 20:23:18 +01:00
case n := <- notifyChan :
if n . State != nil && * n . State != ipn . Running {
// Something's gone wrong and we've left the authenticated state.
// Our container image never recovered gracefully from this, and the
// control flow required to make it work now is hard. So, just crash
// the container and rely on the container runtime to restart us,
// whereupon we'll go through initial auth again.
log . Fatalf ( "tailscaled left running state (now in state %q), exiting" , * n . State )
2022-12-07 12:29:45 -08:00
}
2023-11-16 20:23:18 +01:00
if n . NetMap != nil {
2024-04-23 17:30:00 +01:00
addrs = n . NetMap . SelfNode . Addresses ( ) . AsSlice ( )
2023-11-16 20:23:18 +01:00
newCurrentIPs := deephash . Hash ( & addrs )
ipsHaveChanged := newCurrentIPs != currentIPs
2023-11-24 16:24:48 +00:00
2024-06-17 18:50:50 +01:00
// Store device ID in a Kubernetes Secret before
// setting up any routing rules. This ensures
// that, for containerboot instances that are
// Kubernetes operator proxies, the operator is
// able to retrieve the device ID from the
// Kubernetes Secret to clean up tailnet nodes
// for proxies whose route setup continuously
// fails.
deviceID := n . NetMap . SelfNode . StableID ( )
if hasKubeStateStore ( cfg ) && deephash . Update ( & currentDeviceID , & deviceID ) {
if err := storeDeviceID ( ctx , cfg . KubeSecret , n . NetMap . SelfNode . StableID ( ) ) ; err != nil {
log . Fatalf ( "storing device ID in Kubernetes Secret: %v" , err )
}
}
2023-11-24 16:24:48 +00:00
if cfg . TailnetTargetFQDN != "" {
var (
egressAddrs [ ] netip . Prefix
newCurentEgressIPs deephash . Sum
egressIPsHaveChanged bool
node tailcfg . NodeView
nodeFound bool
)
for _ , n := range n . NetMap . Peers {
if strings . EqualFold ( n . Name ( ) , cfg . TailnetTargetFQDN ) {
node = n
nodeFound = true
break
}
}
if ! nodeFound {
log . Printf ( "Tailscale node %q not found; it either does not exist, or not reachable because of ACLs" , cfg . TailnetTargetFQDN )
break
}
egressAddrs = node . Addresses ( ) . AsSlice ( )
newCurentEgressIPs = deephash . Hash ( & egressAddrs )
egressIPsHaveChanged = newCurentEgressIPs != currentEgressIPs
2024-10-03 20:15:00 +01:00
// The firewall rules get (re-)installed:
// - on startup
// - when the tailnet IPs of the tailnet target have changed
// - when the tailnet IPs of this node have changed
if ( egressIPsHaveChanged || ipsHaveChanged ) && len ( egressAddrs ) != 0 {
2024-07-05 12:21:48 +01:00
var rulesInstalled bool
2023-11-24 16:24:48 +00:00
for _ , egressAddr := range egressAddrs {
ea := egressAddr . Addr ( )
2024-07-05 12:21:48 +01:00
if ea . Is4 ( ) || ( ea . Is6 ( ) && nfr . HasIPV6NAT ( ) ) {
rulesInstalled = true
log . Printf ( "Installing forwarding rules for destination %v" , ea . String ( ) )
if err := installEgressForwardingRule ( ctx , ea . String ( ) , addrs , nfr ) ; err != nil {
log . Fatalf ( "installing egress proxy rules for destination %s: %v" , ea . String ( ) , err )
}
2023-11-24 16:24:48 +00:00
}
}
2024-07-05 12:21:48 +01:00
if ! rulesInstalled {
log . Fatalf ( "no forwarding rules for egress addresses %v, host supports IPv6: %v" , egressAddrs , nfr . HasIPV6NAT ( ) )
}
2023-11-24 16:24:48 +00:00
}
currentEgressIPs = newCurentEgressIPs
}
2024-04-23 17:30:00 +01:00
if cfg . ProxyTargetIP != "" && len ( addrs ) != 0 && ipsHaveChanged {
2023-11-16 20:23:18 +01:00
log . Printf ( "Installing proxy rules" )
2024-04-23 17:30:00 +01:00
if err := installIngressForwardingRule ( ctx , cfg . ProxyTargetIP , addrs , nfr ) ; err != nil {
2023-11-16 20:23:18 +01:00
log . Fatalf ( "installing ingress proxy rules: %v" , err )
2023-08-24 12:08:50 -04:00
}
}
2024-04-23 17:30:00 +01:00
if cfg . ProxyTargetDNSName != "" && len ( addrs ) != 0 && ipsHaveChanged {
newBackendAddrs , err := resolveDNS ( ctx , cfg . ProxyTargetDNSName )
if err != nil {
log . Printf ( "[unexpected] error resolving DNS name %s: %v" , cfg . ProxyTargetDNSName , err )
resetTimer ( true )
continue
}
backendsHaveChanged := ! ( slices . EqualFunc ( backendAddrs , newBackendAddrs , func ( ip1 net . IP , ip2 net . IP ) bool {
return slices . ContainsFunc ( newBackendAddrs , func ( ip net . IP ) bool { return ip . Equal ( ip1 ) } )
} ) )
if backendsHaveChanged {
log . Printf ( "installing ingress proxy rules for backends %v" , newBackendAddrs )
if err := installIngressForwardingRuleForDNSTarget ( ctx , newBackendAddrs , addrs , nfr ) ; err != nil {
log . Fatalf ( "error installing ingress proxy rules: %v" , err )
}
}
resetTimer ( false )
backendAddrs = newBackendAddrs
}
if cfg . ServeConfigPath != "" && len ( n . NetMap . DNS . CertDomains ) != 0 {
2023-11-16 20:23:18 +01:00
cd := n . NetMap . DNS . CertDomains [ 0 ]
prev := certDomain . Swap ( ptr . To ( cd ) )
if prev == nil || * prev != cd {
select {
case certDomainChanged <- true :
default :
}
}
2023-08-30 08:31:37 +01:00
}
2024-04-23 17:30:00 +01:00
if cfg . TailnetTargetIP != "" && ipsHaveChanged && len ( addrs ) != 0 {
2023-11-24 16:24:48 +00:00
log . Printf ( "Installing forwarding rules for destination %v" , cfg . TailnetTargetIP )
2023-11-16 20:23:18 +01:00
if err := installEgressForwardingRule ( ctx , cfg . TailnetTargetIP , addrs , nfr ) ; err != nil {
log . Fatalf ( "installing egress proxy rules: %v" , err )
}
}
2024-02-08 06:45:42 +00:00
// If this is a L7 cluster ingress proxy (set up
// by Kubernetes operator) and proxying of
// cluster traffic to the ingress target is
// enabled, set up proxy rule each time the
// tailnet IPs of this node change (including
// the first time they become available).
2024-04-23 17:30:00 +01:00
if cfg . AllowProxyingClusterTrafficViaIngress && cfg . ServeConfigPath != "" && ipsHaveChanged && len ( addrs ) != 0 {
2024-02-08 06:45:42 +00:00
log . Printf ( "installing rules to forward traffic for %s to node's tailnet IP" , cfg . PodIP )
if err := installTSForwardingRuleForDestination ( ctx , cfg . PodIP , addrs , nfr ) ; err != nil {
log . Fatalf ( "installing rules to forward traffic to node's tailnet IP: %v" , err )
}
}
2023-11-16 20:23:18 +01:00
currentIPs = newCurrentIPs
2023-08-30 08:31:37 +01:00
2024-06-17 18:50:50 +01:00
// Only store device FQDN and IP addresses to
// Kubernetes Secret when any required proxy
// route setup has succeeded. IPs and FQDN are
// read from the Secret by the Tailscale
// Kubernetes operator and, for some proxy
// types, such as Tailscale Ingress, advertized
// on the Ingress status. Writing them to the
// Secret only after the proxy routing has been
// set up ensures that the operator does not
// advertize endpoints of broken proxies.
// TODO (irbekrm): instead of using the IP and FQDN, have some other mechanism for the proxy to signal that it is 'Ready'.
deviceEndpoints := [ ] any { n . NetMap . SelfNode . Name ( ) , n . NetMap . SelfNode . Addresses ( ) }
if hasKubeStateStore ( cfg ) && deephash . Update ( & currentDeviceEndpoints , & deviceEndpoints ) {
if err := storeDeviceEndpoints ( ctx , cfg . KubeSecret , n . NetMap . SelfNode . Name ( ) , n . NetMap . SelfNode . Addresses ( ) . AsSlice ( ) ) ; err != nil {
log . Fatalf ( "storing device IPs and FQDN in Kubernetes Secret: %v" , err )
2023-11-16 20:23:18 +01:00
}
2022-12-07 12:29:45 -08:00
}
2024-08-14 07:28:29 +03:00
2024-12-02 12:18:09 +00:00
if healthCheck != nil {
healthCheck . update ( len ( addrs ) != 0 )
2024-08-14 07:28:29 +03:00
}
cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets (#13531)
* cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets
This commit is first part of the work to allow running multiple
replicas of the Kubernetes operator egress proxies per tailnet service +
to allow exposing multiple tailnet services via each proxy replica.
This expands the existing iptables/nftables-based proxy configuration
mechanism.
A proxy can now be configured to route to one or more tailnet targets
via a (mounted) config file that, for each tailnet target, specifies:
- the target's tailnet IP or FQDN
- mappings of container ports to which cluster workloads will send traffic to
tailnet target ports where the traffic should be forwarded.
Example configfile contents:
{
"some-svc": {"tailnetTarget":{"fqdn":"foo.tailnetxyz.ts.net","ports":{"tcp:4006:80":{"protocol":"tcp","matchPort":4006,"targetPort":80},"tcp:4007:443":{"protocol":"tcp","matchPort":4007,"targetPort":443}}}}
}
A proxy that is configured with this config file will configure firewall rules
to route cluster traffic to the tailnet targets. It will then watch the config file
for updates as well as monitor relevant netmap updates and reconfigure firewall
as needed.
This adds a bunch of new iptables/nftables functionality to make it easier to dynamically update
the firewall rules without needing to restart the proxy Pod as well as to make
it easier to debug/understand the rules:
- for iptables, each portmapping is a DNAT rule with a comment pointing
at the 'service',i.e:
-A PREROUTING ! -i tailscale0 -p tcp -m tcp --dport 4006 -m comment --comment "some-svc:tcp:4006 -> tcp:80" -j DNAT --to-destination 100.64.1.18:80
Additionally there is a SNAT rule for each tailnet target, to mask the source address.
- for nftables, a separate prerouting chain is created for each tailnet target
and all the portmapping rules are placed in that chain. This makes it easier
to look up rules and delete services when no longer needed.
(nftables allows hooking a custom chain to a prerouting hook, so no extra work
is needed to ensure that the rules in the service chains are evaluated).
The next steps will be to get the Kubernetes Operator to generate
the configfile and ensure it is mounted to the relevant proxy nodes.
Updates tailscale/tailscale#13406
Signed-off-by: Irbe Krumina <irbe@tailscale.com>
2024-09-29 16:30:53 +01:00
if egressSvcsNotify != nil {
egressSvcsNotify <- n
}
2022-12-07 12:29:45 -08:00
}
2023-11-16 20:23:18 +01:00
if ! startupTasksDone {
cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets (#13531)
* cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets
This commit is first part of the work to allow running multiple
replicas of the Kubernetes operator egress proxies per tailnet service +
to allow exposing multiple tailnet services via each proxy replica.
This expands the existing iptables/nftables-based proxy configuration
mechanism.
A proxy can now be configured to route to one or more tailnet targets
via a (mounted) config file that, for each tailnet target, specifies:
- the target's tailnet IP or FQDN
- mappings of container ports to which cluster workloads will send traffic to
tailnet target ports where the traffic should be forwarded.
Example configfile contents:
{
"some-svc": {"tailnetTarget":{"fqdn":"foo.tailnetxyz.ts.net","ports":{"tcp:4006:80":{"protocol":"tcp","matchPort":4006,"targetPort":80},"tcp:4007:443":{"protocol":"tcp","matchPort":4007,"targetPort":443}}}}
}
A proxy that is configured with this config file will configure firewall rules
to route cluster traffic to the tailnet targets. It will then watch the config file
for updates as well as monitor relevant netmap updates and reconfigure firewall
as needed.
This adds a bunch of new iptables/nftables functionality to make it easier to dynamically update
the firewall rules without needing to restart the proxy Pod as well as to make
it easier to debug/understand the rules:
- for iptables, each portmapping is a DNAT rule with a comment pointing
at the 'service',i.e:
-A PREROUTING ! -i tailscale0 -p tcp -m tcp --dport 4006 -m comment --comment "some-svc:tcp:4006 -> tcp:80" -j DNAT --to-destination 100.64.1.18:80
Additionally there is a SNAT rule for each tailnet target, to mask the source address.
- for nftables, a separate prerouting chain is created for each tailnet target
and all the portmapping rules are placed in that chain. This makes it easier
to look up rules and delete services when no longer needed.
(nftables allows hooking a custom chain to a prerouting hook, so no extra work
is needed to ensure that the rules in the service chains are evaluated).
The next steps will be to get the Kubernetes Operator to generate
the configfile and ensure it is mounted to the relevant proxy nodes.
Updates tailscale/tailscale#13406
Signed-off-by: Irbe Krumina <irbe@tailscale.com>
2024-09-29 16:30:53 +01:00
// For containerboot instances that act as TCP proxies (proxying traffic to an endpoint
// passed via one of the env vars that containerboot reads) and store state in a
// Kubernetes Secret, we consider startup tasks done at the point when device info has
// been successfully stored to state Secret. For all other containerboot instances, if
// we just get to this point the startup tasks can be considered done.
2024-06-17 18:50:50 +01:00
if ! isL3Proxy ( cfg ) || ! hasKubeStateStore ( cfg ) || ( currentDeviceEndpoints != deephash . Sum { } && currentDeviceID != deephash . Sum { } ) {
2023-11-16 20:23:18 +01:00
// This log message is used in tests to detect when all
// post-auth configuration is done.
log . Println ( "Startup complete, waiting for shutdown signal" )
startupTasksDone = true
cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets (#13531)
* cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets
This commit is first part of the work to allow running multiple
replicas of the Kubernetes operator egress proxies per tailnet service +
to allow exposing multiple tailnet services via each proxy replica.
This expands the existing iptables/nftables-based proxy configuration
mechanism.
A proxy can now be configured to route to one or more tailnet targets
via a (mounted) config file that, for each tailnet target, specifies:
- the target's tailnet IP or FQDN
- mappings of container ports to which cluster workloads will send traffic to
tailnet target ports where the traffic should be forwarded.
Example configfile contents:
{
"some-svc": {"tailnetTarget":{"fqdn":"foo.tailnetxyz.ts.net","ports":{"tcp:4006:80":{"protocol":"tcp","matchPort":4006,"targetPort":80},"tcp:4007:443":{"protocol":"tcp","matchPort":4007,"targetPort":443}}}}
}
A proxy that is configured with this config file will configure firewall rules
to route cluster traffic to the tailnet targets. It will then watch the config file
for updates as well as monitor relevant netmap updates and reconfigure firewall
as needed.
This adds a bunch of new iptables/nftables functionality to make it easier to dynamically update
the firewall rules without needing to restart the proxy Pod as well as to make
it easier to debug/understand the rules:
- for iptables, each portmapping is a DNAT rule with a comment pointing
at the 'service',i.e:
-A PREROUTING ! -i tailscale0 -p tcp -m tcp --dport 4006 -m comment --comment "some-svc:tcp:4006 -> tcp:80" -j DNAT --to-destination 100.64.1.18:80
Additionally there is a SNAT rule for each tailnet target, to mask the source address.
- for nftables, a separate prerouting chain is created for each tailnet target
and all the portmapping rules are placed in that chain. This makes it easier
to look up rules and delete services when no longer needed.
(nftables allows hooking a custom chain to a prerouting hook, so no extra work
is needed to ensure that the rules in the service chains are evaluated).
The next steps will be to get the Kubernetes Operator to generate
the configfile and ensure it is mounted to the relevant proxy nodes.
Updates tailscale/tailscale#13406
Signed-off-by: Irbe Krumina <irbe@tailscale.com>
2024-09-29 16:30:53 +01:00
// Configure egress proxy. Egress proxy will set up firewall rules to proxy
// traffic to tailnet targets configured in the provided configuration file. It
// will then continuously monitor the config file and netmap updates and
// reconfigure the firewall rules as needed. If any of its operations fail, it
// will crash this node.
if cfg . EgressSvcsCfgPath != "" {
log . Printf ( "configuring egress proxy using configuration file at %s" , cfg . EgressSvcsCfgPath )
egressSvcsNotify = make ( chan ipn . Notify )
ep := egressProxy {
cfgPath : cfg . EgressSvcsCfgPath ,
nfr : nfr ,
kc : kc ,
stateSecret : cfg . KubeSecret ,
netmapChan : egressSvcsNotify ,
2024-10-08 18:35:23 +01:00
podIPv4 : cfg . PodIPv4 ,
cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets (#13531)
* cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets
This commit is first part of the work to allow running multiple
replicas of the Kubernetes operator egress proxies per tailnet service +
to allow exposing multiple tailnet services via each proxy replica.
This expands the existing iptables/nftables-based proxy configuration
mechanism.
A proxy can now be configured to route to one or more tailnet targets
via a (mounted) config file that, for each tailnet target, specifies:
- the target's tailnet IP or FQDN
- mappings of container ports to which cluster workloads will send traffic to
tailnet target ports where the traffic should be forwarded.
Example configfile contents:
{
"some-svc": {"tailnetTarget":{"fqdn":"foo.tailnetxyz.ts.net","ports":{"tcp:4006:80":{"protocol":"tcp","matchPort":4006,"targetPort":80},"tcp:4007:443":{"protocol":"tcp","matchPort":4007,"targetPort":443}}}}
}
A proxy that is configured with this config file will configure firewall rules
to route cluster traffic to the tailnet targets. It will then watch the config file
for updates as well as monitor relevant netmap updates and reconfigure firewall
as needed.
This adds a bunch of new iptables/nftables functionality to make it easier to dynamically update
the firewall rules without needing to restart the proxy Pod as well as to make
it easier to debug/understand the rules:
- for iptables, each portmapping is a DNAT rule with a comment pointing
at the 'service',i.e:
-A PREROUTING ! -i tailscale0 -p tcp -m tcp --dport 4006 -m comment --comment "some-svc:tcp:4006 -> tcp:80" -j DNAT --to-destination 100.64.1.18:80
Additionally there is a SNAT rule for each tailnet target, to mask the source address.
- for nftables, a separate prerouting chain is created for each tailnet target
and all the portmapping rules are placed in that chain. This makes it easier
to look up rules and delete services when no longer needed.
(nftables allows hooking a custom chain to a prerouting hook, so no extra work
is needed to ensure that the rules in the service chains are evaluated).
The next steps will be to get the Kubernetes Operator to generate
the configfile and ensure it is mounted to the relevant proxy nodes.
Updates tailscale/tailscale#13406
Signed-off-by: Irbe Krumina <irbe@tailscale.com>
2024-09-29 16:30:53 +01:00
tailnetAddrs : addrs ,
}
go func ( ) {
if err := ep . run ( ctx , n ) ; err != nil {
egressSvcsErrorChan <- err
}
} ( )
}
// Wait on tailscaled process. It won't be cleaned up by default when the
// container exits as it is not PID1. TODO (irbekrm): perhaps we can replace the
// reaper by a running cmd.Wait in a goroutine immediately after starting
// tailscaled?
2023-11-16 20:23:18 +01:00
reaper := func ( ) {
defer wg . Done ( )
for {
var status unix . WaitStatus
2024-04-27 20:28:09 +01:00
_ , err := unix . Wait4 ( daemonProcess . Pid , & status , 0 , nil )
2023-11-16 20:23:18 +01:00
if errors . Is ( err , unix . EINTR ) {
continue
}
if err != nil {
2024-04-27 20:28:09 +01:00
log . Fatalf ( "Waiting for tailscaled to exit: %v" , err )
2023-11-16 20:23:18 +01:00
}
2024-04-27 20:28:09 +01:00
log . Print ( "tailscaled exited" )
os . Exit ( 0 )
2022-12-07 12:29:45 -08:00
}
}
2023-11-16 20:23:18 +01:00
wg . Add ( 1 )
go reaper ( )
}
2022-12-07 12:29:45 -08:00
}
2024-04-23 17:30:00 +01:00
case <- tc :
newBackendAddrs , err := resolveDNS ( ctx , cfg . ProxyTargetDNSName )
if err != nil {
log . Printf ( "[unexpected] error resolving DNS name %s: %v" , cfg . ProxyTargetDNSName , err )
resetTimer ( true )
continue
}
backendsHaveChanged := ! ( slices . EqualFunc ( backendAddrs , newBackendAddrs , func ( ip1 net . IP , ip2 net . IP ) bool {
return slices . ContainsFunc ( newBackendAddrs , func ( ip net . IP ) bool { return ip . Equal ( ip1 ) } )
} ) )
if backendsHaveChanged && len ( addrs ) != 0 {
log . Printf ( "Backend address change detected, installing proxy rules for backends %v" , newBackendAddrs )
if err := installIngressForwardingRuleForDNSTarget ( ctx , newBackendAddrs , addrs , nfr ) ; err != nil {
log . Fatalf ( "installing ingress proxy rules for DNS target %s: %v" , cfg . ProxyTargetDNSName , err )
}
}
backendAddrs = newBackendAddrs
resetTimer ( false )
cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets (#13531)
* cmd/containerboot,kube,util/linuxfw: configure kube egress proxies to route to 1+ tailnet targets
This commit is first part of the work to allow running multiple
replicas of the Kubernetes operator egress proxies per tailnet service +
to allow exposing multiple tailnet services via each proxy replica.
This expands the existing iptables/nftables-based proxy configuration
mechanism.
A proxy can now be configured to route to one or more tailnet targets
via a (mounted) config file that, for each tailnet target, specifies:
- the target's tailnet IP or FQDN
- mappings of container ports to which cluster workloads will send traffic to
tailnet target ports where the traffic should be forwarded.
Example configfile contents:
{
"some-svc": {"tailnetTarget":{"fqdn":"foo.tailnetxyz.ts.net","ports":{"tcp:4006:80":{"protocol":"tcp","matchPort":4006,"targetPort":80},"tcp:4007:443":{"protocol":"tcp","matchPort":4007,"targetPort":443}}}}
}
A proxy that is configured with this config file will configure firewall rules
to route cluster traffic to the tailnet targets. It will then watch the config file
for updates as well as monitor relevant netmap updates and reconfigure firewall
as needed.
This adds a bunch of new iptables/nftables functionality to make it easier to dynamically update
the firewall rules without needing to restart the proxy Pod as well as to make
it easier to debug/understand the rules:
- for iptables, each portmapping is a DNAT rule with a comment pointing
at the 'service',i.e:
-A PREROUTING ! -i tailscale0 -p tcp -m tcp --dport 4006 -m comment --comment "some-svc:tcp:4006 -> tcp:80" -j DNAT --to-destination 100.64.1.18:80
Additionally there is a SNAT rule for each tailnet target, to mask the source address.
- for nftables, a separate prerouting chain is created for each tailnet target
and all the portmapping rules are placed in that chain. This makes it easier
to look up rules and delete services when no longer needed.
(nftables allows hooking a custom chain to a prerouting hook, so no extra work
is needed to ensure that the rules in the service chains are evaluated).
The next steps will be to get the Kubernetes Operator to generate
the configfile and ensure it is mounted to the relevant proxy nodes.
Updates tailscale/tailscale#13406
Signed-off-by: Irbe Krumina <irbe@tailscale.com>
2024-09-29 16:30:53 +01:00
case e := <- egressSvcsErrorChan :
log . Fatalf ( "egress proxy failed: %v" , e )
2022-10-25 13:12:54 -07:00
}
}
2023-11-16 20:23:18 +01:00
wg . Wait ( )
2022-10-25 13:12:54 -07:00
}
// ensureTunFile checks that /dev/net/tun exists, creating it if
// missing.
2022-11-09 22:01:34 -08:00
func ensureTunFile ( root string ) error {
2022-10-25 13:12:54 -07:00
// Verify that /dev/net/tun exists, in some container envs it
// needs to be mknod-ed.
2022-11-09 22:01:34 -08:00
if _ , err := os . Stat ( filepath . Join ( root , "dev/net" ) ) ; errors . Is ( err , fs . ErrNotExist ) {
if err := os . MkdirAll ( filepath . Join ( root , "dev/net" ) , 0755 ) ; err != nil {
2022-10-25 13:12:54 -07:00
return err
}
}
2022-11-09 22:01:34 -08:00
if _ , err := os . Stat ( filepath . Join ( root , "dev/net/tun" ) ) ; errors . Is ( err , fs . ErrNotExist ) {
2022-10-25 13:12:54 -07:00
dev := unix . Mkdev ( 10 , 200 ) // tuntap major and minor
2022-11-09 22:01:34 -08:00
if err := unix . Mknod ( filepath . Join ( root , "dev/net/tun" ) , 0600 | unix . S_IFCHR , int ( dev ) ) ; err != nil {
2022-10-25 13:12:54 -07:00
return err
}
}
return nil
}
2024-04-23 17:30:00 +01:00
// resolveDNS looks up the IPv4 and IPv6 addresses of name using the default
// resolver and returns them in a single slice, IPv4 addresses first.
//
// A "not found" answer for one address family is tolerated as long as the
// other family yields at least one address. Any other lookup failure is
// returned wrapped, so callers can inspect the cause with errors.Is and
// errors.As. An error is also returned when neither family has any address.
func resolveDNS(ctx context.Context, name string) ([]net.IP, error) {
	// notFound reports whether err is, or wraps, a DNS error saying that the
	// name has no records. errors.As is used rather than a direct type
	// assertion so that wrapped resolver errors are recognised as well.
	notFound := func(err error) bool {
		var dnsErr *net.DNSError
		return errors.As(err, &dnsErr) && dnsErr.IsNotFound
	}

	// TODO (irbekrm): look at using recursive.Resolver instead to resolve
	// the DNS names as well as retrieve TTLs. It looks though that this
	// seems to return very short TTLs (shorter than on the actual records).
	ip4s, err := net.DefaultResolver.LookupIP(ctx, "ip4", name)
	if err != nil && !notFound(err) {
		return nil, fmt.Errorf("error looking up IPv4 addresses: %w", err)
	}
	ip6s, err := net.DefaultResolver.LookupIP(ctx, "ip6", name)
	if err != nil && !notFound(err) {
		return nil, fmt.Errorf("error looking up IPv6 addresses: %w", err)
	}
	if len(ip4s) == 0 && len(ip6s) == 0 {
		return nil, fmt.Errorf("no IPv4 or IPv6 addresses found for host: %s", name)
	}
	return append(ip4s, ip6s...), nil
}
2023-11-16 20:23:18 +01:00
// contextWithExitSignalWatch watches for SIGTERM/SIGINT signals. It returns a
// context that gets cancelled when a signal is received and a cancel function
// that can be called to free the resources when the watch should be stopped.
//
// The returned function is safe to call more than once and never blocks,
// including when it is called after a signal has already been received.
// Calling it stops the signal watch but does not cancel the returned context;
// only a received signal does that.
func contextWithExitSignalWatch() (context.Context, func()) {
	ctx, cancel := context.WithCancel(context.Background())

	// Buffered so that a signal arriving before the goroutine below is
	// scheduled is not dropped by the signal package.
	signalChan := make(chan os.Signal, 1)
	signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM)

	// closeChan is closed, never sent on, so stopping the watch cannot block
	// even if the watcher goroutine has already exited.
	closeChan := make(chan struct{})
	go func() {
		select {
		case <-signalChan:
			cancel()
		case <-closeChan:
		}
	}()

	var stopOnce sync.Once
	f := func() {
		stopOnce.Do(func() {
			// Release the signal registration so that signalChan stops
			// receiving SIGINT/SIGTERM once nothing is listening on it.
			signal.Stop(signalChan)
			close(closeChan)
		})
	}
	return ctx, f
}
2024-01-08 16:14:06 +00:00
2024-05-10 16:32:37 +01:00
// tailscaledConfigFilePath returns the path to the tailscaled config file that
// should be used for the current capability version. It is determined by the
// TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR environment variable and looks for a
// file named cap-<capability_version>.hujson in the directory. It searches for
// the highest capability version that is less than or equal to the current
// capability version.
func tailscaledConfigFilePath ( ) string {
dir := os . Getenv ( "TS_EXPERIMENTAL_VERSIONED_CONFIG_DIR" )
if dir == "" {
return ""
}
fe , err := os . ReadDir ( dir )
if err != nil {
log . Fatalf ( "error reading tailscaled config directory %q: %v" , dir , err )
}
maxCompatVer := tailcfg . CapabilityVersion ( - 1 )
for _ , e := range fe {
// We don't check if type if file as in most cases this will
// come from a mounted kube Secret, where the directory contents
// will be various symlinks.
if e . Type ( ) . IsDir ( ) {
continue
}
cv , err := kubeutils . CapVerFromFileName ( e . Name ( ) )
if err != nil {
continue
}
if cv > maxCompatVer && cv <= tailcfg . CurrentCapabilityVersion {
maxCompatVer = cv
}
}
if maxCompatVer == - 1 {
2024-11-12 17:13:26 +00:00
log . Fatalf ( "no tailscaled config file found in %q for current capability version %d" , dir , tailcfg . CurrentCapabilityVersion )
2024-05-10 16:32:37 +01:00
}
2024-11-12 17:13:26 +00:00
filePath := filepath . Join ( dir , kubeutils . TailscaledConfigFileName ( maxCompatVer ) )
log . Printf ( "Using tailscaled config file %q to match current capability version %d" , filePath , tailcfg . CurrentCapabilityVersion )
return filePath
2024-05-10 16:32:37 +01:00
}
2024-12-02 12:18:09 +00:00
// runHTTPServer starts an HTTP server that serves mux on the TCP address addr
// in a background goroutine and returns a function that shuts the server down.
//
// The process exits if the address cannot be listened on, or if the server
// stops for any reason other than a shutdown requested via the returned
// function. The returned function waits for the server to shut down and
// reports any error from the shutdown or from closing the listener.
func runHTTPServer(mux *http.ServeMux, addr string) (close func() error) {
	ln, err := net.Listen("tcp", addr)
	if err != nil {
		log.Fatalf("failed to listen on addr %q: %v", addr, err)
	}
	srv := &http.Server{Handler: mux}

	go func() {
		// Serve always returns a non-nil error. http.ErrServerClosed only
		// means that Shutdown was called, which is the expected way for
		// this server to stop and must not take the process down.
		if err := srv.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) {
			log.Fatalf("failed running server: %v", err)
		}
	}()

	return func() error {
		err := srv.Shutdown(context.Background())
		// Shutdown closes the listener once Serve has registered it, so a
		// "use of closed" error here is expected. The explicit Close is
		// kept for the case where Shutdown ran before Serve did.
		if closeErr := ln.Close(); closeErr != nil && !errors.Is(closeErr, net.ErrClosed) {
			err = errors.Join(err, closeErr)
		}
		return err
	}
}