mirror of
https://github.com/tailscale/tailscale.git
synced 2025-04-22 08:51:41 +00:00

Ensure no services are advertised as part of shutting down tailscaled. Prefs are only edited if services are currently advertised, and if they're edited we wait for control's ~15s (+ buffer) delay to failover. Note that editing prefs will trigger a synchronous write to the state Secret, so it may fail to persist state if the ProxyGroup is getting scaled down and therefore has its RBAC deleted at the same time, but that failure doesn't stop prefs being updated within the local backend, doesn't affect connectivity to control, and the state Secret is about to get deleted anyway, so the only negative side effect is a harmless error log during shutdown. Control still learns that the node is no longer advertising the service and triggers the failover. Note that the first version of this used a PreStop lifecycle hook, but that only supports GET methods and we need the shutdown to trigger side effects (updating prefs) so it didn't seem appropriate to expose that functionality on a GET endpoint that's accessible on the k8s network. Updates tailscale/corp#24795 Change-Id: I0a9a4fe7a5395ca76135ceead05cbc3ee32b3d3c Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
216 lines
6.5 KiB
Go
216 lines
6.5 KiB
Go
// Copyright (c) Tailscale Inc & AUTHORS
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
//go:build linux
|
|
|
|
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"reflect"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/fsnotify/fsnotify"
|
|
"tailscale.com/client/local"
|
|
"tailscale.com/ipn"
|
|
"tailscale.com/kube/kubetypes"
|
|
"tailscale.com/types/netmap"
|
|
)
|
|
|
|
// watchServeConfigChanges watches path for changes, and when it sees one, reads
|
|
// the serve config from it, replacing ${TS_CERT_DOMAIN} with certDomain, and
|
|
// applies it to lc. It exits when ctx is canceled. cdChanged is a channel that
|
|
// is written to when the certDomain changes, causing the serve config to be
|
|
// re-read and applied.
|
|
// watchServeConfigChanges watches path for changes, and when it sees one, reads
// the serve config from it, replacing ${TS_CERT_DOMAIN} with certDomain, and
// applies it to lc. It exits when ctx is canceled. cdChanged is a channel that
// is written to when the certDomain changes, causing the serve config to be
// re-read and applied.
func watchServeConfigChanges(ctx context.Context, cdChanged <-chan bool, certDomainAtomic *atomic.Pointer[string], lc *local.Client, kc *kubeClient, cfg *settings) {
	if certDomainAtomic == nil {
		panic("certDomainAtomic must not be nil")
	}

	// Prefer fsnotify events; fall back to polling on a 5s ticker if a
	// watcher can't be created. Exactly one of tickChan/eventChan ends up
	// non-nil, so the select below has a single periodic wake-up source.
	var tickChan <-chan time.Time
	var eventChan <-chan fsnotify.Event
	if w, err := fsnotify.NewWatcher(); err != nil {
		// Creating a new fsnotify watcher would fail for example if inotify was not able to create a new file descriptor.
		// See https://github.com/tailscale/tailscale/issues/15081
		log.Printf("serve proxy: failed to create fsnotify watcher, timer-only mode: %v", err)
		ticker := time.NewTicker(5 * time.Second)
		defer ticker.Stop()
		tickChan = ticker.C
	} else {
		defer w.Close()
		// Watch the parent directory rather than the file itself; see the
		// comment on the eventChan case below about how k8s handles these
		// mounts.
		if err := w.Add(filepath.Dir(cfg.ServeConfigPath)); err != nil {
			log.Fatalf("serve proxy: failed to add fsnotify watch: %v", err)
		}
		eventChan = w.Events
	}

	var certDomain string
	var prevServeConfig *ipn.ServeConfig
	var cm certManager
	if cfg.CertShareMode == "rw" {
		cm = certManager{
			lc: lc,
		}
	}
	for {
		select {
		case <-ctx.Done():
			return
		case <-cdChanged:
			certDomain = *certDomainAtomic.Load()
		case <-tickChan:
		case <-eventChan:
			// We can't do any reasonable filtering on the event because of how
			// k8s handles these mounts. So just re-read the file and apply it
			// if it's changed.
		}
		sc, err := readServeConfig(cfg.ServeConfigPath, certDomain)
		if err != nil {
			log.Fatalf("serve proxy: failed to read serve config: %v", err)
		}
		if sc == nil {
			log.Printf("serve proxy: no serve config at %q, skipping", cfg.ServeConfigPath)
			continue
		}
		// Skip the apply entirely if nothing changed since the last
		// successfully applied config.
		if prevServeConfig != nil && reflect.DeepEqual(sc, prevServeConfig) {
			continue
		}
		if err := updateServeConfig(ctx, sc, certDomain, lc); err != nil {
			log.Fatalf("serve proxy: error updating serve config: %v", err)
		}
		// Record the HTTPS endpoint via the kube client, but only if we have
		// permission to patch (kc.canPatch).
		if kc != nil && kc.canPatch {
			if err := kc.storeHTTPSEndpoint(ctx, certDomain); err != nil {
				log.Fatalf("serve proxy: error storing HTTPS endpoint: %v", err)
			}
		}
		prevServeConfig = sc
		if cfg.CertShareMode != "rw" {
			continue
		}
		// In "rw" cert-share mode, make sure cert loops are running for the
		// domains in the serve config (presumably issuance/renewal — see
		// certManager for the details).
		if err := cm.ensureCertLoops(ctx, sc); err != nil {
			log.Fatalf("serve proxy: error ensuring cert loops: %v", err)
		}
	}
}
|
|
|
|
func certDomainFromNetmap(nm *netmap.NetworkMap) string {
|
|
if len(nm.DNS.CertDomains) == 0 {
|
|
return ""
|
|
}
|
|
return nm.DNS.CertDomains[0]
|
|
}
|
|
|
|
// localClient is a subset of [local.Client] that can be mocked for testing.
type localClient interface {
	// SetServeConfig mirrors [local.Client.SetServeConfig]: it applies the
	// given serve config to the local tailscaled.
	SetServeConfig(context.Context, *ipn.ServeConfig) error
	// CertPair mirrors [local.Client.CertPair], returning the cert and key
	// byte slices for the given domain.
	CertPair(context.Context, string) ([]byte, []byte, error)
}
|
|
|
|
func updateServeConfig(ctx context.Context, sc *ipn.ServeConfig, certDomain string, lc localClient) error {
|
|
if !isValidHTTPSConfig(certDomain, sc) {
|
|
return nil
|
|
}
|
|
log.Printf("serve proxy: applying serve config")
|
|
return lc.SetServeConfig(ctx, sc)
|
|
}
|
|
|
|
func isValidHTTPSConfig(certDomain string, sc *ipn.ServeConfig) bool {
|
|
if certDomain == kubetypes.ValueNoHTTPS && hasHTTPSEndpoint(sc) {
|
|
log.Printf(
|
|
`serve proxy: this node is configured as a proxy that exposes an HTTPS endpoint to tailnet,
|
|
(perhaps a Kubernetes operator Ingress proxy) but it is not able to issue TLS certs, so this will likely not work.
|
|
To make it work, ensure that HTTPS is enabled for your tailnet, see https://tailscale.com/kb/1153/enabling-https for more details.`)
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
func hasHTTPSEndpoint(cfg *ipn.ServeConfig) bool {
|
|
if cfg == nil {
|
|
return false
|
|
}
|
|
for _, tcpCfg := range cfg.TCP {
|
|
if tcpCfg.HTTPS {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// readServeConfig reads the ipn.ServeConfig from path, replacing
|
|
// ${TS_CERT_DOMAIN} with certDomain.
|
|
func readServeConfig(path, certDomain string) (*ipn.ServeConfig, error) {
|
|
if path == "" {
|
|
return nil, nil
|
|
}
|
|
j, err := os.ReadFile(path)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil, nil
|
|
}
|
|
return nil, err
|
|
}
|
|
// Serve config can be provided by users as well as the Kubernetes Operator (for its proxies). User-provided
|
|
// config could be empty for reasons.
|
|
if len(j) == 0 {
|
|
log.Printf("serve proxy: serve config file is empty, skipping")
|
|
return nil, nil
|
|
}
|
|
j = bytes.ReplaceAll(j, []byte("${TS_CERT_DOMAIN}"), []byte(certDomain))
|
|
var sc ipn.ServeConfig
|
|
if err := json.Unmarshal(j, &sc); err != nil {
|
|
return nil, err
|
|
}
|
|
return &sc, nil
|
|
}
|
|
|
|
func ensureServicesNotAdvertised(ctx context.Context, lc *local.Client) error {
|
|
prefs, err := lc.GetPrefs(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("error getting prefs: %w", err)
|
|
}
|
|
if len(prefs.AdvertiseServices) == 0 {
|
|
return nil
|
|
}
|
|
|
|
log.Printf("serve proxy: unadvertising services: %v", prefs.AdvertiseServices)
|
|
if _, err := lc.EditPrefs(ctx, &ipn.MaskedPrefs{
|
|
AdvertiseServicesSet: true,
|
|
Prefs: ipn.Prefs{
|
|
AdvertiseServices: nil,
|
|
},
|
|
}); err != nil {
|
|
// EditPrefs only returns an error if it fails _set_ its local prefs.
|
|
// If it fails to _persist_ the prefs in state, we don't get an error
|
|
// and we continue waiting below, as control will failover as usual.
|
|
return fmt.Errorf("error setting prefs AdvertiseServices: %w", err)
|
|
}
|
|
|
|
// Services use the same (failover XOR regional routing) mechanism that
|
|
// HA subnet routers use. Unfortunately we don't yet get a reliable signal
|
|
// from control that it's responded to our unadvertisement, so the best we
|
|
// can do is wait for 20 seconds, where 15s is the approximate maximum time
|
|
// it should take for control to choose a new primary, and 5s is for buffer.
|
|
//
|
|
// Note: There is no guarantee that clients have been _informed_ of the new
|
|
// primary no matter how long we wait. We would need a mechanism to await
|
|
// netmap updates for peers to know for sure.
|
|
//
|
|
// See https://tailscale.com/kb/1115/high-availability for more details.
|
|
// TODO(tomhjp): Wait for a netmap update instead of sleeping when control
|
|
// supports that.
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil
|
|
case <-time.After(20 * time.Second):
|
|
return nil
|
|
}
|
|
}
|