all-kube: create Tailscale Service for HA kube-apiserver ProxyGroup (#16572)

Adds a new reconciler for ProxyGroups of type kube-apiserver that will
provision a Tailscale Service that each replica advertises. Adds two
new condition types to the ProxyGroup, TailscaleServiceValid and
TailscaleServiceConfigured, to post updates on the state of that
reconciler in a way that's consistent with the service-pg reconciler.
The created Tailscale Service name is configurable via a new ProxyGroup
field spec.kubeAPIServer.ServiceName, which expects a string of the
form "svc:<dns-label>".

Lots of supporting changes were needed to implement this in a way that's
consistent with other operator workflows, including:

* Pulled containerboot's ensureServicesUnadvertised and certManager into
  kube/ libraries so they can be shared with k8s-proxy, which now uses
  them for Service cert sharing between replicas and graceful Service
  shutdown.
* For certManager, added an initial wait to the cert loop until the
  domain appears in the device's netmap, to avoid a guaranteed error on
  the first issuance attempt when the proxy starts quickly (sketched
  after this list).
* Made several methods in ingress-for-pg.go and svc-for-pg.go into
  standalone functions so they can be shared with the new reconciler.
* Added a Resource struct to the owner refs stored in Tailscale Service
  annotations so cleanup can distinguish between Ingress- and ProxyGroup-
  based Services that need cleaning up in the Tailscale API (sketched
  after this list).
* Added a ListVIPServices method to the internal tailscale client to aid
  cleaning up orphaned Services.
* Added support for reading config from a kube Secret, and partial
  support for config reloading, to avoid having to force Pod restarts
  when config changes (sketched after this list).
* Fixed up the zap logger so it's possible to set the debug log level
  (sketched after this list).
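
The initial netmap wait for certManager could look roughly like the
sketch below. It is a hedged approximation, not the shared kube/ library
code: the real change waits on the device's netmap, while this sketch
polls a LocalAPI status call (getStatus is a stand-in) for the cert
domain.

package certs

import (
	"context"
	"slices"
	"time"

	"tailscale.com/ipn/ipnstate"
)

// waitForCertDomain blocks until domain shows up in the device's cert
// domains, so the first issuance attempt doesn't fail when the proxy has
// only just started.
func waitForCertDomain(ctx context.Context, domain string, getStatus func(context.Context) (*ipnstate.Status, error)) error {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		if st, err := getStatus(ctx); err == nil && slices.Contains(st.CertDomains, domain) {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}
	}
}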
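
The Resource addition to the stored owner refs could be shaped roughly
as below. This is a sketch with illustrative names and JSON keys, not the
operator's actual annotation schema; the point is that cleanup can now
tell which kind of resource created a given Tailscale Service.

package sketch

import "encoding/json"

// Resource identifies the Kubernetes resource a Tailscale Service was
// created for, e.g. an Ingress or a ProxyGroup.
type Resource struct {
	Kind string `json:"kind"`
	Name string `json:"name"`
	UID  string `json:"uid"`
}

// OwnerRef is one entry in the owner refs stored in the Tailscale Service
// annotations.
type OwnerRef struct {
	OperatorID string    `json:"operatorID"`
	Resource   *Resource `json:"resource,omitempty"`
}

// ownerAnnotation marshals the refs into the annotation value.
func ownerAnnotation(refs []OwnerRef) (string, error) {
	b, err := json.Marshal(refs)
	return string(b), err
}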
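
The partial config reloading could follow a pattern like the one below.
This is a hedged sketch, not k8s-proxy's implementation: getConfig stands
in for however the config Secret is read, and apply is expected to handle
only the fields that can change at runtime.

package sketch

import (
	"context"
	"crypto/sha256"
	"time"
)

// reloadConfig periodically re-reads the config and applies it only when
// its content changes, so supported changes take effect without a Pod
// restart.
func reloadConfig(ctx context.Context, getConfig func() ([]byte, error), apply func([]byte) error) error {
	var last [sha256.Size]byte
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()
	for {
		if b, err := getConfig(); err == nil {
			if sum := sha256.Sum256(b); sum != last {
				if err := apply(b); err != nil {
					return err
				}
				last = sum
			}
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}
	}
}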
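
The zap fix enables the standard pattern below for choosing the log level
at startup. This is a generic zap sketch, not the operator's exact wiring.

package logging

import (
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

// newLogger builds a production zap logger whose level can be lowered to
// debug via an AtomicLevel wired into the config.
func newLogger(debug bool) (*zap.Logger, error) {
	lvl := zap.NewAtomicLevelAt(zapcore.InfoLevel)
	if debug {
		lvl.SetLevel(zapcore.DebugLevel)
	}
	cfg := zap.NewProductionConfig()
	cfg.Level = lvl
	return cfg.Build()
}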

Updates #13358

Change-Id: Ia9607441157dd91fb9b6ecbc318eecbef446e116
Signed-off-by: Tom Proctor <tomhjp@users.noreply.github.com>
commit f421907c38 (parent 5adde9e3f3), committed by GitHub
Tom Proctor, 2025-07-21 11:03:21 +01:00
39 changed files with 2551 additions and 397 deletions

@@ -11,11 +11,13 @@
 package state
 
 import (
+	"context"
 	"encoding/json"
 	"fmt"
 
 	"tailscale.com/ipn"
 	"tailscale.com/kube/kubetypes"
+	klc "tailscale.com/kube/localclient"
 	"tailscale.com/tailcfg"
 	"tailscale.com/util/deephash"
 )
@@ -56,12 +58,20 @@ func SetInitialKeys(store ipn.StateStore, podUID string) error {
 // cancelled or it hits an error. The passed in next function is expected to be
 // from a local.IPNBusWatcher that is at least subscribed to
 // ipn.NotifyInitialNetMap.
-func KeepKeysUpdated(store ipn.StateStore, next func() (ipn.Notify, error)) error {
-	var currentDeviceID, currentDeviceIPs, currentDeviceFQDN deephash.Sum
+func KeepKeysUpdated(ctx context.Context, store ipn.StateStore, lc klc.LocalClient) error {
+	w, err := lc.WatchIPNBus(ctx, ipn.NotifyInitialNetMap)
+	if err != nil {
+		return fmt.Errorf("error watching IPN bus: %w", err)
+	}
+	defer w.Close()
+	var currentDeviceID, currentDeviceIPs, currentDeviceFQDN deephash.Sum
 	for {
-		n, err := next() // Blocks on a streaming LocalAPI HTTP call.
+		n, err := w.Next() // Blocks on a streaming LocalAPI HTTP call.
 		if err != nil {
+			if err == ctx.Err() {
+				return nil
+			}
 			return err
 		}
 
 		if n.NetMap == nil {

@@ -15,6 +15,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"tailscale.com/ipn"
 	"tailscale.com/ipn/store"
+	klc "tailscale.com/kube/localclient"
 	"tailscale.com/tailcfg"
 	"tailscale.com/types/logger"
 	"tailscale.com/types/netmap"
@@ -100,24 +101,20 @@ func TestSetInitialStateKeys(t *testing.T) {
 }
 
 func TestKeepStateKeysUpdated(t *testing.T) {
-	store, err := store.New(logger.Discard, "mem:")
-	if err != nil {
-		t.Fatalf("error creating in-memory store: %v", err)
+	store := fakeStore{
+		writeChan: make(chan string),
 	}
 
-	nextWaiting := make(chan struct{})
-	go func() {
-		<-nextWaiting // Acknowledge the initial signal.
-	}()
-	notifyCh := make(chan ipn.Notify)
-	next := func() (ipn.Notify, error) {
-		nextWaiting <- struct{}{} // Send signal to test that state is consistent.
-		return <-notifyCh, nil // Wait for test input.
+	errs := make(chan error)
+	notifyChan := make(chan ipn.Notify)
+	lc := &klc.FakeLocalClient{
+		FakeIPNBusWatcher: klc.FakeIPNBusWatcher{
+			NotifyChan: notifyChan,
+		},
 	}
 
-	errs := make(chan error, 1)
 	go func() {
-		err := KeepKeysUpdated(store, next)
+		err := KeepKeysUpdated(t.Context(), store, lc)
 		if err != nil {
 			errs <- fmt.Errorf("keepStateKeysUpdated returned with error: %w", err)
 		}
@@ -126,16 +123,12 @@ func TestKeepStateKeysUpdated(t *testing.T) {
 	for _, tc := range []struct {
 		name     string
 		notify   ipn.Notify
-		expected map[ipn.StateKey][]byte
+		expected []string
 	}{
 		{
-			name:   "initial_not_authed",
-			notify: ipn.Notify{},
-			expected: map[ipn.StateKey][]byte{
-				keyDeviceID:   nil,
-				keyDeviceFQDN: nil,
-				keyDeviceIPs:  nil,
-			},
+			name:     "initial_not_authed",
+			notify:   ipn.Notify{},
+			expected: nil,
 		},
 		{
 			name: "authed",
@@ -148,10 +141,10 @@ func TestKeepStateKeysUpdated(t *testing.T) {
 					}).View(),
 				},
 			},
-			expected: map[ipn.StateKey][]byte{
-				keyDeviceID:   []byte("TESTCTRL00000001"),
-				keyDeviceFQDN: []byte("test-node.test.ts.net"),
-				keyDeviceIPs:  []byte(`["100.64.0.1","fd7a:115c:a1e0:ab12:4843:cd96:0:1"]`),
+			expected: []string{
+				fmt.Sprintf("%s=%s", keyDeviceID, "TESTCTRL00000001"),
+				fmt.Sprintf("%s=%s", keyDeviceFQDN, "test-node.test.ts.net"),
+				fmt.Sprintf("%s=%s", keyDeviceIPs, `["100.64.0.1","fd7a:115c:a1e0:ab12:4843:cd96:0:1"]`),
 			},
 		},
 		{
@@ -165,39 +158,39 @@ func TestKeepStateKeysUpdated(t *testing.T) {
 					}).View(),
 				},
 			},
-			expected: map[ipn.StateKey][]byte{
-				keyDeviceID:   []byte("TESTCTRL00000001"),
-				keyDeviceFQDN: []byte("updated.test.ts.net"),
-				keyDeviceIPs:  []byte(`["100.64.0.250"]`),
+			expected: []string{
+				fmt.Sprintf("%s=%s", keyDeviceFQDN, "updated.test.ts.net"),
+				fmt.Sprintf("%s=%s", keyDeviceIPs, `["100.64.0.250"]`),
 			},
 		},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
-			// Send test input.
-			select {
-			case notifyCh <- tc.notify:
-			case <-errs:
-				t.Fatal("keepStateKeysUpdated returned before test input")
-			case <-time.After(5 * time.Second):
-				t.Fatal("timed out waiting for next() to be called again")
-			}
-
-			// Wait for next() to be called again so we know the goroutine has
-			// processed the event.
-			select {
-			case <-nextWaiting:
-			case <-errs:
-				t.Fatal("keepStateKeysUpdated returned before test input")
-			case <-time.After(5 * time.Second):
-				t.Fatal("timed out waiting for next() to be called again")
-			}
-
-			for key, value := range tc.expected {
-				got, _ := store.ReadState(key)
-				if !bytes.Equal(got, value) {
-					t.Errorf("state key %q mismatch: expected %q, got %q", key, value, got)
+			notifyChan <- tc.notify
+			for _, expected := range tc.expected {
+				select {
+				case got := <-store.writeChan:
+					if got != expected {
+						t.Errorf("expected %q, got %q", expected, got)
+					}
+				case err := <-errs:
+					t.Fatalf("unexpected error: %v", err)
+				case <-time.After(5 * time.Second):
+					t.Fatalf("timed out waiting for expected write %q", expected)
 				}
 			}
 		})
 	}
 }
+
+type fakeStore struct {
+	writeChan chan string
+}
+
+func (f fakeStore) ReadState(key ipn.StateKey) ([]byte, error) {
+	return nil, fmt.Errorf("ReadState not implemented")
+}
+
+func (f fakeStore) WriteState(key ipn.StateKey, value []byte) error {
+	f.writeChan <- fmt.Sprintf("%s=%s", key, value)
+	return nil
+}