derp: increase update frequency and harden on failures (#2741)

This commit is contained in:
Kristoffer Dalby
2025-08-22 10:40:38 +02:00
committed by GitHub
parent 51c6367bb1
commit b87567628a
10 changed files with 417 additions and 67 deletions

View File

@@ -26,6 +26,7 @@ Please read the [PR description](https://github.com/juanfont/headscale/pull/2617
for more technical details about the issues and solutions.
**SQLite Database Backup Example:**
```bash
# Stop headscale
systemctl stop headscale
@@ -41,6 +42,13 @@ cp /var/lib/headscale/db.sqlite-shm /var/lib/headscale/db.sqlite-shm.backup
systemctl start headscale
```
### DERPMap update frequency
The default DERPMap update frequency has been changed from 24 hours to 3 hours.
If you have explicitly set the `derp.update_frequency` configuration option, it is recommended to change
it to `3h` so that the headscale instance picks up the latest DERPMap promptly when the
upstream map changes.
### BREAKING
- Remove support for 32-bit binaries
@@ -55,6 +63,11 @@ systemctl start headscale
- **IMPORTANT: Backup your SQLite database before upgrading**
- Introduces safer table renaming migration strategy
- Addresses longstanding database integrity issues
- DERPMap update frequency default changed from 24h to 3h
[#2741](https://github.com/juanfont/headscale/pull/2741)
- DERPMap update mechanism now retries failed fetches and fails conservatively:
if an update cannot be built, the previous map is kept.
[#2741](https://github.com/juanfont/headscale/pull/2741)
- Add support for `autogroup:member`, `autogroup:tagged`
[#2572](https://github.com/juanfont/headscale/pull/2572)
- Remove policy v1 code [#2600](https://github.com/juanfont/headscale/pull/2600)
@@ -72,7 +85,7 @@ systemctl start headscale
[#2643](https://github.com/juanfont/headscale/pull/2643)
- OIDC: Use group claim from UserInfo
[#2663](https://github.com/juanfont/headscale/pull/2663)
- OIDC: Update user with claims from UserInfo *before* comparing with allowed
- OIDC: Update user with claims from UserInfo _before_ comparing with allowed
groups, email and domain [#2663](https://github.com/juanfont/headscale/pull/2663)
## 0.26.1 (2025-06-06)

View File

@@ -128,7 +128,7 @@ derp:
auto_update_enabled: true
# How often should we check for DERP updates?
update_frequency: 24h
update_frequency: 3h
# Disables the automatic check for headscale updates on startup
disable_check_updates: false
@@ -275,7 +275,7 @@ dns:
# `hostname.base_domain` (e.g., _myhost.example.com_).
base_domain: example.com
# Whether to use the local DNS settings of a node or override the local DNS
# Whether to use the local DNS settings of a node or override the local DNS
# settings (default) and force the use of Headscale's DNS configuration.
override_local_dns: true
@@ -293,8 +293,7 @@ dns:
# Split DNS (see https://tailscale.com/kb/1054/dns/),
# a map of domains and which DNS server to use for each.
split:
{}
split: {}
# foo.bar.com:
# - 1.1.1.1
# darp.headscale.net:

View File

@@ -17,6 +17,7 @@ import (
"syscall"
"time"
"github.com/cenkalti/backoff/v5"
"github.com/davecgh/go-spew/spew"
"github.com/gorilla/mux"
grpcRuntime "github.com/grpc-ecosystem/grpc-gateway/v2/runtime"
@@ -284,11 +285,23 @@ func (h *Headscale) scheduledTasks(ctx context.Context) {
case <-derpTickerChan:
log.Info().Msg("Fetching DERPMap updates")
derpMap := derp.GetDERPMap(h.cfg.DERP)
if h.cfg.DERP.ServerEnabled && h.cfg.DERP.AutomaticallyAddEmbeddedDerpRegion {
region, _ := h.DERPServer.GenerateRegion()
derpMap.Regions[region.RegionID] = &region
derpMap, err := backoff.Retry(ctx, func() (*tailcfg.DERPMap, error) {
derpMap, err := derp.GetDERPMap(h.cfg.DERP)
if err != nil {
return nil, err
}
if h.cfg.DERP.ServerEnabled && h.cfg.DERP.AutomaticallyAddEmbeddedDerpRegion {
region, _ := h.DERPServer.GenerateRegion()
derpMap.Regions[region.RegionID] = &region
}
return derpMap, nil
}, backoff.WithBackOff(backoff.NewExponentialBackOff()))
if err != nil {
log.Error().Err(err).Msg("failed to build new DERPMap, retrying later")
continue
}
h.state.SetDERPMap(derpMap)
h.Change(change.DERPSet)
@@ -516,29 +529,31 @@ func (h *Headscale) Serve() error {
h.mapBatcher.Start()
defer h.mapBatcher.Close()
// TODO(kradalby): fix state part.
if h.cfg.DERP.ServerEnabled {
// When embedded DERP is enabled we always need a STUN server
if h.cfg.DERP.STUNAddr == "" {
return errSTUNAddressNotSet
}
region, err := h.DERPServer.GenerateRegion()
if err != nil {
return fmt.Errorf("generating DERP region for embedded server: %w", err)
}
if h.cfg.DERP.AutomaticallyAddEmbeddedDerpRegion {
h.state.DERPMap().Regions[region.RegionID] = &region
}
go h.DERPServer.ServeSTUN()
}
if len(h.state.DERPMap().Regions) == 0 {
derpMap, err := derp.GetDERPMap(h.cfg.DERP)
if err != nil {
return fmt.Errorf("failed to get DERPMap: %w", err)
}
if h.cfg.DERP.ServerEnabled && h.cfg.DERP.AutomaticallyAddEmbeddedDerpRegion {
region, _ := h.DERPServer.GenerateRegion()
derpMap.Regions[region.RegionID] = &region
}
if len(derpMap.Regions) == 0 {
return errEmptyInitialDERPMap
}
h.state.SetDERPMap(derpMap)
// Start ephemeral node garbage collector and schedule all nodes
// that are already in the database and ephemeral. If they are still
// around between restarts, they will reconnect and the GC will

View File

@@ -1,16 +1,22 @@
package derp
import (
	"cmp"
	"context"
	"encoding/json"
	"hash/crc64"
	"io"
	"maps"
	"math/rand"
	"net/http"
	"net/url"
	"os"
	"reflect"
	"slices"
	"sync"
	"time"

	"github.com/juanfont/headscale/hscontrol/types"
	"github.com/rs/zerolog/log"
	"github.com/spf13/viper"
	"gopkg.in/yaml.v3"
	"tailscale.com/tailcfg"
)
@@ -79,26 +85,16 @@ func mergeDERPMaps(derpMaps []*tailcfg.DERPMap) *tailcfg.DERPMap {
return &result
}
func GetDERPMap(cfg types.DERPConfig) *tailcfg.DERPMap {
func GetDERPMap(cfg types.DERPConfig) (*tailcfg.DERPMap, error) {
var derpMaps []*tailcfg.DERPMap
if cfg.DERPMap != nil {
derpMaps = append(derpMaps, cfg.DERPMap)
}
for _, path := range cfg.Paths {
log.Debug().
Str("func", "GetDERPMap").
Str("path", path).
Msg("Loading DERPMap from path")
derpMap, err := loadDERPMapFromPath(path)
if err != nil {
log.Error().
Str("func", "GetDERPMap").
Str("path", path).
Err(err).
Msg("Could not load DERP map from path")
break
return nil, err
}
derpMaps = append(derpMaps, derpMap)
@@ -106,26 +102,59 @@ func GetDERPMap(cfg types.DERPConfig) *tailcfg.DERPMap {
for _, addr := range cfg.URLs {
derpMap, err := loadDERPMapFromURL(addr)
log.Debug().
Str("func", "GetDERPMap").
Str("url", addr.String()).
Msg("Loading DERPMap from path")
if err != nil {
log.Error().
Str("func", "GetDERPMap").
Str("url", addr.String()).
Err(err).
Msg("Could not load DERP map from path")
break
return nil, err
}
derpMaps = append(derpMaps, derpMap)
}
derpMap := mergeDERPMaps(derpMaps)
shuffleDERPMap(derpMap)
log.Trace().Interface("derpMap", derpMap).Msg("DERPMap loaded")
return derpMap
return derpMap, nil
}
// shuffleDERPMap shuffles, in place, the node list of every region in dm that
// has at least one node. A nil map or a map without regions is left untouched.
//
// Regions are visited in ascending region ID order. Go randomises map
// iteration order, and every region draws from the same seeded random source
// (see derpRandom), so ranging over dm.Regions directly would let the
// resulting node order differ between runs even when the seed is fixed.
func shuffleDERPMap(dm *tailcfg.DERPMap) {
	if dm == nil || len(dm.Regions) == 0 {
		return
	}

	ids := make([]int, 0, len(dm.Regions))
	for id := range dm.Regions {
		ids = append(ids, id)
	}
	slices.Sort(ids)

	for _, id := range ids {
		region := dm.Regions[id]
		// A nil region has nothing to shuffle and must not be dereferenced.
		if region == nil || len(region.Nodes) == 0 {
			continue
		}

		dm.Regions[id] = shuffleRegionNoClone(region)
	}
}
// crc64Table is the CRC-64 table (ISO polynomial) used by derpRandom to turn
// its seed string into a numeric seed.
var crc64Table = crc64.MakeTable(crc64.ISO)

// Package-level state behind derpRandom.
var (
// derpRandomOnce makes derpRandom create its random source only once.
derpRandomOnce sync.Once
// derpRandomInst is the lazily created random source returned by derpRandom.
derpRandomInst *rand.Rand
// derpRandomMu is held for writing by resetDerpRandomForTesting while it
// resets the two variables above.
derpRandomMu sync.RWMutex
)
func derpRandom() *rand.Rand {
derpRandomOnce.Do(func() {
seed := cmp.Or(viper.GetString("dns.base_domain"), time.Now().String())
rnd := rand.New(rand.NewSource(0))
rnd.Seed(int64(crc64.Checksum([]byte(seed), crc64Table)))
derpRandomInst = rnd
})
return derpRandomInst
}
// resetDerpRandomForTesting drops the cached random source and re-arms the
// one-time initialisation, so the next derpRandom call builds and seeds a
// fresh source. derpRandomMu is held for writing while the state is reset.
func resetDerpRandomForTesting() {
	derpRandomMu.Lock()
	derpRandomInst = nil
	derpRandomOnce = sync.Once{}
	derpRandomMu.Unlock()
}
// shuffleRegionNoClone randomly reorders r.Nodes in place using the source
// returned by derpRandom and returns the same region pointer it was given.
// No copy of the region or of its node slice is made.
func shuffleRegionNoClone(r *tailcfg.DERPRegion) *tailcfg.DERPRegion {
	rng := derpRandom()
	nodes := r.Nodes
	rng.Shuffle(len(nodes), reflect.Swapper(nodes))

	return r
}

284
hscontrol/derp/derp_test.go Normal file
View File

@@ -0,0 +1,284 @@
package derp
import (
"testing"
"github.com/google/go-cmp/cmp"
"github.com/spf13/viper"
"tailscale.com/tailcfg"
)
// TestShuffleDERPMapDeterministic checks that, for a given `dns.base_domain`,
// shuffleDERPMap produces one specific, expected node order per region.
func TestShuffleDERPMapDeterministic(t *testing.T) {
	// region builds a DERP region whose nodes appear in the order of nodeNames.
	// Each node is hosted at derp<name>.tailscale.com.
	region := func(id int, code, name string, nodeNames ...string) *tailcfg.DERPRegion {
		nodes := make([]*tailcfg.DERPNode, 0, len(nodeNames))
		for _, n := range nodeNames {
			nodes = append(nodes, &tailcfg.DERPNode{
				Name:     n,
				RegionID: id,
				HostName: "derp" + n + ".tailscale.com",
			})
		}

		return &tailcfg.DERPRegion{
			RegionID:   id,
			RegionCode: code,
			RegionName: name,
			Nodes:      nodes,
		}
	}

	// derpMapOf assembles a DERPMap keyed by each region's RegionID.
	derpMapOf := func(regions ...*tailcfg.DERPRegion) *tailcfg.DERPMap {
		dm := &tailcfg.DERPMap{
			Regions: make(map[int]*tailcfg.DERPRegion, len(regions)),
		}
		for _, r := range regions {
			dm.Regions[r.RegionID] = r
		}

		return dm
	}

	cases := []struct {
		name       string
		baseDomain string
		input      *tailcfg.DERPMap
		want       *tailcfg.DERPMap
	}{
		{
			name:       "single region with 4 nodes",
			baseDomain: "test1.example.com",
			input: derpMapOf(
				region(1, "nyc", "New York City", "1f", "1g", "1h", "1i"),
			),
			want: derpMapOf(
				region(1, "nyc", "New York City", "1g", "1f", "1i", "1h"),
			),
		},
		{
			name:       "multiple regions with nodes",
			baseDomain: "test2.example.com",
			input: derpMapOf(
				region(10, "sea", "Seattle", "10b", "10c", "10d"),
				region(2, "sfo", "San Francisco", "2d", "2e", "2f"),
			),
			want: derpMapOf(
				region(10, "sea", "Seattle", "10b", "10c", "10d"),
				region(2, "sfo", "San Francisco", "2f", "2e", "2d"),
			),
		},
		{
			name:       "large region with many nodes",
			baseDomain: "test3.example.com",
			input: derpMapOf(
				region(4, "fra", "Frankfurt", "4f", "4g", "4h", "4i"),
			),
			want: derpMapOf(
				region(4, "fra", "Frankfurt", "4f", "4h", "4g", "4i"),
			),
		},
		{
			name:       "same region different base domain",
			baseDomain: "different.example.com",
			input: derpMapOf(
				region(4, "fra", "Frankfurt", "4f", "4g", "4h", "4i"),
			),
			want: derpMapOf(
				region(4, "fra", "Frankfurt", "4g", "4i", "4f", "4h"),
			),
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// The seed is derived from the base domain, so set it before the
			// random source is (re)created.
			viper.Set("dns.base_domain", tc.baseDomain)
			defer viper.Reset()
			resetDerpRandomForTesting()

			// Shuffle a copy so the table entry itself is not mutated.
			got := tc.input.View().AsStruct()
			shuffleDERPMap(got)

			diff := cmp.Diff(tc.want, got)
			if diff != "" {
				t.Errorf("Shuffled DERP map doesn't match expected (-expected +actual):\n%s", diff)
			}
		})
	}
}
// TestShuffleDERPMapEdgeCases runs shuffleDERPMap against degenerate inputs.
// It makes no assertions on the result; a case passes as long as the call
// completes without panicking.
func TestShuffleDERPMapEdgeCases(t *testing.T) {
	emptyRegion := &tailcfg.DERPRegion{
		RegionID:   1,
		RegionCode: "empty",
		RegionName: "Empty Region",
		Nodes:      []*tailcfg.DERPNode{},
	}

	singleNodeRegion := &tailcfg.DERPRegion{
		RegionID:   1,
		RegionCode: "single",
		RegionName: "Single Node Region",
		Nodes: []*tailcfg.DERPNode{
			{Name: "1a", RegionID: 1, HostName: "derp1a.tailscale.com"},
		},
	}

	cases := []struct {
		name  string
		input *tailcfg.DERPMap
	}{
		{name: "nil derp map", input: nil},
		{
			name:  "empty derp map",
			input: &tailcfg.DERPMap{Regions: map[int]*tailcfg.DERPRegion{}},
		},
		{
			name: "region with no nodes",
			input: &tailcfg.DERPMap{
				Regions: map[int]*tailcfg.DERPRegion{1: emptyRegion},
			},
		},
		{
			name: "region with single node",
			input: &tailcfg.DERPMap{
				Regions: map[int]*tailcfg.DERPRegion{1: singleNodeRegion},
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			shuffleDERPMap(tc.input)
		})
	}
}
// TestShuffleDERPMapWithoutBaseDomain shuffles a map with no `dns.base_domain`
// configured. The resulting order is not checked; only that the region still
// holds the same set of nodes afterwards.
func TestShuffleDERPMapWithoutBaseDomain(t *testing.T) {
	viper.Reset()
	resetDerpRandomForTesting()

	// nameSet collects the node names of a region into a set.
	nameSet := func(nodes []*tailcfg.DERPNode) map[string]bool {
		set := make(map[string]bool)
		for _, n := range nodes {
			set[n.Name] = true
		}

		return set
	}

	dm := &tailcfg.DERPMap{
		Regions: map[int]*tailcfg.DERPRegion{
			1: {
				RegionID:   1,
				RegionCode: "test",
				RegionName: "Test Region",
				Nodes: []*tailcfg.DERPNode{
					{Name: "1a", RegionID: 1, HostName: "derp1a.test.com"},
					{Name: "1b", RegionID: 1, HostName: "derp1b.test.com"},
					{Name: "1c", RegionID: 1, HostName: "derp1c.test.com"},
					{Name: "1d", RegionID: 1, HostName: "derp1d.test.com"},
				},
			},
		},
	}

	// Keep an untouched copy to compare against after the in-place shuffle.
	before := dm.View().AsStruct()

	shuffleDERPMap(dm)

	if len(dm.Regions) != 1 || len(dm.Regions[1].Nodes) != 4 {
		t.Error("Shuffle corrupted DERP map structure")
	}

	wantNames := nameSet(before.Regions[1].Nodes)
	gotNames := nameSet(dm.Regions[1].Nodes)

	if diff := cmp.Diff(wantNames, gotNames); diff != "" {
		t.Errorf("Shuffle changed node set (-original +shuffled):\n%s", diff)
	}
}

View File

@@ -276,7 +276,7 @@ func DERPProbeHandler(
// An example implementation is found here https://derp.tailscale.com/bootstrap-dns
// Coordination server is included automatically, since local DERP is using the same DNS Name in d.serverURL.
func DERPBootstrapDNSHandler(
derpMap *tailcfg.DERPMap,
derpMap tailcfg.DERPMapView,
) func(http.ResponseWriter, *http.Request) {
return func(
writer http.ResponseWriter,
@@ -287,18 +287,18 @@ func DERPBootstrapDNSHandler(
resolvCtx, cancel := context.WithTimeout(req.Context(), time.Minute)
defer cancel()
var resolver net.Resolver
for _, region := range derpMap.Regions {
for _, node := range region.Nodes { // we don't care if we override some nodes
addrs, err := resolver.LookupIP(resolvCtx, "ip", node.HostName)
for _, region := range derpMap.Regions().All() {
for _, node := range region.Nodes().All() { // we don't care if we override some nodes
addrs, err := resolver.LookupIP(resolvCtx, "ip", node.HostName())
if err != nil {
log.Trace().
Caller().
Err(err).
Msgf("bootstrap DNS lookup failed %q", node.HostName)
Msgf("bootstrap DNS lookup failed %q", node.HostName())
continue
}
dnsEntries[node.HostName] = addrs
dnsEntries[node.HostName()] = addrs
}
}
writer.Header().Set("Content-Type", "application/json")

View File

@@ -10,6 +10,7 @@ import (
"time"
"github.com/juanfont/headscale/hscontrol/db"
"github.com/juanfont/headscale/hscontrol/derp"
"github.com/juanfont/headscale/hscontrol/state"
"github.com/juanfont/headscale/hscontrol/types"
"github.com/juanfont/headscale/hscontrol/types/change"
@@ -167,6 +168,12 @@ func setupBatcherWithTestData(t *testing.T, bf batcherFunc, userCount, nodesPerU
t.Fatalf("Failed to create state: %v", err)
}
derpMap, err := derp.GetDERPMap(cfg.DERP)
assert.NoError(t, err)
assert.NotNil(t, derpMap)
state.SetDERPMap(derpMap)
// Set up a permissive policy that allows all communication for testing
allowAllPolicy := `{
"acls": [

View File

@@ -79,7 +79,7 @@ func (b *MapResponseBuilder) WithSelfNode() *MapResponseBuilder {
// WithDERPMap adds the DERP map to the response
func (b *MapResponseBuilder) WithDERPMap() *MapResponseBuilder {
b.resp.DERPMap = b.mapper.state.DERPMap()
b.resp.DERPMap = b.mapper.state.DERPMap().AsStruct()
return b
}

View File

@@ -9,10 +9,10 @@ import (
"io"
"net/netip"
"os"
"sync/atomic"
"time"
hsdb "github.com/juanfont/headscale/hscontrol/db"
"github.com/juanfont/headscale/hscontrol/derp"
"github.com/juanfont/headscale/hscontrol/policy"
"github.com/juanfont/headscale/hscontrol/policy/matcher"
"github.com/juanfont/headscale/hscontrol/routes"
@@ -55,7 +55,7 @@ type State struct {
// ipAlloc manages IP address allocation for nodes
ipAlloc *hsdb.IPAllocator
// derpMap contains the current DERP relay configuration
derpMap *tailcfg.DERPMap
derpMap atomic.Pointer[tailcfg.DERPMap]
// polMan handles policy evaluation and management
polMan policy.PolicyManager
// registrationCache caches node registration data to reduce database load
@@ -86,8 +86,6 @@ func NewState(cfg *types.Config) (*State, error) {
return nil, fmt.Errorf("init ip allocatior: %w", err)
}
derpMap := derp.GetDERPMap(cfg.DERP)
nodes, err := db.ListNodes()
if err != nil {
return nil, fmt.Errorf("loading nodes: %w", err)
@@ -107,17 +105,17 @@ func NewState(cfg *types.Config) (*State, error) {
return nil, fmt.Errorf("init policy manager: %w", err)
}
return &State{
s := &State{
cfg: cfg,
db: db,
ipAlloc: ipAlloc,
// TODO(kradalby): Update DERPMap
derpMap: derpMap,
db: db,
ipAlloc: ipAlloc,
polMan: polMan,
registrationCache: registrationCache,
primaryRoutes: routes.New(),
}, nil
}
return s, nil
}
// Close gracefully shuts down the State instance and releases all resources.
@@ -170,9 +168,14 @@ func policyBytes(db *hsdb.HSDatabase, cfg *types.Config) ([]byte, error) {
return nil, fmt.Errorf("%w: %s", ErrUnsupportedPolicyMode, cfg.Policy.Mode)
}
// SetDERPMap updates the DERP relay configuration held by the state.
// The given map replaces the current one through a single Store call on
// s.derpMap; dm is stored as passed, without being copied.
func (s *State) SetDERPMap(dm *tailcfg.DERPMap) {
s.derpMap.Store(dm)
}
// DERPMap returns the current DERP relay configuration for peer-to-peer connectivity.
func (s *State) DERPMap() *tailcfg.DERPMap {
return s.derpMap
func (s *State) DERPMap() tailcfg.DERPMapView {
return s.derpMap.Load().View()
}
// ReloadPolicy reloads the access control policy and triggers auto-approval if changed.
@@ -209,7 +212,6 @@ func (s *State) CreateUser(user types.User) (*types.User, bool, error) {
s.mu.Lock()
defer s.mu.Unlock()
if err := s.db.DB.Save(&user).Error; err != nil {
return nil, false, fmt.Errorf("creating user: %w", err)
}

View File

@@ -300,6 +300,7 @@ func LoadConfig(path string, isFile bool) error {
viper.SetDefault("derp.server.verify_clients", true)
viper.SetDefault("derp.server.stun.enabled", true)
viper.SetDefault("derp.server.automatically_add_embedded_derp_region", true)
viper.SetDefault("derp.update_frequency", "3h")
viper.SetDefault("unix_socket", "/var/run/headscale/headscale.sock")
viper.SetDefault("unix_socket_permission", "0o770")