ipn/ipnlocal: use rendezvous hashing to traffic-steer exit nodes

With auto exit nodes enabled, the client picks exit nodes from the
ones advertised in the network map. Usually, it picks the one with the
highest priority score, but when the top spot is tied, it used to pick
randomly. Then, once it made a selection, it would strongly prefer to
stick with that exit node. It wouldn’t even consider another exit node
unless the client was shutdown or the exit node went offline. This is
to prevent flapping, where a client constantly chooses a different
random exit node.

The major problem with this algorithm is that new exit nodes don’t get
selected as often as they should. In fact, they wouldn’t even move
over if a higher scoring exit node appeared.

Let’s say that you have an exit node and it’s overloaded. So you spin
up a new exit node, right beside your existing one, in the hopes that
the traffic will be split across them. But since the client had this
strong affinity, they stick with the exit node they know and love.

Using rendezvous hashing, we can have different clients spread
their selections equally across their top scoring exit nodes. When an
exit node shuts down, its clients will spread themselves evenly to
their other equal options. When an exit node starts, a proportional
number of clients will migrate to their new best option.

Read more: https://en.wikipedia.org/wiki/Rendezvous_hashing

The trade-off is that starting up a new exit node may cause some
clients to move over, interrupting their existing network connections.
So this change is only enabled for tailnets with `traffic-steering`
enabled.

Updates tailscale/corp#29966
Fixes #16551

Signed-off-by: Simon Law <sfllaw@tailscale.com>
This commit is contained in:
Simon Law 2025-07-16 11:50:13 -07:00 committed by Simon Law
parent 17c5116d46
commit e84e58c567
2 changed files with 33 additions and 71 deletions

View File

@ -11,6 +11,7 @@ import (
"context"
"crypto/sha256"
"encoding/base64"
"encoding/binary"
"encoding/hex"
"encoding/json"
"errors"
@ -7750,7 +7751,7 @@ func suggestExitNode(report *netcheck.Report, nb *nodeBackend, prevSuggestion ta
switch {
case nb.SelfHasCap(tailcfg.NodeAttrTrafficSteering):
// The traffic-steering feature flag is enabled on this tailnet.
return suggestExitNodeUsingTrafficSteering(nb, prevSuggestion, allowList)
return suggestExitNodeUsingTrafficSteering(nb, allowList)
default:
return suggestExitNodeUsingDERP(report, nb, prevSuggestion, selectRegion, selectNode, allowList)
}
@ -7896,12 +7897,17 @@ var ErrNoNetMap = errors.New("no network map, try again later")
// pick one of the best exit nodes. These priorities are provided by Control in
// the nodes [tailcfg.Location]. To be eligible for consideration, the node
// must have NodeAttrSuggestExitNode in its CapMap.
func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, prev tailcfg.StableNodeID, allowed set.Set[tailcfg.StableNodeID]) (apitype.ExitNodeSuggestionResponse, error) {
func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, allowed set.Set[tailcfg.StableNodeID]) (apitype.ExitNodeSuggestionResponse, error) {
nm := nb.NetMap()
if nm == nil {
return apitype.ExitNodeSuggestionResponse{}, ErrNoNetMap
}
self := nb.Self()
if !self.Valid() {
return apitype.ExitNodeSuggestionResponse{}, ErrNoNetMap
}
if !nb.SelfHasCap(tailcfg.NodeAttrTrafficSteering) {
panic("missing traffic-steering capability")
}
@ -7923,12 +7929,6 @@ func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, prev tailcfg.StableNod
if !tsaddr.ContainsExitRoutes(p.AllowedIPs()) {
return false
}
if p.StableID() == prev {
// Prevent flapping: since prev is a valid suggestion,
// force prev to be the only valid pick.
force = p
return false
}
return true
})
if force.Valid() {
@ -7950,6 +7950,7 @@ func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, prev tailcfg.StableNod
}
return s
}
rdvHash := makeRendezvousHasher(self.ID())
var pick tailcfg.NodeView
if len(nodes) == 1 {
@ -7958,25 +7959,18 @@ func suggestExitNodeUsingTrafficSteering(nb *nodeBackend, prev tailcfg.StableNod
if len(nodes) > 1 {
// Find the highest scoring exit nodes.
slices.SortFunc(nodes, func(a, b tailcfg.NodeView) int {
return cmp.Compare(score(b), score(a)) // reverse sort
})
// Find the top exit nodes, which all have the same score.
topI := len(nodes)
ts := score(nodes[0])
for i, n := range nodes[1:] {
if score(n) < ts {
// n is the first node with a lower score.
// Make nodes[:topI] to slice the top exit nodes.
topI = i + 1
break
c := cmp.Compare(score(b), score(a)) // Highest score first.
if c == 0 {
// Rendezvous hashing for reliably picking the
// same node from a list: tailscale/tailscale#16551.
return cmp.Compare(rdvHash(b.ID()), rdvHash(a.ID()))
}
}
return c
})
// TODO(sfllaw): add a temperature knob so that this client has
// a chance of picking the next best option.
randSeed := uint64(nm.SelfNode.ID())
pick = nodes[rands.IntN(randSeed, topI)]
pick = nodes[0]
}
if !pick.Valid() {
@ -8077,6 +8071,19 @@ func longLatDistance(fromLat, fromLong, toLat, toLong float64) float64 {
return earthRadiusMeters * c
}
// makeRendezvousHasher returns a function that hashes a node ID to a uint64.
// https://en.wikipedia.org/wiki/Rendezvous_hashing
func makeRendezvousHasher(seed tailcfg.NodeID) func(tailcfg.NodeID) uint64 {
en := binary.BigEndian
return func(n tailcfg.NodeID) uint64 {
var b [16]byte
en.PutUint64(b[:], uint64(seed))
en.PutUint64(b[8:], uint64(n))
v := sha256.Sum256(b[:])
return en.Uint64(v[:])
}
}
const (
// unresolvedExitNodeID is a special [tailcfg.StableNodeID] value
// used as an exit node ID to install a blackhole route, preventing

View File

@ -5012,7 +5012,7 @@ func TestSuggestExitNodeTrafficSteering(t *testing.T) {
wantName: "peer1",
},
{
name: "many-suggested-exit-nodes",
name: "suggest-exit-node-stable-pick",
netMap: &netmap.NetworkMap{
SelfNode: selfNode.View(),
Peers: []tailcfg.NodeView{
@ -5030,55 +5030,10 @@ func TestSuggestExitNodeTrafficSteering(t *testing.T) {
withSuggest()),
},
},
// Change this, if the hashing function changes.
wantID: "stable3",
wantName: "peer3",
},
{
name: "suggested-exit-node-was-last-suggested",
netMap: &netmap.NetworkMap{
SelfNode: selfNode.View(),
Peers: []tailcfg.NodeView{
makePeer(1,
withExitRoutes(),
withSuggest()),
makePeer(2,
withExitRoutes(),
withSuggest()),
makePeer(3,
withExitRoutes(),
withSuggest()),
makePeer(4,
withExitRoutes(),
withSuggest()),
},
},
lastExit: "stable2", // overrides many-suggested-exit-nodes
wantID: "stable2",
wantName: "peer2",
},
{
name: "suggested-exit-node-was-never-suggested",
netMap: &netmap.NetworkMap{
SelfNode: selfNode.View(),
Peers: []tailcfg.NodeView{
makePeer(1,
withExitRoutes(),
withSuggest()),
makePeer(2,
withExitRoutes(),
withSuggest()),
makePeer(3,
withExitRoutes(),
withSuggest()),
makePeer(4,
withExitRoutes(),
withSuggest()),
},
},
lastExit: "stable10",
wantID: "stable3", // matches many-suggested-exit-nodes
wantName: "peer3",
},
{
name: "exit-nodes-with-and-without-priority",
netMap: &netmap.NetworkMap{
@ -5282,7 +5237,7 @@ func TestSuggestExitNodeTrafficSteering(t *testing.T) {
defer nb.shutdown(errShutdown)
nb.SetNetMap(tt.netMap)
got, err := suggestExitNodeUsingTrafficSteering(nb, tt.lastExit, allowList)
got, err := suggestExitNodeUsingTrafficSteering(nb, allowList)
if tt.wantErr == nil && err != nil {
t.Fatalf("err=%v, want nil", err)
}