feature/relayserver,net/udprelay,wgengine/magicsock: implement retry (#16347)

udprelay.Server is lazily initialized when the first request is received
over peerAPI. These early requests have a high chance of failure until
the first address discovery cycle has completed.

Return an ErrServerNotReady error until the first address discovery
cycle has completed, and plumb retry handling for this error all the
way back to the client in relayManager.

relayManager can now retry after a few seconds instead of waiting for
the next path discovery cycle, which could take another minute or
longer.

Updates tailscale/corp#27502

Signed-off-by: Jordan Whited <jordan@tailscale.com>
Jordan Whited authored 2025-06-23 15:50:43 -07:00, committed by GitHub
parent 9288efe592
commit a589863d61
3 changed files with 99 additions and 37 deletions
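
The commit message above describes a simple readiness contract: while the relay server has not finished its first address discovery cycle, allocation requests fail with a 503 plus a Retry-After hint, and the client waits and retries once. Below is a minimal, illustrative sketch of that contract using a stand-in httptest server and client; none of these names or handlers are the real peerAPI or relayManager code.

// Illustrative sketch only (not part of the change): a stand-in server that
// returns 503 + Retry-After until "address discovery" completes, and a client
// that honors the header with a single retry.
package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"strconv"
	"sync/atomic"
	"time"
)

func main() {
	var ready atomic.Bool // flips to true once the first "discovery cycle" completes

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if !ready.Load() {
			w.Header().Set("Retry-After", "1") // whole seconds, as in the real handler
			http.Error(w, "server not ready", http.StatusServiceUnavailable)
			return
		}
		fmt.Fprintln(w, `{"ok":true}`)
	}))
	defer srv.Close()

	// Simulate the first address discovery cycle finishing shortly after start.
	time.AfterFunc(500*time.Millisecond, func() { ready.Store(true) })

	for try := 0; try < 2; try++ {
		resp, err := http.Get(srv.URL)
		if err != nil {
			fmt.Println("request failed:", err)
			return
		}
		resp.Body.Close()
		if resp.StatusCode == http.StatusServiceUnavailable && try == 0 {
			secs, _ := strconv.Atoi(resp.Header.Get("Retry-After"))
			time.Sleep(time.Duration(secs) * time.Second) // wait as advertised, then retry once
			continue
		}
		fmt.Println("final status:", resp.StatusCode)
		return
	}
}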

File 1 of 3: feature/relayserver (package relayserver)

@@ -8,9 +8,11 @@ package relayserver
 import (
 	"encoding/json"
 	"errors"
+	"fmt"
 	"io"
 	"net/http"
 	"sync"
+	"time"

 	"tailscale.com/envknob"
 	"tailscale.com/feature"
@@ -184,6 +186,12 @@ func handlePeerAPIRelayAllocateEndpoint(h ipnlocal.PeerAPIHandler, w http.Respon
 	}
 	ep, err := rs.AllocateEndpoint(allocateEndpointReq.DiscoKeys[0], allocateEndpointReq.DiscoKeys[1])
 	if err != nil {
+		var notReady udprelay.ErrServerNotReady
+		if errors.As(err, &notReady) {
+			w.Header().Set("Retry-After", fmt.Sprintf("%d", notReady.RetryAfter.Round(time.Second)/time.Second))
+			httpErrAndLog(err.Error(), http.StatusServiceUnavailable)
+			return
+		}
 		httpErrAndLog(err.Error(), http.StatusInternalServerError)
 		return
 	}
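
A note on the Retry-After value written above: dividing one time.Duration by another performs integer division on the underlying nanosecond counts, so fmt.Sprintf("%d", d.Round(time.Second)/time.Second) produces a whole number of seconds, which is the form the Retry-After header expects. A standalone sketch of the same conversion:

// Illustrative sketch only: shows the duration-to-seconds conversion used by
// the handler above.
package main

import (
	"fmt"
	"time"
)

func main() {
	retryAfter := 3 * time.Second
	// Duration / Duration divides the underlying int64 nanosecond values,
	// so the result here is 3, and %d renders it as "3".
	header := fmt.Sprintf("%d", retryAfter.Round(time.Second)/time.Second)
	fmt.Println(header) // prints "3"
}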

File 2 of 3: net/udprelay (package udprelay)

@@ -63,13 +63,14 @@ type Server struct {
 	closeCh    chan struct{}
 	netChecker *netcheck.Client

-	mu        sync.Mutex       // guards the following fields
-	addrPorts []netip.AddrPort // the ip:port pairs returned as candidate endpoints
-	closed    bool
-	lamportID uint64
-	vniPool   []uint32 // the pool of available VNIs
-	byVNI     map[uint32]*serverEndpoint
-	byDisco   map[pairOfDiscoPubKeys]*serverEndpoint
+	mu                sync.Mutex       // guards the following fields
+	addrDiscoveryOnce bool             // addrDiscovery completed once (successfully or unsuccessfully)
+	addrPorts         []netip.AddrPort // the ip:port pairs returned as candidate endpoints
+	closed            bool
+	lamportID         uint64
+	vniPool           []uint32 // the pool of available VNIs
+	byVNI             map[uint32]*serverEndpoint
+	byDisco           map[pairOfDiscoPubKeys]*serverEndpoint
 }

 // pairOfDiscoPubKeys is a pair of key.DiscoPublic. It must be constructed via
@@ -321,8 +322,7 @@ func NewServer(logf logger.Logf, port int, overrideAddrs []netip.Addr) (s *Serve
 	s.wg.Add(1)
 	go s.endpointGCLoop()
 	if len(overrideAddrs) > 0 {
-		var addrPorts set.Set[netip.AddrPort]
-		addrPorts.Make()
+		addrPorts := make(set.Set[netip.AddrPort], len(overrideAddrs))
 		for _, addr := range overrideAddrs {
 			if addr.IsValid() {
 				addrPorts.Add(netip.AddrPortFrom(addr, boundPort))
@@ -401,12 +401,12 @@ func (s *Server) addrDiscoveryLoop() {
 			}
 			s.mu.Lock()
 			s.addrPorts = addrPorts
+			s.addrDiscoveryOnce = true
 			s.mu.Unlock()
 		case <-s.closeCh:
 			return
 		}
 	}
 }

 func (s *Server) listenOn(port int) (uint16, error) {
@@ -521,10 +521,22 @@ func (s *Server) packetReadLoop() {
 var ErrServerClosed = errors.New("server closed")

+// ErrServerNotReady indicates the server is not ready. Allocation should be
+// requested after waiting for at least RetryAfter duration.
+type ErrServerNotReady struct {
+	RetryAfter time.Duration
+}
+
+func (e ErrServerNotReady) Error() string {
+	return fmt.Sprintf("server not ready, retry after %v", e.RetryAfter)
+}
+
 // AllocateEndpoint allocates an [endpoint.ServerEndpoint] for the provided pair
 // of [key.DiscoPublic]'s. If an allocation already exists for discoA and discoB
 // it is returned without modification/reallocation. AllocateEndpoint returns
-// [ErrServerClosed] if the server has been closed.
+// the following notable errors:
+//  1. [ErrServerClosed] if the server has been closed.
+//  2. [ErrServerNotReady] if the server is not ready.
 func (s *Server) AllocateEndpoint(discoA, discoB key.DiscoPublic) (endpoint.ServerEndpoint, error) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -533,6 +545,9 @@ func (s *Server) AllocateEndpoint(discoA, discoB key.DiscoPublic) (endpoint.Serv
 	}
 	if len(s.addrPorts) == 0 {
+		if !s.addrDiscoveryOnce {
+			return endpoint.ServerEndpoint{}, ErrServerNotReady{RetryAfter: 3 * time.Second}
+		}
 		return endpoint.ServerEndpoint{}, errors.New("server addrPorts are not yet known")
 	}
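
Callers of AllocateEndpoint are expected to detect the new error with errors.As, as the peerAPI handler in the first file does. A hypothetical in-process caller (not part of this change; the helper name and the endpoint import path are assumptions) might wrap the call like this:

// Hypothetical helper, for illustration only: retries AllocateEndpoint once
// when the server reports it is not yet ready.
package relaycaller // hypothetical package

import (
	"context"
	"errors"
	"time"

	"tailscale.com/net/udprelay"
	"tailscale.com/net/udprelay/endpoint" // assumed import path for the endpoint package
	"tailscale.com/types/key"
)

func allocateWithRetry(ctx context.Context, s *udprelay.Server, a, b key.DiscoPublic) (endpoint.ServerEndpoint, error) {
	se, err := s.AllocateEndpoint(a, b)
	var notReady udprelay.ErrServerNotReady
	if err == nil || !errors.As(err, &notReady) {
		return se, err
	}
	select {
	case <-ctx.Done():
		return endpoint.ServerEndpoint{}, ctx.Err()
	case <-time.After(notReady.RetryAfter):
		return s.AllocateEndpoint(a, b) // a single retry, mirroring relayManager
	}
}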

File 3 of 3: wgengine/magicsock (package magicsock)

@@ -7,9 +7,12 @@ import (
 	"bytes"
 	"context"
 	"encoding/json"
+	"errors"
+	"fmt"
 	"io"
 	"net/http"
 	"net/netip"
+	"strconv"
 	"sync"
 	"time"
@@ -716,46 +719,82 @@ func (r *relayManager) allocateAllServersRunLoop(ep *endpoint) {
 	}()
 }

-func (r *relayManager) allocateSingleServer(ctx context.Context, wg *sync.WaitGroup, server netip.AddrPort, ep *endpoint) {
-	// TODO(jwhited): introduce client metrics counters for notable failures
-	defer wg.Done()
-	var b bytes.Buffer
-	remoteDisco := ep.disco.Load()
-	if remoteDisco == nil {
-		return
-	}
-	type allocateRelayEndpointReq struct {
-		DiscoKeys []key.DiscoPublic
-	}
-	a := &allocateRelayEndpointReq{
-		DiscoKeys: []key.DiscoPublic{ep.c.discoPublic, remoteDisco.key},
-	}
-	err := json.NewEncoder(&b).Encode(a)
-	if err != nil {
-		return
-	}
-	const reqTimeout = time.Second * 10
-	reqCtx, cancel := context.WithTimeout(ctx, reqTimeout)
-	defer cancel()
-	req, err := http.NewRequestWithContext(reqCtx, httpm.POST, "http://"+server.String()+"/v0/relay/endpoint", &b)
-	if err != nil {
-		return
-	}
-	resp, err := http.DefaultClient.Do(req)
-	if err != nil {
-		return
-	}
-	defer resp.Body.Close()
-	if resp.StatusCode != http.StatusOK {
-		return
-	}
-	var se udprelay.ServerEndpoint
-	err = json.NewDecoder(io.LimitReader(resp.Body, 4096)).Decode(&se)
-	if err != nil {
-		return
-	}
-	relayManagerInputEvent(r, ctx, &r.newServerEndpointCh, newRelayServerEndpointEvent{
-		ep: ep,
-		se: se,
-	})
-}
+type errNotReady struct{ retryAfter time.Duration }
+
+func (e errNotReady) Error() string {
+	return fmt.Sprintf("server not ready, retry after %v", e.retryAfter)
+}
+
+const reqTimeout = time.Second * 10
+
+func doAllocate(ctx context.Context, server netip.AddrPort, discoKeys [2]key.DiscoPublic) (udprelay.ServerEndpoint, error) {
+	var reqBody bytes.Buffer
+	type allocateRelayEndpointReq struct {
+		DiscoKeys []key.DiscoPublic
+	}
+	a := &allocateRelayEndpointReq{
+		DiscoKeys: []key.DiscoPublic{discoKeys[0], discoKeys[1]},
+	}
+	err := json.NewEncoder(&reqBody).Encode(a)
+	if err != nil {
+		return udprelay.ServerEndpoint{}, err
+	}
+	reqCtx, cancel := context.WithTimeout(ctx, reqTimeout)
+	defer cancel()
+	req, err := http.NewRequestWithContext(reqCtx, httpm.POST, "http://"+server.String()+"/v0/relay/endpoint", &reqBody)
+	if err != nil {
+		return udprelay.ServerEndpoint{}, err
+	}
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return udprelay.ServerEndpoint{}, err
+	}
+	defer resp.Body.Close()
+	switch resp.StatusCode {
+	case http.StatusOK:
+		var se udprelay.ServerEndpoint
+		err = json.NewDecoder(io.LimitReader(resp.Body, 4096)).Decode(&se)
+		return se, err
+	case http.StatusServiceUnavailable:
+		raHeader := resp.Header.Get("Retry-After")
+		raSeconds, err := strconv.ParseUint(raHeader, 10, 32)
+		if err == nil {
+			return udprelay.ServerEndpoint{}, errNotReady{retryAfter: time.Second * time.Duration(raSeconds)}
+		}
+		fallthrough
+	default:
+		return udprelay.ServerEndpoint{}, fmt.Errorf("non-200 status: %d", resp.StatusCode)
+	}
+}
+
+func (r *relayManager) allocateSingleServer(ctx context.Context, wg *sync.WaitGroup, server netip.AddrPort, ep *endpoint) {
+	// TODO(jwhited): introduce client metrics counters for notable failures
+	defer wg.Done()
+	remoteDisco := ep.disco.Load()
+	if remoteDisco == nil {
+		return
+	}
+	firstTry := true
+	for {
+		se, err := doAllocate(ctx, server, [2]key.DiscoPublic{ep.c.discoPublic, remoteDisco.key})
+		if err == nil {
+			relayManagerInputEvent(r, ctx, &r.newServerEndpointCh, newRelayServerEndpointEvent{
+				ep: ep,
+				se: se,
+			})
+			return
+		}
+		ep.c.logf("[v1] magicsock: relayManager: error allocating endpoint on %v for %v: %v", server, ep.discoShort(), err)
+		var notReady errNotReady
+		if firstTry && errors.As(err, &notReady) {
+			select {
+			case <-ctx.Done():
+				return
+			case <-time.After(min(notReady.retryAfter, reqTimeout)):
+				firstTry = false
+				continue
+			}
+		}
+		return
+	}
+}
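
Two details of the client loop above are worth noting: the retry happens at most once (guarded by firstTry), and the wait before it is capped at reqTimeout, so a not-yet-ready relay server costs at most one extra allocation attempt plus a few seconds rather than a full path discovery cycle. A small sketch of that wait bound, with illustrative Retry-After values:

// Illustrative sketch only: shows how the pre-retry wait is bounded.
package main

import (
	"fmt"
	"time"
)

const reqTimeout = time.Second * 10 // mirrors the constant in the diff above

func main() {
	for _, advertised := range []time.Duration{3 * time.Second, 30 * time.Second} {
		// relayManager sleeps the advertised Retry-After, but never longer
		// than one request timeout, before its single retry.
		fmt.Printf("advertised %v -> wait %v\n", advertised, min(advertised, reqTimeout))
	}
}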