ipn/{ipnlocal,localapi},wgengine{,/magicsock}: plumb health.Tracker

Down to 25 health.Global users. After this remains controlclient &
net/dns & wgengine/router.

Updates #11874
Updates #4136

Change-Id: I6dd1856e3d9bf523bdd44b60fb3b8f7501d5dc0d
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
Brad Fitzpatrick 2024-04-26 08:06:06 -07:00 committed by Brad Fitzpatrick
parent df8f40905b
commit 6d69fc137f
15 changed files with 79 additions and 46 deletions

View File

@ -138,6 +138,7 @@ tailscale.com/cmd/derper dependencies: (generated by github.com/tailscale/depawa
tailscale.com/types/structs from tailscale.com/ipn+
tailscale.com/types/tkatype from tailscale.com/client/tailscale+
tailscale.com/types/views from tailscale.com/ipn+
tailscale.com/util/cibuild from tailscale.com/health
tailscale.com/util/clientmetric from tailscale.com/net/netmon+
tailscale.com/util/cloudenv from tailscale.com/hostinfo+
W tailscale.com/util/cmpver from tailscale.com/net/tshttpproxy

View File

@ -142,6 +142,7 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep
tailscale.com/types/structs from tailscale.com/ipn+
tailscale.com/types/tkatype from tailscale.com/types/key+
tailscale.com/types/views from tailscale.com/tailcfg+
tailscale.com/util/cibuild from tailscale.com/health
tailscale.com/util/clientmetric from tailscale.com/net/netcheck+
tailscale.com/util/cloudenv from tailscale.com/net/dnscache+
tailscale.com/util/cmpver from tailscale.com/net/tshttpproxy+

View File

@ -358,6 +358,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
tailscale.com/types/structs from tailscale.com/control/controlclient+
tailscale.com/types/tkatype from tailscale.com/tka+
tailscale.com/types/views from tailscale.com/ipn/ipnlocal+
tailscale.com/util/cibuild from tailscale.com/health
tailscale.com/util/clientmetric from tailscale.com/control/controlclient+
tailscale.com/util/cloudenv from tailscale.com/net/dns/resolver+
tailscale.com/util/cmpver from tailscale.com/net/dns+

View File

@ -651,6 +651,7 @@ func tryEngine(logf logger.Logf, sys *tsd.System, name string) (onlyNetstack boo
conf := wgengine.Config{
ListenPort: args.port,
NetMon: sys.NetMon.Get(),
HealthTracker: sys.HealthTracker(),
Dialer: sys.Dialer.Get(),
SetSubsystem: sys.Set,
ControlKnobs: sys.ControlKnobs(),

View File

@ -9,6 +9,7 @@
"errors"
"fmt"
"net/http"
"os"
"runtime"
"sort"
"sync"
@ -18,6 +19,7 @@
"tailscale.com/envknob"
"tailscale.com/tailcfg"
"tailscale.com/types/opt"
"tailscale.com/util/cibuild"
"tailscale.com/util/mak"
"tailscale.com/util/multierr"
"tailscale.com/util/set"
@ -152,6 +154,11 @@ func (t *Tracker) nil() bool {
if t != nil {
return false
}
if cibuild.On() {
stack := make([]byte, 1<<10)
stack = stack[:runtime.Stack(stack, false)]
fmt.Fprintf(os.Stderr, "## WARNING: (non-fatal) nil health.Tracker (being strict in CI):\n%s\n", stack)
}
// TODO(bradfitz): open source our "unexpected" package
// and use it here to capture samples of stacks where
// t is nil.

View File

@ -327,6 +327,16 @@ type LocalBackend struct {
outgoingFiles map[string]*ipn.OutgoingFile
}
// HealthTracker returns the health tracker for the backend.
func (b *LocalBackend) HealthTracker() *health.Tracker {
return b.health
}
// NetMon returns the network monitor for the backend.
func (b *LocalBackend) NetMon() *netmon.Monitor {
return b.sys.NetMon.Get()
}
type updateStatus struct {
started bool
}

View File

@ -20,7 +20,6 @@
"path/filepath"
"time"
"tailscale.com/health"
"tailscale.com/health/healthmsg"
"tailscale.com/ipn"
"tailscale.com/ipn/ipnstate"
@ -59,11 +58,11 @@ type tkaState struct {
// b.mu must be held.
func (b *LocalBackend) tkaFilterNetmapLocked(nm *netmap.NetworkMap) {
if b.tka == nil && !b.capTailnetLock {
health.Global.SetTKAHealth(nil)
b.health.SetTKAHealth(nil)
return
}
if b.tka == nil {
health.Global.SetTKAHealth(nil)
b.health.SetTKAHealth(nil)
return // TKA not enabled.
}
@ -117,9 +116,9 @@ func (b *LocalBackend) tkaFilterNetmapLocked(nm *netmap.NetworkMap) {
// Check that we ourselves are not locked out, report a health issue if so.
if nm.SelfNode.Valid() && b.tka.authority.NodeKeyAuthorized(nm.SelfNode.Key(), nm.SelfNode.KeySignature().AsSlice()) != nil {
health.Global.SetTKAHealth(errors.New(healthmsg.LockedOut))
b.health.SetTKAHealth(errors.New(healthmsg.LockedOut))
} else {
health.Global.SetTKAHealth(nil)
b.health.SetTKAHealth(nil)
}
}
@ -188,7 +187,7 @@ func (b *LocalBackend) tkaSyncIfNeeded(nm *netmap.NetworkMap, prefs ipn.PrefsVie
b.logf("Disablement failed, leaving TKA enabled. Error: %v", err)
} else {
isEnabled = false
health.Global.SetTKAHealth(nil)
b.health.SetTKAHealth(nil)
}
} else {
return fmt.Errorf("[bug] unreachable invariant of wantEnabled w/ isEnabled")

View File

@ -199,7 +199,7 @@ func (s *Server) serveHTTP(w http.ResponseWriter, r *http.Request) {
defer onDone()
if strings.HasPrefix(r.URL.Path, "/localapi/") {
lah := localapi.NewHandler(lb, s.logf, s.netMon, s.backendLogID)
lah := localapi.NewHandler(lb, s.logf, s.backendLogID)
lah.PermitRead, lah.PermitWrite = s.localAPIPermissions(ci)
lah.PermitCert = s.connCanFetchCerts(ci)
lah.ConnIdentity = ci

View File

@ -140,7 +140,7 @@ func (h *Handler) serveDebugDERPRegion(w http.ResponseWriter, r *http.Request) {
}
checkSTUN4 := func(derpNode *tailcfg.DERPNode) {
u4, err := nettype.MakePacketListenerWithNetIP(netns.Listener(h.logf, h.netMon)).ListenPacket(ctx, "udp4", ":0")
u4, err := nettype.MakePacketListenerWithNetIP(netns.Listener(h.logf, h.b.NetMon())).ListenPacket(ctx, "udp4", ":0")
if err != nil {
st.Errors = append(st.Errors, fmt.Sprintf("Error creating IPv4 STUN listener: %v", err))
return
@ -249,7 +249,7 @@ func (h *Handler) serveDebugDERPRegion(w http.ResponseWriter, r *http.Request) {
serverPubKeys := make(map[key.NodePublic]bool)
for i := range 5 {
func() {
rc := derphttp.NewRegionClient(fakePrivKey, h.logf, h.netMon, func() *tailcfg.DERPRegion {
rc := derphttp.NewRegionClient(fakePrivKey, h.logf, h.b.NetMon(), func() *tailcfg.DERPRegion {
return &tailcfg.DERPRegion{
RegionID: reg.RegionID,
RegionCode: reg.RegionCode,

View File

@ -36,7 +36,6 @@
"tailscale.com/clientupdate"
"tailscale.com/drive"
"tailscale.com/envknob"
"tailscale.com/health"
"tailscale.com/hostinfo"
"tailscale.com/ipn"
"tailscale.com/ipn/ipnauth"
@ -156,8 +155,8 @@
// NewHandler creates a new LocalAPI HTTP handler. All parameters except netMon
// are required (if non-nil it's used to do faster interface lookups).
func NewHandler(b *ipnlocal.LocalBackend, logf logger.Logf, netMon *netmon.Monitor, logID logid.PublicID) *Handler {
return &Handler{b: b, logf: logf, netMon: netMon, backendLogID: logID, clock: tstime.StdClock{}}
func NewHandler(b *ipnlocal.LocalBackend, logf logger.Logf, logID logid.PublicID) *Handler {
return &Handler{b: b, logf: logf, backendLogID: logID, clock: tstime.StdClock{}}
}
type Handler struct {
@ -188,7 +187,6 @@ type Handler struct {
b *ipnlocal.LocalBackend
logf logger.Logf
netMon *netmon.Monitor // optional; nil means interfaces will be looked up on-demand
backendLogID logid.PublicID
clock tstime.Clock
}
@ -358,7 +356,7 @@ func (h *Handler) serveBugReport(w http.ResponseWriter, r *http.Request) {
}
hi, _ := json.Marshal(hostinfo.New())
h.logf("user bugreport hostinfo: %s", hi)
if err := health.Global.OverallError(); err != nil {
if err := h.b.HealthTracker().OverallError(); err != nil {
h.logf("user bugreport health: %s", err.Error())
} else {
h.logf("user bugreport health: ok")
@ -748,7 +746,7 @@ func (h *Handler) serveDebugPortmap(w http.ResponseWriter, r *http.Request) {
done := make(chan bool, 1)
var c *portmapper.Client
c = portmapper.NewClient(logger.WithPrefix(logf, "portmapper: "), h.netMon, debugKnobs, h.b.ControlKnobs(), func() {
c = portmapper.NewClient(logger.WithPrefix(logf, "portmapper: "), h.b.NetMon(), debugKnobs, h.b.ControlKnobs(), func() {
logf("portmapping changed.")
logf("have mapping: %v", c.HaveMapping())

View File

@ -233,7 +233,7 @@ func (s *Server) Loopback() (addr string, proxyCred, localAPICred string, err er
// out the CONNECT code from tailscaled/proxy.go that uses
// httputil.ReverseProxy and adding auth support.
go func() {
lah := localapi.NewHandler(s.lb, s.logf, s.netMon, s.logid)
lah := localapi.NewHandler(s.lb, s.logf, s.logid)
lah.PermitWrite = true
lah.PermitRead = true
lah.RequiredPassword = s.localAPICred
@ -517,11 +517,12 @@ func (s *Server) start() (reterr error) {
sys := new(tsd.System)
s.dialer = &tsdial.Dialer{Logf: logf} // mutated below (before used)
eng, err := wgengine.NewUserspaceEngine(logf, wgengine.Config{
ListenPort: s.Port,
NetMon: s.netMon,
Dialer: s.dialer,
SetSubsystem: sys.Set,
ControlKnobs: sys.ControlKnobs(),
ListenPort: s.Port,
NetMon: s.netMon,
Dialer: s.dialer,
SetSubsystem: sys.Set,
ControlKnobs: sys.ControlKnobs(),
HealthTracker: sys.HealthTracker(),
})
if err != nil {
return err
@ -606,7 +607,7 @@ func (s *Server) start() (reterr error) {
go s.printAuthURLLoop()
// Run the localapi handler, to allow fetching LetsEncrypt certs.
lah := localapi.NewHandler(lb, logf, s.netMon, s.logid)
lah := localapi.NewHandler(lb, logf, s.logid)
lah.PermitWrite = true
lah.PermitRead = true

View File

@ -165,7 +165,7 @@ func (c *Conn) maybeSetNearestDERP(report *netcheck.Report) (preferredDERP int)
if testenv.InTest() && !checkControlHealthDuringNearestDERPInTests {
connectedToControl = true
} else {
connectedToControl = health.Global.GetInPollNetMap()
connectedToControl = c.health.GetInPollNetMap()
}
if !connectedToControl {
c.mu.Lock()
@ -201,12 +201,12 @@ func (c *Conn) setNearestDERP(derpNum int) (wantDERP bool) {
defer c.mu.Unlock()
if !c.wantDerpLocked() {
c.myDerp = 0
health.Global.SetMagicSockDERPHome(0, c.homeless)
c.health.SetMagicSockDERPHome(0, c.homeless)
return false
}
if c.homeless {
c.myDerp = 0
health.Global.SetMagicSockDERPHome(0, c.homeless)
c.health.SetMagicSockDERPHome(0, c.homeless)
return false
}
if derpNum == c.myDerp {
@ -217,7 +217,7 @@ func (c *Conn) setNearestDERP(derpNum int) (wantDERP bool) {
metricDERPHomeChange.Add(1)
}
c.myDerp = derpNum
health.Global.SetMagicSockDERPHome(derpNum, c.homeless)
c.health.SetMagicSockDERPHome(derpNum, c.homeless)
if c.privateKey.IsZero() {
// No private key yet, so DERP connections won't come up anyway.
@ -400,7 +400,7 @@ func (c *Conn) derpWriteChanOfAddr(addr netip.AddrPort, peer key.NodePublic) cha
}
return derpMap.Regions[regionID]
})
dc.HealthTracker = health.Global
dc.HealthTracker = c.health
dc.SetCanAckPings(true)
dc.NotePreferred(c.myDerp == regionID)
@ -526,8 +526,8 @@ func (c *Conn) runDerpReader(ctx context.Context, derpFakeAddr netip.AddrPort, d
return n
}
defer health.Global.SetDERPRegionConnectedState(regionID, false)
defer health.Global.SetDERPRegionHealth(regionID, "")
defer c.health.SetDERPRegionConnectedState(regionID, false)
defer c.health.SetDERPRegionHealth(regionID, "")
// peerPresent is the set of senders we know are present on this
// connection, based on messages we've received from the server.
@ -539,7 +539,7 @@ func (c *Conn) runDerpReader(ctx context.Context, derpFakeAddr netip.AddrPort, d
for {
msg, connGen, err := dc.RecvDetail()
if err != nil {
health.Global.SetDERPRegionConnectedState(regionID, false)
c.health.SetDERPRegionConnectedState(regionID, false)
// Forget that all these peers have routes.
for peer := range peerPresent {
delete(peerPresent, peer)
@ -577,14 +577,14 @@ func (c *Conn) runDerpReader(ctx context.Context, derpFakeAddr netip.AddrPort, d
now := time.Now()
if lastPacketTime.IsZero() || now.Sub(lastPacketTime) > frameReceiveRecordRate {
health.Global.NoteDERPRegionReceivedFrame(regionID)
c.health.NoteDERPRegionReceivedFrame(regionID)
lastPacketTime = now
}
switch m := msg.(type) {
case derp.ServerInfoMessage:
health.Global.SetDERPRegionConnectedState(regionID, true)
health.Global.SetDERPRegionHealth(regionID, "") // until declared otherwise
c.health.SetDERPRegionConnectedState(regionID, true)
c.health.SetDERPRegionHealth(regionID, "") // until declared otherwise
c.logf("magicsock: derp-%d connected; connGen=%v", regionID, connGen)
continue
case derp.ReceivedPacket:
@ -624,7 +624,7 @@ func (c *Conn) runDerpReader(ctx context.Context, derpFakeAddr netip.AddrPort, d
}()
continue
case derp.HealthMessage:
health.Global.SetDERPRegionHealth(regionID, m.Problem)
c.health.SetDERPRegionHealth(regionID, m.Problem)
continue
case derp.PeerGoneMessage:
switch m.Reason {

View File

@ -91,6 +91,7 @@ type Conn struct {
testOnlyPacketListener nettype.PacketListener
noteRecvActivity func(key.NodePublic) // or nil, see Options.NoteRecvActivity
netMon *netmon.Monitor // or nil
health *health.Tracker // or nil
controlKnobs *controlknobs.Knobs // or nil
// ================================================================
@ -369,9 +370,13 @@ type Options struct {
NoteRecvActivity func(key.NodePublic)
// NetMon is the network monitor to use.
// With one, the portmapper won't be used.
// If nil, the portmapper won't be used.
NetMon *netmon.Monitor
// HealthTracker optionally specifies the health tracker to
// report errors and warnings to.
HealthTracker *health.Tracker
// ControlKnobs are the set of control knobs to use.
// If nil, they're ignored and not updated.
ControlKnobs *controlknobs.Knobs
@ -463,6 +468,7 @@ func NewConn(opts Options) (*Conn, error) {
c.portMapper.SetGatewayLookupFunc(opts.NetMon.GatewayAndSelfIP)
}
c.netMon = opts.NetMon
c.health = opts.HealthTracker
c.onPortUpdate = opts.OnPortUpdate
c.getPeerByKey = opts.PeerByKeyFunc
@ -666,7 +672,7 @@ func (c *Conn) updateNetInfo(ctx context.Context) (*netcheck.Report, error) {
// NOTE(andrew-d): I don't love that we're depending on the
// health package here, but I'd rather do that and not store
// the exact same state in two different places.
GetLastDERPActivity: health.Global.GetDERPRegionReceivedTime,
GetLastDERPActivity: c.health.GetDERPRegionReceivedTime,
})
if err != nil {
return nil, err
@ -2471,7 +2477,7 @@ func (c *Conn) bindSocket(ruc *RebindingUDPConn, network string, curPortFate cur
}
ruc.setConnLocked(pconn, network, c.bind.BatchSize())
if network == "udp4" {
health.Global.SetUDP4Unbound(false)
c.health.SetUDP4Unbound(false)
}
return nil
}
@ -2482,7 +2488,7 @@ func (c *Conn) bindSocket(ruc *RebindingUDPConn, network string, curPortFate cur
// we get a link change and we can try binding again.
ruc.setConnLocked(newBlockForeverConn(), "", c.bind.BatchSize())
if network == "udp4" {
health.Global.SetUDP4Unbound(true)
c.health.SetUDP4Unbound(true)
}
return fmt.Errorf("failed to bind any ports (tried %v)", ports)
}

View File

@ -3113,21 +3113,23 @@ func TestMaybeSetNearestDERP(t *testing.T) {
}
for _, tt := range testCases {
t.Run(tt.name, func(t *testing.T) {
ht := new(health.Tracker)
c := newConn()
c.logf = t.Logf
c.myDerp = tt.old
c.derpMap = derpMap
c.health = ht
report := &netcheck.Report{PreferredDERP: tt.reportDERP}
oldConnected := health.Global.GetInPollNetMap()
oldConnected := ht.GetInPollNetMap()
if tt.connectedToControl != oldConnected {
if tt.connectedToControl {
health.Global.GotStreamedMapResponse()
t.Cleanup(health.Global.SetOutOfPollNetMap)
ht.GotStreamedMapResponse()
t.Cleanup(ht.SetOutOfPollNetMap)
} else {
health.Global.SetOutOfPollNetMap()
t.Cleanup(health.Global.GotStreamedMapResponse)
ht.SetOutOfPollNetMap()
t.Cleanup(ht.GotStreamedMapResponse)
}
}

View File

@ -98,6 +98,7 @@ type userspaceEngine struct {
dns *dns.Manager
magicConn *magicsock.Conn
netMon *netmon.Monitor
health *health.Tracker
netMonOwned bool // whether we created netMon (and thus need to close it)
netMonUnregister func() // unsubscribes from changes; used regardless of netMonOwned
birdClient BIRDClient // or nil
@ -188,6 +189,9 @@ type Config struct {
// If nil, a new network monitor is created.
NetMon *netmon.Monitor
// HealthTracker, if non-nil, is the health tracker to use.
HealthTracker *health.Tracker
// Dialer is the dialer to use for outbound connections.
// If nil, a new Dialer is created
Dialer *tsdial.Dialer
@ -310,6 +314,7 @@ func NewUserspaceEngine(logf logger.Logf, conf Config) (_ Engine, reterr error)
birdClient: conf.BIRDClient,
controlKnobs: conf.ControlKnobs,
reconfigureVPN: conf.ReconfigureVPN,
health: conf.HealthTracker,
}
if e.birdClient != nil {
@ -372,6 +377,7 @@ func NewUserspaceEngine(logf logger.Logf, conf Config) (_ Engine, reterr error)
IdleFunc: e.tundev.IdleDuration,
NoteRecvActivity: e.noteRecvActivity,
NetMon: e.netMon,
HealthTracker: e.health,
ControlKnobs: conf.ControlKnobs,
OnPortUpdate: onPortUpdate,
PeerByKeyFunc: e.PeerByKey,
@ -970,7 +976,7 @@ func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config,
e.logf("wgengine: Reconfig: configuring router")
e.networkLogger.ReconfigRoutes(routerCfg)
err := e.router.Set(routerCfg)
health.Global.SetRouterHealth(err)
e.health.SetRouterHealth(err)
if err != nil {
return err
}
@ -979,7 +985,7 @@ func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config,
// assigned address.
e.logf("wgengine: Reconfig: configuring DNS")
err = e.dns.Set(*dnsCfg)
health.Global.SetDNSHealth(err)
e.health.SetDNSHealth(err)
if err != nil {
return err
}
@ -1183,7 +1189,7 @@ func (e *userspaceEngine) linkChange(delta *netmon.ChangeDelta) {
e.logf("[v1] LinkChange: minor")
}
health.Global.SetAnyInterfaceUp(up)
e.health.SetAnyInterfaceUp(up)
e.magicConn.SetNetworkUp(up)
if !up || changed {
if err := e.dns.FlushCaches(); err != nil {