health: introduce captive-portal-detected Warnable (#12707)

Updates tailscale/tailscale#1634

This PR introduces a new `captive-portal-detected` Warnable. It is set to an unhealthy state whenever a captive portal that prevents Tailscale from connecting is detected on the local network.
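
For reference, here is a minimal sketch of the `health.Warnable` pattern this change builds on, mirroring the registration and the `SetUnhealthy`/`SetHealthy` calls that appear in the diff below; the warnable code string and the standalone tracker are illustrative only, not part of this PR:

```go
package main

import (
	"fmt"

	"tailscale.com/health"
)

// exampleWarnable mirrors the captive-portal Warnable added by this PR; the
// Code here is made up so it does not clash with the real registration.
var exampleWarnable = health.Register(&health.Warnable{
	Code:                "example-captive-portal",
	Title:               "Captive portal detected",
	Severity:            health.SeverityHigh,
	Text:                health.StaticMessage("This network requires you to log in using your web browser."),
	ImpactsConnectivity: true,
})

func main() {
	// The backend gets its tracker from sys.HealthTracker(); a fresh Tracker
	// is assumed to be usable standalone here, purely for illustration.
	ht := new(health.Tracker)

	ht.SetUnhealthy(exampleWarnable, health.Args{}) // captive portal detected
	ht.SetHealthy(exampleWarnable)                  // portal gone or logged in

	fmt.Println("warnable toggled")
}
```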



ipn/ipnlocal: fix captive portal loop shutdown


Change-Id: I7cafdbce68463a16260091bcec1741501a070c95

net/captivedetection: fix mutex misuse

ipn/ipnlocal: ensure that we don't fail to start the timer


Change-Id: I3e43fb19264d793e8707c5031c0898e48e3e7465

Signed-off-by: Andrew Dunham <andrew@du.nham.ca>
Signed-off-by: Andrea Gottardo <andrea@gottardo.me>
Authored by Andrea Gottardo on 2024-07-26 11:25:55 -07:00; committed via GitHub.
Commit 90be06bd5b (parent cf97cff33b): 15 changed files with 750 additions and 154 deletions.


@@ -60,6 +60,7 @@ import (
"tailscale.com/ipn/policy"
"tailscale.com/log/sockstatlog"
"tailscale.com/logpolicy"
"tailscale.com/net/captivedetection"
"tailscale.com/net/dns"
"tailscale.com/net/dnscache"
"tailscale.com/net/dnsfallback"
@@ -344,6 +345,21 @@ type LocalBackend struct {
// refreshAutoExitNode indicates if the exit node should be recomputed when the next netcheck report is available.
refreshAutoExitNode bool
// captiveCtx and captiveCancel are used to control captive portal
// detection. They are protected by 'mu' and can be changed during the
// lifetime of a LocalBackend.
//
// captiveCtx will always be non-nil, though it might be a canceled
// context. captiveCancel is non-nil if checkCaptivePortalLoop is
// running, and is set to nil after being canceled.
captiveCtx context.Context
captiveCancel context.CancelFunc
// needsCaptiveDetection is a channel that is used to signal either
// that captive portal detection is required (sending true) or that the
// backend is healthy and captive portal detection is not required
// (sending false).
needsCaptiveDetection chan bool
}
// HealthTracker returns the health tracker for the backend.
@@ -398,27 +414,35 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
ctx, cancel := context.WithCancel(context.Background())
clock := tstime.StdClock{}
// Until we transition to a Running state, use a canceled context for
// our captive portal detection.
captiveCtx, captiveCancel := context.WithCancel(ctx)
captiveCancel()
b := &LocalBackend{
ctx: ctx,
ctxCancel: cancel,
logf: logf,
keyLogf: logger.LogOnChange(logf, 5*time.Minute, clock.Now),
statsLogf: logger.LogOnChange(logf, 5*time.Minute, clock.Now),
sys: sys,
health: sys.HealthTracker(),
e: e,
dialer: dialer,
store: store,
pm: pm,
backendLogID: logID,
state: ipn.NoState,
portpoll: new(portlist.Poller),
em: newExpiryManager(logf),
gotPortPollRes: make(chan struct{}),
loginFlags: loginFlags,
clock: clock,
selfUpdateProgress: make([]ipnstate.UpdateProgress, 0),
lastSelfUpdateState: ipnstate.UpdateFinished,
captiveCtx: captiveCtx,
captiveCancel: nil, // so that we start checkCaptivePortalLoop when Running
needsCaptiveDetection: make(chan bool),
}
mConn.SetNetInfoCallback(b.setNetInfo)
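
The `captiveCancel()` call immediately after `context.WithCancel` above is the pre-canceled-context trick: `captiveCtx` is never nil, and anything that selects on it before the detection loop has ever started returns right away. In isolation:

```go
package main

import (
	"context"
	"fmt"
)

func main() {
	// Create the context and cancel it immediately, as NewLocalBackend does
	// for captiveCtx: it is never nil, and selecting on Done() returns at
	// once until a live context replaces it.
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	select {
	case <-ctx.Done():
		fmt.Println("pre-canceled context: waiters return immediately")
	}
}
```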
@@ -669,6 +693,10 @@ func (b *LocalBackend) pauseOrResumeControlClientLocked() {
b.cc.SetPaused((b.state == ipn.Stopped && b.netMap != nil) || (!networkUp && !testenv.InTest() && !assumeNetworkUpdateForTest()))
}
// captivePortalDetectionInterval is the duration to wait in an unhealthy state with connectivity broken
// before running captive portal detection.
const captivePortalDetectionInterval = 2 * time.Second
// linkChange is our network monitor callback, called whenever the network changes.
func (b *LocalBackend) linkChange(delta *netmon.ChangeDelta) {
b.mu.Lock()
@@ -719,6 +747,44 @@ func (b *LocalBackend) onHealthChange(w *health.Warnable, us *health.UnhealthySt
b.send(ipn.Notify{
Health: state,
})
isConnectivityImpacted := false
for _, w := range state.Warnings {
// Ignore the captive portal warnable itself.
if w.ImpactsConnectivity && w.WarnableCode != captivePortalWarnable.Code {
isConnectivityImpacted = true
break
}
}
// captiveCtx can be changed, and is protected with 'mu'; grab that
// before we start our select, below.
//
// It is guaranteed to be non-nil.
b.mu.Lock()
ctx := b.captiveCtx
b.mu.Unlock()
// If the context is canceled, we don't need to do anything.
if ctx.Err() != nil {
return
}
if isConnectivityImpacted {
b.logf("health: connectivity impacted; triggering captive portal detection")
// Ensure that we select on captiveCtx so that we can time out
// triggering captive portal detection if the backend is shutdown.
select {
case b.needsCaptiveDetection <- true:
case <-ctx.Done():
}
} else {
select {
case b.needsCaptiveDetection <- false:
case <-ctx.Done():
}
}
}
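
Each send on `needsCaptiveDetection` above is paired with a `<-ctx.Done()` case, so a health change can never block forever once the detection loop has gone away. The same pattern in isolation (all names here are hypothetical):

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// trySignal sends v on ch unless ctx is canceled first, so the caller can
// never block forever on a receiver that has already shut down.
func trySignal(ctx context.Context, ch chan<- bool, v bool) {
	select {
	case ch <- v:
	case <-ctx.Done():
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	ch := make(chan bool)

	// No receiver is running; cancel shortly so trySignal can return.
	go func() {
		time.Sleep(50 * time.Millisecond)
		cancel()
	}()

	trySignal(ctx, ch, true)
	fmt.Println("returned without blocking")
}
```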
// Shutdown halts the backend and all its sub-components. The backend
@@ -731,6 +797,11 @@ func (b *LocalBackend) Shutdown() {
}
b.shutdownCalled = true
if b.captiveCancel != nil {
b.logf("canceling captive portal context")
b.captiveCancel()
}
if b.loginFlags&controlclient.LoginEphemeral != 0 {
b.mu.Unlock()
ctx, cancel := context.WithTimeout(b.ctx, 5*time.Second)
@@ -2097,6 +2168,122 @@ func (b *LocalBackend) updateFilterLocked(netMap *netmap.NetworkMap, prefs ipn.P
}
}
// captivePortalWarnable is a Warnable which is set to an unhealthy state when a captive portal is detected.
var captivePortalWarnable = health.Register(&health.Warnable{
Code: "captive-portal-detected",
Title: "Captive portal detected",
// High severity, because captive portals block all traffic and require user intervention.
Severity: health.SeverityHigh,
Text: health.StaticMessage("This network requires you to log in using your web browser."),
ImpactsConnectivity: true,
})
func (b *LocalBackend) checkCaptivePortalLoop(ctx context.Context) {
var tmr *time.Timer
maybeStartTimer := func() {
// If there's an existing timer, nothing to do; just continue
// waiting for it to expire. Otherwise, create a new timer.
if tmr == nil {
tmr = time.NewTimer(captivePortalDetectionInterval)
}
}
maybeStopTimer := func() {
if tmr == nil {
return
}
if !tmr.Stop() {
<-tmr.C
}
tmr = nil
}
for {
if ctx.Err() != nil {
maybeStopTimer()
return
}
// First, see if we have a signal on our "healthy" channel, which
// takes priority over an existing timer. Because a select is
// nondeterministic, we explicitly check this channel before
// entering the main select below, so that we're guaranteed to
// stop the timer before starting captive portal detection.
select {
case needsCaptiveDetection := <-b.needsCaptiveDetection:
if needsCaptiveDetection {
maybeStartTimer()
} else {
maybeStopTimer()
}
default:
}
var timerChan <-chan time.Time
if tmr != nil {
timerChan = tmr.C
}
select {
case <-ctx.Done():
// All done; stop the timer and then exit.
maybeStopTimer()
return
case <-timerChan:
// Kick off captive portal check
b.performCaptiveDetection()
// nil out timer to force recreation
tmr = nil
case needsCaptiveDetection := <-b.needsCaptiveDetection:
if needsCaptiveDetection {
maybeStartTimer()
} else {
// Healthy; cancel any existing timer
maybeStopTimer()
}
}
}
}
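
`maybeStopTimer` uses the standard `time.Timer` idiom: when `Stop` reports that the timer already fired, the tick sitting in the channel is drained so it cannot later be mistaken for a fresh expiry. In isolation:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	t := time.NewTimer(10 * time.Millisecond)
	time.Sleep(20 * time.Millisecond) // let the timer fire

	// Stop returns false because the timer already fired; drain the tick
	// buffered in t.C so it cannot be read later as a fresh expiry.
	if !t.Stop() {
		<-t.C
	}
	fmt.Println("timer stopped and drained")
}
```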
// performCaptiveDetection checks whether captive portal detection is enabled via control knob. If so, it runs
// the detection and updates the Warnable accordingly.
func (b *LocalBackend) performCaptiveDetection() {
if !b.shouldRunCaptivePortalDetection() {
return
}
d := captivedetection.NewDetector(b.logf)
var dm *tailcfg.DERPMap
b.mu.Lock()
if b.netMap != nil {
dm = b.netMap.DERPMap
}
preferredDERP := 0
if b.hostinfo != nil {
if b.hostinfo.NetInfo != nil {
preferredDERP = b.hostinfo.NetInfo.PreferredDERP
}
}
ctx := b.ctx
netMon := b.NetMon()
b.mu.Unlock()
found := d.Detect(ctx, netMon, dm, preferredDERP)
if found {
b.health.SetUnhealthy(captivePortalWarnable, health.Args{})
} else {
b.health.SetHealthy(captivePortalWarnable)
}
}
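
`performCaptiveDetection` snapshots what it needs (DERP map, preferred DERP, netmon) while holding `b.mu`, then releases the lock before the network probe runs. A minimal sketch of that snapshot-then-work pattern, with a made-up type:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// prober copies the fields it needs under the lock, then does the slow
// work without holding it, mirroring performCaptiveDetection's shape.
type prober struct {
	mu   sync.Mutex
	addr string
}

func (p *prober) check() {
	p.mu.Lock()
	addr := p.addr // snapshot under the lock
	p.mu.Unlock()

	// The slow part (here a sleep standing in for a network probe) runs
	// without holding mu.
	time.Sleep(10 * time.Millisecond)
	fmt.Println("probed", addr)
}

func main() {
	p := &prober{addr: "198.51.100.1"}
	p.check()
}
```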
// shouldRunCaptivePortalDetection reports whether captive portal detection
// should be run. It is enabled by default, but can be disabled via a control
// knob. It is also only run when the user explicitly wants the backend to be
// running.
func (b *LocalBackend) shouldRunCaptivePortalDetection() bool {
b.mu.Lock()
defer b.mu.Unlock()
return !b.ControlKnobs().DisableCaptivePortalDetection.Load() && b.pm.prefs.WantRunning()
}
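
The knob check above is a default-on gate: an atomic boolean that control can flip to disable the feature. A stdlib-only sketch of the same shape; the struct below is illustrative, not the real `controlknobs.Knobs`:

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// knobs stands in for the control-knob gate: the zero value means the
// feature is enabled, and control can flip the bool to turn it off.
type knobs struct {
	DisableCaptivePortalDetection atomic.Bool
}

func main() {
	var k knobs
	fmt.Println("run detection:", !k.DisableCaptivePortalDetection.Load()) // true by default

	k.DisableCaptivePortalDetection.Store(true) // knob pushed by control
	fmt.Println("run detection:", !k.DisableCaptivePortalDetection.Load()) // false
}
```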
// packetFilterPermitsUnlockedNodes reports any peer in peers with the
// UnsignedPeerAPIOnly bool set true has any of its allowed IPs in the packet
// filter.
@@ -4490,9 +4677,27 @@ func (b *LocalBackend) enterStateLockedOnEntry(newState ipn.State, unlock unlock
if newState == ipn.Running {
b.authURL = ""
b.authURLTime = time.Time{}
// Start a captive portal detection loop if none has been
// started. Create a new context if none is present, since it
// can be shut down if we transition away from Running.
if b.captiveCancel == nil {
b.captiveCtx, b.captiveCancel = context.WithCancel(b.ctx)
go b.checkCaptivePortalLoop(b.captiveCtx)
}
} else if oldState == ipn.Running {
// Transitioning away from running.
b.closePeerAPIListenersLocked()
// Stop any existing captive portal detection loop.
if b.captiveCancel != nil {
b.captiveCancel()
b.captiveCancel = nil
// NOTE: don't set captiveCtx to nil here, to ensure
// that we always have a (canceled) context to wait on
// in onHealthChange.
}
}
b.pauseOrResumeControlClientLocked()
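
Taken together, the loop's lifecycle follows a common start/stop pattern: `captiveCancel` is non-nil exactly while the goroutine runs, and `captiveCtx` stays non-nil (merely canceled) afterwards so other code can still select on it. A self-contained sketch under those assumptions (`portalChecker` is a made-up name):

```go
package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// portalChecker sketches the start/stop pattern used for
// checkCaptivePortalLoop: cancel is non-nil exactly while the loop
// goroutine runs, and ctx is always non-nil, if possibly canceled.
type portalChecker struct {
	mu     sync.Mutex
	ctx    context.Context
	cancel context.CancelFunc
}

func newPortalChecker() *portalChecker {
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // canceled until we enter the "running" state
	return &portalChecker{ctx: ctx}
}

func (p *portalChecker) setRunning(running bool) {
	p.mu.Lock()
	defer p.mu.Unlock()
	if running && p.cancel == nil {
		p.ctx, p.cancel = context.WithCancel(context.Background())
		go p.loop(p.ctx)
	} else if !running && p.cancel != nil {
		p.cancel()
		p.cancel = nil // keep p.ctx non-nil (and canceled) for waiters
	}
}

func (p *portalChecker) loop(ctx context.Context) {
	<-ctx.Done()
	fmt.Println("loop stopped")
}

func main() {
	p := newPortalChecker()
	p.setRunning(true)
	p.setRunning(false)
	time.Sleep(50 * time.Millisecond) // give the goroutine time to print
}
```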