From 90be06bd5b79798fb0fdaa996750acceb96f831c Mon Sep 17 00:00:00 2001
From: Andrea Gottardo
Date: Fri, 26 Jul 2024 11:25:55 -0700
Subject: [PATCH] health: introduce captive-portal-detected Warnable (#12707)

Updates tailscale/tailscale#1634

This PR introduces a new `captive-portal-detected` Warnable, which is set to
an unhealthy state whenever a captive portal that prevents Tailscale from
connecting is detected on the local network.

ipn/ipnlocal: fix captive portal loop shutdown

Change-Id: I7cafdbce68463a16260091bcec1741501a070c95

net/captivedetection: fix mutex misuse

ipn/ipnlocal: ensure that we don't fail to start the timer

Change-Id: I3e43fb19264d793e8707c5031c0898e48e3e7465

Signed-off-by: Andrew Dunham
Signed-off-by: Andrea Gottardo
---
 cmd/k8s-operator/depaware.txt                 |   1 +
 cmd/tailscale/depaware.txt                    |   3 +-
 cmd/tailscaled/depaware.txt                   |   1 +
 control/controlknobs/controlknobs.go          |   7 +
 ipn/ipnlocal/local.go                         | 245 ++++++++++++++++--
 net/captivedetection/captivedetection.go      | 217 ++++++++++++++++
 net/captivedetection/captivedetection_test.go |  58 +++++
 net/captivedetection/endpoints.go             | 178 +++++++++++++
 net/captivedetection/rawconn.go               |  19 ++
 net/captivedetection/rawconn_apple.go         |  24 ++
 net/dnsfallback/dnsfallback.go                |  11 +-
 net/dnsfallback/dnsfallback_test.go           |   4 +-
 net/netcheck/netcheck.go                      |  79 +-----
 net/netcheck/netcheck_test.go                 |  50 ----
 tailcfg/tailcfg.go                            |   7 +-
 15 files changed, 750 insertions(+), 154 deletions(-)
 create mode 100644 net/captivedetection/captivedetection.go
 create mode 100644 net/captivedetection/captivedetection_test.go
 create mode 100644 net/captivedetection/endpoints.go
 create mode 100644 net/captivedetection/rawconn.go
 create mode 100644 net/captivedetection/rawconn_apple.go

diff --git a/cmd/k8s-operator/depaware.txt b/cmd/k8s-operator/depaware.txt
index b5c0ed517..547f2ec57 100644
--- a/cmd/k8s-operator/depaware.txt
+++ b/cmd/k8s-operator/depaware.txt
@@ -701,6 +701,7 @@ tailscale.com/cmd/k8s-operator dependencies: (generated by github.com/tailscale/
  tailscale.com/logtail/backoff from tailscale.com/control/controlclient+
  tailscale.com/logtail/filch from tailscale.com/log/sockstatlog+
  tailscale.com/metrics from tailscale.com/derp+
+ tailscale.com/net/captivedetection from tailscale.com/ipn/ipnlocal+
  tailscale.com/net/connstats from tailscale.com/net/tstun+
  tailscale.com/net/dns from tailscale.com/ipn/ipnlocal+
  tailscale.com/net/dns/publicdns from tailscale.com/net/dns+
diff --git a/cmd/tailscale/depaware.txt b/cmd/tailscale/depaware.txt
index 80b011d04..c03be655d 100644
--- a/cmd/tailscale/depaware.txt
+++ b/cmd/tailscale/depaware.txt
@@ -100,9 +100,10 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep
  tailscale.com/ipn/ipnstate from tailscale.com/client/tailscale+
  tailscale.com/licenses from tailscale.com/client/web+
  tailscale.com/metrics from tailscale.com/derp
+ tailscale.com/net/captivedetection from tailscale.com/net/netcheck
  tailscale.com/net/dns/recursive from tailscale.com/net/dnsfallback
  tailscale.com/net/dnscache from tailscale.com/control/controlhttp+
- tailscale.com/net/dnsfallback from tailscale.com/control/controlhttp
+ tailscale.com/net/dnsfallback from tailscale.com/control/controlhttp+
  tailscale.com/net/flowtrack from tailscale.com/net/packet
  tailscale.com/net/netaddr from tailscale.com/ipn+
  tailscale.com/net/netcheck from tailscale.com/cmd/tailscale/cli
diff --git a/cmd/tailscaled/depaware.txt b/cmd/tailscaled/depaware.txt
index 5b37778f8..5512e9eff 100644
--- a/cmd/tailscaled/depaware.txt
+++
b/cmd/tailscaled/depaware.txt @@ -288,6 +288,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de tailscale.com/logtail/backoff from tailscale.com/cmd/tailscaled+ tailscale.com/logtail/filch from tailscale.com/log/sockstatlog+ tailscale.com/metrics from tailscale.com/derp+ + tailscale.com/net/captivedetection from tailscale.com/ipn/ipnlocal+ tailscale.com/net/connstats from tailscale.com/net/tstun+ tailscale.com/net/dns from tailscale.com/cmd/tailscaled+ tailscale.com/net/dns/publicdns from tailscale.com/net/dns+ diff --git a/control/controlknobs/controlknobs.go b/control/controlknobs/controlknobs.go index 7315a10f7..dd76a3abd 100644 --- a/control/controlknobs/controlknobs.go +++ b/control/controlknobs/controlknobs.go @@ -99,6 +99,10 @@ type Knobs struct { // DisableCryptorouting indicates that the node should not use the // magicsock crypto routing feature. DisableCryptorouting atomic.Bool + + // DisableCaptivePortalDetection is whether the node should not perform captive portal detection + // automatically when the network state changes. + DisableCaptivePortalDetection atomic.Bool } // UpdateFromNodeAttributes updates k (if non-nil) based on the provided self @@ -127,6 +131,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) { disableSplitDNSWhenNoCustomResolvers = has(tailcfg.NodeAttrDisableSplitDNSWhenNoCustomResolvers) disableLocalDNSOverrideViaNRPT = has(tailcfg.NodeAttrDisableLocalDNSOverrideViaNRPT) disableCryptorouting = has(tailcfg.NodeAttrDisableMagicSockCryptoRouting) + disableCaptivePortalDetection = has(tailcfg.NodeAttrDisableCaptivePortalDetection) ) if has(tailcfg.NodeAttrOneCGNATEnable) { @@ -153,6 +158,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) { k.DisableSplitDNSWhenNoCustomResolvers.Store(disableSplitDNSWhenNoCustomResolvers) k.DisableLocalDNSOverrideViaNRPT.Store(disableLocalDNSOverrideViaNRPT) k.DisableCryptorouting.Store(disableCryptorouting) + k.DisableCaptivePortalDetection.Store(disableCaptivePortalDetection) } // AsDebugJSON returns k as something that can be marshalled with json.Marshal @@ -180,5 +186,6 @@ func (k *Knobs) AsDebugJSON() map[string]any { "DisableSplitDNSWhenNoCustomResolvers": k.DisableSplitDNSWhenNoCustomResolvers.Load(), "DisableLocalDNSOverrideViaNRPT": k.DisableLocalDNSOverrideViaNRPT.Load(), "DisableCryptorouting": k.DisableCryptorouting.Load(), + "DisableCaptivePortalDetection": k.DisableCaptivePortalDetection.Load(), } } diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index ede29156b..14b648b9f 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -60,6 +60,7 @@ "tailscale.com/ipn/policy" "tailscale.com/log/sockstatlog" "tailscale.com/logpolicy" + "tailscale.com/net/captivedetection" "tailscale.com/net/dns" "tailscale.com/net/dnscache" "tailscale.com/net/dnsfallback" @@ -344,6 +345,21 @@ type LocalBackend struct { // refreshAutoExitNode indicates if the exit node should be recomputed when the next netcheck report is available. refreshAutoExitNode bool + + // captiveCtx and captiveCancel are used to control captive portal + // detection. They are protected by 'mu' and can be changed during the + // lifetime of a LocalBackend. + // + // captiveCtx will always be non-nil, though it might be a canceled + // context. captiveCancel is non-nil if checkCaptivePortalLoop is + // running, and is set to nil after being canceled. 
+ captiveCtx context.Context + captiveCancel context.CancelFunc + // needsCaptiveDetection is a channel that is used to signal either + // that captive portal detection is required (sending true) or that the + // backend is healthy and captive portal detection is not required + // (sending false). + needsCaptiveDetection chan bool } // HealthTracker returns the health tracker for the backend. @@ -398,27 +414,35 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo ctx, cancel := context.WithCancel(context.Background()) clock := tstime.StdClock{} + // Until we transition to a Running state, use a canceled context for + // our captive portal detection. + captiveCtx, captiveCancel := context.WithCancel(ctx) + captiveCancel() + b := &LocalBackend{ - ctx: ctx, - ctxCancel: cancel, - logf: logf, - keyLogf: logger.LogOnChange(logf, 5*time.Minute, clock.Now), - statsLogf: logger.LogOnChange(logf, 5*time.Minute, clock.Now), - sys: sys, - health: sys.HealthTracker(), - e: e, - dialer: dialer, - store: store, - pm: pm, - backendLogID: logID, - state: ipn.NoState, - portpoll: new(portlist.Poller), - em: newExpiryManager(logf), - gotPortPollRes: make(chan struct{}), - loginFlags: loginFlags, - clock: clock, - selfUpdateProgress: make([]ipnstate.UpdateProgress, 0), - lastSelfUpdateState: ipnstate.UpdateFinished, + ctx: ctx, + ctxCancel: cancel, + logf: logf, + keyLogf: logger.LogOnChange(logf, 5*time.Minute, clock.Now), + statsLogf: logger.LogOnChange(logf, 5*time.Minute, clock.Now), + sys: sys, + health: sys.HealthTracker(), + e: e, + dialer: dialer, + store: store, + pm: pm, + backendLogID: logID, + state: ipn.NoState, + portpoll: new(portlist.Poller), + em: newExpiryManager(logf), + gotPortPollRes: make(chan struct{}), + loginFlags: loginFlags, + clock: clock, + selfUpdateProgress: make([]ipnstate.UpdateProgress, 0), + lastSelfUpdateState: ipnstate.UpdateFinished, + captiveCtx: captiveCtx, + captiveCancel: nil, // so that we start checkCaptivePortalLoop when Running + needsCaptiveDetection: make(chan bool), } mConn.SetNetInfoCallback(b.setNetInfo) @@ -669,6 +693,10 @@ func (b *LocalBackend) pauseOrResumeControlClientLocked() { b.cc.SetPaused((b.state == ipn.Stopped && b.netMap != nil) || (!networkUp && !testenv.InTest() && !assumeNetworkUpdateForTest())) } +// captivePortalDetectionInterval is the duration to wait in an unhealthy state with connectivity broken +// before running captive portal detection. +const captivePortalDetectionInterval = 2 * time.Second + // linkChange is our network monitor callback, called whenever the network changes. func (b *LocalBackend) linkChange(delta *netmon.ChangeDelta) { b.mu.Lock() @@ -719,6 +747,44 @@ func (b *LocalBackend) onHealthChange(w *health.Warnable, us *health.UnhealthySt b.send(ipn.Notify{ Health: state, }) + + isConnectivityImpacted := false + for _, w := range state.Warnings { + // Ignore the captive portal warnable itself. + if w.ImpactsConnectivity && w.WarnableCode != captivePortalWarnable.Code { + isConnectivityImpacted = true + break + } + } + + // captiveCtx can be changed, and is protected with 'mu'; grab that + // before we start our select, below. + // + // It is guaranteed to be non-nil. + b.mu.Lock() + ctx := b.captiveCtx + b.mu.Unlock() + + // If the context is canceled, we don't need to do anything. 
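+	// (That happens when the backend is shutting down, or when it has left
+	// the Running state; see Shutdown and enterStateLockedOnEntry below.)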
+	if ctx.Err() != nil {
+		return
+	}
+
+	if isConnectivityImpacted {
+		b.logf("health: connectivity impacted; triggering captive portal detection")
+
+		// Ensure that we select on captiveCtx so that we don't block
+		// trying to trigger captive portal detection if the backend is
+		// shut down.
+		select {
+		case b.needsCaptiveDetection <- true:
+		case <-ctx.Done():
+		}
+	} else {
+		select {
+		case b.needsCaptiveDetection <- false:
+		case <-ctx.Done():
+		}
+	}
 }

 // Shutdown halts the backend and all its sub-components. The backend
@@ -731,6 +797,11 @@ func (b *LocalBackend) Shutdown() {
 	}
 	b.shutdownCalled = true

+	if b.captiveCancel != nil {
+		b.logf("canceling captive portal context")
+		b.captiveCancel()
+	}
+
 	if b.loginFlags&controlclient.LoginEphemeral != 0 {
 		b.mu.Unlock()
 		ctx, cancel := context.WithTimeout(b.ctx, 5*time.Second)
@@ -2097,6 +2168,122 @@ func (b *LocalBackend) updateFilterLocked(netMap *netmap.NetworkMap, prefs ipn.P
 	}
 }
+
+// captivePortalWarnable is a Warnable which is set to an unhealthy state when a captive portal is detected.
+var captivePortalWarnable = health.Register(&health.Warnable{
+	Code:  "captive-portal-detected",
+	Title: "Captive portal detected",
+	// High severity, because captive portals block all traffic and require user intervention.
+	Severity:            health.SeverityHigh,
+	Text:                health.StaticMessage("This network requires you to log in using your web browser."),
+	ImpactsConnectivity: true,
+})
+
+func (b *LocalBackend) checkCaptivePortalLoop(ctx context.Context) {
+	var tmr *time.Timer
+
+	maybeStartTimer := func() {
+		// If there's an existing timer, nothing to do; just continue
+		// waiting for it to expire. Otherwise, create a new timer.
+		if tmr == nil {
+			tmr = time.NewTimer(captivePortalDetectionInterval)
+		}
+	}
+	maybeStopTimer := func() {
+		if tmr == nil {
+			return
+		}
+		if !tmr.Stop() {
+			<-tmr.C
+		}
+		tmr = nil
+	}
+
+	for {
+		if ctx.Err() != nil {
+			maybeStopTimer()
+			return
+		}
+
+		// First, see if we have a signal on our "healthy" channel, which
+		// takes priority over an existing timer. Because a select is
+		// nondeterministic, we explicitly check this channel before
+		// entering the main select below, so that we're guaranteed to
+		// stop the timer before starting captive portal detection.
+		select {
+		case needsCaptiveDetection := <-b.needsCaptiveDetection:
+			if needsCaptiveDetection {
+				maybeStartTimer()
+			} else {
+				maybeStopTimer()
+			}
+		default:
+		}
+
+		var timerChan <-chan time.Time
+		if tmr != nil {
+			timerChan = tmr.C
+		}
+		select {
+		case <-ctx.Done():
+			// All done; stop the timer and then exit.
+			maybeStopTimer()
+			return
+		case <-timerChan:
+			// Kick off captive portal check
+			b.performCaptiveDetection()
+			// nil out timer to force recreation
+			tmr = nil
+		case needsCaptiveDetection := <-b.needsCaptiveDetection:
+			if needsCaptiveDetection {
+				maybeStartTimer()
+			} else {
+				// Healthy; cancel any existing timer
+				maybeStopTimer()
+			}
+		}
+	}
+}
+
+// performCaptiveDetection checks if captive portal detection is enabled via a control
+// knob. If so, it runs the detection and updates the Warnable accordingly.
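+// It is called from checkCaptivePortalLoop each time the detection timer fires,
+// and blocks until a detection attempt completes.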
+func (b *LocalBackend) performCaptiveDetection() { + if !b.shouldRunCaptivePortalDetection() { + return + } + + d := captivedetection.NewDetector(b.logf) + var dm *tailcfg.DERPMap + b.mu.Lock() + if b.netMap != nil { + dm = b.netMap.DERPMap + } + preferredDERP := 0 + if b.hostinfo != nil { + if b.hostinfo.NetInfo != nil { + preferredDERP = b.hostinfo.NetInfo.PreferredDERP + } + } + ctx := b.ctx + netMon := b.NetMon() + b.mu.Unlock() + found := d.Detect(ctx, netMon, dm, preferredDERP) + if found { + b.health.SetUnhealthy(captivePortalWarnable, health.Args{}) + } else { + b.health.SetHealthy(captivePortalWarnable) + } +} + +// shouldRunCaptivePortalDetection reports whether captive portal detection +// should be run. It is enabled by default, but can be disabled via a control +// knob. It is also only run when the user explicitly wants the backend to be +// running. +func (b *LocalBackend) shouldRunCaptivePortalDetection() bool { + b.mu.Lock() + defer b.mu.Unlock() + return !b.ControlKnobs().DisableCaptivePortalDetection.Load() && b.pm.prefs.WantRunning() +} + // packetFilterPermitsUnlockedNodes reports any peer in peers with the // UnsignedPeerAPIOnly bool set true has any of its allowed IPs in the packet // filter. @@ -4490,9 +4677,27 @@ func (b *LocalBackend) enterStateLockedOnEntry(newState ipn.State, unlock unlock if newState == ipn.Running { b.authURL = "" b.authURLTime = time.Time{} + + // Start a captive portal detection loop if none has been + // started. Create a new context if none is present, since it + // can be shut down if we transition away from Running. + if b.captiveCancel == nil { + b.captiveCtx, b.captiveCancel = context.WithCancel(b.ctx) + go b.checkCaptivePortalLoop(b.captiveCtx) + } } else if oldState == ipn.Running { // Transitioning away from running. b.closePeerAPIListenersLocked() + + // Stop any existing captive portal detection loop. + if b.captiveCancel != nil { + b.captiveCancel() + b.captiveCancel = nil + + // NOTE: don't set captiveCtx to nil here, to ensure + // that we always have a (canceled) context to wait on + // in onHealthChange. + } } b.pauseOrResumeControlClientLocked() diff --git a/net/captivedetection/captivedetection.go b/net/captivedetection/captivedetection.go new file mode 100644 index 000000000..1b6e61e54 --- /dev/null +++ b/net/captivedetection/captivedetection.go @@ -0,0 +1,217 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +// Package captivedetection provides a way to detect if the system is connected to a network that has +// a captive portal. It does this by making HTTP requests to known captive portal detection endpoints +// and checking if the HTTP responses indicate that a captive portal might be present. +package captivedetection + +import ( + "context" + "net" + "net/http" + "runtime" + "strings" + "sync" + "syscall" + "time" + + "tailscale.com/net/netmon" + "tailscale.com/tailcfg" + "tailscale.com/types/logger" +) + +// Detector checks whether the system is behind a captive portal. +type Detector struct { + + // httpClient is the HTTP client that is used for captive portal detection. It is configured + // to not follow redirects, have a short timeout and no keep-alive. + httpClient *http.Client + // currIfIndex is the index of the interface that is currently being used by the httpClient. + currIfIndex int + // mu guards currIfIndex. + mu sync.Mutex + // logf is the logger used for logging messages. If it is nil, log.Printf is used. 
+	logf logger.Logf
+}
+
+// NewDetector creates a new Detector instance for captive portal detection.
+func NewDetector(logf logger.Logf) *Detector {
+	d := &Detector{logf: logf}
+	d.httpClient = &http.Client{
+		// No redirects allowed
+		CheckRedirect: func(req *http.Request, via []*http.Request) error {
+			return http.ErrUseLastResponse
+		},
+		Transport: &http.Transport{
+			DialContext:       d.dialContext,
+			DisableKeepAlives: true,
+		},
+		Timeout: Timeout,
+	}
+	return d
+}
+
+// Timeout is the timeout for captive portal detection requests. Because the captive portal intercepting our requests
+// is usually located on the LAN, this is a relatively short timeout.
+const Timeout = 3 * time.Second
+
+// Detect is the entry point to the API. It attempts to detect if the system is behind a captive portal
+// by making HTTP requests to known captive portal detection Endpoints. If any of the requests return a response code
+// or body that looks like a captive portal, Detect returns true. It returns false in all other cases, including when any
+// error occurs during a detection attempt.
+//
+// This function might take a while to return, as it will attempt to detect a captive portal on all available interfaces
+// by performing multiple HTTP requests. It should be called in a separate goroutine if you want to avoid blocking.
+func (d *Detector) Detect(ctx context.Context, netMon *netmon.Monitor, derpMap *tailcfg.DERPMap, preferredDERPRegionID int) (found bool) {
+	return d.detectCaptivePortalWithGOOS(ctx, netMon, derpMap, preferredDERPRegionID, runtime.GOOS)
+}
+
+func (d *Detector) detectCaptivePortalWithGOOS(ctx context.Context, netMon *netmon.Monitor, derpMap *tailcfg.DERPMap, preferredDERPRegionID int, goos string) (found bool) {
+	ifState := netMon.InterfaceState()
+	if !ifState.AnyInterfaceUp() {
+		d.logf("[v2] DetectCaptivePortal: no interfaces up, returning false")
+		return false
+	}
+
+	endpoints := availableEndpoints(derpMap, preferredDERPRegionID, d.logf, goos)
+
+	// Here we try detecting a captive portal using *all* available interfaces on the system
+	// that have an IPv4 address. We consider a captive portal to have been found when any
+	// interface reports that one may exist. This is necessary because most systems have
+	// multiple interfaces, and most importantly on macOS no default route interface is set
+	// until the user has accepted the captive portal alert thrown by the system. If no default
+	// route interface is known, we need to try with anything that might remotely resemble a
+	// Wi-Fi interface.
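+	//
+	// For example, on a macOS laptop this loop might end up probing en0
+	// (Wi-Fi) while skipping interfaces such as utun*, awdl*, and lo0 via
+	// interfaceNameDoesNotNeedCaptiveDetection below. (Interface names here
+	// are illustrative, not exhaustive.)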
+	for ifName, i := range ifState.Interface {
+		if !i.IsUp() || i.IsLoopback() || interfaceNameDoesNotNeedCaptiveDetection(ifName, goos) {
+			continue
+		}
+		addrs, err := i.Addrs()
+		if err != nil {
+			d.logf("[v1] DetectCaptivePortal: failed to get addresses for interface %s: %v", ifName, err)
+			continue
+		}
+		if len(addrs) == 0 {
+			continue
+		}
+		d.logf("[v2] attempting to do captive portal detection on interface %s", ifName)
+		res := d.detectOnInterface(ctx, i.Index, endpoints)
+		if res {
+			d.logf("DetectCaptivePortal(found=true,ifName=%s)", ifName)
+			return true
+		}
+	}
+
+	d.logf("DetectCaptivePortal(found=false)")
+	return false
+}
+
+func interfaceNameDoesNotNeedCaptiveDetection(ifName string, goos string) bool {
+	ifName = strings.ToLower(ifName)
+	excludedPrefixes := []string{"tailscale", "tun", "tap", "docker", "kube", "wg"}
+	if goos == "windows" {
+		excludedPrefixes = append(excludedPrefixes, "loopback", "tunnel", "ppp", "isatap", "teredo", "6to4")
+	} else if goos == "darwin" || goos == "ios" {
+		excludedPrefixes = append(excludedPrefixes, "awdl", "bridge", "ap", "utun", "tap", "llw", "anpi", "lo", "stf", "gif", "xhc")
+	}
+	for _, prefix := range excludedPrefixes {
+		if strings.HasPrefix(ifName, prefix) {
+			return true
+		}
+	}
+	return false
+}
+
+// detectOnInterface reports whether or not we think the system is behind a
+// captive portal, detected by making a request to a URL that we know should
+// return a "204 No Content" response and checking if that's what we get.
+//
+// The boolean return is whether we think we have a captive portal.
+func (d *Detector) detectOnInterface(ctx context.Context, ifIndex int, endpoints []Endpoint) bool {
+	defer d.httpClient.CloseIdleConnections()
+
+	d.logf("[v2] %d available captive portal detection endpoints: %v", len(endpoints), endpoints)
+
+	// We try to detect the captive portal more quickly by making requests to multiple endpoints concurrently.
+	var wg sync.WaitGroup
+	resultCh := make(chan bool, len(endpoints))
+
+	for i, e := range endpoints {
+		if i >= 5 {
+			// Try a maximum of 5 endpoints; stop starting new checks once we run out of attempts.
+			break
+		}
+		wg.Add(1)
+		go func(endpoint Endpoint) {
+			defer wg.Done()
+			found, err := d.verifyCaptivePortalEndpoint(ctx, endpoint, ifIndex)
+			if err != nil {
+				d.logf("[v1] verifyCaptivePortalEndpoint failed with endpoint %v: %v", endpoint, err)
+				return
+			}
+			if found {
+				resultCh <- true
+			}
+		}(e)
+	}
+
+	go func() {
+		wg.Wait()
+		close(resultCh)
+	}()
+
+	for result := range resultCh {
+		if result {
+			// If any of the endpoints seems to be a captive portal, we consider the system to be behind one.
+			return true
+		}
+	}
+
+	return false
+}
+
+// verifyCaptivePortalEndpoint checks if the given Endpoint is a captive portal by making an HTTP request to the
+// given Endpoint URL using the interface with index ifIndex, and checking if the response looks like a captive portal.
+func (d *Detector) verifyCaptivePortalEndpoint(ctx context.Context, e Endpoint, ifIndex int) (found bool, err error) {
+	req, err := http.NewRequestWithContext(ctx, "GET", e.URL.String(), nil)
+	if err != nil {
+		return false, err
+	}
+
+	// Attach the Tailscale challenge header if the endpoint supports it;
+	// not all captive portal detection endpoints do.
+	if e.SupportsTailscaleChallenge {
+		// Note: the set of valid characters in a challenge and the total
+		// length is limited; see isChallengeChar in cmd/derper for more
+		// details.
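+		//
+		// Illustrative exchange (the address is a documentation IP):
+		//
+		//	GET http://203.0.113.7/generate_204
+		//	X-Tailscale-Challenge: ts_203.0.113.7
+		//
+		//	HTTP/1.1 204 No Content
+		//	X-Tailscale-Response: response ts_203.0.113.7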
+		chal := "ts_" + e.URL.Host
+		req.Header.Set("X-Tailscale-Challenge", chal)
+	}
+
+	d.mu.Lock()
+	d.currIfIndex = ifIndex
+	d.mu.Unlock()
+
+	// Make the actual request, and check if the response looks like a captive portal or not.
+	r, err := d.httpClient.Do(req)
+	if err != nil {
+		return false, err
+	}
+
+	return e.responseLooksLikeCaptive(r, d.logf), nil
+}
+
+func (d *Detector) dialContext(ctx context.Context, network, addr string) (net.Conn, error) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	ifIndex := d.currIfIndex
+
+	dl := net.Dialer{
+		Control: func(network, address string, c syscall.RawConn) error {
+			return setSocketInterfaceIndex(c, ifIndex, d.logf)
+		},
+	}
+
+	return dl.DialContext(ctx, network, addr)
+}
diff --git a/net/captivedetection/captivedetection_test.go b/net/captivedetection/captivedetection_test.go
new file mode 100644
index 000000000..033f14cba
--- /dev/null
+++ b/net/captivedetection/captivedetection_test.go
@@ -0,0 +1,58 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+package captivedetection
+
+import (
+	"context"
+	"runtime"
+	"sync"
+	"testing"
+
+	"tailscale.com/net/netmon"
+)
+
+func TestAvailableEndpointsAlwaysAtLeastTwo(t *testing.T) {
+	endpoints := availableEndpoints(nil, 0, t.Logf, runtime.GOOS)
+	if len(endpoints) == 0 {
+		t.Errorf("Expected non-empty AvailableEndpoints, got an empty slice instead")
+	}
+	if len(endpoints) == 1 {
+		t.Errorf("Expected at least two AvailableEndpoints for redundancy, got only one instead")
+	}
+	for _, e := range endpoints {
+		if e.URL.Scheme != "http" {
+			t.Errorf("Expected HTTP URL in Endpoint, got %q", e.URL.Scheme)
+		}
+	}
+}
+
+func TestDetectCaptivePortalReturnsFalse(t *testing.T) {
+	d := NewDetector(t.Logf)
+	found := d.Detect(context.Background(), netmon.NewStatic(), nil, 0)
+	if found {
+		t.Errorf("DetectCaptivePortal returned true, expected false.")
+	}
+}
+
+func TestAllEndpointsAreUpAndReturnExpectedResponse(t *testing.T) {
+	d := NewDetector(t.Logf)
+	endpoints := availableEndpoints(nil, 0, t.Logf, runtime.GOOS)
+
+	var wg sync.WaitGroup
+	for _, e := range endpoints {
+		wg.Add(1)
+		go func(endpoint Endpoint) {
+			defer wg.Done()
+			found, err := d.verifyCaptivePortalEndpoint(context.Background(), endpoint, 0)
+			if err != nil {
+				t.Errorf("verifyCaptivePortalEndpoint failed with endpoint %v: %v", endpoint, err)
+			}
+			if found {
+				t.Errorf("verifyCaptivePortalEndpoint with endpoint %v says we're behind a captive portal, but we aren't", endpoint)
+			}
+		}(e)
+	}
+
+	wg.Wait()
+}
diff --git a/net/captivedetection/endpoints.go b/net/captivedetection/endpoints.go
new file mode 100644
index 000000000..450ed4a1c
--- /dev/null
+++ b/net/captivedetection/endpoints.go
@@ -0,0 +1,178 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+package captivedetection
+
+import (
+	"cmp"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"slices"
+
+	"go4.org/mem"
+	"tailscale.com/net/dnsfallback"
+	"tailscale.com/tailcfg"
+	"tailscale.com/types/logger"
+)
+
+// EndpointProvider is an enum that represents the source of an Endpoint.
+type EndpointProvider int
+
+const (
+	// DERPMapPreferred is used for an endpoint that is a DERP node contained in the current preferred DERP region,
+	// as provided by the DERPMap.
+	DERPMapPreferred EndpointProvider = iota
+	// DERPMapOther is used for an endpoint that is a DERP node, but not contained in the current preferred DERP region.
+	DERPMapOther
+	// Tailscale is used for endpoints that are the Tailscale coordination server or admin console.
+	Tailscale
+)
+
+func (p EndpointProvider) String() string {
+	switch p {
+	case DERPMapPreferred:
+		return "DERPMapPreferred"
+	case Tailscale:
+		return "Tailscale"
+	case DERPMapOther:
+		return "DERPMapOther"
+	default:
+		return fmt.Sprintf("EndpointProvider(%d)", p)
+	}
+}
+
+// Endpoint represents a URL that can be used to detect a captive portal, along with the expected
+// result of the HTTP request.
+type Endpoint struct {
+	// URL is the URL that we make an HTTP request to as part of the captive portal detection process.
+	URL *url.URL
+	// StatusCode is the HTTP status code we expect to see in the response.
+	StatusCode int
+	// ExpectedContent is a string that we expect to see contained in the response body. If this is non-empty,
+	// we will check that the response body contains this string. If it is empty, we will not check the response body
+	// and only check the status code.
+	ExpectedContent string
+	// SupportsTailscaleChallenge is true if the endpoint will return the sent value of the X-Tailscale-Challenge
+	// HTTP header in its HTTP response.
+	SupportsTailscaleChallenge bool
+	// Provider is the source of the endpoint. This is used to prioritize certain endpoints over others
+	// (for example, a DERP node in the preferred region should always be used first).
+	Provider EndpointProvider
+}
+
+func (e Endpoint) String() string {
+	return fmt.Sprintf("Endpoint{URL=%q, StatusCode=%d, ExpectedContent=%q, SupportsTailscaleChallenge=%v, Provider=%s}", e.URL, e.StatusCode, e.ExpectedContent, e.SupportsTailscaleChallenge, e.Provider.String())
+}
+
+func (e Endpoint) Equal(other Endpoint) bool {
+	return e.URL.String() == other.URL.String() &&
+		e.StatusCode == other.StatusCode &&
+		e.ExpectedContent == other.ExpectedContent &&
+		e.SupportsTailscaleChallenge == other.SupportsTailscaleChallenge &&
+		e.Provider == other.Provider
+}
+
+// availableEndpoints returns a set of Endpoints which can be used for captive portal detection by performing
+// one or more HTTP requests and looking at the response. The returned Endpoints are ordered by preference,
+// with the most preferred Endpoint being the first in the slice.
+func availableEndpoints(derpMap *tailcfg.DERPMap, preferredDERPRegionID int, logf logger.Logf, goos string) []Endpoint {
+	endpoints := []Endpoint{}
+
+	if derpMap == nil || len(derpMap.Regions) == 0 {
+		// When the client first starts, we don't have a DERPMap in LocalBackend yet. In this case,
+		// we use the static DERPMap from dnsfallback.
+		logf("captivedetection: current DERPMap is empty, using map from dnsfallback")
+		derpMap = dnsfallback.GetDERPMap()
+	}
+	// Use the DERP IPs as captive portal detection endpoints. Using IPs is better than hostnames
+	// because they do not depend on DNS resolution.
+	for _, region := range derpMap.Regions {
+		if region.Avoid {
+			continue
+		}
+		for _, node := range region.Nodes {
+			if node.IPv4 == "" || !node.CanPort80 {
+				continue
+			}
+			str := "http://" + node.IPv4 + "/generate_204"
+			u, err := url.Parse(str)
+			if err != nil {
+				logf("captivedetection: failed to parse DERP node URL %q: %v", str, err)
+				continue
+			}
+			p := DERPMapOther
+			if region.RegionID == preferredDERPRegionID {
+				p = DERPMapPreferred
+			}
+			e := Endpoint{u, http.StatusNoContent, "", true, p}
+			endpoints = append(endpoints, e)
+		}
+	}
+
+	// Let's also try the default Tailscale coordination server and admin console.
+	// These are likely to be blocked on some networks.
+	appendTailscaleEndpoint := func(urlString string) {
+		u, err := url.Parse(urlString)
+		if err != nil {
+			logf("captivedetection: failed to parse Tailscale URL %q: %v", urlString, err)
+			return
+		}
+		endpoints = append(endpoints, Endpoint{u, http.StatusNoContent, "", false, Tailscale})
+	}
+	appendTailscaleEndpoint("http://controlplane.tailscale.com/generate_204")
+	appendTailscaleEndpoint("http://login.tailscale.com/generate_204")
+
+	// Sort the endpoints by provider so that we can prioritize DERP nodes in the preferred region, followed by
+	// any other DERP server elsewhere, then followed by Tailscale endpoints.
+	slices.SortFunc(endpoints, func(x, y Endpoint) int {
+		return cmp.Compare(x.Provider, y.Provider)
+	})
+
+	return endpoints
+}
+
+// responseLooksLikeCaptive reports whether the given HTTP response does not match the
+// response we expect from the Endpoint, which suggests a captive portal intercepted the request.
+func (e Endpoint) responseLooksLikeCaptive(r *http.Response, logf logger.Logf) bool {
+	defer r.Body.Close()
+
+	// Check the status code first.
+	if r.StatusCode != e.StatusCode {
+		logf("[v1] unexpected status code in captive portal response: want=%d, got=%d", e.StatusCode, r.StatusCode)
+		return true
+	}
+
+	// If the endpoint supports the Tailscale challenge header, check that the response contains the expected header.
+	if e.SupportsTailscaleChallenge {
+		expectedResponse := "response ts_" + e.URL.Host
+		hasResponse := r.Header.Get("X-Tailscale-Response") == expectedResponse
+		if !hasResponse {
+			// The response did not contain the expected X-Tailscale-Response header, which means we are most likely
+			// behind a captive portal (somebody is tampering with the response headers).
+			logf("captive portal check response did not contain expected X-Tailscale-Response header: want=%q, got=%q", expectedResponse, r.Header.Get("X-Tailscale-Response"))
+			return true
+		}
+	}
+
+	// If we don't have an expected content string, we don't need to check the response body.
+	if e.ExpectedContent == "" {
+		return false
+	}
+
+	// Read the response body and check if it contains the expected content.
+	b, err := io.ReadAll(io.LimitReader(r.Body, 4096))
+	if err != nil {
+		logf("reading captive portal check response body failed: %v", err)
+		return false
+	}
+	hasExpectedContent := mem.Contains(mem.B(b), mem.S(e.ExpectedContent))
+	if !hasExpectedContent {
+		// The response body did not contain the expected content, which means we are most likely behind a captive portal.
+		logf("[v1] captive portal check response body did not contain expected content: want=%q", e.ExpectedContent)
+		return true
+	}
+
+	// If we got here, the response looks good.
+	return false
+}
diff --git a/net/captivedetection/rawconn.go b/net/captivedetection/rawconn.go
new file mode 100644
index 000000000..a7197d9df
--- /dev/null
+++ b/net/captivedetection/rawconn.go
@@ -0,0 +1,19 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build !(ios || darwin)
+
+package captivedetection
+
+import (
+	"syscall"
+
+	"tailscale.com/types/logger"
+)
+
+// setSocketInterfaceIndex sets the IP_BOUND_IF socket option on the given RawConn,
+// forcing the socket to use the given interface, on platforms that support it.
+func setSocketInterfaceIndex(c syscall.RawConn, ifIndex int, logf logger.Logf) error {
+	// No-op on non-Darwin platforms.
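+	// (The Darwin implementation in rawconn_apple.go sets IP_BOUND_IF via
+	// unix.SetsockoptInt; see below.)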
+ return nil +} diff --git a/net/captivedetection/rawconn_apple.go b/net/captivedetection/rawconn_apple.go new file mode 100644 index 000000000..12b4446e6 --- /dev/null +++ b/net/captivedetection/rawconn_apple.go @@ -0,0 +1,24 @@ +// Copyright (c) Tailscale Inc & AUTHORS +// SPDX-License-Identifier: BSD-3-Clause + +//go:build ios || darwin + +package captivedetection + +import ( + "syscall" + + "golang.org/x/sys/unix" + "tailscale.com/types/logger" +) + +// setSocketInterfaceIndex sets the IP_BOUND_IF socket option on the given RawConn. +// This forces the socket to use the given interface. +func setSocketInterfaceIndex(c syscall.RawConn, ifIndex int, logf logger.Logf) error { + return c.Control((func(fd uintptr) { + err := unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_BOUND_IF, ifIndex) + if err != nil { + logf("captivedetection: failed to set IP_BOUND_IF (ifIndex=%d): %v", ifIndex, err) + } + })) +} diff --git a/net/dnsfallback/dnsfallback.go b/net/dnsfallback/dnsfallback.go index 6b3ac864e..8322d3887 100644 --- a/net/dnsfallback/dnsfallback.go +++ b/net/dnsfallback/dnsfallback.go @@ -219,7 +219,7 @@ type nameIP struct { ip netip.Addr } - dm := getDERPMap() + dm := GetDERPMap() var cands4, cands6 []nameIP for _, dr := range dm.Regions { @@ -310,9 +310,12 @@ func bootstrapDNSMap(ctx context.Context, serverName string, serverIP netip.Addr // https://derp10.tailscale.com/bootstrap-dns type dnsMap map[string][]netip.Addr -// getDERPMap returns some DERP map. The DERP servers also run a fallback -// DNS server. -func getDERPMap() *tailcfg.DERPMap { +// GetDERPMap returns a fallback DERP map that is always available, useful for basic +// bootstrapping purposes. The dynamically updated DERP map in LocalBackend should +// always be preferred over this. Use this DERP map only when the control plane is +// unreachable or hasn't been reached yet. The DERP servers in the returned map also +// run a fallback DNS server. +func GetDERPMap() *tailcfg.DERPMap { dm := getStaticDERPMap() // Merge in any DERP servers from the cached map that aren't in the diff --git a/net/dnsfallback/dnsfallback_test.go b/net/dnsfallback/dnsfallback_test.go index 4298499b0..16f5027d4 100644 --- a/net/dnsfallback/dnsfallback_test.go +++ b/net/dnsfallback/dnsfallback_test.go @@ -18,7 +18,7 @@ ) func TestGetDERPMap(t *testing.T) { - dm := getDERPMap() + dm := GetDERPMap() if dm == nil { t.Fatal("nil") } @@ -78,7 +78,7 @@ func TestCache(t *testing.T) { } // Verify that our DERP map is merged with the cache. 
- dm := getDERPMap() + dm := GetDERPMap() region, ok := dm.Regions[99] if !ok { t.Fatal("expected region 99") diff --git a/net/netcheck/netcheck.go b/net/netcheck/netcheck.go index 80957039e..8eb50a61d 100644 --- a/net/netcheck/netcheck.go +++ b/net/netcheck/netcheck.go @@ -14,13 +14,11 @@ "io" "log" "maps" - "math/rand/v2" "net" "net/http" "net/netip" "runtime" "sort" - "strings" "sync" "syscall" "time" @@ -28,6 +26,7 @@ "github.com/tcnksm/go-httpstat" "tailscale.com/derp/derphttp" "tailscale.com/envknob" + "tailscale.com/net/captivedetection" "tailscale.com/net/dnscache" "tailscale.com/net/neterror" "tailscale.com/net/netmon" @@ -847,11 +846,8 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe tmr := time.AfterFunc(c.captivePortalDelay(), func() { defer close(ch) - found, err := c.checkCaptivePortal(ctx, dm, preferredDERP) - if err != nil { - c.logf("[v1] checkCaptivePortal: %v", err) - return - } + d := captivedetection.NewDetector(c.logf) + found := d.Detect(ctx, c.NetMon, dm, preferredDERP) rs.report.CaptivePortal.Set(found) }) @@ -988,75 +984,6 @@ func (c *Client) finishAndStoreReport(rs *reportState, dm *tailcfg.DERPMap) *Rep return report } -var noRedirectClient = &http.Client{ - // No redirects allowed - CheckRedirect: func(req *http.Request, via []*http.Request) error { - return http.ErrUseLastResponse - }, - - // Remaining fields are the same as the default client. - Transport: http.DefaultClient.Transport, - Jar: http.DefaultClient.Jar, - Timeout: http.DefaultClient.Timeout, -} - -// checkCaptivePortal reports whether or not we think the system is behind a -// captive portal, detected by making a request to a URL that we know should -// return a "204 No Content" response and checking if that's what we get. -// -// The boolean return is whether we think we have a captive portal. -func (c *Client) checkCaptivePortal(ctx context.Context, dm *tailcfg.DERPMap, preferredDERP int) (bool, error) { - defer noRedirectClient.CloseIdleConnections() - - // If we have a preferred DERP region with more than one node, try - // that; otherwise, pick a random one not marked as "Avoid". - if preferredDERP == 0 || dm.Regions[preferredDERP] == nil || - (preferredDERP != 0 && len(dm.Regions[preferredDERP].Nodes) == 0) { - rids := make([]int, 0, len(dm.Regions)) - for id, reg := range dm.Regions { - if reg == nil || reg.Avoid || len(reg.Nodes) == 0 { - continue - } - rids = append(rids, id) - } - if len(rids) == 0 { - return false, nil - } - preferredDERP = rids[rand.IntN(len(rids))] - } - - node := dm.Regions[preferredDERP].Nodes[0] - - if strings.HasSuffix(node.HostName, tailcfg.DotInvalid) { - // Don't try to connect to invalid hostnames. This occurred in tests: - // https://github.com/tailscale/tailscale/issues/6207 - // TODO(bradfitz,andrew-d): how to actually handle this nicely? - return false, nil - } - - req, err := http.NewRequestWithContext(ctx, "GET", "http://"+node.HostName+"/generate_204", nil) - if err != nil { - return false, err - } - - // Note: the set of valid characters in a challenge and the total - // length is limited; see isChallengeChar in cmd/derper for more - // details. 
- chal := "ts_" + node.HostName - req.Header.Set("X-Tailscale-Challenge", chal) - r, err := noRedirectClient.Do(req) - if err != nil { - return false, err - } - defer r.Body.Close() - - expectedResponse := "response " + chal - validResponse := r.Header.Get("X-Tailscale-Response") == expectedResponse - - c.logf("[v2] checkCaptivePortal url=%q status_code=%d valid_response=%v", req.URL.String(), r.StatusCode, validResponse) - return r.StatusCode != 204 || !validResponse, nil -} - // runHTTPOnlyChecks is the netcheck done by environments that can // only do HTTP requests, such as ws/wasm. func (c *Client) runHTTPOnlyChecks(ctx context.Context, last *Report, rs *reportState, dm *tailcfg.DERPMap) error { diff --git a/net/netcheck/netcheck_test.go b/net/netcheck/netcheck_test.go index 8b7124744..26e52602a 100644 --- a/net/netcheck/netcheck_test.go +++ b/net/netcheck/netcheck_test.go @@ -15,14 +15,12 @@ "sort" "strconv" "strings" - "sync/atomic" "testing" "time" "tailscale.com/net/netmon" "tailscale.com/net/stun/stuntest" "tailscale.com/tailcfg" - "tailscale.com/tstest" "tailscale.com/tstest/nettest" ) @@ -778,54 +776,6 @@ func TestSortRegions(t *testing.T) { } } -func TestNoCaptivePortalWhenUDP(t *testing.T) { - nettest.SkipIfNoNetwork(t) // empirically. not sure why. - - // Override noRedirectClient to handle the /generate_204 endpoint - var generate204Called atomic.Bool - tr := RoundTripFunc(func(req *http.Request) *http.Response { - if !strings.HasSuffix(req.URL.String(), "/generate_204") { - panic("bad URL: " + req.URL.String()) - } - generate204Called.Store(true) - return &http.Response{ - StatusCode: http.StatusNoContent, - Header: make(http.Header), - } - }) - - tstest.Replace(t, &noRedirectClient.Transport, http.RoundTripper(tr)) - - stunAddr, cleanup := stuntest.Serve(t) - defer cleanup() - - c := newTestClient(t) - c.testEnoughRegions = 1 - // Set the delay long enough that we have time to cancel it - // when our STUN probe succeeds. - c.testCaptivePortalDelay = 10 * time.Second - - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) - defer cancel() - - if err := c.Standalone(ctx, "127.0.0.1:0"); err != nil { - t.Fatal(err) - } - - r, err := c.GetReport(ctx, stuntest.DERPMapOf(stunAddr.String()), nil) - if err != nil { - t.Fatal(err) - } - - // Should not have called our captive portal function. - if generate204Called.Load() { - t.Errorf("captive portal check called; expected no call") - } - if r.CaptivePortal != "" { - t.Errorf("got CaptivePortal=%q, want empty", r.CaptivePortal) - } -} - type RoundTripFunc func(req *http.Request) *http.Response func (f RoundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) { diff --git a/tailcfg/tailcfg.go b/tailcfg/tailcfg.go index 60a2244dd..5d639dd54 100644 --- a/tailcfg/tailcfg.go +++ b/tailcfg/tailcfg.go @@ -145,7 +145,8 @@ // - 100: 2024-06-18: Client supports filtertype.Match.SrcCaps (issue #12542) // - 101: 2024-07-01: Client supports SSH agent forwarding when handling connections with /bin/su // - 102: 2024-07-12: NodeAttrDisableMagicSockCryptoRouting support -const CurrentCapabilityVersion CapabilityVersion = 102 +// - 103: 2024-07-24: Client supports NodeAttrDisableCaptivePortalDetection +const CurrentCapabilityVersion CapabilityVersion = 103 type StableID string @@ -2327,6 +2328,10 @@ type Oauth2Token struct { // NodeAttrDisableMagicSockCryptoRouting disables the use of the // magicsock cryptorouting hook. See tailscale/corp#20732. 
NodeAttrDisableMagicSockCryptoRouting NodeCapability = "disable-magicsock-crypto-routing" + + // NodeAttrDisableCaptivePortalDetection instructs the client to not perform captive portal detection + // automatically when the network state changes. + NodeAttrDisableCaptivePortalDetection NodeCapability = "disable-captive-portal-detection" ) // SetDNSRequest is a request to add a DNS record.
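
Usage sketch (not part of the patch): the new captivedetection API can be
exercised on its own. This minimal program uses the same inputs as the tests
above: netmon.NewStatic(), a nil DERP map (which falls back to the static
dnsfallback map), and 0 meaning "no preferred DERP region". The main wrapper,
timeout, and log destination are illustrative.

package main

import (
	"context"
	"log"
	"time"

	"tailscale.com/net/captivedetection"
	"tailscale.com/net/netmon"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	d := captivedetection.NewDetector(log.Printf)
	// nil DERPMap => static dnsfallback map; 0 => no preferred DERP region.
	found := d.Detect(ctx, netmon.NewStatic(), nil, 0)
	log.Printf("captive portal detected: %v", found)
}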
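
The challenge/response contract that verifyCaptivePortalEndpoint and
responseLooksLikeCaptive rely on boils down to the following sketch of a
cooperating /generate_204 handler. This is an illustration only; the real
handler lives in cmd/derper, which additionally restricts which characters a
challenge may contain (see isChallengeChar).

package main

import (
	"log"
	"net/http"
)

func generate204(w http.ResponseWriter, r *http.Request) {
	// Echo the challenge back so the client can tell that no captive
	// portal tampered with the response headers.
	if chal := r.Header.Get("X-Tailscale-Challenge"); chal != "" {
		w.Header().Set("X-Tailscale-Response", "response "+chal)
	}
	w.WriteHeader(http.StatusNoContent)
}

func main() {
	http.HandleFunc("/generate_204", generate204)
	// Plain HTTP on port 80: detection endpoints are only built from DERP
	// nodes with CanPort80 set.
	log.Fatal(http.ListenAndServe(":80", nil))
}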
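
Deployment note: NodeAttrDisableCaptivePortalDetection is delivered through
the node capability map, so with Tailscale's hosted control plane it would
typically be assigned via a nodeAttrs grant in the tailnet policy file, along
these lines (a sketch; only the attribute string itself is defined by this
patch):

	"nodeAttrs": [
		{"target": ["*"], "attr": ["disable-captive-portal-detection"]}
	]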