tailscale/net/netmon/netmon.go
Nick Khyl c9836b454d net/netmon: fix goroutine leak in winMon if the monitor is never started
When the portable Monitor creates a winMon via newOSMon, we register
address and route change callbacks with Windows. Once a callback is hit,
it starts a goroutine that attempts to send the event into messagec and returns.
The newly started goroutine then blocks until it can send to the channel.
However, if the monitor is never started and winMon.Receive is never called,
the goroutines remain indefinitely blocked, leading to goroutine leaks and
significant memory consumption in the tailscaled service process on Windows.
Unlike the tailscaled subprocess, the service process creates but never starts
a Monitor.

This PR adds a check within the callbacks to confirm the monitor's active status,
and exits immediately if the monitor hasn't started.

Updates #9864

Signed-off-by: Nick Khyl <nickk@tailscale.com>
2023-12-21 16:36:52 -06:00

579 lines
15 KiB
Go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
// Package monitor provides facilities for monitoring network
// interface and route changes. It primarily exists to know when
// portable devices move between different networks.
package netmon
import (
"encoding/json"
"errors"
"net/netip"
"runtime"
"sync"
"time"
"tailscale.com/net/interfaces"
"tailscale.com/types/logger"
"tailscale.com/util/clientmetric"
"tailscale.com/util/set"
)
// pollWallTimeInterval is how often we check the time to check
// for big jumps in wall (non-monotonic) time as a backup mechanism
// to get notified of a sleeping device waking back up.
// Usually there are also minor network change events on wake that let
// us check the wall time sooner than this.
const pollWallTimeInterval = 15 * time.Second
// message represents a message returned from an osMon.
type message interface {
// Ignore is whether we should ignore this message.
ignore() bool
}
// osMon is the interface that each operating system-specific
// implementation of the link monitor must implement.
type osMon interface {
Close() error
// Receive returns a new network interface change message. It
// should block until there's either something to return, or
// until the osMon is closed. After a Close, the returned
// error is ignored.
Receive() (message, error)
// IsInterestingInterface reports whether the provided interface should
// be considered for network change events.
IsInterestingInterface(iface string) bool
}
// Monitor represents a monitoring instance.
type Monitor struct {
logf logger.Logf
om osMon // nil means not supported on this platform
change chan bool // send false to wake poller, true to also force ChangeDeltas be sent
stop chan struct{} // closed on Stop
// Things that must be set early, before use,
// and not change at runtime.
tsIfName string // tailscale interface name, if known/set ("tailscale0", "utun3", ...)
mu sync.Mutex // guards all following fields
cbs set.HandleSet[ChangeFunc]
ruleDelCB set.HandleSet[RuleDeleteCallback]
ifState *interfaces.State
gwValid bool // whether gw and gwSelfIP are valid
gw netip.Addr // our gateway's IP
gwSelfIP netip.Addr // our own IP address (that corresponds to gw)
started bool
closed bool
goroutines sync.WaitGroup
wallTimer *time.Timer // nil until Started; re-armed AfterFunc per tick
lastWall time.Time
timeJumped bool // whether we need to send a changed=true after a big time jump
}
// ChangeFunc is a callback function registered with Monitor that's called when the
// network changed.
type ChangeFunc func(*ChangeDelta)
// ChangeDelta describes the difference between two network states.
type ChangeDelta struct {
// Monitor is the network monitor that sent this delta.
Monitor *Monitor
// Old is the old interface state, if known.
// It's nil if the old state is unknown.
// Do not mutate it.
Old *interfaces.State
// New is the new network state.
// It is always non-nil.
// Do not mutate it.
New *interfaces.State
// Major is our legacy boolean of whether the network changed in some major
// way.
//
// Deprecated: do not remove. As of 2023-08-23 we're in a renewed effort to
// remove it and ask specific qustions of ChangeDelta instead. Look at Old
// and New (or add methods to ChangeDelta) instead of using Major.
Major bool
// TimeJumped is whether there was a big jump in wall time since the last
// time we checked. This is a hint that a mobile sleeping device might have
// come out of sleep.
TimeJumped bool
// TODO(bradfitz): add some lazy cached fields here as needed with methods
// on *ChangeDelta to let callers ask specific questions
}
// New instantiates and starts a monitoring instance.
// The returned monitor is inactive until it's started by the Start method.
// Use RegisterChangeCallback to get notified of network changes.
func New(logf logger.Logf) (*Monitor, error) {
logf = logger.WithPrefix(logf, "monitor: ")
m := &Monitor{
logf: logf,
change: make(chan bool, 1),
stop: make(chan struct{}),
lastWall: wallTime(),
}
st, err := m.interfaceStateUncached()
if err != nil {
return nil, err
}
m.ifState = st
m.om, err = newOSMon(logf, m)
if err != nil {
return nil, err
}
if m.om == nil {
return nil, errors.New("newOSMon returned nil, nil")
}
return m, nil
}
// InterfaceState returns the latest snapshot of the machine's network
// interfaces.
//
// The returned value is owned by Mon; it must not be modified.
func (m *Monitor) InterfaceState() *interfaces.State {
m.mu.Lock()
defer m.mu.Unlock()
return m.ifState
}
func (m *Monitor) interfaceStateUncached() (*interfaces.State, error) {
return interfaces.GetState()
}
// SetTailscaleInterfaceName sets the name of the Tailscale interface. For
// example, "tailscale0", "tun0", "utun3", etc.
//
// This must be called only early in tailscaled startup before the monitor is
// used.
func (m *Monitor) SetTailscaleInterfaceName(ifName string) {
m.tsIfName = ifName
}
// GatewayAndSelfIP returns the current network's default gateway, and
// the machine's default IP for that gateway.
//
// It's the same as interfaces.LikelyHomeRouterIP, but it caches the
// result until the monitor detects a network change.
func (m *Monitor) GatewayAndSelfIP() (gw, myIP netip.Addr, ok bool) {
m.mu.Lock()
defer m.mu.Unlock()
if m.gwValid {
return m.gw, m.gwSelfIP, true
}
gw, myIP, ok = interfaces.LikelyHomeRouterIP()
changed := false
if ok {
changed = m.gw != gw || m.gwSelfIP != myIP
m.gw, m.gwSelfIP = gw, myIP
m.gwValid = true
}
if changed {
m.logf("gateway and self IP changed: gw=%v self=%v", m.gw, m.gwSelfIP)
}
return gw, myIP, ok
}
// RegisterChangeCallback adds callback to the set of parties to be
// notified (in their own goroutine) when the network state changes.
// To remove this callback, call unregister (or close the monitor).
func (m *Monitor) RegisterChangeCallback(callback ChangeFunc) (unregister func()) {
m.mu.Lock()
defer m.mu.Unlock()
handle := m.cbs.Add(callback)
return func() {
m.mu.Lock()
defer m.mu.Unlock()
delete(m.cbs, handle)
}
}
// RuleDeleteCallback is a callback when a Linux IP policy routing
// rule is deleted. The table is the table number (52, 253, 354) and
// priority is the priority order number (for Tailscale rules
// currently: 5210, 5230, 5250, 5270)
type RuleDeleteCallback func(table uint8, priority uint32)
// RegisterRuleDeleteCallback adds callback to the set of parties to be
// notified (in their own goroutine) when a Linux ip rule is deleted.
// To remove this callback, call unregister (or close the monitor).
func (m *Monitor) RegisterRuleDeleteCallback(callback RuleDeleteCallback) (unregister func()) {
m.mu.Lock()
defer m.mu.Unlock()
handle := m.ruleDelCB.Add(callback)
return func() {
m.mu.Lock()
defer m.mu.Unlock()
delete(m.ruleDelCB, handle)
}
}
// isActive reports whether this monitor has been started and not yet closed.
func (m *Monitor) isActive() bool {
m.mu.Lock()
defer m.mu.Unlock()
return m.started && !m.closed
}
// Start starts the monitor.
// A monitor can only be started & closed once.
func (m *Monitor) Start() {
m.mu.Lock()
defer m.mu.Unlock()
if m.started || m.closed {
return
}
m.started = true
if shouldMonitorTimeJump {
m.wallTimer = time.AfterFunc(pollWallTimeInterval, m.pollWallTime)
}
if m.om == nil {
return
}
m.goroutines.Add(2)
go m.pump()
go m.debounce()
}
// Close closes the monitor.
func (m *Monitor) Close() error {
m.mu.Lock()
if m.closed {
m.mu.Unlock()
return nil
}
m.closed = true
close(m.stop)
if m.wallTimer != nil {
m.wallTimer.Stop()
}
var err error
if m.om != nil {
err = m.om.Close()
}
started := m.started
m.mu.Unlock()
if started {
m.goroutines.Wait()
}
return err
}
// InjectEvent forces the monitor to pretend there was a network
// change and re-check the state of the network. Any registered
// ChangeFunc callbacks will be called within the event coalescing
// period (under a fraction of a second).
func (m *Monitor) InjectEvent() {
select {
case m.change <- true:
default:
// Another change signal is already
// buffered. Debounce will wake up soon
// enough.
}
}
// Poll forces the monitor to pretend there was a network
// change and re-check the state of the network.
//
// This is like InjectEvent but only fires ChangeFunc callbacks
// if the network state differed at all.
func (m *Monitor) Poll() {
select {
case m.change <- false:
default:
}
}
func (m *Monitor) stopped() bool {
select {
case <-m.stop:
return true
default:
return false
}
}
// pump continuously retrieves messages from the connection, notifying
// the change channel of changes, and stopping when a stop is issued.
func (m *Monitor) pump() {
defer m.goroutines.Done()
for !m.stopped() {
msg, err := m.om.Receive()
if err != nil {
if m.stopped() {
return
}
// Keep retrying while we're not closed.
m.logf("error from link monitor: %v", err)
time.Sleep(time.Second)
continue
}
if rdm, ok := msg.(ipRuleDeletedMessage); ok {
m.notifyRuleDeleted(rdm)
continue
}
if msg.ignore() {
continue
}
m.Poll()
}
}
func (m *Monitor) notifyRuleDeleted(rdm ipRuleDeletedMessage) {
m.mu.Lock()
defer m.mu.Unlock()
for _, cb := range m.ruleDelCB {
go cb(rdm.table, rdm.priority)
}
}
// isInterestingInterface reports whether the provided interface should be
// considered when checking for network state changes.
// The ips parameter should be the IPs of the provided interface.
func (m *Monitor) isInterestingInterface(i interfaces.Interface, ips []netip.Prefix) bool {
if !m.om.IsInterestingInterface(i.Name) {
return false
}
return true
}
// debounce calls the callback function with a delay between events
// and exits when a stop is issued.
func (m *Monitor) debounce() {
defer m.goroutines.Done()
for {
var forceCallbacks bool
select {
case <-m.stop:
return
case forceCallbacks = <-m.change:
}
if newState, err := m.interfaceStateUncached(); err != nil {
m.logf("interfaces.State: %v", err)
} else {
m.handlePotentialChange(newState, forceCallbacks)
}
select {
case <-m.stop:
return
case <-time.After(250 * time.Millisecond):
}
}
}
var (
metricChangeEq = clientmetric.NewCounter("netmon_link_change_eq")
metricChange = clientmetric.NewCounter("netmon_link_change")
metricChangeTimeJump = clientmetric.NewCounter("netmon_link_change_timejump")
metricChangeMajor = clientmetric.NewCounter("netmon_link_change_major")
)
// handlePotentialChange considers whether newState is different enough to wake
// up callers and updates the monitor's state if so.
//
// If forceCallbacks is true, they're always notified.
func (m *Monitor) handlePotentialChange(newState *interfaces.State, forceCallbacks bool) {
m.mu.Lock()
defer m.mu.Unlock()
oldState := m.ifState
timeJumped := shouldMonitorTimeJump && m.checkWallTimeAdvanceLocked()
if !timeJumped && !forceCallbacks && oldState.Equal(newState) {
// Exactly equal. Nothing to do.
metricChangeEq.Add(1)
return
}
delta := &ChangeDelta{
Monitor: m,
Old: oldState,
New: newState,
TimeJumped: timeJumped,
}
delta.Major = m.IsMajorChangeFrom(oldState, newState)
if delta.Major {
m.gwValid = false
m.ifState = newState
if s1, s2 := oldState.String(), delta.New.String(); s1 == s2 {
m.logf("[unexpected] network state changed, but stringification didn't: %v", s1)
m.logf("[unexpected] old: %s", jsonSummary(oldState))
m.logf("[unexpected] new: %s", jsonSummary(newState))
}
}
// See if we have a queued or new time jump signal.
if timeJumped {
m.resetTimeJumpedLocked()
if !delta.Major {
// Only log if it wasn't an interesting change.
m.logf("time jumped (probably wake from sleep); synthesizing major change event")
delta.Major = true
}
}
metricChange.Add(1)
if delta.Major {
metricChangeMajor.Add(1)
}
if delta.TimeJumped {
metricChangeTimeJump.Add(1)
}
for _, cb := range m.cbs {
go cb(delta)
}
}
// IsMajorChangeFrom reports whether the transition from s1 to s2 is
// a "major" change, where major roughly means it's worth tearing down
// a bunch of connections and rebinding.
//
// TODO(bradiftz): tigten this definition.
func (m *Monitor) IsMajorChangeFrom(s1, s2 *interfaces.State) bool {
if s1 == nil && s2 == nil {
return false
}
if s1 == nil || s2 == nil {
return true
}
if s1.HaveV6 != s2.HaveV6 ||
s1.HaveV4 != s2.HaveV4 ||
s1.IsExpensive != s2.IsExpensive ||
s1.DefaultRouteInterface != s2.DefaultRouteInterface ||
s1.HTTPProxy != s2.HTTPProxy ||
s1.PAC != s2.PAC {
return true
}
for iname, i := range s1.Interface {
if iname == m.tsIfName {
// Ignore changes in the Tailscale interface itself.
continue
}
ips := s1.InterfaceIPs[iname]
if !m.isInterestingInterface(i, ips) {
continue
}
i2, ok := s2.Interface[iname]
if !ok {
return true
}
ips2, ok := s2.InterfaceIPs[iname]
if !ok {
return true
}
if !i.Equal(i2) || !prefixesMajorEqual(ips, ips2) {
return true
}
}
return false
}
// prefixesMajorEqual reports whether a and b are equal after ignoring
// boring things like link-local, loopback, and multicast addresses.
func prefixesMajorEqual(a, b []netip.Prefix) bool {
// trim returns a subslice of p with link local unicast,
// loopback, and multicast prefixes removed from the front.
trim := func(p []netip.Prefix) []netip.Prefix {
for len(p) > 0 {
a := p[0].Addr()
if a.IsLinkLocalUnicast() || a.IsLoopback() || a.IsMulticast() {
p = p[1:]
continue
}
break
}
return p
}
for {
a = trim(a)
b = trim(b)
if len(a) == 0 || len(b) == 0 {
return len(a) == 0 && len(b) == 0
}
if a[0] != b[0] {
return false
}
a, b = a[1:], b[1:]
}
}
func jsonSummary(x any) any {
j, err := json.Marshal(x)
if err != nil {
return err
}
return j
}
func wallTime() time.Time {
// From time package's docs: "The canonical way to strip a
// monotonic clock reading is to use t = t.Round(0)."
return time.Now().Round(0)
}
func (m *Monitor) pollWallTime() {
m.mu.Lock()
defer m.mu.Unlock()
if m.closed {
return
}
if m.checkWallTimeAdvanceLocked() {
m.InjectEvent()
}
m.wallTimer.Reset(pollWallTimeInterval)
}
// shouldMonitorTimeJump is whether we keep a regular periodic timer running in
// the background watching for jumps in wall time.
//
// We don't do this on mobile platforms for battery reasons, and because these
// platforms don't really sleep in the same way.
const shouldMonitorTimeJump = runtime.GOOS != "android" && runtime.GOOS != "ios"
// checkWallTimeAdvanceLocked reports whether wall time jumped more than 150% of
// pollWallTimeInterval, indicating we probably just came out of sleep. Once a
// time jump is detected it must be reset by calling resetTimeJumpedLocked.
func (m *Monitor) checkWallTimeAdvanceLocked() bool {
if !shouldMonitorTimeJump {
panic("unreachable") // if callers are correct
}
now := wallTime()
if now.Sub(m.lastWall) > pollWallTimeInterval*3/2 {
m.timeJumped = true // it is reset by debounce.
}
m.lastWall = now
return m.timeJumped
}
// resetTimeJumpedLocked consumes the signal set by checkWallTimeAdvanceLocked.
func (m *Monitor) resetTimeJumpedLocked() {
m.timeJumped = false
}
type ipRuleDeletedMessage struct {
table uint8
priority uint32
}
func (ipRuleDeletedMessage) ignore() bool { return true }