wgengine/magicsock: make debug-level stuff not logged by default

And add a CLI/localapi and c2n mechanism to enable it for a fixed
amount of time.

Updates #1548

Change-Id: I71674aaf959a9c6761ff33bbf4a417ffd42195a7
Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
Brad Fitzpatrick 2022-10-03 20:39:45 -07:00 committed by Brad Fitzpatrick
parent 5c69961a57
commit 1841d0bf98
10 changed files with 241 additions and 21 deletions

View File

@ -318,6 +318,28 @@ func (lc *LocalClient) DebugAction(ctx context.Context, action string) error {
return nil
}
// SetComponentDebugLogging sets component's debug logging enabled for
// the provided duration. If the duration is in the past, the debug logging
// is disabled.
func (lc *LocalClient) SetComponentDebugLogging(ctx context.Context, component string, d time.Duration) error {
body, err := lc.send(ctx, "POST",
fmt.Sprintf("/localapi/v0/component-debug-logging?component=%s&secs=%d",
url.QueryEscape(component), int64(d.Seconds())), 200, nil)
if err != nil {
return fmt.Errorf("error %w: %s", err, body)
}
var res struct {
Error string
}
if err := json.Unmarshal(body, &res); err != nil {
return err
}
if res.Error != "" {
return errors.New(res.Error)
}
return nil
}
// Status returns the Tailscale daemon's status.
func Status(ctx context.Context) (*ipnstate.Status, error) {
return defaultLocalClient.Status(ctx)

View File

@ -53,6 +53,16 @@
Exec: runDERPMap,
ShortHelp: "print DERP map",
},
{
Name: "component-logs",
Exec: runDebugComponentLogs,
ShortHelp: "enable/disable debug logs for a component",
FlagSet: (func() *flag.FlagSet {
fs := newFlagSet("component-logs")
fs.DurationVar(&debugComponentLogsArgs.forDur, "for", time.Hour, "how long to enable debug logs for; zero or negative means to disable")
return fs
})(),
},
{
Name: "daemon-goroutines",
Exec: runDaemonGoroutines,
@ -513,3 +523,26 @@ func runTS2021(ctx context.Context, args []string) error {
log.Printf("final underlying conn: %v / %v", conn.LocalAddr(), conn.RemoteAddr())
return nil
}
var debugComponentLogsArgs struct {
forDur time.Duration
}
func runDebugComponentLogs(ctx context.Context, args []string) error {
if len(args) != 1 {
return errors.New("usage: debug component-logs <component>")
}
component := args[0]
dur := debugComponentLogsArgs.forDur
err := localClient.SetComponentDebugLogging(ctx, component, dur)
if err != nil {
return err
}
if debugComponentLogsArgs.forDur <= 0 {
fmt.Printf("Disabled debug logs for component %q\n", component)
} else {
fmt.Printf("Enabled debug logs for component %q for %v\n", component, dur)
}
return nil
}

View File

@ -168,6 +168,11 @@ type PartialFile struct {
// LocalBackend.userID, a string like "user-$USER_ID" (used in
// server mode).
// - on Linux/etc, it's always "_daemon" (ipn.GlobalDaemonStateKey)
//
// Additionally, the StateKey can be debug setting name:
//
// - "_debug_magicsock_until" with value being a unix timestamp stringified
// - "_debug_<component>_until" with value being a unix timestamp stringified
type StateKey string
type Options struct {

View File

@ -8,6 +8,8 @@
"encoding/json"
"io"
"net/http"
"strconv"
"time"
"tailscale.com/tailcfg"
"tailscale.com/util/clientmetric"
@ -32,6 +34,21 @@ func (b *LocalBackend) handleC2N(w http.ResponseWriter, r *http.Request) {
case "/debug/metrics":
w.Header().Set("Content-Type", "text/plain")
clientmetric.WritePrometheusExpositionFormat(w)
case "/debug/component-logging":
component := r.FormValue("component")
secs, _ := strconv.Atoi(r.FormValue("secs"))
if secs == 0 {
secs -= 1
}
until := time.Now().Add(time.Duration(secs) * time.Second)
err := b.SetComponentDebugLogging(component, until)
var res struct {
Error string `json:",omitempty"`
}
if err != nil {
res.Error = err.Error()
}
writeJSON(res)
case "/ssh/usernames":
var req tailcfg.C2NSSHUsernamesRequest
if r.Method == "POST" {

View File

@ -24,6 +24,7 @@
"time"
"go4.org/netipx"
"golang.org/x/exp/slices"
"tailscale.com/client/tailscale/apitype"
"tailscale.com/control/controlclient"
"tailscale.com/doctor"
@ -55,6 +56,7 @@
"tailscale.com/types/views"
"tailscale.com/util/deephash"
"tailscale.com/util/dnsname"
"tailscale.com/util/mak"
"tailscale.com/util/multierr"
"tailscale.com/util/osshare"
"tailscale.com/util/systemd"
@ -186,6 +188,7 @@ type LocalBackend struct {
// *.partial file to its final name on completion.
directFileRoot string
directFileDoFinalRename bool // false on macOS, true on several NAS platforms
componentLogUntil map[string]componentLogState
// statusLock must be held before calling statusChanged.Wait() or
// statusChanged.Broadcast().
@ -267,9 +270,91 @@ func NewLocalBackend(logf logger.Logf, logid string, store ipn.StateStore, diale
b.logf("[unexpected] failed to wire up peer API port for engine %T", e)
}
for _, component := range debuggableComponents {
key := componentStateKey(component)
if ut, err := ipn.ReadStoreInt(store, key); err == nil {
if until := time.Unix(ut, 0); until.After(time.Now()) {
// conditional to avoid log spam at start when off
b.SetComponentDebugLogging(component, until)
}
}
}
return b, nil
}
type componentLogState struct {
until time.Time
timer *time.Timer // if non-nil, the AfterFunc to disable it
}
var debuggableComponents = []string{
"magicsock",
}
func componentStateKey(component string) ipn.StateKey {
return ipn.StateKey("_debug_" + component + "_until")
}
// SetComponentDebugLogging sets component's debug logging enabled until the until time.
// If until is in the past, the component's debug logging is disabled.
//
// The following components are recognized:
//
// - magicsock
func (b *LocalBackend) SetComponentDebugLogging(component string, until time.Time) error {
b.mu.Lock()
defer b.mu.Unlock()
var setEnabled func(bool)
switch component {
case "magicsock":
mc, err := b.magicConn()
if err != nil {
return err
}
setEnabled = mc.SetDebugLoggingEnabled
}
if setEnabled == nil || !slices.Contains(debuggableComponents, component) {
return fmt.Errorf("unknown component %q", component)
}
timeUnixOrZero := func(t time.Time) int64 {
if t.IsZero() {
return 0
}
return t.Unix()
}
ipn.PutStoreInt(b.store, componentStateKey(component), timeUnixOrZero(until))
now := time.Now()
on := now.Before(until)
setEnabled(on)
var onFor time.Duration
if on {
onFor = until.Sub(now)
b.logf("debugging logging for component %q enabled for %v (until %v)", component, onFor.Round(time.Second), until.UTC().Format(time.RFC3339))
} else {
b.logf("debugging logging for component %q disabled", component)
}
if oldSt, ok := b.componentLogUntil[component]; ok && oldSt.timer != nil {
oldSt.timer.Stop()
}
newSt := componentLogState{until: until}
if on {
newSt.timer = time.AfterFunc(onFor, func() {
// Turn off logging after the timer fires, as long as the state is
// unchanged when the timer actually fires.
b.mu.Lock()
defer b.mu.Unlock()
if ls := b.componentLogUntil[component]; ls.until == until {
setEnabled(false)
b.logf("debugging logging for component %q disabled (by timer)", component)
}
})
}
mak.Set(&b.componentLogUntil, component, newSt)
return nil
}
// Dialer returns the backend's dialer.
func (b *LocalBackend) Dialer() *tsdial.Dialer {
return b.dialer

View File

@ -146,6 +146,8 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
h.serveMetrics(w, r)
case "/localapi/v0/debug":
h.serveDebug(w, r)
case "/localapi/v0/component-debug-logging":
h.serveComponentDebugLogging(w, r)
case "/localapi/v0/set-expiry-sooner":
h.serveSetExpirySooner(w, r)
case "/localapi/v0/dial":
@ -329,6 +331,24 @@ func (h *Handler) serveDebug(w http.ResponseWriter, r *http.Request) {
io.WriteString(w, "done\n")
}
func (h *Handler) serveComponentDebugLogging(w http.ResponseWriter, r *http.Request) {
if !h.PermitWrite {
http.Error(w, "debug access denied", http.StatusForbidden)
return
}
component := r.FormValue("component")
secs, _ := strconv.Atoi(r.FormValue("secs"))
err := h.b.SetComponentDebugLogging(component, time.Now().Add(time.Duration(secs)*time.Second))
var res struct {
Error string
}
if err != nil {
res.Error = err.Error()
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(res)
}
// serveProfileFunc is the implementation of Handler.serveProfile, after auth,
// for platforms where we want to link it in.
var serveProfileFunc func(http.ResponseWriter, *http.Request)

View File

@ -6,6 +6,8 @@
import (
"errors"
"fmt"
"strconv"
)
// ErrStateNotExist is returned by StateStore.ReadState when the
@ -35,7 +37,7 @@
// StateKey "user-1234".
ServerModeStartKey = StateKey("server-mode-start-key")
// NLKeyStateKey is the key under which we store the nodes'
// NLKeyStateKey is the key under which we store the node's
// network-lock node key, in its key.NLPrivate.MarshalText representation.
NLKeyStateKey = StateKey("_nl-node-key")
)
@ -48,3 +50,17 @@ type StateStore interface {
// WriteState saves bs as the state associated with ID.
WriteState(id StateKey, bs []byte) error
}
// ReadStoreInt reads an integer from a StateStore.
func ReadStoreInt(store StateStore, id StateKey) (int64, error) {
v, err := store.ReadState(id)
if err != nil {
return 0, err
}
return strconv.ParseInt(string(v), 10, 64)
}
// PutStoreInt puts an integer into a StateStore.
func PutStoreInt(store StateStore, id StateKey, val int64) error {
return store.WriteState(id, fmt.Appendf(nil, "%d", val))
}

View File

@ -82,7 +82,8 @@
// - 43: 2022-09-21: clients can return usernames for SSH
// - 44: 2022-09-22: MapResponse.ControlDialPlan
// - 45: 2022-09-26: c2n /debug/{goroutines,prefs,metrics}
const CurrentCapabilityVersion CapabilityVersion = 45
// - 46: 2022-10-04: c2n /debug/component-logging
const CurrentCapabilityVersion CapabilityVersion = 46
type StableID string

View File

@ -315,6 +315,9 @@ type Conn struct {
// new connection that'll fail.
networkUp atomic.Bool
// Whether debugging logging is enabled.
debugLogging atomic.Bool
// havePrivateKey is whether privateKey is non-zero.
havePrivateKey atomic.Bool
publicKeyAtomic syncs.AtomicValue[key.NodePublic] // or NodeKey zero value if !havePrivateKey
@ -430,6 +433,24 @@ type Conn struct {
peerLastDerp map[key.NodePublic]int
}
// SetDebugLoggingEnabled controls whether spammy debug logging is enabled.
//
// Note that this is currently independent from the log levels, even though
// they're pretty correlated: debugging logs should be [v1] (or higher), but
// some non-debug logs may also still have a [vN] annotation. The [vN] level
// controls which gets shown in stderr. The dlogf method, on the other hand,
// controls which gets even printed or uploaded at any level.
func (c *Conn) SetDebugLoggingEnabled(v bool) {
c.debugLogging.Store(v)
}
// dlogf logs a debug message if debug logging is enabled via SetDebugLoggingEnabled.
func (c *Conn) dlogf(format string, a ...any) {
if c.debugLogging.Load() {
c.logf(format, a...)
}
}
// derpRoute is a route entry for a public key, saying that a certain
// peer should be available at DERP node derpID, as long as the
// current connection for that derpID is dc. (but dc should not be
@ -658,7 +679,7 @@ func (c *Conn) updateEndpoints(why string) {
c.endpointsUpdateActive = false
c.muCond.Broadcast()
}()
c.logf("[v1] magicsock: starting endpoint update (%s)", why)
c.dlogf("[v1] magicsock: starting endpoint update (%s)", why)
if c.noV4Send.Load() && runtime.GOOS != "js" {
c.mu.Lock()
closed := c.closed
@ -706,7 +727,7 @@ func (c *Conn) setEndpoints(endpoints []tailcfg.Endpoint) (changed bool) {
// skipped during the e2e tests because they depend
// too much on the exact sequence of updates. Fix the
// tests. But a protocol rewrite might happen first.
c.logf("[v1] magicsock: ignoring pre-DERP map, STUN-less endpoint update: %v", endpoints)
c.dlogf("[v1] magicsock: ignoring pre-DERP map, STUN-less endpoint update: %v", endpoints)
return false
}
@ -860,7 +881,7 @@ func (c *Conn) callNetInfoCallback(ni *tailcfg.NetInfo) {
func (c *Conn) callNetInfoCallbackLocked(ni *tailcfg.NetInfo) {
c.netInfoLast = ni
if c.netInfoFunc != nil {
c.logf("[v1] magicsock: netInfo update: %+v", ni)
c.dlogf("[v1] magicsock: netInfo update: %+v", ni)
go c.netInfoFunc(ni)
}
}
@ -1831,7 +1852,7 @@ func (c *Conn) sendDiscoMessage(dst netip.AddrPort, dstKey key.NodePublic, dstDi
if !dstKey.IsZero() {
node = dstKey.ShortString()
}
c.logf("[v1] magicsock: disco: %v->%v (%v, %v) sent %v", c.discoShort, dstDisco.ShortString(), node, derpStr(dst.String()), disco.MessageSummary(m))
c.dlogf("[v1] magicsock: disco: %v->%v (%v, %v) sent %v", c.discoShort, dstDisco.ShortString(), node, derpStr(dst.String()), disco.MessageSummary(m))
}
if isDERP {
metricSentDiscoDERP.Add(1)
@ -1997,7 +2018,7 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netip.AddrPort, derpNodeSrc ke
return
}
di.setNodeKey(nodeKey)
c.logf("[v1] magicsock: disco: %v<-%v (%v, %v) got call-me-maybe, %d endpoints",
c.dlogf("[v1] magicsock: disco: %v<-%v (%v, %v) got call-me-maybe, %d endpoints",
c.discoShort, ep.discoShort,
ep.publicKey.ShortString(), derpStr(src.String()),
len(dm.MyNumber))
@ -2099,7 +2120,7 @@ func (c *Conn) handlePingLocked(dm *disco.Ping, src netip.AddrPort, di *discoInf
if numNodes > 1 {
pingNodeSrcStr = "[one-of-multi]"
}
c.logf("[v1] magicsock: disco: %v<-%v (%v, %v) got ping tx=%x", c.discoShort, di.discoShort, pingNodeSrcStr, src, dm.TxID[:6])
c.dlogf("[v1] magicsock: disco: %v<-%v (%v, %v) got ping tx=%x", c.discoShort, di.discoShort, pingNodeSrcStr, src, dm.TxID[:6])
}
ipDst := src
@ -2122,10 +2143,10 @@ func (c *Conn) enqueueCallMeMaybe(derpAddr netip.AddrPort, de *endpoint) {
defer c.mu.Unlock()
if !c.lastEndpointsTime.After(time.Now().Add(-endpointsFreshEnoughDuration)) {
c.logf("[v1] magicsock: want call-me-maybe but endpoints stale; restunning")
c.dlogf("[v1] magicsock: want call-me-maybe but endpoints stale; restunning")
mak.Set(&c.onEndpointRefreshed, de, func() {
c.logf("[v1] magicsock: STUN done; sending call-me-maybe to %v %v", de.discoShort, de.publicKey.ShortString())
c.dlogf("[v1] magicsock: STUN done; sending call-me-maybe to %v %v", de.discoShort, de.publicKey.ShortString())
c.enqueueCallMeMaybe(derpAddr, de)
})
// TODO(bradfitz): make a new 'reSTUNQuickly' method
@ -2352,7 +2373,7 @@ func (c *Conn) SetNetworkMap(nm *netmap.NetworkMap) {
c.logf("[v1] magicsock: got updated network map; %d peers", len(nm.Peers))
if numNoDisco != 0 {
c.logf("[v1] magicsock: %d DERP-only peers (no discokey)", numNoDisco)
c.logf("magicsock: %d DERP-only peers (no discokey)", numNoDisco)
}
c.netMap = nm
@ -2800,7 +2821,7 @@ func (c *Conn) ReSTUN(why string) {
if c.endpointsUpdateActive {
if c.wantEndpointsUpdate != why {
c.logf("[v1] magicsock: ReSTUN: endpoint update active, need another later (%q)", why)
c.dlogf("[v1] magicsock: ReSTUN: endpoint update active, need another later (%q)", why)
c.wantEndpointsUpdate = why
}
} else {
@ -3524,7 +3545,7 @@ func (de *endpoint) heartbeat() {
if mono.Since(de.lastSend) > sessionActiveTimeout {
// Session's idle. Stop heartbeating.
de.c.logf("[v1] magicsock: disco: ending heartbeats for idle session to %v (%v)", de.publicKey.ShortString(), de.discoShort)
de.c.dlogf("[v1] magicsock: disco: ending heartbeats for idle session to %v (%v)", de.publicKey.ShortString(), de.discoShort)
return
}
@ -3637,7 +3658,7 @@ func (de *endpoint) pingTimeout(txid stun.TxID) {
return
}
if debugDisco() || !de.bestAddr.IsValid() || mono.Now().After(de.trustBestAddrUntil) {
de.c.logf("[v1] magicsock: disco: timeout waiting for pong %x from %v (%v, %v)", txid[:6], sp.to, de.publicKey.ShortString(), de.discoShort)
de.c.dlogf("[v1] magicsock: disco: timeout waiting for pong %x from %v (%v, %v)", txid[:6], sp.to, de.publicKey.ShortString(), de.discoShort)
}
de.removeSentPingLocked(txid, sp)
}
@ -3745,7 +3766,7 @@ func (de *endpoint) sendPingsLocked(now mono.Time, sendCallMeMaybe bool) {
sentAny = true
if firstPing && sendCallMeMaybe {
de.c.logf("[v1] magicsock: disco: send, starting discovery for %v (%v)", de.publicKey.ShortString(), de.discoShort)
de.c.dlogf("[v1] magicsock: disco: send, starting discovery for %v (%v)", de.publicKey.ShortString(), de.discoShort)
}
de.startPingLocked(ep, now, pingDiscovery)
@ -3830,7 +3851,7 @@ func (de *endpoint) addCandidateEndpoint(ep netip.AddrPort) {
}
// Newly discovered endpoint. Exciting!
de.c.logf("[v1] magicsock: disco: adding %v as candidate endpoint for %v (%s)", ep, de.discoShort, de.publicKey.ShortString())
de.c.dlogf("[v1] magicsock: disco: adding %v as candidate endpoint for %v (%s)", ep, de.discoShort, de.publicKey.ShortString())
de.endpointState[ep] = &endpointState{
lastGotPing: time.Now(),
}
@ -3843,7 +3864,7 @@ func (de *endpoint) addCandidateEndpoint(ep netip.AddrPort) {
}
}
size2 := len(de.endpointState)
de.c.logf("[v1] magicsock: disco: addCandidateEndpoint pruned %v candidate set from %v to %v entries", size, size2)
de.c.dlogf("[v1] magicsock: disco: addCandidateEndpoint pruned %v candidate set from %v to %v entries", size, size2)
}
}
@ -3897,7 +3918,7 @@ func (de *endpoint) handlePongConnLocked(m *disco.Pong, di *discoInfo, src netip
}
if sp.purpose != pingHeartbeat {
de.c.logf("[v1] magicsock: disco: %v<-%v (%v, %v) got pong tx=%x latency=%v pong.src=%v%v", de.c.discoShort, de.discoShort, de.publicKey.ShortString(), src, m.TxID[:6], latency.Round(time.Millisecond), m.Src, logger.ArgWriter(func(bw *bufio.Writer) {
de.c.dlogf("[v1] magicsock: disco: %v<-%v (%v, %v) got pong tx=%x latency=%v pong.src=%v%v", de.c.discoShort, de.discoShort, de.publicKey.ShortString(), src, m.TxID[:6], latency.Round(time.Millisecond), m.Src, logger.ArgWriter(func(bw *bufio.Writer) {
if sp.to != src {
fmt.Fprintf(bw, " ping.to=%v", sp.to)
}
@ -4010,7 +4031,7 @@ func (de *endpoint) handleCallMeMaybe(m *disco.CallMeMaybe) {
}
}
if len(newEPs) > 0 {
de.c.logf("[v1] magicsock: disco: call-me-maybe from %v %v added new endpoints: %v",
de.c.dlogf("[v1] magicsock: disco: call-me-maybe from %v %v added new endpoints: %v",
de.publicKey.ShortString(), de.discoShort,
logger.ArgWriter(func(w *bufio.Writer) {
for i, ep := range newEPs {

View File

@ -234,12 +234,12 @@ func (c *Conn) receiveDisco(pc net.PacketConn, isIPV6 bool) {
if acceptPort == 0 {
// This should only typically happen if the receiving address family
// was recently disabled.
c.logf("[v1] disco raw: dropping packet for port %d as acceptPort=0", dstPort)
c.dlogf("[v1] disco raw: dropping packet for port %d as acceptPort=0", dstPort)
continue
}
if dstPort != acceptPort {
c.logf("[v1] disco raw: dropping packet for port %d", dstPort)
c.dlogf("[v1] disco raw: dropping packet for port %d", dstPort)
continue
}