diff --git a/net/netcheck/netcheck.go b/net/netcheck/netcheck.go index 0a6b3156b..204d9889b 100644 --- a/net/netcheck/netcheck.go +++ b/net/netcheck/netcheck.go @@ -18,7 +18,9 @@ import ( "log" "net" "net/http" + "os" "sort" + "strconv" "sync" "time" @@ -36,6 +38,45 @@ import ( "tailscale.com/types/opt" ) +// Debugging and experimentation tweakables. +var ( + debugNetcheck, _ = strconv.ParseBool(os.Getenv("TS_DEBUG_NETCHECK")) +) + +// The various default timeouts for things. +const ( + // overallProbeTimeout is the maximum amount of time netcheck will + // spend gathering a single report. + overallProbeTimeout = 5 * time.Second + // stunTimeout is the maximum amount of time netcheck will spend + // probing with STUN packets without getting a reply before + // switching to HTTP probing, on the assumption that outbound UDP + // is blocked. + stunProbeTimeout = 3 * time.Second + // hairpinCheckTimeout is the amount of time we wait for a + // hairpinned packet to come back. + hairpinCheckTimeout = 500 * time.Millisecond + // defaultActiveRetransmitTime is the retransmit interval we use + // for STUN probes when we're in steady state (not in start-up), + // but don't have previous latency information for a DERP + // node. This is a somewhat conservative guess because if we have + // no data, likely the DERP node is very far away and we have no + // data because we timed out the last time we probed it. + defaultActiveRetransmitTime = 200 * time.Millisecond + // defaultInitialRetransmitTime is the retransmit interval used + // when netcheck first runs. We have no past context to work with, + // and we want answers relatively quickly, so it's biased slightly + // more aggressive than defaultActiveRetransmitTime. A few extra + // packets at startup is fine. + defaultInitialRetransmitTime = 100 * time.Millisecond + // portMapServiceProbeTimeout is the time we wait for port mapping + // services (UPnP, NAT-PMP, PCP) to respond before we give up and + // decide that they're not there. Since these services are on the + // same LAN as this machine and a single L3 hop away, we don't + // give them much time to respond. + portMapServiceProbeTimeout = 100 * time.Millisecond +) + type Report struct { UDP bool // UDP works IPv6 bool // IPv6 works @@ -139,7 +180,7 @@ func (c *Client) logf(format string, a ...interface{}) { } func (c *Client) vlogf(format string, a ...interface{}) { - if c.Verbose { + if c.Verbose || debugNetcheck { c.logf(format, a...) } } @@ -170,6 +211,8 @@ func (c *Client) MakeNextReportFull() { } func (c *Client) ReceiveSTUNPacket(pkt []byte, src netaddr.IPPort) { + c.vlogf("received STUN packet from %s", src) + c.mu.Lock() if c.handleHairSTUNLocked(pkt, src) { c.mu.Unlock() @@ -330,7 +373,7 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *interfaces.State, last *Report) n := reg.Nodes[try%len(reg.Nodes)] prevLatency := last.RegionLatency[reg.RegionID] * 120 / 100 if prevLatency == 0 { - prevLatency = 200 * time.Millisecond + prevLatency = defaultActiveRetransmitTime } delay := time.Duration(try) * prevLatency if do4 { @@ -353,16 +396,12 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *interfaces.State, last *Report) func makeProbePlanInitial(dm *tailcfg.DERPMap, ifState *interfaces.State) (plan probePlan) { plan = make(probePlan) - // initialSTUNTimeout is only 100ms because some extra retransmits - // when starting up is tolerable. - const initialSTUNTimeout = 100 * time.Millisecond - for _, reg := range dm.Regions { var p4 []probe var p6 []probe for try := 0; try < 3; try++ { n := reg.Nodes[try%len(reg.Nodes)] - delay := time.Duration(try) * initialSTUNTimeout + delay := time.Duration(try) * defaultInitialRetransmitTime if ifState.HaveV4 && nodeMight4(n) { p4 = append(p4, probe{delay: delay, node: n.Name, proto: probeIPv4}) } @@ -518,7 +557,7 @@ func (rs *reportState) startHairCheckLocked(dst netaddr.IPPort) { ua := dst.UDPAddr() rs.pc4Hair.WriteTo(stun.Request(rs.hairTX), ua) rs.c.vlogf("sent haircheck to %v", ua) - time.AfterFunc(500*time.Millisecond, func() { close(rs.hairTimeout) }) + time.AfterFunc(hairpinCheckTimeout, func() { close(rs.hairTimeout) }) } func (rs *reportState) waitHairCheck(ctx context.Context) { @@ -539,6 +578,7 @@ func (rs *reportState) waitHairCheck(ctx context.Context) { case <-rs.gotHairSTUN: ret.HairPinning.Set(true) case <-rs.hairTimeout: + rs.c.vlogf("hairCheck timeout") ret.HairPinning.Set(false) default: select { @@ -649,7 +689,7 @@ func (rs *reportState) probePortMapServices() { } defer uc.Close() tempPort := uc.LocalAddr().(*net.UDPAddr).Port - uc.SetReadDeadline(time.Now().Add(100 * time.Millisecond)) + uc.SetReadDeadline(time.Now().Add(portMapServiceProbeTimeout)) // Send request packets for all three protocols. uc.WriteTo(uPnPPacket, port1900) @@ -727,15 +767,10 @@ func newReport() *Report { // // It may not be called concurrently with itself. func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (*Report, error) { - // Wait for STUN for 3 seconds, but then give HTTP probing - // another 2 seconds if all UDP failed. - const overallTimeout = 5 * time.Second - const stunTimeout = 3 * time.Second - // Mask user context with ours that we guarantee to cancel so // we can depend on it being closed in goroutines later. // (User ctx might be context.Background, etc) - ctx, cancel := context.WithTimeout(ctx, overallTimeout) + ctx, cancel := context.WithTimeout(ctx, overallProbeTimeout) defer cancel() if dm == nil { @@ -844,7 +879,7 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (*Report, e }(probeSet) } - stunTimer := time.NewTimer(stunTimeout) + stunTimer := time.NewTimer(stunProbeTimeout) defer stunTimer.Stop() select { @@ -857,7 +892,9 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (*Report, e } rs.waitHairCheck(ctx) + c.vlogf("hairCheck done") rs.waitPortMap.Wait() + c.vlogf("portMap done") rs.stopTimers() // Try HTTPS latency check if all STUN probes failed due to UDP presumably being blocked. @@ -912,7 +949,7 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (*Report, e func (c *Client) measureHTTPSLatency(ctx context.Context, reg *tailcfg.DERPRegion) (time.Duration, netaddr.IP, error) { var result httpstat.Result - ctx, cancel := context.WithTimeout(httpstat.WithHTTPStat(ctx, &result), 5*time.Second) + ctx, cancel := context.WithTimeout(httpstat.WithHTTPStat(ctx, &result), overallProbeTimeout) defer cancel() var ip netaddr.IP