net/netcheck: centralize all clock values in one place.

This makes it easier to see how long a netcheck might take, and what
the slow bits might be.

Signed-off-by: David Anderson <danderson@tailscale.com>
This commit is contained in:
David Anderson 2020-07-25 02:29:27 +00:00 committed by Dave Anderson
parent 5a370d545a
commit 25288567ec

View File

@ -18,7 +18,9 @@
"log"
"net"
"net/http"
"os"
"sort"
"strconv"
"sync"
"time"
@ -36,6 +38,45 @@
"tailscale.com/types/opt"
)
// Debugging and experimentation tweakables.
var (
debugNetcheck, _ = strconv.ParseBool(os.Getenv("TS_DEBUG_NETCHECK"))
)
// The various default timeouts for things.
const (
// overallProbeTimeout is the maximum amount of time netcheck will
// spend gathering a single report.
overallProbeTimeout = 5 * time.Second
// stunTimeout is the maximum amount of time netcheck will spend
// probing with STUN packets without getting a reply before
// switching to HTTP probing, on the assumption that outbound UDP
// is blocked.
stunProbeTimeout = 3 * time.Second
// hairpinCheckTimeout is the amount of time we wait for a
// hairpinned packet to come back.
hairpinCheckTimeout = 500 * time.Millisecond
// defaultActiveRetransmitTime is the retransmit interval we use
// for STUN probes when we're in steady state (not in start-up),
// but don't have previous latency information for a DERP
// node. This is a somewhat conservative guess because if we have
// no data, likely the DERP node is very far away and we have no
// data because we timed out the last time we probed it.
defaultActiveRetransmitTime = 200 * time.Millisecond
// defaultInitialRetransmitTime is the retransmit interval used
// when netcheck first runs. We have no past context to work with,
// and we want answers relatively quickly, so it's biased slightly
// more aggressive than defaultActiveRetransmitTime. A few extra
// packets at startup is fine.
defaultInitialRetransmitTime = 100 * time.Millisecond
// portMapServiceProbeTimeout is the time we wait for port mapping
// services (UPnP, NAT-PMP, PCP) to respond before we give up and
// decide that they're not there. Since these services are on the
// same LAN as this machine and a single L3 hop away, we don't
// give them much time to respond.
portMapServiceProbeTimeout = 100 * time.Millisecond
)
type Report struct {
UDP bool // UDP works
IPv6 bool // IPv6 works
@ -139,7 +180,7 @@ func (c *Client) logf(format string, a ...interface{}) {
}
func (c *Client) vlogf(format string, a ...interface{}) {
if c.Verbose {
if c.Verbose || debugNetcheck {
c.logf(format, a...)
}
}
@ -170,6 +211,8 @@ func (c *Client) MakeNextReportFull() {
}
func (c *Client) ReceiveSTUNPacket(pkt []byte, src netaddr.IPPort) {
c.vlogf("received STUN packet from %s", src)
c.mu.Lock()
if c.handleHairSTUNLocked(pkt, src) {
c.mu.Unlock()
@ -330,7 +373,7 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *interfaces.State, last *Report)
n := reg.Nodes[try%len(reg.Nodes)]
prevLatency := last.RegionLatency[reg.RegionID] * 120 / 100
if prevLatency == 0 {
prevLatency = 200 * time.Millisecond
prevLatency = defaultActiveRetransmitTime
}
delay := time.Duration(try) * prevLatency
if do4 {
@ -353,16 +396,12 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *interfaces.State, last *Report)
func makeProbePlanInitial(dm *tailcfg.DERPMap, ifState *interfaces.State) (plan probePlan) {
plan = make(probePlan)
// initialSTUNTimeout is only 100ms because some extra retransmits
// when starting up is tolerable.
const initialSTUNTimeout = 100 * time.Millisecond
for _, reg := range dm.Regions {
var p4 []probe
var p6 []probe
for try := 0; try < 3; try++ {
n := reg.Nodes[try%len(reg.Nodes)]
delay := time.Duration(try) * initialSTUNTimeout
delay := time.Duration(try) * defaultInitialRetransmitTime
if ifState.HaveV4 && nodeMight4(n) {
p4 = append(p4, probe{delay: delay, node: n.Name, proto: probeIPv4})
}
@ -518,7 +557,7 @@ func (rs *reportState) startHairCheckLocked(dst netaddr.IPPort) {
ua := dst.UDPAddr()
rs.pc4Hair.WriteTo(stun.Request(rs.hairTX), ua)
rs.c.vlogf("sent haircheck to %v", ua)
time.AfterFunc(500*time.Millisecond, func() { close(rs.hairTimeout) })
time.AfterFunc(hairpinCheckTimeout, func() { close(rs.hairTimeout) })
}
func (rs *reportState) waitHairCheck(ctx context.Context) {
@ -539,6 +578,7 @@ func (rs *reportState) waitHairCheck(ctx context.Context) {
case <-rs.gotHairSTUN:
ret.HairPinning.Set(true)
case <-rs.hairTimeout:
rs.c.vlogf("hairCheck timeout")
ret.HairPinning.Set(false)
default:
select {
@ -649,7 +689,7 @@ func (rs *reportState) probePortMapServices() {
}
defer uc.Close()
tempPort := uc.LocalAddr().(*net.UDPAddr).Port
uc.SetReadDeadline(time.Now().Add(100 * time.Millisecond))
uc.SetReadDeadline(time.Now().Add(portMapServiceProbeTimeout))
// Send request packets for all three protocols.
uc.WriteTo(uPnPPacket, port1900)
@ -727,15 +767,10 @@ func newReport() *Report {
//
// It may not be called concurrently with itself.
func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (*Report, error) {
// Wait for STUN for 3 seconds, but then give HTTP probing
// another 2 seconds if all UDP failed.
const overallTimeout = 5 * time.Second
const stunTimeout = 3 * time.Second
// Mask user context with ours that we guarantee to cancel so
// we can depend on it being closed in goroutines later.
// (User ctx might be context.Background, etc)
ctx, cancel := context.WithTimeout(ctx, overallTimeout)
ctx, cancel := context.WithTimeout(ctx, overallProbeTimeout)
defer cancel()
if dm == nil {
@ -844,7 +879,7 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (*Report, e
}(probeSet)
}
stunTimer := time.NewTimer(stunTimeout)
stunTimer := time.NewTimer(stunProbeTimeout)
defer stunTimer.Stop()
select {
@ -857,7 +892,9 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (*Report, e
}
rs.waitHairCheck(ctx)
c.vlogf("hairCheck done")
rs.waitPortMap.Wait()
c.vlogf("portMap done")
rs.stopTimers()
// Try HTTPS latency check if all STUN probes failed due to UDP presumably being blocked.
@ -912,7 +949,7 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap) (*Report, e
func (c *Client) measureHTTPSLatency(ctx context.Context, reg *tailcfg.DERPRegion) (time.Duration, netaddr.IP, error) {
var result httpstat.Result
ctx, cancel := context.WithTimeout(httpstat.WithHTTPStat(ctx, &result), 5*time.Second)
ctx, cancel := context.WithTimeout(httpstat.WithHTTPStat(ctx, &result), overallProbeTimeout)
defer cancel()
var ip netaddr.IP