wgengine/magicsock: bunch of misc discovery path cleanups

* fix tailscale status for peers using discovery
* as part of that, pull out disco address selection into reusable
  and testable discoEndpoint.addrForSendLocked
* truncate ping/pong logged hex txids in half to eliminate noise
* move a bunch of random time constants into named constants
  with docs
* track a history of per-endpoint pong replies for future use &
  status display
* add "send" and " got" prefix to discovery message logging
  immediately before the frame type so it's easier to read than
  searching for the "<-" or "->" arrows earlier in the line; but keep
  those as the more reasily machine readable part for later.

Updates #483
This commit is contained in:
Brad Fitzpatrick 2020-07-03 11:06:33 -07:00
parent e9643ae724
commit f5f3885b5b
2 changed files with 133 additions and 42 deletions

View File

@ -157,9 +157,9 @@ func parsePong(ver uint8, p []byte) (m *Pong, err error) {
func MessageSummary(m Message) string {
switch m := m.(type) {
case *Ping:
return fmt.Sprintf("ping tx=%x", m.TxID)
return fmt.Sprintf("ping tx=%x", m.TxID[:6])
case *Pong:
return fmt.Sprintf("pong tx=%x", m.TxID)
return fmt.Sprintf("pong tx=%x", m.TxID[:6])
case CallMeMaybe:
return "call-me-maybe"
default:

View File

@ -14,6 +14,7 @@
"errors"
"fmt"
"hash/fnv"
"math"
"math/rand"
"net"
"os"
@ -1411,7 +1412,7 @@ func (c *Conn) sendDiscoMessage(dst netaddr.IPPort, dstKey key.Public, dstDisco
pkt = box.SealAfterPrecomputation(pkt, m.AppendMarshal(nil), &nonce, sharedKey)
sent, err = c.sendAddr(dst, dstKey, pkt)
if sent {
c.logf("magicsock: disco: %v->%v (%v, %v): %v", c.discoShort, dstDisco.ShortString(), dstKey.ShortString(), derpStr(dst.String()), disco.MessageSummary(m))
c.logf("magicsock: disco: %v->%v (%v, %v) sent %v", c.discoShort, dstDisco.ShortString(), dstKey.ShortString(), derpStr(dst.String()), disco.MessageSummary(m))
} else if err == nil {
// Can't send. (e.g. no IPv6 locally)
} else {
@ -1511,6 +1512,7 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netaddr.IPPort) bool {
c.logf("[unexpected] CallMeMaybe packets should only come via DERP")
return true
}
c.logf("magicsock: disco: %v<-%v (%v, %v) got call-me-maybe", c.discoShort, de.discoShort, de.publicKey.ShortString(), derpStr(src.String()))
go de.handleCallMeMaybe()
}
@ -1518,7 +1520,7 @@ func (c *Conn) handleDiscoMessage(msg []byte, src netaddr.IPPort) bool {
}
func (c *Conn) handlePingLocked(dm *disco.Ping, de *discoEndpoint, src netaddr.IPPort) {
c.logf("magicsock: disco: %v<-%v (%v, %v) ping tx=%x", c.discoShort, de.discoKey.ShortString(), de.publicKey.ShortString(), src, dm.TxID)
c.logf("magicsock: disco: %v<-%v (%v, %v) got ping tx=%x", c.discoShort, de.discoShort, de.publicKey.ShortString(), src, dm.TxID[:6])
// Remember this this route if not present.
c.setAddrToDiscoLocked(src, de.discoKey, nil)
@ -2568,6 +2570,15 @@ func (c *Conn) UpdateStatus(sb *ipnstate.StatusBuilder) {
c.mu.Lock()
defer c.mu.Unlock()
for dk, de := range c.endpointOfDisco {
ps := &ipnstate.PeerStatus{InMagicSock: true}
if node, ok := c.nodeOfDisco[dk]; ok {
ps.Addrs = append(ps.Addrs, node.Endpoints...)
}
de.populatePeerStatus(ps)
sb.AddPeer(de.publicKey, ps)
}
// Old-style (pre-disco) peers:
for k, as := range c.addrsByKey {
ps := &ipnstate.PeerStatus{
InMagicSock: true,
@ -2629,10 +2640,41 @@ type discoEndpoint struct {
timers map[*time.Timer]bool
}
const (
// discoPingInterval is the minimum time between pings
// to an endpoint. (Except in the case of CallMeMaybe frames
// resetting the counter, as the first pings likely didn't through
// the firewall)
discoPingInterval = 5 * time.Second
// pingTimeoutDuration is how long we wait for a pong reply before
// assuming it's never coming.
pingTimeoutDuration = 5 * time.Second
// trustUDPAddrDuration is how long we trust a UDP address as the exclusive
// path (without using DERP) without having heard a Pong reply.
trustUDPAddrDuration = 5 * time.Second
)
// endpointState is some state and history for a specific endpoint of
// a discoEndpoint. (The subject is the discoEndpoint.endpointState
// map key)
type endpointState struct {
lastPing time.Time
// TODO: lastPong time.Time
index int // index in nodecfg.Node.Endpoints
// all fields guarded by discoEndpoint.mu:
lastPing time.Time
recentPongs []pongReply // ring buffer up to pongHistoryCount entries
recentPong uint16 // index into recentPongs of most recent; older , wrapped
index int16 // index in nodecfg.Node.Endpoints
}
// pongHistoryCount is how many pongReply values we keep per endpointState
const pongHistoryCount = 64
type pongReply struct {
latency time.Duration
pongAt time.Time // when we received the pong
from netaddr.IPPort // the pong's src (usually same as endpoint map key)
pongSrc netaddr.IPPort // what they reported they heard
}
type sentPing struct {
@ -2683,35 +2725,44 @@ func (de *discoEndpoint) UpdateDst(addr *net.UDPAddr) error {
return nil
}
// addrForSendLocked returns the address(es) that should be used for
// sending the next packet. Zero, one, or both of UDP address and DERP
// addr may be non-zero.
//
// de.mu must be held.
func (de *discoEndpoint) addrForSendLocked(now time.Time) (udpAddr, derpAddr netaddr.IPPort) {
udpAddr = de.bestAddr
if udpAddr.IsZero() || now.After(de.trustBestAddrUntil) {
// We had a bestAddr but it expired so send both to it
// and DERP.
derpAddr = de.derpAddr
}
return
}
func (de *discoEndpoint) send(b []byte) error {
now := time.Now()
de.mu.Lock()
de.lastSend = now
derpAddr := de.derpAddr
haveDerp := !derpAddr.IsZero()
bestAddr := de.bestAddr
bestOld := now.After(de.trustBestAddrUntil)
if bestAddr.IsZero() || bestOld {
udpAddr, derpAddr := de.addrForSendLocked(now)
if udpAddr.IsZero() || now.After(de.trustBestAddrUntil) {
de.sendPingsLocked(now, true)
}
de.mu.Unlock()
var didDerp bool
if bestAddr.IsZero() {
if !haveDerp {
return errors.New("no DERP addr")
}
bestAddr = derpAddr
} else if bestOld && haveDerp {
// We have a bestAddr, but it hasn't been confirmed in a while,
// so let's not entirely trust it and also send via DERP.
didDerp, _ = de.c.sendAddr(derpAddr, de.publicKey, b)
if udpAddr.IsZero() && derpAddr.IsZero() {
return errors.New("no UDP or DERP addr")
}
_, err := de.c.sendAddr(bestAddr, de.publicKey, b)
if didDerp {
return nil
var err error
if !udpAddr.IsZero() {
_, err = de.c.sendAddr(udpAddr, de.publicKey, b)
}
if !derpAddr.IsZero() {
if ok, _ := de.c.sendAddr(derpAddr, de.publicKey, b); ok && err != nil {
// UDP failed but DERP worked, so good enough:
return nil
}
}
return err
}
@ -2765,14 +2816,14 @@ func (de *discoEndpoint) sendPingsLocked(now time.Time, sendCallMeMaybe bool) {
var sentAny bool
for ep, st := range de.endpointState {
ep := ep
if !st.lastPing.IsZero() && now.Sub(st.lastPing) < 5*time.Second {
if !st.lastPing.IsZero() && now.Sub(st.lastPing) < discoPingInterval {
continue
}
st.lastPing = now
txid := stun.NewTxID()
t := de.newTimerLocked(5*time.Second, func() {
de.c.logf("magicsock: disco: timeout waiting for pong %x from %v (%v, %v)", txid, ep, de.publicKey.ShortString(), de.discoShort)
t := de.newTimerLocked(pingTimeoutDuration, func() {
de.c.logf("magicsock: disco: timeout waiting for pong %x from %v (%v, %v)", txid[:6], ep, de.publicKey.ShortString(), de.discoShort)
de.forgetPing(txid)
})
de.sentPing[txid] = sentPing{
@ -2824,15 +2875,19 @@ func (de *discoEndpoint) updateFromNode(n *tailcfg.Node) {
st.index = -1 // assume deleted until updated in next loop
}
for i, epStr := range n.Endpoints {
if i > math.MaxInt16 {
// Seems unlikely.
continue
}
ipp, err := netaddr.ParseIPPort(epStr)
if err != nil {
de.c.logf("magicsock: bogus netmap endpoint %q", epStr)
continue
}
if st, ok := de.endpointState[ipp]; ok {
st.index = i
st.index = int16(i)
} else {
de.endpointState[ipp] = &endpointState{index: i}
de.endpointState[ipp] = &endpointState{index: int16(i)}
}
}
// Now delete anything that wasn't updated.
@ -2878,38 +2933,60 @@ func (de *discoEndpoint) handlePongConnLocked(m *disco.Pong, src netaddr.IPPort)
}
de.removeSentPingLocked(m.TxID, sp)
st, ok := de.endpointState[sp.to]
if !ok {
// This is no longer an endpoint we care about.
return
}
de.c.setAddrToDiscoLocked(src, de.discoKey, de)
now := time.Now()
delay := now.Sub(sp.at)
latency := now.Sub(sp.at)
de.c.logf("magicsock: disco: %v<-%v (%v, %v) pong tx=%x latency=%v pong.src=%v%v", de.c.discoShort, de.discoShort, de.publicKey.ShortString(), src, m.TxID, delay.Round(time.Millisecond), m.Src, logger.ArgWriter(func(bw *bufio.Writer) {
st.addPongReplyLocked(pongReply{
latency: latency,
pongAt: now,
from: src,
pongSrc: m.Src,
})
de.c.logf("magicsock: disco: %v<-%v (%v, %v) got pong tx=%x latency=%v pong.src=%v%v", de.c.discoShort, de.discoShort, de.publicKey.ShortString(), src, m.TxID[:6], latency.Round(time.Millisecond), m.Src, logger.ArgWriter(func(bw *bufio.Writer) {
if sp.to != src {
fmt.Fprintf(bw, " ping.to=%v", sp.to)
}
}))
// Expire our best address if we haven't heard from it in awhile.
tooOld := now.Add(-15 * time.Second)
if !de.bestAddr.IsZero() && de.bestAddrAt.Before(tooOld) {
de.bestAddr = netaddr.IPPort{}
}
// Promote this pong response to our current best address if it's lower latency.
// TODO(bradfitz): decide how latency vs. preference order affects decision
if de.bestAddr.IsZero() || delay < de.bestAddrLatency {
if de.bestAddr.IsZero() || latency < de.bestAddrLatency {
if de.bestAddr != sp.to {
de.c.logf("magicsock: disco: node %v %v now using %v", de.publicKey.ShortString(), de.discoShort, sp.to)
de.bestAddr = sp.to
}
}
if de.bestAddr == sp.to {
de.bestAddrLatency = delay
de.bestAddrLatency = latency
de.bestAddrAt = now
de.trustBestAddrUntil = now.Add(5 * time.Second)
de.trustBestAddrUntil = now.Add(trustUDPAddrDuration)
}
}
// discoEndpoint.mu must be held.
func (st *endpointState) addPongReplyLocked(r pongReply) {
if n := len(st.recentPongs); n < pongHistoryCount {
st.recentPong = uint16(n)
st.recentPongs = append(st.recentPongs, r)
return
}
i := st.recentPong + 1
if i == pongHistoryCount {
i = 0
}
st.recentPongs[i] = r
st.recentPong = i
}
// handleCallMeMaybe handles a CallMeMaybe discovery message via
// DERP. The contract for use of this message is that the peer has
// already sent to us via UDP, so their stateful firewall should be
@ -2926,6 +3003,20 @@ func (de *discoEndpoint) handleCallMeMaybe() {
de.sendPingsLocked(time.Now(), false)
}
func (de *discoEndpoint) populatePeerStatus(ps *ipnstate.PeerStatus) {
de.mu.Lock()
defer de.mu.Unlock()
if de.lastSend.IsZero() {
return
}
now := time.Now()
if udpAddr, _ := de.addrForSendLocked(now); !udpAddr.IsZero() {
ps.CurAddr = udpAddr.String()
}
}
// cleanup is called when a discovery endpoint is no longer present in the NetworkMap.
// This is where we can do cleanup such as closing goroutines or canceling timers.
func (de *discoEndpoint) cleanup() {