mirror of
https://github.com/tailscale/tailscale.git
synced 2024-11-29 04:55:31 +00:00
wgengine: on TCP connect fail/timeout, log some clues about why it failed
So users can see why things aren't working. A start. More diagnostics coming. Updates #1094
This commit is contained in:
parent
5eeaea9ef9
commit
85e54af0d7
@ -51,7 +51,7 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep
|
||||
tailscale.com/logtail/backoff from tailscale.com/control/controlclient+
|
||||
tailscale.com/metrics from tailscale.com/derp
|
||||
tailscale.com/net/dnscache from tailscale.com/control/controlclient+
|
||||
tailscale.com/net/flowtrack from tailscale.com/wgengine/filter
|
||||
tailscale.com/net/flowtrack from tailscale.com/wgengine/filter+
|
||||
💣 tailscale.com/net/interfaces from tailscale.com/cmd/tailscale/cli+
|
||||
tailscale.com/net/netcheck from tailscale.com/cmd/tailscale/cli+
|
||||
tailscale.com/net/netns from tailscale.com/control/controlclient+
|
||||
|
@ -86,7 +86,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
|
||||
tailscale.com/logtail/filch from tailscale.com/logpolicy
|
||||
tailscale.com/metrics from tailscale.com/derp
|
||||
tailscale.com/net/dnscache from tailscale.com/control/controlclient+
|
||||
tailscale.com/net/flowtrack from tailscale.com/wgengine/filter
|
||||
tailscale.com/net/flowtrack from tailscale.com/wgengine/filter+
|
||||
💣 tailscale.com/net/interfaces from tailscale.com/ipn+
|
||||
tailscale.com/net/netcheck from tailscale.com/wgengine/magicsock
|
||||
tailscale.com/net/netns from tailscale.com/control/controlclient+
|
||||
|
@ -710,14 +710,46 @@ func peerForIP(nm *controlclient.NetworkMap, ip netaddr.IP) (n *tailcfg.Node, ok
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(bradfitz): this is O(n peers). Add ART to netaddr?
|
||||
var best netaddr.IPPrefix
|
||||
for _, p := range nm.Peers {
|
||||
for _, cidr := range p.AllowedIPs {
|
||||
if cidr.Contains(ip) {
|
||||
return p, true
|
||||
if best.IsZero() || cidr.Bits > best.Bits {
|
||||
n = p
|
||||
best = cidr
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
return n, n != nil
|
||||
}
|
||||
|
||||
// PeerForIP returns the node that ip should route to.
|
||||
func (c *Conn) PeerForIP(ip netaddr.IP) (n *tailcfg.Node, ok bool) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
if c.netMap == nil {
|
||||
return
|
||||
}
|
||||
return peerForIP(c.netMap, ip)
|
||||
}
|
||||
|
||||
// LastRecvActivityOfDisco returns the time we last got traffic from
|
||||
// this endpoint (updated every ~10 seconds).
|
||||
func (c *Conn) LastRecvActivityOfDisco(dk tailcfg.DiscoKey) time.Time {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
de, ok := c.endpointOfDisco[dk]
|
||||
if !ok {
|
||||
return time.Time{}
|
||||
}
|
||||
unix := atomic.LoadInt64(&de.lastRecvUnixAtomic)
|
||||
if unix == 0 {
|
||||
return time.Time{}
|
||||
}
|
||||
return time.Unix(unix, 0)
|
||||
}
|
||||
|
||||
// Ping handles a "tailscale ping" CLI query.
|
||||
|
157
wgengine/pendopen.go
Normal file
157
wgengine/pendopen.go
Normal file
@ -0,0 +1,157 @@
|
||||
// Copyright (c) 2021 Tailscale Inc & AUTHORS All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package wgengine
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"tailscale.com/net/flowtrack"
|
||||
"tailscale.com/net/packet"
|
||||
"tailscale.com/wgengine/filter"
|
||||
"tailscale.com/wgengine/tstun"
|
||||
)
|
||||
|
||||
const tcpTimeoutBeforeDebug = 5 * time.Second
|
||||
|
||||
// debugConnectFailures reports whether the local node should track
|
||||
// outgoing TCP connections and log which ones fail and why.
|
||||
func debugConnectFailures() bool {
|
||||
s := os.Getenv("TS_DEBUG_CONNECT_FAILURES")
|
||||
if s == "" {
|
||||
return true
|
||||
}
|
||||
v, _ := strconv.ParseBool(s)
|
||||
return v
|
||||
}
|
||||
|
||||
type pendingOpenFlow struct {
|
||||
timer *time.Timer // until giving up on the flow
|
||||
}
|
||||
|
||||
func (e *userspaceEngine) trackOpenPreFilterIn(pp *packet.Parsed, t *tstun.TUN) (res filter.Response) {
|
||||
res = filter.Accept // always
|
||||
|
||||
if pp.IPVersion == 0 ||
|
||||
pp.IPProto != packet.TCP ||
|
||||
pp.TCPFlags&(packet.TCPSyn|packet.TCPRst) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
flow := flowtrack.Tuple{Dst: pp.Src, Src: pp.Dst} // src/dst reversed
|
||||
|
||||
e.mu.Lock()
|
||||
defer e.mu.Unlock()
|
||||
of, ok := e.pendOpen[flow]
|
||||
if !ok {
|
||||
// Not a tracked flow.
|
||||
return
|
||||
}
|
||||
of.timer.Stop()
|
||||
delete(e.pendOpen, flow)
|
||||
|
||||
if pp.TCPFlags&packet.TCPRst != 0 {
|
||||
// TODO(bradfitz): have peer send a IP proto 99 "why"
|
||||
// packet first with details and log that instead
|
||||
// (e.g. ACL prohibited, shields up, etc).
|
||||
e.logf("open-conn-track: flow %v got RST by peer", flow)
|
||||
return
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func (e *userspaceEngine) trackOpenPostFilterOut(pp *packet.Parsed, t *tstun.TUN) (res filter.Response) {
|
||||
res = filter.Accept // always
|
||||
|
||||
if pp.IPVersion == 0 ||
|
||||
pp.IPProto != packet.TCP ||
|
||||
pp.TCPFlags&packet.TCPSyn == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
flow := flowtrack.Tuple{Src: pp.Src, Dst: pp.Dst}
|
||||
timer := time.AfterFunc(tcpTimeoutBeforeDebug, func() {
|
||||
e.onOpenTimeout(flow)
|
||||
})
|
||||
|
||||
e.mu.Lock()
|
||||
defer e.mu.Unlock()
|
||||
if e.pendOpen == nil {
|
||||
e.pendOpen = make(map[flowtrack.Tuple]*pendingOpenFlow)
|
||||
}
|
||||
if _, dup := e.pendOpen[flow]; dup {
|
||||
// Duplicates are expected when the OS retransmits. Ignore.
|
||||
return
|
||||
}
|
||||
e.pendOpen[flow] = &pendingOpenFlow{timer: timer}
|
||||
|
||||
return filter.Accept
|
||||
}
|
||||
|
||||
func (e *userspaceEngine) onOpenTimeout(flow flowtrack.Tuple) {
|
||||
e.mu.Lock()
|
||||
if _, ok := e.pendOpen[flow]; !ok {
|
||||
// Not a tracked flow, or already handled & deleted.
|
||||
e.mu.Unlock()
|
||||
return
|
||||
}
|
||||
delete(e.pendOpen, flow)
|
||||
e.mu.Unlock()
|
||||
|
||||
// Diagnose why it might've timed out.
|
||||
n, ok := e.magicConn.PeerForIP(flow.Dst.IP)
|
||||
if !ok {
|
||||
e.logf("open-conn-track: timeout opening %v; no associated peer node", flow)
|
||||
return
|
||||
}
|
||||
if n.DiscoKey.IsZero() {
|
||||
e.logf("open-conn-track: timeout opening %v; peer node %v running pre-0.100", flow, n.Key.ShortString())
|
||||
return
|
||||
}
|
||||
if n.DERP == "" {
|
||||
e.logf("open-conn-track: timeout opening %v; peer node %v not connected to any DERP relay", flow, n.Key.ShortString())
|
||||
return
|
||||
}
|
||||
var lastSeen time.Time
|
||||
if n.LastSeen != nil {
|
||||
lastSeen = *n.LastSeen
|
||||
}
|
||||
|
||||
var ps *PeerStatus
|
||||
if st, err := e.getStatus(); err == nil {
|
||||
for _, v := range st.Peers {
|
||||
if v.NodeKey == n.Key {
|
||||
v := v // copy
|
||||
ps = &v
|
||||
}
|
||||
}
|
||||
} else {
|
||||
e.logf("open-conn-track: timeout opening %v to node %v; failed to get engine status: %v", flow, n.Key.ShortString(), err)
|
||||
return
|
||||
}
|
||||
if ps == nil {
|
||||
e.logf("open-conn-track: timeout opening %v; target node %v in netmap but unknown to wireguard", flow, n.Key.ShortString())
|
||||
return
|
||||
}
|
||||
|
||||
// TODO(bradfitz): figure out what PeerStatus.LastHandshake
|
||||
// is. It appears to be the last time we sent a handshake,
|
||||
// which isn't what I expected. I thought it was when a
|
||||
// handshake completed, which is what I want.
|
||||
_ = ps.LastHandshake
|
||||
|
||||
e.logf("open-conn-track: timeout opening %v to node %v; lastSeen=%v, lastRecv=%v",
|
||||
flow, n.Key.ShortString(),
|
||||
agoOrNever(lastSeen), agoOrNever(e.magicConn.LastRecvActivityOfDisco(n.DiscoKey)))
|
||||
}
|
||||
|
||||
func agoOrNever(t time.Time) string {
|
||||
if t.IsZero() {
|
||||
return "never"
|
||||
}
|
||||
return time.Since(t).Round(time.Second).String()
|
||||
}
|
@ -30,6 +30,7 @@
|
||||
"tailscale.com/control/controlclient"
|
||||
"tailscale.com/internal/deepprint"
|
||||
"tailscale.com/ipn/ipnstate"
|
||||
"tailscale.com/net/flowtrack"
|
||||
"tailscale.com/net/interfaces"
|
||||
"tailscale.com/net/packet"
|
||||
"tailscale.com/net/tsaddr"
|
||||
@ -118,6 +119,7 @@ type userspaceEngine struct {
|
||||
endpoints []string
|
||||
pingers map[wgkey.Key]*pinger // legacy pingers for pre-discovery peers
|
||||
linkState *interfaces.State
|
||||
pendOpen map[flowtrack.Tuple]*pendingOpenFlow // see pendopen.go
|
||||
|
||||
// Lock ordering: magicsock.Conn.mu, wgLock, then mu.
|
||||
}
|
||||
@ -265,6 +267,17 @@ func newUserspaceEngineAdvanced(conf EngineConfig) (_ Engine, reterr error) {
|
||||
}
|
||||
e.tundev.PreFilterOut = e.handleLocalPackets
|
||||
|
||||
if debugConnectFailures() {
|
||||
if e.tundev.PreFilterIn != nil {
|
||||
return nil, errors.New("unexpected PreFilterIn already set")
|
||||
}
|
||||
e.tundev.PreFilterIn = e.trackOpenPreFilterIn
|
||||
if e.tundev.PostFilterOut != nil {
|
||||
return nil, errors.New("unexpected PostFilterOut already set")
|
||||
}
|
||||
e.tundev.PostFilterOut = e.trackOpenPostFilterOut
|
||||
}
|
||||
|
||||
// wireguard-go logs as it starts and stops routines.
|
||||
// Silence those; there are a lot of them, and they're just noise.
|
||||
allowLogf := func(s string) bool {
|
||||
|
Loading…
Reference in New Issue
Block a user