tailscale/cmd/derper/derper.go
James Tucker edbad6d274 cmd/derper: add user timeout and reduce TCP keepalive
The derper sends an in-protocol keepalive every 60-65s, so frequent TCP
keepalives are unnecessary. In this tuning TCP keepalives should never
occur for a DERP client connection, as they will send an L7 keepalive
often enough to always reset the TCP keepalive timer. If, however, a
connection does not receive an ACK promptly, it will now be shut down,
which happens sooner than it would with a normal TCP keepalive tuning.

This re-tuning reduces the frequency of network traffic from derp to
client, reducing battery cost.

Updates tailscale/corp#17587
Updates #3363

Signed-off-by: James Tucker <james@tailscale.com>
2024-02-22 11:22:08 -08:00

474 lines
15 KiB
Go

// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause
// The derper binary is a simple DERP server.
package main // import "tailscale.com/cmd/derper"
import (
"cmp"
"context"
"crypto/tls"
"encoding/json"
"errors"
"expvar"
"flag"
"fmt"
"io"
"log"
"math"
"net"
"net/http"
"os"
"os/signal"
"path/filepath"
"regexp"
"strings"
"syscall"
"time"
"go4.org/mem"
"golang.org/x/time/rate"
"tailscale.com/atomicfile"
"tailscale.com/derp"
"tailscale.com/derp/derphttp"
"tailscale.com/metrics"
"tailscale.com/net/ktimeout"
"tailscale.com/net/stunserver"
"tailscale.com/tsweb"
"tailscale.com/types/key"
)
// Command-line flags. Each help string is the user-facing documentation
// printed by -help, so the flag names mentioned in them must match the
// names actually registered here.
var (
	// Listen addresses and on-disk configuration.
	dev        = flag.Bool("dev", false, "run in localhost development mode (overrides -a)")
	addr       = flag.String("a", ":443", "server HTTP/HTTPS listen address, in form \":port\", \"ip:port\", or for IPv6 \"[ip]:port\". If the IP is omitted, it defaults to all interfaces. Serves HTTPS if the port is 443 and/or -certmode is manual, otherwise HTTP.")
	httpPort   = flag.Int("http-port", 80, "The port on which to serve HTTP. Set to -1 to disable. The listener is bound to the same IP (if any) as specified in the -a flag.")
	stunPort   = flag.Int("stun-port", 3478, "The UDP port on which to serve STUN. The listener is bound to the same IP (if any) as specified in the -a flag.")
	configPath = flag.String("c", "", "config file path")

	// TLS certificate provisioning.
	certMode = flag.String("certmode", "letsencrypt", "mode for getting a cert. possible options: manual, letsencrypt")
	certDir  = flag.String("certdir", tsweb.DefaultCertDir("derper-certs"), "directory to store LetsEncrypt certs, if addr's port is :443")
	hostname = flag.String("hostname", "derp.tailscale.com", "LetsEncrypt host name, if addr's port is :443")

	// Service toggles. The listen address flag is "-a"; the -stun help text
	// previously referred to a non-existent "--addr" flag.
	runSTUN = flag.Bool("stun", true, "whether to run a STUN server. It will bind to the same IP (if any) as the -a flag value.")
	runDERP = flag.Bool("derp", true, "whether to run a DERP server. The only reason to set this false is if you're decommissioning a server but want to keep its bootstrap DNS functionality still running.")

	// Meshing with other DERP servers.
	meshPSKFile = flag.String("mesh-psk-file", defaultMeshPSKFile(), "if non-empty, path to file containing the mesh pre-shared key file. It should contain some hex string; whitespace is trimmed.")
	meshWith    = flag.String("mesh-with", "", "optional comma-separated list of hostnames to mesh with; the server's own hostname can be in the list")

	// Bootstrap DNS.
	bootstrapDNS   = flag.String("bootstrap-dns-names", "", "optional comma-separated list of hostnames to make available at /bootstrap-dns")
	unpublishedDNS = flag.String("unpublished-bootstrap-dns-names", "", "optional comma-separated list of hostnames to make available at /bootstrap-dns and not publish in the list")

	// Client admission.
	verifyClients   = flag.Bool("verify-clients", false, "verify clients to this DERP server through a local tailscaled instance.")
	verifyClientURL = flag.String("verify-client-url", "", "if non-empty, an admission controller URL for permitting client connections; see tailcfg.DERPAdmitClientRequest")
	verifyFailOpen  = flag.Bool("verify-client-url-fail-open", true, "whether we fail open if --verify-client-url is unreachable")

	// Accept rate limiting for the TLS listener. The defaults (infinite rate,
	// maximum burst) effectively disable the limiter.
	acceptConnLimit = flag.Float64("accept-connection-limit", math.Inf(+1), "rate limit for accepting new connection")
	acceptConnBurst = flag.Int("accept-connection-burst", math.MaxInt, "burst limit for accepting new connection")

	// tcpKeepAlive is intentionally long, to reduce battery cost. There is an L7 keepalive on a higher frequency schedule.
	tcpKeepAlive = flag.Duration("tcp-keepalive-time", 10*time.Minute, "TCP keepalive time")
	// tcpUserTimeout is intentionally short, so that hung connections are cleaned up promptly. DERPs should be nearby users.
	tcpUserTimeout = flag.Duration("tcp-user-timeout", 15*time.Second, "TCP user timeout")
)
// TLS version metrics. Both are published through expvar in init and are
// updated by the request wrapper that main installs when serving TLS.
var (
// tlsRequestVersion is incremented once per TLS request, labeled by the
// request's TLS version ("1.0" through "1.3", or "unknown").
tlsRequestVersion = &metrics.LabelMap{Label: "version"}
// tlsActiveVersion is incremented when a TLS request starts being handled and
// decremented when its handler returns, using the same version labels.
tlsActiveVersion = &metrics.LabelMap{Label: "version"}
)
// init publishes the TLS version metrics under their expvar names.
func init() {
	published := []struct {
		name string
		v    expvar.Var
	}{
		{"derper_tls_request_version", tlsRequestVersion},
		{"gauge_derper_tls_active_version", tlsActiveVersion},
	}
	for _, p := range published {
		expvar.Publish(p.name, p.v)
	}
}
// config is the derper configuration that loadConfig reads from, and
// writeNewConfig writes to, the JSON file named by the -c flag.
type config struct {
// PrivateKey is the node private key handed to derp.NewServer in main.
PrivateKey key.NodePrivate
}
// loadConfig returns the server configuration.
//
// In -dev mode a throwaway key is generated and nothing touches the disk.
// Otherwise the JSON file at -c is read; when -c is unset it defaults to
// /var/lib/derper/derper.key for root and is a fatal error for anyone else.
// A missing file is created via writeNewConfig; any other read or parse
// failure is fatal.
func loadConfig() config {
	if *dev {
		return config{PrivateKey: key.NewNode()}
	}

	if *configPath == "" {
		if os.Getuid() != 0 {
			log.Fatalf("derper: -c <config path> not specified")
		}
		*configPath = "/var/lib/derper/derper.key"
		log.Printf("no config path specified; using %s", *configPath)
	}

	raw, readErr := os.ReadFile(*configPath)
	if errors.Is(readErr, os.ErrNotExist) {
		return writeNewConfig()
	}
	if readErr != nil {
		// log.Fatal exits the process, so nothing below runs on this path.
		log.Fatal(readErr)
	}

	var loaded config
	if parseErr := json.Unmarshal(raw, &loaded); parseErr != nil {
		log.Fatalf("derper: config: %v", parseErr)
	}
	return loaded
}
// writeNewConfig generates a fresh node private key, stores it as
// tab-indented JSON at *configPath, and returns the resulting config.
// Every failure is fatal.
//
// The key file is written with mode 0600, so any directories created to
// hold it are made owner-only (0700) as well instead of 0777: a directory
// writable by other users would let them replace the key file even though
// they cannot read it. Directories that already exist are left untouched.
func writeNewConfig() config {
	if err := os.MkdirAll(filepath.Dir(*configPath), 0700); err != nil {
		log.Fatal(err)
	}
	cfg := config{
		PrivateKey: key.NewNode(),
	}
	b, err := json.MarshalIndent(cfg, "", "\t")
	if err != nil {
		log.Fatal(err)
	}
	// atomicfile.WriteFile is used so that the key file is replaced in one
	// step rather than written in place.
	if err := atomicfile.WriteFile(*configPath, b, 0600); err != nil {
		log.Fatal(err)
	}
	return cfg
}
// main parses the flags, optionally starts the STUN server, builds the DERP
// server and the HTTP mux around it (DERP, probe, bootstrap DNS, landing
// page, robots.txt, captive-portal check and debug endpoints), and then
// serves either over TLS or over plain HTTP. It returns when the HTTP server
// is shut down after SIGINT/SIGTERM; any other serve error is fatal.
func main() {
flag.Parse()
// ctx is cancelled on SIGINT or SIGTERM. It is handed to the STUN server and
// its cancellation triggers httpsrv.Shutdown further down.
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer cancel()
// Dev mode overrides the listen address and switches tsweb into dev mode.
if *dev {
*addr = ":3340" // above the keys DERP
log.Printf("Running in dev mode.")
tsweb.DevMode = true
}
// Only the host part of -a is kept; the STUN and port-80 listeners combine
// it with their own ports.
listenHost, _, err := net.SplitHostPort(*addr)
if err != nil {
log.Fatalf("invalid server address: %v", err)
}
if *runSTUN {
ss := stunserver.New(ctx)
go ss.ListenAndServe(net.JoinHostPort(listenHost, fmt.Sprint(*stunPort)))
}
cfg := loadConfig()
// TLS is served when tsweb.IsProd443 accepts the address or when the
// certificate is supplied manually.
serveTLS := tsweb.IsProd443(*addr) || *certMode == "manual"
s := derp.NewServer(cfg.PrivateKey, log.Printf)
s.SetVerifyClient(*verifyClients)
s.SetVerifyClientURL(*verifyClientURL)
s.SetVerifyClientURLFailOpen(*verifyFailOpen)
// The mesh key file, when configured, must contain at least 64 hex digits
// (case-insensitive) after surrounding whitespace is trimmed.
if *meshPSKFile != "" {
b, err := os.ReadFile(*meshPSKFile)
if err != nil {
log.Fatal(err)
}
key := strings.TrimSpace(string(b))
if matched, _ := regexp.MatchString(`(?i)^[0-9a-f]{64,}$`, key); !matched {
log.Fatalf("key in %s must contain 64+ hex digits", *meshPSKFile)
}
s.SetMeshKey(key)
log.Printf("DERP mesh key configured")
}
if err := startMesh(s); err != nil {
log.Fatalf("startMesh: %v", err)
}
expvar.Publish("derp", s.ExpVar())
mux := http.NewServeMux()
// With -derp=false the /derp route stays registered but always answers 404.
if *runDERP {
derpHandler := derphttp.Handler(s)
derpHandler = addWebSocketSupport(s, derpHandler)
mux.Handle("/derp", derpHandler)
} else {
mux.Handle("/derp", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.Error(w, "derp server disabled", http.StatusNotFound)
}))
}
mux.HandleFunc("/derp/probe", probeHandler)
// NOTE(review): refreshBootstrapDNSLoop is defined outside this file;
// presumably it keeps the data served at /bootstrap-dns fresh — confirm.
go refreshBootstrapDNSLoop()
mux.HandleFunc("/bootstrap-dns", tsweb.BrowserHeaderHandlerFunc(handleBootstrapDNS))
// Landing page: a short HTML blurb, plus a "disabled" notice when DERP is
// off and a link to /debug/ when tsweb.AllowDebugAccess permits the caller.
mux.Handle("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
tsweb.AddBrowserHeaders(w)
w.Header().Set("Content-Type", "text/html; charset=utf-8")
w.WriteHeader(200)
io.WriteString(w, `<html><body>
<h1>DERP</h1>
<p>
This is a
<a href="https://tailscale.com/">Tailscale</a>
<a href="https://pkg.go.dev/tailscale.com/derp">DERP</a>
server.
</p>
`)
if !*runDERP {
io.WriteString(w, `<p>Status: <b>disabled</b></p>`)
}
if tsweb.AllowDebugAccess(r) {
io.WriteString(w, "<p>Debug info at <a href='/debug/'>/debug/</a>.</p>\n")
}
}))
// robots.txt disallows all crawling.
mux.Handle("/robots.txt", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
tsweb.AddBrowserHeaders(w)
io.WriteString(w, "User-agent: *\nDisallow: /\n")
}))
mux.Handle("/generate_204", http.HandlerFunc(serveNoContent))
// Debug endpoints are registered through the tsweb debugger attached to mux.
debug := tsweb.Debugger(mux)
debug.KV("TLS hostname", *hostname)
debug.KV("Mesh key", s.HasMeshKey())
debug.Handle("check", "Consistency check", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
err := s.ConsistencyCheck()
if err != nil {
http.Error(w, err.Error(), 500)
} else {
io.WriteString(w, "derp.Server ConsistencyCheck okay")
}
}))
debug.Handle("traffic", "Traffic check", http.HandlerFunc(s.ServeDebugTraffic))
// Longer lived DERP connections send an application layer keepalive. Note
// if the keepalive is hit, the user timeout will take precedence over the
// keepalive counter, so the probe if unanswered will take effect promptly,
// this is less tolerant of high loss, but high loss is unexpected.
//
// lc is used for the listeners created directly in this function (the
// plain-HTTP listener and the port-80 listener).
lc := net.ListenConfig{
Control: ktimeout.UserTimeout(*tcpUserTimeout),
KeepAlive: *tcpKeepAlive,
}
// quietLogger routes net/http server errors through logFilter, which drops
// a fixed set of noisy messages.
quietLogger := log.New(logFilter{}, "", 0)
httpsrv := &http.Server{
Addr: *addr,
Handler: mux,
ErrorLog: quietLogger,
// Set read/write timeout. For derper, this basically
// only affects TLS setup, as read/write deadlines are
// cleared on Hijack, which the DERP server does. But
// without this, we slowly accumulate stuck TLS
// handshake goroutines forever. This also affects
// /debug/ traffic, but 30 seconds is plenty for
// Prometheus/etc scraping.
ReadTimeout: 30 * time.Second,
WriteTimeout: 30 * time.Second,
}
// Start shutting the HTTP server down once a termination signal arrives.
// NOTE(review): ctx is already cancelled when it is passed to Shutdown, so
// Shutdown presumably returns without waiting for active connections to
// drain — confirm that this is intended.
go func() {
<-ctx.Done()
httpsrv.Shutdown(ctx)
}()
if serveTLS {
log.Printf("derper: serving on %s with TLS", *addr)
var certManager certProvider
certManager, err = certProviderByCertMode(*certMode, *certDir, *hostname)
if err != nil {
log.Fatalf("derper: can not start cert provider: %v", err)
}
httpsrv.TLSConfig = certManager.TLSConfig()
// Wrap the provider's GetCertificate so that the DERP server's meta cert is
// appended to every certificate chain that is handed out.
getCert := httpsrv.TLSConfig.GetCertificate
httpsrv.TLSConfig.GetCertificate = func(hi *tls.ClientHelloInfo) (*tls.Certificate, error) {
cert, err := getCert(hi)
if err != nil {
return nil, err
}
cert.Certificate = append(cert.Certificate, s.MetaCert())
return cert, nil
}
// Disable TLS 1.0 and 1.1, which are obsolete and have security issues.
httpsrv.TLSConfig.MinVersion = tls.VersionTLS12
// Wrap the mux so each TLS request is counted by TLS version before being
// dispatched. The "1.0" and "1.1" labels remain even though MinVersion is
// set to TLS 1.2 above.
httpsrv.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.TLS != nil {
label := "unknown"
switch r.TLS.Version {
case tls.VersionTLS10:
label = "1.0"
case tls.VersionTLS11:
label = "1.1"
case tls.VersionTLS12:
label = "1.2"
case tls.VersionTLS13:
label = "1.3"
}
tlsRequestVersion.Add(label, 1)
tlsActiveVersion.Add(label, 1)
defer tlsActiveVersion.Add(label, -1)
}
mux.ServeHTTP(w, r)
})
// Unless disabled with -http-port=-1, also serve plain HTTP next to TLS:
// /generate_204 is answered directly and everything else goes through the
// cert manager's HTTP handler wrapped around tsweb.Port80Handler.
if *httpPort > -1 {
go func() {
port80mux := http.NewServeMux()
port80mux.HandleFunc("/generate_204", serveNoContent)
port80mux.Handle("/", certManager.HTTPHandler(tsweb.Port80Handler{Main: mux}))
port80srv := &http.Server{
Addr: net.JoinHostPort(listenHost, fmt.Sprintf("%d", *httpPort)),
Handler: port80mux,
ErrorLog: quietLogger,
ReadTimeout: 30 * time.Second,
// Crank up WriteTimeout a bit more than usually
// necessary just so we can do long CPU profiles
// and not hit net/http/pprof's "profile
// duration exceeds server's WriteTimeout".
WriteTimeout: 5 * time.Minute,
}
ln, err := lc.Listen(context.Background(), "tcp", port80srv.Addr)
if err != nil {
log.Fatal(err)
}
defer ln.Close()
err = port80srv.Serve(ln)
if err != nil {
if err != http.ErrServerClosed {
log.Fatal(err)
}
}
}()
}
err = rateLimitedListenAndServeTLS(httpsrv)
} else {
log.Printf("derper: serving on %s", *addr)
var ln net.Listener
ln, err = lc.Listen(context.Background(), "tcp", httpsrv.Addr)
if err != nil {
log.Fatal(err)
}
err = httpsrv.Serve(ln)
}
// http.ErrServerClosed is what Serve reports after Shutdown, i.e. the
// signal-triggered exit path; every other error is fatal.
if err != nil && err != http.ErrServerClosed {
log.Fatalf("derper: %v", err)
}
}
// HTTP header names used by serveNoContent for its challenge/response echo.
const (
// noContentChallengeHeader is the request header from which serveNoContent
// reads the client's challenge value.
noContentChallengeHeader = "X-Tailscale-Challenge"
// noContentResponseHeader is the response header in which serveNoContent
// returns "response " followed by the accepted challenge.
noContentResponseHeader = "X-Tailscale-Response"
)
// serveNoContent answers every request with 204 No Content; it exists for
// captive portal detection.
//
// When the request carries a non-empty challenge header that is at most 64
// bytes long and consists solely of characters accepted by isChallengeChar,
// the challenge is echoed back, prefixed with "response ", in the response
// header. Any other challenge is silently ignored.
func serveNoContent(w http.ResponseWriter, r *http.Request) {
	challenge := r.Header.Get(noContentChallengeHeader)
	if challenge != "" && len(challenge) <= 64 {
		acceptable := true
		for _, c := range challenge {
			if !isChallengeChar(c) {
				acceptable = false
				break
			}
		}
		if acceptable {
			w.Header().Set(noContentResponseHeader, "response "+challenge)
		}
	}
	w.WriteHeader(http.StatusNoContent)
}
// isChallengeChar reports whether c may appear in a captive-portal challenge:
// an ASCII letter, an ASCII digit, or one of '.', '-', '_'.
// The set is a semi-randomly chosen, deliberately small whitelist.
func isChallengeChar(c rune) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	case '0' <= c && c <= '9':
		return true
	}
	switch c {
	case '.', '-', '_':
		return true
	}
	return false
}
// probeHandler serves the endpoint that js/wasm clients use to measure DERP
// latency, because they cannot send UDP STUN queries.
//
// HEAD and GET requests get an empty response that allows any origin; every
// other method is rejected with 405 Method Not Allowed.
func probeHandler(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodHead && r.Method != http.MethodGet {
		http.Error(w, "bogus probe method", http.StatusMethodNotAllowed)
		return
	}
	w.Header().Set("Access-Control-Allow-Origin", "*")
}
// validProdHostname matches "derp", followed by any run of non-dot
// characters, followed by ".tailscale.com" with an optional trailing dot.
var validProdHostname = regexp.MustCompile(`^derp([^.]*)\.tailscale\.com\.?$`)

// prodAutocertHostPolicy is a host policy that accepts only hostnames
// matching validProdHostname. It returns nil for an accepted host and an
// "invalid hostname" error otherwise. The context argument is unused.
func prodAutocertHostPolicy(_ context.Context, host string) error {
	if !validProdHostname.MatchString(host) {
		return errors.New("invalid hostname")
	}
	return nil
}
// defaultMeshPSKFile returns the default value for the -mesh-psk-file flag:
// the first of the well-known key locations that can be stat'ed, or "" when
// none of them exists. The fixed /home/derp location is checked before the
// one under $HOME.
func defaultMeshPSKFile() string {
	candidates := [...]string{
		"/home/derp/keys/derp-mesh.key",
		filepath.Join(os.Getenv("HOME"), "keys", "derp-mesh.key"),
	}
	for i := range candidates {
		if _, statErr := os.Stat(candidates[i]); statErr != nil {
			continue
		}
		return candidates[i]
	}
	return ""
}
func rateLimitedListenAndServeTLS(srv *http.Server) error {
ln, err := net.Listen("tcp", cmp.Or(srv.Addr, ":https"))
if err != nil {
return err
}
rln := newRateLimitedListener(ln, rate.Limit(*acceptConnLimit), *acceptConnBurst)
expvar.Publish("tls_listener", rln.ExpVar())
defer rln.Close()
return srv.ServeTLS(rln, "", "")
}
// rateLimitedListener wraps a net.Listener and consults a rate.Limiter for
// every accepted connection. Connections over the limit are closed by Accept;
// both outcomes are counted.
type rateLimitedListener struct {
// These are at the start of the struct to ensure 64-bit alignment
// on 32-bit architecture regardless of what other fields may exist
// in this package.
numAccepts expvar.Int // does not include number of rejects
numRejects expvar.Int
// The embedded listener supplies every method other than Accept.
net.Listener
// lim is consulted once per accepted connection.
lim *rate.Limiter
}
// newRateLimitedListener wraps ln so that accepted connections are subject to
// a limiter built from the given rate limit and burst size.
func newRateLimitedListener(ln net.Listener, limit rate.Limit, burst int) *rateLimitedListener {
	rl := new(rateLimitedListener)
	rl.Listener = ln
	rl.lim = rate.NewLimiter(limit, burst)
	return rl
}
// ExpVar returns a metrics set exposing the listener's accepted and rejected
// connection counters.
func (l *rateLimitedListener) ExpVar() expvar.Var {
	counters := &metrics.Set{}
	counters.Set("counter_accepted_connections", &l.numAccepts)
	counters.Set("counter_rejected_connections", &l.numRejects)
	return counters
}
// errLimitedConn describes a connection rejected by the rate limiter.
// Accept no longer returns it (see Accept for why); the declaration is kept
// in case other code in this package still refers to it.
var errLimitedConn = errors.New("cannot accept connection; rate limited")

// Accept returns the next connection that the rate limiter allows.
//
// Even under a rate limited situation, we accept the connection immediately
// and close it, rather than being slow at accepting new connections.
// This provides two benefits: 1) it signals to the client that something
// is going on on the server, and 2) it prevents new connections from
// piling up and occupying resources in the OS kernel.
// The client will retry as needing (with backoffs in place).
//
// A rejected connection is closed and counted, and Accept then waits for the
// next one instead of returning an error. net/http's Server.Serve treats any
// Accept error that is not a temporary net.Error as fatal and stops serving,
// so returning errLimitedConn here would shut the whole server down on the
// first rate-limited connection. Errors from the underlying listener are
// still returned unchanged.
func (l *rateLimitedListener) Accept() (net.Conn, error) {
	for {
		cn, err := l.Listener.Accept()
		if err != nil {
			return nil, err
		}
		if l.lim.Allow() {
			l.numAccepts.Add(1)
			return cn, nil
		}
		l.numRejects.Add(1)
		cn.Close()
	}
}
// logFilter is an io.Writer for net/http.Server.ErrorLog that drops
// uninteresting error lines and forwards the rest to the standard logger.
type logFilter struct{}

// Write discards p when it ends with one of a fixed set of noisy suffixes
// and otherwise logs it. It always reports the whole of p as written.
func (logFilter) Write(p []byte) (int, error) {
	line := mem.B(p)
	for _, noise := range [...]string{
		": EOF\n",
		": i/o timeout\n",
		": read: connection reset by peer\n",
		": remote error: tls: bad certificate\n",
		": tls: first record does not look like a TLS handshake\n",
	} {
		if mem.HasSuffix(line, mem.S(noise)) {
			// Skip this log message, but say that we processed it.
			return len(p), nil
		}
	}
	log.Printf("%s", p)
	return len(p), nil
}