chirp: add a 10s timeout when communicating with BIRD (#5444)

Prior to this change, if BIRD stops responding wgengine.watchdogEngine
will crash tailscaled.

This happens because in wgengine.userspaceEngine, we end up blocking
forever trying to write a request to or read a response from BIRD with
wgLock held, and then future watchdog'd calls will block on acquiring
that mutex until the watchdog kills the process. With the timeout, we at
least get the chance to print an error message and decide whether we
want to crash or not.

Updates tailscale/coral#72

Signed-off-by: Andrew Dunham <andrew@tailscale.com>

Signed-off-by: Andrew Dunham <andrew@tailscale.com>
This commit is contained in:
Andrew Dunham 2022-08-27 20:49:31 -04:00 committed by GitHub
parent 9bf13fc3d1
commit e1738ea78e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 117 additions and 6 deletions

View File

@ -11,15 +11,31 @@
"fmt" "fmt"
"net" "net"
"strings" "strings"
"time"
)
const (
// Maximum amount of time we should wait when reading a response from BIRD.
responseTimeout = 10 * time.Second
) )
// New creates a BIRDClient. // New creates a BIRDClient.
func New(socket string) (*BIRDClient, error) { func New(socket string) (*BIRDClient, error) {
return newWithTimeout(socket, responseTimeout)
}
func newWithTimeout(socket string, timeout time.Duration) (*BIRDClient, error) {
conn, err := net.Dial("unix", socket) conn, err := net.Dial("unix", socket)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to connect to BIRD: %w", err) return nil, fmt.Errorf("failed to connect to BIRD: %w", err)
} }
b := &BIRDClient{socket: socket, conn: conn, scanner: bufio.NewScanner(conn)} b := &BIRDClient{
socket: socket,
conn: conn,
scanner: bufio.NewScanner(conn),
timeNow: time.Now,
timeout: timeout,
}
// Read and discard the first line as that is the welcome message. // Read and discard the first line as that is the welcome message.
if _, err := b.readResponse(); err != nil { if _, err := b.readResponse(); err != nil {
return nil, err return nil, err
@ -32,6 +48,8 @@ type BIRDClient struct {
socket string socket string
conn net.Conn conn net.Conn
scanner *bufio.Scanner scanner *bufio.Scanner
timeNow func() time.Time
timeout time.Duration
} }
// Close closes the underlying connection to BIRD. // Close closes the underlying connection to BIRD.
@ -81,10 +99,15 @@ func (b *BIRDClient) EnableProtocol(protocol string) error {
// 1 means table entry, 8 runtime error and 9 syntax error. // 1 means table entry, 8 runtime error and 9 syntax error.
func (b *BIRDClient) exec(cmd string, args ...any) (string, error) { func (b *BIRDClient) exec(cmd string, args ...any) (string, error) {
if err := b.conn.SetWriteDeadline(b.timeNow().Add(b.timeout)); err != nil {
return "", err
}
if _, err := fmt.Fprintf(b.conn, cmd, args...); err != nil { if _, err := fmt.Fprintf(b.conn, cmd, args...); err != nil {
return "", err return "", err
} }
fmt.Fprintln(b.conn) if _, err := fmt.Fprintln(b.conn); err != nil {
return "", err
}
return b.readResponse() return b.readResponse()
} }
@ -105,14 +128,20 @@ func hasResponseCode(s []byte) bool {
} }
func (b *BIRDClient) readResponse() (string, error) { func (b *BIRDClient) readResponse() (string, error) {
// Set the read timeout before we start reading anything.
if err := b.conn.SetReadDeadline(b.timeNow().Add(b.timeout)); err != nil {
return "", err
}
var resp strings.Builder var resp strings.Builder
var done bool var done bool
for !done { for !done {
if !b.scanner.Scan() { if !b.scanner.Scan() {
return "", fmt.Errorf("reading response from bird failed: %q", resp.String()) if err := b.scanner.Err(); err != nil {
} return "", err
if err := b.scanner.Err(); err != nil { }
return "", err
return "", fmt.Errorf("reading response from bird failed (EOF): %q", resp.String())
} }
out := b.scanner.Bytes() out := b.scanner.Bytes()
if _, err := resp.Write(out); err != nil { if _, err := resp.Write(out); err != nil {

View File

@ -8,9 +8,12 @@
"errors" "errors"
"fmt" "fmt"
"net" "net"
"os"
"path/filepath" "path/filepath"
"strings" "strings"
"sync"
"testing" "testing"
"time"
) )
type fakeBIRD struct { type fakeBIRD struct {
@ -109,3 +112,82 @@ func TestChirp(t *testing.T) {
t.Fatalf("disabling %q succeded", "rando") t.Fatalf("disabling %q succeded", "rando")
} }
} }
type hangingListener struct {
net.Listener
t *testing.T
done chan struct{}
wg sync.WaitGroup
sock string
}
func newHangingListener(t *testing.T) *hangingListener {
sock := filepath.Join(t.TempDir(), "sock")
l, err := net.Listen("unix", sock)
if err != nil {
t.Fatal(err)
}
return &hangingListener{
Listener: l,
t: t,
done: make(chan struct{}),
sock: sock,
}
}
func (hl *hangingListener) Stop() {
hl.Close()
close(hl.done)
hl.wg.Wait()
}
func (hl *hangingListener) listen() error {
for {
c, err := hl.Accept()
if err != nil {
if errors.Is(err, net.ErrClosed) {
return nil
}
return err
}
hl.wg.Add(1)
go hl.handle(c)
}
}
func (hl *hangingListener) handle(c net.Conn) {
defer hl.wg.Done()
// Write our fake first line of response so that we get into the read loop
fmt.Fprintln(c, "0001 BIRD 2.0.8 ready.")
ticker := time.NewTicker(2 * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
hl.t.Logf("connection still hanging")
case <-hl.done:
return
}
}
}
func TestChirpTimeout(t *testing.T) {
fb := newHangingListener(t)
defer fb.Stop()
go fb.listen()
c, err := newWithTimeout(fb.sock, 500*time.Millisecond)
if err != nil {
t.Fatal(err)
}
err = c.EnableProtocol("tailscale")
if err == nil {
t.Fatal("got err=nil, want timeout")
}
if !os.IsTimeout(err) {
t.Fatalf("got err=%v, want os.IsTimeout(err)=true", err)
}
}