chirp: add a 10s timeout when communicating with BIRD (#5444)

Prior to this change, if BIRD stops responding wgengine.watchdogEngine
will crash tailscaled.

This happens because in wgengine.userspaceEngine, we end up blocking
forever trying to write a request to or read a response from BIRD with
wgLock held, and then future watchdog'd calls will block on acquiring
that mutex until the watchdog kills the process. With the timeout, we at
least get the chance to print an error message and decide whether we
want to crash or not.

Updates tailscale/coral#72

Signed-off-by: Andrew Dunham <andrew@tailscale.com>

Signed-off-by: Andrew Dunham <andrew@tailscale.com>
This commit is contained in:
Andrew Dunham
2022-08-27 20:49:31 -04:00
committed by GitHub
parent 9bf13fc3d1
commit e1738ea78e
2 changed files with 117 additions and 6 deletions

View File

@@ -8,9 +8,12 @@ import (
"errors"
"fmt"
"net"
"os"
"path/filepath"
"strings"
"sync"
"testing"
"time"
)
type fakeBIRD struct {
@@ -109,3 +112,82 @@ func TestChirp(t *testing.T) {
t.Fatalf("disabling %q succeded", "rando")
}
}
type hangingListener struct {
net.Listener
t *testing.T
done chan struct{}
wg sync.WaitGroup
sock string
}
func newHangingListener(t *testing.T) *hangingListener {
sock := filepath.Join(t.TempDir(), "sock")
l, err := net.Listen("unix", sock)
if err != nil {
t.Fatal(err)
}
return &hangingListener{
Listener: l,
t: t,
done: make(chan struct{}),
sock: sock,
}
}
func (hl *hangingListener) Stop() {
hl.Close()
close(hl.done)
hl.wg.Wait()
}
func (hl *hangingListener) listen() error {
for {
c, err := hl.Accept()
if err != nil {
if errors.Is(err, net.ErrClosed) {
return nil
}
return err
}
hl.wg.Add(1)
go hl.handle(c)
}
}
func (hl *hangingListener) handle(c net.Conn) {
defer hl.wg.Done()
// Write our fake first line of response so that we get into the read loop
fmt.Fprintln(c, "0001 BIRD 2.0.8 ready.")
ticker := time.NewTicker(2 * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
hl.t.Logf("connection still hanging")
case <-hl.done:
return
}
}
}
func TestChirpTimeout(t *testing.T) {
fb := newHangingListener(t)
defer fb.Stop()
go fb.listen()
c, err := newWithTimeout(fb.sock, 500*time.Millisecond)
if err != nil {
t.Fatal(err)
}
err = c.EnableProtocol("tailscale")
if err == nil {
t.Fatal("got err=nil, want timeout")
}
if !os.IsTimeout(err) {
t.Fatalf("got err=%v, want os.IsTimeout(err)=true", err)
}
}