ssh/tailssh: handle dialing multiple recorders and failing open

This adds support to try dialing out to multiple recorders each
with a 5s timeout and an overall 30s timeout. It also starts respecting
the actions `OnRecordingFailure` field if set, if it is not set
it fails open.

Updates tailscale/corp#9967

Signed-off-by: Maisem Ali <maisem@tailscale.com>
This commit is contained in:
Maisem Ali 2023-04-19 21:33:33 -07:00 committed by Maisem Ali
parent f66ddb544c
commit 7778d708a6
3 changed files with 259 additions and 76 deletions

View File

@ -17,6 +17,7 @@
"io" "io"
"net" "net"
"net/http" "net/http"
"net/http/httptrace"
"net/netip" "net/netip"
"net/url" "net/url"
"os" "os"
@ -42,6 +43,7 @@
"tailscale.com/types/netmap" "tailscale.com/types/netmap"
"tailscale.com/util/clientmetric" "tailscale.com/util/clientmetric"
"tailscale.com/util/mak" "tailscale.com/util/mak"
"tailscale.com/util/multierr"
"tailscale.com/version/distro" "tailscale.com/version/distro"
) )
@ -79,33 +81,11 @@ type server struct {
// mu protects the following // mu protects the following
mu sync.Mutex mu sync.Mutex
httpc *http.Client // for calling out to peers.
activeConns map[*conn]bool // set; value is always true activeConns map[*conn]bool // set; value is always true
fetchPublicKeysCache map[string]pubKeyCacheEntry // by https URL fetchPublicKeysCache map[string]pubKeyCacheEntry // by https URL
shutdownCalled bool shutdownCalled bool
} }
// sessionRecordingClient returns an http.Client that uses srv.lb.Dialer() to
// dial connections. This is used to make requests to the session recording
// server to upload session recordings.
func (srv *server) sessionRecordingClient() *http.Client {
srv.mu.Lock()
defer srv.mu.Unlock()
if srv.httpc != nil {
return srv.httpc
}
tr := http.DefaultTransport.(*http.Transport).Clone()
tr.DialContext = func(ctx context.Context, network, addr string) (net.Conn, error) {
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
return srv.lb.Dialer().UserDial(ctx, network, addr)
}
srv.httpc = &http.Client{
Transport: tr,
}
return srv.httpc
}
func (srv *server) now() time.Time { func (srv *server) now() time.Time {
if srv != nil && srv.timeNow != nil { if srv != nil && srv.timeNow != nil {
return srv.timeNow() return srv.timeNow()
@ -1078,7 +1058,7 @@ func (ss *sshSession) run() {
if err != nil { if err != nil {
var uve userVisibleError var uve userVisibleError
if errors.As(err, &uve) { if errors.As(err, &uve) {
fmt.Fprintf(ss, "%s\r\n", uve) fmt.Fprintf(ss, "%s\r\n", uve.SSHTerminationMessage())
} else { } else {
fmt.Fprintf(ss, "can't start new recording\r\n") fmt.Fprintf(ss, "can't start new recording\r\n")
} }
@ -1086,7 +1066,9 @@ func (ss *sshSession) run() {
ss.Exit(1) ss.Exit(1)
return return
} }
defer rec.Close() if rec != nil {
defer rec.Close()
}
} }
} }
@ -1169,15 +1151,16 @@ func (ss *sshSession) run() {
// If the final action has a non-empty list of recorders, that list is // If the final action has a non-empty list of recorders, that list is
// returned. Otherwise, the list of recorders from the initial action // returned. Otherwise, the list of recorders from the initial action
// is returned. // is returned.
func (ss *sshSession) recorders() []netip.AddrPort { func (ss *sshSession) recorders() ([]netip.AddrPort, *tailcfg.SSHRecorderFailureAction) {
if len(ss.conn.finalAction.Recorders) > 0 { if len(ss.conn.finalAction.Recorders) > 0 {
return ss.conn.finalAction.Recorders return ss.conn.finalAction.Recorders, ss.conn.finalAction.OnRecordingFailure
} }
return ss.conn.action0.Recorders return ss.conn.action0.Recorders, ss.conn.action0.OnRecordingFailure
} }
func (ss *sshSession) shouldRecord() bool { func (ss *sshSession) shouldRecord() bool {
return len(ss.recorders()) > 0 recs, _ := ss.recorders()
return len(recs) > 0
} }
type sshConnInfo struct { type sshConnInfo struct {
@ -1409,16 +1392,120 @@ type CastHeader struct {
LocalUser string `json:"localUser"` LocalUser string `json:"localUser"`
} }
// sessionRecordingClient returns an http.Client that uses srv.lb.Dialer() to
// dial connections. This is used to make requests to the session recording
// server to upload session recordings.
// It uses the provided dialCtx to dial connections, and limits a single dial
// to 5 seconds.
func (ss *sshSession) sessionRecordingClient(dialCtx context.Context) (*http.Client, error) {
dialer := ss.conn.srv.lb.Dialer()
if dialer == nil {
return nil, errors.New("no peer API transport")
}
tr := dialer.PeerAPITransport().Clone()
dialContextFn := tr.DialContext
tr.DialContext = func(ctx context.Context, network, addr string) (net.Conn, error) {
perAttemptCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
go func() {
select {
case <-perAttemptCtx.Done():
case <-dialCtx.Done():
cancel()
}
}()
return dialContextFn(perAttemptCtx, network, addr)
}
return &http.Client{
Transport: tr,
}, nil
}
// connectToRecorder connects to the recorder at any of the provided addresses.
// It returns the first successful response, or a multierr if all attempts fail.
//
// On success, it returns a WriteCloser that can be used to upload the
// recording, and a channel that will be sent an error (or nil) when the upload
// fails or completes.
func (ss *sshSession) connectToRecorder(ctx context.Context, recs []netip.AddrPort) (io.WriteCloser, <-chan error, error) {
if len(recs) == 0 {
return nil, nil, errors.New("no recorders configured")
}
// We use a special context for dialing the recorder, so that we can
// limit the time we spend dialing to 30 seconds and still have an
// unbounded context for the upload.
dialCtx, dialCancel := context.WithTimeout(ctx, 30*time.Second)
defer dialCancel()
hc, err := ss.sessionRecordingClient(dialCtx)
if err != nil {
return nil, nil, err
}
var errs []error
for _, ap := range recs {
// We dial the recorder and wait for it to send a 100-continue
// response before returning from this function. This ensures that
// the recorder is ready to accept the recording.
// got100 is closed when we receive the 100-continue response.
got100 := make(chan struct{})
ctx = httptrace.WithClientTrace(ctx, &httptrace.ClientTrace{
Got100Continue: func() {
close(got100)
},
})
pr, pw := io.Pipe()
req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://%s:%d/record", ap.Addr(), ap.Port()), pr)
if err != nil {
errs = append(errs, fmt.Errorf("recording: error starting recording: %w", err))
continue
}
// We set the Expect header to 100-continue, so that the recorder
// will send a 100-continue response before it starts reading the
// request body.
req.Header.Set("Expect", "100-continue")
// errChan is used to indicate the result of the request.
errChan := make(chan error, 1)
go func() {
resp, err := hc.Do(req)
if err != nil {
errChan <- fmt.Errorf("recording: error starting recording: %w", err)
return
}
if resp.StatusCode != 200 {
errChan <- fmt.Errorf("recording: unexpected status: %v", resp.Status)
return
}
errChan <- nil
}()
select {
case <-got100:
case err := <-errChan:
// If we get an error before we get the 100-continue response,
// we need to try another recorder.
if err == nil {
// If the error is nil, we got a 200 response, which
// is unexpected as we haven't sent any data yet.
err = errors.New("recording: unexpected EOF")
}
errs = append(errs, err)
continue
}
return pw, errChan, nil
}
return nil, nil, multierr.New(errs...)
}
// startNewRecording starts a new SSH session recording. // startNewRecording starts a new SSH session recording.
// It may return a nil recording if recording is not available.
func (ss *sshSession) startNewRecording() (_ *recording, err error) { func (ss *sshSession) startNewRecording() (_ *recording, err error) {
recorders := ss.recorders() recorders, onFailure := ss.recorders()
if len(recorders) == 0 { if len(recorders) == 0 {
return nil, errors.New("no recorders configured") return nil, errors.New("no recorders configured")
} }
recorder := recorders[0]
if len(recorders) > 1 {
ss.logf("warning: multiple recorders configured, using first one: %v", recorder)
}
var w ssh.Window var w ssh.Window
if ptyReq, _, isPtyReq := ss.Pty(); isPtyReq { if ptyReq, _, isPtyReq := ss.Pty(); isPtyReq {
@ -1436,51 +1523,43 @@ func (ss *sshSession) startNewRecording() (_ *recording, err error) {
start: now, start: now,
} }
pr, pw := io.Pipe()
// We want to use a background context for uploading and not ss.ctx. // We want to use a background context for uploading and not ss.ctx.
// ss.ctx is closed when the session closes, but we don't want to break the upload at that time. // ss.ctx is closed when the session closes, but we don't want to break the upload at that time.
// Instead we want to wait for the session to close the writer when it finishes. // Instead we want to wait for the session to close the writer when it finishes.
ctx := context.Background() ctx := context.Background()
req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://%s:%d/record", recorder.Addr(), recorder.Port()), pr) wc, errChan, err := ss.connectToRecorder(ctx, recorders)
if err != nil { if err != nil {
pr.Close() // TODO(catzkorn): notify control here.
pw.Close() if onFailure != nil && onFailure.RejectSessionWithMessage != "" {
return nil, err ss.logf("recording: error starting recording (rejecting session): %v", err)
} return nil, userVisibleError{
// We want to wait for the server to respond with 100 Continue to notifiy us
// that it's ready to receive data. We do this to block the session from
// starting until the server is ready to receive data.
// It also allows the server to reject the request before we start sending
// data.
req.Header.Set("Expect", "100-continue")
go func() {
defer pw.Close()
ss.logf("starting asciinema recording to %s", recorder)
hc := ss.conn.srv.sessionRecordingClient()
resp, err := hc.Do(req)
if err != nil {
err := fmt.Errorf("recording: error sending recording: %w", err)
ss.logf("%v", err)
ss.cancelCtx(userVisibleError{
msg: "recording: error sending recording",
error: err, error: err,
msg: onFailure.RejectSessionWithMessage,
}
}
ss.logf("recording: error starting recording (failing open): %v", err)
return nil, nil
}
go func() {
err := <-errChan
if err == nil {
// Success.
return
}
// TODO(catzkorn): notify control here.
if onFailure != nil && onFailure.TerminateSessionWithMessage != "" {
ss.logf("recording: error uploading recording (closing session): %v", err)
ss.cancelCtx(userVisibleError{
error: err,
msg: onFailure.TerminateSessionWithMessage,
}) })
return return
} }
defer resp.Body.Close() ss.logf("recording: error uploading recording (failing open): %v", err)
defer ss.cancelCtx(errors.New("recording: done"))
if resp.StatusCode != http.StatusOK {
err := fmt.Errorf("recording: server responded with %s", resp.Status)
ss.logf("%v", err)
ss.cancelCtx(userVisibleError{
msg: "recording server responded with: " + resp.Status,
error: err,
})
}
}() }()
rec.out = pw rec.out = wc
ch := CastHeader{ ch := CastHeader{
Version: 2, Version: 2,
@ -1515,7 +1594,7 @@ func (ss *sshSession) startNewRecording() (_ *recording, err error) {
return nil, err return nil, err
} }
j = append(j, '\n') j = append(j, '\n')
if _, err := pw.Write(j); err != nil { if _, err := rec.out.Write(j); err != nil {
if errors.Is(err, io.ErrClosedPipe) && ss.ctx.Err() != nil { if errors.Is(err, io.ErrClosedPipe) && ss.ctx.Err() != nil {
// If we got an io.ErrClosedPipe, it's likely because // If we got an io.ErrClosedPipe, it's likely because
// the recording server closed the connection on us. Return // the recording server closed the connection on us. Return

View File

@ -240,7 +240,7 @@ type localState struct {
) )
func (ts *localState) Dialer() *tsdial.Dialer { func (ts *localState) Dialer() *tsdial.Dialer {
return nil return &tsdial.Dialer{}
} }
func (ts *localState) GetSSH_HostKeys() ([]gossh.Signer, error) { func (ts *localState) GetSSH_HostKeys() ([]gossh.Signer, error) {
@ -338,8 +338,7 @@ func TestSSHRecordingCancelsSessionsOnUploadFailure(t *testing.T) {
defer recordingServer.Close() defer recordingServer.Close()
s := &server{ s := &server{
logf: t.Logf, logf: t.Logf,
httpc: recordingServer.Client(),
lb: &localState{ lb: &localState{
sshEnabled: true, sshEnabled: true,
matchingRule: newSSHRule( matchingRule: newSSHRule(
@ -348,6 +347,10 @@ func TestSSHRecordingCancelsSessionsOnUploadFailure(t *testing.T) {
Recorders: []netip.AddrPort{ Recorders: []netip.AddrPort{
netip.MustParseAddrPort(recordingServer.Listener.Addr().String()), netip.MustParseAddrPort(recordingServer.Listener.Addr().String()),
}, },
OnRecordingFailure: &tailcfg.SSHRecorderFailureAction{
RejectSessionWithMessage: "session rejected",
TerminateSessionWithMessage: "session terminated",
},
}, },
), ),
}, },
@ -374,7 +377,7 @@ func TestSSHRecordingCancelsSessionsOnUploadFailure(t *testing.T) {
w.WriteHeader(http.StatusForbidden) w.WriteHeader(http.StatusForbidden)
}, },
sshCommand: "echo hello", sshCommand: "echo hello",
wantClientOutput: "recording: server responded with 403 Forbidden\r\n", wantClientOutput: "session rejected\r\n",
clientOutputMustNotContain: []string{"hello"}, clientOutputMustNotContain: []string{"hello"},
}, },
@ -386,7 +389,7 @@ func TestSSHRecordingCancelsSessionsOnUploadFailure(t *testing.T) {
w.WriteHeader(http.StatusInternalServerError) w.WriteHeader(http.StatusInternalServerError)
}, },
sshCommand: "echo hello && sleep 1 && echo world", sshCommand: "echo hello && sleep 1 && echo world",
wantClientOutput: "\r\n\r\nrecording server responded with: 500 Internal Server Error\r\n\r\n", wantClientOutput: "\r\n\r\nsession terminated\r\n\r\n",
clientOutputMustNotContain: []string{"world"}, clientOutputMustNotContain: []string{"world"},
}, },
@ -440,6 +443,103 @@ func TestSSHRecordingCancelsSessionsOnUploadFailure(t *testing.T) {
} }
} }
func TestMultipleRecorders(t *testing.T) {
if runtime.GOOS != "linux" && runtime.GOOS != "darwin" {
t.Skipf("skipping on %q; only runs on linux and darwin", runtime.GOOS)
}
done := make(chan struct{})
recordingServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
defer close(done)
io.ReadAll(r.Body)
w.WriteHeader(http.StatusOK)
}))
defer recordingServer.Close()
badRecorder, err := net.Listen("tcp", ":0")
if err != nil {
t.Fatal(err)
}
badRecorderAddr := badRecorder.Addr().String()
badRecorder.Close()
badRecordingServer500 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(500)
}))
defer badRecordingServer500.Close()
badRecordingServer200 := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
}))
defer badRecordingServer200.Close()
s := &server{
logf: t.Logf,
lb: &localState{
sshEnabled: true,
matchingRule: newSSHRule(
&tailcfg.SSHAction{
Accept: true,
Recorders: []netip.AddrPort{
netip.MustParseAddrPort(badRecorderAddr),
netip.MustParseAddrPort(badRecordingServer500.Listener.Addr().String()),
netip.MustParseAddrPort(badRecordingServer200.Listener.Addr().String()),
netip.MustParseAddrPort(recordingServer.Listener.Addr().String()),
},
OnRecordingFailure: &tailcfg.SSHRecorderFailureAction{
RejectSessionWithMessage: "session rejected",
TerminateSessionWithMessage: "session terminated",
},
},
),
},
}
defer s.Shutdown()
src, dst := must.Get(netip.ParseAddrPort("100.100.100.101:2231")), must.Get(netip.ParseAddrPort("100.100.100.102:22"))
sc, dc := memnet.NewTCPConn(src, dst, 1024)
const sshUser = "alice"
cfg := &gossh.ClientConfig{
User: sshUser,
HostKeyCallback: gossh.InsecureIgnoreHostKey(),
}
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
c, chans, reqs, err := gossh.NewClientConn(sc, sc.RemoteAddr().String(), cfg)
if err != nil {
t.Errorf("client: %v", err)
return
}
client := gossh.NewClient(c, chans, reqs)
defer client.Close()
session, err := client.NewSession()
if err != nil {
t.Errorf("client: %v", err)
return
}
defer session.Close()
t.Logf("client established session")
out, err := session.CombinedOutput("echo Ran echo!")
if err != nil {
t.Errorf("client: %v", err)
}
if string(out) != "Ran echo!\n" {
t.Errorf("client: unexpected output: %q", out)
}
}()
if err := s.HandleSSHConn(dc); err != nil {
t.Errorf("unexpected error: %v", err)
}
wg.Wait()
select {
case <-done:
case <-time.After(1 * time.Second):
t.Fatal("timed out waiting for recording")
}
}
// TestSSHRecordingNonInteractive tests that the SSH server records the SSH session // TestSSHRecordingNonInteractive tests that the SSH server records the SSH session
// when the client is not interactive (i.e. no PTY). // when the client is not interactive (i.e. no PTY).
// It starts a local SSH server and a recording server. The recording server // It starts a local SSH server and a recording server. The recording server
@ -464,8 +564,7 @@ func TestSSHRecordingNonInteractive(t *testing.T) {
defer recordingServer.Close() defer recordingServer.Close()
s := &server{ s := &server{
logf: logger.Discard, logf: logger.Discard,
httpc: recordingServer.Client(),
lb: &localState{ lb: &localState{
sshEnabled: true, sshEnabled: true,
matchingRule: newSSHRule( matchingRule: newSSHRule(
@ -474,6 +573,10 @@ func TestSSHRecordingNonInteractive(t *testing.T) {
Recorders: []netip.AddrPort{ Recorders: []netip.AddrPort{
must.Get(netip.ParseAddrPort(recordingServer.Listener.Addr().String())), must.Get(netip.ParseAddrPort(recordingServer.Listener.Addr().String())),
}, },
OnRecordingFailure: &tailcfg.SSHRecorderFailureAction{
RejectSessionWithMessage: "session rejected",
TerminateSessionWithMessage: "session terminated",
},
}, },
), ),
}, },

View File

@ -97,7 +97,8 @@
// - 58: 2023-03-10: Client retries lite map updates before restarting map poll. // - 58: 2023-03-10: Client retries lite map updates before restarting map poll.
// - 59: 2023-03-16: Client understands Peers[].SelfNodeV4MasqAddrForThisPeer // - 59: 2023-03-16: Client understands Peers[].SelfNodeV4MasqAddrForThisPeer
// - 60: 2023-04-06: Client understands IsWireGuardOnly // - 60: 2023-04-06: Client understands IsWireGuardOnly
const CurrentCapabilityVersion CapabilityVersion = 60 // - 61: 2023-04-18: Client understand SSHAction.SSHRecorderFailureAction
const CurrentCapabilityVersion CapabilityVersion = 61
type StableID string type StableID string