Mirror of https://github.com/tailscale/tailscale.git, synced 2025-04-16 03:31:39 +00:00
control/controlclient, ipn: add client audit logging (#14950)
Updates tailscale/corp#26435

Adds client support for sending audit logs to control via /machine/audit-log. Specifically, implements audit logging for user-initiated disconnections.

This will require further work to optimize the persistent storage and to exclude it via build tags on mobile: tailscale/corp#27011, tailscale/corp#27012

Signed-off-by: Jonathan Nobels <jonathan@tailscale.com>
Parent: 06ae52d309
Commit: 52710945f5
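For orientation, here is a minimal, hypothetical sketch of how the new ipn/auditlog API introduced by this commit is meant to be driven. In tailscaled, LocalBackend wires controlclient.Auto as the Transport and a persistent LogStore from tsd.System; this sketch substitutes a stub transport and the in-memory store, so everything outside the auditlog/tailcfg calls is illustrative only.

package main

import (
	"context"
	"log"
	"time"

	"tailscale.com/ipn/auditlog"
	"tailscale.com/ipn/store/mem"
	"tailscale.com/tailcfg"
)

// stubTransport stands in for controlclient.Auto, which implements
// auditlog.Transport in this commit by POSTing to /machine/audit-log.
type stubTransport struct{}

func (stubTransport) SendAuditLog(ctx context.Context, req tailcfg.AuditLogRequest) error {
	log.Printf("would send audit log: action=%s details=%q", req.Action, req.Details)
	return nil
}

func main() {
	logger := auditlog.NewLogger(auditlog.Opts{
		Logf:       log.Printf,
		RetryLimit: 32,                                  // the limit LocalBackend uses in this commit
		Store:      auditlog.NewLogStore(&mem.Store{}), // in-memory; real clients should supply persistent state
	})

	// The profile must be set before logs can be enqueued, and nothing is
	// sent until Start is called with a transport.
	if err := logger.SetProfileID("profile-1"); err != nil {
		log.Fatal(err)
	}
	if err := logger.Start(stubTransport{}); err != nil {
		log.Fatal(err)
	}

	// Queue a user-initiated disconnection; it is persisted, then flushed
	// asynchronously with retries for transient failures.
	if err := logger.Enqueue(tailcfg.AuditNodeDisconnect, "disconnected by example user"); err != nil {
		log.Fatal(err)
	}

	// Flush whatever is pending and stop the worker before shutdown.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	logger.FlushAndStop(ctx)
}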
@@ -814,6 +814,7 @@ tailscale.com/cmd/k8s-operator dependencies: (generated by github.com/tailscale/
    tailscale.com/internal/client/tailscale from tailscale.com/cmd/k8s-operator
    tailscale.com/internal/noiseconn from tailscale.com/control/controlclient
    tailscale.com/ipn from tailscale.com/client/local+
    tailscale.com/ipn/auditlog from tailscale.com/ipn/ipnlocal+
    tailscale.com/ipn/conffile from tailscale.com/ipn/ipnlocal+
    💣 tailscale.com/ipn/desktop from tailscale.com/ipn/ipnlocal+
    💣 tailscale.com/ipn/ipnauth from tailscale.com/ipn/ipnlocal+

@@ -271,6 +271,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
    tailscale.com/hostinfo from tailscale.com/client/web+
    tailscale.com/internal/noiseconn from tailscale.com/control/controlclient
    tailscale.com/ipn from tailscale.com/client/local+
    tailscale.com/ipn/auditlog from tailscale.com/ipn/ipnlocal+
    tailscale.com/ipn/conffile from tailscale.com/cmd/tailscaled+
    💣 tailscale.com/ipn/desktop from tailscale.com/cmd/tailscaled+
    💣 tailscale.com/ipn/ipnauth from tailscale.com/ipn/ipnlocal+
@@ -119,6 +119,7 @@ type Auto struct {
	updateCh      chan struct{} // readable when we should inform the server of a change
	observer      Observer      // called to update Client status; always non-nil
	observerQueue execqueue.ExecQueue
	shutdownFn    func() // to be called prior to shutdown or nil

	unregisterHealthWatch func()

@@ -189,6 +190,7 @@ func NewNoStart(opts Options) (_ *Auto, err error) {
		mapDone:    make(chan struct{}),
		updateDone: make(chan struct{}),
		observer:   opts.Observer,
		shutdownFn: opts.Shutdown,
	}
	c.authCtx, c.authCancel = context.WithCancel(context.Background())
	c.authCtx = sockstats.WithSockStats(c.authCtx, sockstats.LabelControlClientAuto, opts.Logf)

@@ -755,6 +757,7 @@ func (c *Auto) Shutdown() {
		return
	}
	c.logf("client.Shutdown ...")
	shutdownFn := c.shutdownFn

	direct := c.direct
	c.closed = true

@@ -767,6 +770,10 @@ func (c *Auto) Shutdown() {
	c.unpauseWaiters = nil
	c.mu.Unlock()

	if shutdownFn != nil {
		shutdownFn()
	}

	c.unregisterHealthWatch()
	<-c.authDone
	<-c.mapDone
@@ -4,6 +4,8 @@
package controlclient

import (
	"errors"
	"fmt"
	"io"
	"reflect"
	"slices"

@@ -147,3 +149,42 @@ func TestCanSkipStatus(t *testing.T) {
		t.Errorf("Status fields = %q; this code was only written to handle fields %q", f, want)
	}
}

func TestRetryableErrors(t *testing.T) {
	errorTests := []struct {
		err  error
		want bool
	}{
		{errNoNoiseClient, true},
		{errNoNodeKey, true},
		{fmt.Errorf("%w: %w", errNoNoiseClient, errors.New("no noise")), true},
		{fmt.Errorf("%w: %w", errHTTPPostFailure, errors.New("bad post")), true},
		{fmt.Errorf("%w: %w", errNoNodeKey, errors.New("not node key")), true},
		{errBadHTTPResponse(429, "too many requests"), true},
		{errBadHTTPResponse(500, "internal server error"), true},
		{errBadHTTPResponse(502, "bad gateway"), true},
		{errBadHTTPResponse(503, "service unavailable"), true},
		{errBadHTTPResponse(504, "gateway timeout"), true},
		{errBadHTTPResponse(1234, "random error"), false},
	}

	for _, tt := range errorTests {
		t.Run(tt.err.Error(), func(t *testing.T) {
			if isRetryableErrorForTest(tt.err) != tt.want {
				t.Fatalf("retryable: got %v, want %v", tt.err, tt.want)
			}
		})
	}
}

type retryableForTest interface {
	Retryable() bool
}

func isRetryableErrorForTest(err error) bool {
	var ae retryableForTest
	if errors.As(err, &ae) {
		return ae.Retryable()
	}
	return false
}
@@ -156,6 +156,11 @@ type Options struct {
	// If we receive a new DialPlan from the server, this value will be
	// updated.
	DialPlan ControlDialPlanner

	// Shutdown is an optional function that will be called before client shutdown is
	// attempted. It is used to allow the client to clean up any resources or complete any
	// tasks that are dependent on a live client.
	Shutdown func()
}

// ControlDialPlanner is the interface optionally supplied when creating a

@@ -1662,11 +1667,11 @@ func (c *Auto) SetDeviceAttrs(ctx context.Context, attrs tailcfg.AttrUpdate) err
func (c *Direct) SetDeviceAttrs(ctx context.Context, attrs tailcfg.AttrUpdate) error {
	nc, err := c.getNoiseClient()
	if err != nil {
-		return err
+		return fmt.Errorf("%w: %w", errNoNoiseClient, err)
	}
	nodeKey, ok := c.GetPersist().PublicNodeKeyOK()
	if !ok {
-		return errors.New("no node key")
+		return errNoNodeKey
	}
	if c.panicOnUse {
		panic("tainted client")

@@ -1697,6 +1702,47 @@ func (c *Direct) SetDeviceAttrs(ctx context.Context, attrs tailcfg.AttrUpdate) e
	return nil
}

// SendAuditLog implements [auditlog.Transport] by sending an audit log synchronously to the control plane.
//
// See docs on [tailcfg.AuditLogRequest] and [auditlog.Logger] for background.
func (c *Auto) SendAuditLog(ctx context.Context, auditLog tailcfg.AuditLogRequest) (err error) {
	return c.direct.sendAuditLog(ctx, auditLog)
}

func (c *Direct) sendAuditLog(ctx context.Context, auditLog tailcfg.AuditLogRequest) (err error) {
	nc, err := c.getNoiseClient()
	if err != nil {
		return fmt.Errorf("%w: %w", errNoNoiseClient, err)
	}

	nodeKey, ok := c.GetPersist().PublicNodeKeyOK()
	if !ok {
		return errNoNodeKey
	}

	req := &tailcfg.AuditLogRequest{
		Version: tailcfg.CurrentCapabilityVersion,
		NodeKey: nodeKey,
		Action:  auditLog.Action,
		Details: auditLog.Details,
	}

	if c.panicOnUse {
		panic("tainted client")
	}

	res, err := nc.post(ctx, "/machine/audit-log", nodeKey, req)
	if err != nil {
		return fmt.Errorf("%w: %w", errHTTPPostFailure, err)
	}
	defer res.Body.Close()
	if res.StatusCode != 200 {
		all, _ := io.ReadAll(res.Body)
		return errBadHTTPResponse(res.StatusCode, string(all))
	}
	return nil
}

func addLBHeader(req *http.Request, nodeKey key.NodePublic) {
	if !nodeKey.IsZero() {
		req.Header.Add(tailcfg.LBHeader, nodeKey.String())
control/controlclient/errors.go (new file, 51 lines)
@@ -0,0 +1,51 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause

package controlclient

import (
	"errors"
	"fmt"
	"net/http"
)

// apiResponseError is an error type that can be returned by controlclient
// API requests.
//
// It wraps an underlying error and a flag; clients can query whether the
// error is retryable via the Retryable() method.
type apiResponseError struct {
	err       error
	retryable bool
}

// Error implements [error].
func (e *apiResponseError) Error() string {
	return e.err.Error()
}

// Retryable reports whether the error is retryable.
func (e *apiResponseError) Retryable() bool {
	return e.retryable
}

func (e *apiResponseError) Unwrap() error { return e.err }

var (
	errNoNodeKey       = &apiResponseError{errors.New("no node key"), true}
	errNoNoiseClient   = &apiResponseError{errors.New("no noise client"), true}
	errHTTPPostFailure = &apiResponseError{errors.New("http failure"), true}
)

func errBadHTTPResponse(code int, msg string) error {
	retryable := false
	switch code {
	case http.StatusTooManyRequests,
		http.StatusInternalServerError,
		http.StatusBadGateway,
		http.StatusServiceUnavailable,
		http.StatusGatewayTimeout:
		retryable = true
	}
	return &apiResponseError{fmt.Errorf("http error %d: %s", code, msg), retryable}
}
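The retryable flag above is deliberately exposed only through the Retryable() method, so other packages (ipn/auditlog below) can classify these errors with errors.As and a small local interface instead of importing the unexported type. A self-contained sketch of that mechanism; demoErr is illustrative and not part of the commit:

package main

import (
	"errors"
	"fmt"
)

// demoErr mimics controlclient's unexported apiResponseError: an error that
// carries a retryable flag exposed only via a method.
type demoErr struct{ retryable bool }

func (e *demoErr) Error() string   { return "demo error" }
func (e *demoErr) Retryable() bool { return e.retryable }

func main() {
	// Wrapping with %w preserves access to Retryable() through errors.As,
	// which is how auditlog.IsRetryableError classifies transport errors
	// without depending on the concrete controlclient type.
	wrapped := fmt.Errorf("send failed: %w", &demoErr{retryable: true})

	var r interface{ Retryable() bool }
	fmt.Println(errors.As(wrapped, &r) && r.Retryable()) // prints: true
}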
ipn/auditlog/auditlog.go (new file, 466 lines)
@@ -0,0 +1,466 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause

// Package auditlog provides a mechanism for logging audit events.
package auditlog

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"sort"
	"sync"
	"time"

	"tailscale.com/ipn"
	"tailscale.com/tailcfg"
	"tailscale.com/types/logger"
	"tailscale.com/util/rands"
	"tailscale.com/util/set"
)

// transaction represents an audit log that has not yet been sent to the control plane.
type transaction struct {
	// EventID is the unique identifier for the event being logged.
	// This is used on the client side only and is not sent to control.
	EventID string `json:",omitempty"`
	// Retries is the number of times the logger has attempted to send this log.
	// This is used on the client side only and is not sent to control.
	Retries int `json:",omitempty"`

	// Action is the action to be logged. It must correspond to a known action in the control plane.
	Action tailcfg.ClientAuditAction `json:",omitempty"`
	// Details is an opaque string specific to the action being logged. Empty strings may not
	// be valid depending on the action being logged.
	Details string `json:",omitempty"`
	// TimeStamp is the time at which the audit log was generated on the node.
	TimeStamp time.Time `json:",omitzero"`
}

// Transport provides a means for a client to send audit logs to a consumer (typically the control plane).
type Transport interface {
	// SendAuditLog sends an audit log to a consumer of audit logs.
	// Errors should be checked with [IsRetryableError] for retryability.
	SendAuditLog(context.Context, tailcfg.AuditLogRequest) error
}

// LogStore provides a means for a [Logger] to persist logs to disk or memory.
type LogStore interface {
	// save saves the given data to a persistent store, overwriting any
	// existing data for the given key.
	save(key ipn.ProfileID, txns []*transaction) error

	// load retrieves the data from a persistent store. It returns a nil slice and
	// no error if no data exists for the given key.
	load(key ipn.ProfileID) ([]*transaction, error)
}

// Opts contains the configuration options for a [Logger].
type Opts struct {
	// RetryLimit is the maximum number of attempts the logger will make to send a log before giving up.
	RetryLimit int
	// Store is the persistent store used to save logs to disk. Must be non-nil.
	Store LogStore
	// Logf is the logger used to log messages from the audit logger. Must be non-nil.
	Logf logger.Logf
}

// IsRetryableError reports whether the given error is retryable.
// See [controlclient.apiResponseError]. Potentially retryable errors implement the Retryable() method.
func IsRetryableError(err error) bool {
	var retryable interface{ Retryable() bool }
	return errors.As(err, &retryable) && retryable.Retryable()
}

type backoffOpts struct {
	min, max   time.Duration
	multiplier float64
}

// .5, 1, 2, 4, 8, 10, 10, 10, 10, 10...
var defaultBackoffOpts = backoffOpts{
	min:        time.Millisecond * 500,
	max:        10 * time.Second,
	multiplier: 2,
}

// Logger provides a queue-based mechanism for submitting audit logs to the control plane,
// or another suitable consumer. Logs are stored to disk and retried until they are successfully sent,
// or until they permanently fail.
//
// Each individual profile/controlclient tuple should construct and manage a unique [Logger] instance.
type Logger struct {
	logf        logger.Logf
	retryLimit  int                // the maximum number of attempts to send a log before giving up.
	flusher     chan struct{}      // channel used to signal a flush operation.
	done        chan struct{}      // closed when the flush worker exits.
	ctx         context.Context    // canceled when the logger is stopped.
	ctxCancel   context.CancelFunc // cancels ctx.
	backoffOpts                    // backoff settings for retry operations.

	// mu protects the fields below.
	mu        sync.Mutex
	store     LogStore      // persistent storage for unsent logs.
	profileID ipn.ProfileID // empty if [Logger.SetProfileID] has not been called.
	transport Transport     // nil until [Logger.Start] is called.
}

// NewLogger creates a new [Logger] with the given options.
func NewLogger(opts Opts) *Logger {
	ctx, cancel := context.WithCancel(context.Background())

	al := &Logger{
		retryLimit:  opts.RetryLimit,
		logf:        logger.WithPrefix(opts.Logf, "auditlog: "),
		store:       opts.Store,
		flusher:     make(chan struct{}, 1),
		done:        make(chan struct{}),
		ctx:         ctx,
		ctxCancel:   cancel,
		backoffOpts: defaultBackoffOpts,
	}
	al.logf("created")
	return al
}

// FlushAndStop synchronously flushes all pending logs and stops the audit logger.
// This will block until a final flush operation completes or the context is done.
// If the logger is already stopped, this will return immediately. All unsent
// logs will be persisted to the store.
func (al *Logger) FlushAndStop(ctx context.Context) {
	al.stop()
	al.flush(ctx)
}

// SetProfileID sets the profileID for the logger. This must be called before any logs can be enqueued.
// The profileID of a logger cannot be changed once set.
func (al *Logger) SetProfileID(profileID ipn.ProfileID) error {
	al.mu.Lock()
	defer al.mu.Unlock()
	if al.profileID != "" {
		return errors.New("profileID already set")
	}

	al.profileID = profileID
	return nil
}

// Start starts the audit logger with the given transport.
// It returns an error if the logger is already started.
func (al *Logger) Start(t Transport) error {
	al.mu.Lock()
	defer al.mu.Unlock()

	if al.transport != nil {
		return errors.New("already started")
	}

	al.transport = t
	pending, err := al.storedCountLocked()
	if err != nil {
		al.logf("[unexpected] failed to restore logs: %v", err)
	}
	go al.flushWorker()
	if pending > 0 {
		al.flushAsync()
	}
	return nil
}

// ErrAuditLogStorageFailure is returned when the logger fails to persist logs to the store.
var ErrAuditLogStorageFailure = errors.New("audit log storage failure")

// Enqueue queues an audit log to be sent to the control plane (or another suitable consumer/transport).
// This will return an error if the underlying store fails to save the log or we fail to generate a unique
// eventID for the log.
func (al *Logger) Enqueue(action tailcfg.ClientAuditAction, details string) error {
	txn := &transaction{
		Action:    action,
		Details:   details,
		TimeStamp: time.Now(),
	}
	// Generate a suitably random eventID for the transaction.
	txn.EventID = fmt.Sprint(txn.TimeStamp, rands.HexString(16))
	return al.enqueue(txn)
}

// flushAsync requests an asynchronous flush.
// It is a no-op if a flush is already pending.
func (al *Logger) flushAsync() {
	select {
	case al.flusher <- struct{}{}:
	default:
	}
}

func (al *Logger) flushWorker() {
	defer close(al.done)

	var retryDelay time.Duration
	retry := time.NewTimer(0)
	retry.Stop()

	for {
		select {
		case <-al.ctx.Done():
			return
		case <-al.flusher:
			err := al.flush(al.ctx)
			switch {
			case errors.Is(err, context.Canceled):
				// The logger was stopped, no need to retry.
				return
			case err != nil:
				retryDelay = max(al.backoffOpts.min, min(retryDelay*time.Duration(al.backoffOpts.multiplier), al.backoffOpts.max))
				al.logf("retrying after %v, %v", retryDelay, err)
				retry.Reset(retryDelay)
			default:
				retryDelay = 0
				retry.Stop()
			}
		case <-retry.C:
			al.flushAsync()
		}
	}
}

// flush attempts to send all pending logs to the control plane.
// al.mu must not be held.
func (al *Logger) flush(ctx context.Context) error {
	al.mu.Lock()
	pending, err := al.store.load(al.profileID)
	t := al.transport
	al.mu.Unlock()

	if err != nil {
		// This also catches an unset profileID.
		return fmt.Errorf("failed to restore pending logs: %w", err)
	}
	if len(pending) == 0 {
		return nil
	}
	if t == nil {
		return errors.New("no transport")
	}

	complete, unsent := al.sendToTransport(ctx, pending, t)
	al.markTransactionsDone(complete)

	al.mu.Lock()
	defer al.mu.Unlock()
	if err = al.appendToStoreLocked(unsent); err != nil {
		al.logf("[unexpected] failed to persist logs: %v", err)
	}

	if len(unsent) != 0 {
		return fmt.Errorf("failed to send %d logs", len(unsent))
	}

	if len(complete) != 0 {
		al.logf("completed %d audit log transactions", len(complete))
	}
	return nil
}

// sendToTransport sends all pending logs to the control plane. Returns a pair of slices
// containing the logs that were successfully sent (or failed permanently) and those that were not.
//
// This may require multiple round trips to the control plane and can be a long running transaction.
func (al *Logger) sendToTransport(ctx context.Context, pending []*transaction, t Transport) (complete []*transaction, unsent []*transaction) {
	for i, txn := range pending {
		req := tailcfg.AuditLogRequest{
			Action:    tailcfg.ClientAuditAction(txn.Action),
			Details:   txn.Details,
			Timestamp: txn.TimeStamp,
		}

		if err := t.SendAuditLog(ctx, req); err != nil {
			switch {
			case errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded):
				// The context is done. All further attempts will fail.
				unsent = append(unsent, pending[i:]...)
				return complete, unsent
			case IsRetryableError(err) && txn.Retries+1 < al.retryLimit:
				// We permit a maximum number of retries for each log. All retriable
				// errors should be transient and we should be able to send the log eventually, but
				// we don't want logs to be persisted indefinitely.
				txn.Retries++
				unsent = append(unsent, txn)
			default:
				complete = append(complete, txn)
				al.logf("failed permanently: %v", err)
			}
		} else {
			// No error - we're done.
			complete = append(complete, txn)
		}
	}

	return complete, unsent
}

func (al *Logger) stop() {
	al.mu.Lock()
	t := al.transport
	al.mu.Unlock()

	if t == nil {
		// No transport means no worker goroutine, and done will not be
		// closed if we cancel the context.
		return
	}

	al.ctxCancel()
	<-al.done
	al.logf("stopped for profileID: %v", al.profileID)
}

// appendToStoreLocked persists logs to the store. This will deduplicate
// logs, so it is safe to call this with the same logs multiple times, e.g.
// to requeue failed transactions.
//
// al.mu must be held.
func (al *Logger) appendToStoreLocked(txns []*transaction) error {
	if len(txns) == 0 {
		return nil
	}

	if al.profileID == "" {
		return errors.New("no profileID set")
	}

	persisted, err := al.store.load(al.profileID)
	if err != nil {
		al.logf("[unexpected] append failed to restore logs: %v", err)
	}

	// The order is important here. We want the latest transactions first, which
	// ensures that when we dedup, the new transactions are seen and the older
	// transactions are discarded.
	txnsOut := append(txns, persisted...)
	txnsOut = deduplicateAndSort(txnsOut)

	return al.store.save(al.profileID, txnsOut)
}

// storedCountLocked returns the number of logs persisted to the store.
// al.mu must be held.
func (al *Logger) storedCountLocked() (int, error) {
	persisted, err := al.store.load(al.profileID)
	return len(persisted), err
}

// markTransactionsDone removes logs from the store that are complete (sent or failed permanently).
// al.mu must not be held.
func (al *Logger) markTransactionsDone(sent []*transaction) {
	al.mu.Lock()
	defer al.mu.Unlock()

	ids := set.Set[string]{}
	for _, txn := range sent {
		ids.Add(txn.EventID)
	}

	persisted, err := al.store.load(al.profileID)
	if err != nil {
		al.logf("[unexpected] markTransactionsDone failed to restore logs: %v", err)
	}
	var unsent []*transaction
	for _, txn := range persisted {
		if !ids.Contains(txn.EventID) {
			unsent = append(unsent, txn)
		}
	}
	al.store.save(al.profileID, unsent)
}

// deduplicateAndSort removes duplicate logs from the given slice and sorts them by timestamp.
// The first log entry in the slice will be retained; subsequent logs with the same EventID will be discarded.
func deduplicateAndSort(txns []*transaction) []*transaction {
	seen := set.Set[string]{}
	deduped := make([]*transaction, 0, len(txns))
	for _, txn := range txns {
		if !seen.Contains(txn.EventID) {
			deduped = append(deduped, txn)
			seen.Add(txn.EventID)
		}
	}
	// Sort logs by timestamp - oldest to newest. This will put the oldest logs at
	// the front of the queue.
	sort.Slice(deduped, func(i, j int) bool {
		return deduped[i].TimeStamp.Before(deduped[j].TimeStamp)
	})
	return deduped
}

func (al *Logger) enqueue(txn *transaction) error {
	al.mu.Lock()
	defer al.mu.Unlock()

	if err := al.appendToStoreLocked([]*transaction{txn}); err != nil {
		return fmt.Errorf("%w: %w", ErrAuditLogStorageFailure, err)
	}

	// al.transport is nil if the logger is stopped or not yet started.
	if al.transport != nil {
		al.flushAsync()
	}

	return nil
}

var _ LogStore = (*logStateStore)(nil)

// logStateStore is a concrete implementation of [LogStore]
// using [ipn.StateStore] as the underlying storage.
type logStateStore struct {
	store ipn.StateStore
}

// NewLogStore creates a new [LogStore] backed by the given [ipn.StateStore].
func NewLogStore(store ipn.StateStore) LogStore {
	return &logStateStore{
		store: store,
	}
}

func (s *logStateStore) generateKey(key ipn.ProfileID) string {
	return "auditlog-" + string(key)
}

// save saves the given logs to an [ipn.StateStore]. This overwrites
// any existing entries for the given key.
func (s *logStateStore) save(key ipn.ProfileID, txns []*transaction) error {
	if key == "" {
		return errors.New("empty key")
	}

	data, err := json.Marshal(txns)
	if err != nil {
		return err
	}
	k := ipn.StateKey(s.generateKey(key))
	return s.store.WriteState(k, data)
}

// load retrieves the logs from an [ipn.StateStore].
func (s *logStateStore) load(key ipn.ProfileID) ([]*transaction, error) {
	if key == "" {
		return nil, errors.New("empty key")
	}

	k := ipn.StateKey(s.generateKey(key))
	data, err := s.store.ReadState(k)

	switch {
	case errors.Is(err, ipn.ErrStateNotExist):
		return nil, nil
	case err != nil:
		return nil, err
	}

	var txns []*transaction
	err = json.Unmarshal(data, &txns)
	return txns, err
}
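The flush worker's retry delay follows the defaultBackoffOpts comment above (.5, 1, 2, 4, 8, 10, 10, ... seconds). The standalone sketch below just reproduces that arithmetic for sanity-checking; it is not code from the package:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Mirrors auditlog's defaultBackoffOpts: min 500ms, max 10s, multiplier 2.
	minDelay, maxDelay, multiplier := 500*time.Millisecond, 10*time.Second, 2.0

	var retryDelay time.Duration
	for range 8 {
		// Same expression the flush worker uses to grow the delay after a failed flush.
		retryDelay = max(minDelay, min(retryDelay*time.Duration(multiplier), maxDelay))
		fmt.Println(retryDelay) // 500ms, 1s, 2s, 4s, 8s, 10s, 10s, 10s
	}
}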
ipn/auditlog/auditlog_test.go (new file, 481 lines)
@@ -0,0 +1,481 @@
// Copyright (c) Tailscale Inc & AUTHORS
// SPDX-License-Identifier: BSD-3-Clause

package auditlog

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"testing"
	"time"

	qt "github.com/frankban/quicktest"
	"tailscale.com/ipn/store/mem"
	"tailscale.com/tailcfg"
	"tailscale.com/tstest"
)

// loggerForTest creates an audit logger for the test and cleans it up
// (and ensures no goroutines are leaked) when the test is done.
func loggerForTest(t *testing.T, opts Opts) *Logger {
	t.Helper()
	tstest.ResourceCheck(t)

	if opts.Logf == nil {
		opts.Logf = t.Logf
	}

	if opts.Store == nil {
		t.Fatalf("opts.Store must be set")
	}

	a := NewLogger(opts)

	t.Cleanup(func() {
		a.FlushAndStop(context.Background())
	})
	return a
}

func TestNonRetryableErrors(t *testing.T) {
	errorTests := []struct {
		desc string
		err  error
		want bool
	}{
		{"DeadlineExceeded", context.DeadlineExceeded, false},
		{"Canceled", context.Canceled, false},
		{"Canceled wrapped", fmt.Errorf("%w: %w", context.Canceled, errors.New("ctx cancelled")), false},
		{"Random error", errors.New("random error"), false},
	}

	for _, tt := range errorTests {
		t.Run(tt.desc, func(t *testing.T) {
			if IsRetryableError(tt.err) != tt.want {
				t.Fatalf("retryable: got %v, want %v", !tt.want, tt.want)
			}
		})
	}
}

// TestEnqueueAndFlush enqueues n logs and flushes them.
// We expect all logs to be flushed and no logs to remain
// in the store once FlushAndStop returns.
func TestEnqueueAndFlush(t *testing.T) {
	c := qt.New(t)
	mockTransport := newMockTransport(nil)
	al := loggerForTest(t, Opts{
		RetryLimit: 200,
		Logf:       t.Logf,
		Store:      NewLogStore(&mem.Store{}),
	})

	c.Assert(al.SetProfileID("test"), qt.IsNil)
	c.Assert(al.Start(mockTransport), qt.IsNil)

	wantSent := 10

	for i := range wantSent {
		err := al.Enqueue(tailcfg.AuditNodeDisconnect, fmt.Sprintf("log %d", i))
		c.Assert(err, qt.IsNil)
	}

	al.FlushAndStop(context.Background())

	al.mu.Lock()
	defer al.mu.Unlock()
	gotStored, err := al.storedCountLocked()
	c.Assert(err, qt.IsNil)

	if wantStored := 0; gotStored != wantStored {
		t.Fatalf("stored: got %d, want %d", gotStored, wantStored)
	}

	if gotSent := mockTransport.sentCount(); gotSent != wantSent {
		t.Fatalf("sent: got %d, want %d", gotSent, wantSent)
	}
}

// TestEnqueueAndFlushWithFlushCancel calls FlushAndStop with a canceled
// context. We expect nothing to be sent and all logs to be stored.
func TestEnqueueAndFlushWithFlushCancel(t *testing.T) {
	c := qt.New(t)
	mockTransport := newMockTransport(&retriableError)
	al := loggerForTest(t, Opts{
		RetryLimit: 200,
		Logf:       t.Logf,
		Store:      NewLogStore(&mem.Store{}),
	})

	c.Assert(al.SetProfileID("test"), qt.IsNil)
	c.Assert(al.Start(mockTransport), qt.IsNil)

	for i := range 10 {
		err := al.Enqueue(tailcfg.AuditNodeDisconnect, fmt.Sprintf("log %d", i))
		c.Assert(err, qt.IsNil)
	}

	// Cancel the context before calling FlushAndStop - nothing should get sent.
	// This mimics a timeout before flush() has a chance to execute.
	ctx, cancel := context.WithCancel(context.Background())
	cancel()

	al.FlushAndStop(ctx)

	al.mu.Lock()
	defer al.mu.Unlock()
	gotStored, err := al.storedCountLocked()
	c.Assert(err, qt.IsNil)

	if wantStored := 10; gotStored != wantStored {
		t.Fatalf("stored: got %d, want %d", gotStored, wantStored)
	}

	if gotSent, wantSent := mockTransport.sentCount(), 0; gotSent != wantSent {
		t.Fatalf("sent: got %d, want %d", gotSent, wantSent)
	}
}

// TestDeduplicateAndSort tests that the most recent log is kept when deduplicating logs.
func TestDeduplicateAndSort(t *testing.T) {
	c := qt.New(t)
	al := loggerForTest(t, Opts{
		RetryLimit: 100,
		Logf:       t.Logf,
		Store:      NewLogStore(&mem.Store{}),
	})

	c.Assert(al.SetProfileID("test"), qt.IsNil)

	logs := []*transaction{
		{EventID: "1", Details: "log 1", TimeStamp: time.Now().Add(-time.Minute * 1), Retries: 1},
	}

	al.mu.Lock()
	defer al.mu.Unlock()
	al.appendToStoreLocked(logs)

	// Update the transaction and re-append it
	logs[0].Retries = 2
	al.appendToStoreLocked(logs)

	fromStore, err := al.store.load("test")
	c.Assert(err, qt.IsNil)

	// We should see only one transaction
	if wantStored, gotStored := len(logs), len(fromStore); gotStored != wantStored {
		t.Fatalf("stored: got %d, want %d", gotStored, wantStored)
	}

	// We should see the latest transaction
	if wantRetryCount, gotRetryCount := 2, fromStore[0].Retries; gotRetryCount != wantRetryCount {
		t.Fatalf("retries: got %d, want %d", gotRetryCount, wantRetryCount)
	}
}

func TestChangeProfileId(t *testing.T) {
	c := qt.New(t)
	al := loggerForTest(t, Opts{
		RetryLimit: 100,
		Logf:       t.Logf,
		Store:      NewLogStore(&mem.Store{}),
	})
	c.Assert(al.SetProfileID("test"), qt.IsNil)

	// Changing a profile ID must fail
	c.Assert(al.SetProfileID("test"), qt.IsNotNil)
}

// TestSendOnRestore pushes n logs to the persistent store, ensures they
// are sent as soon as Start is called, and then checks that the sent logs
// no longer exist in the store.
func TestSendOnRestore(t *testing.T) {
	c := qt.New(t)
	mockTransport := newMockTransport(nil)
	al := loggerForTest(t, Opts{
		RetryLimit: 100,
		Logf:       t.Logf,
		Store:      NewLogStore(&mem.Store{}),
	})
	al.SetProfileID("test")

	wantTotal := 10

	for range 10 {
		al.Enqueue(tailcfg.AuditNodeDisconnect, "log")
	}

	c.Assert(al.Start(mockTransport), qt.IsNil)

	al.FlushAndStop(context.Background())

	al.mu.Lock()
	defer al.mu.Unlock()
	gotStored, err := al.storedCountLocked()
	c.Assert(err, qt.IsNil)

	if wantStored := 0; gotStored != wantStored {
		t.Fatalf("stored: got %d, want %d", gotStored, wantStored)
	}

	if gotSent, wantSent := mockTransport.sentCount(), wantTotal; gotSent != wantSent {
		t.Fatalf("sent: got %d, want %d", gotSent, wantSent)
	}
}

// TestFailureExhaustion enqueues n logs with the transport in a failing state and a
// retry limit of 1, then calls FlushAndStop. Every log should exhaust its retries,
// so nothing is sent and nothing remains in the store.
func TestFailureExhaustion(t *testing.T) {
	c := qt.New(t)
	mockTransport := newMockTransport(&retriableError)

	al := loggerForTest(t, Opts{
		RetryLimit: 1,
		Logf:       t.Logf,
		Store:      NewLogStore(&mem.Store{}),
	})

	c.Assert(al.SetProfileID("test"), qt.IsNil)
	c.Assert(al.Start(mockTransport), qt.IsNil)

	for range 10 {
		err := al.Enqueue(tailcfg.AuditNodeDisconnect, "log")
		c.Assert(err, qt.IsNil)
	}

	al.FlushAndStop(context.Background())
	al.mu.Lock()
	defer al.mu.Unlock()
	gotStored, err := al.storedCountLocked()
	c.Assert(err, qt.IsNil)

	if wantStored := 0; gotStored != wantStored {
		t.Fatalf("stored: got %d, want %d", gotStored, wantStored)
	}

	if gotSent, wantSent := mockTransport.sentCount(), 0; gotSent != wantSent {
		t.Fatalf("sent: got %d, want %d", gotSent, wantSent)
	}
}

// TestEnqueueAndFailNoRetry enqueues a set of logs, all of which will fail and are not
// retriable. We then call FlushAndStop and expect none of them to be sent or stored.
func TestEnqueueAndFailNoRetry(t *testing.T) {
	c := qt.New(t)
	mockTransport := newMockTransport(&nonRetriableError)

	al := loggerForTest(t, Opts{
		RetryLimit: 100,
		Logf:       t.Logf,
		Store:      NewLogStore(&mem.Store{}),
	})

	c.Assert(al.SetProfileID("test"), qt.IsNil)
	c.Assert(al.Start(mockTransport), qt.IsNil)

	for i := range 10 {
		err := al.Enqueue(tailcfg.AuditNodeDisconnect, fmt.Sprintf("log %d", i))
		c.Assert(err, qt.IsNil)
	}

	al.FlushAndStop(context.Background())
	al.mu.Lock()
	defer al.mu.Unlock()
	gotStored, err := al.storedCountLocked()
	c.Assert(err, qt.IsNil)

	if wantStored := 0; gotStored != wantStored {
		t.Fatalf("stored: got %d, want %d", gotStored, wantStored)
	}

	if gotSent, wantSent := mockTransport.sentCount(), 0; gotSent != wantSent {
		t.Fatalf("sent: got %d, want %d", gotSent, wantSent)
	}
}

// TestEnqueueAndRetry enqueues a set of logs, all of which will fail and are retriable.
// Mid-test, we set the transport to not-fail and expect the queue to flush properly.
// We set the backoff parameters to a few milliseconds so retries are nearly immediate.
func TestEnqueueAndRetry(t *testing.T) {
	c := qt.New(t)
	mockTransport := newMockTransport(&retriableError)

	al := loggerForTest(t, Opts{
		RetryLimit: 100,
		Logf:       t.Logf,
		Store:      NewLogStore(&mem.Store{}),
	})

	al.backoffOpts = backoffOpts{
		min:        1 * time.Millisecond,
		max:        4 * time.Millisecond,
		multiplier: 2.0,
	}

	c.Assert(al.SetProfileID("test"), qt.IsNil)
	c.Assert(al.Start(mockTransport), qt.IsNil)

	err := al.Enqueue(tailcfg.AuditNodeDisconnect, "log 1")
	c.Assert(err, qt.IsNil)

	// This will wait for at least 2 retries
	gotRetried, wantRetried := mockTransport.waitForSendAttemptsToReach(3), true
	if gotRetried != wantRetried {
		t.Fatalf("retried: got %v, want %v", gotRetried, wantRetried)
	}

	mockTransport.setErrorCondition(nil)

	al.FlushAndStop(context.Background())
	al.mu.Lock()
	defer al.mu.Unlock()

	gotStored, err := al.storedCountLocked()
	c.Assert(err, qt.IsNil)

	if wantStored := 0; gotStored != wantStored {
		t.Fatalf("stored: got %d, want %d", gotStored, wantStored)
	}

	if gotSent, wantSent := mockTransport.sentCount(), 1; gotSent != wantSent {
		t.Fatalf("sent: got %d, want %d", gotSent, wantSent)
	}
}

// TestEnqueueBeforeSetProfileID tests that logs enqueued before SetProfileID are not sent.
func TestEnqueueBeforeSetProfileID(t *testing.T) {
	c := qt.New(t)
	al := loggerForTest(t, Opts{
		RetryLimit: 100,
		Logf:       t.Logf,
		Store:      NewLogStore(&mem.Store{}),
	})

	err := al.Enqueue(tailcfg.AuditNodeDisconnect, "log")
	c.Assert(err, qt.IsNotNil)
	al.FlushAndStop(context.Background())

	al.mu.Lock()
	defer al.mu.Unlock()
	gotStored, err := al.storedCountLocked()
	c.Assert(err, qt.IsNotNil)

	if wantStored := 0; gotStored != wantStored {
		t.Fatalf("stored: got %d, want %d", gotStored, wantStored)
	}
}

// TestLogSorting tests that audit logs are deduplicated and returned sorted by
// timestamp, oldest to newest.
func TestLogSorting(t *testing.T) {
	c := qt.New(t)
	mockStore := NewLogStore(&mem.Store{})

	logs := []*transaction{
		{EventID: "1", Details: "log 3", TimeStamp: time.Now().Add(-time.Minute * 1)},
		{EventID: "1", Details: "log 3", TimeStamp: time.Now().Add(-time.Minute * 2)},
		{EventID: "2", Details: "log 2", TimeStamp: time.Now().Add(-time.Minute * 3)},
		{EventID: "3", Details: "log 1", TimeStamp: time.Now().Add(-time.Minute * 4)},
	}

	wantLogs := []transaction{
		{Details: "log 1"},
		{Details: "log 2"},
		{Details: "log 3"},
	}

	mockStore.save("test", logs)

	gotLogs, err := mockStore.load("test")
	c.Assert(err, qt.IsNil)
	gotLogs = deduplicateAndSort(gotLogs)

	for i := range gotLogs {
		if want, got := wantLogs[i].Details, gotLogs[i].Details; want != got {
			t.Fatalf("Details: got %v, want %v", got, want)
		}
	}
}

// mock implementations for testing

// newMockTransport returns a mock transport for testing.
// If err is non-nil, SendAuditLog will return this error if the send is attempted
// before the context is cancelled.
func newMockTransport(err error) *mockAuditLogTransport {
	return &mockAuditLogTransport{
		err:      err,
		attempts: make(chan int, 1),
	}
}

type mockAuditLogTransport struct {
	attempts chan int // channel to notify of send attempts

	mu           sync.Mutex
	sendAttempts int   // number of attempts to send logs
	sendCount    int   // number of logs sent by the transport
	err          error // error to return when sending logs
}

// waitForSendAttemptsToReach blocks until the number of send attempts reaches n.
// This should be used only in tests where the transport is expected to retry sending logs.
func (t *mockAuditLogTransport) waitForSendAttemptsToReach(n int) bool {
	for attempts := range t.attempts {
		if attempts >= n {
			return true
		}
	}
	return false
}

func (t *mockAuditLogTransport) setErrorCondition(err error) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.err = err
}

func (t *mockAuditLogTransport) sentCount() int {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.sendCount
}

func (t *mockAuditLogTransport) SendAuditLog(ctx context.Context, _ tailcfg.AuditLogRequest) (err error) {
	t.mu.Lock()
	t.sendAttempts += 1
	defer func() {
		a := t.sendAttempts
		t.mu.Unlock()
		select {
		case t.attempts <- a:
		default:
		}
	}()

	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}

	if t.err != nil {
		return t.err
	}
	t.sendCount += 1
	return nil
}

var (
	retriableError    = mockError{errors.New("retriable error")}
	nonRetriableError = mockError{errors.New("permanent failure error")}
)

type mockError struct {
	error
}

func (e mockError) Retryable() bool {
	return e == retriableError
}
@@ -10,12 +10,11 @@ import (

	"tailscale.com/client/tailscale/apitype"
	"tailscale.com/ipn"
+	"tailscale.com/tailcfg"
)

// AuditLogFunc is any function that can be used to log audit actions performed by an [Actor].
-//
-// TODO(nickkhyl,barnstar): define a named string type for the action (in tailcfg?) and use it here.
-type AuditLogFunc func(action, details string)
+type AuditLogFunc func(action tailcfg.ClientAuditAction, details string) error

// Actor is any actor using the [ipnlocal.LocalBackend].
//

@@ -45,7 +44,7 @@ type Actor interface {
	//
	// If the auditLogger is non-nil, it is used to write details about the action
	// to the audit log when required by the policy.
-	CheckProfileAccess(profile ipn.LoginProfileView, requestedAccess ProfileAccess, auditLogger AuditLogFunc) error
+	CheckProfileAccess(profile ipn.LoginProfileView, requestedAccess ProfileAccess, auditLogFn AuditLogFunc) error

	// IsLocalSystem reports whether the actor is the Windows' Local System account.
	//

@@ -9,6 +9,7 @@ import (

	"tailscale.com/client/tailscale/apitype"
	"tailscale.com/ipn"
+	"tailscale.com/tailcfg"
	"tailscale.com/util/syspolicy"
)

@@ -48,7 +49,7 @@ func (a actorWithPolicyChecks) CheckProfileAccess(profile ipn.LoginProfileView,
//
// TODO(nickkhyl): unexport it when we move [ipn.Actor] implementations from [ipnserver]
// and corp to this package.
-func CheckDisconnectPolicy(actor Actor, profile ipn.LoginProfileView, reason string, auditLogger AuditLogFunc) error {
+func CheckDisconnectPolicy(actor Actor, profile ipn.LoginProfileView, reason string, auditFn AuditLogFunc) error {
	if alwaysOn, _ := syspolicy.GetBoolean(syspolicy.AlwaysOn, false); !alwaysOn {
		return nil
	}

@@ -58,15 +59,16 @@ func CheckDisconnectPolicy(actor Actor, profile ipn.LoginProfileView, reason str
	if reason == "" {
		return errors.New("disconnect not allowed: reason required")
	}
-	if auditLogger != nil {
+	if auditFn != nil {
		var details string
		if username, _ := actor.Username(); username != "" { // best-effort; we don't have it on all platforms
			details = fmt.Sprintf("%q is being disconnected by %q: %v", profile.Name(), username, reason)
		} else {
			details = fmt.Sprintf("%q is being disconnected: %v", profile.Name(), reason)
		}
-		// TODO(nickkhyl,barnstar): use a const for DISCONNECT_NODE.
-		auditLogger("DISCONNECT_NODE", details)
+		if err := auditFn(tailcfg.AuditNodeDisconnect, details); err != nil {
+			return err
+		}
	}
	return nil
}
@@ -57,10 +57,12 @@ import (
	"tailscale.com/health/healthmsg"
	"tailscale.com/hostinfo"
	"tailscale.com/ipn"
	"tailscale.com/ipn/auditlog"
	"tailscale.com/ipn/conffile"
	"tailscale.com/ipn/ipnauth"
	"tailscale.com/ipn/ipnstate"
	"tailscale.com/ipn/policy"
	memstore "tailscale.com/ipn/store/mem"
	"tailscale.com/log/sockstatlog"
	"tailscale.com/logpolicy"
	"tailscale.com/net/captivedetection"

@@ -450,6 +452,12 @@ type LocalBackend struct {
	// Each callback is called exactly once in unspecified order and without b.mu held.
	// Returned errors are logged but otherwise ignored and do not affect the shutdown process.
	shutdownCbs set.HandleSet[func() error]

	// auditLogger, if non-nil, manages audit logging for the backend.
	//
	// It queues, persists, and sends audit logs
	// to the control client. auditLogger has the same lifespan as b.cc.
	auditLogger *auditlog.Logger
}

// HealthTracker returns the health tracker for the backend.

@@ -1679,6 +1687,15 @@ func (b *LocalBackend) SetControlClientStatus(c controlclient.Client, st control
			b.logf("Failed to save new controlclient state: %v", err)
		}
	}

	// Update the audit logger with the current profile ID.
	if b.auditLogger != nil && prefsChanged {
		pid := b.pm.CurrentProfile().ID()
		if err := b.auditLogger.SetProfileID(pid); err != nil {
			b.logf("Failed to set profile ID in audit logger: %v", err)
		}
	}

	// initTKALocked is dependent on CurrentProfile.ID, which is initialized
	// (for new profiles) on the first call to b.pm.SetPrefs.
	if err := b.initTKALocked(); err != nil {

@@ -2386,6 +2403,27 @@ func (b *LocalBackend) Start(opts ipn.Options) error {
		debugFlags = append([]string{"netstack"}, debugFlags...)
	}

	var auditLogShutdown func()
	// Audit logging is only available if the client has set up a proper persistent
	// store for the logs in sys.
	store, ok := b.sys.AuditLogStore.GetOK()
	if !ok {
		b.logf("auditlog: [unexpected] no persistent audit log storage configured. using memory store.")
		store = auditlog.NewLogStore(&memstore.Store{})
	}

	al := auditlog.NewLogger(auditlog.Opts{
		Logf:       b.logf,
		RetryLimit: 32,
		Store:      store,
	})
	b.auditLogger = al
	auditLogShutdown = func() {
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()
		al.FlushAndStop(ctx)
	}

	// TODO(apenwarr): The only way to change the ServerURL is to
	// re-run b.Start, because this is the only place we create a
	// new controlclient. EditPrefs allows you to overwrite ServerURL,

@@ -2411,6 +2449,7 @@ func (b *LocalBackend) Start(opts ipn.Options) error {
		C2NHandler:   http.HandlerFunc(b.handleC2N),
		DialPlan:     &b.dialPlan, // pointer because it can't be copied
		ControlKnobs: b.sys.ControlKnobs(),
		Shutdown:     auditLogShutdown,

		// Don't warn about broken Linux IP forwarding when
		// netstack is being used.

@@ -4263,6 +4302,21 @@ func (b *LocalBackend) MaybeClearAppConnector(mp *ipn.MaskedPrefs) error {
	return err
}

var errNoAuditLogger = errors.New("no audit logger configured")

func (b *LocalBackend) getAuditLoggerLocked() ipnauth.AuditLogFunc {
	logger := b.auditLogger
	return func(action tailcfg.ClientAuditAction, details string) error {
		if logger == nil {
			return errNoAuditLogger
		}
		if err := logger.Enqueue(action, details); err != nil {
			return fmt.Errorf("failed to enqueue audit log %v %q: %w", action, details, err)
		}
		return nil
	}
}

// EditPrefs applies the changes in mp to the current prefs,
// acting as the tailscaled itself rather than a specific user.
func (b *LocalBackend) EditPrefs(mp *ipn.MaskedPrefs) (ipn.PrefsView, error) {

@@ -4288,9 +4342,8 @@ func (b *LocalBackend) EditPrefsAs(mp *ipn.MaskedPrefs, actor ipnauth.Actor) (ip
	unlock := b.lockAndGetUnlock()
	defer unlock()
	if mp.WantRunningSet && !mp.WantRunning && b.pm.CurrentPrefs().WantRunning() {
-		// TODO(barnstar,nickkhyl): replace loggerFn with the actual audit logger.
-		loggerFn := func(action, details string) { b.logf("[audit]: %s: %s", action, details) }
-		if err := actor.CheckProfileAccess(b.pm.CurrentProfile(), ipnauth.Disconnect, loggerFn); err != nil {
+		if err := actor.CheckProfileAccess(b.pm.CurrentProfile(), ipnauth.Disconnect, b.getAuditLoggerLocked()); err != nil {
			b.logf("check profile access failed: %v", err)
			return ipn.PrefsView{}, err
		}

@@ -5874,6 +5927,15 @@ func (b *LocalBackend) requestEngineStatusAndWait() {
func (b *LocalBackend) setControlClientLocked(cc controlclient.Client) {
	b.cc = cc
	b.ccAuto, _ = cc.(*controlclient.Auto)
	if b.auditLogger != nil {
		if err := b.auditLogger.SetProfileID(b.pm.CurrentProfile().ID()); err != nil {
			b.logf("audit logger set profile ID failure: %v", err)
		}

		if err := b.auditLogger.Start(b.ccAuto); err != nil {
			b.logf("audit logger start failure: %v", err)
		}
	}
}

// resetControlClientLocked sets b.cc to nil and returns the old value. If the
@@ -2982,3 +2982,33 @@ const LBHeader = "Ts-Lb"
// correspond to those IPs. Any services that don't correspond to a service
// this client is hosting can be ignored.
type ServiceIPMappings map[ServiceName][]netip.Addr

// ClientAuditAction represents an auditable action that a client can report to the
// control plane. These actions must correspond to the supported actions
// in the control plane.
type ClientAuditAction string

const (
	// AuditNodeDisconnect action is sent when a node has disconnected
	// from the control plane. The details must include a reason in the Details
	// field, either generated or entered by the user.
	AuditNodeDisconnect = ClientAuditAction("DISCONNECT_NODE")
)

// AuditLogRequest represents an audit log request to be sent to the control plane.
//
// This is JSON-encoded and sent over the control plane connection to:
// POST https://<control-plane>/machine/audit-log
type AuditLogRequest struct {
	// Version is the client's current CapabilityVersion.
	Version CapabilityVersion `json:",omitempty"`
	// NodeKey is the client's current node key.
	NodeKey key.NodePublic `json:",omitzero"`
	// Action is the action to be logged. It must correspond to a known action in the control plane.
	Action ClientAuditAction `json:",omitempty"`
	// Details is an opaque string, specific to the action being logged. Empty strings may not
	// be valid depending on the action being logged.
	Details string `json:",omitempty"`
	// Timestamp is the time at which the audit log was generated on the node.
	Timestamp time.Time `json:",omitzero"`
}
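As a rough illustration (not part of the commit), the body POSTed to /machine/audit-log is just the JSON encoding of this struct. The sketch below marshals a request; the node key is a freshly generated stand-in, and the exact output depends on the usual tailcfg JSON marshaling:

package main

import (
	"encoding/json"
	"fmt"
	"time"

	"tailscale.com/tailcfg"
	"tailscale.com/types/key"
)

func main() {
	req := tailcfg.AuditLogRequest{
		Version:   tailcfg.CurrentCapabilityVersion,
		NodeKey:   key.NewNode().Public(), // stand-in; the client sends its persisted node key
		Action:    tailcfg.AuditNodeDisconnect,
		Details:   `"work-profile" is being disconnected by "alice": maintenance`,
		Timestamp: time.Now(),
	}
	b, _ := json.Marshal(req)
	// Prints something along the lines of:
	// {"Version":…,"NodeKey":"nodekey:…","Action":"DISCONNECT_NODE","Details":"…","Timestamp":"…"}
	fmt.Println(string(b))
}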
@@ -25,6 +25,7 @@ import (
	"tailscale.com/drive"
	"tailscale.com/health"
	"tailscale.com/ipn"
	"tailscale.com/ipn/auditlog"
	"tailscale.com/ipn/conffile"
	"tailscale.com/ipn/desktop"
	"tailscale.com/net/dns"

@@ -50,6 +51,7 @@ type System struct {
	Router         SubSystem[router.Router]
	Tun            SubSystem[*tstun.Wrapper]
	StateStore     SubSystem[ipn.StateStore]
	AuditLogStore  SubSystem[auditlog.LogStore]
	Netstack       SubSystem[NetstackImpl] // actually a *netstack.Impl
	DriveForLocal  SubSystem[drive.FileSystemForLocal]
	DriveForRemote SubSystem[drive.FileSystemForRemote]

@@ -106,6 +108,8 @@ func (s *System) Set(v any) {
		s.MagicSock.Set(v)
	case ipn.StateStore:
		s.StateStore.Set(v)
	case auditlog.LogStore:
		s.AuditLogStore.Set(v)
	case NetstackImpl:
		s.Netstack.Set(v)
	case drive.FileSystemForLocal:
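The new tsd.System.AuditLogStore subsystem is how a platform provides persistent storage for unsent audit logs; if nothing is registered, LocalBackend.Start falls back to an in-memory store (see the ipnlocal hunk above). A hypothetical wiring sketch, with the in-memory store standing in for a real persistent ipn.StateStore:

package main

import (
	"log"

	"tailscale.com/ipn/auditlog"
	"tailscale.com/ipn/store/mem"
	"tailscale.com/tsd"
)

func main() {
	sys := new(tsd.System)
	store := &mem.Store{} // stand-in; a daemon would use its persistent ipn.StateStore here

	sys.Set(store)                       // registers the ipn.StateStore subsystem
	sys.Set(auditlog.NewLogStore(store)) // registers the auditlog.LogStore that LocalBackend.Start picks up

	if _, ok := sys.AuditLogStore.GetOK(); ok {
		log.Printf("audit log store configured")
	}
}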