mirror of
https://github.com/tailscale/tailscale.git
synced 2025-04-04 23:35:50 +00:00

In this PR, we update ipnlocal.LocalBackend to allow registering callbacks for control client creation and profile changes. We also allow to register ipnauth.AuditLogFunc to be called when an auditable action is attempted. We then use all this to invert the dependency between the auditlog and ipnlocal packages and make the auditlog functionality optional, where it only registers its callbacks via ipnlocal-provided hooks when the auditlog package is imported. We then underscore-import it when building tailscaled for Windows, and we'll explicitly import it when building xcode/ipn-go-bridge for macOS. Since there's no default log-store location for macOS, we'll also need to call auditlog.SetStoreFilePath to specify where pending audit logs should be persisted. Fixes #15394 Updates tailscale/corp#26435 Updates tailscale/corp#27012 Signed-off-by: Nick Khyl <nickk@tailscale.com>
469 lines
14 KiB
Go
469 lines
14 KiB
Go
// Copyright (c) Tailscale Inc & AUTHORS
|
|
// SPDX-License-Identifier: BSD-3-Clause
|
|
|
|
// Package auditlog provides a mechanism for logging audit events.
|
|
package auditlog
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"sort"
|
|
"sync"
|
|
"time"
|
|
|
|
"tailscale.com/ipn"
|
|
"tailscale.com/tailcfg"
|
|
"tailscale.com/types/logger"
|
|
"tailscale.com/util/rands"
|
|
"tailscale.com/util/set"
|
|
)
|
|
|
|
// transaction represents an audit log that has not yet been sent to the control plane.
|
|
type transaction struct {
|
|
// EventID is the unique identifier for the event being logged.
|
|
// This is used on the client side only and is not sent to control.
|
|
EventID string `json:",omitempty"`
|
|
// Retries is the number of times the logger has attempted to send this log.
|
|
// This is used on the client side only and is not sent to control.
|
|
Retries int `json:",omitempty"`
|
|
|
|
// Action is the action to be logged. It must correspond to a known action in the control plane.
|
|
Action tailcfg.ClientAuditAction `json:",omitempty"`
|
|
// Details is an opaque string specific to the action being logged. Empty strings may not
|
|
// be valid depending on the action being logged.
|
|
Details string `json:",omitempty"`
|
|
// TimeStamp is the time at which the audit log was generated on the node.
|
|
TimeStamp time.Time `json:",omitzero"`
|
|
}
|
|
|
|
// Transport provides a means for a client to send audit logs to a consumer (typically the control plane).
|
|
type Transport interface {
|
|
// SendAuditLog sends an audit log to a consumer of audit logs.
|
|
// Errors should be checked with [IsRetryableError] for retryability.
|
|
SendAuditLog(context.Context, tailcfg.AuditLogRequest) error
|
|
}
|
|
|
|
// LogStore provides a means for a [Logger] to persist logs to disk or memory.
|
|
type LogStore interface {
|
|
// Save saves the given data to a persistent store. Save will overwrite existing data
|
|
// for the given key.
|
|
save(key ipn.ProfileID, txns []*transaction) error
|
|
|
|
// Load retrieves the data from a persistent store. Returns a nil slice and
|
|
// no error if no data exists for the given key.
|
|
load(key ipn.ProfileID) ([]*transaction, error)
|
|
}
|
|
|
|
// Opts contains the configuration options for a [Logger].
|
|
type Opts struct {
|
|
// RetryLimit is the maximum number of attempts the logger will make to send a log before giving up.
|
|
RetryLimit int
|
|
// Store is the persistent store used to save logs to disk. Must be non-nil.
|
|
Store LogStore
|
|
// Logf is the logger used to log messages from the audit logger. Must be non-nil.
|
|
Logf logger.Logf
|
|
}
|
|
|
|
// IsRetryableError returns true if the given error is retryable
|
|
// See [controlclient.apiResponseError]. Potentially retryable errors implement the Retryable() method.
|
|
func IsRetryableError(err error) bool {
|
|
var retryable interface{ Retryable() bool }
|
|
return errors.As(err, &retryable) && retryable.Retryable()
|
|
}
|
|
|
|
type backoffOpts struct {
|
|
min, max time.Duration
|
|
multiplier float64
|
|
}
|
|
|
|
// .5, 1, 2, 4, 8, 10, 10, 10, 10, 10...
|
|
var defaultBackoffOpts = backoffOpts{
|
|
min: time.Millisecond * 500,
|
|
max: 10 * time.Second,
|
|
multiplier: 2,
|
|
}
|
|
|
|
// Logger provides a queue-based mechanism for submitting audit logs to the control plane - or
|
|
// another suitable consumer. Logs are stored to disk and retried until they are successfully sent,
|
|
// or until they permanently fail.
|
|
//
|
|
// Each individual profile/controlclient tuple should construct and manage a unique [Logger] instance.
|
|
type Logger struct {
|
|
logf logger.Logf
|
|
retryLimit int // the maximum number of attempts to send a log before giving up.
|
|
flusher chan struct{} // channel used to signal a flush operation.
|
|
done chan struct{} // closed when the flush worker exits.
|
|
ctx context.Context // canceled when the logger is stopped.
|
|
ctxCancel context.CancelFunc // cancels ctx.
|
|
backoffOpts // backoff settings for retry operations.
|
|
|
|
// mu protects the fields below.
|
|
mu sync.Mutex
|
|
store LogStore // persistent storage for unsent logs.
|
|
profileID ipn.ProfileID // empty if [Logger.SetProfileID] has not been called.
|
|
transport Transport // nil until [Logger.Start] is called.
|
|
}
|
|
|
|
// NewLogger creates a new [Logger] with the given options.
|
|
func NewLogger(opts Opts) *Logger {
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
|
|
al := &Logger{
|
|
retryLimit: opts.RetryLimit,
|
|
logf: opts.Logf,
|
|
store: opts.Store,
|
|
flusher: make(chan struct{}, 1),
|
|
done: make(chan struct{}),
|
|
ctx: ctx,
|
|
ctxCancel: cancel,
|
|
backoffOpts: defaultBackoffOpts,
|
|
}
|
|
al.logf("created")
|
|
return al
|
|
}
|
|
|
|
// FlushAndStop synchronously flushes all pending logs and stops the audit logger.
|
|
// This will block until a final flush operation completes or context is done.
|
|
// If the logger is already stopped, this will return immediately. All unsent
|
|
// logs will be persisted to the store.
|
|
func (al *Logger) FlushAndStop(ctx context.Context) {
|
|
al.stop()
|
|
al.flush(ctx)
|
|
}
|
|
|
|
// SetProfileID sets the profileID for the logger. This must be called before any logs can be enqueued.
|
|
// The profileID of a logger cannot be changed once set.
|
|
func (al *Logger) SetProfileID(profileID ipn.ProfileID) error {
|
|
al.mu.Lock()
|
|
defer al.mu.Unlock()
|
|
// It's not an error to call SetProfileID more than once
|
|
// with the same [ipn.ProfileID].
|
|
if al.profileID != "" && al.profileID != profileID {
|
|
return errors.New("profileID cannot be changed once set")
|
|
}
|
|
|
|
al.profileID = profileID
|
|
return nil
|
|
}
|
|
|
|
// Start starts the audit logger with the given transport.
|
|
// It returns an error if the logger is already started.
|
|
func (al *Logger) Start(t Transport) error {
|
|
al.mu.Lock()
|
|
defer al.mu.Unlock()
|
|
|
|
if al.transport != nil {
|
|
return errors.New("already started")
|
|
}
|
|
|
|
al.transport = t
|
|
pending, err := al.storedCountLocked()
|
|
if err != nil {
|
|
al.logf("[unexpected] failed to restore logs: %v", err)
|
|
}
|
|
go al.flushWorker()
|
|
if pending > 0 {
|
|
al.flushAsync()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ErrAuditLogStorageFailure is returned when the logger fails to persist logs to the store.
|
|
var ErrAuditLogStorageFailure = errors.New("audit log storage failure")
|
|
|
|
// Enqueue queues an audit log to be sent to the control plane (or another suitable consumer/transport).
|
|
// This will return an error if the underlying store fails to save the log or we fail to generate a unique
|
|
// eventID for the log.
|
|
func (al *Logger) Enqueue(action tailcfg.ClientAuditAction, details string) error {
|
|
txn := &transaction{
|
|
Action: action,
|
|
Details: details,
|
|
TimeStamp: time.Now(),
|
|
}
|
|
// Generate a suitably random eventID for the transaction.
|
|
txn.EventID = fmt.Sprint(txn.TimeStamp, rands.HexString(16))
|
|
return al.enqueue(txn)
|
|
}
|
|
|
|
// flushAsync requests an asynchronous flush.
|
|
// It is a no-op if a flush is already pending.
|
|
func (al *Logger) flushAsync() {
|
|
select {
|
|
case al.flusher <- struct{}{}:
|
|
default:
|
|
}
|
|
}
|
|
|
|
func (al *Logger) flushWorker() {
|
|
defer close(al.done)
|
|
|
|
var retryDelay time.Duration
|
|
retry := time.NewTimer(0)
|
|
retry.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-al.ctx.Done():
|
|
return
|
|
case <-al.flusher:
|
|
err := al.flush(al.ctx)
|
|
switch {
|
|
case errors.Is(err, context.Canceled):
|
|
// The logger was stopped, no need to retry.
|
|
return
|
|
case err != nil:
|
|
retryDelay = max(al.backoffOpts.min, min(retryDelay*time.Duration(al.backoffOpts.multiplier), al.backoffOpts.max))
|
|
al.logf("retrying after %v, %v", retryDelay, err)
|
|
retry.Reset(retryDelay)
|
|
default:
|
|
retryDelay = 0
|
|
retry.Stop()
|
|
}
|
|
case <-retry.C:
|
|
al.flushAsync()
|
|
}
|
|
}
|
|
}
|
|
|
|
// flush attempts to send all pending logs to the control plane.
|
|
// l.mu must not be held.
|
|
func (al *Logger) flush(ctx context.Context) error {
|
|
al.mu.Lock()
|
|
pending, err := al.store.load(al.profileID)
|
|
t := al.transport
|
|
al.mu.Unlock()
|
|
|
|
if err != nil {
|
|
// This will catch nil profileIDs
|
|
return fmt.Errorf("failed to restore pending logs: %w", err)
|
|
}
|
|
if len(pending) == 0 {
|
|
return nil
|
|
}
|
|
if t == nil {
|
|
return errors.New("no transport")
|
|
}
|
|
|
|
complete, unsent := al.sendToTransport(ctx, pending, t)
|
|
al.markTransactionsDone(complete)
|
|
|
|
al.mu.Lock()
|
|
defer al.mu.Unlock()
|
|
if err = al.appendToStoreLocked(unsent); err != nil {
|
|
al.logf("[unexpected] failed to persist logs: %v", err)
|
|
}
|
|
|
|
if len(unsent) != 0 {
|
|
return fmt.Errorf("failed to send %d logs", len(unsent))
|
|
}
|
|
|
|
if len(complete) != 0 {
|
|
al.logf("complete %d audit log transactions", len(complete))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// sendToTransport sends all pending logs to the control plane. Returns a pair of slices
|
|
// containing the logs that were successfully sent (or failed permanently) and those that were not.
|
|
//
|
|
// This may require multiple round trips to the control plane and can be a long running transaction.
|
|
func (al *Logger) sendToTransport(ctx context.Context, pending []*transaction, t Transport) (complete []*transaction, unsent []*transaction) {
|
|
for i, txn := range pending {
|
|
req := tailcfg.AuditLogRequest{
|
|
Action: tailcfg.ClientAuditAction(txn.Action),
|
|
Details: txn.Details,
|
|
Timestamp: txn.TimeStamp,
|
|
}
|
|
|
|
if err := t.SendAuditLog(ctx, req); err != nil {
|
|
switch {
|
|
case errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded):
|
|
// The contex is done. All further attempts will fail.
|
|
unsent = append(unsent, pending[i:]...)
|
|
return complete, unsent
|
|
case IsRetryableError(err) && txn.Retries+1 < al.retryLimit:
|
|
// We permit a maximum number of retries for each log. All retriable
|
|
// errors should be transient and we should be able to send the log eventually, but
|
|
// we don't want logs to be persisted indefinitely.
|
|
txn.Retries++
|
|
unsent = append(unsent, txn)
|
|
default:
|
|
complete = append(complete, txn)
|
|
al.logf("failed permanently: %v", err)
|
|
}
|
|
} else {
|
|
// No error - we're done.
|
|
complete = append(complete, txn)
|
|
}
|
|
}
|
|
|
|
return complete, unsent
|
|
}
|
|
|
|
func (al *Logger) stop() {
|
|
al.mu.Lock()
|
|
t := al.transport
|
|
al.mu.Unlock()
|
|
|
|
if t == nil {
|
|
// No transport means no worker goroutine and done will not be
|
|
// closed if we cancel the context.
|
|
return
|
|
}
|
|
|
|
al.ctxCancel()
|
|
<-al.done
|
|
al.logf("stopped for profileID: %v", al.profileID)
|
|
}
|
|
|
|
// appendToStoreLocked persists logs to the store. This will deduplicate
|
|
// logs so it is safe to call this with the same logs multiple time, to
|
|
// requeue failed transactions for example.
|
|
//
|
|
// l.mu must be held.
|
|
func (al *Logger) appendToStoreLocked(txns []*transaction) error {
|
|
if len(txns) == 0 {
|
|
return nil
|
|
}
|
|
|
|
if al.profileID == "" {
|
|
return errors.New("no logId set")
|
|
}
|
|
|
|
persisted, err := al.store.load(al.profileID)
|
|
if err != nil {
|
|
al.logf("[unexpected] append failed to restore logs: %v", err)
|
|
}
|
|
|
|
// The order is important here. We want the latest transactions first, which will
|
|
// ensure when we dedup, the new transactions are seen and the older transactions
|
|
// are discarded.
|
|
txnsOut := append(txns, persisted...)
|
|
txnsOut = deduplicateAndSort(txnsOut)
|
|
|
|
return al.store.save(al.profileID, txnsOut)
|
|
}
|
|
|
|
// storedCountLocked returns the number of logs persisted to the store.
|
|
// al.mu must be held.
|
|
func (al *Logger) storedCountLocked() (int, error) {
|
|
persisted, err := al.store.load(al.profileID)
|
|
return len(persisted), err
|
|
}
|
|
|
|
// markTransactionsDone removes logs from the store that are complete (sent or failed permanently).
|
|
// al.mu must not be held.
|
|
func (al *Logger) markTransactionsDone(sent []*transaction) {
|
|
al.mu.Lock()
|
|
defer al.mu.Unlock()
|
|
|
|
ids := set.Set[string]{}
|
|
for _, txn := range sent {
|
|
ids.Add(txn.EventID)
|
|
}
|
|
|
|
persisted, err := al.store.load(al.profileID)
|
|
if err != nil {
|
|
al.logf("[unexpected] markTransactionsDone failed to restore logs: %v", err)
|
|
}
|
|
var unsent []*transaction
|
|
for _, txn := range persisted {
|
|
if !ids.Contains(txn.EventID) {
|
|
unsent = append(unsent, txn)
|
|
}
|
|
}
|
|
al.store.save(al.profileID, unsent)
|
|
}
|
|
|
|
// deduplicateAndSort removes duplicate logs from the given slice and sorts them by timestamp.
|
|
// The first log entry in the slice will be retained, subsequent logs with the same EventID will be discarded.
|
|
func deduplicateAndSort(txns []*transaction) []*transaction {
|
|
seen := set.Set[string]{}
|
|
deduped := make([]*transaction, 0, len(txns))
|
|
for _, txn := range txns {
|
|
if !seen.Contains(txn.EventID) {
|
|
deduped = append(deduped, txn)
|
|
seen.Add(txn.EventID)
|
|
}
|
|
}
|
|
// Sort logs by timestamp - oldest to newest. This will put the oldest logs at
|
|
// the front of the queue.
|
|
sort.Slice(deduped, func(i, j int) bool {
|
|
return deduped[i].TimeStamp.Before(deduped[j].TimeStamp)
|
|
})
|
|
return deduped
|
|
}
|
|
|
|
func (al *Logger) enqueue(txn *transaction) error {
|
|
al.mu.Lock()
|
|
defer al.mu.Unlock()
|
|
|
|
if err := al.appendToStoreLocked([]*transaction{txn}); err != nil {
|
|
return fmt.Errorf("%w: %w", ErrAuditLogStorageFailure, err)
|
|
}
|
|
|
|
// If a.transport is nil if the logger is stopped.
|
|
if al.transport != nil {
|
|
al.flushAsync()
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
var _ LogStore = (*logStateStore)(nil)
|
|
|
|
// logStateStore is a concrete implementation of [LogStore]
|
|
// using [ipn.StateStore] as the underlying storage.
|
|
type logStateStore struct {
|
|
store ipn.StateStore
|
|
}
|
|
|
|
// NewLogStore creates a new LogStateStore with the given [ipn.StateStore].
|
|
func NewLogStore(store ipn.StateStore) LogStore {
|
|
return &logStateStore{
|
|
store: store,
|
|
}
|
|
}
|
|
|
|
func (s *logStateStore) generateKey(key ipn.ProfileID) string {
|
|
return "auditlog-" + string(key)
|
|
}
|
|
|
|
// Save saves the given logs to an [ipn.StateStore]. This overwrites
|
|
// any existing entries for the given key.
|
|
func (s *logStateStore) save(key ipn.ProfileID, txns []*transaction) error {
|
|
if key == "" {
|
|
return errors.New("empty key")
|
|
}
|
|
|
|
data, err := json.Marshal(txns)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
k := ipn.StateKey(s.generateKey(key))
|
|
return s.store.WriteState(k, data)
|
|
}
|
|
|
|
// Load retrieves the logs from an [ipn.StateStore].
|
|
func (s *logStateStore) load(key ipn.ProfileID) ([]*transaction, error) {
|
|
if key == "" {
|
|
return nil, errors.New("empty key")
|
|
}
|
|
|
|
k := ipn.StateKey(s.generateKey(key))
|
|
data, err := s.store.ReadState(k)
|
|
|
|
switch {
|
|
case errors.Is(err, ipn.ErrStateNotExist):
|
|
return nil, nil
|
|
case err != nil:
|
|
return nil, err
|
|
}
|
|
|
|
var txns []*transaction
|
|
err = json.Unmarshal(data, &txns)
|
|
return txns, err
|
|
}
|