lock: move code to repository package

Michael Eischer
2024-02-24 16:26:29 +01:00
parent 118a69a84b
commit cbb5f89252
3 changed files with 370 additions and 363 deletions

internal/repository/lock.go Normal file

@@ -0,0 +1,301 @@
package repository

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/restic/restic/internal/backend"
	"github.com/restic/restic/internal/debug"
	"github.com/restic/restic/internal/errors"
	"github.com/restic/restic/internal/restic"
)

type lockContext struct {
	lock      *restic.Lock
	cancel    context.CancelFunc
	refreshWG sync.WaitGroup
}

var globalLocks struct {
	locks map[*restic.Lock]*lockContext
	sync.Mutex
}

var (
	retrySleepStart = 5 * time.Second
	retrySleepMax   = 60 * time.Second
)

func minDuration(a, b time.Duration) time.Duration {
	if a <= b {
		return a
	}
	return b
}
// Lock wraps the ctx such that it is cancelled when the repository is unlocked.
// Cancelling the original context also stops the lock refresh.
func Lock(ctx context.Context, repo restic.Repository, exclusive bool, retryLock time.Duration, printRetry func(msg string), logger func(format string, args ...interface{})) (*restic.Lock, context.Context, error) {
	lockFn := restic.NewLock
	if exclusive {
		lockFn = restic.NewExclusiveLock
	}

	var lock *restic.Lock
	var err error

	retrySleep := minDuration(retrySleepStart, retryLock)
	retryMessagePrinted := false
	retryTimeout := time.After(retryLock)

retryLoop:
	for {
		lock, err = lockFn(ctx, repo)
		if err != nil && restic.IsAlreadyLocked(err) {
			if !retryMessagePrinted {
				printRetry(fmt.Sprintf("repo already locked, waiting up to %s for the lock\n", retryLock))
				retryMessagePrinted = true
			}

			debug.Log("repo already locked, retrying in %v", retrySleep)
			retrySleepCh := time.After(retrySleep)

			select {
			case <-ctx.Done():
				return nil, ctx, ctx.Err()
			case <-retryTimeout:
				debug.Log("repo already locked, timeout expired")
				// Last lock attempt
				lock, err = lockFn(ctx, repo)
				break retryLoop
			case <-retrySleepCh:
				retrySleep = minDuration(retrySleep*2, retrySleepMax)
			}
		} else {
			// anything else, either a successful lock or another error
			break retryLoop
		}
	}

	if restic.IsInvalidLock(err) {
		return nil, ctx, errors.Fatalf("%v\n\nthe `unlock --remove-all` command can be used to remove invalid locks. Make sure that no other restic process is accessing the repository when running the command", err)
	}
	if err != nil {
		return nil, ctx, fmt.Errorf("unable to create lock in backend: %w", err)
	}
	debug.Log("create lock %p (exclusive %v)", lock, exclusive)

	ctx, cancel := context.WithCancel(ctx)
	lockInfo := &lockContext{
		lock:   lock,
		cancel: cancel,
	}
	lockInfo.refreshWG.Add(2)
	refreshChan := make(chan struct{})
	forceRefreshChan := make(chan refreshLockRequest)

	globalLocks.Lock()
	globalLocks.locks[lock] = lockInfo
	go refreshLocks(ctx, repo.Backend(), lockInfo, refreshChan, forceRefreshChan, logger)
	go monitorLockRefresh(ctx, lockInfo, refreshChan, forceRefreshChan, logger)
	globalLocks.Unlock()

	return lock, ctx, err
}
var refreshInterval = 5 * time.Minute

// consider a lock refresh failed a bit before the lock actually becomes stale;
// the difference compensates for a small time drift between clients.
var refreshabilityTimeout = restic.StaleLockTimeout - refreshInterval*3/2

type refreshLockRequest struct {
	result chan bool
}
func refreshLocks(ctx context.Context, backend backend.Backend, lockInfo *lockContext, refreshed chan<- struct{}, forceRefresh <-chan refreshLockRequest, logger func(format string, args ...interface{})) {
	debug.Log("start")
	lock := lockInfo.lock
	ticker := time.NewTicker(refreshInterval)
	lastRefresh := lock.Time

	defer func() {
		ticker.Stop()

		// ensure that the context was cancelled before removing the lock
		lockInfo.cancel()

		// remove the lock from the repo
		debug.Log("unlocking repository with lock %v", lock)
		if err := lock.Unlock(); err != nil {
			debug.Log("error while unlocking: %v", err)
			logger("error while unlocking: %v", err)
		}

		lockInfo.refreshWG.Done()
	}()

	for {
		select {
		case <-ctx.Done():
			debug.Log("terminate")
			return

		case req := <-forceRefresh:
			debug.Log("trying to refresh stale lock")
			// keep on going if our current lock still exists
			success := tryRefreshStaleLock(ctx, backend, lock, lockInfo.cancel, logger)
			// inform refresh goroutine about forced refresh
			select {
			case <-ctx.Done():
			case req.result <- success:
			}

			if success {
				// update lock refresh time
				lastRefresh = lock.Time
			}

		case <-ticker.C:
			if time.Since(lastRefresh) > refreshabilityTimeout {
				// the lock is too old, wait until the expiry monitor cancels the context
				continue
			}

			debug.Log("refreshing locks")
			err := lock.Refresh(context.TODO())
			if err != nil {
				logger("unable to refresh lock: %v\n", err)
			} else {
				lastRefresh = lock.Time
				// inform monitor goroutine about successful refresh
				select {
				case <-ctx.Done():
				case refreshed <- struct{}{}:
				}
			}
		}
	}
}
func monitorLockRefresh(ctx context.Context, lockInfo *lockContext, refreshed <-chan struct{}, forceRefresh chan<- refreshLockRequest, logger func(format string, args ...interface{})) {
	// time.Now() might use a monotonic timer which is paused during standby;
	// convert to unix time to ensure we compare real time values
	lastRefresh := time.Now().UnixNano()
	pollDuration := 1 * time.Second
	if refreshInterval < pollDuration {
		// required for TestLockFailedRefresh
		pollDuration = refreshInterval / 5
	}
	// timers are paused during standby, which is a problem as the refresh timeout
	// _must_ expire if the host spent too long in standby. Thus fall back to periodic checks:
	// https://github.com/golang/go/issues/35012
	ticker := time.NewTicker(pollDuration)
	defer func() {
		ticker.Stop()
		lockInfo.cancel()
		lockInfo.refreshWG.Done()
	}()

	var refreshStaleLockResult chan bool

	for {
		select {
		case <-ctx.Done():
			debug.Log("terminate expiry monitoring")
			return
		case <-refreshed:
			if refreshStaleLockResult != nil {
				// ignore delayed refresh notifications while the stale lock is refreshed
				continue
			}
			lastRefresh = time.Now().UnixNano()
		case <-ticker.C:
			if time.Now().UnixNano()-lastRefresh < refreshabilityTimeout.Nanoseconds() || refreshStaleLockResult != nil {
				continue
			}

			debug.Log("trying to refreshStaleLock")
			// keep on going if our current lock still exists
			refreshReq := refreshLockRequest{
				result: make(chan bool),
			}
			refreshStaleLockResult = refreshReq.result

			// inform refresh goroutine about forced refresh
			select {
			case <-ctx.Done():
			case forceRefresh <- refreshReq:
			}
		case success := <-refreshStaleLockResult:
			if success {
				lastRefresh = time.Now().UnixNano()
				refreshStaleLockResult = nil
				continue
			}

			logger("Fatal: failed to refresh lock in time\n")
			return
		}
	}
}
func tryRefreshStaleLock(ctx context.Context, be backend.Backend, lock *restic.Lock, cancel context.CancelFunc, logger func(format string, args ...interface{})) bool {
	freeze := backend.AsBackend[backend.FreezeBackend](be)
	if freeze != nil {
		debug.Log("freezing backend")
		freeze.Freeze()
		defer freeze.Unfreeze()
	}

	err := lock.RefreshStaleLock(ctx)
	if err != nil {
		logger("failed to refresh stale lock: %v\n", err)
		// cancel context while the backend is still frozen to prevent accidental modifications
		cancel()
		return false
	}

	return true
}
func Unlock(lock *restic.Lock) {
	if lock == nil {
		return
	}

	globalLocks.Lock()
	lockInfo, exists := globalLocks.locks[lock]
	delete(globalLocks.locks, lock)
	globalLocks.Unlock()

	if !exists {
		debug.Log("unable to find lock %v in the global list of locks, ignoring", lock)
		return
	}
	lockInfo.cancel()
	lockInfo.refreshWG.Wait()
}

func UnlockAll(code int) (int, error) {
	globalLocks.Lock()
	locks := globalLocks.locks
	debug.Log("unlocking %d locks", len(globalLocks.locks))
	for _, lockInfo := range globalLocks.locks {
		lockInfo.cancel()
	}
	globalLocks.locks = make(map[*restic.Lock]*lockContext)
	globalLocks.Unlock()

	for _, lockInfo := range locks {
		lockInfo.refreshWG.Wait()
	}

	return code, nil
}

func init() {
	globalLocks.locks = make(map[*restic.Lock]*lockContext)
}
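
For orientation (not part of this commit): a minimal sketch of how a caller inside the restic module might use the relocated API. The runLocked helper, its error handling, and the 10-minute retry window are illustrative only; the internal packages are importable solely from within the restic module itself.

	package example

	import (
		"context"
		"fmt"
		"time"

		"github.com/restic/restic/internal/repository"
		"github.com/restic/restic/internal/restic"
	)

	// runLocked acquires a non-exclusive lock, waiting up to ten minutes for a
	// conflicting lock to go away, and releases it when fn returns.
	func runLocked(ctx context.Context, repo restic.Repository, fn func(ctx context.Context) error) error {
		lock, wrappedCtx, err := repository.Lock(ctx, repo, false, 10*time.Minute,
			func(msg string) { fmt.Print(msg) },                                      // retry progress message
			func(format string, args ...interface{}) { fmt.Printf(format, args...) }, // logger for refresh errors
		)
		if err != nil {
			return err
		}
		// Unlock cancels wrappedCtx and waits for the refresh goroutines to exit.
		defer repository.Unlock(lock)

		// wrappedCtx is cancelled as soon as the lock can no longer be refreshed,
		// so long-running work aborts before the lock becomes stale.
		return fn(wrappedCtx)
	}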

internal/repository/lock_test.go Normal file

@@ -0,0 +1,321 @@
package repository

import (
	"context"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/restic/restic/internal/backend"
	"github.com/restic/restic/internal/backend/mem"
	"github.com/restic/restic/internal/debug"
	"github.com/restic/restic/internal/restic"
	"github.com/restic/restic/internal/test"
	rtest "github.com/restic/restic/internal/test"
)

type backendWrapper func(r backend.Backend) (backend.Backend, error)

func openLockTestRepo(t *testing.T, wrapper backendWrapper) restic.Repository {
	be := backend.Backend(mem.New())
	// initialize repo
	TestRepositoryWithBackend(t, be, 0, Options{})

	// reopen repository to allow injecting a backend wrapper
	if wrapper != nil {
		var err error
		be, err = wrapper(be)
		rtest.OK(t, err)
	}

	repo, err := New(be, Options{})
	rtest.OK(t, err)
	rtest.OK(t, repo.SearchKey(context.TODO(), test.TestPassword, 1, ""))
	return repo
}
func checkedLockRepo(ctx context.Context, t *testing.T, repo restic.Repository, retryLock time.Duration) (*restic.Lock, context.Context) {
	lock, wrappedCtx, err := Lock(ctx, repo, false, retryLock, func(msg string) {}, func(format string, args ...interface{}) {})
	test.OK(t, err)
	test.OK(t, wrappedCtx.Err())
	if lock.Stale() {
		t.Fatal("lock returned stale lock")
	}
	return lock, wrappedCtx
}
func TestLock(t *testing.T) {
	repo := openLockTestRepo(t, nil)

	lock, wrappedCtx := checkedLockRepo(context.Background(), t, repo, 0)
	Unlock(lock)
	if wrappedCtx.Err() == nil {
		t.Fatal("unlock did not cancel context")
	}
}

func TestLockCancel(t *testing.T) {
	repo := openLockTestRepo(t, nil)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	lock, wrappedCtx := checkedLockRepo(ctx, t, repo, 0)
	cancel()
	if wrappedCtx.Err() == nil {
		t.Fatal("canceled parent context did not cancel context")
	}

	// Unlock should not crash
	Unlock(lock)
}

func TestLockUnlockAll(t *testing.T) {
	repo := openLockTestRepo(t, nil)

	lock, wrappedCtx := checkedLockRepo(context.Background(), t, repo, 0)
	_, err := UnlockAll(0)
	test.OK(t, err)
	if wrappedCtx.Err() == nil {
		t.Fatal("canceled parent context did not cancel context")
	}

	// Unlock should not crash
	Unlock(lock)
}
func TestLockConflict(t *testing.T) {
	repo := openLockTestRepo(t, nil)
	repo2, err := New(repo.Backend(), Options{})
	test.OK(t, err)
	test.OK(t, repo2.SearchKey(context.TODO(), test.TestPassword, 1, ""))

	lock, _, err := Lock(context.Background(), repo, true, 0, func(msg string) {}, func(format string, args ...interface{}) {})
	test.OK(t, err)
	defer Unlock(lock)
	_, _, err = Lock(context.Background(), repo2, false, 0, func(msg string) {}, func(format string, args ...interface{}) {})
	if err == nil {
		t.Fatal("second lock should have failed")
	}
	test.Assert(t, restic.IsAlreadyLocked(err), "unexpected error %v", err)
}
type writeOnceBackend struct {
	backend.Backend
	written bool
}

func (b *writeOnceBackend) Save(ctx context.Context, h backend.Handle, rd backend.RewindReader) error {
	if b.written {
		return fmt.Errorf("fail after first write")
	}
	b.written = true
	return b.Backend.Save(ctx, h, rd)
}

func TestLockFailedRefresh(t *testing.T) {
	repo := openLockTestRepo(t, func(r backend.Backend) (backend.Backend, error) {
		return &writeOnceBackend{Backend: r}, nil
	})

	// reduce locking intervals to be suitable for testing
	ri, rt := refreshInterval, refreshabilityTimeout
	refreshInterval = 20 * time.Millisecond
	refreshabilityTimeout = 100 * time.Millisecond
	defer func() {
		refreshInterval, refreshabilityTimeout = ri, rt
	}()

	lock, wrappedCtx := checkedLockRepo(context.Background(), t, repo, 0)

	select {
	case <-wrappedCtx.Done():
		// expected lock refresh failure
	case <-time.After(time.Second):
		t.Fatal("failed lock refresh did not cause context cancellation")
	}

	// Unlock should not crash
	Unlock(lock)
}
type loggingBackend struct {
	backend.Backend
	t *testing.T
}

func (b *loggingBackend) Save(ctx context.Context, h backend.Handle, rd backend.RewindReader) error {
	b.t.Logf("save %v @ %v", h, time.Now())
	err := b.Backend.Save(ctx, h, rd)
	b.t.Logf("save finished %v @ %v", h, time.Now())
	return err
}

func TestLockSuccessfulRefresh(t *testing.T) {
	repo := openLockTestRepo(t, func(r backend.Backend) (backend.Backend, error) {
		return &loggingBackend{
			Backend: r,
			t:       t,
		}, nil
	})

	t.Logf("test for successful lock refresh %v", time.Now())
	// reduce locking intervals to be suitable for testing
	ri, rt := refreshInterval, refreshabilityTimeout
	refreshInterval = 60 * time.Millisecond
	refreshabilityTimeout = 500 * time.Millisecond
	defer func() {
		refreshInterval, refreshabilityTimeout = ri, rt
	}()

	lock, wrappedCtx := checkedLockRepo(context.Background(), t, repo, 0)

	select {
	case <-wrappedCtx.Done():
		// don't call t.Fatal to allow the lock to be properly cleaned up
		t.Error("lock refresh failed", time.Now())

		// Dump full stacktrace
		buf := make([]byte, 1024*1024)
		n := runtime.Stack(buf, true)
		buf = buf[:n]
		t.Log(string(buf))

	case <-time.After(2 * refreshabilityTimeout):
		// expected lock refresh to work
	}

	// Unlock should not crash
	Unlock(lock)
}
type slowBackend struct {
	backend.Backend
	m     sync.Mutex
	sleep time.Duration
}

func (b *slowBackend) Save(ctx context.Context, h backend.Handle, rd backend.RewindReader) error {
	b.m.Lock()
	sleep := b.sleep
	b.m.Unlock()
	time.Sleep(sleep)
	return b.Backend.Save(ctx, h, rd)
}

func TestLockSuccessfulStaleRefresh(t *testing.T) {
	var sb *slowBackend
	repo := openLockTestRepo(t, func(r backend.Backend) (backend.Backend, error) {
		sb = &slowBackend{Backend: r}
		return sb, nil
	})

	t.Logf("test for successful lock refresh %v", time.Now())
	// reduce locking intervals to be suitable for testing
	ri, rt := refreshInterval, refreshabilityTimeout
	refreshInterval = 10 * time.Millisecond
	refreshabilityTimeout = 50 * time.Millisecond
	defer func() {
		refreshInterval, refreshabilityTimeout = ri, rt
	}()

	lock, wrappedCtx := checkedLockRepo(context.Background(), t, repo, 0)
	// delay lock refreshing long enough that the lock would expire
	sb.m.Lock()
	sb.sleep = refreshabilityTimeout + refreshInterval
	sb.m.Unlock()

	select {
	case <-wrappedCtx.Done():
		// don't call t.Fatal to allow the lock to be properly cleaned up
		t.Error("lock refresh failed", time.Now())

	case <-time.After(refreshabilityTimeout):
	}

	// reset slow backend
	sb.m.Lock()
	sb.sleep = 0
	sb.m.Unlock()
	debug.Log("normal lock period has expired")

	select {
	case <-wrappedCtx.Done():
		// don't call t.Fatal to allow the lock to be properly cleaned up
		t.Error("lock refresh failed", time.Now())

	case <-time.After(3 * refreshabilityTimeout):
		// expected lock refresh to work
	}

	// Unlock should not crash
	Unlock(lock)
}
func TestLockWaitTimeout(t *testing.T) {
	repo := openLockTestRepo(t, nil)

	elock, _, err := Lock(context.TODO(), repo, true, 0, func(msg string) {}, func(format string, args ...interface{}) {})
	test.OK(t, err)

	retryLock := 200 * time.Millisecond

	start := time.Now()
	lock, _, err := Lock(context.TODO(), repo, false, retryLock, func(msg string) {}, func(format string, args ...interface{}) {})
	duration := time.Since(start)

	test.Assert(t, err != nil,
		"create normal lock with exclusively locked repo didn't return an error")
	test.Assert(t, strings.Contains(err.Error(), "repository is already locked exclusively"),
		"create normal lock with exclusively locked repo didn't return the correct error")
	test.Assert(t, retryLock <= duration && duration < retryLock*3/2,
		"create normal lock with exclusively locked repo didn't wait for the specified timeout")

	test.OK(t, lock.Unlock())
	test.OK(t, elock.Unlock())
}

func TestLockWaitCancel(t *testing.T) {
	repo := openLockTestRepo(t, nil)

	elock, _, err := Lock(context.TODO(), repo, true, 0, func(msg string) {}, func(format string, args ...interface{}) {})
	test.OK(t, err)

	retryLock := 200 * time.Millisecond
	cancelAfter := 40 * time.Millisecond

	start := time.Now()
	ctx, cancel := context.WithCancel(context.TODO())
	time.AfterFunc(cancelAfter, cancel)

	lock, _, err := Lock(ctx, repo, false, retryLock, func(msg string) {}, func(format string, args ...interface{}) {})
	duration := time.Since(start)

	test.Assert(t, err != nil,
		"create normal lock with exclusively locked repo didn't return an error")
	test.Assert(t, strings.Contains(err.Error(), "context canceled"),
		"create normal lock with exclusively locked repo didn't return the correct error")
	test.Assert(t, cancelAfter <= duration && duration < retryLock-10*time.Millisecond,
		"create normal lock with exclusively locked repo didn't return in time, duration %v", duration)

	test.OK(t, lock.Unlock())
	test.OK(t, elock.Unlock())
}
func TestLockWaitSuccess(t *testing.T) {
	repo := openLockTestRepo(t, nil)

	elock, _, err := Lock(context.TODO(), repo, true, 0, func(msg string) {}, func(format string, args ...interface{}) {})
	test.OK(t, err)

	retryLock := 200 * time.Millisecond
	unlockAfter := 40 * time.Millisecond

	time.AfterFunc(unlockAfter, func() {
		test.OK(t, elock.Unlock())
	})

	lock, _, err := Lock(context.TODO(), repo, false, retryLock, func(msg string) {}, func(format string, args ...interface{}) {})
	test.OK(t, err)

	test.OK(t, lock.Unlock())
}
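
As a usage note (standard `go test` tooling, not part of this commit), the relocated tests can be run from a checkout of the restic repository with, for example:

	go test ./internal/repository -run 'TestLock' -v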