cmd/tailscaled,util/syspolicy/source,util/winutil/gp: disallow acquiring the GP lock during service startup

In v1.78, we started acquiring the GP lock when reading policy settings. This led to a deadlock during
Tailscale installation via Group Policy Software Installation because the GP engine holds the write lock
for the duration of policy processing, which in turn waits for the installation to complete, which in turn
waits for the service to enter the running state.

In this PR, we prevent the acquisition of GP locks (aka EnterCriticalPolicySection) during service startup
and update the Windows Registry-based util/syspolicy/source.PlatformPolicyStore to handle this failure
gracefully. The GP lock is somewhat optional; it’s safe to read policy settings without it, but acquiring
the lock is recommended when reading multiple values to prevent the Group Policy engine from modifying
settings mid-read and to avoid inconsistent results.

Fixes #14416

Signed-off-by: Nick Khyl <nickk@tailscale.com>
This commit is contained in:
Nick Khyl 2025-01-16 15:48:07 -06:00 committed by Nick Khyl
parent 413fb5b933
commit f0db47338e
4 changed files with 138 additions and 8 deletions

View File

@ -55,6 +55,7 @@ import (
"tailscale.com/util/osdiag"
"tailscale.com/util/syspolicy"
"tailscale.com/util/winutil"
"tailscale.com/util/winutil/gp"
"tailscale.com/version"
"tailscale.com/wf"
)
@ -70,6 +71,22 @@ func init() {
}
}
// permitPolicyLocks lifts the restriction on acquiring [gp.PolicyLock]s.
// It is meant to be invoked once the service is running, starts out as a
// no-op, and is safe to call multiple times.
var permitPolicyLocks = func() {}

func init() {
	if !isWindowsService() {
		// Not running as a Windows service: no restriction is installed
		// and permitPolicyLocks remains a no-op.
		return
	}
	// Keep [gp.PolicyLock]s from being acquired until the service enters the
	// running state. If tailscaled was started because a GPSI policy is
	// installing Tailscale, Group Policy holds the write side of the GP lock
	// while it waits for the installation to finish and tailscaled to start,
	// so waiting on that lock here could deadlock.
	// See tailscale/tailscale#14416 for more information.
	permitPolicyLocks = gp.RestrictPolicyLocks()
}
const serviceName = "Tailscale"
// Application-defined command codes between 128 and 255
@ -109,13 +126,13 @@ func tstunNewWithWindowsRetries(logf logger.Logf, tunName string) (_ tun.Device,
}
}
// isWindowsService reports the value returned by svc.IsWindowsService and
// terminates the process via log.Fatalf if that call returns an error.
//
// NOTE(review): this hunk interleaves the previous plain func declaration
// with its replacement, which wraps the same body in sync.OnceValue so the
// check is evaluated at most once and the result is reused by later calls.
func isWindowsService() bool {
var isWindowsService = sync.OnceValue(func() bool {
v, err := svc.IsWindowsService()
if err != nil {
log.Fatalf("svc.IsWindowsService failed: %v", err)
}
return v
}
})
// syslogf is a logger function that writes to the Windows event log (ie, the
// one that you see in the Windows Event Viewer). tailscaled may optionally
@ -180,6 +197,10 @@ func (service *ipnService) Execute(args []string, r <-chan svc.ChangeRequest, ch
changes <- svc.Status{State: svc.Running, Accepts: svcAccepts}
syslogf("Service running")
// It is safe to allow GP locks to be acquired now that the service
// is running.
permitPolicyLocks()
for {
select {
case <-doneCh:

View File

@ -60,6 +60,7 @@ import (
_ "tailscale.com/util/osshare"
_ "tailscale.com/util/syspolicy"
_ "tailscale.com/util/winutil"
_ "tailscale.com/util/winutil/gp"
_ "tailscale.com/version"
_ "tailscale.com/version/distro"
_ "tailscale.com/wf"

View File

@ -12,6 +12,7 @@ import (
"golang.org/x/sys/windows"
"golang.org/x/sys/windows/registry"
"tailscale.com/util/set"
"tailscale.com/util/syspolicy/internal/loggerx"
"tailscale.com/util/syspolicy/setting"
"tailscale.com/util/winutil/gp"
)
@ -29,6 +30,18 @@ var (
_ Expirable = (*PlatformPolicyStore)(nil)
)
// lockableCloser combines [Lockable] with a Close method.
// Both [gp.PolicyLock] and [optionalPolicyLock] implement it.
type lockableCloser interface {
	Lockable
	Close() error
}

// Compile-time checks that the expected types satisfy lockableCloser.
var _ lockableCloser = (*gp.PolicyLock)(nil)
var _ lockableCloser = (*optionalPolicyLock)(nil)
// PlatformPolicyStore implements [Store] by providing read access to
// Registry-based Tailscale policies, such as those configured via Group Policy or MDM.
// For better performance and consistency, it is recommended to lock it when
@ -55,7 +68,7 @@ type PlatformPolicyStore struct {
// they are being read.
//
// When both policyLock and mu need to be taken, mu must be taken before policyLock.
policyLock *gp.PolicyLock
policyLock lockableCloser
mu sync.Mutex
tsKeys []registry.Key // or nil if the [PlatformPolicyStore] hasn't been locked.
@ -108,7 +121,7 @@ func newPlatformPolicyStore(scope gp.Scope, softwareKey registry.Key, policyLock
scope: scope,
softwareKey: softwareKey,
done: make(chan struct{}),
policyLock: policyLock,
policyLock: &optionalPolicyLock{PolicyLock: policyLock},
}
}
@ -448,3 +461,68 @@ func tailscaleKeyNamesFor(scope gp.Scope) []string {
panic("unreachable")
}
}
// gpLockState describes whether a Group Policy lock is considered held,
// and if so, whether the underlying lock was actually acquired.
type gpLockState int

const (
	// gpUnlocked is the zero value: the lock is not held.
	gpUnlocked gpLockState = iota
	// gpLocked means the lock is held.
	gpLocked
	// gpLockRestricted means the lock could not be acquired due to a
	// restriction in place.
	gpLockRestricted
)
// optionalPolicyLock wraps a [gp.PolicyLock], forwarding lock and unlock
// requests to it while tolerating [gp.ErrLockRestricted].
//
// When [gp.PolicyLock.Lock] fails with [gp.ErrLockRestricted], that error is
// swallowed and the matching [optionalPolicyLock.Unlock] call does nothing.
//
// Holding the underlying GP lock is optional: policy settings can be read
// from the Registry safely without it. Taking it is still recommended when
// several settings are read together, to avoid potentially inconsistent
// results.
//
// An optionalPolicyLock must not be used concurrently.
type optionalPolicyLock struct {
	*gp.PolicyLock
	state gpLockState // current lock state; the zero value is gpUnlocked
}
// Lock attempts to acquire the underlying [gp.PolicyLock] and reports any
// failure other than a restriction.
//
// If acquisition is refused because of a restriction in place (for example,
// while the service is still starting), Lock logs the condition, treats the
// lock as held, and returns nil; the following [Unlock] call then does nothing.
//
// Calling Lock while the lock is already held is a runtime error and panics.
func (o *optionalPolicyLock) Lock() error {
	if o.state != gpUnlocked {
		panic("already locked")
	}
	err := o.PolicyLock.Lock()
	if err == nil {
		o.state = gpLocked
		return nil
	}
	if err == gp.ErrLockRestricted {
		// Proceed without the GP lock, but remember that it was never
		// acquired so that Unlock does not try to release it.
		loggerx.Errorf("GP lock not acquired: %v", err)
		o.state = gpLockRestricted
		return nil
	}
	// Any other failure leaves the state untouched (still unlocked).
	return err
}
// Unlock releases the underlying [gp.PolicyLock] if the preceding Lock call
// actually acquired it, and marks the lock as no longer held.
// Calling Unlock when the lock is not held is a runtime error and panics.
func (o *optionalPolicyLock) Unlock() {
	if o.state == gpUnlocked {
		panic("not locked")
	}
	if o.state != gpLocked && o.state != gpLockRestricted {
		panic("unreachable")
	}
	// When the state is gpLockRestricted, the GP lock was never acquired
	// because a restriction was in place during [optionalPolicyLock.Lock],
	// so there is nothing to release.
	if o.state == gpLocked {
		o.PolicyLock.Unlock()
	}
	o.state = gpUnlocked
}

View File

@ -48,10 +48,35 @@ type policyLockResult struct {
}
// ErrInvalidLockState is the error [PolicyLock.Lock] returns when the lock
// is a zero value or has already been closed.
var ErrInvalidLockState = errors.New("the lock has not been created or has already been closed")

// ErrLockRestricted is the error [PolicyLock.Lock] returns when acquisition
// is blocked by a restriction in place, such as after [RestrictPolicyLocks]
// has been called.
var ErrLockRestricted = errors.New("the lock cannot be acquired due to a restriction in place")
// policyLockRestricted counts the restrictions currently in place.
// [PolicyLock.Lock] returns [ErrLockRestricted] while it is greater than zero.
var policyLockRestricted atomic.Int32

// RestrictPolicyLocks makes every [PolicyLock.Lock] call fail with
// [ErrLockRestricted] until the returned function is invoked to lift the
// restriction.
//
// The returned function may be called any number of times, but it lifts its
// restriction only once. Restrictions nest: each call to [RestrictPolicyLocks]
// must be paired with a call to its own returned function before locks can be
// acquired again.
//
// Its main purpose is to avoid deadlocks such as tailscaled trying to take a
// policy lock during startup. When the service is started because GPSI is
// installing Tailscale, the Group Policy service holds the write lock for the
// whole installation, so tailscaled cannot take the read lock. Group Policy in
// turn waits for the installation, and therefore for tailscaled to start,
// before releasing the write lock, which would result in a deadlock.
// See tailscale/tailscale#14416 for more information.
func RestrictPolicyLocks() (removeRestriction func()) {
	policyLockRestricted.Add(1)
	var lifted sync.Once
	return func() {
		lifted.Do(func() {
			policyLockRestricted.Add(-1)
		})
	}
}
// NewMachinePolicyLock creates a PolicyLock that facilitates pausing the
// application of computer policy. To avoid deadlocks when acquiring both
// machine and user locks, acquire the user lock before the machine lock.
@ -103,13 +128,18 @@ func NewUserPolicyLock(token windows.Token) (*PolicyLock, error) {
}
// Lock locks l.
// It returns ErrNotInitialized if l has a zero value or has already been closed,
// or an Errno if the underlying Group Policy lock cannot be acquired.
// It returns [ErrInvalidLockState] if l has a zero value or has already been closed,
// [ErrLockRestricted] if the lock cannot be acquired due to a restriction in place,
// or a [syscall.Errno] if the underlying Group Policy lock cannot be acquired.
//
// As a special case, it fails with windows.ERROR_ACCESS_DENIED
// As a special case, it fails with [windows.ERROR_ACCESS_DENIED]
// if l is a user policy lock, and the corresponding user is not logged in
// interactively at the time of the call.
func (l *PolicyLock) Lock() error {
if policyLockRestricted.Load() > 0 {
return ErrLockRestricted
}
l.mu.Lock()
defer l.mu.Unlock()
if l.lockCnt.Add(2)&1 == 0 {