mirror of
https://github.com/tailscale/tailscale.git
synced 2024-11-29 13:05:46 +00:00
cmd/tailscaled: don't block ipnserver startup behind engine init on Windows
With this change, the ipnserver's safesocket.Listen (the localhost tcp.Listen) happens right away, before any synchronous TUN/DNS/Engine/etc setup work, which might be slow, especially on early boot on Windows. Because the safesocket.Listen starts up early, that means localhost TCP dials (the safesocket.Connect from the GUI) complete successfully and thus the GUI avoids the MessageBox error. (I verified that pacifies it, even without a Listener.Accept; I'd feared that Windows localhost was maybe special and avoided the normal listener backlog). Once the GUI can then connect immediately without errors, the various timeouts then matter less, because the backend is no longer trying to race against the GUI's timeout. So keep retrying on errors for a minute, or 10 minutes if the system just booted in the past 10 minutes. This should fix the problem with Windows 10 desktops auto-logging in and starting the Tailscale frontend which was then showing a MessageBox error about failing to connect to tailscaled, which was slow coming up because the Windows networking stack wasn't up yet. Fingers crossed. Fixes #1313 (previously #1187, etc) Signed-off-by: Brad Fitzpatrick <bradfitz@tailscale.com>
This commit is contained in:
parent
7d8f082ff7
commit
91c9c33036
@ -159,18 +159,16 @@ func beFirewallKillswitch() bool {
|
||||
|
||||
func startIPNServer(ctx context.Context, logid string) error {
|
||||
var logf logger.Logf = log.Printf
|
||||
var eng wgengine.Engine
|
||||
var err error
|
||||
|
||||
getEngine := func() (wgengine.Engine, error) {
|
||||
getEngineRaw := func() (wgengine.Engine, error) {
|
||||
dev, devName, err := tstun.New(logf, "Tailscale")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("TUN: %w", err)
|
||||
}
|
||||
r, err := router.New(logf, dev)
|
||||
if err != nil {
|
||||
dev.Close()
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("Router: %w", err)
|
||||
}
|
||||
if wrapNetstack {
|
||||
r = netstack.NewSubnetRouterWrapper(r)
|
||||
@ -179,7 +177,7 @@ func startIPNServer(ctx context.Context, logid string) error {
|
||||
if err != nil {
|
||||
r.Close()
|
||||
dev.Close()
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("DNS: %w", err)
|
||||
}
|
||||
eng, err := wgengine.NewUserspaceEngine(logf, wgengine.Config{
|
||||
Tun: dev,
|
||||
@ -190,7 +188,7 @@ func startIPNServer(ctx context.Context, logid string) error {
|
||||
if err != nil {
|
||||
r.Close()
|
||||
dev.Close()
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("Engine: %w", err)
|
||||
}
|
||||
onlySubnets := true
|
||||
if wrapNetstack {
|
||||
@ -199,60 +197,82 @@ func startIPNServer(ctx context.Context, logid string) error {
|
||||
return wgengine.NewWatchdog(eng), nil
|
||||
}
|
||||
|
||||
if msg := os.Getenv("TS_DEBUG_WIN_FAIL"); msg != "" {
|
||||
err = fmt.Errorf("pretending to be a service failure: %v", msg)
|
||||
type engineOrError struct {
|
||||
Engine wgengine.Engine
|
||||
Err error
|
||||
}
|
||||
engErrc := make(chan engineOrError)
|
||||
t0 := time.Now()
|
||||
go func() {
|
||||
const ms = time.Millisecond
|
||||
for try := 1; ; try++ {
|
||||
logf("tailscaled: getting engine... (try %v)", try)
|
||||
t1 := time.Now()
|
||||
eng, err := getEngineRaw()
|
||||
d, dt := time.Since(t1).Round(ms), time.Since(t1).Round(ms)
|
||||
if err != nil {
|
||||
logf("tailscaled: engine fetch error (try %v) in %v (total %v, sysUptime %v): %v",
|
||||
try, d, dt, windowsUptime().Round(time.Second), err)
|
||||
} else {
|
||||
// We have a bunch of bug reports of wgengine.NewUserspaceEngine returning a few different errors,
|
||||
// all intermittently. A few times I (Brad) have also seen sporadic failures that simply
|
||||
// restarting fixed. So try a few times.
|
||||
for try := 1; try <= 5; try++ {
|
||||
if try > 1 {
|
||||
// Only sleep a bit. Don't do some massive backoff because
|
||||
// the frontend GUI has a 30 second timeout on connecting to us,
|
||||
// but even 5 seconds is too long for them to get any results.
|
||||
// 5 tries * 1 second each seems fine.
|
||||
time.Sleep(time.Second)
|
||||
}
|
||||
eng, err = getEngine()
|
||||
if err != nil {
|
||||
logf("wgengine.NewUserspaceEngine: (try %v) %v", try, err)
|
||||
continue
|
||||
}
|
||||
if try > 1 {
|
||||
logf("wgengine.NewUserspaceEngine: ended up working on try %v", try)
|
||||
}
|
||||
break
|
||||
logf("tailscaled: got engine on try %v in %v (total %v)", try, d, dt)
|
||||
} else {
|
||||
logf("tailscaled: got engine in %v", d)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
// Log the error, but don't fatalf. We want to
|
||||
// propagate the error message to the UI frontend. So
|
||||
// we continue and tell the ipnserver to return that
|
||||
// in a Notify message.
|
||||
logf("wgengine.NewUserspaceEngine: %v", err)
|
||||
timer := time.NewTimer(5 * time.Second)
|
||||
engErrc <- engineOrError{eng, err}
|
||||
if err == nil {
|
||||
timer.Stop()
|
||||
return
|
||||
}
|
||||
<-timer.C
|
||||
}
|
||||
}()
|
||||
|
||||
opts := ipnserver.Options{
|
||||
Port: 41112,
|
||||
SurviveDisconnects: false,
|
||||
StatePath: args.statepath,
|
||||
}
|
||||
if err != nil {
|
||||
|
||||
// getEngine is called by ipnserver to get the engine. It's
|
||||
// not called concurrently and is not called again once it
|
||||
// successfully returns an engine.
|
||||
getEngine := func() (wgengine.Engine, error) {
|
||||
if msg := os.Getenv("TS_DEBUG_WIN_FAIL"); msg != "" {
|
||||
return nil, fmt.Errorf("pretending to be a service failure: %v", msg)
|
||||
}
|
||||
for {
|
||||
res := <-engErrc
|
||||
if res.Engine != nil {
|
||||
return res.Engine, nil
|
||||
}
|
||||
if time.Since(t0) < time.Minute || windowsUptime() < 10*time.Minute {
|
||||
// Ignore errors during early boot. Windows 10 auto logs in the GUI
|
||||
// way sooner than the networking stack components start up.
|
||||
// So the network will fail for a bit (and require a few tries) while
|
||||
// the GUI is still fine.
|
||||
continue
|
||||
}
|
||||
// Return nicer errors to users, annotated with logids, which helps
|
||||
// when they file bugs.
|
||||
rawGetEngine := getEngine // raw == without verbose logid-containing error
|
||||
getEngine = func() (wgengine.Engine, error) {
|
||||
eng, err := rawGetEngine()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("wgengine.NewUserspaceEngine: %v\n\nlogid: %v", err, logid)
|
||||
return nil, fmt.Errorf("%w\n\nlogid: %v", res.Err, logid)
|
||||
}
|
||||
return eng, nil
|
||||
}
|
||||
} else {
|
||||
getEngine = ipnserver.FixedEngine(eng)
|
||||
}
|
||||
err = ipnserver.Run(ctx, logf, logid, getEngine, opts)
|
||||
err := ipnserver.Run(ctx, logf, logid, getEngine, opts)
|
||||
if err != nil {
|
||||
logf("ipnserver.Run: %v", err)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
var (
|
||||
kernel32 = windows.NewLazySystemDLL("kernel32.dll")
|
||||
getTickCount64Proc = kernel32.NewProc("GetTickCount64")
|
||||
)
|
||||
|
||||
func windowsUptime() time.Duration {
|
||||
r, _, _ := getTickCount64Proc.Call()
|
||||
return time.Duration(int64(r)) * time.Millisecond
|
||||
}
|
||||
|
@ -592,6 +592,7 @@ func (s *server) writeToClients(b []byte) {
|
||||
// Run runs a Tailscale backend service.
|
||||
// The getEngine func is called repeatedly, once per connection, until it returns an engine successfully.
|
||||
func Run(ctx context.Context, logf logger.Logf, logid string, getEngine func() (wgengine.Engine, error), opts Options) error {
|
||||
getEngine = getEngineUntilItWorksWrapper(getEngine)
|
||||
runDone := make(chan struct{})
|
||||
defer close(runDone)
|
||||
|
||||
@ -652,46 +653,6 @@ func Run(ctx context.Context, logf logger.Logf, logid string, getEngine func() (
|
||||
eng, err := getEngine()
|
||||
if err != nil {
|
||||
logf("ipnserver: initial getEngine call: %v", err)
|
||||
|
||||
// Issue 1187: on Windows, in unattended mode,
|
||||
// sometimes we try 5 times and fail to create the
|
||||
// engine before the system's ready. Hack until the
|
||||
// bug if fixed properly: if we're running in
|
||||
// unattended mode on Windows, keep trying forever,
|
||||
// waiting for the machine to be ready (networking to
|
||||
// come up?) and then dial our own safesocket TCP
|
||||
// listener to wake up the usual mechanism that lets
|
||||
// us surface getEngine errors to UI clients. (We
|
||||
// don't want to just call getEngine in a loop without
|
||||
// the listener.Accept, as we do want to handle client
|
||||
// connections so we can tell them about errors)
|
||||
|
||||
bootRaceWaitForEngine, bootRaceWaitForEngineCancel := context.WithTimeout(context.Background(), time.Minute)
|
||||
if runtime.GOOS == "windows" && opts.AutostartStateKey != "" {
|
||||
logf("ipnserver: in unattended mode, waiting for engine availability")
|
||||
getEngine = getEngineUntilItWorksWrapper(getEngine)
|
||||
// Wait for it to be ready.
|
||||
go func() {
|
||||
defer bootRaceWaitForEngineCancel()
|
||||
t0 := time.Now()
|
||||
for {
|
||||
time.Sleep(10 * time.Second)
|
||||
if _, err := getEngine(); err != nil {
|
||||
logf("ipnserver: unattended mode engine load: %v", err)
|
||||
continue
|
||||
}
|
||||
c, err := net.Dial("tcp", listen.Addr().String())
|
||||
logf("ipnserver: engine created after %v; waking up Accept: Dial error: %v", time.Since(t0).Round(time.Second), err)
|
||||
if err == nil {
|
||||
c.Close()
|
||||
}
|
||||
break
|
||||
}
|
||||
}()
|
||||
} else {
|
||||
bootRaceWaitForEngineCancel()
|
||||
}
|
||||
|
||||
for i := 1; ctx.Err() == nil; i++ {
|
||||
c, err := listen.Accept()
|
||||
if err != nil {
|
||||
@ -699,7 +660,6 @@ func Run(ctx context.Context, logf logger.Logf, logid string, getEngine func() (
|
||||
bo.BackOff(ctx, err)
|
||||
continue
|
||||
}
|
||||
<-bootRaceWaitForEngine.Done()
|
||||
logf("ipnserver: try%d: trying getEngine again...", i)
|
||||
eng, err = getEngine()
|
||||
if err == nil {
|
||||
|
Loading…
Reference in New Issue
Block a user