From 0fba9e7570487146ca823ad4ddc06f9524cb4a3c Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Tue, 16 Apr 2024 09:10:50 -0700 Subject: [PATCH] cmd/tailscale/cli: prevent concurrent Start calls in 'up' Seems to deflake tstest/integration tests. I can't reproduce it anymore on one of my VMs that was consistently flaking after a dozen runs before. Now I can run hundreds of times. Updates #11649 Fixes #7036 Change-Id: I2f7d4ae97500d507bdd78af9e92cd1242e8e44b8 Signed-off-by: Brad Fitzpatrick --- cmd/tailscale/cli/up.go | 27 +++++++++++++++++++++----- tstest/integration/integration_test.go | 18 ++++++++--------- wgengine/magicsock/magicsock_test.go | 2 +- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/cmd/tailscale/cli/up.go b/cmd/tailscale/cli/up.go index a9dc8a3b5..951ed4edd 100644 --- a/cmd/tailscale/cli/up.go +++ b/cmd/tailscale/cli/up.go @@ -496,11 +496,23 @@ func runUp(ctx context.Context, cmd string, args []string, upArgs upArgsT) (retE running := make(chan bool, 1) // gets value once in state ipn.Running pumpErr := make(chan error, 1) - var printed bool // whether we've yet printed anything to stdout or stderr - var loginOnce sync.Once - startLoginInteractive := func() { loginOnce.Do(func() { localClient.StartLoginInteractive(ctx) }) } + // localAPIMu should be held while doing mutable LocalAPI calls + // to the backend. In particular, it prevents StartLoginInteractive from + // being called from the watcher goroutine while the Start call from + // the other goroutine is in progress. + // See https://github.com/tailscale/tailscale/issues/7036#issuecomment-2053771466 + // TODO(bradfitz): simplify this once #11649 is cleaned up and Start is + // hopefully removed. + var localAPIMu sync.Mutex + + startLoginInteractive := sync.OnceFunc(func() { + localAPIMu.Lock() + defer localAPIMu.Unlock() + localClient.StartLoginInteractive(ctx) + }) go func() { + var printed bool // whether we've yet printed anything to stdout or stderr for { n, err := watcher.Next() if err != nil { @@ -574,12 +586,14 @@ func runUp(ctx context.Context, cmd string, args []string, upArgs upArgsT) (retE // Special case: bare "tailscale up" means to just start // running, if there's ever been a login. if simpleUp { + localAPIMu.Lock() _, err := localClient.EditPrefs(ctx, &ipn.MaskedPrefs{ Prefs: ipn.Prefs{ WantRunning: true, }, WantRunningSet: true, }) + localAPIMu.Unlock() if err != nil { return err } @@ -596,10 +610,13 @@ func runUp(ctx context.Context, cmd string, args []string, upArgs upArgsT) (retE if err != nil { return err } - if err := localClient.Start(ctx, ipn.Options{ + localAPIMu.Lock() + err = localClient.Start(ctx, ipn.Options{ AuthKey: authKey, UpdatePrefs: prefs, - }); err != nil { + }) + localAPIMu.Unlock() + if err != nil { return err } if upArgs.forceReauth { diff --git a/tstest/integration/integration_test.go b/tstest/integration/integration_test.go index 0dfaee5a6..41c04fce2 100644 --- a/tstest/integration/integration_test.go +++ b/tstest/integration/integration_test.go @@ -345,7 +345,6 @@ func TestConfigFileAuthKey(t *testing.T) { func TestTwoNodes(t *testing.T) { tstest.Shard(t) - flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/3598") tstest.Parallel(t) env := newTestEnv(t) @@ -400,7 +399,6 @@ func TestTwoNodes(t *testing.T) { // PeersRemoved set) saying that the second node disappeared. func TestIncrementalMapUpdatePeersRemoved(t *testing.T) { tstest.Shard(t) - flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/3598") tstest.Parallel(t) env := newTestEnv(t) @@ -658,17 +656,14 @@ func TestNoControlConnWhenDown(t *testing.T) { d2 := n1.StartDaemon() n1.AwaitResponding() - st := n1.MustStatus() - if got, want := st.BackendState, "Stopped"; got != want { - t.Fatalf("after restart, state = %q; want %q", got, want) - } + n1.AwaitBackendState("Stopped") ip2 := n1.AwaitIP4() if ip1 != ip2 { t.Errorf("IPs different: %q vs %q", ip1, ip2) } - // The real test: verify our daemon doesn't have an HTTP request open.: + // The real test: verify our daemon doesn't have an HTTP request open. if n := env.Control.InServeMap(); n != 0 { t.Errorf("in serve map = %d; want 0", n) } @@ -1007,7 +1002,6 @@ func newTestEnv(t testing.TB, opts ...testEnvOpt) *testEnv { if runtime.GOOS == "windows" { t.Skip("not tested/working on Windows yet") } - flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/7036") derpMap := RunDERPAndSTUN(t, logger.Discard, "127.0.0.1") logc := new(LogCatcher) control := &testcontrol.Server{ @@ -1389,6 +1383,10 @@ func (n *testNode) AwaitIP6() netip.Addr { // AwaitRunning waits for n to reach the IPN state "Running". func (n *testNode) AwaitRunning() { + n.AwaitBackendState("Running") +} + +func (n *testNode) AwaitBackendState(state string) { t := n.env.t t.Helper() if err := tstest.WaitFor(20*time.Second, func() error { @@ -1396,8 +1394,8 @@ func (n *testNode) AwaitRunning() { if err != nil { return err } - if st.BackendState != "Running" { - return fmt.Errorf("in state %q", st.BackendState) + if st.BackendState != state { + return fmt.Errorf("in state %q; want %q", st.BackendState, state) } return nil }); err != nil { diff --git a/wgengine/magicsock/magicsock_test.go b/wgengine/magicsock/magicsock_test.go index 7605bdce9..8bbab15e0 100644 --- a/wgengine/magicsock/magicsock_test.go +++ b/wgengine/magicsock/magicsock_test.go @@ -626,7 +626,7 @@ func (localhostListener) ListenPacket(ctx context.Context, network, address stri } func TestTwoDevicePing(t *testing.T) { - flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/1277") + flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/11762") l, ip := localhostListener{}, netaddr.IPv4(127, 0, 0, 1) n := &devices{ m1: l,