diff --git a/wgengine/magicsock/magicsock_test.go b/wgengine/magicsock/magicsock_test.go
index 6d1d2c14e..330bf1a76 100644
--- a/wgengine/magicsock/magicsock_test.go
+++ b/wgengine/magicsock/magicsock_test.go
@@ -18,6 +18,7 @@
 	"testing"
 	"time"
 
+	"github.com/google/go-cmp/cmp"
 	"github.com/tailscale/wireguard-go/device"
 	"github.com/tailscale/wireguard-go/tun/tuntest"
 	"github.com/tailscale/wireguard-go/wgcfg"
@@ -424,16 +425,38 @@ func TestTwoDevicePing(t *testing.T) {
 		ping2(t)
 	})
 
-	pingSeq := func(t *testing.T, count int) {
+	pingSeq := func(t *testing.T, count int, totalTime time.Duration) {
 		msg := func(i int) []byte {
 			b := tuntest.Ping(net.ParseIP("1.0.0.2"), net.ParseIP("1.0.0.1"))
 			b[len(b)-1] = byte(i) // set seq num
 			return b
 		}
 
+		// Space out ping transmissions so that the overall
+		// transmission happens in totalTime.
+		//
+		// We do this because the packet spray logic in magicsock is
+		// time-based to allow for reliable NAT traversal. However,
+		// for the packet spraying test further down, there needs to
+		// be at least 1 sprayed packet that is not the handshake, in
+		// case the handshake gets eaten by the race resolution logic.
+		//
+		// This is an inherent "race by design" in our current
+		// magicsock+wireguard-go codebase: sometimes, racing
+		// handshakes will result in a sub-optimal path for a few
+		// hundred milliseconds, until a subsequent spray corrects the
+		// issue. In order for the test to reflect that magicsock
+		// works as designed, we have to space out packet transmission
+		// here.
+		interPacketGap := totalTime / time.Duration(count)
+		if interPacketGap < 1*time.Millisecond {
+			interPacketGap = 0
+		}
+
 		for i := 0; i < count; i++ {
 			b := msg(i)
 			tun1.Outbound <- b
+			time.Sleep(interPacketGap)
 		}
 
 		for i := 0; i < count; i++ {
@@ -441,7 +464,7 @@ func TestTwoDevicePing(t *testing.T) {
 			select {
 			case msgRecv := <-tun2.Inbound:
 				if !bytes.Equal(b, msgRecv) {
-					t.Errorf("return ping %d did not transit correctly", i)
+					t.Errorf("return ping %d did not transit correctly: %s", i, cmp.Diff(b, msgRecv))
 				}
 			case <-time.After(3 * time.Second):
 				t.Fatalf("return ping %d did not transit", i)
@@ -451,7 +474,7 @@ func TestTwoDevicePing(t *testing.T) {
 	}
 
 	t.Run("ping 1.0.0.1 x50", func(t *testing.T) {
-		pingSeq(t, 50)
+		pingSeq(t, 50, 0)
 	})
 
 	// Add DERP relay.
@@ -473,7 +496,7 @@ func TestTwoDevicePing(t *testing.T) {
 		defer func() {
 			t.Logf("DERP vars: %s", derpServer.ExpVar().String())
 		}()
-		pingSeq(t, 20)
+		pingSeq(t, 20, 0)
 	})
 
 	// Disable real route.
@@ -497,22 +520,32 @@ func TestTwoDevicePing(t *testing.T) {
 				t.Logf("cfg1: %v", uapi2)
 			}
 		}()
-		pingSeq(t, 20)
+		pingSeq(t, 20, 0)
 	})
 
-	// Put one real route.
+	dev1.RemoveAllPeers()
+	dev2.RemoveAllPeers()
+
+	// Give one peer a non-DERP endpoint. We expect the other to
+	// accept it via roamAddr.
 	cfgs[0].Peers[0].Endpoints = ep0
 	if ep2 := cfgs[1].Peers[0].Endpoints; len(ep2) != 1 {
 		t.Errorf("unexpected peer endpoints in dev2: %v", ep2)
 	}
-	if err := dev1.Reconfig(&cfgs[0]); err != nil {
-		t.Fatal(err)
-	}
 	if err := dev2.Reconfig(&cfgs[1]); err != nil {
 		t.Fatal(err)
 	}
+	if err := dev1.Reconfig(&cfgs[0]); err != nil {
+		t.Fatal(err)
+	}
+	// Dear future human debugging a test failure here: this test is
+	// flaky, and very infrequently will drop 1-2 of the 50 ping
+	// packets. This does not affect normal operation of tailscaled,
+	// but makes this test fail.
+	//
+	// TODO(danderson): finish root-causing and de-flake this test.
 	t.Run("one real route is enough thanks to spray", func(t *testing.T) {
-		pingSeq(t, 50)
+		pingSeq(t, 50, 700*time.Millisecond)
 
 		ep2 := dev2.Config().Peers[0].Endpoints
 		if len(ep2) != 2 {