tstest/integration/vms: test vms as they are ready (#2022)

Instead of testing all the VMs at once when they are all ready, this
patch changes the testing logic so that the vms are tested as soon as
they register with testcontrol. Also limit the amount of VM ram used at
once with the `-ram-limit` flag. That uses a semaphore to guard resource
use.

Also document CentOS' sins.

Updates #1988

Signed-off-by: Christine Dodrill <xe@tailscale.com>
This commit is contained in:
Christine Dodrill 2021-05-31 17:04:49 -04:00 committed by GitHub
parent eaa6507cc9
commit 2802a01b81
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -7,6 +7,7 @@
package vms
import (
"context"
"crypto/sha256"
"encoding/hex"
"flag"
@ -31,6 +32,7 @@
expect "github.com/google/goexpect"
"github.com/pkg/sftp"
"golang.org/x/crypto/ssh"
"golang.org/x/sync/semaphore"
"inet.af/netaddr"
"tailscale.com/net/interfaces"
"tailscale.com/tstest"
@ -40,7 +42,8 @@
const securePassword = "hunter2"
var runVMTests = flag.Bool("run-vm-tests", false, "if set, run expensive (10G+ ram) VM based integration tests")
var runVMTests = flag.Bool("run-vm-tests", false, "if set, run expensive VM based integration tests")
var vmRamLimit = flag.Int("ram-limit", 4096, "the maximum number of megabytes of ram that can be used for VMs, must be greater than or equal to 1024")
var distroRex *regexValue = func() *regexValue {
result := &regexValue{r: regexp.MustCompile(`.*`)}
flag.Var(result, "distro-regex", "The regex that matches what distros should be run")
@ -259,9 +262,8 @@ func mkVM(t *testing.T, n int, d Distro, sshKey, hostURL, tdir string) func() {
if err != nil {
t.Fatalf("can't find cache dir: %v", err)
}
cdir = filepath.Join(cdir, "within", "mkvm")
cdir = filepath.Join(cdir, "tailscale", "vm-test")
os.MkdirAll(filepath.Join(cdir, "qcow2"), 0755)
os.MkdirAll(filepath.Join(cdir, "seed"), 0755)
port := 23100 + n
@ -280,6 +282,7 @@ func mkVM(t *testing.T, n int, d Distro, sshKey, hostURL, tdir string) func() {
"-drive", driveArg,
"-cdrom", filepath.Join(tdir, d.name, "seed", "seed.iso"),
"-vnc", fmt.Sprintf(":%d", n),
"-smbios", "type=1,serial=ds=nocloud;h=" + d.name,
}
t.Logf("running: qemu-system-x86_64 %s", strings.Join(args, " "))
@ -378,7 +381,7 @@ func TestVMIntegrationEndToEnd(t *testing.T) {
var (
ipMu sync.Mutex
ipMap = []ipMapping{}
ipMap = map[string]ipMapping{}
)
mux := http.NewServeMux()
@ -398,7 +401,8 @@ func TestVMIntegrationEndToEnd(t *testing.T) {
if err != nil {
log.Panicf("bad port: %v", port)
}
ipMap = append(ipMap, ipMapping{r.UserAgent(), port, host})
distro := r.UserAgent()
ipMap[distro] = ipMapping{distro, port, host}
t.Logf("%s: %v", name, host)
})
@ -424,140 +428,129 @@ func TestVMIntegrationEndToEnd(t *testing.T) {
loginServer := fmt.Sprintf("http://%s", ln.Addr())
t.Logf("loginServer: %s", loginServer)
var numDistros = 0
tstest.FixLogs(t)
defer tstest.UnfixLogs(t)
cancels := make(chan func(), len(distros))
ramsem := semaphore.NewWeighted(int64(*vmRamLimit))
t.Run("mkvm", func(t *testing.T) {
t.Run("do", func(t *testing.T) {
for n, distro := range distros {
n, distro := n, distro
if rex.MatchString(distro.name) {
t.Logf("%s matches %s", distro.name, rex)
numDistros++
} else {
continue
}
t.Run(distro.name, func(t *testing.T) {
ctx, done := context.WithCancel(context.Background())
defer done()
if distro.name == "opensuse-leap-15-1" {
t.Skip("OpenSUSE Leap 15.1's cloud-init image just doesn't work for some reason, see https://github.com/tailscale/tailscale/issues/1988")
}
t.Parallel()
err := ramsem.Acquire(ctx, int64(distro.mem))
if err != nil {
t.Fatalf("can't acquire ram semaphore: %v", err)
}
defer ramsem.Release(int64(distro.mem))
cancel := mkVM(t, n, distro, string(pubkey), loginServer, dir)
cancels <- cancel
defer cancel()
var ipm ipMapping
t.Run("wait-for-start", func(t *testing.T) {
waiter := time.NewTicker(time.Second)
defer waiter.Stop()
var ok bool
for {
<-waiter.C
ipMu.Lock()
if ipm, ok = ipMap[distro.name]; ok {
ipMu.Unlock()
break
}
ipMu.Unlock()
}
})
testDistro(t, loginServer, signer, ipm)
})
}
})
}
close(cancels)
for cancel := range cancels {
//lint:ignore SA9001 They do actually get ran
defer cancel()
if len(cancels) == 0 {
t.Log("all VMs started")
break
}
func testDistro(t *testing.T, loginServer string, signer ssh.Signer, ipm ipMapping) {
t.Helper()
port := ipm.port
hostport := fmt.Sprintf("127.0.0.1:%d", port)
ccfg := &ssh.ClientConfig{
User: "root",
Auth: []ssh.AuthMethod{ssh.PublicKeys(signer), ssh.Password(securePassword)},
HostKeyCallback: ssh.InsecureIgnoreHostKey(),
}
t.Run("wait-for-vms", func(t *testing.T) {
t.Log("waiting for VMs to register")
waiter := time.NewTicker(time.Second)
defer waiter.Stop()
n := 0
for {
<-waiter.C
ipMu.Lock()
if len(ipMap) == numDistros {
ipMu.Unlock()
break
} else {
if n%30 == 0 {
t.Logf("ipMap: %d", len(ipMap))
t.Logf("distros: %d", numDistros)
}
}
n++
ipMu.Unlock()
// NOTE(Xe): This deadline loop helps to make things a bit faster, centos
// sometimes is slow at starting its sshd and will sometimes randomly kill
// SSH sessions on transition to multi-user.target. I don't know why they
// don't use socket activation.
const maxRetries = 5
var working bool
for i := 0; i < maxRetries; i++ {
cli, err := ssh.Dial("tcp", hostport, ccfg)
if err == nil {
working = true
cli.Close()
break
}
})
ipMu.Lock()
defer ipMu.Unlock()
t.Run("join-net", func(t *testing.T) {
for _, ipm := range ipMap {
ipm := ipm
port := ipm.port
t.Run(ipm.name, func(t *testing.T) {
tstest.FixLogs(t)
t.Parallel()
time.Sleep(10 * time.Second)
}
hostport := fmt.Sprintf("127.0.0.1:%d", port)
if !working {
t.Fatalf("can't connect to %s, tried %d times", hostport, maxRetries)
}
// NOTE(Xe): This retry loop helps to make things a bit faster, centos sometimes is slow at starting its sshd. I don't know why they don't use socket activation.
const maxRetries = 5
var working bool
for i := 0; i < maxRetries; i++ {
conn, err := net.Dial("tcp", hostport)
if err == nil {
working = true
conn.Close()
break
}
t.Logf("about to ssh into 127.0.0.1:%d", port)
cli, err := ssh.Dial("tcp", hostport, ccfg)
if err != nil {
t.Fatal(err)
}
copyBinaries(t, cli)
time.Sleep(5 * time.Second)
}
timeout := 5 * time.Minute
if !working {
t.Fatalf("can't connect to %s, tried %d times", hostport, maxRetries)
}
e, _, err := expect.SpawnSSH(cli, timeout, expect.Verbose(true), expect.VerboseWriter(log.Writer()))
if err != nil {
t.Fatalf("%d: can't register a shell session: %v", port, err)
}
defer e.Close()
t.Logf("about to ssh into 127.0.0.1:%d", port)
cli, err := ssh.Dial("tcp", hostport, &ssh.ClientConfig{
User: "root",
Auth: []ssh.AuthMethod{ssh.PublicKeys(signer), ssh.Password(securePassword)},
HostKeyCallback: ssh.InsecureIgnoreHostKey(),
})
if err != nil {
t.Fatal(err)
}
copyBinaries(t, cli)
t.Log("opened session")
timeout := 5 * time.Minute
e, _, err := expect.SpawnSSH(cli, timeout, expect.Verbose(true), expect.VerboseWriter(log.Writer()))
if err != nil {
t.Fatalf("%d: can't register a shell session: %v", port, err)
}
defer e.Close()
t.Log("opened session")
_, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout)
if err != nil {
t.Fatalf("%d: can't get a shell: %v", port, err)
}
t.Logf("got shell for %d", port)
err = e.Send("systemctl start tailscaled.service\n")
if err != nil {
t.Fatalf("can't send command to start tailscaled: %v", err)
}
_, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout)
if err != nil {
t.Fatalf("%d: can't get a shell: %v", port, err)
}
err = e.Send(fmt.Sprintf("sudo tailscale up --login-server %s\n", loginServer))
if err != nil {
t.Fatalf("%d: can't send tailscale up command: %v", port, err)
}
_, _, err = e.Expect(regexp.MustCompile(`Success.`), timeout)
if err != nil {
t.Fatalf("not successful: %v", err)
}
})
}
})
if numNodes := cs.NumNodes(); numNodes != len(ipMap) {
t.Errorf("wanted %d nodes, got: %d", len(ipMap), numNodes)
_, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout)
if err != nil {
t.Fatalf("%d: can't get a shell: %v", port, err)
}
t.Logf("got shell for %d", port)
err = e.Send("systemctl start tailscaled.service\n")
if err != nil {
t.Fatalf("can't send command to start tailscaled: %v", err)
}
_, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout)
if err != nil {
t.Fatalf("%d: can't get a shell: %v", port, err)
}
err = e.Send(fmt.Sprintf("sudo tailscale up --login-server %s\n", loginServer))
if err != nil {
t.Fatalf("%d: can't send tailscale up command: %v", port, err)
}
_, _, err = e.Expect(regexp.MustCompile(`Success.`), timeout)
if err != nil {
t.Fatalf("not successful: %v", err)
}
}