tstest/integration/vms: test vms as they are ready (#2022)

Instead of waiting to test all of the VMs at once when every one of them
is ready, this patch changes the testing logic so that each VM is tested
as soon as it registers with testcontrol. Also limit the amount of VM RAM
in use at once with the `-ram-limit` flag, which uses a weighted semaphore
to guard resource use.
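
As a sketch of that mechanism (a self-contained toy, not the test's
actual code; the VM names and sizes below are made up), the semaphore
from golang.org/x/sync/semaphore is weighted in megabytes of RAM:

	package main

	import (
		"context"
		"log"
		"sync"

		"golang.org/x/sync/semaphore"
	)

	func main() {
		const ramLimitMB = 4096 // what the -ram-limit flag controls
		sem := semaphore.NewWeighted(ramLimitMB)

		vms := []struct {
			name  string
			memMB int64
		}{{"alpine", 512}, {"centos", 768}, {"debian", 512}}

		var wg sync.WaitGroup
		for _, vm := range vms {
			vm := vm
			wg.Add(1)
			go func() {
				defer wg.Done()
				// Blocks until this VM's RAM fits under the cap.
				if err := sem.Acquire(context.Background(), vm.memMB); err != nil {
					log.Fatalf("acquire: %v", err)
				}
				defer sem.Release(vm.memMB)
				log.Printf("booting %s with %d MB", vm.name, vm.memMB)
				// ... boot the VM, run tests, shut down ...
			}()
		}
		wg.Wait()
	}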

Also document CentOS' sins.

Updates #1988

Signed-off-by: Christine Dodrill <xe@tailscale.com>
Author: Christine Dodrill <xe@tailscale.com>
Date:   2021-05-31 17:04:49 -04:00 (committed by GitHub)
Commit: 2802a01b81 (parent: eaa6507cc9)

@@ -7,6 +7,7 @@
 package vms
 
 import (
+	"context"
 	"crypto/sha256"
 	"encoding/hex"
 	"flag"
@ -31,6 +32,7 @@ import (
expect "github.com/google/goexpect" expect "github.com/google/goexpect"
"github.com/pkg/sftp" "github.com/pkg/sftp"
"golang.org/x/crypto/ssh" "golang.org/x/crypto/ssh"
"golang.org/x/sync/semaphore"
"inet.af/netaddr" "inet.af/netaddr"
"tailscale.com/net/interfaces" "tailscale.com/net/interfaces"
"tailscale.com/tstest" "tailscale.com/tstest"
@@ -40,7 +42,8 @@ import (
 const securePassword = "hunter2"
 
-var runVMTests = flag.Bool("run-vm-tests", false, "if set, run expensive (10G+ ram) VM based integration tests")
+var runVMTests = flag.Bool("run-vm-tests", false, "if set, run expensive VM based integration tests")
+var vmRamLimit = flag.Int("ram-limit", 4096, "the maximum number of megabytes of ram that can be used for VMs, must be greater than or equal to 1024")
 
 var distroRex *regexValue = func() *regexValue {
 	result := &regexValue{r: regexp.MustCompile(`.*`)}
 	flag.Var(result, "distro-regex", "The regex that matches what distros should be run")
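
(For context: regexValue, used by -distro-regex above, is a small
flag.Value wrapper around *regexp.Regexp. The following is a sketch of
what such a type looks like; the field name r matches its use in this
diff, but the method bodies are assumed, not copied from the package.)

	type regexValue struct {
		r *regexp.Regexp
	}

	func (rv *regexValue) String() string {
		if rv.r == nil {
			return ""
		}
		return rv.r.String()
	}

	func (rv *regexValue) Set(s string) error {
		r, err := regexp.Compile(s) // surface bad patterns as flag errors
		if err != nil {
			return err
		}
		rv.r = r
		return nil
	}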
@@ -259,9 +262,8 @@ func mkVM(t *testing.T, n int, d Distro, sshKey, hostURL, tdir string) func() {
 	if err != nil {
 		t.Fatalf("can't find cache dir: %v", err)
 	}
 
-	cdir = filepath.Join(cdir, "within", "mkvm")
+	cdir = filepath.Join(cdir, "tailscale", "vm-test")
 	os.MkdirAll(filepath.Join(cdir, "qcow2"), 0755)
-	os.MkdirAll(filepath.Join(cdir, "seed"), 0755)
 
 	port := 23100 + n
@@ -280,6 +282,7 @@ func mkVM(t *testing.T, n int, d Distro, sshKey, hostURL, tdir string) func() {
 		"-drive", driveArg,
 		"-cdrom", filepath.Join(tdir, d.name, "seed", "seed.iso"),
 		"-vnc", fmt.Sprintf(":%d", n),
+		"-smbios", "type=1,serial=ds=nocloud;h=" + d.name,
 	}
 
 	t.Logf("running: qemu-system-x86_64 %s", strings.Join(args, " "))
@@ -378,7 +381,7 @@ func TestVMIntegrationEndToEnd(t *testing.T) {
 	var (
 		ipMu  sync.Mutex
-		ipMap = []ipMapping{}
+		ipMap = map[string]ipMapping{}
 	)
 
 	mux := http.NewServeMux()
@@ -398,7 +401,8 @@ func TestVMIntegrationEndToEnd(t *testing.T) {
 		if err != nil {
 			log.Panicf("bad port: %v", port)
 		}
-		ipMap = append(ipMap, ipMapping{r.UserAgent(), port, host})
+		distro := r.UserAgent()
+		ipMap[distro] = ipMapping{distro, port, host}
 		t.Logf("%s: %v", name, host)
 	})
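
(The map is keyed on the HTTP User-Agent because each VM reports its
distro name there when it phones home to register. ipMapping's shape
below is assumed from the constructor call in this hunk, not copied
from the package:)

	type ipMapping struct {
		name string // distro name, taken from the User-Agent header
		port int    // host port forwarded to the VM's sshd
		ip   string // address the VM registered from
	}

Writes to ipMap are guarded by ipMu, since every VM's registration
handler can fire concurrently with the per-distro subtests that poll
the map.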
@@ -424,140 +428,129 @@ func TestVMIntegrationEndToEnd(t *testing.T) {
 	loginServer := fmt.Sprintf("http://%s", ln.Addr())
 	t.Logf("loginServer: %s", loginServer)
 
-	var numDistros = 0
+	tstest.FixLogs(t)
+	defer tstest.UnfixLogs(t)
 
-	cancels := make(chan func(), len(distros))
-	t.Run("mkvm", func(t *testing.T) {
+	ramsem := semaphore.NewWeighted(int64(*vmRamLimit))
+
+	t.Run("do", func(t *testing.T) {
 		for n, distro := range distros {
 			n, distro := n, distro
 			if rex.MatchString(distro.name) {
 				t.Logf("%s matches %s", distro.name, rex)
-				numDistros++
 			} else {
 				continue
 			}
 
 			t.Run(distro.name, func(t *testing.T) {
+				ctx, done := context.WithCancel(context.Background())
+				defer done()
+
+				if distro.name == "opensuse-leap-15-1" {
+					t.Skip("OpenSUSE Leap 15.1's cloud-init image just doesn't work for some reason, see https://github.com/tailscale/tailscale/issues/1988")
+				}
+
 				t.Parallel()
 
+				err := ramsem.Acquire(ctx, int64(distro.mem))
+				if err != nil {
+					t.Fatalf("can't acquire ram semaphore: %v", err)
+				}
+				defer ramsem.Release(int64(distro.mem))
+
 				cancel := mkVM(t, n, distro, string(pubkey), loginServer, dir)
-				cancels <- cancel
+				defer cancel()
+
+				var ipm ipMapping
+
+				t.Run("wait-for-start", func(t *testing.T) {
+					waiter := time.NewTicker(time.Second)
+					defer waiter.Stop()
+
+					var ok bool
+					for {
+						<-waiter.C
+						ipMu.Lock()
+						if ipm, ok = ipMap[distro.name]; ok {
+							ipMu.Unlock()
+							break
+						}
+						ipMu.Unlock()
+					}
+				})
+
+				testDistro(t, loginServer, signer, ipm)
 			})
 		}
 	})
-
-	close(cancels)
-	for cancel := range cancels {
-		//lint:ignore SA9001 They do actually get ran
-		defer cancel()
-
-		if len(cancels) == 0 {
-			t.Log("all VMs started")
-			break
-		}
-	}
-
-	t.Run("wait-for-vms", func(t *testing.T) {
-		t.Log("waiting for VMs to register")
-		waiter := time.NewTicker(time.Second)
-		defer waiter.Stop()
-
-		n := 0
-		for {
-			<-waiter.C
-			ipMu.Lock()
-			if len(ipMap) == numDistros {
-				ipMu.Unlock()
-				break
-			} else {
-				if n%30 == 0 {
-					t.Logf("ipMap: %d", len(ipMap))
-					t.Logf("distros: %d", numDistros)
-				}
-			}
-			n++
-			ipMu.Unlock()
-		}
-	})
-
-	ipMu.Lock()
-	defer ipMu.Unlock()
-
-	t.Run("join-net", func(t *testing.T) {
-		for _, ipm := range ipMap {
-			ipm := ipm
-			port := ipm.port
-			t.Run(ipm.name, func(t *testing.T) {
-				tstest.FixLogs(t)
-				t.Parallel()
-
-				hostport := fmt.Sprintf("127.0.0.1:%d", port)
-
-				// NOTE(Xe): This retry loop helps to make things a bit faster, centos sometimes is slow at starting its sshd. I don't know why they don't use socket activation.
-				const maxRetries = 5
-				var working bool
-				for i := 0; i < maxRetries; i++ {
-					conn, err := net.Dial("tcp", hostport)
-					if err == nil {
-						working = true
-						conn.Close()
-						break
-					}
-
-					time.Sleep(5 * time.Second)
-				}
-
-				if !working {
-					t.Fatalf("can't connect to %s, tried %d times", hostport, maxRetries)
-				}
-
-				t.Logf("about to ssh into 127.0.0.1:%d", port)
-				cli, err := ssh.Dial("tcp", hostport, &ssh.ClientConfig{
-					User:            "root",
-					Auth:            []ssh.AuthMethod{ssh.PublicKeys(signer), ssh.Password(securePassword)},
-					HostKeyCallback: ssh.InsecureIgnoreHostKey(),
-				})
-				if err != nil {
-					t.Fatal(err)
-				}
-				copyBinaries(t, cli)
-
-				timeout := 5 * time.Minute
-
-				e, _, err := expect.SpawnSSH(cli, timeout, expect.Verbose(true), expect.VerboseWriter(log.Writer()))
-				if err != nil {
-					t.Fatalf("%d: can't register a shell session: %v", port, err)
-				}
-				defer e.Close()
-
-				t.Log("opened session")
-
-				_, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout)
-				if err != nil {
-					t.Fatalf("%d: can't get a shell: %v", port, err)
-				}
-				t.Logf("got shell for %d", port)
-				err = e.Send("systemctl start tailscaled.service\n")
-				if err != nil {
-					t.Fatalf("can't send command to start tailscaled: %v", err)
-				}
-				_, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout)
-				if err != nil {
-					t.Fatalf("%d: can't get a shell: %v", port, err)
-				}
-				err = e.Send(fmt.Sprintf("sudo tailscale up --login-server %s\n", loginServer))
-				if err != nil {
-					t.Fatalf("%d: can't send tailscale up command: %v", port, err)
-				}
-				_, _, err = e.Expect(regexp.MustCompile(`Success.`), timeout)
-				if err != nil {
-					t.Fatalf("not successful: %v", err)
-				}
-			})
-		}
-	})
-
-	if numNodes := cs.NumNodes(); numNodes != len(ipMap) {
-		t.Errorf("wanted %d nodes, got: %d", len(ipMap), numNodes)
-	}
-}
+}
+
+func testDistro(t *testing.T, loginServer string, signer ssh.Signer, ipm ipMapping) {
+	t.Helper()
+	port := ipm.port
+	hostport := fmt.Sprintf("127.0.0.1:%d", port)
+	ccfg := &ssh.ClientConfig{
+		User:            "root",
+		Auth:            []ssh.AuthMethod{ssh.PublicKeys(signer), ssh.Password(securePassword)},
+		HostKeyCallback: ssh.InsecureIgnoreHostKey(),
+	}
+
+	// NOTE(Xe): This deadline loop helps to make things a bit faster, centos
+	// sometimes is slow at starting its sshd and will sometimes randomly kill
+	// SSH sessions on transition to multi-user.target. I don't know why they
+	// don't use socket activation.
+	const maxRetries = 5
+	var working bool
+	for i := 0; i < maxRetries; i++ {
+		cli, err := ssh.Dial("tcp", hostport, ccfg)
+		if err == nil {
+			working = true
+			cli.Close()
+			break
+		}
+
+		time.Sleep(10 * time.Second)
+	}
+
+	if !working {
+		t.Fatalf("can't connect to %s, tried %d times", hostport, maxRetries)
+	}
+
+	t.Logf("about to ssh into 127.0.0.1:%d", port)
+	cli, err := ssh.Dial("tcp", hostport, ccfg)
+	if err != nil {
+		t.Fatal(err)
+	}
+	copyBinaries(t, cli)
+
+	timeout := 5 * time.Minute
+
+	e, _, err := expect.SpawnSSH(cli, timeout, expect.Verbose(true), expect.VerboseWriter(log.Writer()))
+	if err != nil {
+		t.Fatalf("%d: can't register a shell session: %v", port, err)
+	}
+	defer e.Close()
+
+	t.Log("opened session")
+
+	_, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout)
+	if err != nil {
+		t.Fatalf("%d: can't get a shell: %v", port, err)
+	}
+	t.Logf("got shell for %d", port)
+	err = e.Send("systemctl start tailscaled.service\n")
+	if err != nil {
+		t.Fatalf("can't send command to start tailscaled: %v", err)
+	}
+	_, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout)
+	if err != nil {
+		t.Fatalf("%d: can't get a shell: %v", port, err)
+	}
+	err = e.Send(fmt.Sprintf("sudo tailscale up --login-server %s\n", loginServer))
+	if err != nil {
+		t.Fatalf("%d: can't send tailscale up command: %v", port, err)
+	}
+	_, _, err = e.Expect(regexp.MustCompile(`Success.`), timeout)
+	if err != nil {
+		t.Fatalf("not successful: %v", err)
+	}
+}