tailscale/tstest/integration/vms/vms_test.go
Christine Dodrill 2802a01b81
tstest/integration/vms: test vms as they are ready (#2022)
Instead of testing all the VMs at once when they are all ready, this
patch changes the testing logic so that the vms are tested as soon as
they register with testcontrol. Also limit the amount of VM ram used at
once with the `-ram-limit` flag. That uses a semaphore to guard resource
use.

Also document CentOS' sins.

Updates #1988

Signed-off-by: Christine Dodrill <xe@tailscale.com>
2021-05-31 17:04:49 -04:00

696 lines
20 KiB
Go

// Copyright (c) 2021 Tailscale Inc & AUTHORS All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build linux
package vms
import (
"context"
"crypto/sha256"
"encoding/hex"
"flag"
"fmt"
"io"
"log"
"net"
"net/http"
"os"
"os/exec"
"path"
"path/filepath"
"regexp"
"strconv"
"strings"
"sync"
"syscall"
"testing"
"text/template"
"time"
expect "github.com/google/goexpect"
"github.com/pkg/sftp"
"golang.org/x/crypto/ssh"
"golang.org/x/sync/semaphore"
"inet.af/netaddr"
"tailscale.com/net/interfaces"
"tailscale.com/tstest"
"tailscale.com/tstest/integration"
"tailscale.com/tstest/integration/testcontrol"
)
const securePassword = "hunter2"
var runVMTests = flag.Bool("run-vm-tests", false, "if set, run expensive VM based integration tests")
var vmRamLimit = flag.Int("ram-limit", 4096, "the maximum number of megabytes of ram that can be used for VMs, must be greater than or equal to 1024")
var distroRex *regexValue = func() *regexValue {
result := &regexValue{r: regexp.MustCompile(`.*`)}
flag.Var(result, "distro-regex", "The regex that matches what distros should be run")
return result
}()
type Distro struct {
name string // amazon-linux
url string // URL to a qcow2 image
sha256sum string // hex-encoded sha256 sum of contents of URL
mem int // VM memory in megabytes
packageManager string // yum/apt/dnf/zypper
}
func (d *Distro) InstallPre() string {
switch d.packageManager {
case "yum":
return ` - [ yum, update, gnupg2 ]
- [ yum, "-y", install, iptables ]`
case "zypper":
return ` - [ zypper, in, "-y", iptables ]`
case "dnf":
return ` - [ dnf, install, "-y", iptables ]`
case "apt":
return ` - [ apt-get, update ]
- [ apt-get, "-y", install, curl, "apt-transport-https", gnupg2 ]`
case "apk":
return ` - [ apk, "-U", add, curl, "ca-certificates" ]`
}
return ""
}
// fetchDistro fetches a distribution from the internet if it doesn't already exist locally. It
// also validates the sha256 sum from a known good hash.
func fetchDistro(t *testing.T, resultDistro Distro) {
t.Helper()
cdir, err := os.UserCacheDir()
if err != nil {
t.Fatalf("can't find cache dir: %v", err)
}
cdir = filepath.Join(cdir, "tailscale", "vm-test")
qcowPath := filepath.Join(cdir, "qcow2", resultDistro.sha256sum)
_, err = os.Stat(qcowPath)
if err != nil {
t.Logf("downloading distro image %s to %s", resultDistro.url, qcowPath)
fout, err := os.Create(qcowPath)
if err != nil {
t.Fatal(err)
}
resp, err := http.Get(resultDistro.url)
if err != nil {
t.Fatalf("can't fetch qcow2 for %s (%s): %v", resultDistro.name, resultDistro.url, err)
}
if resp.StatusCode != http.StatusOK {
resp.Body.Close()
t.Fatalf("%s replied %s", resultDistro.url, resp.Status)
}
_, err = io.Copy(fout, resp.Body)
resp.Body.Close()
if err != nil {
t.Fatalf("download of %s failed: %v", resultDistro.url, err)
}
err = fout.Close()
if err != nil {
t.Fatalf("can't close fout: %v", err)
}
fin, err := os.Open(qcowPath)
if err != nil {
t.Fatal(err)
}
hasher := sha256.New()
if _, err := io.Copy(hasher, fin); err != nil {
t.Fatal(err)
}
hash := hex.EncodeToString(hasher.Sum(nil))
if hash != resultDistro.sha256sum {
t.Logf("got: %q", hash)
t.Logf("want: %q", resultDistro.sha256sum)
t.Fatal("hash mismatch, someone is doing something nasty")
}
t.Logf("hash check passed (%s)", resultDistro.sha256sum)
}
}
// run runs a command or fails the test.
func run(t *testing.T, dir, prog string, args ...string) {
t.Helper()
t.Logf("running: %s %s", prog, strings.Join(args, " "))
tstest.FixLogs(t)
cmd := exec.Command(prog, args...)
cmd.Stdout = log.Writer()
cmd.Stderr = log.Writer()
cmd.Dir = dir
if err := cmd.Run(); err != nil {
t.Fatal(err)
}
}
// mkLayeredQcow makes a layered qcow image that allows us to keep the upstream
// VM images pristine and only do our changes on an overlay.
func mkLayeredQcow(t *testing.T, tdir string, d Distro) {
t.Helper()
cdir, err := os.UserCacheDir()
if err != nil {
t.Fatalf("can't find cache dir: %v", err)
}
cdir = filepath.Join(cdir, "tailscale", "vm-test")
run(t, tdir, "qemu-img", "create",
"-f", "qcow2",
"-o", "backing_file="+filepath.Join(cdir, "qcow2", d.sha256sum),
filepath.Join(tdir, d.name+".qcow2"),
)
}
var (
metaDataTempl = template.Must(template.New("meta-data.yaml").Parse(metaDataTemplate))
userDataTempl = template.Must(template.New("user-data.yaml").Parse(userDataTemplate))
)
// mkSeed makes the cloud-init seed ISO that is used to configure a VM with
// tailscale.
func mkSeed(t *testing.T, d Distro, sshKey, hostURL, tdir string, port int) {
t.Helper()
dir := filepath.Join(tdir, d.name, "seed")
os.MkdirAll(dir, 0700)
// make meta-data
{
fout, err := os.Create(filepath.Join(dir, "meta-data"))
if err != nil {
t.Fatal(err)
}
err = metaDataTempl.Execute(fout, struct {
ID string
Hostname string
}{
ID: "31337",
Hostname: d.name,
})
if err != nil {
t.Fatal(err)
}
err = fout.Close()
if err != nil {
t.Fatal(err)
}
}
// make user-data
{
fout, err := os.Create(filepath.Join(dir, "user-data"))
if err != nil {
t.Fatal(err)
}
err = userDataTempl.Execute(fout, struct {
SSHKey string
HostURL string
Hostname string
Port int
InstallPre string
Password string
}{
SSHKey: strings.TrimSpace(sshKey),
HostURL: hostURL,
Hostname: d.name,
Port: port,
InstallPre: d.InstallPre(),
Password: securePassword,
})
if err != nil {
t.Fatal(err)
}
err = fout.Close()
if err != nil {
t.Fatal(err)
}
}
run(t, tdir, "genisoimage",
"-output", filepath.Join(dir, "seed.iso"),
"-volid", "cidata", "-joliet", "-rock",
filepath.Join(dir, "meta-data"),
filepath.Join(dir, "user-data"),
)
}
// mkVM makes a KVM-accelerated virtual machine and prepares it for introduction
// to the testcontrol server. The function it returns is for killing the virtual
// machine when it is time for it to die.
func mkVM(t *testing.T, n int, d Distro, sshKey, hostURL, tdir string) func() {
t.Helper()
cdir, err := os.UserCacheDir()
if err != nil {
t.Fatalf("can't find cache dir: %v", err)
}
cdir = filepath.Join(cdir, "tailscale", "vm-test")
os.MkdirAll(filepath.Join(cdir, "qcow2"), 0755)
port := 23100 + n
fetchDistro(t, d)
mkLayeredQcow(t, tdir, d)
mkSeed(t, d, sshKey, hostURL, tdir, port)
driveArg := fmt.Sprintf("file=%s,if=virtio", filepath.Join(tdir, d.name+".qcow2"))
args := []string{
"-machine", "pc-q35-5.1,accel=kvm,usb=off,vmport=off,dump-guest-core=off",
"-netdev", fmt.Sprintf("user,hostfwd=::%d-:22,id=net0", port),
"-device", "virtio-net-pci,netdev=net0,id=net0,mac=8a:28:5c:30:1f:25",
"-m", fmt.Sprint(d.mem),
"-boot", "c",
"-drive", driveArg,
"-cdrom", filepath.Join(tdir, d.name, "seed", "seed.iso"),
"-vnc", fmt.Sprintf(":%d", n),
"-smbios", "type=1,serial=ds=nocloud;h=" + d.name,
}
t.Logf("running: qemu-system-x86_64 %s", strings.Join(args, " "))
cmd := exec.Command("qemu-system-x86_64", args...)
err = cmd.Start()
if err != nil {
t.Fatal(err)
}
time.Sleep(time.Second)
if err := cmd.Process.Signal(syscall.Signal(0)); err != nil {
t.Fatal("qemu is not running")
}
return func() {
err := cmd.Process.Kill()
if err != nil {
t.Errorf("can't kill %s (%d): %v", d.name, cmd.Process.Pid, err)
}
}
}
// ipMapping maps a hostname, SSH port and SSH IP together
type ipMapping struct {
name string
port int
ip string
}
// TestVMIntegrationEndToEnd creates a virtual machine with qemu, installs
// tailscale on it and then ensures that it connects to the network
// successfully.
func TestVMIntegrationEndToEnd(t *testing.T) {
if !*runVMTests {
t.Skip("not running integration tests (need -run-vm-tests)")
}
os.Setenv("CGO_ENABLED", "0")
if _, err := exec.LookPath("qemu-system-x86_64"); err != nil {
t.Logf("hint: nix-shell -p go -p qemu -p cdrkit --run 'go test -v -timeout=60m -run-vm-tests'")
t.Fatalf("missing dependency: %v", err)
}
if _, err := exec.LookPath("genisoimage"); err != nil {
t.Logf("hint: nix-shell -p go -p qemu -p cdrkit --run 'go test -v -timeout=60m -run-vm-tests'")
t.Fatalf("missing dependency: %v", err)
}
distros := []Distro{
// NOTE(Xe): If you run into issues getting the autoconfig to work, comment
// out all the other distros and uncomment this one. Connect with a VNC
// client with a command like this:
//
// $ vncviewer :0
//
// On NixOS you can get away with something like this:
//
// $ env NIXPKGS_ALLOW_UNFREE=1 nix-shell -p tigervnc --run 'vncviewer :0'
//
// Login as root with the password root. Then look in
// /var/log/cloud-init-output.log for what you messed up.
// {"alpine-edge", "https://xena.greedo.xeserv.us/pkg/alpine/img/alpine-edge-2021-05-18-cloud-init-within.qcow2", "b3bb15311c0bd3beffa1b554f022b75d3b7309b5fdf76fb146fe7c72b83b16d0", 256, "apk"},
{"amazon-linux", "https://cdn.amazonlinux.com/os-images/2.0.20210427.0/kvm/amzn2-kvm-2.0.20210427.0-x86_64.xfs.gpt.qcow2", "6ef9daef32cec69b2d0088626ec96410cd24afc504d57278bbf2f2ba2b7e529b", 512, "yum"},
{"centos-7", "https://cloud.centos.org/centos/7/images/CentOS-7-x86_64-GenericCloud.qcow2", "1db30c9c272fb37b00111b93dcebff16c278384755bdbe158559e9c240b73b80", 512, "yum"},
{"centos-8", "https://cloud.centos.org/centos/8/x86_64/images/CentOS-8-GenericCloud-8.3.2011-20201204.2.x86_64.qcow2", "7ec97062618dc0a7ebf211864abf63629da1f325578868579ee70c495bed3ba0", 768, "dnf"},
{"debian-9", "https://cdimage.debian.org/cdimage/openstack/9.13.21-20210511/debian-9.13.21-20210511-openstack-amd64.qcow2", "0667a08e2d947b331aee068db4bbf3a703e03edaf5afa52e23d534adff44b62a", 512, "apt"},
{"debian-10", "https://cdimage.debian.org/images/cloud/buster/20210329-591/debian-10-generic-amd64-20210329-591.qcow2", "70c61956095870c4082103d1a7a1cb5925293f8405fc6cb348588ec97e8611b0", 768, "apt"},
{"fedora-34", "https://download.fedoraproject.org/pub/fedora/linux/releases/34/Cloud/x86_64/images/Fedora-Cloud-Base-34-1.2.x86_64.qcow2", "b9b621b26725ba95442d9a56cbaa054784e0779a9522ec6eafff07c6e6f717ea", 768, "dnf"},
{"opensuse-leap-15-1", "https://download.opensuse.org/repositories/Cloud:/Images:/Leap_15.1/images/openSUSE-Leap-15.1-OpenStack.x86_64.qcow2", "3203e256dab5981ca3301408574b63bc522a69972fbe9850b65b54ff44a96e0a", 512, "zypper"},
{"opensuse-leap-15-2", "https://download.opensuse.org/repositories/Cloud:/Images:/Leap_15.2/images/openSUSE-Leap-15.2-OpenStack.x86_64.qcow2", "4df9cee9281d1f57d20f79dc65d76e255592b904760e73c0dd44ac753a54330f", 512, "zypper"},
{"opensuse-tumbleweed", "https://download.opensuse.org/tumbleweed/appliances/openSUSE-Tumbleweed-JeOS.x86_64-OpenStack-Cloud.qcow2", "ba3ecd281045b5019f0fb11378329a644a41870b77631ea647b128cd07eb804b", 512, "zypper"},
{"ubuntu-16-04", "https://cloud-images.ubuntu.com/xenial/current/xenial-server-cloudimg-amd64-disk1.img", "50a21bc067c05e0c73bf5d8727ab61152340d93073b3dc32eff18b626f7d813b", 512, "apt"},
{"ubuntu-18-04", "https://cloud-images.ubuntu.com/bionic/current/bionic-server-cloudimg-amd64.img", "08396cf95c18534a2e3f88289bd92d18eee76f0e75813636b3ab9f1e603816d7", 512, "apt"},
{"ubuntu-20-04", "https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-amd64.img", "513158b22ff0f08d0a078d8d60293bcddffdb17094a7809c76c52aba415ecc54", 512, "apt"},
{"ubuntu-20-10", "https://cloud-images.ubuntu.com/groovy/current/groovy-server-cloudimg-amd64.img", "e470df72fce4fb8d0ee4ef8af8eed740ee3bf51290515eb42e5c747725e98b6d", 512, "apt"},
{"ubuntu-21-04", "https://cloud-images.ubuntu.com/hirsute/current/hirsute-server-cloudimg-amd64.img", "7fab8eda0bcf6f8f6e63845ccf1e29de4706e3359c82d3888835093020fe6f05", 512, "apt"},
}
dir := t.TempDir()
rex := distroRex.Unwrap()
ln, err := net.Listen("tcp", deriveBindhost(t)+":0")
if err != nil {
t.Fatalf("can't make TCP listener: %v", err)
}
defer ln.Close()
t.Logf("host:port: %s", ln.Addr())
cs := &testcontrol.Server{}
var (
ipMu sync.Mutex
ipMap = map[string]ipMapping{}
)
mux := http.NewServeMux()
mux.Handle("/", cs)
// This handler will let the virtual machines tell the host information about that VM.
// This is used to maintain a list of port->IP address mappings that are known to be
// working. This allows later steps to connect over SSH. This returns no response to
// clients because no response is needed.
mux.HandleFunc("/myip/", func(w http.ResponseWriter, r *http.Request) {
ipMu.Lock()
defer ipMu.Unlock()
name := path.Base(r.URL.Path)
host, _, _ := net.SplitHostPort(r.RemoteAddr)
port, err := strconv.Atoi(name)
if err != nil {
log.Panicf("bad port: %v", port)
}
distro := r.UserAgent()
ipMap[distro] = ipMapping{distro, port, host}
t.Logf("%s: %v", name, host)
})
hs := &http.Server{Handler: mux}
go hs.Serve(ln)
run(t, dir, "ssh-keygen", "-t", "ed25519", "-f", "machinekey", "-N", ``)
pubkey, err := os.ReadFile(filepath.Join(dir, "machinekey.pub"))
if err != nil {
t.Fatalf("can't read ssh key: %v", err)
}
privateKey, err := os.ReadFile(filepath.Join(dir, "machinekey"))
if err != nil {
t.Fatalf("can't read ssh private key: %v", err)
}
signer, err := ssh.ParsePrivateKey(privateKey)
if err != nil {
t.Fatalf("can't parse private key: %v", err)
}
loginServer := fmt.Sprintf("http://%s", ln.Addr())
t.Logf("loginServer: %s", loginServer)
tstest.FixLogs(t)
defer tstest.UnfixLogs(t)
ramsem := semaphore.NewWeighted(int64(*vmRamLimit))
t.Run("do", func(t *testing.T) {
for n, distro := range distros {
n, distro := n, distro
if rex.MatchString(distro.name) {
t.Logf("%s matches %s", distro.name, rex)
} else {
continue
}
t.Run(distro.name, func(t *testing.T) {
ctx, done := context.WithCancel(context.Background())
defer done()
if distro.name == "opensuse-leap-15-1" {
t.Skip("OpenSUSE Leap 15.1's cloud-init image just doesn't work for some reason, see https://github.com/tailscale/tailscale/issues/1988")
}
t.Parallel()
err := ramsem.Acquire(ctx, int64(distro.mem))
if err != nil {
t.Fatalf("can't acquire ram semaphore: %v", err)
}
defer ramsem.Release(int64(distro.mem))
cancel := mkVM(t, n, distro, string(pubkey), loginServer, dir)
defer cancel()
var ipm ipMapping
t.Run("wait-for-start", func(t *testing.T) {
waiter := time.NewTicker(time.Second)
defer waiter.Stop()
var ok bool
for {
<-waiter.C
ipMu.Lock()
if ipm, ok = ipMap[distro.name]; ok {
ipMu.Unlock()
break
}
ipMu.Unlock()
}
})
testDistro(t, loginServer, signer, ipm)
})
}
})
}
func testDistro(t *testing.T, loginServer string, signer ssh.Signer, ipm ipMapping) {
t.Helper()
port := ipm.port
hostport := fmt.Sprintf("127.0.0.1:%d", port)
ccfg := &ssh.ClientConfig{
User: "root",
Auth: []ssh.AuthMethod{ssh.PublicKeys(signer), ssh.Password(securePassword)},
HostKeyCallback: ssh.InsecureIgnoreHostKey(),
}
// NOTE(Xe): This deadline loop helps to make things a bit faster, centos
// sometimes is slow at starting its sshd and will sometimes randomly kill
// SSH sessions on transition to multi-user.target. I don't know why they
// don't use socket activation.
const maxRetries = 5
var working bool
for i := 0; i < maxRetries; i++ {
cli, err := ssh.Dial("tcp", hostport, ccfg)
if err == nil {
working = true
cli.Close()
break
}
time.Sleep(10 * time.Second)
}
if !working {
t.Fatalf("can't connect to %s, tried %d times", hostport, maxRetries)
}
t.Logf("about to ssh into 127.0.0.1:%d", port)
cli, err := ssh.Dial("tcp", hostport, ccfg)
if err != nil {
t.Fatal(err)
}
copyBinaries(t, cli)
timeout := 5 * time.Minute
e, _, err := expect.SpawnSSH(cli, timeout, expect.Verbose(true), expect.VerboseWriter(log.Writer()))
if err != nil {
t.Fatalf("%d: can't register a shell session: %v", port, err)
}
defer e.Close()
t.Log("opened session")
_, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout)
if err != nil {
t.Fatalf("%d: can't get a shell: %v", port, err)
}
t.Logf("got shell for %d", port)
err = e.Send("systemctl start tailscaled.service\n")
if err != nil {
t.Fatalf("can't send command to start tailscaled: %v", err)
}
_, _, err = e.Expect(regexp.MustCompile(`(\#)`), timeout)
if err != nil {
t.Fatalf("%d: can't get a shell: %v", port, err)
}
err = e.Send(fmt.Sprintf("sudo tailscale up --login-server %s\n", loginServer))
if err != nil {
t.Fatalf("%d: can't send tailscale up command: %v", port, err)
}
_, _, err = e.Expect(regexp.MustCompile(`Success.`), timeout)
if err != nil {
t.Fatalf("not successful: %v", err)
}
}
func copyBinaries(t *testing.T, conn *ssh.Client) {
bins := integration.BuildTestBinaries(t)
cli, err := sftp.NewClient(conn)
if err != nil {
t.Fatalf("can't connect over sftp to copy binaries: %v", err)
}
mkdir(t, cli, "/usr/bin")
mkdir(t, cli, "/usr/sbin")
mkdir(t, cli, "/etc/systemd/system")
mkdir(t, cli, "/etc/default")
copyFile(t, cli, bins.Daemon, "/usr/sbin/tailscaled")
copyFile(t, cli, bins.CLI, "/usr/bin/tailscale")
// TODO(Xe): revisit this life decision, hopefully before this assumption
// breaks the test.
copyFile(t, cli, "../../../cmd/tailscaled/tailscaled.defaults", "/etc/default/tailscaled")
copyFile(t, cli, "../../../cmd/tailscaled/tailscaled.service", "/etc/systemd/system/tailscaled.service")
t.Log("tailscale installed!")
}
func mkdir(t *testing.T, cli *sftp.Client, name string) {
t.Helper()
err := cli.MkdirAll(name)
if err != nil {
t.Fatalf("can't make %s: %v", name, err)
}
}
func copyFile(t *testing.T, cli *sftp.Client, localSrc, remoteDest string) {
t.Helper()
fin, err := os.Open(localSrc)
if err != nil {
t.Fatalf("can't open: %v", err)
}
defer fin.Close()
fi, err := fin.Stat()
if err != nil {
t.Fatalf("can't stat: %v", err)
}
fout, err := cli.Create(remoteDest)
if err != nil {
t.Fatalf("can't create output file: %v", err)
}
err = fout.Chmod(fi.Mode())
if err != nil {
fout.Close()
t.Fatalf("can't chmod fout: %v", err)
}
n, err := io.Copy(fout, fin)
if err != nil {
fout.Close()
t.Fatalf("copy failed: %v", err)
}
if fi.Size() != n {
t.Fatalf("incorrect number of bytes copied: wanted: %d, got: %d", fi.Size(), n)
}
err = fout.Close()
if err != nil {
t.Fatalf("can't close fout on remote host: %v", err)
}
}
func deriveBindhost(t *testing.T) string {
t.Helper()
ifName, err := interfaces.DefaultRouteInterface()
if err != nil {
t.Fatal(err)
}
var ret string
err = interfaces.ForeachInterfaceAddress(func(i interfaces.Interface, prefix netaddr.IPPrefix) {
if ret != "" || i.Name != ifName {
return
}
ret = prefix.IP().String()
})
if ret != "" {
return ret
}
if err != nil {
t.Fatal(err)
}
t.Fatal("can't find a bindhost")
return "unreachable"
}
func TestDeriveBindhost(t *testing.T) {
t.Log(deriveBindhost(t))
}
const metaDataTemplate = `instance-id: {{.ID}}
local-hostname: {{.Hostname}}`
const userDataTemplate = `#cloud-config
#vim:syntax=yaml
cloud_config_modules:
- runcmd
cloud_final_modules:
- [users-groups, always]
- [scripts-user, once-per-instance]
users:
- name: root
ssh-authorized-keys:
- {{.SSHKey}}
- name: ts
plain_text_passwd: {{.Password}}
groups: [ wheel ]
sudo: [ "ALL=(ALL) NOPASSWD:ALL" ]
shell: /bin/sh
ssh-authorized-keys:
- {{.SSHKey}}
write_files:
- path: /etc/cloud/cloud.cfg.d/80_disable_network_after_firstboot.cfg
content: |
# Disable network configuration after first boot
network:
config: disabled
runcmd:
{{.InstallPre}}
- [ curl, "{{.HostURL}}/myip/{{.Port}}", "-H", "User-Agent: {{.Hostname}}" ]
`