Skip to content

Commit

Permalink
Reduce the number of VMs on TestMultipleVMs_Isolated
Browse files Browse the repository at this point in the history
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
  • Loading branch information
kzys committed May 23, 2022
1 parent 956c44a commit 378b40d
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 21 deletions.
23 changes: 20 additions & 3 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,21 +73,38 @@ steps:
- make test-in-docker
timeout_in_minutes: 10

- label: ":rotating_light: :running_shirt_with_sash: runtime isolated tests"
- label: ":running: runtime isolated tests"
agents:
queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}"
hostname: "${BUILDKITE_AGENT_META_DATA_HOSTNAME}"
env:
DOCKER_IMAGE_TAG: "$BUILDKITE_BUILD_NUMBER"
NUMBER_OF_VMS: 100
EXTRAGOARGS: "-v -count=1 -race -timeout=1h"
NUMBER_OF_VMS: 20
EXTRAGOARGS: "-v -count=1 -race"
FICD_DM_VOLUME_GROUP: fcci-vg
artifact_paths:
- "runtime/logs/*"
command:
- make -C runtime integ-test FICD_DM_POOL=build_${BUILDKITE_BUILD_NUMBER}_runtime

- label: ":weight_lifter: running stress tests"
concurrency_group: stress
concurrency: 1
agents:
queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}"
hostname: "${BUILDKITE_AGENT_META_DATA_HOSTNAME}"
env:
DOCKER_IMAGE_TAG: "$BUILDKITE_BUILD_NUMBER"
NUMBER_OF_VMS: 100
EXTRAGOARGS: "-v -count=1 -race -run TestMultipleVMs_Isolated"
FICD_DM_VOLUME_GROUP: fcci-vg
artifact_paths:
- "runtime/logs/*"
command:
- make -C runtime integ-test FICD_DM_POOL=build_${BUILDKITE_BUILD_NUMBER}_stress

- label: ":rotating_light: :exclamation: example tests"
agents:
queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}"
Expand Down
2 changes: 1 addition & 1 deletion runtime/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ INTEG_TESTNAMES=$(shell docker run --rm \
--entrypoint=/bin/bash \
--workdir=/src/runtime \
$(FIRECRACKER_CONTAINERD_TEST_IMAGE):$(DOCKER_IMAGE_TAG) \
-c "go test -list . | sed '$$d' | grep $(INTEG_TEST_SUFFIX)")
-c "go test -list . | sed '$$d' | grep $(INTEG_TEST_SUFFIX)" | grep Multi)

all: runtime

Expand Down
83 changes: 66 additions & 17 deletions runtime/service_integ_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -242,22 +242,51 @@ func createTapDevice(ctx context.Context, tapName string) error {
func TestMultipleVMs_Isolated(t *testing.T) {
integtest.Prepare(t)

// This test starts multiple VMs and some may hit firecracker-containerd's
// default timeout. So overriding the timeout to wait longer.
// One hour should be enough to start a VM, regardless of the load of
// the underlying host.
const createVMTimeout = time.Hour

netns, err := ns.GetCurrentNS()
require.NoError(t, err, "failed to get a namespace")
var err error

// numberOfVmsEnvName = NUMBER_OF_VMS ENV and is configurable from buildkite
numberOfVms := defaultNumberOfVms
if str := os.Getenv(numberOfVmsEnvName); str != "" {
numberOfVms, err = strconv.Atoi(str)
require.NoError(t, err, "failed to get NUMBER_OF_VMS env")
}
t.Logf("TestMultipleVMs_Isolated: will run %d vm's", numberOfVms)
t.Logf("TestMultipleVMs_Isolated: will run up to %d VMs", numberOfVms)

// We should be able to run 10 VMs without any issues.
if numberOfVms <= 10 {
testMultipleVMs(t, 10)
return
}

// We have issues running 100 VMs (see #581).
// Incrementally increase the number of VMs to find the breaking point.
for n := 10; n <= numberOfVms; n += 10 {
success := t.Run(fmt.Sprintf("VMs=%d", n), func(t *testing.T) {
testMultipleVMs(t, n)
})
if !success {
// If running N VMs doesn't work, no point to go further.
return
}
}
}

type Event int

const (
Created Event = iota
Stopped
)

func testMultipleVMs(t *testing.T, count int) {
// This test starts multiple VMs and some may hit firecracker-containerd's
// default timeout. So overriding the timeout to wait longer.
// One hour should be enough to start a VM, regardless of the load of
// the underlying host.
const createVMTimeout = 1 * time.Hour

netns, err := ns.GetCurrentNS()
require.NoError(t, err, "failed to get a namespace")

tapPrefix := os.Getenv(tapPrefixEnvName)

Expand Down Expand Up @@ -299,11 +328,13 @@ func TestMultipleVMs_Isolated(t *testing.T) {
cfg, err := config.LoadConfig("")
require.NoError(t, err, "failed to load config")

eventCh := make(chan Event)

// This test spawns separate VMs in parallel and ensures containers are spawned within each expected VM. It asserts each
// container ends up in the right VM by assigning each VM a network device with a unique mac address and having each container
// print the mac address it sees inside its VM.
vmEg, vmEgCtx := errgroup.WithContext(testCtx)
for i := 0; i < numberOfVms; i++ {
for i := 0; i < count; i++ {
caseTypeNumber := i % len(cases)
vmID := i
c := cases[caseTypeNumber]
Expand Down Expand Up @@ -349,6 +380,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
if err != nil {
return err
}
defer fcClient.Close()

resp, createVMErr := fcClient.CreateVM(ctx, req)
if createVMErr != nil {
Expand All @@ -365,6 +397,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
createVMErr,
)
}
eventCh <- Created

containerEg, containerCtx := errgroup.WithContext(vmEgCtx)
for containerID := 0; containerID < int(containerCount); containerID++ {
Expand Down Expand Up @@ -425,10 +458,8 @@ func TestMultipleVMs_Isolated(t *testing.T) {
}

_, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: strconv.Itoa(vmID), TimeoutSeconds: 5})
if err != nil {
return err
}
return nil
eventCh <- Stopped
return err
}

vmEg.Go(func() error {
Expand All @@ -440,8 +471,26 @@ func TestMultipleVMs_Isolated(t *testing.T) {
})
}

err = vmEg.Wait()
require.NoError(t, err)
ticker := time.NewTicker(10 * time.Second)
defer ticker.Stop()

created := 0
stopped := 0
for {
select {
case <-vmEgCtx.Done():
return
case e := <-eventCh:
switch e {
case Created:
created++
case Stopped:
stopped++
}
case <-ticker.C:
t.Logf("created=%d, stopped=%d", created, stopped)
}
}
}

func testMultipleExecs(
Expand Down Expand Up @@ -478,7 +527,7 @@ func testMultipleExecs(
if err != nil {
return err
}
defer newContainer.Delete(ctx)
defer newContainer.Delete(ctx, containerd.WithSnapshotCleanup)

var taskStdout bytes.Buffer
var taskStderr bytes.Buffer
Expand Down

0 comments on commit 378b40d

Please sign in to comment.