From 69c4649bb53e328afad6241bed8d8823e539f4fa Mon Sep 17 00:00:00 2001 From: Kazuyoshi Kato Date: Fri, 20 May 2022 22:57:20 +0000 Subject: [PATCH] Run TestMultipleVMs_Isolated as a separate step This test is still unstable (see #581), and when it fails we tend to just retry it. Signed-off-by: Kazuyoshi Kato --- .buildkite/pipeline.yml | 25 +++++++++-- runtime/service_integ_test.go | 84 ++++++++++++++++++++++++++++------- 2 files changed, 89 insertions(+), 20 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a644a9023..56007e805 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -73,21 +73,40 @@ steps: - make test-in-docker timeout_in_minutes: 10 - - label: ":rotating_light: :running_shirt_with_sash: runtime isolated tests" + - label: ":running: runtime isolated tests" agents: queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}" distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}" hostname: "${BUILDKITE_AGENT_META_DATA_HOSTNAME}" env: DOCKER_IMAGE_TAG: "$BUILDKITE_BUILD_NUMBER" - NUMBER_OF_VMS: 100 - EXTRAGOARGS: "-v -count=1 -race -timeout=1h" + NUMBER_OF_VMS: 20 + EXTRAGOARGS: "-v -count=1 -race" FICD_DM_VOLUME_GROUP: fcci-vg + TAP_PREFIX: "tap_runtime_$BUILDKITE_BUILD_NUMBER" artifact_paths: - "runtime/logs/*" command: - make -C runtime integ-test FICD_DM_POOL=build_${BUILDKITE_BUILD_NUMBER}_runtime + - label: ":weight_lifter: running stress tests" + concurrency_group: stress + concurrency: 1 + agents: + queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}" + distro: "${BUILDKITE_AGENT_META_DATA_DISTRO}" + hostname: "${BUILDKITE_AGENT_META_DATA_HOSTNAME}" + env: + DOCKER_IMAGE_TAG: "$BUILDKITE_BUILD_NUMBER" + NUMBER_OF_VMS: 100 + EXTRAGOARGS: "-v -count=1 -race" + FICD_DM_VOLUME_GROUP: fcci-vg + TAP_PREFIX: "tap_stress_$BUILDKITE_BUILD_NUMBER" + artifact_paths: + - "runtime/logs/*" + command: + - make -C runtime integ-test-TestMultipleVMs_Isolated FICD_DM_POOL=build_${BUILDKITE_BUILD_NUMBER}_stress + - label: ":rotating_light: 
:exclamation: example tests" agents: queue: "${BUILDKITE_AGENT_META_DATA_QUEUE:-default}" diff --git a/runtime/service_integ_test.go b/runtime/service_integ_test.go index 64685a7eb..167a695cd 100644 --- a/runtime/service_integ_test.go +++ b/runtime/service_integ_test.go @@ -242,14 +242,7 @@ func createTapDevice(ctx context.Context, tapName string) error { func TestMultipleVMs_Isolated(t *testing.T) { integtest.Prepare(t) - // This test starts multiple VMs and some may hit firecracker-containerd's - // default timeout. So overriding the timeout to wait longer. - // One hour should be enough to start a VM, regardless of the load of - // the underlying host. - const createVMTimeout = time.Hour - - netns, err := ns.GetCurrentNS() - require.NoError(t, err, "failed to get a namespace") + var err error // numberOfVmsEnvName = NUMBER_OF_VMS ENV and is configurable from buildkite numberOfVms := defaultNumberOfVms @@ -257,7 +250,44 @@ func TestMultipleVMs_Isolated(t *testing.T) { numberOfVms, err = strconv.Atoi(str) require.NoError(t, err, "failed to get NUMBER_OF_VMS env") } - t.Logf("TestMultipleVMs_Isolated: will run %d vm's", numberOfVms) + t.Logf("TestMultipleVMs_Isolated: will run up to %d VMs", numberOfVms) + + // We should be able to run 10 VMs without any issues. + if numberOfVms <= 10 { + testMultipleVMs(t, 10) + return + } + + // We have issues running 100 VMs (see #581). + // Incrementally increase the number of VMs to find the breaking point. + for i := 10; i <= numberOfVms; i += 10 { + success := t.Run(fmt.Sprintf("VMs=%d", i), func(t *testing.T) { + testMultipleVMs(t, i) + }) + if !success { + // If running N VMs doesn't work, no point to go further. + return + } + } +} + +type Event int + +const ( + Created Event = iota + Stopped +) + +func testMultipleVMs(t *testing.T, count int) { + t.Logf("testMultipleVMs %s %d", t.Name(), count) + // This test starts multiple VMs and some may hit firecracker-containerd's + // default timeout. 
So overriding the timeout to wait longer. + // One hour should be enough to start a VM, regardless of the load of + // the underlying host. + const createVMTimeout = 1 * time.Hour + + netns, err := ns.GetCurrentNS() + require.NoError(t, err, "failed to get a namespace") tapPrefix := os.Getenv(tapPrefixEnvName) @@ -299,11 +329,13 @@ func TestMultipleVMs_Isolated(t *testing.T) { cfg, err := config.LoadConfig("") require.NoError(t, err, "failed to load config") + eventCh := make(chan Event) + // This test spawns separate VMs in parallel and ensures containers are spawned within each expected VM. It asserts each // container ends up in the right VM by assigning each VM a network device with a unique mac address and having each container // print the mac address it sees inside its VM. vmEg, vmEgCtx := errgroup.WithContext(testCtx) - for i := 0; i < numberOfVms; i++ { + for i := 0; i < count; i++ { caseTypeNumber := i % len(cases) vmID := i c := cases[caseTypeNumber] @@ -349,6 +381,7 @@ func TestMultipleVMs_Isolated(t *testing.T) { if err != nil { return err } + defer fcClient.Close() resp, createVMErr := fcClient.CreateVM(ctx, req) if createVMErr != nil { @@ -365,6 +398,7 @@ func TestMultipleVMs_Isolated(t *testing.T) { createVMErr, ) } + eventCh <- Created containerEg, containerCtx := errgroup.WithContext(vmEgCtx) for containerID := 0; containerID < int(containerCount); containerID++ { @@ -425,10 +459,8 @@ func TestMultipleVMs_Isolated(t *testing.T) { } _, err = fcClient.StopVM(ctx, &proto.StopVMRequest{VMID: strconv.Itoa(vmID), TimeoutSeconds: 5}) - if err != nil { - return err - } - return nil + eventCh <- Stopped + return err } vmEg.Go(func() error { @@ -440,8 +472,26 @@ func TestMultipleVMs_Isolated(t *testing.T) { }) } - err = vmEg.Wait() - require.NoError(t, err) + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + var created int + for stopped := 0; stopped < count; { + select { + case <-vmEgCtx.Done(): + require.NoError(t, vmEg.Wait()) + return 
+ case e := <-eventCh: + switch e { + case Created: + created++ + case Stopped: + stopped++ + } + case <-ticker.C: + t.Logf("created=%d/%d stopped=%d/%d", created, count, stopped, count) + } + } } func testMultipleExecs( @@ -478,7 +528,7 @@ func testMultipleExecs( if err != nil { return err } - defer newContainer.Delete(ctx) + defer newContainer.Delete(ctx, containerd.WithSnapshotCleanup) var taskStdout bytes.Buffer var taskStderr bytes.Buffer