From fa920f54c70f473152428f368a79d32f3f39d159 Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Wed, 16 Oct 2024 13:02:48 -0600 Subject: [PATCH] CI: e2e: fix checkpoint flake Two flakes seen in the last three months. One of them was in August, so it's not related to ongoing criu-4.0 problems. Suspected cause: race waiting for "podman run --rm" container to transition from stopped to removed. Solution: allow a grace period of up to four seconds, re-checking the container count once per second. Also: add explanations to the Expect()s, remove unnecessary code, and tighten up the CID check. Signed-off-by: Ed Santiago --- test/e2e/checkpoint_test.go | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/test/e2e/checkpoint_test.go b/test/e2e/checkpoint_test.go index c0f85ade8a..e6abe28d23 100644 --- a/test/e2e/checkpoint_test.go +++ b/test/e2e/checkpoint_test.go @@ -711,12 +711,13 @@ var _ = Describe("Podman checkpoint", func() { session := podmanTest.Podman([]string{"run", "--network=none", "-d", "--rm", ALPINE, "top"}) session.WaitWithDefaultTimeout() Expect(session).Should(ExitCleanly()) - Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1)) + Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1), "# of running containers at start") cid := session.OutputToString() fileName := filepath.Join(podmanTest.TempDir, "/checkpoint-"+cid+".tar.gz") // Change the container's root file-system - result := podmanTest.Podman([]string{"exec", cid, "/bin/sh", "-c", "echo test" + cid + "test > /test.output"}) + signalFile := "/test.output" + result := podmanTest.Podman([]string{"exec", cid, "touch", signalFile}) result.WaitWithDefaultTimeout() Expect(result).Should(ExitCleanly()) @@ -725,23 +726,32 @@ var _ = Describe("Podman checkpoint", func() { result.WaitWithDefaultTimeout() Expect(result).Should(ExitCleanly()) - Expect(result.OutputToString()).To(ContainSubstring(cid)) - Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0)) - 
Expect(podmanTest.NumberOfContainers()).To(Equal(0)) + Expect(result.OutputToString()).To(Equal(cid), "checkpoint output") + // Allow a few seconds for --rm to take effect + ncontainers := podmanTest.NumberOfContainers() + for try := 0; try < 4; try++ { + if ncontainers == 0 { + break + } + time.Sleep(time.Second) + ncontainers = podmanTest.NumberOfContainers() + } + Expect(ncontainers).To(Equal(0), "# of containers (total) after checkpoint") // Restore the container result = podmanTest.Podman([]string{"container", "restore", "--ignore-rootfs", "-i", fileName}) result.WaitWithDefaultTimeout() - Expect(result).Should(ExitCleanly()) - Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1)) - Expect(podmanTest.NumberOfContainers()).To(Equal(1)) - Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Up")) + + runCheck := podmanTest.Podman([]string{"ps", "-a", "--noheading", "--no-trunc", "--format", "{{.ID}} {{.State}}"}) + runCheck.WaitWithDefaultTimeout() + Expect(runCheck).Should(ExitCleanly()) + Expect(runCheck.OutputToString()).To(Equal(cid+" running"), "podman ps, after restore") // Verify the changes to the container's root file-system - result = podmanTest.Podman([]string{"exec", cid, "cat", "/test.output"}) + result = podmanTest.Podman([]string{"exec", cid, "cat", signalFile}) result.WaitWithDefaultTimeout() - Expect(result).Should(ExitWithError(1, "cat: can't open '/test.output': No such file or directory")) + Expect(result).Should(ExitWithError(1, "cat: can't open '"+signalFile+"': No such file or directory")) // Remove exported checkpoint os.Remove(fileName)