Skip to content

Commit

Permalink
CI: e2e: fix checkpoint flake
Browse files Browse the repository at this point in the history
Two flakes seen in the last three months. One of them was in
August, so it's not related to ongoing criu-4.0 problems.

Suspected cause: race waiting for "podman run --rm" container
to transition from stopped to removed.

Solution: allow a grace period of about four seconds, re-checking
once per second (up to five checks in total).

Also: add explanations to the Expect()s, remove unnecessary
code, and tighten up the CID check.

Signed-off-by: Ed Santiago <[email protected]>
  • Loading branch information
edsantiago committed Oct 17, 2024
1 parent 993ecd5 commit fa920f5
Showing 1 changed file with 21 additions and 11 deletions.
32 changes: 21 additions & 11 deletions test/e2e/checkpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -711,12 +711,13 @@ var _ = Describe("Podman checkpoint", func() {
session := podmanTest.Podman([]string{"run", "--network=none", "-d", "--rm", ALPINE, "top"})
session.WaitWithDefaultTimeout()
Expect(session).Should(ExitCleanly())
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1), "# of running containers at start")
cid := session.OutputToString()
fileName := filepath.Join(podmanTest.TempDir, "/checkpoint-"+cid+".tar.gz")

// Change the container's root file-system
result := podmanTest.Podman([]string{"exec", cid, "/bin/sh", "-c", "echo test" + cid + "test > /test.output"})
signalFile := "/test.output"
result := podmanTest.Podman([]string{"exec", cid, "touch", signalFile})
result.WaitWithDefaultTimeout()
Expect(result).Should(ExitCleanly())

Expand All @@ -725,23 +726,32 @@ var _ = Describe("Podman checkpoint", func() {
result.WaitWithDefaultTimeout()

Expect(result).Should(ExitCleanly())
Expect(result.OutputToString()).To(ContainSubstring(cid))
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(0))
Expect(podmanTest.NumberOfContainers()).To(Equal(0))
Expect(result.OutputToString()).To(Equal(cid), "checkpoint output")
// Allow a few seconds for --rm to take effect
ncontainers := podmanTest.NumberOfContainers()
for try := 0; try < 4; try++ {
if ncontainers == 0 {
break
}
time.Sleep(time.Second)
ncontainers = podmanTest.NumberOfContainers()
}
Expect(ncontainers).To(Equal(0), "# of containers (total) after checkpoint")

// Restore the container
result = podmanTest.Podman([]string{"container", "restore", "--ignore-rootfs", "-i", fileName})
result.WaitWithDefaultTimeout()

Expect(result).Should(ExitCleanly())
Expect(podmanTest.NumberOfContainersRunning()).To(Equal(1))
Expect(podmanTest.NumberOfContainers()).To(Equal(1))
Expect(podmanTest.GetContainerStatus()).To(ContainSubstring("Up"))

runCheck := podmanTest.Podman([]string{"ps", "-a", "--noheading", "--no-trunc", "--format", "{{.ID}} {{.State}}"})
runCheck.WaitWithDefaultTimeout()
Expect(runCheck).Should(ExitCleanly())
Expect(runCheck.OutputToString()).To(Equal(cid+" running"), "podman ps, after restore")

// Verify the changes to the container's root file-system
result = podmanTest.Podman([]string{"exec", cid, "cat", "/test.output"})
result = podmanTest.Podman([]string{"exec", cid, "cat", signalFile})
result.WaitWithDefaultTimeout()
Expect(result).Should(ExitWithError(1, "cat: can't open '/test.output': No such file or directory"))
Expect(result).Should(ExitWithError(1, "cat: can't open '"+signalFile+"': No such file or directory"))

// Remove exported checkpoint
os.Remove(fileName)
Expand Down

0 comments on commit fa920f5

Please sign in to comment.