From b06c58b4a5d7ab42d52560ce3b1e4913eb1d071b Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Thu, 9 May 2024 16:08:55 +0200 Subject: [PATCH 1/2] libpod: wait for healthy on main thread wait for the healthy status on the thread where the container lock is held. Otherwise, if it is performed from a go routine, a different thread is used (since the runtime.LockOSThread() call doesn't have any effect), causing pthread_mutex_unlock() to fail with EPERM. Closes: https://github.com/containers/podman/issues/22651 Signed-off-by: Giuseppe Scrivano --- libpod/container_api.go | 10 ++++++++-- libpod/container_internal.go | 32 +++++++++++++++++++++--------- libpod/oci_conmon_attach_common.go | 4 ++-- pkg/domain/infra/abi/containers.go | 14 +++++++------ test/system/260-sdnotify.bats | 9 +++++++++ 5 files changed, 50 insertions(+), 19 deletions(-) diff --git a/libpod/container_api.go b/libpod/container_api.go index 1b49266469..4168b1d7cc 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -115,7 +115,10 @@ func (c *Container) Start(ctx context.Context, recursive bool) (finalErr error) } // Start the container - return c.start(ctx) + if err := c.start(); err != nil { + return err + } + return c.waitForHealthy(ctx) } // Update updates the given container. @@ -194,6 +197,9 @@ func (c *Container) StartAndAttach(ctx context.Context, streams *define.AttachSt opts.Start = true opts.Started = startedChan + // attach and start the container on a different thread. waitForHealthy must + // be done later, as it requires to run on the same thread that holds the lock + // for the container. if err := c.ociRuntime.Attach(c, opts); err != nil { attachChan <- err } @@ -207,7 +213,7 @@ func (c *Container) StartAndAttach(ctx context.Context, streams *define.AttachSt c.newContainerEvent(events.Attach) } - return attachChan, nil + return attachChan, c.waitForHealthy(ctx) } // RestartWithTimeout restarts a running container and takes a given timeout in uint diff --git a/libpod/container_internal.go b/libpod/container_internal.go index 7f909b78db..62eb460634 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -330,10 +330,10 @@ func (c *Container) handleRestartPolicy(ctx context.Context) (_ bool, retErr err return false, err } } - if err := c.start(ctx); err != nil { + if err := c.start(); err != nil { return false, err } - return true, nil + return true, c.waitForHealthy(ctx) } // Ensure that the container is in a specific state or state. @@ -1208,7 +1208,7 @@ func (c *Container) reinit(ctx context.Context, retainRetries bool) error { // Initialize (if necessary) and start a container // Performs all necessary steps to start a container that is not running -// Does not lock or check validity +// Does not lock or check validity, requires to run on the same thread that holds the lock for the container. func (c *Container) initAndStart(ctx context.Context) (retErr error) { // If we are ContainerStateUnknown, throw an error if c.state.State == define.ContainerStateUnknown { @@ -1253,11 +1253,14 @@ func (c *Container) initAndStart(ctx context.Context) (retErr error) { } // Now start the container - return c.start(ctx) + if err := c.start(); err != nil { + return err + } + return c.waitForHealthy(ctx) } // Internal, non-locking function to start a container -func (c *Container) start(ctx context.Context) error { +func (c *Container) start() error { if c.config.Spec.Process != nil { logrus.Debugf("Starting container %s with command %v", c.ID(), c.config.Spec.Process.Args) } @@ -1298,10 +1301,14 @@ func (c *Container) start(ctx context.Context) error { c.newContainerEvent(events.Start) - if err := c.save(); err != nil { - return err - } + return c.save() +} +// waitForHealthy, when sdNotifyMode == SdNotifyModeHealthy, waits up to the DefaultWaitInterval +// for the container to get into the healthy state and reports the status to the notify socket. +// The function unlocks the container lock, so it must be called from the same thread that locks +// the container. +func (c *Container) waitForHealthy(ctx context.Context) error { if c.config.SdNotifyMode != define.SdNotifyModeHealthy { return nil } @@ -1315,6 +1322,9 @@ func (c *Container) start(ctx context.Context) error { } if _, err := c.WaitForConditionWithInterval(ctx, DefaultWaitInterval, define.HealthCheckHealthy); err != nil { + if errors.Is(err, define.ErrNoSuchCtr) { + return nil + } return err } @@ -1566,6 +1576,7 @@ func (c *Container) unpause() error { } // Internal, non-locking function to restart a container +// It requires to run on the same thread that holds the lock. func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retErr error) { if !c.ensureState(define.ContainerStateConfigured, define.ContainerStateCreated, define.ContainerStateRunning, define.ContainerStateStopped, define.ContainerStateExited) { return fmt.Errorf("unable to restart a container in a paused or unknown state: %w", define.ErrCtrStateInvalid) @@ -1613,7 +1624,10 @@ func (c *Container) restartWithTimeout(ctx context.Context, timeout uint) (retEr return err } } - return c.start(ctx) + if err := c.start(); err != nil { + return err + } + return c.waitForHealthy(ctx) } // mountStorage sets up the container's root filesystem diff --git a/libpod/oci_conmon_attach_common.go b/libpod/oci_conmon_attach_common.go index 64fc0f41be..c69cbfbf55 100644 --- a/libpod/oci_conmon_attach_common.go +++ b/libpod/oci_conmon_attach_common.go @@ -3,7 +3,6 @@ package libpod import ( - "context" "errors" "fmt" "io" @@ -32,6 +31,7 @@ const ( // Attach to the given container. // Does not check if state is appropriate. // started is only required if startContainer is true. +// It does not wait for the container to be healthy, it is the caller responsibility to do so. func (r *ConmonOCIRuntime) Attach(c *Container, params *AttachOptions) error { passthrough := c.LogDriver() == define.PassthroughLogging || c.LogDriver() == define.PassthroughTTYLogging @@ -86,7 +86,7 @@ func (r *ConmonOCIRuntime) Attach(c *Container, params *AttachOptions) error { // If starting was requested, start the container and notify when that's // done. if params.Start { - if err := c.start(context.TODO()); err != nil { + if err := c.start(); err != nil { return err } params.Started <- true diff --git a/pkg/domain/infra/abi/containers.go b/pkg/domain/infra/abi/containers.go index aa0d47236a..fb417acfa2 100644 --- a/pkg/domain/infra/abi/containers.go +++ b/pkg/domain/infra/abi/containers.go @@ -995,7 +995,10 @@ func (ic *ContainerEngine) ContainerStart(ctx context.Context, namesOrIds []stri return reports, fmt.Errorf("unable to start container %s: %w", ctr.ID(), err) } - exitCode = ic.GetContainerExitCode(ctx, ctr.Container) + exitCode, err2 := ic.ContainerWaitForExitCode(ctx, ctr.Container) + if err2 != nil { + logrus.Errorf("Waiting for container %s: %v", ctr.ID(), err2) + } reports = append(reports, &entities.ContainerStartReport{ Id: ctr.ID(), RawInput: ctr.rawInput, @@ -1189,7 +1192,7 @@ func (ic *ContainerEngine) ContainerRun(ctx context.Context, opts entities.Conta report.ExitCode = define.ExitCode(err) return &report, err } - report.ExitCode = ic.GetContainerExitCode(ctx, ctr) + report.ExitCode, _ = ic.ContainerWaitForExitCode(ctx, ctr) if opts.Rm && !ctr.ShouldRestart(ctx) { if err := removeContainer(ctr, false); err != nil { if errors.Is(err, define.ErrNoSuchCtr) || @@ -1203,14 +1206,13 @@ func (ic *ContainerEngine) ContainerRun(ctx context.Context, opts entities.Conta return &report, nil } -func (ic *ContainerEngine) GetContainerExitCode(ctx context.Context, ctr *libpod.Container) int { +func (ic *ContainerEngine) ContainerWaitForExitCode(ctx context.Context, ctr *libpod.Container) (int, error) { exitCode, err := ctr.Wait(ctx) if err != nil { - logrus.Errorf("Waiting for container %s: %v", ctr.ID(), err) intExitCode := int(define.ExecErrorCodeNotFound) - return intExitCode + return intExitCode, err } - return int(exitCode) + return int(exitCode), nil } func (ic *ContainerEngine) ContainerLogs(ctx context.Context, namesOrIds []string, options entities.ContainerLogsOptions) error { diff --git a/test/system/260-sdnotify.bats b/test/system/260-sdnotify.bats index fb7cb39f58..11629d2c7e 100644 --- a/test/system/260-sdnotify.bats +++ b/test/system/260-sdnotify.bats @@ -233,6 +233,15 @@ READY=1" "Container log after healthcheck run" is "$output" "running" "make sure container is still running" run_podman rm -f -t0 $ctr + + ctr=$(random_string) + run_podman run --name $ctr \ + --health-cmd="touch /terminate" \ + --sdnotify=healthy \ + $IMAGE sh -c 'while test \! -e /terminate; do sleep 0.1; done; echo finished' + is "$output" "finished" "make sure container exited successfully" + run_podman rm -f -t0 $ctr + _stop_socat } From 35375e0af835568258e859ce554811d8c4e7c77f Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Tue, 14 May 2024 22:49:07 +0200 Subject: [PATCH 2/2] container_api: do not wait for healtchecks if stopped do not wait for the healthcheck status to change if the container is stopped. Signed-off-by: Giuseppe Scrivano --- libpod/container_api.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/libpod/container_api.go b/libpod/container_api.go index 4168b1d7cc..52b7e145de 100644 --- a/libpod/container_api.go +++ b/libpod/container_api.go @@ -781,6 +781,14 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou } } if len(wantedHealthStates) > 0 { + // even if we are interested only in the health check + // check that the container is still running to avoid + // waiting until the timeout expires. + state, err := c.State() + if err != nil { + trySend(-1, err) + return + } status, err := c.HealthCheckStatus() if err != nil { trySend(-1, err) @@ -790,6 +798,10 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou trySend(-1, nil) return } + if state != define.ContainerStateCreated && state != define.ContainerStateRunning && state != define.ContainerStatePaused { + trySend(-1, define.ErrCtrStopped) + return + } } select { case <-ctx.Done():