From 3350cd3eed1fb405255f0103ff60913b418c475b Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Fri, 5 Jul 2024 16:20:28 +0200 Subject: [PATCH 1/2] pkg/rootless: simplify reexec for container code The code currently tried to avoid joining the userns from conmon directly and rather joined to only read the pid file and then send this back to use so we could join the userns. From the comment this was done because we could not read the pid file. However this is no longer true as of commit 49eb5af301 and file is no always owned by the real user. This means we can just remove this special logic and join the namespace directly there. A test has been added to check the rejoin logic with a custom uidmapping. Signed-off-by: Paul Holzinger --- pkg/domain/infra/abi/system_linux.go | 2 +- pkg/rootless/rootless.go | 4 +- pkg/rootless/rootless_freebsd.go | 6 +- pkg/rootless/rootless_linux.c | 50 +--------------- pkg/rootless/rootless_linux.go | 85 +++++----------------------- pkg/rootless/rootless_unsupported.go | 6 +- test/system/550-pause-process.bats | 5 +- 7 files changed, 24 insertions(+), 134 deletions(-) diff --git a/pkg/domain/infra/abi/system_linux.go b/pkg/domain/infra/abi/system_linux.go index fe36ad88a7..db51f4d4d0 100644 --- a/pkg/domain/infra/abi/system_linux.go +++ b/pkg/domain/infra/abi/system_linux.go @@ -88,7 +88,7 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool, } if len(paths) > 0 { - became, ret, err = rootless.TryJoinFromFilePaths(pausePidPath, true, paths) + became, ret, err = rootless.TryJoinFromFilePaths(pausePidPath, paths) } else { became, ret, err = rootless.BecomeRootInUserNS(pausePidPath) if err == nil { diff --git a/pkg/rootless/rootless.go b/pkg/rootless/rootless.go index 3d654b1f35..fd906e96d7 100644 --- a/pkg/rootless/rootless.go +++ b/pkg/rootless/rootless.go @@ -24,7 +24,7 @@ func TryJoinPauseProcess(pausePidPath string) (bool, int, error) { return false, -1, err } - became, ret, err := TryJoinFromFilePaths("", false, []string{pausePidPath}) + became, ret, err := TryJoinFromFilePaths("", []string{pausePidPath}) if err == nil { return became, ret, nil } @@ -45,7 +45,7 @@ func TryJoinPauseProcess(pausePidPath string) (bool, int, error) { }() // Now the pause PID file is locked. Try to join once again in case it changed while it was not locked. - became, ret, err = TryJoinFromFilePaths("", false, []string{pausePidPath}) + became, ret, err = TryJoinFromFilePaths("", []string{pausePidPath}) if err != nil { // It is still failing. We can safely remove it. os.Remove(pausePidPath) diff --git a/pkg/rootless/rootless_freebsd.go b/pkg/rootless/rootless_freebsd.go index 28c1a5e15d..3d88d05225 100644 --- a/pkg/rootless/rootless_freebsd.go +++ b/pkg/rootless/rootless_freebsd.go @@ -38,11 +38,7 @@ func GetRootlessGID() int { // This is useful when there are already running containers and we // don't have a pause process yet. We can use the paths to the conmon // processes to attempt joining their namespaces. -// If needNewNamespace is set, the file is read from a temporary user -// namespace, this is useful for containers that are running with a -// different uidmap and the unprivileged user has no way to read the -// file owned by the root in the container. -func TryJoinFromFilePaths(pausePidPath string, needNewNamespace bool, paths []string) (bool, int, error) { +func TryJoinFromFilePaths(pausePidPath string, paths []string) (bool, int, error) { return false, -1, errors.New("this function is not supported on this os") } diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c index 6e4702946b..ddb97b3346 100644 --- a/pkg/rootless/rootless_linux.c +++ b/pkg/rootless/rootless_linux.c @@ -947,49 +947,8 @@ check_proc_sys_userns_file (const char *path) } } -static int -copy_file_to_fd (const char *file_to_read, int outfd) -{ - char buf[512]; - cleanup_close int fd = -1; - - fd = open (file_to_read, O_RDONLY); - if (fd < 0) - { - fprintf (stderr, "open `%s`: %m\n", file_to_read); - return fd; - } - - for (;;) - { - ssize_t r, w, t = 0; - - r = TEMP_FAILURE_RETRY (read (fd, buf, sizeof buf)); - if (r < 0) - { - fprintf (stderr, "read from `%s`: %m\n", file_to_read); - return r; - } - - if (r == 0) - break; - - while (t < r) - { - w = TEMP_FAILURE_RETRY (write (outfd, &buf[t], r - t)); - if (w < 0) - { - fprintf (stderr, "write file to output fd `%s`: %m\n", file_to_read); - return w; - } - t += w; - } - } - return 0; -} - int -reexec_in_user_namespace (int ready, char *pause_pid_file_path, char *file_to_read, int outputfd) +reexec_in_user_namespace (int ready, char *pause_pid_file_path) { cleanup_free char **argv = NULL; cleanup_free char *argv0 = NULL; @@ -1138,13 +1097,6 @@ reexec_in_user_namespace (int ready, char *pause_pid_file_path, char *file_to_re _exit (EXIT_FAILURE); } - if (file_to_read && file_to_read[0]) - { - ret = copy_file_to_fd (file_to_read, outputfd); - close (outputfd); - _exit (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE); - } - execvp ("/proc/self/exe", argv); fprintf (stderr, "failed to reexec: %m\n"); diff --git a/pkg/rootless/rootless_linux.go b/pkg/rootless/rootless_linux.go index 61133ababa..3ad3906241 100644 --- a/pkg/rootless/rootless_linux.go +++ b/pkg/rootless/rootless_linux.go @@ -32,7 +32,7 @@ import ( #include extern uid_t rootless_uid(); extern uid_t rootless_gid(); -extern int reexec_in_user_namespace(int ready, char *pause_pid_file_path, char *file_to_read, int fd); +extern int reexec_in_user_namespace(int ready, char *pause_pid_file_path); extern int reexec_in_user_namespace_wait(int pid, int options); extern int reexec_userns_join(int pid, char *pause_pid_file_path); extern int is_fd_inherited(int fd); @@ -213,7 +213,7 @@ func copyMappings(from, to string) error { return os.WriteFile(to, content, 0600) } -func becomeRootInUserNS(pausePid, fileToRead string, fileOutput *os.File) (_ bool, _ int, retErr error) { +func becomeRootInUserNS(pausePid string) (_ bool, _ int, retErr error) { hasCapSysAdmin, err := unshare.HasCapSysAdmin() if err != nil { return false, 0, err @@ -249,13 +249,6 @@ func becomeRootInUserNS(pausePid, fileToRead string, fileOutput *os.File) (_ boo cPausePid := C.CString(pausePid) defer C.free(unsafe.Pointer(cPausePid)) - cFileToRead := C.CString(fileToRead) - defer C.free(unsafe.Pointer(cFileToRead)) - var fileOutputFD C.int - if fileOutput != nil { - fileOutputFD = C.int(fileOutput.Fd()) - } - runtime.LockOSThread() defer runtime.UnlockOSThread() @@ -287,7 +280,7 @@ func becomeRootInUserNS(pausePid, fileToRead string, fileOutput *os.File) (_ boo } }() - pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cPausePid, cFileToRead, fileOutputFD) + pidC := C.reexec_in_user_namespace(C.int(r.Fd()), cPausePid) pid = int(pidC) if pid < 0 { return false, -1, fmt.Errorf("cannot re-exec process") @@ -361,14 +354,6 @@ func becomeRootInUserNS(pausePid, fileToRead string, fileOutput *os.File) (_ boo return false, -1, fmt.Errorf("read from sync pipe: %w", err) } - if fileOutput != nil { - ret := C.reexec_in_user_namespace_wait(pidC, 0) - if ret < 0 { - return false, -1, errors.New("waiting for the re-exec process") - } - return true, 0, nil - } - if b[0] == '2' { // We have lost the race for writing the PID file, as probably another // process created a namespace and wrote the PID. @@ -434,69 +419,27 @@ func waitAndProxySignalsToChild(pid C.int) (bool, int, error) { // If podman was re-executed the caller needs to propagate the error code returned by the child // process. func BecomeRootInUserNS(pausePid string) (bool, int, error) { - return becomeRootInUserNS(pausePid, "", nil) + return becomeRootInUserNS(pausePid) } // TryJoinFromFilePaths attempts to join the namespaces of the pid files in paths. // This is useful when there are already running containers and we // don't have a pause process yet. We can use the paths to the conmon // processes to attempt joining their namespaces. -// If needNewNamespace is set, the file is read from a temporary user -// namespace, this is useful for containers that are running with a -// different uidmap and the unprivileged user has no way to read the -// file owned by the root in the container. -func TryJoinFromFilePaths(pausePidPath string, needNewNamespace bool, paths []string) (bool, int, error) { +func TryJoinFromFilePaths(pausePidPath string, paths []string) (bool, int, error) { var lastErr error - var pausePid int for _, path := range paths { - if !needNewNamespace { - data, err := os.ReadFile(path) - if err != nil { - lastErr = err - continue - } - - pausePid, err = strconv.Atoi(string(data)) - if err != nil { - lastErr = fmt.Errorf("cannot parse file %q: %w", path, err) - continue - } - } else { - r, w, err := os.Pipe() - if err != nil { - lastErr = err - continue - } - - defer errorhandling.CloseQuiet(r) - - if _, _, err := becomeRootInUserNS("", path, w); err != nil { - w.Close() - lastErr = err - continue - } - - if err := w.Close(); err != nil { - return false, 0, err - } - defer func() { - C.reexec_in_user_namespace_wait(-1, 0) - }() - - b := make([]byte, 32) - - n, err := r.Read(b) - if err != nil { - lastErr = fmt.Errorf("cannot read %q: %w", path, err) - continue - } + data, err := os.ReadFile(path) + if err != nil { + lastErr = err + continue + } - pausePid, err = strconv.Atoi(string(b[:n])) - if err != nil { - lastErr = err - continue - } + pausePid, err := strconv.Atoi(string(data)) + if err != nil { + lastErr = fmt.Errorf("cannot parse file %q: %w", path, err) + continue } if pausePid > 0 && unix.Kill(pausePid, 0) == nil { diff --git a/pkg/rootless/rootless_unsupported.go b/pkg/rootless/rootless_unsupported.go index 0d587644fd..73282fb2a6 100644 --- a/pkg/rootless/rootless_unsupported.go +++ b/pkg/rootless/rootless_unsupported.go @@ -41,11 +41,7 @@ func GetRootlessGID() int { // This is useful when there are already running containers and we // don't have a pause process yet. We can use the paths to the conmon // processes to attempt joining their namespaces. -// If needNewNamespace is set, the file is read from a temporary user -// namespace, this is useful for containers that are running with a -// different uidmap and the unprivileged user has no way to read the -// file owned by the root in the container. -func TryJoinFromFilePaths(pausePidPath string, needNewNamespace bool, paths []string) (bool, int, error) { +func TryJoinFromFilePaths(pausePidPath string, paths []string) (bool, int, error) { return false, -1, errors.New("this function is not supported on this os") } diff --git a/test/system/550-pause-process.bats b/test/system/550-pause-process.bats index eee770f27a..cd38609306 100644 --- a/test/system/550-pause-process.bats +++ b/test/system/550-pause-process.bats @@ -119,7 +119,7 @@ function _check_pause_process() { # First let's run a container in the background to keep the userns active local cname1=c1_$(random_string) - run_podman run -d --name $cname1 $IMAGE top + run_podman run -d --name $cname1 --uidmap 0:100:100 $IMAGE top run_podman unshare readlink /proc/self/ns/user userns="$output" @@ -136,6 +136,9 @@ function _check_pause_process() { _test_sigproxy $cname2 $kidpid + # check pause process again + _check_pause_process + # our container exits 0 so podman should too wait $kidpid || die "podman run exited $? instead of zero" From a2c83cb0fd401cfe4e584395307dd951c76af6ee Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Mon, 8 Jul 2024 13:32:23 +0200 Subject: [PATCH 2/2] SetupRootless(): only reexec when needed We should never try to reexxec when we are already root with CAP_SYS_ADMIN. The code contained a bug when --cgroups=disabled is used as it tried to perfom a reexec even when it was not needed. Fixes: 900e29549a ("libpod: do not move podman with --cgroups=disabled") Signed-off-by: Paul Holzinger --- pkg/domain/infra/abi/system_linux.go | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pkg/domain/infra/abi/system_linux.go b/pkg/domain/infra/abi/system_linux.go index db51f4d4d0..6dd99554cd 100644 --- a/pkg/domain/infra/abi/system_linux.go +++ b/pkg/domain/infra/abi/system_linux.go @@ -30,15 +30,16 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool, } } - configureCgroup := cgroupMode != "disabled" - if configureCgroup { + hasCapSysAdmin, err := unshare.HasCapSysAdmin() + if err != nil { + return err + } + + // check for both euid == 0 and CAP_SYS_ADMIN because we may be running in a container with CAP_SYS_ADMIN set. + if os.Geteuid() == 0 && hasCapSysAdmin { // do it only after podman has already re-execed and running with uid==0. - hasCapSysAdmin, err := unshare.HasCapSysAdmin() - if err != nil { - return err - } - // check for both euid == 0 and CAP_SYS_ADMIN because we may be running in a container with CAP_SYS_ADMIN set. - if os.Geteuid() == 0 && hasCapSysAdmin { + configureCgroup := cgroupMode != "disabled" + if configureCgroup { ownsCgroup, err := cgroups.UserOwnsCurrentSystemdCgroup() if err != nil { logrus.Infof("Failed to detect the owner for the current cgroup: %v", err) @@ -55,8 +56,8 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool, } } } - return nil } + return nil } pausePidPath, err := util.GetRootlessPauseProcessPidPath()