From ccc76713a7614d4386fcc1108147bb768106d70a Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 17 Aug 2023 15:13:33 +1000 Subject: [PATCH 1/2] sync: rename procResume -> procHooksDone The old name was quite confusing, and with the addition of the procMountPlease sync message there are now multiple sync messages that are related to "resuming" runc-init. Signed-off-by: Aleksa Sarai --- libcontainer/init_linux.go | 2 +- libcontainer/process_linux.go | 2 +- libcontainer/sync.go | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index a24be276878..71956d6dabd 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -413,7 +413,7 @@ func syncParentHooks(pipe *os.File) error { return err } // Wait for parent to give the all-clear. - return readSync(pipe, procResume) + return readSync(pipe, procHooksDone) } // syncParentSeccomp sends the fd associated with the seccomp file descriptor diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 8785d65700f..9ea8ad5b2bc 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -568,7 +568,7 @@ func (p *initProcess) start() (retErr error) { } } // Sync with child. - if err := writeSync(p.messageSockPair.parent, procResume); err != nil { + if err := writeSync(p.messageSockPair.parent, procHooksDone); err != nil { return err } default: diff --git a/libcontainer/sync.go b/libcontainer/sync.go index 25507e58ad9..36e657f66d8 100644 --- a/libcontainer/sync.go +++ b/libcontainer/sync.go @@ -23,7 +23,7 @@ type syncType string // --- no return synchronisation // // procHooks --> [run hooks] -// <-- procResume +// <-- procHooksDone // // procReady --> [final setup] // <-- procRun @@ -35,7 +35,7 @@ const ( procReady syncType = "procReady" procRun syncType = "procRun" procHooks syncType = "procHooks" - procResume syncType = "procResume" + procHooksDone syncType = "procHooksDone" procSeccomp syncType = "procSeccomp" procSeccompDone syncType = "procSeccompDone" ) From 8da42aaec275f2ff93c4f7a2f3b08e7ad48a08a3 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 17 Aug 2023 21:20:33 +1000 Subject: [PATCH 2/2] sync: split init config (stream) and synchronisation (seqpacket) pipes We have different requirements for the initial configuration and initWaiter pipe (just send netlink and JSON blobs with no complicated handling needed for message coalescing) and the packet-based synchronisation pipe. Tests with switching everything to SOCK_SEQPACKET lead to endless issues with runc hanging on start-up because random things would try to do short reads (which SOCK_SEQPACKET will not allow and the Go stdlib explicitly treats as a streaming source), so splitting it was the only reasonable solution. Even doing somewhat dodgy tricks such as adding a Read() wrapper which actually calls ReadPacket() and makes it seem like a stream source doesn't work -- and is a bit too magical. One upside is that doing it this way makes the difference between the modes clearer -- INITPIPE is still used for initWaiter syncrhonisation but aside from that all other synchronisation is done by SYNCPIPE. Signed-off-by: Aleksa Sarai --- libcontainer/container_linux.go | 61 +++++++-------- libcontainer/init_linux.go | 42 +++++++---- libcontainer/process_linux.go | 113 +++++++++++++++++----------- libcontainer/rootfs_linux.go | 2 +- libcontainer/setns_init_linux.go | 4 +- libcontainer/standard_init_linux.go | 3 +- libcontainer/sync.go | 63 ++++++++++++---- libcontainer/sync_unix.go | 76 +++++++++++++++++++ libcontainer/utils/utils.go | 3 + libcontainer/utils/utils_unix.go | 2 +- 10 files changed, 258 insertions(+), 111 deletions(-) create mode 100644 libcontainer/sync_unix.go diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index ae5d4fb46b4..618128e2ab6 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -488,17 +488,10 @@ func isDmzBinarySafe(c *configs.Config) bool { } func (c *Container) newParentProcess(p *Process) (parentProcess, error) { - parentInitPipe, childInitPipe, err := utils.NewSockPair("init") + comm, err := newProcessComm() if err != nil { - return nil, fmt.Errorf("unable to create init pipe: %w", err) - } - messageSockPair := filePair{parentInitPipe, childInitPipe} - - parentLogPipe, childLogPipe, err := os.Pipe() - if err != nil { - return nil, fmt.Errorf("unable to create log pipe: %w", err) + return nil, err } - logFilePair := filePair{parentLogPipe, childLogPipe} // Make sure we use a new safe copy of /proc/self/exe or the runc-dmz // binary each time this is called, to make sure that if a container @@ -569,10 +562,15 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { "_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), ) } - cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe) + cmd.Env = append(cmd.Env, "_LIBCONTAINER_STATEDIR="+c.root) + + cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild) cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), - "_LIBCONTAINER_STATEDIR="+c.root, + ) + cmd.ExtraFiles = append(cmd.ExtraFiles, comm.syncSockChild.File()) + cmd.Env = append(cmd.Env, + "_LIBCONTAINER_SYNCPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), ) if dmzExe != nil { @@ -581,7 +579,7 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { "_LIBCONTAINER_DMZEXEFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) } - cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe) + cmd.ExtraFiles = append(cmd.ExtraFiles, comm.logPipeChild) cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) if p.LogLevel != "" { @@ -617,9 +615,9 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { if err := c.includeExecFifo(cmd); err != nil { return nil, fmt.Errorf("unable to setup exec fifo: %w", err) } - return c.newInitProcess(p, cmd, messageSockPair, logFilePair) + return c.newInitProcess(p, cmd, comm) } - return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) + return c.newSetnsProcess(p, cmd, comm) } // shouldSendMountSources says whether the child process must setup bind mounts with @@ -681,27 +679,27 @@ func (c *Container) shouldSendIdmapSources() bool { return false } -func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) error { +func (c *Container) sendMountSources(cmd *exec.Cmd, comm *processComm) error { if !c.shouldSendMountSources() { return nil } - return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool { + return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool { return m.IsBind() && !m.IsIDMapped() }) } -func (c *Container) sendIdmapSources(cmd *exec.Cmd, messageSockPair filePair) error { +func (c *Container) sendIdmapSources(cmd *exec.Cmd, comm *processComm) error { if !c.shouldSendIdmapSources() { return nil } - return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool { + return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool { return m.IsBind() && m.IsIDMapped() }) } -func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envVar string, condition func(*configs.Mount) bool) error { +func (c *Container) sendFdsSources(cmd *exec.Cmd, comm *processComm, envVar string, condition func(*configs.Mount) bool) error { // Elements on these slices will be paired with mounts (see StartInitialization() and // prepareRootfs()). These slices MUST have the same size as c.config.Mounts. fds := make([]int, len(c.config.Mounts)) @@ -712,11 +710,12 @@ func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envV continue } - // The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need - // to allocate a fd so that we know the number to pass in the environment variable. The fd - // must not be closed before cmd.Start(), so we reuse messageSockPair.child because the - // lifecycle of that fd is already taken care of. - cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child) + // The fd passed here will not be used: nsexec.c will overwrite it with + // dup3(). We just need to allocate a fd so that we know the number to + // pass in the environment variable. The fd must not be closed before + // cmd.Start(), so we reuse initSockChild because the lifecycle of that + // fd is already taken care of. + cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild) fds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1 } fdsJSON, err := json.Marshal(fds) @@ -727,7 +726,7 @@ func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envV return nil } -func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) { +func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) nsMaps := make(map[configs.NamespaceType]string) for _, ns := range c.config.Namespaces { @@ -739,17 +738,16 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l if err != nil { return nil, err } - if err := c.sendMountSources(cmd, messageSockPair); err != nil { + if err := c.sendMountSources(cmd, comm); err != nil { return nil, err } - if err := c.sendIdmapSources(cmd, messageSockPair); err != nil { + if err := c.sendIdmapSources(cmd, comm); err != nil { return nil, err } init := &initProcess{ cmd: cmd, - messageSockPair: messageSockPair, - logFilePair: logFilePair, + comm: comm, manager: c.cgroupManager, intelRdtManager: c.intelRdtManager, config: c.newInitConfig(p), @@ -761,7 +759,7 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l return init, nil } -func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) { +func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*setnsProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) state, err := c.currentState() if err != nil { @@ -778,8 +776,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, cgroupPaths: state.CgroupPaths, rootlessCgroups: c.config.RootlessCgroups, intelRdtPath: state.IntelRdtPath, - messageSockPair: messageSockPair, - logFilePair: logFilePair, + comm: comm, manager: c.cgroupManager, config: c.newInitConfig(p), process: p, diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 71956d6dabd..edb112f64bd 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -108,20 +108,20 @@ func Init() { // error, it means the initialization has failed. If the error is returned, // it means the error can not be communicated back to the parent. func startInitialization() (retErr error) { - // Get the INITPIPE. - envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE") - pipefd, err := strconv.Atoi(envInitPipe) + // Get the syncrhonisation pipe. + envSyncPipe := os.Getenv("_LIBCONTAINER_SYNCPIPE") + syncPipeFd, err := strconv.Atoi(envSyncPipe) if err != nil { - return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err) + return fmt.Errorf("unable to convert _LIBCONTAINER_SYNCPIPE: %w", err) } - pipe := os.NewFile(uintptr(pipefd), "pipe") - defer pipe.Close() + syncPipe := newSyncSocket(os.NewFile(uintptr(syncPipeFd), "sync")) + defer syncPipe.Close() defer func() { // If this defer is ever called, this means initialization has failed. // Send the error back to the parent process in the form of an initError. ierr := initError{Message: retErr.Error()} - if err := writeSyncArg(pipe, procError, ierr); err != nil { + if err := writeSyncArg(syncPipe, procError, ierr); err != nil { fmt.Fprintln(os.Stderr, err) return } @@ -129,6 +129,15 @@ func startInitialization() (retErr error) { retErr = nil }() + // Get the INITPIPE. + envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE") + initPipeFd, err := strconv.Atoi(envInitPipe) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err) + } + initPipe := os.NewFile(uintptr(initPipeFd), "init") + defer initPipe.Close() + // Set up logging. This is used rarely, and mostly for init debugging. // Passing log level is optional; currently libcontainer/integration does not do it. @@ -207,15 +216,16 @@ func startInitialization() (retErr error) { } }() + var config initConfig + if err := json.NewDecoder(initPipe).Decode(&config); err != nil { + return err + } + // If init succeeds, it will not return, hence none of the defers will be called. - return containerInit(it, pipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) + return containerInit(it, &config, syncPipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) } -func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error { - var config *initConfig - if err := json.NewDecoder(pipe).Decode(&config); err != nil { - return err - } +func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error { if err := populateProcessEnvironment(config.Env); err != nil { return err } @@ -395,7 +405,7 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { // syncParentReady sends to the given pipe a JSON payload which indicates that // the init is ready to Exec the child process. It then waits for the parent to // indicate that it is cleared to Exec. -func syncParentReady(pipe *os.File) error { +func syncParentReady(pipe *syncSocket) error { // Tell parent. if err := writeSync(pipe, procReady); err != nil { return err @@ -407,7 +417,7 @@ func syncParentReady(pipe *os.File) error { // syncParentHooks sends to the given pipe a JSON payload which indicates that // the parent should execute pre-start hooks. It then waits for the parent to // indicate that it is cleared to resume. -func syncParentHooks(pipe *os.File) error { +func syncParentHooks(pipe *syncSocket) error { // Tell parent. if err := writeSync(pipe, procHooks); err != nil { return err @@ -418,7 +428,7 @@ func syncParentHooks(pipe *os.File) error { // syncParentSeccomp sends the fd associated with the seccomp file descriptor // to the parent, and wait for the parent to do pidfd_getfd() to grab a copy. -func syncParentSeccomp(pipe *os.File, seccompFd *os.File) error { +func syncParentSeccomp(pipe *syncSocket, seccompFd *os.File) error { if seccompFd == nil { return nil } diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 9ea8ad5b2bc..c1b60c49884 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -46,15 +46,54 @@ type parentProcess interface { forwardChildLogs() chan error } -type filePair struct { - parent *os.File - child *os.File +type processComm struct { + // Used to send initial configuration to "runc init" and for "runc init" to + // indicate that it is ready. + initSockParent *os.File + initSockChild *os.File + // Used for control messages between parent and "runc init". + syncSockParent *syncSocket + syncSockChild *syncSocket + // Used for log forwarding from "runc init" to the parent. + logPipeParent *os.File + logPipeChild *os.File +} + +func newProcessComm() (*processComm, error) { + var ( + comm processComm + err error + ) + comm.initSockParent, comm.initSockChild, err = utils.NewSockPair("init") + if err != nil { + return nil, fmt.Errorf("unable to create init pipe: %w", err) + } + comm.syncSockParent, comm.syncSockChild, err = newSyncSockpair("sync") + if err != nil { + return nil, fmt.Errorf("unable to create sync pipe: %w", err) + } + comm.logPipeParent, comm.logPipeChild, err = os.Pipe() + if err != nil { + return nil, fmt.Errorf("unable to create log pipe: %w", err) + } + return &comm, nil +} + +func (c *processComm) closeChild() { + _ = c.initSockChild.Close() + _ = c.syncSockChild.Close() + _ = c.logPipeChild.Close() +} + +func (c *processComm) closeParent() { + _ = c.initSockParent.Close() + _ = c.syncSockParent.Close() + // c.logPipeParent is kept alive for ForwardLogs } type setnsProcess struct { cmd *exec.Cmd - messageSockPair filePair - logFilePair filePair + comm *processComm cgroupPaths map[string]string rootlessCgroups bool manager cgroups.Manager @@ -80,18 +119,17 @@ func (p *setnsProcess) signal(sig os.Signal) error { } func (p *setnsProcess) start() (retErr error) { - defer p.messageSockPair.parent.Close() + defer p.comm.closeParent() // get the "before" value of oom kill count oom, _ := p.manager.OOMKillCount() err := p.cmd.Start() - // close the write-side of the pipes (controlled by child) - p.messageSockPair.child.Close() - p.logFilePair.child.Close() + // close the child-side of the pipes (controlled by child) + p.comm.closeChild() if err != nil { return fmt.Errorf("error starting setns process: %w", err) } - waitInit := initWaiter(p.messageSockPair.parent) + waitInit := initWaiter(p.comm.initSockParent) defer func() { if retErr != nil { if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom { @@ -110,7 +148,7 @@ func (p *setnsProcess) start() (retErr error) { }() if p.bootstrapData != nil { - if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { + if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil { return fmt.Errorf("error copying bootstrap data to pipe: %w", err) } } @@ -158,11 +196,11 @@ func (p *setnsProcess) start() (retErr error) { if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { return fmt.Errorf("error setting rlimits for process: %w", err) } - if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil { + if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil { return fmt.Errorf("error writing config to pipe: %w", err) } - ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { + ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error { switch sync.Type { case procReady: // This shouldn't happen. @@ -190,7 +228,7 @@ func (p *setnsProcess) start() (retErr error) { // wait for the seccomp notify listener to get the fd before we // permit the child to continue because the child will happily wait // for the listener if it hits SCMP_ACT_NOTIFY. - if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { + if err := writeSync(p.comm.syncSockParent, procSeccompDone); err != nil { return err } @@ -219,8 +257,8 @@ func (p *setnsProcess) start() (retErr error) { return nil }) - if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { - return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err} + if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil { + return err } // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { @@ -245,7 +283,7 @@ func (p *setnsProcess) execSetns() error { return &exec.ExitError{ProcessState: status} } var pid *pid - if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil { + if err := json.NewDecoder(p.comm.initSockParent).Decode(&pid); err != nil { _ = p.cmd.Wait() return fmt.Errorf("error reading pid from init pipe: %w", err) } @@ -299,13 +337,12 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) { } func (p *setnsProcess) forwardChildLogs() chan error { - return logs.ForwardLogs(p.logFilePair.parent) + return logs.ForwardLogs(p.comm.logPipeParent) } type initProcess struct { cmd *exec.Cmd - messageSockPair filePair - logFilePair filePair + comm *processComm config *initConfig manager cgroups.Manager intelRdtManager *intelrdt.Manager @@ -326,7 +363,7 @@ func (p *initProcess) externalDescriptors() []string { // getChildPid receives the final child's pid over the provided pipe. func (p *initProcess) getChildPid() (int, error) { var pid pid - if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil { + if err := json.NewDecoder(p.comm.initSockParent).Decode(&pid); err != nil { _ = p.cmd.Wait() return -1, err } @@ -362,18 +399,17 @@ func (p *initProcess) waitForChildExit(childPid int) error { } func (p *initProcess) start() (retErr error) { - defer p.messageSockPair.parent.Close() //nolint: errcheck + defer p.comm.closeParent() err := p.cmd.Start() p.process.ops = p - // close the write-side of the pipes (controlled by child) - _ = p.messageSockPair.child.Close() - _ = p.logFilePair.child.Close() + // close the child-side of the pipes (controlled by child) + p.comm.closeChild() if err != nil { p.process.ops = nil return fmt.Errorf("unable to start init: %w", err) } - waitInit := initWaiter(p.messageSockPair.parent) + waitInit := initWaiter(p.comm.initSockParent) defer func() { if retErr != nil { // Find out if init is killed by the kernel's OOM killer. @@ -424,7 +460,7 @@ func (p *initProcess) start() (retErr error) { return fmt.Errorf("unable to apply Intel RDT configuration: %w", err) } } - if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { + if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil { return fmt.Errorf("can't copy bootstrap data to pipe: %w", err) } err = <-waitInit @@ -457,12 +493,12 @@ func (p *initProcess) start() (retErr error) { if err := p.updateSpecState(); err != nil { return fmt.Errorf("error updating spec state: %w", err) } - if err := p.sendConfig(); err != nil { + if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil { return fmt.Errorf("error sending config to init process: %w", err) } var seenProcReady bool - ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { + ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error { switch sync.Type { case procSeccomp: if p.config.Config.Seccomp.ListenerPath == "" { @@ -484,7 +520,7 @@ func (p *initProcess) start() (retErr error) { // wait for the seccomp notify listener to get the fd before we // permit the child to continue because the child will happily wait // for the listener if it hits SCMP_ACT_NOTIFY. - if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { + if err := writeSync(p.comm.syncSockParent, procSeccompDone); err != nil { return err } @@ -537,7 +573,7 @@ func (p *initProcess) start() (retErr error) { p.container.initProcessStartTime = state.InitProcessStartTime // Sync with child. - if err := writeSync(p.messageSockPair.parent, procRun); err != nil { + if err := writeSync(p.comm.syncSockParent, procRun); err != nil { return err } case procHooks: @@ -568,7 +604,7 @@ func (p *initProcess) start() (retErr error) { } } // Sync with child. - if err := writeSync(p.messageSockPair.parent, procHooksDone); err != nil { + if err := writeSync(p.comm.syncSockParent, procHooksDone); err != nil { return err } default: @@ -577,8 +613,8 @@ func (p *initProcess) start() (retErr error) { return nil }) - if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil && ierr == nil { - return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err} + if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil { + return err } if !seenProcReady && ierr == nil { ierr = errors.New("procReady not received") @@ -620,13 +656,6 @@ func (p *initProcess) updateSpecState() error { return nil } -func (p *initProcess) sendConfig() error { - // send the config to the container's init process, we don't use JSON Encode - // here because there might be a problem in JSON decoder in some cases, see: - // https://github.com/docker/docker/issues/14203#issuecomment-174177790 - return utils.WriteJSON(p.messageSockPair.parent, p.config) -} - func (p *initProcess) createNetworkInterfaces() error { for _, config := range p.config.Config.Networks { strategy, err := getStrategy(config.Type) @@ -657,7 +686,7 @@ func (p *initProcess) setExternalDescriptors(newFds []string) { } func (p *initProcess) forwardChildLogs() chan error { - return logs.ForwardLogs(p.logFilePair.parent) + return logs.ForwardLogs(p.comm.logPipeParent) } func pidGetFd(pid, srcFd int) (*os.File, error) { diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 88ecb287c11..d76dadcd842 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -63,7 +63,7 @@ func needsSetupDev(config *configs.Config) bool { // prepareRootfs sets up the devices, mount points, and filesystems for use // inside a new mount namespace. It doesn't set anything as ro. You must call // finalizeRootfs after this function to finish setting up the rootfs. -func prepareRootfs(pipe *os.File, iConfig *initConfig, mountFds mountFds) (err error) { +func prepareRootfs(pipe *syncSocket, iConfig *initConfig, mountFds mountFds) (err error) { config := iConfig.Config if err := prepareRoot(config); err != nil { return fmt.Errorf("error preparing rootfs: %w", err) diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 7709219300b..82bcfec2aa4 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -20,7 +20,7 @@ import ( // linuxSetnsInit performs the container's initialization for running a new process // inside an existing container. type linuxSetnsInit struct { - pipe *os.File + pipe *syncSocket consoleSocket *os.File config *initConfig logFd int @@ -111,9 +111,9 @@ func (l *linuxSetnsInit) Init() error { return err } } - logrus.Debugf("setns_init: about to exec") // Close the log pipe fd so the parent's ForwardLogs can exit. + logrus.Debugf("setns_init: about to exec") if err := unix.Close(l.logFd); err != nil { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 4eb3d8db435..86750700c60 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -20,7 +20,7 @@ import ( ) type linuxStandardInit struct { - pipe *os.File + pipe *syncSocket consoleSocket *os.File parentPid int fifoFd int @@ -231,6 +231,7 @@ func (l *linuxStandardInit) Init() error { _ = l.pipe.Close() // Close the log pipe fd so the parent's ForwardLogs can exit. + logrus.Debugf("init: about to wait on exec fifo") if err := unix.Close(l.logFd); err != nil { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } diff --git a/libcontainer/sync.go b/libcontainer/sync.go index 36e657f66d8..1894e870e53 100644 --- a/libcontainer/sync.go +++ b/libcontainer/sync.go @@ -6,8 +6,11 @@ import ( "fmt" "io" "os" + "strconv" "github.com/opencontainers/runc/libcontainer/utils" + + "github.com/sirupsen/logrus" ) type syncType string @@ -53,6 +56,20 @@ type syncT struct { File *os.File `json:"-"` // passed oob through SCM_RIGHTS } +func (s syncT) String() string { + str := "type:" + string(s.Type) + if s.Flags != 0 { + str += " flags:0b" + strconv.FormatInt(int64(s.Flags), 2) + } + if s.Arg != nil { + str += " arg:" + string(*s.Arg) + } + if s.File != nil { + str += " file:" + s.File.Name() + " (fd:" + strconv.Itoa(int(s.File.Fd())) + ")" + } + return str +} + // initError is used to wrap errors for passing them via JSON, // as encoding/json can't unmarshal into error type. type initError struct { @@ -63,43 +80,56 @@ func (i initError) Error() string { return i.Message } -func doWriteSync(pipe *os.File, sync syncT) error { +func doWriteSync(pipe *syncSocket, sync syncT) error { sync.Flags &= ^syncFlagHasFd if sync.File != nil { sync.Flags |= syncFlagHasFd } - if err := utils.WriteJSON(pipe, sync); err != nil { - return fmt.Errorf("writing sync %q: %w", sync.Type, err) + logrus.Debugf("writing sync %s", sync) + data, err := json.Marshal(sync) + if err != nil { + return fmt.Errorf("marshal sync %v: %w", sync.Type, err) + } + if _, err := pipe.WritePacket(data); err != nil { + return fmt.Errorf("writing sync %v: %w", sync.Type, err) } if sync.Flags&syncFlagHasFd != 0 { - if err := utils.SendFile(pipe, sync.File); err != nil { + logrus.Debugf("writing sync file %s", sync) + if err := utils.SendFile(pipe.File(), sync.File); err != nil { return fmt.Errorf("sending file after sync %q: %w", sync.Type, err) } } return nil } -func writeSync(pipe *os.File, sync syncType) error { +func writeSync(pipe *syncSocket, sync syncType) error { return doWriteSync(pipe, syncT{Type: sync}) } -func writeSyncArg(pipe *os.File, sync syncType, arg interface{}) error { +func writeSyncArg(pipe *syncSocket, sync syncType, arg interface{}) error { argJSON, err := json.Marshal(arg) if err != nil { - return fmt.Errorf("writing sync %q: marshal argument failed: %w", sync, err) + return fmt.Errorf("writing sync %v: marshal argument failed: %w", sync, err) } argJSONMsg := json.RawMessage(argJSON) return doWriteSync(pipe, syncT{Type: sync, Arg: &argJSONMsg}) } -func doReadSync(pipe *os.File) (syncT, error) { +func doReadSync(pipe *syncSocket) (syncT, error) { var sync syncT - if err := json.NewDecoder(pipe).Decode(&sync); err != nil { + logrus.Debugf("reading sync") + packet, err := pipe.ReadPacket() + if err != nil { if errors.Is(err, io.EOF) { + logrus.Debugf("sync pipe closed") return sync, err } return sync, fmt.Errorf("reading from parent failed: %w", err) } + if err := json.Unmarshal(packet, &sync); err != nil { + return sync, fmt.Errorf("unmarshal sync from parent failed: %w", err) + } + logrus.Debugf("read sync %s", sync) if sync.Type == procError { var ierr initError if sync.Arg == nil { @@ -111,16 +141,17 @@ func doReadSync(pipe *os.File) (syncT, error) { return sync, &ierr } if sync.Flags&syncFlagHasFd != 0 { - file, err := utils.RecvFile(pipe) + logrus.Debugf("reading sync file %s", sync) + file, err := utils.RecvFile(pipe.File()) if err != nil { - return sync, fmt.Errorf("receiving fd from sync %q failed: %w", sync.Type, err) + return sync, fmt.Errorf("receiving fd from sync %v failed: %w", sync.Type, err) } sync.File = file } return sync, nil } -func readSyncFull(pipe *os.File, expected syncType) (syncT, error) { +func readSyncFull(pipe *syncSocket, expected syncType) (syncT, error) { sync, err := doReadSync(pipe) if err != nil { return sync, err @@ -131,24 +162,24 @@ func readSyncFull(pipe *os.File, expected syncType) (syncT, error) { return sync, nil } -func readSync(pipe *os.File, expected syncType) error { +func readSync(pipe *syncSocket, expected syncType) error { sync, err := readSyncFull(pipe, expected) if err != nil { return err } if sync.Arg != nil { - return fmt.Errorf("sync %q had unexpected argument passed: %q", expected, string(*sync.Arg)) + return fmt.Errorf("sync %v had unexpected argument passed: %q", expected, string(*sync.Arg)) } if sync.File != nil { _ = sync.File.Close() - return fmt.Errorf("sync %q had unexpected file passed", sync.Type) + return fmt.Errorf("sync %v had unexpected file passed", sync.Type) } return nil } // parseSync runs the given callback function on each syncT received from the // child. It will return once io.EOF is returned from the given pipe. -func parseSync(pipe *os.File, fn func(*syncT) error) error { +func parseSync(pipe *syncSocket, fn func(*syncT) error) error { for { sync, err := doReadSync(pipe) if err != nil { diff --git a/libcontainer/sync_unix.go b/libcontainer/sync_unix.go new file mode 100644 index 00000000000..f94486cb300 --- /dev/null +++ b/libcontainer/sync_unix.go @@ -0,0 +1,76 @@ +package libcontainer + +import ( + "fmt" + "io" + "os" + + "golang.org/x/sys/unix" +) + +// syncSocket is a wrapper around a SOCK_SEQPACKET socket, providing +// packet-oriented methods. This is needed because SOCK_SEQPACKET does not +// allow for partial reads, but the Go stdlib treats it as a streamable source, +// which ends up making things like json.Decoder hang forever if the packet is +// bigger than the internal read buffer. +type syncSocket struct { + f *os.File +} + +func newSyncSocket(f *os.File) *syncSocket { + return &syncSocket{f: f} +} + +func (s *syncSocket) File() *os.File { + return s.f +} + +func (s *syncSocket) Close() error { + return s.f.Close() +} + +func (s *syncSocket) WritePacket(b []byte) (int, error) { + return s.f.Write(b) +} + +func (s *syncSocket) ReadPacket() ([]byte, error) { + size, _, err := unix.Recvfrom(int(s.f.Fd()), nil, unix.MSG_TRUNC|unix.MSG_PEEK) + if err != nil { + return nil, fmt.Errorf("fetch packet length from socket: %w", err) + } + // We will only get a zero size if the socket has been closed from the + // other end (otherwise recvfrom(2) will block until a packet is ready). In + // addition, SOCK_SEQPACKET is treated as a stream source by Go stdlib so + // returning io.EOF here is correct from that perspective too. + if size == 0 { + return nil, io.EOF + } + buf := make([]byte, size) + n, err := s.f.Read(buf) + if err != nil { + return nil, err + } + if n != size { + return nil, fmt.Errorf("packet read too short: expected %d byte packet but only %d bytes read", size, n) + } + return buf, nil +} + +func (s *syncSocket) Shutdown(how int) error { + if err := unix.Shutdown(int(s.f.Fd()), how); err != nil { + return &os.PathError{Op: "shutdown", Path: s.f.Name() + " (sync pipe)", Err: err} + } + return nil +} + +// newSyncSockpair returns a new SOCK_SEQPACKET unix socket pair to be used for +// runc-init synchronisation. +func newSyncSockpair(name string) (parent, child *syncSocket, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + parentFile := os.NewFile(uintptr(fds[1]), name+"-p") + childFile := os.NewFile(uintptr(fds[0]), name+"-c") + return newSyncSocket(parentFile), newSyncSocket(childFile), nil +} diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index dbd43534165..74d9d20c7f1 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -43,6 +43,9 @@ func ExitStatus(status unix.WaitStatus) int { } // WriteJSON writes the provided struct v to w using standard json marshaling +// without a trailing newline. This is used instead of json.Encoder because +// there might be a problem in json decoder in some cases, see: +// https://github.com/docker/docker/issues/14203#issuecomment-174177790 func WriteJSON(w io.Writer, v interface{}) error { data, err := json.Marshal(v) if err != nil { diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index ca520b63b36..e5f11523d1d 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -90,7 +90,7 @@ func CloseExecFrom(minFd int) error { return nil } -// NewSockPair returns a new unix socket pair +// NewSockPair returns a new SOCK_STREAM unix socket pair. func NewSockPair(name string) (parent, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil {