diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index c941239b841..f20ffa4fb48 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -442,21 +442,14 @@ func (c *Container) includeExecFifo(cmd *exec.Cmd) error { } func (c *Container) newParentProcess(p *Process) (parentProcess, error) { - parentInitPipe, childInitPipe, err := utils.NewSockPair("init") + comm, err := newProcessComm() if err != nil { - return nil, fmt.Errorf("unable to create init pipe: %w", err) - } - messageSockPair := filePair{parentInitPipe, childInitPipe} - - parentLogPipe, childLogPipe, err := os.Pipe() - if err != nil { - return nil, fmt.Errorf("unable to create log pipe: %w", err) + return nil, err } - logFilePair := filePair{parentLogPipe, childLogPipe} - cmd := c.commandTemplate(p, childInitPipe, childLogPipe) + cmd := c.commandTemplate(p, comm) if !p.Init { - return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) + return c.newSetnsProcess(p, cmd, comm) } // We only set up fifoFd if we're not doing a `runc exec`. The historic @@ -467,10 +460,10 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { if err := c.includeExecFifo(cmd); err != nil { return nil, fmt.Errorf("unable to setup exec fifo: %w", err) } - return c.newInitProcess(p, cmd, messageSockPair, logFilePair) + return c.newInitProcess(p, cmd, comm) } -func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd { +func (c *Container) commandTemplate(p *Process, comm *processComm) *exec.Cmd { cmd := exec.Command("/proc/self/exe", "init") cmd.Args[0] = os.Args[0] cmd.Stdin = p.Stdin @@ -488,13 +481,18 @@ func (c *Container) commandTemplate(p *Process, childInitPipe *os.File, childLog "_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), ) } - cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe) + cmd.Env = append(cmd.Env, "_LIBCONTAINER_STATEDIR="+c.root) + + cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild) cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), - "_LIBCONTAINER_STATEDIR="+c.root, + ) + cmd.ExtraFiles = append(cmd.ExtraFiles, comm.syncSockChild.File()) + cmd.Env = append(cmd.Env, + "_LIBCONTAINER_SYNCPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), ) - cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe) + cmd.ExtraFiles = append(cmd.ExtraFiles, comm.logPipeChild) cmd.Env = append(cmd.Env, "_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) if p.LogLevel != "" { @@ -569,27 +567,27 @@ func (c *Container) shouldSendIdmapSources() bool { return false } -func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) error { +func (c *Container) sendMountSources(cmd *exec.Cmd, comm *processComm) error { if !c.shouldSendMountSources() { return nil } - return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool { + return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool { return m.IsBind() && !m.IsIDMapped() }) } -func (c *Container) sendIdmapSources(cmd *exec.Cmd, messageSockPair filePair) error { +func (c *Container) sendIdmapSources(cmd *exec.Cmd, comm *processComm) error { if !c.shouldSendIdmapSources() { return nil } - return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool { + return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool { return m.IsBind() && m.IsIDMapped() }) } -func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envVar string, condition func(*configs.Mount) bool) error { +func (c *Container) sendFdsSources(cmd *exec.Cmd, comm *processComm, envVar string, condition func(*configs.Mount) bool) error { // Elements on these slices will be paired with mounts (see StartInitialization() and // prepareRootfs()). These slices MUST have the same size as c.config.Mounts. fds := make([]int, len(c.config.Mounts)) @@ -600,11 +598,12 @@ func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envV continue } - // The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need - // to allocate a fd so that we know the number to pass in the environment variable. The fd - // must not be closed before cmd.Start(), so we reuse messageSockPair.child because the - // lifecycle of that fd is already taken care of. - cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child) + // The fd passed here will not be used: nsexec.c will overwrite it with + // dup3(). We just need to allocate a fd so that we know the number to + // pass in the environment variable. The fd must not be closed before + // cmd.Start(), so we reuse initSockChild because the lifecycle of that + // fd is already taken care of. + cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild) fds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1 } fdsJSON, err := json.Marshal(fds) @@ -615,7 +614,7 @@ func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envV return nil } -func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) { +func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) nsMaps := make(map[configs.NamespaceType]string) for _, ns := range c.config.Namespaces { @@ -627,17 +626,16 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l if err != nil { return nil, err } - if err := c.sendMountSources(cmd, messageSockPair); err != nil { + if err := c.sendMountSources(cmd, comm); err != nil { return nil, err } - if err := c.sendIdmapSources(cmd, messageSockPair); err != nil { + if err := c.sendIdmapSources(cmd, comm); err != nil { return nil, err } init := &initProcess{ cmd: cmd, - messageSockPair: messageSockPair, - logFilePair: logFilePair, + comm: comm, manager: c.cgroupManager, intelRdtManager: c.intelRdtManager, config: c.newInitConfig(p), @@ -649,7 +647,7 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l return init, nil } -func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) { +func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*setnsProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) state, err := c.currentState() if err != nil { @@ -666,8 +664,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, cgroupPaths: state.CgroupPaths, rootlessCgroups: c.config.RootlessCgroups, intelRdtPath: state.IntelRdtPath, - messageSockPair: messageSockPair, - logFilePair: logFilePair, + comm: comm, manager: c.cgroupManager, config: c.newInitConfig(p), process: p, diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 998b01ab7c2..60b63d5f96d 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -108,20 +108,20 @@ func Init() { // error, it means the initialization has failed. If the error is returned, // it means the error can not be communicated back to the parent. func startInitialization() (retErr error) { - // Get the INITPIPE. - envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE") - pipefd, err := strconv.Atoi(envInitPipe) + // Get the syncrhonisation pipe. + envSyncPipe := os.Getenv("_LIBCONTAINER_SYNCPIPE") + syncPipeFd, err := strconv.Atoi(envSyncPipe) if err != nil { - return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err) + return fmt.Errorf("unable to convert _LIBCONTAINER_SYNCPIPE: %w", err) } - pipe := os.NewFile(uintptr(pipefd), "pipe") - defer pipe.Close() + syncPipe := newSyncSocket(os.NewFile(uintptr(syncPipeFd), "sync")) + defer syncPipe.Close() defer func() { // If this defer is ever called, this means initialization has failed. // Send the error back to the parent process in the form of an initError. ierr := initError{Message: retErr.Error()} - if err := writeSyncArg(pipe, procError, ierr); err != nil { + if err := writeSyncArg(syncPipe, procError, ierr); err != nil { fmt.Fprintln(os.Stderr, err) return } @@ -129,6 +129,15 @@ func startInitialization() (retErr error) { retErr = nil }() + // Get the INITPIPE. + envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE") + initPipeFd, err := strconv.Atoi(envInitPipe) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err) + } + initPipe := os.NewFile(uintptr(initPipeFd), "init") + defer initPipe.Close() + // Set up logging. This is used rarely, and mostly for init debugging. // Passing log level is optional; currently libcontainer/integration does not do it. @@ -196,15 +205,16 @@ func startInitialization() (retErr error) { } }() + var config initConfig + if err := json.NewDecoder(initPipe).Decode(&config); err != nil { + return err + } + // If init succeeds, it will not return, hence none of the defers will be called. - return containerInit(it, pipe, consoleSocket, fifofd, logFD, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) + return containerInit(it, &config, syncPipe, consoleSocket, fifofd, logFD, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) } -func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error { - var config *initConfig - if err := json.NewDecoder(pipe).Decode(&config); err != nil { - return err - } +func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) error { if err := populateProcessEnvironment(config.Env); err != nil { return err } @@ -381,7 +391,7 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { // syncParentReady sends to the given pipe a JSON payload which indicates that // the init is ready to Exec the child process. It then waits for the parent to // indicate that it is cleared to Exec. -func syncParentReady(pipe *os.File) error { +func syncParentReady(pipe *syncSocket) error { // Tell parent. if err := writeSync(pipe, procReady); err != nil { return err @@ -393,7 +403,7 @@ func syncParentReady(pipe *os.File) error { // syncParentHooks sends to the given pipe a JSON payload which indicates that // the parent should execute pre-start hooks. It then waits for the parent to // indicate that it is cleared to resume. -func syncParentHooks(pipe *os.File) error { +func syncParentHooks(pipe *syncSocket) error { // Tell parent. if err := writeSync(pipe, procHooks); err != nil { return err @@ -404,7 +414,7 @@ func syncParentHooks(pipe *os.File) error { // syncParentSeccomp sends the fd associated with the seccomp file descriptor // to the parent, and wait for the parent to do pidfd_getfd() to grab a copy. -func syncParentSeccomp(pipe *os.File, seccompFd *os.File) error { +func syncParentSeccomp(pipe *syncSocket, seccompFd *os.File) error { if seccompFd == nil { return nil } diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 9ea8ad5b2bc..c1b60c49884 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -46,15 +46,54 @@ type parentProcess interface { forwardChildLogs() chan error } -type filePair struct { - parent *os.File - child *os.File +type processComm struct { + // Used to send initial configuration to "runc init" and for "runc init" to + // indicate that it is ready. + initSockParent *os.File + initSockChild *os.File + // Used for control messages between parent and "runc init". + syncSockParent *syncSocket + syncSockChild *syncSocket + // Used for log forwarding from "runc init" to the parent. + logPipeParent *os.File + logPipeChild *os.File +} + +func newProcessComm() (*processComm, error) { + var ( + comm processComm + err error + ) + comm.initSockParent, comm.initSockChild, err = utils.NewSockPair("init") + if err != nil { + return nil, fmt.Errorf("unable to create init pipe: %w", err) + } + comm.syncSockParent, comm.syncSockChild, err = newSyncSockpair("sync") + if err != nil { + return nil, fmt.Errorf("unable to create sync pipe: %w", err) + } + comm.logPipeParent, comm.logPipeChild, err = os.Pipe() + if err != nil { + return nil, fmt.Errorf("unable to create log pipe: %w", err) + } + return &comm, nil +} + +func (c *processComm) closeChild() { + _ = c.initSockChild.Close() + _ = c.syncSockChild.Close() + _ = c.logPipeChild.Close() +} + +func (c *processComm) closeParent() { + _ = c.initSockParent.Close() + _ = c.syncSockParent.Close() + // c.logPipeParent is kept alive for ForwardLogs } type setnsProcess struct { cmd *exec.Cmd - messageSockPair filePair - logFilePair filePair + comm *processComm cgroupPaths map[string]string rootlessCgroups bool manager cgroups.Manager @@ -80,18 +119,17 @@ func (p *setnsProcess) signal(sig os.Signal) error { } func (p *setnsProcess) start() (retErr error) { - defer p.messageSockPair.parent.Close() + defer p.comm.closeParent() // get the "before" value of oom kill count oom, _ := p.manager.OOMKillCount() err := p.cmd.Start() - // close the write-side of the pipes (controlled by child) - p.messageSockPair.child.Close() - p.logFilePair.child.Close() + // close the child-side of the pipes (controlled by child) + p.comm.closeChild() if err != nil { return fmt.Errorf("error starting setns process: %w", err) } - waitInit := initWaiter(p.messageSockPair.parent) + waitInit := initWaiter(p.comm.initSockParent) defer func() { if retErr != nil { if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom { @@ -110,7 +148,7 @@ func (p *setnsProcess) start() (retErr error) { }() if p.bootstrapData != nil { - if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { + if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil { return fmt.Errorf("error copying bootstrap data to pipe: %w", err) } } @@ -158,11 +196,11 @@ func (p *setnsProcess) start() (retErr error) { if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { return fmt.Errorf("error setting rlimits for process: %w", err) } - if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil { + if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil { return fmt.Errorf("error writing config to pipe: %w", err) } - ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { + ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error { switch sync.Type { case procReady: // This shouldn't happen. @@ -190,7 +228,7 @@ func (p *setnsProcess) start() (retErr error) { // wait for the seccomp notify listener to get the fd before we // permit the child to continue because the child will happily wait // for the listener if it hits SCMP_ACT_NOTIFY. - if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { + if err := writeSync(p.comm.syncSockParent, procSeccompDone); err != nil { return err } @@ -219,8 +257,8 @@ func (p *setnsProcess) start() (retErr error) { return nil }) - if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { - return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err} + if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil { + return err } // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { @@ -245,7 +283,7 @@ func (p *setnsProcess) execSetns() error { return &exec.ExitError{ProcessState: status} } var pid *pid - if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil { + if err := json.NewDecoder(p.comm.initSockParent).Decode(&pid); err != nil { _ = p.cmd.Wait() return fmt.Errorf("error reading pid from init pipe: %w", err) } @@ -299,13 +337,12 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) { } func (p *setnsProcess) forwardChildLogs() chan error { - return logs.ForwardLogs(p.logFilePair.parent) + return logs.ForwardLogs(p.comm.logPipeParent) } type initProcess struct { cmd *exec.Cmd - messageSockPair filePair - logFilePair filePair + comm *processComm config *initConfig manager cgroups.Manager intelRdtManager *intelrdt.Manager @@ -326,7 +363,7 @@ func (p *initProcess) externalDescriptors() []string { // getChildPid receives the final child's pid over the provided pipe. func (p *initProcess) getChildPid() (int, error) { var pid pid - if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil { + if err := json.NewDecoder(p.comm.initSockParent).Decode(&pid); err != nil { _ = p.cmd.Wait() return -1, err } @@ -362,18 +399,17 @@ func (p *initProcess) waitForChildExit(childPid int) error { } func (p *initProcess) start() (retErr error) { - defer p.messageSockPair.parent.Close() //nolint: errcheck + defer p.comm.closeParent() err := p.cmd.Start() p.process.ops = p - // close the write-side of the pipes (controlled by child) - _ = p.messageSockPair.child.Close() - _ = p.logFilePair.child.Close() + // close the child-side of the pipes (controlled by child) + p.comm.closeChild() if err != nil { p.process.ops = nil return fmt.Errorf("unable to start init: %w", err) } - waitInit := initWaiter(p.messageSockPair.parent) + waitInit := initWaiter(p.comm.initSockParent) defer func() { if retErr != nil { // Find out if init is killed by the kernel's OOM killer. @@ -424,7 +460,7 @@ func (p *initProcess) start() (retErr error) { return fmt.Errorf("unable to apply Intel RDT configuration: %w", err) } } - if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { + if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil { return fmt.Errorf("can't copy bootstrap data to pipe: %w", err) } err = <-waitInit @@ -457,12 +493,12 @@ func (p *initProcess) start() (retErr error) { if err := p.updateSpecState(); err != nil { return fmt.Errorf("error updating spec state: %w", err) } - if err := p.sendConfig(); err != nil { + if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil { return fmt.Errorf("error sending config to init process: %w", err) } var seenProcReady bool - ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { + ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error { switch sync.Type { case procSeccomp: if p.config.Config.Seccomp.ListenerPath == "" { @@ -484,7 +520,7 @@ func (p *initProcess) start() (retErr error) { // wait for the seccomp notify listener to get the fd before we // permit the child to continue because the child will happily wait // for the listener if it hits SCMP_ACT_NOTIFY. - if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { + if err := writeSync(p.comm.syncSockParent, procSeccompDone); err != nil { return err } @@ -537,7 +573,7 @@ func (p *initProcess) start() (retErr error) { p.container.initProcessStartTime = state.InitProcessStartTime // Sync with child. - if err := writeSync(p.messageSockPair.parent, procRun); err != nil { + if err := writeSync(p.comm.syncSockParent, procRun); err != nil { return err } case procHooks: @@ -568,7 +604,7 @@ func (p *initProcess) start() (retErr error) { } } // Sync with child. - if err := writeSync(p.messageSockPair.parent, procHooksDone); err != nil { + if err := writeSync(p.comm.syncSockParent, procHooksDone); err != nil { return err } default: @@ -577,8 +613,8 @@ func (p *initProcess) start() (retErr error) { return nil }) - if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil && ierr == nil { - return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err} + if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil { + return err } if !seenProcReady && ierr == nil { ierr = errors.New("procReady not received") @@ -620,13 +656,6 @@ func (p *initProcess) updateSpecState() error { return nil } -func (p *initProcess) sendConfig() error { - // send the config to the container's init process, we don't use JSON Encode - // here because there might be a problem in JSON decoder in some cases, see: - // https://github.com/docker/docker/issues/14203#issuecomment-174177790 - return utils.WriteJSON(p.messageSockPair.parent, p.config) -} - func (p *initProcess) createNetworkInterfaces() error { for _, config := range p.config.Config.Networks { strategy, err := getStrategy(config.Type) @@ -657,7 +686,7 @@ func (p *initProcess) setExternalDescriptors(newFds []string) { } func (p *initProcess) forwardChildLogs() chan error { - return logs.ForwardLogs(p.logFilePair.parent) + return logs.ForwardLogs(p.comm.logPipeParent) } func pidGetFd(pid, srcFd int) (*os.File, error) { diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 88ecb287c11..d76dadcd842 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -63,7 +63,7 @@ func needsSetupDev(config *configs.Config) bool { // prepareRootfs sets up the devices, mount points, and filesystems for use // inside a new mount namespace. It doesn't set anything as ro. You must call // finalizeRootfs after this function to finish setting up the rootfs. -func prepareRootfs(pipe *os.File, iConfig *initConfig, mountFds mountFds) (err error) { +func prepareRootfs(pipe *syncSocket, iConfig *initConfig, mountFds mountFds) (err error) { config := iConfig.Config if err := prepareRoot(config); err != nil { return fmt.Errorf("error preparing rootfs: %w", err) diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 40a47a2e95c..a0117acd891 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -19,7 +19,7 @@ import ( // linuxSetnsInit performs the container's initialization for running a new process // inside an existing container. type linuxSetnsInit struct { - pipe *os.File + pipe *syncSocket consoleSocket *os.File config *initConfig logFd int @@ -97,8 +97,9 @@ func (l *linuxSetnsInit) Init() error { return err } } - logrus.Debugf("setns_init: about to exec") + // Close the log pipe fd so the parent's ForwardLogs can exit. + logrus.Debugf("setns_init: about to exec") if err := unix.Close(l.logFd); err != nil { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index c64173ecfc3..75ff5e9ef18 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -20,7 +20,7 @@ import ( ) type linuxStandardInit struct { - pipe *os.File + pipe *syncSocket consoleSocket *os.File parentPid int fifoFd int @@ -230,6 +230,7 @@ func (l *linuxStandardInit) Init() error { _ = l.pipe.Close() // Close the log pipe fd so the parent's ForwardLogs can exit. + logrus.Debugf("init: about to wait on exec fifo") if err := unix.Close(l.logFd); err != nil { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } diff --git a/libcontainer/sync.go b/libcontainer/sync.go index 36e657f66d8..1894e870e53 100644 --- a/libcontainer/sync.go +++ b/libcontainer/sync.go @@ -6,8 +6,11 @@ import ( "fmt" "io" "os" + "strconv" "github.com/opencontainers/runc/libcontainer/utils" + + "github.com/sirupsen/logrus" ) type syncType string @@ -53,6 +56,20 @@ type syncT struct { File *os.File `json:"-"` // passed oob through SCM_RIGHTS } +func (s syncT) String() string { + str := "type:" + string(s.Type) + if s.Flags != 0 { + str += " flags:0b" + strconv.FormatInt(int64(s.Flags), 2) + } + if s.Arg != nil { + str += " arg:" + string(*s.Arg) + } + if s.File != nil { + str += " file:" + s.File.Name() + " (fd:" + strconv.Itoa(int(s.File.Fd())) + ")" + } + return str +} + // initError is used to wrap errors for passing them via JSON, // as encoding/json can't unmarshal into error type. type initError struct { @@ -63,43 +80,56 @@ func (i initError) Error() string { return i.Message } -func doWriteSync(pipe *os.File, sync syncT) error { +func doWriteSync(pipe *syncSocket, sync syncT) error { sync.Flags &= ^syncFlagHasFd if sync.File != nil { sync.Flags |= syncFlagHasFd } - if err := utils.WriteJSON(pipe, sync); err != nil { - return fmt.Errorf("writing sync %q: %w", sync.Type, err) + logrus.Debugf("writing sync %s", sync) + data, err := json.Marshal(sync) + if err != nil { + return fmt.Errorf("marshal sync %v: %w", sync.Type, err) + } + if _, err := pipe.WritePacket(data); err != nil { + return fmt.Errorf("writing sync %v: %w", sync.Type, err) } if sync.Flags&syncFlagHasFd != 0 { - if err := utils.SendFile(pipe, sync.File); err != nil { + logrus.Debugf("writing sync file %s", sync) + if err := utils.SendFile(pipe.File(), sync.File); err != nil { return fmt.Errorf("sending file after sync %q: %w", sync.Type, err) } } return nil } -func writeSync(pipe *os.File, sync syncType) error { +func writeSync(pipe *syncSocket, sync syncType) error { return doWriteSync(pipe, syncT{Type: sync}) } -func writeSyncArg(pipe *os.File, sync syncType, arg interface{}) error { +func writeSyncArg(pipe *syncSocket, sync syncType, arg interface{}) error { argJSON, err := json.Marshal(arg) if err != nil { - return fmt.Errorf("writing sync %q: marshal argument failed: %w", sync, err) + return fmt.Errorf("writing sync %v: marshal argument failed: %w", sync, err) } argJSONMsg := json.RawMessage(argJSON) return doWriteSync(pipe, syncT{Type: sync, Arg: &argJSONMsg}) } -func doReadSync(pipe *os.File) (syncT, error) { +func doReadSync(pipe *syncSocket) (syncT, error) { var sync syncT - if err := json.NewDecoder(pipe).Decode(&sync); err != nil { + logrus.Debugf("reading sync") + packet, err := pipe.ReadPacket() + if err != nil { if errors.Is(err, io.EOF) { + logrus.Debugf("sync pipe closed") return sync, err } return sync, fmt.Errorf("reading from parent failed: %w", err) } + if err := json.Unmarshal(packet, &sync); err != nil { + return sync, fmt.Errorf("unmarshal sync from parent failed: %w", err) + } + logrus.Debugf("read sync %s", sync) if sync.Type == procError { var ierr initError if sync.Arg == nil { @@ -111,16 +141,17 @@ func doReadSync(pipe *os.File) (syncT, error) { return sync, &ierr } if sync.Flags&syncFlagHasFd != 0 { - file, err := utils.RecvFile(pipe) + logrus.Debugf("reading sync file %s", sync) + file, err := utils.RecvFile(pipe.File()) if err != nil { - return sync, fmt.Errorf("receiving fd from sync %q failed: %w", sync.Type, err) + return sync, fmt.Errorf("receiving fd from sync %v failed: %w", sync.Type, err) } sync.File = file } return sync, nil } -func readSyncFull(pipe *os.File, expected syncType) (syncT, error) { +func readSyncFull(pipe *syncSocket, expected syncType) (syncT, error) { sync, err := doReadSync(pipe) if err != nil { return sync, err @@ -131,24 +162,24 @@ func readSyncFull(pipe *os.File, expected syncType) (syncT, error) { return sync, nil } -func readSync(pipe *os.File, expected syncType) error { +func readSync(pipe *syncSocket, expected syncType) error { sync, err := readSyncFull(pipe, expected) if err != nil { return err } if sync.Arg != nil { - return fmt.Errorf("sync %q had unexpected argument passed: %q", expected, string(*sync.Arg)) + return fmt.Errorf("sync %v had unexpected argument passed: %q", expected, string(*sync.Arg)) } if sync.File != nil { _ = sync.File.Close() - return fmt.Errorf("sync %q had unexpected file passed", sync.Type) + return fmt.Errorf("sync %v had unexpected file passed", sync.Type) } return nil } // parseSync runs the given callback function on each syncT received from the // child. It will return once io.EOF is returned from the given pipe. -func parseSync(pipe *os.File, fn func(*syncT) error) error { +func parseSync(pipe *syncSocket, fn func(*syncT) error) error { for { sync, err := doReadSync(pipe) if err != nil { diff --git a/libcontainer/sync_unix.go b/libcontainer/sync_unix.go new file mode 100644 index 00000000000..f94486cb300 --- /dev/null +++ b/libcontainer/sync_unix.go @@ -0,0 +1,76 @@ +package libcontainer + +import ( + "fmt" + "io" + "os" + + "golang.org/x/sys/unix" +) + +// syncSocket is a wrapper around a SOCK_SEQPACKET socket, providing +// packet-oriented methods. This is needed because SOCK_SEQPACKET does not +// allow for partial reads, but the Go stdlib treats it as a streamable source, +// which ends up making things like json.Decoder hang forever if the packet is +// bigger than the internal read buffer. +type syncSocket struct { + f *os.File +} + +func newSyncSocket(f *os.File) *syncSocket { + return &syncSocket{f: f} +} + +func (s *syncSocket) File() *os.File { + return s.f +} + +func (s *syncSocket) Close() error { + return s.f.Close() +} + +func (s *syncSocket) WritePacket(b []byte) (int, error) { + return s.f.Write(b) +} + +func (s *syncSocket) ReadPacket() ([]byte, error) { + size, _, err := unix.Recvfrom(int(s.f.Fd()), nil, unix.MSG_TRUNC|unix.MSG_PEEK) + if err != nil { + return nil, fmt.Errorf("fetch packet length from socket: %w", err) + } + // We will only get a zero size if the socket has been closed from the + // other end (otherwise recvfrom(2) will block until a packet is ready). In + // addition, SOCK_SEQPACKET is treated as a stream source by Go stdlib so + // returning io.EOF here is correct from that perspective too. + if size == 0 { + return nil, io.EOF + } + buf := make([]byte, size) + n, err := s.f.Read(buf) + if err != nil { + return nil, err + } + if n != size { + return nil, fmt.Errorf("packet read too short: expected %d byte packet but only %d bytes read", size, n) + } + return buf, nil +} + +func (s *syncSocket) Shutdown(how int) error { + if err := unix.Shutdown(int(s.f.Fd()), how); err != nil { + return &os.PathError{Op: "shutdown", Path: s.f.Name() + " (sync pipe)", Err: err} + } + return nil +} + +// newSyncSockpair returns a new SOCK_SEQPACKET unix socket pair to be used for +// runc-init synchronisation. +func newSyncSockpair(name string) (parent, child *syncSocket, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + parentFile := os.NewFile(uintptr(fds[1]), name+"-p") + childFile := os.NewFile(uintptr(fds[0]), name+"-c") + return newSyncSocket(parentFile), newSyncSocket(childFile), nil +} diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index dbd43534165..74d9d20c7f1 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -43,6 +43,9 @@ func ExitStatus(status unix.WaitStatus) int { } // WriteJSON writes the provided struct v to w using standard json marshaling +// without a trailing newline. This is used instead of json.Encoder because +// there might be a problem in json decoder in some cases, see: +// https://github.com/docker/docker/issues/14203#issuecomment-174177790 func WriteJSON(w io.Writer, v interface{}) error { data, err := json.Marshal(v) if err != nil { diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index ca520b63b36..e5f11523d1d 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -90,7 +90,7 @@ func CloseExecFrom(minFd int) error { return nil } -// NewSockPair returns a new unix socket pair +// NewSockPair returns a new SOCK_STREAM unix socket pair. func NewSockPair(name string) (parent, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil {