Skip to content

Commit

Permalink
sync: split init config (stream) and synchronisation (seqpacket) pipes
Browse files Browse the repository at this point in the history
We have different requirements for the initial configuration and
initWaiter pipe (just send netlink and JSON blobs with no complicated
handling needed for message coalescing) and the packet-based
synchronisation pipe.

Tests with switching everything to SOCK_SEQPACKET lead to endless issues
with runc hanging on start-up because random things would try to do
short reads (which SOCK_SEQPACKET will not allow and the Go stdlib
explicitly treats as a streaming source), so splitting it was the only
reasonable solution. Even doing somewhat dodgy tricks such as adding a
Read() wrapper which actually calls ReadPacket() and makes it seem like
a stream source doesn't work -- and is a bit too magical.

One upside is that doing it this way makes the difference between the
modes clearer -- INITPIPE is still used for initWaiter syncrhonisation
but aside from that all other synchronisation is done by SYNCPIPE.

Signed-off-by: Aleksa Sarai <[email protected]>
  • Loading branch information
cyphar authored and lifubang committed Sep 24, 2023
1 parent ccc7671 commit 8da42aa
Show file tree
Hide file tree
Showing 10 changed files with 258 additions and 111 deletions.
61 changes: 29 additions & 32 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -488,17 +488,10 @@ func isDmzBinarySafe(c *configs.Config) bool {
}

func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
comm, err := newProcessComm()
if err != nil {
return nil, fmt.Errorf("unable to create init pipe: %w", err)
}
messageSockPair := filePair{parentInitPipe, childInitPipe}

parentLogPipe, childLogPipe, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("unable to create log pipe: %w", err)
return nil, err
}
logFilePair := filePair{parentLogPipe, childLogPipe}

// Make sure we use a new safe copy of /proc/self/exe or the runc-dmz
// binary each time this is called, to make sure that if a container
Expand Down Expand Up @@ -569,10 +562,15 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
"_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
}
cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
cmd.Env = append(cmd.Env, "_LIBCONTAINER_STATEDIR="+c.root)

cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
"_LIBCONTAINER_STATEDIR="+c.root,
)
cmd.ExtraFiles = append(cmd.ExtraFiles, comm.syncSockChild.File())
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_SYNCPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)

if dmzExe != nil {
Expand All @@ -581,7 +579,7 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
"_LIBCONTAINER_DMZEXEFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
}

cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
cmd.ExtraFiles = append(cmd.ExtraFiles, comm.logPipeChild)
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
if p.LogLevel != "" {
Expand Down Expand Up @@ -617,9 +615,9 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
if err := c.includeExecFifo(cmd); err != nil {
return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
}
return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
return c.newInitProcess(p, cmd, comm)
}
return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
return c.newSetnsProcess(p, cmd, comm)
}

// shouldSendMountSources says whether the child process must setup bind mounts with
Expand Down Expand Up @@ -681,27 +679,27 @@ func (c *Container) shouldSendIdmapSources() bool {
return false
}

func (c *Container) sendMountSources(cmd *exec.Cmd, messageSockPair filePair) error {
func (c *Container) sendMountSources(cmd *exec.Cmd, comm *processComm) error {
if !c.shouldSendMountSources() {
return nil
}

return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool {
return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool {
return m.IsBind() && !m.IsIDMapped()
})
}

func (c *Container) sendIdmapSources(cmd *exec.Cmd, messageSockPair filePair) error {
func (c *Container) sendIdmapSources(cmd *exec.Cmd, comm *processComm) error {
if !c.shouldSendIdmapSources() {
return nil
}

return c.sendFdsSources(cmd, messageSockPair, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool {
return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool {
return m.IsBind() && m.IsIDMapped()
})
}

func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envVar string, condition func(*configs.Mount) bool) error {
func (c *Container) sendFdsSources(cmd *exec.Cmd, comm *processComm, envVar string, condition func(*configs.Mount) bool) error {
// Elements on these slices will be paired with mounts (see StartInitialization() and
// prepareRootfs()). These slices MUST have the same size as c.config.Mounts.
fds := make([]int, len(c.config.Mounts))
Expand All @@ -712,11 +710,12 @@ func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envV
continue
}

// The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need
// to allocate a fd so that we know the number to pass in the environment variable. The fd
// must not be closed before cmd.Start(), so we reuse messageSockPair.child because the
// lifecycle of that fd is already taken care of.
cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child)
// The fd passed here will not be used: nsexec.c will overwrite it with
// dup3(). We just need to allocate a fd so that we know the number to
// pass in the environment variable. The fd must not be closed before
// cmd.Start(), so we reuse initSockChild because the lifecycle of that
// fd is already taken care of.
cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild)
fds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
}
fdsJSON, err := json.Marshal(fds)
Expand All @@ -727,7 +726,7 @@ func (c *Container) sendFdsSources(cmd *exec.Cmd, messageSockPair filePair, envV
return nil
}

func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
Expand All @@ -739,17 +738,16 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
if err != nil {
return nil, err
}
if err := c.sendMountSources(cmd, messageSockPair); err != nil {
if err := c.sendMountSources(cmd, comm); err != nil {
return nil, err
}
if err := c.sendIdmapSources(cmd, messageSockPair); err != nil {
if err := c.sendIdmapSources(cmd, comm); err != nil {
return nil, err
}

init := &initProcess{
cmd: cmd,
messageSockPair: messageSockPair,
logFilePair: logFilePair,
comm: comm,
manager: c.cgroupManager,
intelRdtManager: c.intelRdtManager,
config: c.newInitConfig(p),
Expand All @@ -761,7 +759,7 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
return init, nil
}

func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) {
func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
state, err := c.currentState()
if err != nil {
Expand All @@ -778,8 +776,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair,
cgroupPaths: state.CgroupPaths,
rootlessCgroups: c.config.RootlessCgroups,
intelRdtPath: state.IntelRdtPath,
messageSockPair: messageSockPair,
logFilePair: logFilePair,
comm: comm,
manager: c.cgroupManager,
config: c.newInitConfig(p),
process: p,
Expand Down
42 changes: 26 additions & 16 deletions libcontainer/init_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,27 +108,36 @@ func Init() {
// error, it means the initialization has failed. If the error is returned,
// it means the error can not be communicated back to the parent.
func startInitialization() (retErr error) {
// Get the INITPIPE.
envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
pipefd, err := strconv.Atoi(envInitPipe)
// Get the syncrhonisation pipe.
envSyncPipe := os.Getenv("_LIBCONTAINER_SYNCPIPE")
syncPipeFd, err := strconv.Atoi(envSyncPipe)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err)
return fmt.Errorf("unable to convert _LIBCONTAINER_SYNCPIPE: %w", err)
}
pipe := os.NewFile(uintptr(pipefd), "pipe")
defer pipe.Close()
syncPipe := newSyncSocket(os.NewFile(uintptr(syncPipeFd), "sync"))
defer syncPipe.Close()

defer func() {
// If this defer is ever called, this means initialization has failed.
// Send the error back to the parent process in the form of an initError.
ierr := initError{Message: retErr.Error()}
if err := writeSyncArg(pipe, procError, ierr); err != nil {
if err := writeSyncArg(syncPipe, procError, ierr); err != nil {
fmt.Fprintln(os.Stderr, err)
return
}
// The error is sent, no need to also return it (or it will be reported twice).
retErr = nil
}()

// Get the INITPIPE.
envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
initPipeFd, err := strconv.Atoi(envInitPipe)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err)
}
initPipe := os.NewFile(uintptr(initPipeFd), "init")
defer initPipe.Close()

// Set up logging. This is used rarely, and mostly for init debugging.

// Passing log level is optional; currently libcontainer/integration does not do it.
Expand Down Expand Up @@ -207,15 +216,16 @@ func startInitialization() (retErr error) {
}
}()

var config initConfig
if err := json.NewDecoder(initPipe).Decode(&config); err != nil {
return err
}

// If init succeeds, it will not return, hence none of the defers will be called.
return containerInit(it, pipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
return containerInit(it, &config, syncPipe, consoleSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
}

func containerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return err
}
func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
if err := populateProcessEnvironment(config.Env); err != nil {
return err
}
Expand Down Expand Up @@ -395,7 +405,7 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error {
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe *os.File) error {
func syncParentReady(pipe *syncSocket) error {
// Tell parent.
if err := writeSync(pipe, procReady); err != nil {
return err
Expand All @@ -407,7 +417,7 @@ func syncParentReady(pipe *os.File) error {
// syncParentHooks sends to the given pipe a JSON payload which indicates that
// the parent should execute pre-start hooks. It then waits for the parent to
// indicate that it is cleared to resume.
func syncParentHooks(pipe *os.File) error {
func syncParentHooks(pipe *syncSocket) error {
// Tell parent.
if err := writeSync(pipe, procHooks); err != nil {
return err
Expand All @@ -418,7 +428,7 @@ func syncParentHooks(pipe *os.File) error {

// syncParentSeccomp sends the fd associated with the seccomp file descriptor
// to the parent, and wait for the parent to do pidfd_getfd() to grab a copy.
func syncParentSeccomp(pipe *os.File, seccompFd *os.File) error {
func syncParentSeccomp(pipe *syncSocket, seccompFd *os.File) error {
if seccompFd == nil {
return nil
}
Expand Down
Loading

0 comments on commit 8da42aa

Please sign in to comment.