Skip to content

Commit

Permalink
Merge pull request containerd#10607 from fuweid/pin-userns
Browse files Browse the repository at this point in the history
internal/cri: simplify netns setup with pinned userns
  • Loading branch information
AkihiroSuda authored Sep 19, 2024
2 parents 67b0687 + ee0ed75 commit 8c64a2f
Show file tree
Hide file tree
Showing 16 changed files with 685 additions and 153 deletions.
17 changes: 17 additions & 0 deletions internal/cri/opts/spec_opts.go
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,23 @@ func WithoutNamespace(t runtimespec.LinuxNamespaceType) oci.SpecOpts {
}
}

// WithNamespacePath returns a spec option that points the already-declared
// namespace of type t at nsPath.
//
// The option fails if the spec has no Linux section, or if no namespace of
// type t is present in the spec's namespace list. Only the first matching
// entry is updated.
func WithNamespacePath(t runtimespec.LinuxNamespaceType, nsPath string) oci.SpecOpts {
	return func(_ context.Context, _ oci.Client, _ *containers.Container, s *runtimespec.Spec) error {
		linux := s.Linux
		if linux == nil {
			return fmt.Errorf("Linux spec is required")
		}

		for idx := range linux.Namespaces {
			entry := &linux.Namespaces[idx]
			if entry.Type != t {
				continue
			}
			// Update the entry in place so the change is visible in the spec.
			entry.Path = nsPath
			return nil
		}
		return fmt.Errorf("no such namespace %s", t)
	}
}

// WithPodNamespaces sets the pod namespaces for the container
func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts {
namespaces := config.GetNamespaceOptions()
Expand Down
45 changes: 45 additions & 0 deletions internal/cri/server/podsandbox/helpers_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"github.com/containerd/containerd/v2/core/snapshots"
"github.com/containerd/containerd/v2/internal/cri/seutil"
"github.com/containerd/containerd/v2/pkg/seccomp"
"github.com/containerd/containerd/v2/pkg/sys"
)

const (
Expand Down Expand Up @@ -88,6 +89,50 @@ func (c *Controller) getSandboxDevShm(id string) string {
return filepath.Join(c.getVolatileSandboxRootDir(id), "shm")
}

// getSandboxPinnedNamespaces returns the "pinned-namespaces" directory located
// under the volatile sandbox root directory of the given sandbox id.
func (c *Controller) getSandboxPinnedNamespaces(id string) string {
	volatileRoot := c.getVolatileSandboxRootDir(id)
	return filepath.Join(volatileRoot, "pinned-namespaces")
}

// getSandboxPinnedUserNamespace returns the path of the "user" file inside the
// sandbox's pinned-namespaces directory.
func (c *Controller) getSandboxPinnedUserNamespace(id string) string {
	pinnedDir := c.getSandboxPinnedNamespaces(id)
	return filepath.Join(pinnedDir, "user")
}

// pinUserNamespace pins the user namespace associated with the network
// namespace at netnsPath by bind mounting it onto the sandbox's pinned user
// namespace file.
//
// Steps:
//  1. ensure the pinned-namespaces directory exists;
//  2. create an empty placeholder file as the mount target (O_EXCL, so an
//     already existing file is an error);
//  3. open the netns and ask sys.GetUsernsForNamespace for its user namespace;
//  4. bind mount that user namespace onto the placeholder.
//
// If any step after creating the placeholder fails, the placeholder is removed
// again (best effort) so that a later call for the same sandbox is not
// rejected by O_EXCL because of a stale, unmounted file.
func (c *Controller) pinUserNamespace(sandboxID string, netnsPath string) (retErr error) {
	nsPath := c.getSandboxPinnedUserNamespace(sandboxID)

	baseDir := filepath.Dir(nsPath)
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return fmt.Errorf("failed to init pinned-namespaces directory %s: %w", baseDir, err)
	}

	emptyFd, err := os.OpenFile(nsPath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666)
	if err != nil {
		return fmt.Errorf("failed to create empty file %s: %w", nsPath, err)
	}
	// Only the existence of the file matters: it is used solely as the bind
	// mount target below, so a close error is not actionable here.
	_ = emptyFd.Close()

	defer func() {
		if retErr != nil {
			// Nothing has been mounted on nsPath when we get here with an
			// error, so removing the placeholder is safe. Best effort only;
			// the original error is what the caller needs to see.
			_ = os.Remove(nsPath)
		}
	}()

	netnsFd, err := os.Open(netnsPath)
	if err != nil {
		return fmt.Errorf("failed to open netns(%s): %w", netnsPath, err)
	}
	defer netnsFd.Close()

	usernsFd, err := sys.GetUsernsForNamespace(netnsFd.Fd())
	if err != nil {
		return fmt.Errorf("failed to get user namespace for netns(%s): %w", netnsPath, err)
	}
	defer usernsFd.Close()

	if err = unix.Mount(usernsFd.Name(), nsPath, "none", unix.MS_BIND, ""); err != nil {
		return fmt.Errorf("failed to bind mount ns src: %v at %s: %w", usernsFd.Name(), nsPath, err)
	}
	return nil
}

func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) {
var labels []string

Expand Down
64 changes: 33 additions & 31 deletions internal/cri/server/podsandbox/sandbox_run.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,39 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll

labels["oci_runtime_type"] = ociRuntime.Type

// Create sandbox container root directories.
sandboxRootDir := c.getSandboxRootDir(id)
if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil {
return cin, fmt.Errorf("failed to create sandbox root directory %q: %w",
sandboxRootDir, err)
}
defer func() {
if retErr != nil && cleanupErr == nil {
// Cleanup the sandbox root directory.
if cleanupErr = c.os.RemoveAll(sandboxRootDir); cleanupErr != nil {
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove sandbox root directory %q",
sandboxRootDir)
}
}
}()

volatileSandboxRootDir := c.getVolatileSandboxRootDir(id)
if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil {
return cin, fmt.Errorf("failed to create volatile sandbox root directory %q: %w",
volatileSandboxRootDir, err)
}
defer func() {
if retErr != nil && cleanupErr == nil {
deferCtx, deferCancel := ctrdutil.DeferContext()
defer deferCancel()
// Cleanup the volatile sandbox root directory.
if cleanupErr = ensureRemoveAll(deferCtx, volatileSandboxRootDir); cleanupErr != nil {
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove volatile sandbox root directory %q",
volatileSandboxRootDir)
}
}
}()

// Create sandbox container.
// NOTE: sandboxContainerSpec SHOULD NOT have side
// effect, e.g. accessing/creating files, so that we can test
Expand Down Expand Up @@ -164,37 +197,6 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll
}
}()

// Create sandbox container root directories.
sandboxRootDir := c.getSandboxRootDir(id)
if err := c.os.MkdirAll(sandboxRootDir, 0755); err != nil {
return cin, fmt.Errorf("failed to create sandbox root directory %q: %w",
sandboxRootDir, err)
}
defer func() {
if retErr != nil && cleanupErr == nil {
// Cleanup the sandbox root directory.
if cleanupErr = c.os.RemoveAll(sandboxRootDir); cleanupErr != nil {
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove sandbox root directory %q",
sandboxRootDir)
}
}
}()

volatileSandboxRootDir := c.getVolatileSandboxRootDir(id)
if err := c.os.MkdirAll(volatileSandboxRootDir, 0755); err != nil {
return cin, fmt.Errorf("failed to create volatile sandbox root directory %q: %w",
volatileSandboxRootDir, err)
}
defer func() {
if retErr != nil && cleanupErr == nil {
// Cleanup the volatile sandbox root directory.
if cleanupErr = c.os.RemoveAll(volatileSandboxRootDir); cleanupErr != nil {
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove volatile sandbox root directory %q",
volatileSandboxRootDir)
}
}
}()

// Setup files required for the sandbox.
if err = c.setupSandboxFiles(id, config); err != nil {
return cin, fmt.Errorf("failed to setup sandbox files: %w", err)
Expand Down
5 changes: 5 additions & 0 deletions internal/cri/server/podsandbox/sandbox_run_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC
case runtime.NamespaceMode_POD:
specOpts = append(specOpts, oci.WithUserNamespace(uids, gids))
usernsEnabled = true

if err := c.pinUserNamespace(id, nsPath); err != nil {
return nil, fmt.Errorf("failed to pin user namespace: %w", err)
}
specOpts = append(specOpts, customopts.WithNamespacePath(runtimespec.UserNamespace, c.getSandboxPinnedUserNamespace(id)))
default:
return nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
}
Expand Down
66 changes: 48 additions & 18 deletions internal/cri/server/podsandbox/sandbox_run_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
package podsandbox

import (
"context"
"os"
"path/filepath"
"strconv"
"syscall"
"testing"

"github.com/moby/sys/userns"
Expand All @@ -32,11 +34,15 @@ import (
v1 "k8s.io/cri-api/pkg/apis/runtime/v1"

"github.com/containerd/containerd/v2/internal/cri/annotations"
criconfig "github.com/containerd/containerd/v2/internal/cri/config"
"github.com/containerd/containerd/v2/internal/cri/opts"
"github.com/containerd/containerd/v2/pkg/netns"
ostesting "github.com/containerd/containerd/v2/pkg/os/testing"
"github.com/containerd/containerd/v2/pkg/sys"
"github.com/containerd/containerd/v2/pkg/testutil"
)

func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) {
func getRunPodSandboxTestData(criCfg criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) {
config := &runtime.PodSandboxConfig{
Metadata: &runtime.PodSandboxMetadata{
Name: "test-name",
Expand Down Expand Up @@ -94,7 +100,7 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf
}

assert.Contains(t, spec.Mounts, runtimespec.Mount{
Source: "/test/root/sandboxes/test-id/resolv.conf",
Source: filepath.Join(criCfg.RootDir, "sandboxes/test-id/resolv.conf"),
Destination: resolvConfPath,
Type: "bind",
Options: []string{"rbind", "ro", "nosuid", "nodev", "noexec"},
Expand All @@ -105,8 +111,10 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf
}

func TestLinuxSandboxContainerSpec(t *testing.T) {
testutil.RequiresRoot(t)

testID := "test-id"
nsPath := "test-cni"

idMap := runtime.IDMapping{
HostId: 1000,
ContainerId: 1000,
Expand All @@ -118,15 +126,30 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
Size: 10,
}

netnsBasedir := t.TempDir()
t.Cleanup(func() {
assert.NoError(t, unmountRecursive(context.Background(), netnsBasedir))
})

var netNs *netns.NetNS
uerr := sys.UnshareAfterEnterUserns("1000:1000:10", "1000:1000:10", syscall.CLONE_NEWNET, func(pid int) error {
var err error
netNs, err = netns.NewNetNSFromPID(netnsBasedir, uint32(pid))
return err
})
require.NoError(t, uerr)

nsPath := netNs.GetPath()

for _, test := range []struct {
desc string
configChange func(*runtime.PodSandboxConfig)
specCheck func(*testing.T, *runtimespec.Spec)
specCheck func(*testing.T, *Controller, *runtimespec.Spec)
expectErr bool
}{
{
desc: "spec should reflect original config",
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
// runtime spec should have expected namespaces enabled by default.
require.NotNil(t, spec.Linux)
assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{
Expand Down Expand Up @@ -162,10 +185,11 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
},
}
},
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
specCheck: func(t *testing.T, c *Controller, spec *runtimespec.Spec) {
require.NotNil(t, spec.Linux)
assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{
Type: runtimespec.UserNamespace,
Path: filepath.Join(c.config.StateDir, "sandboxes", testID, "pinned-namespaces", "user"),
})
assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
},
Expand All @@ -181,7 +205,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
},
}
},
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
// runtime spec should disable expected namespaces in host mode.
require.NotNil(t, spec.Linux)
assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{
Expand Down Expand Up @@ -213,10 +237,11 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
},
}
},
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
specCheck: func(t *testing.T, c *Controller, spec *runtimespec.Spec) {
require.NotNil(t, spec.Linux)
assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{
Type: runtimespec.UserNamespace,
Path: filepath.Join(c.config.StateDir, "sandboxes", testID, "pinned-namespaces", "user"),
})
require.Equal(t, spec.Linux.UIDMappings, []runtimespec.LinuxIDMapping{expIDMap})
require.Equal(t, spec.Linux.GIDMappings, []runtimespec.LinuxIDMapping{expIDMap})
Expand Down Expand Up @@ -314,7 +339,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
SupplementalGroups: []int64{1111, 2222},
}
},
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
require.NotNil(t, spec.Process)
assert.Contains(t, spec.Process.User.AdditionalGids, uint32(1111))
assert.Contains(t, spec.Process.User.AdditionalGids, uint32(2222))
Expand All @@ -328,7 +353,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
"net.ipv4.ping_group_range": "1 1000",
}
},
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
require.NotNil(t, spec.Process)
assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "500")
assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "1 1000")
Expand All @@ -344,7 +369,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
MemoryLimitInBytes: 1024,
}
},
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
value, ok := spec.Annotations[annotations.SandboxCPUPeriod]
assert.True(t, ok)
assert.EqualValues(t, strconv.FormatInt(100, 10), value)
Expand All @@ -365,7 +390,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
},
{
desc: "sandbox sizing annotations should not be set if LinuxContainerResources were not provided",
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
_, ok := spec.Annotations[annotations.SandboxCPUPeriod]
assert.False(t, ok)
_, ok = spec.Annotations[annotations.SandboxCPUQuota]
Expand All @@ -381,7 +406,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
configChange: func(c *runtime.PodSandboxConfig) {
c.Linux.Resources = &v1.LinuxContainerResources{}
},
specCheck: func(t *testing.T, spec *runtimespec.Spec) {
specCheck: func(t *testing.T, _ *Controller, spec *runtimespec.Spec) {
value, ok := spec.Annotations[annotations.SandboxCPUPeriod]
assert.True(t, ok)
assert.EqualValues(t, "0", value)
Expand All @@ -400,9 +425,17 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
test := test
t.Run(test.desc, func(t *testing.T) {
c := newControllerService()
c.config.RootDir = t.TempDir()
c.config.StateDir = t.TempDir()

defer func() {
assert.NoError(t, unmountRecursive(context.Background(), c.config.StateDir))
}()

c.config.EnableUnprivilegedICMP = true
c.config.EnableUnprivilegedPorts = true
config, imageConfig, specCheck := getRunPodSandboxTestData()

config, imageConfig, specCheck := getRunPodSandboxTestData(c.config)
if test.configChange != nil {
test.configChange(config)
}
Expand All @@ -416,7 +449,7 @@ func TestLinuxSandboxContainerSpec(t *testing.T) {
assert.NotNil(t, spec)
specCheck(t, testID, spec)
if test.specCheck != nil {
test.specCheck(t, spec)
test.specCheck(t, c, spec)
}
})
}
Expand Down Expand Up @@ -757,6 +790,3 @@ options timeout:1
})
}
}

// TODO(random-liu): [P1] Add unit test for different error cases to make sure
// the function cleans up on error properly.
3 changes: 2 additions & 1 deletion internal/cri/server/podsandbox/sandbox_run_other_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,13 @@ package podsandbox
import (
"testing"

criconfig "github.com/containerd/containerd/v2/internal/cri/config"
imagespec "github.com/opencontainers/image-spec/specs-go/v1"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) {
func getRunPodSandboxTestData(_ criconfig.Config) (*runtime.PodSandboxConfig, *imagespec.ImageConfig, func(*testing.T, string, *runtimespec.Spec)) {
config := &runtime.PodSandboxConfig{}
imageConfig := &imagespec.ImageConfig{}
specCheck := func(t *testing.T, id string, spec *runtimespec.Spec) {
Expand Down
Loading

0 comments on commit 8c64a2f

Please sign in to comment.