libcontainer: let "runc exec" fall back to the init process's cgroup on cgroup v2

When cgroups.EnterPid fails for an exec'd process and the host is in cgroup v2
unified mode, read /proc/<initProcessPid>/cgroup and try to add the process to
that cgroup under fs2.UnifiedMountpoint instead. The original EnterPid error is
returned only if the fallback is unavailable or fails as well.

The integration tests gain a set_cgroup_mount_writable helper (shared with
delete.bats) and a cgroup v2 test exercising the fallback.

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 06d4c553..10e684b8 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -566,6 +566,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP
 		config:          c.newInitConfig(p),
 		process:         p,
 		bootstrapData:   data,
+		initProcessPid:  state.InitProcessPid,
 	}, nil
 }
 
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index 6123d68b..b8599761 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -5,6 +5,7 @@ package libcontainer
 import (
 	"encoding/json"
 	"errors"
+	"fmt"
 	"io"
 	"os"
 	"os/exec"
@@ -12,12 +13,14 @@ import (
 	"strconv"
 
 	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/intelrdt"
 	"github.com/opencontainers/runc/libcontainer/logs"
 	"github.com/opencontainers/runc/libcontainer/system"
 	"github.com/opencontainers/runc/libcontainer/utils"
 
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -66,6 +69,7 @@ type setnsProcess struct {
 	fds             []string
 	process         *Process
 	bootstrapData   io.Reader
+	initProcessPid  int
 }
 
 func (p *setnsProcess) startTime() (uint64, error) {
@@ -100,7 +104,25 @@ func (p *setnsProcess) start() (err error) {
 	}
 	if len(p.cgroupPaths) > 0 {
 		if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups {
-			return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
+			// On cgroup v2 + nesting + domain controllers, EnterPid may fail with EBUSY.
+			// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
+			// Try to join the cgroup of InitProcessPid.
+			if cgroups.IsCgroup2UnifiedMode() {
+				initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
+				initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
+				if initCgErr == nil {
+					if initCgPath, ok := initCg[""]; ok {
+						initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
+						logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)",
+							p.pid(), p.cgroupPaths, err, initCgDirpath, initProcCgroupFile)
+						// NOTE: initCgDirpath is not guaranteed to exist because we didn't pause the container.
+						err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
+					}
+				}
+			}
+			if err != nil {
+				return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
+			}
 		}
 	}
 	if p.intelRdtPath != "" {
diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats
index d5df2858..14c30889 100644
--- a/tests/integration/cgroups.bats
+++ b/tests/integration/cgroups.bats
@@ -136,3 +136,49 @@ EOF
     [ "$status" -eq 0 ]
     [[ ${lines[0]} == *"cgroups_exec"* ]]
 }
+
+@test "runc exec (cgroup v2 + init process in non-root cgroup) succeeds" {
+    requires root cgroups_v2
+
+    set_cgroups_path "$BUSYBOX_BUNDLE"
+    set_cgroup_mount_writable "$BUSYBOX_BUNDLE"
+
+    runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_group
+    [ "$status" -eq 0 ]
+
+    runc exec test_cgroups_group cat /sys/fs/cgroup/cgroup.controllers
+    [ "$status" -eq 0 ]
+    [[ ${lines[0]} == *"memory"* ]]
+
+    runc exec test_cgroups_group cat /proc/self/cgroup
+    [ "$status" -eq 0 ]
+    [[ ${lines[0]} == "0::/" ]]
+
+    runc exec test_cgroups_group mkdir /sys/fs/cgroup/foo
+    [ "$status" -eq 0 ]
+
+    runc exec test_cgroups_group sh -c "echo 1 > /sys/fs/cgroup/foo/cgroup.procs"
+    [ "$status" -eq 0 ]
+
+# the init process is now in "/foo", but an exec process can still join "/"
+# because we haven't enabled any domain controller.
+    runc exec test_cgroups_group cat /proc/self/cgroup
+    [ "$status" -eq 0 ]
+    [[ ${lines[0]} == "0::/" ]]
+
+# turn on a domain controller (memory)
+    runc exec test_cgroups_group sh -euxc 'echo $$ > /sys/fs/cgroup/foo/cgroup.procs; echo +memory > /sys/fs/cgroup/cgroup.subtree_control'
+    [ "$status" -eq 0 ]
+
+# an exec process can no longer join "/" after turning on a domain controller.
+# falls back to "/foo".
+    runc exec test_cgroups_group cat /proc/self/cgroup
+    [ "$status" -eq 0 ]
+    [[ ${lines[0]} == "0::/foo" ]]
+
+# teardown: remove "/foo"
+    runc exec test_cgroups_group sh -uxc 'echo -memory > /sys/fs/cgroup/cgroup.subtree_control; for f in $(cat /sys/fs/cgroup/foo/cgroup.procs); do echo $f > /sys/fs/cgroup/cgroup.procs; done; rmdir /sys/fs/cgroup/foo'
+    runc exec test_cgroups_group test ! -d /sys/fs/cgroup/foo
+    [ "$status" -eq 0 ]
+#
+}
diff --git a/tests/integration/delete.bats b/tests/integration/delete.bats
index 9c80909d..bccd52e7 100644
--- a/tests/integration/delete.bats
+++ b/tests/integration/delete.bats
@@ -55,12 +55,7 @@ function teardown() {
 @test "runc delete --force in cgroupv2 with subcgroups" {
   requires cgroups_v2 root
   set_cgroups_path "$BUSYBOX_BUNDLE"
-
-  # grant `rw` priviledge to `/sys/fs/cgroup`
-  cat "${BUSYBOX_BUNDLE}/config.json"\
-    | jq '.mounts |= map((select(.type=="cgroup") | .options -= ["ro"]) // .)'\
-    > "${BUSYBOX_BUNDLE}/config.json.tmp"
-  mv "${BUSYBOX_BUNDLE}/config.json"{.tmp,}
+  set_cgroup_mount_writable "$BUSYBOX_BUNDLE"
 
   # run busybox detached
   runc run -d --console-socket $CONSOLE_SOCKET test_busybox
diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash
index c712b2d0..98eb36f0 100644
--- a/tests/integration/helpers.bash
+++ b/tests/integration/helpers.bash
@@ -197,6 +197,15 @@ function set_resources_limit() {
   sed -i 's/\("linux": {\)/\1\n "resources": { "pids": { "limit": 100 } },/' "$bundle/config.json"
 }
 
+# Helper function to make /sys/fs/cgroup writable
+function set_cgroup_mount_writable() {
+  bundle="${1:-.}"
+  cat "$bundle/config.json" \
+    | jq '.mounts |= map((select(.type == "cgroup") | .options -= ["ro"]) // .)' \
+    >"$bundle/config.json.tmp"
+  mv "$bundle/config.json"{.tmp,}
+}
+
 # Fails the current test, providing the error given.
 function fail() {
   echo "$@" >&2