cgroup2: exec: join the cgroup of the init process on EBUSY

Signed-off-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
This commit is contained in:
Akihiro Suda 2020-05-19 20:46:31 +09:00
parent e664e732d5
commit c91fe9aeba
5 changed files with 80 additions and 7 deletions

View File

@ -566,6 +566,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
initProcessPid: state.InitProcessPid,
}, nil
}

View File

@ -5,6 +5,7 @@ package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"io"
"os"
"os/exec"
@ -12,12 +13,14 @@ import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/logs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@ -66,6 +69,7 @@ type setnsProcess struct {
fds []string
process *Process
bootstrapData io.Reader
initProcessPid int
}
func (p *setnsProcess) startTime() (uint64, error) {
@ -100,7 +104,25 @@ func (p *setnsProcess) start() (err error) {
}
if len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups {
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
// On cgroup v2 + nesting + domain controllers, EnterPid may fail with EBUSY.
// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
// Try to join the cgroup of InitProcessPid.
if cgroups.IsCgroup2UnifiedMode() {
initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
if initCgErr == nil {
if initCgPath, ok := initCg[""]; ok {
initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)",
p.pid(), p.cgroupPaths, err, initCg, initCgDirpath)
// NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
}
}
}
if err != nil {
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
}
}
}
if p.intelRdtPath != "" {

View File

@ -136,3 +136,49 @@ EOF
[ "$status" -eq 0 ]
[[ ${lines[0]} == *"cgroups_exec"* ]]
}
@test "runc exec (cgroup v2 + init process in non-root cgroup) succeeds" {
requires root cgroups_v2
set_cgroups_path "$BUSYBOX_BUNDLE"
set_cgroup_mount_writable "$BUSYBOX_BUNDLE"
runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_group
[ "$status" -eq 0 ]
runc exec test_cgroups_group cat /sys/fs/cgroup/cgroup.controllers
[ "$status" -eq 0 ]
[[ ${lines[0]} == *"memory"* ]]
runc exec test_cgroups_group cat /proc/self/cgroup
[ "$status" -eq 0 ]
[[ ${lines[0]} == "0::/" ]]
runc exec test_cgroups_group mkdir /sys/fs/cgroup/foo
[ "$status" -eq 0 ]
runc exec test_cgroups_group sh -c "echo 1 > /sys/fs/cgroup/foo/cgroup.procs"
[ "$status" -eq 0 ]
# the init process is now in "/foo", but an exec process can still join "/"
# because we haven't enabled any domain controller.
runc exec test_cgroups_group cat /proc/self/cgroup
[ "$status" -eq 0 ]
[[ ${lines[0]} == "0::/" ]]
# turn on a domain controller (memory)
runc exec test_cgroups_group sh -euxc 'echo $$ > /sys/fs/cgroup/foo/cgroup.procs; echo +memory > /sys/fs/cgroup/cgroup.subtree_control'
[ "$status" -eq 0 ]
# an exec process can no longer join "/" after turning on a domain controller.
# falls back to "/foo".
runc exec test_cgroups_group cat /proc/self/cgroup
[ "$status" -eq 0 ]
[[ ${lines[0]} == "0::/foo" ]]
# teardown: remove "/foo"
runc exec test_cgroups_group sh -uxc 'echo -memory > /sys/fs/cgroup/cgroup.subtree_control; for f in $(cat /sys/fs/cgroup/foo/cgroup.procs); do echo $f > /sys/fs/cgroup/cgroup.procs; done; rmdir /sys/fs/cgroup/foo'
runc exec test_cgroups_group test ! -d /sys/fs/cgroup/foo
[ "$status" -eq 0 ]
#
}

View File

@ -55,12 +55,7 @@ function teardown() {
@test "runc delete --force in cgroupv2 with subcgroups" {
requires cgroups_v2 root
set_cgroups_path "$BUSYBOX_BUNDLE"
# grant `rw` priviledge to `/sys/fs/cgroup`
cat "${BUSYBOX_BUNDLE}/config.json"\
| jq '.mounts |= map((select(.type=="cgroup") | .options -= ["ro"]) // .)'\
> "${BUSYBOX_BUNDLE}/config.json.tmp"
mv "${BUSYBOX_BUNDLE}/config.json"{.tmp,}
set_cgroup_mount_writable "$BUSYBOX_BUNDLE"
# run busybox detached
runc run -d --console-socket $CONSOLE_SOCKET test_busybox

View File

@ -197,6 +197,15 @@ function set_resources_limit() {
sed -i 's/\("linux": {\)/\1\n "resources": { "pids": { "limit": 100 } },/' "$bundle/config.json"
}
# Helper function to make /sys/fs/cgroup writable
function set_cgroup_mount_writable() {
bundle="${1:-.}"
cat "$bundle/config.json" \
| jq '.mounts |= map((select(.type == "cgroup") | .options -= ["ro"]) // .)' \
>"$bundle/config.json.tmp"
mv "$bundle/config.json"{.tmp,}
}
# Fails the current test, providing the error given.
function fail() {
echo "$@" >&2