cgroup v2: support rootless systemd

Tested with both Podman (master) and Moby (master), on Ubuntu 19.10 .

$ podman --cgroup-manager=systemd run -it --rm --runtime=runc \
  --cgroupns=host --memory 42m --cpus 0.42 --pids-limit 42 alpine
/ # cat /proc/self/cgroup
0::/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope
/ # cat /sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope/memory.max
44040192
/ # cat /sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope/cpu.max
42000 100000
/ # cat /sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope/pids.max
42

Signed-off-by: Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp>
This commit is contained in:
Akihiro Suda 2020-04-01 10:47:06 +09:00
parent 492cfd8bf9
commit bf15cc99b1
13 changed files with 276 additions and 43 deletions

View File

@ -36,7 +36,9 @@ matrix:
- sudo ssh default -t 'cd /vagrant && sudo make localintegration RUNC_USE_SYSTEMD=yes'
# same setup but with fs2 driver instead of systemd
- sudo ssh default -t 'cd /vagrant && sudo make localintegration'
# rootless
# cgroupv2+systemd (rootless)
- sudo ssh default -t 'cd /vagrant && sudo make localrootlessintegration RUNC_USE_SYSTEMD=yes'
# same setup but with fs2 driver (rootless) instead of systemd
- sudo ssh default -t 'cd /vagrant && sudo make localrootlessintegration'
allow_failures:
- go: tip

16
Vagrantfile vendored
View File

@ -28,9 +28,25 @@ EOF
# Add a user for rootless tests
useradd -u2000 -m -d/home/rootless -s/bin/bash rootless
# Allow root to execute `ssh rootless@localhost` in tests/rootless.sh
ssh-keygen -t ecdsa -N "" -f /root/rootless.key
mkdir -m 0700 -p /home/rootless/.ssh
cat /root/rootless.key.pub >> /home/rootless/.ssh/authorized_keys
chown -R rootless.rootless /home/rootless
# Add busybox for libcontainer/integration tests
. /vagrant/tests/integration/multi-arch.bash \
&& mkdir /busybox \
&& curl -fsSL $(get_busybox) | tar xfJC - /busybox
# Delegate cgroup v2 controllers to rootless user via --systemd-cgroup
mkdir -p /etc/systemd/system/user@.service.d
cat > /etc/systemd/system/user@.service.d/delegate.conf << EOF
[Service]
# default: Delegate=pids memory
# NOTE: delegation of cpuset requires systemd >= 244 (Fedora >= 32, Ubuntu >= 20.04). cpuset is ignored on Fedora 31.
Delegate=cpu cpuset io memory pids
EOF
systemctl daemon-reload
SHELL
end

View File

@ -71,9 +71,13 @@ func ExpandSlice(slice string) (string, error) {
// getDbusConnection lazy initializes systemd dbus connection
// and returns it
func getDbusConnection() (*systemdDbus.Conn, error) {
func getDbusConnection(rootless bool) (*systemdDbus.Conn, error) {
connOnce.Do(func() {
connDbus, connErr = systemdDbus.New()
if rootless {
connDbus, connErr = NewUserSystemdDbus()
} else {
connDbus, connErr = systemdDbus.New()
}
})
return connDbus, connErr
}
@ -103,12 +107,7 @@ func isUnitExists(err error) bool {
return false
}
func startUnit(unitName string, properties []systemdDbus.Property) error {
dbusConnection, err := getDbusConnection()
if err != nil {
return err
}
func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []systemdDbus.Property) error {
statusChan := make(chan string, 1)
if _, err := dbusConnection.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
select {
@ -129,12 +128,7 @@ func startUnit(unitName string, properties []systemdDbus.Property) error {
return nil
}
func stopUnit(unitName string) error {
dbusConnection, err := getDbusConnection()
if err != nil {
return err
}
func stopUnit(dbusConnection *systemdDbus.Conn, unitName string) error {
statusChan := make(chan string, 1)
if _, err := dbusConnection.StopUnit(unitName, "replace", statusChan); err == nil {
select {

View File

@ -0,0 +1,106 @@
// +build linux
package systemd
import (
"bufio"
"bytes"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/pkg/errors"
)
// NewUserSystemdDbus creates a connection for systemd user-instance.
func NewUserSystemdDbus() (*systemdDbus.Conn, error) {
addr, err := DetectUserDbusSessionBusAddress()
if err != nil {
return nil, err
}
uid, err := DetectUID()
if err != nil {
return nil, err
}
return systemdDbus.NewConnection(func() (*dbus.Conn, error) {
conn, err := dbus.Dial(addr)
if err != nil {
return nil, errors.Wrapf(err, "error while dialing %q", addr)
}
methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))}
err = conn.Auth(methods)
if err != nil {
conn.Close()
return nil, errors.Wrapf(err, "error while authenticating connection, address=%q, UID=%d", addr, uid)
}
if err = conn.Hello(); err != nil {
conn.Close()
return nil, errors.Wrapf(err, "error while sending Hello message, address=%q, UID=%d", addr, uid)
}
return conn, nil
})
}
// DetectUID detects UID from the OwnerUID field of `busctl --user status`
// if running in userNS. The value corresponds to sd_bus_creds_get_owner_uid(3) .
//
// Otherwise returns os.Getuid() .
func DetectUID() (int, error) {
if !system.RunningInUserNS() {
return os.Getuid(), nil
}
b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput()
if err != nil {
return -1, errors.Wrap(err, "could not execute `busctl --user --no-pager status`")
}
scanner := bufio.NewScanner(bytes.NewReader(b))
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(s, "OwnerUID=") {
uidStr := strings.TrimPrefix(s, "OwnerUID=")
i, err := strconv.Atoi(uidStr)
if err != nil {
return -1, errors.Wrapf(err, "could not detect the OwnerUID: %s", s)
}
return i, nil
}
}
if err := scanner.Err(); err != nil {
return -1, err
}
return -1, errors.New("could not detect the OwnerUID")
}
// DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS if set.
// Otherwise returns "unix:path=$XDG_RUNTIME_DIR/bus" if $XDG_RUNTIME_DIR/bus exists.
// Otherwise parses the value from `systemctl --user show-environment` .
func DetectUserDbusSessionBusAddress() (string, error) {
if env := os.Getenv("DBUS_SESSION_BUS_ADDRESS"); env != "" {
return env, nil
}
if xdr := os.Getenv("XDG_RUNTIME_DIR"); xdr != "" {
busPath := filepath.Join(xdr, "bus")
if _, err := os.Stat(busPath); err == nil {
busAddress := "unix:path=" + busPath
return busAddress, nil
}
}
b, err := exec.Command("systemctl", "--user", "--no-pager", "show-environment").CombinedOutput()
if err != nil {
return "", errors.Wrapf(err, "could not execute `systemctl --user --no-pager show-environment`, output=%q", string(b))
}
scanner := bufio.NewScanner(bytes.NewReader(b))
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(s, "DBUS_SESSION_BUS_ADDRESS=") {
return strings.TrimPrefix(s, "DBUS_SESSION_BUS_ADDRESS="), nil
}
}
return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`")
}

View File

@ -182,7 +182,11 @@ func (m *LegacyManager) Apply(pid int) error {
properties = append(properties, resourcesProperties...)
properties = append(properties, c.SystemdProps...)
if err := startUnit(unitName, properties); err != nil {
dbusConnection, err := getDbusConnection(false)
if err != nil {
return err
}
if err := startUnit(dbusConnection, unitName, properties); err != nil {
return err
}
@ -213,8 +217,12 @@ func (m *LegacyManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
dbusConnection, err := getDbusConnection(false)
if err != nil {
return err
}
unitName := getUnitName(m.Cgroups)
if err := stopUnit(unitName); err != nil {
if err := stopUnit(dbusConnection, unitName); err != nil {
return err
}
m.Paths = make(map[string]string)
@ -371,7 +379,7 @@ func (m *LegacyManager) Set(container *configs.Config) error {
if err != nil {
return err
}
dbusConnection, err := getDbusConnection()
dbusConnection, err := getDbusConnection(false)
if err != nil {
return err
}

View File

@ -6,6 +6,7 @@ import (
"math"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
@ -14,6 +15,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
)
type unifiedManager struct {
@ -89,7 +91,6 @@ func (m *unifiedManager) Apply(pid int) error {
var (
c = m.cgroups
unitName = getUnitName(c)
slice = "system.slice"
properties []systemdDbus.Property
)
@ -97,6 +98,10 @@ func (m *unifiedManager) Apply(pid int) error {
return cgroups.WriteCgroupProc(m.path, pid)
}
slice := "system.slice"
if m.rootless {
slice = "user.slice"
}
if c.Parent != "" {
slice = c.Parent
}
@ -140,9 +145,13 @@ func (m *unifiedManager) Apply(pid int) error {
properties = append(properties, resourcesProperties...)
properties = append(properties, c.SystemdProps...)
if err := startUnit(unitName, properties); err != nil {
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
return err
}
if err := startUnit(dbusConnection, unitName, properties); err != nil {
return errors.Wrapf(err, "error while starting unit %q with properties %+v", unitName, properties)
}
_, err = m.GetUnifiedPath()
if err != nil {
@ -161,13 +170,17 @@ func (m *unifiedManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
return err
}
unitName := getUnitName(m.cgroups)
if err := stopUnit(unitName); err != nil {
if err := stopUnit(dbusConnection, unitName); err != nil {
return err
}
// XXX this is probably not needed, systemd should handle it
err := os.Remove(m.path)
err = os.Remove(m.path)
if err != nil && !os.IsNotExist(err) {
return err
}
@ -190,6 +203,44 @@ func (m *unifiedManager) GetPaths() map[string]string {
return paths
}
// getSliceFull value is used in GetUnifiedPath.
// The value is incompatible with systemdDbus.PropSlice.
func (m *unifiedManager) getSliceFull() (string, error) {
c := m.cgroups
slice := "system.slice"
if m.rootless {
slice = "user.slice"
}
if c.Parent != "" {
var err error
slice, err = ExpandSlice(c.Parent)
if err != nil {
return "", err
}
}
if m.rootless {
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
return "", err
}
// managerCGQuoted is typically "/user.slice/user-${uid}.slice/user@${uid}.service" including the quote symbols
managerCGQuoted, err := dbusConnection.GetManagerProperty("ControlGroup")
if err != nil {
return "", err
}
managerCG, err := strconv.Unquote(managerCGQuoted)
if err != nil {
return "", err
}
slice = filepath.Join(managerCG, slice)
}
// an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice"
// NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified.
return slice, nil
}
func (m *unifiedManager) GetUnifiedPath() (string, error) {
m.mu.Lock()
defer m.mu.Unlock()
@ -197,24 +248,21 @@ func (m *unifiedManager) GetUnifiedPath() (string, error) {
return m.path, nil
}
c := m.cgroups
slice := "system.slice"
if c.Parent != "" {
slice = c.Parent
}
slice, err := ExpandSlice(slice)
sliceFull, err := m.getSliceFull()
if err != nil {
return "", err
}
path := filepath.Join(slice, getUnitName(c))
c := m.cgroups
path := filepath.Join(sliceFull, getUnitName(c))
path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path)
if err != nil {
return "", err
}
m.path = path
// an example of the final path in rootless:
// "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope"
return m.path, nil
}
@ -263,12 +311,12 @@ func (m *unifiedManager) Set(container *configs.Config) error {
if err != nil {
return err
}
dbusConnection, err := getDbusConnection()
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
return err
}
if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil {
return err
return errors.Wrap(err, "error while setting unit properties")
}
fsMgr, err := m.fsManager()

View File

@ -97,6 +97,18 @@ func SystemdCgroups(l *LinuxFactory) error {
return nil
}
// RootlessSystemdCgroups is rootless version of SystemdCgroups.
func RootlessSystemdCgroups(l *LinuxFactory) error {
if !systemd.IsRunningSystemd() {
return fmt.Errorf("systemd not running on this host, can't use systemd as cgroups manager")
}
if !cgroups.IsCgroup2UnifiedMode() {
return fmt.Errorf("cgroup v2 not enabled on this host, can't use systemd (rootless) as cgroups manager")
}
return systemdCgroupV2(l, true)
}
func cgroupfs2(l *LinuxFactory, rootless bool) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
m, err := fs2.NewManager(config, getUnifiedPath(paths), rootless)

View File

@ -19,10 +19,6 @@ func shouldUseRootlessCgroupManager(context *cli.Context) (bool, error) {
if b != nil {
return *b, nil
}
if context.GlobalBool("systemd-cgroup") {
return false, nil
}
}
if os.Geteuid() != 0 {
return true, nil

View File

@ -79,6 +79,8 @@ EOF
@test "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" {
requires rootless
requires rootless_no_cgroup
# systemd controls the permission, so error does not happen
requires no_systemd
set_cgroups_path "$BUSYBOX_BUNDLE"
@ -90,6 +92,8 @@ EOF
@test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" {
requires rootless
requires rootless_no_cgroup
# systemd controls the permission, so error does not happen
requires no_systemd
set_resources_limit "$BUSYBOX_BUNDLE"

View File

@ -110,8 +110,14 @@ function init_cgroup_paths() {
test -n "$CGROUP_UNIFIED" && return
if [ -n "${RUNC_USE_SYSTEMD}" ] ; then
REL_CGROUPS_PATH="/machine.slice/runc-cgroups-integration-test.scope"
OCI_CGROUPS_PATH="machine.slice:runc-cgroups:integration-test"
if [ $(id -u) = "0" ]; then
REL_CGROUPS_PATH="/machine.slice/runc-cgroups-integration-test.scope"
OCI_CGROUPS_PATH="machine.slice:runc-cgroups:integration-test"
else
REL_CGROUPS_PATH="/user.slice/user-$(id -u).slice/user@$(id -u).service/machine.slice/runc-cgroups-integration-test.scope"
# OCI path doesn't contain "/user.slice/user-$(id -u).slice/user@$(id -u).service/" prefix
OCI_CGROUPS_PATH="machine.slice:runc-cgroups:integration-test"
fi
else
REL_CGROUPS_PATH="/runc-cgroups-integration-test/test-cgroup"
OCI_CGROUPS_PATH=$REL_CGROUPS_PATH
@ -169,7 +175,11 @@ function check_systemd_value() {
source=$2
expected=$3
current=$(systemctl show $unitname | grep $source)
if [ $(id -u) = "0" ]; then
current=$(systemctl show $unitname | grep $source)
else
current=$(systemctl --user show $unitname | grep $source)
fi
echo "current" $current "!?" "$expected"
[ "$current" = "$expected" ]
}
@ -245,6 +255,16 @@ function requires() {
skip "Test requires cgroups v2 (unified)"
fi
;;
systemd)
if [ -z "${RUNC_USE_SYSTEMD}" ]; then
skip "Test requires systemd"
fi
;;
no_systemd)
if [ -n "${RUNC_USE_SYSTEMD}" ]; then
skip "Test requires no systemd"
fi
;;
*)
fail "BUG: Invalid requires ${var}."
;;

View File

@ -29,17 +29,30 @@ function setup() {
},
"pids": {
"limit": 20
},
}
EOF
)
DATA=$(echo ${DATA} | sed 's/\n/\\n/g')
sed -i "s/\(\"resources\": {\)/\1\n${DATA}/" ${BUSYBOX_BUNDLE}/config.json
if grep -qw \"resources\" ${BUSYBOX_BUNDLE}/config.json; then
sed -i "s/\(\"resources\": {\)/\1\n${DATA},/" ${BUSYBOX_BUNDLE}/config.json
else
sed -i "s/\(\"linux\": {\)/\1\n\"resources\": {${DATA}},/" ${BUSYBOX_BUNDLE}/config.json
fi
}
# Tests whatever limits are (more or less) common between cgroup
# v1 and v2: memory/swap, pids, and cpuset.
@test "update cgroup v1/v2 common limits" {
[[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup
[[ "$ROOTLESS" -ne 0 && -z "$RUNC_USE_SYSTEMD" ]] && requires rootless_cgroup
if [[ "$ROOTLESS" -ne 0 && -n "$RUNC_USE_SYSTEMD" ]]; then
file="/sys/fs/cgroup/user.slice/user-$(id -u).slice/user@$(id -u).service/cgroup.controllers"
# NOTE: delegation of cpuset requires systemd >= 244 (Fedora >= 32, Ubuntu >= 20.04).
for f in memory pids cpuset; do
if grep -qwv $f $file; then
skip "$f is not enabled in $file"
fi
done
fi
init_cgroup_paths
# run a few busyboxes detached

View File

@ -20,6 +20,10 @@
# and add an enable_* and disable_* hook.
ALL_FEATURES=("idmap" "cgroup")
# cgroup is managed by systemd when RUNC_USE_SYSTEMD is set
if [[ -n "${RUNC_USE_SYSTEMD}" ]] ; then
ALL_FEATURES=("idmap")
fi
ROOT="$(readlink -f "$(dirname "${BASH_SOURCE}")/..")"
# FEATURE: Opportunistic new{uid,gid}map support, allowing a rootless container
@ -143,6 +147,13 @@ do
set -e
echo path: $PATH
export ROOTLESS_FEATURES="$enabled_features"
sudo -HE -u rootless PATH="$PATH" bats -t "$ROOT/tests/integration$ROOTLESS_TESTPATH"
if [[ -n "${RUNC_USE_SYSTEMD}" ]] ; then
# We use `ssh rootless@localhost` instead of `sudo -u rootless` for creating systemd user session.
# Alternatively we could use `machinectl shell`, but it is known not to work well on SELinux-enabled hosts as of April 2020:
# https://bugzilla.redhat.com/show_bug.cgi?id=1788616
ssh -t -t -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i $HOME/rootless.key rootless@localhost -- PATH="$PATH" RUNC_USE_SYSTEMD="$RUNC_USE_SYSTEMD" bats -t "$ROOT/tests/integration$ROOTLESS_TESTPATH"
else
sudo -HE -u rootless PATH="$PATH" bats -t "$ROOT/tests/integration$ROOTLESS_TESTPATH"
fi
set +e
done

View File

@ -49,6 +49,9 @@ func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
if context.GlobalBool("systemd-cgroup") {
if systemd.IsRunningSystemd() {
cgroupManager = libcontainer.SystemdCgroups
if rootlessCg {
cgroupManager = libcontainer.RootlessSystemdCgroups
}
} else {
return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
}