From b441dfa7293dfff89efb81800ba968ed73f8f0d1 Mon Sep 17 00:00:00 2001
From: Alexander Larsson <alexl@redhat.com>
Date: Fri, 13 Jun 2014 12:43:01 +0200
Subject: [PATCH] Mount cgroups in the container

We mount a tmpfs on /sys/fs/cgroup, and inside it we bind-mount
read-only versions of all the host cgroups. Additionally we make
symlinks for each subsystem of a merged hierarchy (for example
"cpu" -> "cpu,cpuacct").

For any "named" cgroup, such as "name=systemd", we also bind-mount the
subset of the cgroup where the container lives as read-write. This
means that the container can create sub-cgroups inside the container
and move tasks into those, but it can never escape from its current
position in the cgroup hierarchy.

In particular, this allows systemd to mostly work in a non-privileged
container. The only problem currently is that PrivateTmp=true fails
because systemd is not allowed to mount a new /tmp.

Docker-DCO-1.1-Signed-off-by: Alexander Larsson <alexl@redhat.com> (github: alexlarsson)
---
 mount/init.go | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/mount/init.go b/mount/init.go
index 4e913ad1..e61d3c3b 100644
--- a/mount/init.go
+++ b/mount/init.go
@@ -6,9 +6,11 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"strings"
 	"syscall"
 
 	"github.com/docker/libcontainer"
+	"github.com/docker/libcontainer/cgroups"
 	"github.com/docker/libcontainer/label"
 	"github.com/docker/libcontainer/mount/nodes"
 	"github.com/dotcloud/docker/pkg/symlink"
@@ -92,6 +94,69 @@ func mountSystem(rootfs string, container *libcontainer.Container) error {
 			return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err)
 		}
 	}
+
+	// Mount all cgroup subsystems into the container, read-only. Create symlinks for each
+	// subsystem if any subsystems are merged
+	cgroupMounts, err := cgroups.GetCgroupMounts()
+	if err != nil {
+		return err
+	}
+
+	cgroupsDir := filepath.Join(rootfs, "/sys/fs/cgroup")
+
+	for _, m := range cgroupMounts {
+		dir := filepath.Base(m.Mountpoint)
+		mountpoint := filepath.Join(cgroupsDir, dir)
+
+		if err := os.MkdirAll(mountpoint, 0755); err != nil && !os.IsExist(err) {
+			return fmt.Errorf("mkdirall %s %s", mountpoint, err)
+		}
+
+		// Bind-mount the cgroup to /sys/fs/cgroup with the same name as the outer mount
+		if err := system.Mount(m.Mountpoint, mountpoint, "bind", uintptr(syscall.MS_BIND|defaultMountFlags), ""); err != nil {
+			return fmt.Errorf("mounting %s into %s %s", m.Mountpoint, mountpoint, err)
+		}
+		// Make it read-only
+		if err := system.Mount(mountpoint, mountpoint, "bind", uintptr(syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|defaultMountFlags), ""); err != nil {
+			return fmt.Errorf("remounting %s into %s %s", mountpoint, mountpoint, err)
+		}
+
+		hasName := false
+		for _, subsys := range m.Subsystems {
+			isName := strings.HasPrefix(subsys, "name=")
+			canonicalName := subsys
+			if isName {
+				hasName = true
+				canonicalName = subsys[5:]
+			}
+
+			// For the merged case dir will be something like "cpu,cpuacct", so
+			// we make symlinks for all the pure subsystem names "cpu -> cpu,cpuacct", etc
+			if canonicalName != dir {
+				if err := os.Symlink(dir, filepath.Join(cgroupsDir, canonicalName)); err != nil {
+					return fmt.Errorf("creating cgroup symlink for %s: %s", dir, err)
+				}
+			}
+		}
+
+		// For named cgroups, such as name=systemd we mount a read-write subset at the
+		// current cgroup path. This lets e.g. systemd work inside a container, as it can create subcgroups inside the
+		// current cgroup, while not being able to do anything dangerous in the real cgroups
+		if hasName {
+			cgroupPath, _ := m.GetThisCgroupDir()
+			if cgroupPath != "" && cgroupPath != "/" {
+				if err := system.Mount(filepath.Join(m.Mountpoint, cgroupPath), filepath.Join(mountpoint, cgroupPath), "bind", uintptr(syscall.MS_BIND|defaultMountFlags), ""); err != nil {
+					return fmt.Errorf("mounting %s into %s %s", filepath.Join(m.Mountpoint, cgroupPath), filepath.Join(mountpoint, cgroupPath), err)
+				}
+			}
+		}
+	}
+
+	// Make /sys/fs/cgroup read-only
+	if err := system.Mount(cgroupsDir, cgroupsDir, "bind", uintptr(syscall.MS_REMOUNT|syscall.MS_RDONLY|defaultMountFlags), ""); err != nil {
+		return fmt.Errorf("remounting %s read-only %s", cgroupsDir, err)
+	}
+
 	return nil
 }
 
@@ -193,6 +258,7 @@ func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mo
 		{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags},
 		{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags},
 		{source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: label.FormatMountLabel("mode=755", mountLabel)},
+		{source: "tmpfs", path: filepath.Join(rootfs, "sys/fs/cgroup"), device: "tmpfs", flags: defaultMountFlags},
 		{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)},
 		{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)},
 	}