From b441dfa7293dfff89efb81800ba968ed73f8f0d1 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Fri, 13 Jun 2014 12:43:01 +0200 Subject: [PATCH] Mount cgroups in the container We make a tmpfs on /sys/fs/cgroup, and here we mount read-only versions of all the host cgroups. Additionally we make symlinks for all merged subsystems. For any "named" cgroup, such as "name=systemd" we also mount the subset of the cgroup where the container lives as read-write. This means that the container can create sub-cgroups inside the container and move tasks into those, but it can never escape from its current position in the cgroup hierarchy. In particular, this allows systemd to mostly work in a non-privileged container. The only problem currently is that PrivateTmp=true fails because systemd is not allowed to mount a new /tmp. Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- mount/init.go | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/mount/init.go b/mount/init.go index 4e913ad1..e61d3c3b 100644 --- a/mount/init.go +++ b/mount/init.go @@ -6,9 +6,11 @@ import ( "fmt" "os" "path/filepath" + "strings" "syscall" "github.com/docker/libcontainer" + "github.com/docker/libcontainer/cgroups" "github.com/docker/libcontainer/label" "github.com/docker/libcontainer/mount/nodes" "github.com/dotcloud/docker/pkg/symlink" @@ -92,6 +94,69 @@ func mountSystem(rootfs string, container *libcontainer.Container) error { return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err) } } + + // Mount all cgroup subsystems into the container, read-only. 
Create symlinks for each + // subsystem if any subsystems are merged + cgroupMounts, err := cgroups.GetCgroupMounts() + if err != nil { + return err + } + + cgroupsDir := filepath.Join(rootfs, "/sys/fs/cgroup") + + for _, m := range cgroupMounts { + dir := filepath.Base(m.Mountpoint) + mountpoint := filepath.Join(cgroupsDir, dir) + + if err := os.MkdirAll(mountpoint, 0755); err != nil && !os.IsExist(err) { + return fmt.Errorf("mkdirall %s %s", mountpoint, err) + } + + // Bind-mount the cgroup to /sys/fs/cgroup with the same name as the outer mount + if err := system.Mount(m.Mountpoint, mountpoint, "bind", uintptr(syscall.MS_BIND|defaultMountFlags), ""); err != nil { + return fmt.Errorf("mounting %s into %s %s", m.Mountpoint, mountpoint, err) + } + // Make it read-only + if err := system.Mount(mountpoint, mountpoint, "bind", uintptr(syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|defaultMountFlags), ""); err != nil { + return fmt.Errorf("remounting %s into %s %s", mountpoint, mountpoint, err) + } + + hasName := false + for _, subsys := range m.Subsystems { + isName := strings.HasPrefix(subsys, "name=") + canonicalName := subsys + if isName { + hasName = true + canonicalName = subsys[5:] + } + + // For the merged case dir will be something like "cpu,cpuacct", so + // we make symlinks for all the pure subsystem names "cpu -> cpu,cpuacct", etc + if canonicalName != dir { + if err := os.Symlink(dir, filepath.Join(cgroupsDir, canonicalName)); err != nil { + return fmt.Errorf("creating cgroup symlink for %s: %s", dir, err) + } + } + } + + // For named cgroups, such as name=systemd we mount a read-write subset at the + // current cgroup path. This lets e.g. 
systemd work inside a container, as it can create subcgroups inside the + // current cgroup, while not being able to do anything dangerous in the real cgroups + if hasName { + cgroupPath, _ := m.GetThisCgroupDir() + if cgroupPath != "" && cgroupPath != "/" { + if err := system.Mount(filepath.Join(m.Mountpoint, cgroupPath), filepath.Join(mountpoint, cgroupPath), "bind", uintptr(syscall.MS_BIND|defaultMountFlags), ""); err != nil { + return fmt.Errorf("mounting %s into %s %s", filepath.Join(m.Mountpoint, cgroupPath), filepath.Join(mountpoint, cgroupPath), err) + } + } + } + } + + // Make /sys/fs/cgroup read-only + if err := system.Mount(cgroupsDir, cgroupsDir, "bind", uintptr(syscall.MS_REMOUNT|syscall.MS_RDONLY|defaultMountFlags), ""); err != nil { + return fmt.Errorf("remounting %s read-only %s", cgroupsDir, err) + } + return nil } @@ -193,6 +258,7 @@ func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mo {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, {source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: label.FormatMountLabel("mode=755", mountLabel)}, + {source: "tmpfs", path: filepath.Join(rootfs, "sys/fs/cgroup"), device: "tmpfs", flags: defaultMountFlags}, {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)}, {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)}, }