From baeef298582869504e73651e2b0fb78b156e5783 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 26 Apr 2016 02:19:39 +1000 Subject: [PATCH] rootless: add rootless cgroup manager The rootless cgroup manager acts as a noop for all set and apply operations. It is just used for rootless setups. Currently this is far too simple (we need to add opportunistic cgroup management), but is good enough as a first-pass at a noop cgroup manager. Signed-off-by: Aleksa Sarai --- libcontainer/cgroups/fs/apply_raw.go | 24 +--- libcontainer/cgroups/rootless/rootless.go | 128 ++++++++++++++++++ libcontainer/cgroups/systemd/apply_systemd.go | 2 +- libcontainer/cgroups/utils.go | 41 +++++- libcontainer/container_linux.go | 8 ++ libcontainer/factory_linux.go | 22 +++ libcontainer/process_linux.go | 20 ++- libcontainer/rootfs_linux.go | 2 +- 8 files changed, 210 insertions(+), 37 deletions(-) create mode 100644 libcontainer/cgroups/rootless/rootless.go diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go index d316313c..22d82acb 100644 --- a/libcontainer/cgroups/fs/apply_raw.go +++ b/libcontainer/cgroups/fs/apply_raw.go @@ -267,25 +267,8 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { }, nil } -func (raw *cgroupData) parentPath(subsystem, mountpoint, root string) (string, error) { - // Use GetThisCgroupDir instead of GetInitCgroupDir, because the creating - // process could in container and shared pid namespace with host, and - // /proc/1/cgroup could point to whole other world of cgroups. - initPath, err := cgroups.GetThisCgroupDir(subsystem) - if err != nil { - return "", err - } - // This is needed for nested containers, because in /proc/self/cgroup we - // see pathes from host, which don't exist in container. 
- relDir, err := filepath.Rel(root, initPath) - if err != nil { - return "", err - } - return filepath.Join(mountpoint, relDir), nil -} - func (raw *cgroupData) path(subsystem string) (string, error) { - mnt, root, err := cgroups.FindCgroupMountpointAndRoot(subsystem) + mnt, err := cgroups.FindCgroupMountpoint(subsystem) // If we didn't mount the subsystem, there is no point we make the path. if err != nil { return "", err @@ -297,7 +280,10 @@ func (raw *cgroupData) path(subsystem string) (string, error) { return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil } - parentPath, err := raw.parentPath(subsystem, mnt, root) + // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating + // process could be in a container and share the pid namespace with the host, + // and /proc/1/cgroup could point to a whole other world of cgroups. + parentPath, err := cgroups.GetOwnCgroupPath(subsystem) if err != nil { return "", err } diff --git a/libcontainer/cgroups/rootless/rootless.go b/libcontainer/cgroups/rootless/rootless.go new file mode 100644 index 00000000..b1efbfd9 --- /dev/null +++ b/libcontainer/cgroups/rootless/rootless.go @@ -0,0 +1,128 @@ +// +build linux + +package rootless + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/configs/validate" +) + +// TODO: This is copied from libcontainer/cgroups/fs, which duplicates this code +// needlessly. We should probably export this list. 
+ +var subsystems = []subsystem{ + &fs.CpusetGroup{}, + &fs.DevicesGroup{}, + &fs.MemoryGroup{}, + &fs.CpuGroup{}, + &fs.CpuacctGroup{}, + &fs.PidsGroup{}, + &fs.BlkioGroup{}, + &fs.HugetlbGroup{}, + &fs.NetClsGroup{}, + &fs.NetPrioGroup{}, + &fs.PerfEventGroup{}, + &fs.FreezerGroup{}, + &fs.NameGroup{GroupName: "name=systemd"}, +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error +} + +// The noop cgroup manager is used for rootless containers, because we currently +// cannot manage cgroups if we are in a rootless setup. This manager is chosen +// by factory if we are in rootless mode. We error out if any cgroup options are +// set in the config -- this may change in the future with upcoming kernel features +// like the cgroup namespace. + +type Manager struct { + Cgroups *configs.Cgroup + Paths map[string]string +} + +func (m *Manager) Apply(pid int) error { + // If there are no cgroup settings, there's nothing to do. + if m.Cgroups == nil { + return nil + } + + // We can't set paths. + // TODO(cyphar): Implement the case where the runner of a rootless container + // owns their own cgroup, which would allow us to set up a + // cgroup for each path. + if m.Cgroups.Paths != nil { + return fmt.Errorf("cannot change cgroup path in rootless container") + } + + // We load the paths into the manager. + paths := make(map[string]string) + for _, sys := range subsystems { + name := sys.Name() + + path, err := cgroups.GetOwnCgroupPath(name) + if err != nil { + // Ignore paths we couldn't resolve. 
+ continue + } + + paths[name] = path + } + + m.Paths = paths + return nil +} + +func (m *Manager) GetPaths() map[string]string { + return m.Paths +} + +func (m *Manager) Set(container *configs.Config) error { + // We have to re-do the validation here, since someone might decide to + // update a rootless container. + return validate.New().Validate(container) +} + +func (m *Manager) GetPids() ([]int, error) { + dir, err := cgroups.GetOwnCgroupPath("devices") + if err != nil { + return nil, err + } + return cgroups.GetPids(dir) +} + +func (m *Manager) GetAllPids() ([]int, error) { + dir, err := cgroups.GetOwnCgroupPath("devices") + if err != nil { + return nil, err + } + return cgroups.GetAllPids(dir) +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + // TODO(cyphar): We can make this work if we figure out a way to allow usage + // of cgroups with a rootless container. While this doesn't + // actually require write access to a cgroup directory, the + // statistics are not useful if they can be affected by + // non-container processes. + return nil, fmt.Errorf("cannot get cgroup stats in rootless container") +} + +func (m *Manager) Freeze(state configs.FreezerState) error { + // TODO(cyphar): We can make this work if we figure out a way to allow usage + // of cgroups with a rootless container. + return fmt.Errorf("cannot use freezer cgroup in rootless container") +} + +func (m *Manager) Destroy() error { + // We don't have to do anything here because we didn't do any setup. 
+ return nil +} diff --git a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go index 2872bfac..456c57d9 100644 --- a/libcontainer/cgroups/systemd/apply_systemd.go +++ b/libcontainer/cgroups/systemd/apply_systemd.go @@ -426,7 +426,7 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { return "", err } - initPath, err := cgroups.GetInitCgroupDir(subsystem) + initPath, err := cgroups.GetInitCgroup(subsystem) if err != nil { return "", err } diff --git a/libcontainer/cgroups/utils.go b/libcontainer/cgroups/utils.go index 52fc87eb..5db37344 100644 --- a/libcontainer/cgroups/utils.go +++ b/libcontainer/cgroups/utils.go @@ -109,7 +109,7 @@ type Mount struct { Subsystems []string } -func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) { +func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { if len(m.Subsystems) == 0 { return "", fmt.Errorf("no subsystem for mount") } @@ -203,8 +203,8 @@ func GetAllSubsystems() ([]string, error) { return subsystems, nil } -// GetThisCgroupDir returns the relative path to the cgroup docker is running in. -func GetThisCgroupDir(subsystem string) (string, error) { +// GetOwnCgroup returns the relative path to the cgroup docker is running in. 
+func GetOwnCgroup(subsystem string) (string, error) { cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return "", err @@ -213,8 +213,16 @@ func GetThisCgroupDir(subsystem string) (string, error) { return getControllerPath(subsystem, cgroups) } -func GetInitCgroupDir(subsystem string) (string, error) { +func GetOwnCgroupPath(subsystem string) (string, error) { + cgroup, err := GetOwnCgroup(subsystem) + if err != nil { + return "", err + } + return getCgroupPathHelper(subsystem, cgroup) +} + +func GetInitCgroup(subsystem string) (string, error) { cgroups, err := ParseCgroupFile("/proc/1/cgroup") if err != nil { return "", err @@ -223,6 +231,31 @@ func GetInitCgroupDir(subsystem string) (string, error) { return getControllerPath(subsystem, cgroups) } +func GetInitCgroupPath(subsystem string) (string, error) { + cgroup, err := GetInitCgroup(subsystem) + if err != nil { + return "", err + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func getCgroupPathHelper(subsystem, cgroup string) (string, error) { + mnt, root, err := FindCgroupMountpointAndRoot(subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see paths from the host, which don't exist in the container. + relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + return filepath.Join(mnt, relCgroup), nil +} + func readProcsFile(dir string) ([]int, error) { f, err := os.Open(filepath.Join(dir, CgroupProcesses)) if err != nil { diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index c3dd42d2..f3b73ee0 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -520,10 +520,18 @@ func (c *linuxContainer) Resume() error { } func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { + // XXX(cyphar): This requires cgroups. 
+ if c.config.Rootless { + return nil, fmt.Errorf("cannot get OOM notifications from rootless container") + } return notifyOnOOM(c.cgroupManager.GetPaths()) } func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { + // XXX(cyphar): This requires cgroups. + if c.config.Rootless { + return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container") + } return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) } diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index d5532875..1f965e62 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -15,6 +15,7 @@ import ( "github.com/docker/docker/pkg/mount" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups/rootless" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs/validate" @@ -73,6 +74,20 @@ func Cgroupfs(l *LinuxFactory) error { return nil } +// RootlessCgroups is an options func to configure a LinuxFactory to +// return containers that use the "rootless" cgroup manager, which will +// fail to do any operations not possible to do with an unprivileged user. +// It should only be used in conjunction with rootless containers. +func RootlessCgroups(l *LinuxFactory) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &rootless.Manager{ + Cgroups: config, + Paths: paths, + } + } + return nil +} + // TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs. 
func TmpfsRoot(l *LinuxFactory) error { mounted, err := mount.Mounted(l.Root) @@ -169,6 +184,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := os.Chown(containerRoot, uid, gid); err != nil { return nil, newGenericError(err, SystemError) } + if config.Rootless { + RootlessCgroups(l) + } c := &linuxContainer{ id: id, root: containerRoot, @@ -195,6 +213,10 @@ func (l *LinuxFactory) Load(id string) (Container, error) { processStartTime: state.InitProcessStartTime, fds: state.ExternalDescriptors, } + // We have to use the RootlessManager. + if state.Rootless { + RootlessCgroups(l) + } c := &linuxContainer{ initProcess: r, initProcessStartTime: state.InitProcessStartTime, diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index e8b7506d..bfe99551 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -254,15 +254,14 @@ func (p *initProcess) start() error { return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) } p.setExternalDescriptors(fds) - if !p.container.config.Rootless { - // Do this before syncing with child so that no children can escape the - // cgroup. We can't do this if we're not running as root. - if err := p.manager.Apply(p.pid()); err != nil { - return newSystemErrorWithCause(err, "applying cgroup configuration for process") - } + // Do this before syncing with child so that no children can escape the + // cgroup. We don't need to worry about not doing this and not being root + // because we'd be using the rootless cgroup manager in that case. 
+ if err := p.manager.Apply(p.pid()); err != nil { + return newSystemErrorWithCause(err, "applying cgroup configuration for process") } defer func() { - if err != nil && !p.container.config.Rootless { + if err != nil { // TODO: should not be the responsibility to call here p.manager.Destroy() } @@ -281,11 +280,8 @@ func (p *initProcess) start() error { ierr := parseSync(p.parentPipe, func(sync *syncT) error { switch sync.Type { case procReady: - // We can't set cgroups if we're in a rootless container. - if !p.container.config.Rootless { - if err := p.manager.Set(p.config.Config); err != nil { - return newSystemErrorWithCause(err, "setting cgroup config for ready process") - } + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting cgroup config for ready process") } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 2635fd6f..b4948687 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -348,7 +348,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { var binds []*configs.Mount for _, mm := range mounts { - dir, err := mm.GetThisCgroupDir(cgroupPaths) + dir, err := mm.GetOwnCgroup(cgroupPaths) if err != nil { return nil, err }