From 03e585985fabb9122539f8647c9f5aedb4c5ed6f Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 16 Mar 2018 11:33:04 +1100 Subject: [PATCH 1/2] rootless: cgroup: treat EROFS as a skippable error In some cases, /sys/fs/cgroups is mounted read-only. In rootless containers we can consider this effectively identical to having cgroups that we don't have write permission to -- because the user isn't responsible for the read-only setup and cannot modify it. The rules are identical to when /sys/fs/cgroups is not writable by the unprivileged user. An example of this is the default configuration of Docker, where cgroups are mounted as read-only as a preventative security measure. Reported-by: Vladimir Rutsky Signed-off-by: Aleksa Sarai --- libcontainer/cgroups/fs/apply_raw.go | 48 +++++++++++++++++++++------- libcontainer/factory_linux.go | 23 +++++++++++-- tests/integration/cgroups.bats | 2 +- utils_linux.go | 3 ++ 4 files changed, 60 insertions(+), 16 deletions(-) diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go index 6d9123dc..e93f6a96 100644 --- a/libcontainer/cgroups/fs/apply_raw.go +++ b/libcontainer/cgroups/fs/apply_raw.go @@ -3,7 +3,6 @@ package fs import ( - "errors" "fmt" "io" "io/ioutil" @@ -14,6 +13,8 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" + "golang.org/x/sys/unix" ) var ( @@ -35,7 +36,7 @@ var ( HugePageSizes, _ = cgroups.GetHugePageSize() ) -var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") +var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist") type subsystemSet []subsystem @@ -62,9 +63,10 @@ type subsystem interface { } type Manager struct { - mu sync.Mutex - Cgroups *configs.Cgroup - Paths map[string]string + mu sync.Mutex + Cgroups *configs.Cgroup + Rootless bool + Paths map[string]string } // The absolute path to the root of the cgroup hierarchies. @@ -100,6 +102,27 @@ type cgroupData struct { pid int } +// isIgnorableError returns whether err is a permission error (in the loose +// sense of the word). This includes EROFS (which for an unprivileged user is +// basically a permission error) and EACCES (for similar reasons) as well as +// the normal EPERM. +func isIgnorableError(err error) bool { + if os.IsPermission(errors.Cause(err)) { + return true + } + + var errno error + switch err := errors.Cause(err).(type) { + case *os.PathError: + errno = err.Err + case *os.LinkError: + errno = err.Err + case *os.SyscallError: + errno = err.Err + } + return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES +} + func (m *Manager) Apply(pid int) (err error) { if m.Cgroups == nil { return nil @@ -145,11 +168,11 @@ func (m *Manager) Apply(pid int) (err error) { m.Paths[sys.Name()] = p if err := sys.Apply(d); err != nil { - if os.IsPermission(err) && m.Cgroups.Path == "" { - // If we didn't set a cgroup path, then let's defer the error here - // until we know whether we have set limits or not. - // If we hadn't set limits, then it's ok that we couldn't join this cgroup, because - // it will have the same limits as its parent. + // In the case of rootless, where an explicit cgroup path hasn't + // been set, we don't bail on error in case of permission problems. + // Cases where limits have been set (and we couldn't create our own + // cgroup) are handled by Set. + if m.Rootless && isIgnorableError(err) && m.Cgroups.Path == "" { delete(m.Paths, sys.Name()) continue } @@ -208,8 +231,9 @@ func (m *Manager) Set(container *configs.Config) error { path := paths[sys.Name()] if err := sys.Set(path, container.Cgroups); err != nil { if path == "" { - // cgroup never applied - return fmt.Errorf("cannot set limits on the %s cgroup, as the container has not joined it", sys.Name()) + // We never created a path for this cgroup, so we cannot set + // limits for it (though we have already tried at this point). + return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) } return err } diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index 7d53d5e0..4cbd70cd 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -59,9 +59,9 @@ func SystemdCgroups(l *LinuxFactory) error { return nil } -// Cgroupfs is an options func to configure a LinuxFactory to return -// containers that use the native cgroups filesystem implementation to -// create and manage cgroups. +// Cgroupfs is an options func to configure a LinuxFactory to return containers +// that use the native cgroups filesystem implementation to create and manage +// cgroups. func Cgroupfs(l *LinuxFactory) error { l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { return &fs.Manager{ @@ -72,6 +72,23 @@ func Cgroupfs(l *LinuxFactory) error { return nil } +// RootlessCgroupfs is an options func to configure a LinuxFactory to return +// containers that use the native cgroups filesystem implementation to create +// and manage cgroups. The difference between RootlessCgroupfs and Cgroupfs is +// that RootlessCgroupfs can transparently handle permission errors that occur +// during rootless container setup (while still allowing cgroup usage if +// they've been set up properly). +func RootlessCgroupfs(l *LinuxFactory) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &fs.Manager{ + Cgroups: config, + Rootless: true, + Paths: paths, + } + } + return nil +} + // IntelRdtfs is an options func to configure a LinuxFactory to return // containers that use the Intel RDT "resource control" filesystem to // create and manage Intel Xeon platform shared resources (e.g., L3 cache). diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index f1a06b4e..17812abf 100644 --- a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -99,7 +99,7 @@ EOF runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions [ "$status" -eq 1 ] - [[ ${lines[1]} == *"cannot set limits on the pids cgroup, as the container has not joined it"* ]] + [[ ${lines[1]} == *"cannot set pids limit: container could not join or create cgroup"* ]] } @test "runc create (limits + cgrouppath + permission on the cgroup dir) succeeds" { diff --git a/utils_linux.go b/utils_linux.go index 84731c84..c6223f0d 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -38,6 +38,9 @@ func loadFactory(context *cli.Context) (libcontainer.Factory, error) { // We default to cgroupfs, and can only use systemd if the system is a // systemd box. cgroupManager := libcontainer.Cgroupfs + if isRootless() { + cgroupManager = libcontainer.RootlessCgroupfs + } if context.GlobalBool("systemd-cgroup") { if systemd.UseSystemd() { cgroupManager = libcontainer.SystemdCgroups From fd3a6e6c8323b318ce5c6bb1a8a2f690ccd3f40a Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 16 Mar 2018 11:54:47 +1100 Subject: [PATCH 2/2] libcontainer: handle unset oomScoreAdj corectly Previously if oomScoreAdj was not set in config.json we would implicitly set oom_score_adj to 0. This is not allowed according to the spec: > If oomScoreAdj is not set, the runtime MUST NOT change the value of > oom_score_adj. Change this so that we do not modify oom_score_adj if oomScoreAdj is not present in the configuration. While this modifies our internal configuration types, the on-disk format is still compatible. Signed-off-by: Aleksa Sarai --- libcontainer/configs/config.go | 5 +++-- libcontainer/container_linux.go | 12 +++++++----- libcontainer/integration/exec_test.go | 6 +++--- libcontainer/integration/execin_test.go | 6 +++--- libcontainer/integration/utils_test.go | 4 ++++ libcontainer/specconv/spec_linux.go | 4 ++-- 6 files changed, 22 insertions(+), 15 deletions(-) diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index 3cae4fd8..b1c4762f 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -141,9 +141,10 @@ type Config struct { // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores // for a process. Valid values are between the range [-1000, '1000'], where processes with - // higher scores are preferred for being killed. + // higher scores are preferred for being killed. If it is unset then we don't touch the current + // value. // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ - OomScoreAdj int `json:"oom_score_adj"` + OomScoreAdj *int `json:"oom_score_adj,omitempty"` // UidMappings is an array of User ID mappings for User Namespaces UidMappings []IDMap `json:"uid_mappings"` diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index db2242e2..eec9f6bc 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -1818,11 +1818,13 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na } } - // write oom_score_adj - r.AddData(&Bytemsg{ - Type: OomScoreAdjAttr, - Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)), - }) + if c.config.OomScoreAdj != nil { + // write oom_score_adj + r.AddData(&Bytemsg{ + Type: OomScoreAdjAttr, + Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)), + }) + } // write rootless r.AddData(&Boolmsg{ diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 4fbd760f..0ef425be 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -1075,7 +1075,7 @@ func TestOomScoreAdj(t *testing.T) { defer remove(rootfs) config := newTemplateConfig(rootfs) - config.OomScoreAdj = 200 + config.OomScoreAdj = ptrInt(200) factory, err := libcontainer.New(root, libcontainer.Cgroupfs) ok(t, err) @@ -1100,8 +1100,8 @@ func TestOomScoreAdj(t *testing.T) { outputOomScoreAdj := strings.TrimSpace(string(stdout.Bytes())) // Check that the oom_score_adj matches the value that was set as part of config. - if outputOomScoreAdj != strconv.Itoa(config.OomScoreAdj) { - t.Fatalf("Expected oom_score_adj %d; got %q", config.OomScoreAdj, outputOomScoreAdj) + if outputOomScoreAdj != strconv.Itoa(*config.OomScoreAdj) { + t.Fatalf("Expected oom_score_adj %d; got %q", *config.OomScoreAdj, outputOomScoreAdj) } } diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go index 8ef19947..56fa3c75 100644 --- a/libcontainer/integration/execin_test.go +++ b/libcontainer/integration/execin_test.go @@ -497,7 +497,7 @@ func TestExecInOomScoreAdj(t *testing.T) { ok(t, err) defer remove(rootfs) config := newTemplateConfig(rootfs) - config.OomScoreAdj = 200 + config.OomScoreAdj = ptrInt(200) container, err := newContainer(config) ok(t, err) defer container.Destroy() @@ -532,8 +532,8 @@ func TestExecInOomScoreAdj(t *testing.T) { waitProcess(process, t) out := buffers.Stdout.String() - if oomScoreAdj := strings.TrimSpace(out); oomScoreAdj != strconv.Itoa(config.OomScoreAdj) { - t.Fatalf("expected oomScoreAdj to be %d, got %s", config.OomScoreAdj, oomScoreAdj) + if oomScoreAdj := strings.TrimSpace(out); oomScoreAdj != strconv.Itoa(*config.OomScoreAdj) { + t.Fatalf("expected oomScoreAdj to be %d, got %s", *config.OomScoreAdj, oomScoreAdj) } } diff --git a/libcontainer/integration/utils_test.go b/libcontainer/integration/utils_test.go index e209c9bc..25500032 100644 --- a/libcontainer/integration/utils_test.go +++ b/libcontainer/integration/utils_test.go @@ -19,6 +19,10 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" ) +func ptrInt(v int) *int { + return &v +} + func newStdBuffers() *stdBuffers { return &stdBuffers{ Stdin: bytes.NewBuffer(nil), diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 98fd2e63..0222d392 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -243,8 +243,8 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { if spec.Process.SelinuxLabel != "" { config.ProcessLabel = spec.Process.SelinuxLabel } - if spec.Process != nil && spec.Process.OOMScoreAdj != nil { - config.OomScoreAdj = *spec.Process.OOMScoreAdj + if spec.Process != nil { + config.OomScoreAdj = spec.Process.OOMScoreAdj } if spec.Process.Capabilities != nil { config.Capabilities = &configs.Capabilities{