From db3159c9d9d3b65d2d27ff33f722eb9797f20e52 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 15 Dec 2015 00:33:56 +1100 Subject: [PATCH 1/4] libcontainer: cgroups: add pids controller support Add support for the pids cgroup controller to libcontainer, a recent feature that is available in Linux 4.3+. Unfortunately, due to the init process being written in Go, it can spawn an an unknown number of threads due to blocked syscalls. This results in the init process being unable to run properly, and thus small pids.max configs won't work properly. Signed-off-by: Aleksa Sarai --- libcontainer/cgroups/fs/apply_raw.go | 1 + libcontainer/cgroups/fs/pids.go | 62 ++++++++++++++ libcontainer/cgroups/fs/pids_test.go | 83 +++++++++++++++++++ libcontainer/cgroups/stats.go | 6 ++ libcontainer/cgroups/systemd/apply_systemd.go | 19 ++++- libcontainer/configs/cgroup_unix.go | 3 + libcontainer/integration/exec_test.go | 64 ++++++++++++++ spec.go | 1 + 8 files changed, 238 insertions(+), 1 deletion(-) create mode 100644 libcontainer/cgroups/fs/pids.go create mode 100644 libcontainer/cgroups/fs/pids_test.go diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go index 0c4d207e..60646cd4 100644 --- a/libcontainer/cgroups/fs/apply_raw.go +++ b/libcontainer/cgroups/fs/apply_raw.go @@ -23,6 +23,7 @@ var ( &MemoryGroup{}, &CpuGroup{}, &CpuacctGroup{}, + &PidsGroup{}, &BlkioGroup{}, &HugetlbGroup{}, &NetClsGroup{}, diff --git a/libcontainer/cgroups/fs/pids.go b/libcontainer/cgroups/fs/pids.go new file mode 100644 index 00000000..23893db5 --- /dev/null +++ b/libcontainer/cgroups/fs/pids.go @@ -0,0 +1,62 @@ +// +build linux + +package fs + +import ( + "fmt" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PidsGroup struct { +} + +func (s *PidsGroup) Name() string { + return "pids" +} + +func (s *PidsGroup) Apply(d *cgroupData) error { + dir, err := d.join("pids") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + + if err := s.Set(dir, d.config); err != nil { + return err + } + + return nil +} + +func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if cgroup.Resources.PidsLimit > 0 { + limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10) + } + + if err := writeFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroup) Remove(d *cgroupData) error { + return removePath(d.path("pids")) +} + +func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { + value, err := getCgroupParamUint(path, "pids.current") + if err != nil { + return fmt.Errorf("failed to parse pids.current - %s", err) + } + + stats.PidsStats.Current = value + return nil +} diff --git a/libcontainer/cgroups/fs/pids_test.go b/libcontainer/cgroups/fs/pids_test.go new file mode 100644 index 00000000..06b11927 --- /dev/null +++ b/libcontainer/cgroups/fs/pids_test.go @@ -0,0 +1,83 @@ +// +build linux + +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +const ( + maxUnlimited = -1 + maxLimited = 1024 +) + +func TestPidsSetMax(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.max": "max", + }) + + helper.CgroupData.config.Resources.PidsLimit = maxLimited + pids := &PidsGroup{} + if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := getCgroupParamUint(helper.CgroupPath, "pids.max") + if err != nil { + t.Fatalf("Failed to parse pids.max - %s", err) + } + + if value != maxLimited { + t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value) + } +} + +func TestPidsSetUnlimited(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.max": strconv.Itoa(maxLimited), + }) + + helper.CgroupData.config.Resources.PidsLimit = maxUnlimited + pids := &PidsGroup{} + if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := getCgroupParamString(helper.CgroupPath, "pids.max") + if err != nil { + t.Fatalf("Failed to parse pids.max - %s", err) + } + + if value != "max" { + t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value) + } +} + +func TestPidsStats(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.current": strconv.Itoa(1337), + "pids.max": strconv.Itoa(maxLimited), + }) + + pids := &PidsGroup{} + stats := *cgroups.NewStats() + if err := pids.GetStats(helper.CgroupPath, &stats); err != nil { + t.Fatal(err) + } + + if stats.PidsStats.Current != 1337 { + t.Fatalf("Expected %d, got %d for pids.current", 1337, stats.PidsStats.Current) + } +} diff --git a/libcontainer/cgroups/stats.go b/libcontainer/cgroups/stats.go index bda32b20..74c65abf 100644 --- a/libcontainer/cgroups/stats.go +++ b/libcontainer/cgroups/stats.go @@ -49,6 +49,11 @@ type MemoryStats struct { Stats map[string]uint64 `json:"stats,omitempty"` } +type PidsStats struct { + // number of pids in the cgroup + Current uint64 `json:"current,omitempty"` +} + type BlkioStatEntry struct { Major uint64 `json:"major,omitempty"` Minor uint64 `json:"minor,omitempty"` @@ -80,6 +85,7 @@ type HugetlbStats struct { type Stats struct { CpuStats CpuStats `json:"cpu_stats,omitempty"` MemoryStats MemoryStats `json:"memory_stats,omitempty"` + PidsStats PidsStats `json:"pids_stats,omitempty"` BlkioStats BlkioStats `json:"blkio_stats,omitempty"` // the map is in the format "size of hugepage: stats of the hugepage" HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` diff --git a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go index 93300cda..b8f92dbc 100644 --- a/libcontainer/cgroups/systemd/apply_systemd.go +++ b/libcontainer/cgroups/systemd/apply_systemd.go @@ -55,6 +55,7 @@ var subsystems = subsystemSet{ &fs.MemoryGroup{}, &fs.CpuGroup{}, &fs.CpuacctGroup{}, + &fs.PidsGroup{}, &fs.BlkioGroup{}, &fs.HugetlbGroup{}, &fs.PerfEventGroup{}, @@ -233,7 +234,7 @@ func (m *Manager) Apply(pid int) error { return err } - // we need to manually join the freezer, net_cls, net_prio and cpuset cgroup in systemd + // we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd // because it does not currently support it via the dbus api. if err := joinFreezer(c, pid); err != nil { return err @@ -246,6 +247,10 @@ func (m *Manager) Apply(pid int) error { return err } + if err := joinPids(c, pid); err != nil { + return err + } + if err := joinCpuset(c, pid); err != nil { return err } @@ -394,6 +399,18 @@ func joinNetCls(c *configs.Cgroup, pid int) error { return netcls.Set(path, c) } +func joinPids(c *configs.Cgroup, pid int) error { + path, err := join(c, "pids", pid) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + pids, err := subsystems.Get("pids") + if err != nil { + return err + } + return pids.Set(path, c) +} + func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { mountpoint, err := cgroups.FindCgroupMountpoint(subsystem) if err != nil { diff --git a/libcontainer/configs/cgroup_unix.go b/libcontainer/configs/cgroup_unix.go index ef781324..08b669a4 100644 --- a/libcontainer/configs/cgroup_unix.go +++ b/libcontainer/configs/cgroup_unix.go @@ -64,6 +64,9 @@ type Resources struct { // MEM to use CpusetMems string `json:"cpuset_mems"` + // Process limit; set <= `0' to disable limit. + PidsLimit int64 `json:"pids_limit"` + // Specifies per cgroup weight, range is from 10 to 1000. BlkioWeight uint16 `json:"blkio_weight"` diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 59c8afd8..0fc277ea 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -525,6 +525,70 @@ func testCpuShares(t *testing.T, systemd bool) { } } +func TestPids(t *testing.T) { + testPids(t, false) +} + +func TestPidsSystemd(t *testing.T) { + if !systemd.UseSystemd() { + t.Skip("Systemd is unsupported") + } + testPids(t, true) +} + +func testPids(t *testing.T, systemd bool) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + if systemd { + config.Cgroups.Parent = "system.slice" + } + config.Cgroups.Resources.PidsLimit = -1 + + // Running multiple processes. + _, ret, err := runContainer(config, "", "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true") + if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { + t.Skip("PIDs cgroup is unsupported") + } + ok(t, err) + + if ret != 0 { + t.Fatalf("expected fork() to succeed with no pids limit") + } + + // Enforce a permissive limit (shell + 6 * true + 3). + config.Cgroups.Resources.PidsLimit = 10 + _, ret, err = runContainer(config, "", "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true") + if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { + t.Skip("PIDs cgroup is unsupported") + } + ok(t, err) + + if ret != 0 { + t.Fatalf("expected fork() to succeed with permissive pids limit") + } + + // Enforce a restrictive limit (shell + 6 * true + 3). + config.Cgroups.Resources.PidsLimit = 10 + out, ret, err := runContainer(config, "", "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true") + if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { + t.Skip("PIDs cgroup is unsupported") + } + if err != nil && !strings.Contains(out.String(), "sh: can't fork") { + ok(t, err) + } + + if err == nil { + t.Fatalf("expected fork() to fail with restrictive pids limit") + } +} + func TestRunWithKernelMemory(t *testing.T) { testRunWithKernelMemory(t, false) } diff --git a/spec.go b/spec.go index fbe31bb2..6c5e4790 100644 --- a/spec.go +++ b/spec.go @@ -456,6 +456,7 @@ func createCgroupConfig(name string, spec *specs.LinuxRuntimeSpec, devices []*co c.Resources.CpuRtPeriod = r.CPU.RealtimePeriod c.Resources.CpusetCpus = r.CPU.Cpus c.Resources.CpusetMems = r.CPU.Mems + c.Resources.PidsLimit = r.Pids.Limit c.Resources.BlkioWeight = r.BlockIO.Weight c.Resources.BlkioLeafWeight = r.BlockIO.LeafWeight for _, wd := range r.BlockIO.WeightDevice { From f36ed4b174bb7e8951db6b61e2202a45a7251e30 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sun, 20 Dec 2015 22:30:35 +1100 Subject: [PATCH 2/4] libcontainer: cgroups: don't Set in Apply Apply and Set are two separate operations, and it doesn't make sense to group the two together (especially considering that the bootstrap process is added to the cgroup as well). The only exception to this is the memory cgroup, which requires the configuration to be set before processes can join. One of the weird cases to deal with is systemd. Systemd sets some of the cgroup configuration options, but not all of them. Because memory is a special case, we need to explicitly set memory in the systemd Apply(). Otherwise, the rest can be safely re-applied in .Set() as usual. Signed-off-by: Aleksa Sarai --- libcontainer/cgroups/fs/apply_raw.go | 20 ++- libcontainer/cgroups/fs/blkio.go | 7 +- libcontainer/cgroups/fs/cpu.go | 7 +- libcontainer/cgroups/fs/cpuset.go | 5 - libcontainer/cgroups/fs/devices.go | 7 +- libcontainer/cgroups/fs/freezer.go | 7 +- libcontainer/cgroups/fs/hugetlb.go | 7 +- libcontainer/cgroups/fs/memory.go | 1 - libcontainer/cgroups/fs/net_cls.go | 7 +- libcontainer/cgroups/fs/net_prio.go | 7 +- libcontainer/cgroups/fs/pids.go | 7 +- libcontainer/cgroups/systemd/apply_systemd.go | 164 +++++------------- libcontainer/process_linux.go | 3 + 13 files changed, 66 insertions(+), 183 deletions(-) diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go index 60646cd4..c67c55ac 100644 --- a/libcontainer/cgroups/fs/apply_raw.go +++ b/libcontainer/cgroups/fs/apply_raw.go @@ -106,8 +106,6 @@ func (m *Manager) Apply(pid int) (err error) { return nil } - var c = m.Cgroups - d, err := getCgroupData(m.Cgroups, pid) if err != nil { return err @@ -136,13 +134,6 @@ func (m *Manager) Apply(pid int) (err error) { paths[sys.Name()] = p } m.Paths = paths - - if paths["cpu"] != "" { - if err := CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil { - return err - } - } - return nil } @@ -181,6 +172,11 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { func (m *Manager) Set(container *configs.Config) error { for name, path := range m.Paths { + // We can't set this here, because after being applied, memcg doesn't + // allow a non-empty cgroup from having its limits changed. + if name == "memory" { + continue + } sys, err := subsystems.Get(name) if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { continue @@ -189,6 +185,12 @@ func (m *Manager) Set(container *configs.Config) error { return err } } + + if m.Paths["cpu"] != "" { + if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { + return err + } + } return nil } diff --git a/libcontainer/cgroups/fs/blkio.go b/libcontainer/cgroups/fs/blkio.go index 518cb63f..a142cb99 100644 --- a/libcontainer/cgroups/fs/blkio.go +++ b/libcontainer/cgroups/fs/blkio.go @@ -22,15 +22,10 @@ func (s *BlkioGroup) Name() string { } func (s *BlkioGroup) Apply(d *cgroupData) error { - dir, err := d.join("blkio") + _, err := d.join("blkio") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/cpu.go b/libcontainer/cgroups/fs/cpu.go index ad5f427e..a4ef28a6 100644 --- a/libcontainer/cgroups/fs/cpu.go +++ b/libcontainer/cgroups/fs/cpu.go @@ -22,15 +22,10 @@ func (s *CpuGroup) Name() string { func (s *CpuGroup) Apply(d *cgroupData) error { // We always want to join the cpu group, to allow fair cpu scheduling // on a container basis - dir, err := d.join("cpu") + _, err := d.join("cpu") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/cpuset.go b/libcontainer/cgroups/fs/cpuset.go index b7632108..a9e3ab72 100644 --- a/libcontainer/cgroups/fs/cpuset.go +++ b/libcontainer/cgroups/fs/cpuset.go @@ -63,11 +63,6 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro if err := s.ensureParent(dir, root); err != nil { return err } - // the default values inherit from parent cgroup are already set in - // s.ensureParent, cover these if we have our own - if err := s.Set(dir, cgroup); err != nil { - return err - } // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil { diff --git a/libcontainer/cgroups/fs/devices.go b/libcontainer/cgroups/fs/devices.go index a9883eb4..a41ce801 100644 --- a/libcontainer/cgroups/fs/devices.go +++ b/libcontainer/cgroups/fs/devices.go @@ -15,17 +15,12 @@ func (s *DevicesGroup) Name() string { } func (s *DevicesGroup) Apply(d *cgroupData) error { - dir, err := d.join("devices") + _, err := d.join("devices") if err != nil { // We will return error even it's `not found` error, devices // cgroup is hard requirement for container's security. return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/freezer.go b/libcontainer/cgroups/fs/freezer.go index 6aaad4e0..e70dfe3b 100644 --- a/libcontainer/cgroups/fs/freezer.go +++ b/libcontainer/cgroups/fs/freezer.go @@ -19,15 +19,10 @@ func (s *FreezerGroup) Name() string { } func (s *FreezerGroup) Apply(d *cgroupData) error { - dir, err := d.join("freezer") + _, err := d.join("freezer") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/hugetlb.go b/libcontainer/cgroups/fs/hugetlb.go index ca106da4..2f972771 100644 --- a/libcontainer/cgroups/fs/hugetlb.go +++ b/libcontainer/cgroups/fs/hugetlb.go @@ -19,15 +19,10 @@ func (s *HugetlbGroup) Name() string { } func (s *HugetlbGroup) Apply(d *cgroupData) error { - dir, err := d.join("hugetlb") + _, err := d.join("hugetlb") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go index 6b9687ca..0329692a 100644 --- a/libcontainer/cgroups/fs/memory.go +++ b/libcontainer/cgroups/fs/memory.go @@ -32,7 +32,6 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { return err } } - if err := s.Set(path, d.config); err != nil { return err } diff --git a/libcontainer/cgroups/fs/net_cls.go b/libcontainer/cgroups/fs/net_cls.go index 63823731..8a4054ba 100644 --- a/libcontainer/cgroups/fs/net_cls.go +++ b/libcontainer/cgroups/fs/net_cls.go @@ -15,15 +15,10 @@ func (s *NetClsGroup) Name() string { } func (s *NetClsGroup) Apply(d *cgroupData) error { - dir, err := d.join("net_cls") + _, err := d.join("net_cls") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/net_prio.go b/libcontainer/cgroups/fs/net_prio.go index 0dabaae7..d0ab2af8 100644 --- a/libcontainer/cgroups/fs/net_prio.go +++ b/libcontainer/cgroups/fs/net_prio.go @@ -15,15 +15,10 @@ func (s *NetPrioGroup) Name() string { } func (s *NetPrioGroup) Apply(d *cgroupData) error { - dir, err := d.join("net_prio") + _, err := d.join("net_prio") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/pids.go b/libcontainer/cgroups/fs/pids.go index 23893db5..96cbb896 100644 --- a/libcontainer/cgroups/fs/pids.go +++ b/libcontainer/cgroups/fs/pids.go @@ -18,15 +18,10 @@ func (s *PidsGroup) Name() string { } func (s *PidsGroup) Apply(d *cgroupData) error { - dir, err := d.join("pids") + _, err := d.join("pids") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go index b8f92dbc..952820e6 100644 --- a/libcontainer/cgroups/systemd/apply_systemd.go +++ b/libcontainer/cgroups/systemd/apply_systemd.go @@ -282,13 +282,6 @@ func (m *Manager) Apply(pid int) error { paths[s.Name()] = subsystemPath } m.Paths = paths - - if paths["cpu"] != "" { - if err := fs.CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil { - return err - } - } - return nil } @@ -335,80 +328,43 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { } func joinCpu(c *configs.Cgroup, pid int) error { - path, err := getSubsystemPath(c, "cpu") + _, err := join(c, "cpu", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - if c.Resources.CpuQuota != 0 { - if err = writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(c.Resources.CpuQuota, 10)); err != nil { - return err - } - } - if c.Resources.CpuPeriod != 0 { - if err = writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(c.Resources.CpuPeriod, 10)); err != nil { - return err - } - } - if c.Resources.CpuRtPeriod != 0 { - if err = writeFile(path, "cpu.rt_period_us", strconv.FormatInt(c.Resources.CpuRtPeriod, 10)); err != nil { - return err - } - } - if c.Resources.CpuRtRuntime != 0 { - if err = writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(c.Resources.CpuRtRuntime, 10)); err != nil { - return err - } - } - return nil } func joinFreezer(c *configs.Cgroup, pid int) error { - path, err := join(c, "freezer", pid) + _, err := join(c, "freezer", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - freezer, err := subsystems.Get("freezer") - if err != nil { - return err - } - return freezer.Set(path, c) + return nil } func joinNetPrio(c *configs.Cgroup, pid int) error { - path, err := join(c, "net_prio", pid) + _, err := join(c, "net_prio", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - netPrio, err := subsystems.Get("net_prio") - if err != nil { - return err - } - return netPrio.Set(path, c) + return nil } func joinNetCls(c *configs.Cgroup, pid int) error { - path, err := join(c, "net_cls", pid) + _, err := join(c, "net_cls", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - netcls, err := subsystems.Get("net_cls") - if err != nil { - return err - } - return netcls.Set(path, c) + return nil } func joinPids(c *configs.Cgroup, pid int) error { - path, err := join(c, "pids", pid) + _, err := join(c, "pids", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - pids, err := subsystems.Get("pids") - if err != nil { - return err - } - return pids.Set(path, c) + return nil } func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { @@ -476,6 +432,11 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { func (m *Manager) Set(container *configs.Config) error { for name, path := range m.Paths { + // We can't set this here, because after being applied, memcg doesn't + // allow a non-empty cgroup from having its limits changed. + if name == "memory" { + continue + } sys, err := subsystems.Get(name) if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { continue @@ -485,6 +446,11 @@ func (m *Manager) Set(container *configs.Config) error { } } + if m.Paths["cpu"] != "" { + if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { + return err + } + } return nil } @@ -504,17 +470,13 @@ func getUnitName(c *configs.Cgroup) string { // because systemd will re-write the device settings if it needs to re-apply the cgroup context. // This happens at least for v208 when any sibling unit is started. func joinDevices(c *configs.Cgroup, pid int) error { - path, err := join(c, "devices", pid) + _, err := join(c, "devices", pid) // Even if it's `not found` error, we'll return err because devices cgroup // is hard requirement for container security. if err != nil { return err } - devices, err := subsystems.Get("devices") - if err != nil { - return err - } - return devices.Set(path, c) + return nil } func setKernelMemory(c *configs.Cgroup) error { @@ -527,9 +489,10 @@ func setKernelMemory(c *configs.Cgroup) error { return err } + // Shamelessly copied from fs.(*MemoryGroup).Set(). This doesn't get called + // by manager.Set, so we need to do it here. if c.Resources.KernelMemory > 0 { - err = writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(c.Resources.KernelMemory, 10)) - if err != nil { + if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(c.Resources.KernelMemory, 10)); err != nil { return err } } @@ -538,21 +501,27 @@ func setKernelMemory(c *configs.Cgroup) error { } func joinMemory(c *configs.Cgroup, pid int) error { - path, err := getSubsystemPath(c, "memory") + path, err := join(c, "memory", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - // -1 disables memoryswap - if c.Resources.MemorySwap > 0 { - err = writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Resources.MemorySwap, 10)) - if err != nil { + // Shamelessly copied from fs.(*MemoryGroup).Set(). This doesn't include the + // kernel memory limit (which is done in setKernelMemory). All of these are + // not set by manager.Set, we have do do it here. + + if c.Resources.Memory != 0 { + if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(c.Resources.Memory, 10)); err != nil { return err } } - if c.Resources.MemoryReservation > 0 { - err = writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Resources.MemoryReservation, 10)) - if err != nil { + if c.Resources.MemoryReservation != 0 { + if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Resources.MemoryReservation, 10)); err != nil { + return err + } + } + if c.Resources.MemorySwap > 0 { + if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Resources.MemorySwap, 10)); err != nil { return err } } @@ -561,10 +530,8 @@ func joinMemory(c *configs.Cgroup, pid int) error { return err } } - if c.Resources.MemorySwappiness >= 0 && c.Resources.MemorySwappiness <= 100 { - err = writeFile(path, "memory.swappiness", strconv.FormatInt(c.Resources.MemorySwappiness, 10)) - if err != nil { + if err := writeFile(path, "memory.swappiness", strconv.FormatInt(c.Resources.MemorySwappiness, 10)); err != nil { return err } } else if c.Resources.MemorySwappiness == -1 { @@ -594,68 +561,25 @@ func joinCpuset(c *configs.Cgroup, pid int) error { // expects device path instead of major minor numbers, which is also confusing // for users. So we use fs work around for now. func joinBlkio(c *configs.Cgroup, pid int) error { - path, err := getSubsystemPath(c, "blkio") + _, err := join(c, "blkio", pid) if err != nil { return err } - // systemd doesn't directly support this in the dbus properties - if c.Resources.BlkioLeafWeight != 0 { - if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(c.Resources.BlkioLeafWeight), 10)); err != nil { - return err - } - } - for _, wd := range c.Resources.BlkioWeightDevice { - if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil { - return err - } - if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { - return err - } - } - for _, td := range c.Resources.BlkioThrottleReadBpsDevice { - if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { - return err - } - } - for _, td := range c.Resources.BlkioThrottleWriteBpsDevice { - if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { - return err - } - } - for _, td := range c.Resources.BlkioThrottleReadIOPSDevice { - if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { - return err - } - } - for _, td := range c.Resources.BlkioThrottleWriteIOPSDevice { - if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { - return err - } - } - return nil } func joinHugetlb(c *configs.Cgroup, pid int) error { - path, err := join(c, "hugetlb", pid) + _, err := join(c, "hugetlb", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - hugetlb, err := subsystems.Get("hugetlb") - if err != nil { - return err - } - return hugetlb.Set(path, c) + return nil } func joinPerfEvent(c *configs.Cgroup, pid int) error { - path, err := join(c, "perf_event", pid) + _, err := join(c, "perf_event", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - perfEvent, err := subsystems.Get("perf_event") - if err != nil { - return err - } - return perfEvent.Set(path, c) + return nil } diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 114c71b3..5b32eb10 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -204,6 +204,9 @@ func (p *initProcess) start() (err error) { if err := p.manager.Apply(p.pid()); err != nil { return newSystemError(err) } + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemError(err) + } defer func() { if err != nil { // TODO: should not be the responsibility to call here From a95483402ef53ec0b82d85b9fb32c559c19b0450 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 17 Dec 2015 20:13:06 +1100 Subject: [PATCH 3/4] libcontainer: cgroups: loudly fail with Set It is vital to loudly fail when a user attempts to set a cgroup limit (rather than using the system default). Otherwise the user will assume they have security they do not actually have. This mirrors the original Apply() (that would set cgroup configs) semantics. Signed-off-by: Aleksa Sarai --- libcontainer/cgroups/fs/apply_raw.go | 18 +++++++++++++----- libcontainer/cgroups/systemd/apply_systemd.go | 13 ++++++++----- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go index c67c55ac..8eb71b4d 100644 --- a/libcontainer/cgroups/fs/apply_raw.go +++ b/libcontainer/cgroups/fs/apply_raw.go @@ -171,16 +171,24 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { } func (m *Manager) Set(container *configs.Config) error { - for name, path := range m.Paths { + for _, sys := range subsystems { // We can't set this here, because after being applied, memcg doesn't // allow a non-empty cgroup from having its limits changed. - if name == "memory" { + if sys.Name() == "memory" { continue } - sys, err := subsystems.Get(name) - if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { - continue + + // Generate fake cgroup data. + d, err := getCgroupData(container.Cgroups, -1) + if err != nil { + return err } + // Get the path, but don't error out if the cgroup wasn't found. + path, err := d.path(sys.Name()) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + if err := sys.Set(path, container.Cgroups); err != nil { return err } diff --git a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go index 952820e6..b943e9da 100644 --- a/libcontainer/cgroups/systemd/apply_systemd.go +++ b/libcontainer/cgroups/systemd/apply_systemd.go @@ -431,16 +431,19 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { } func (m *Manager) Set(container *configs.Config) error { - for name, path := range m.Paths { + for _, sys := range subsystems { // We can't set this here, because after being applied, memcg doesn't // allow a non-empty cgroup from having its limits changed. - if name == "memory" { + if sys.Name() == "memory" { continue } - sys, err := subsystems.Get(name) - if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { - continue + + // Get the subsystem path, but don't error out for not found cgroups. + path, err := getSubsystemPath(container.Cgroups, sys.Name()) + if err != nil && !cgroups.IsNotFound(err) { + return err } + if err := sys.Set(path, container.Cgroups); err != nil { return err } From 103853ead72ff1d71677464fce338a2644e8a6f6 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 17 Dec 2015 20:16:34 +1100 Subject: [PATCH 4/4] libcontainer: set cgroup config late Due to the fact that the init is implemented in Go (which seemingly randomly spawns new processes and loves eating memory), most cgroup configurations are required to have an arbitrary minimum dictated by the init. This confuses users and makes configuration more annoying than it should. An example of this is pids.max, where Go spawns multiple processes that then cause init to violate the pids cgroup constraint before the container can even start. Solve this problem by setting the cgroup configurations as late as possible, to avoid hitting as many of the resources hogged by the Go init as possible. This has to be done before seccomp rules are applied, as the parent and child must synchronise in order for the parent to correctly set the configurations (and writes might be blocked by seccomp). Signed-off-by: Aleksa Sarai --- libcontainer/factory_linux.go | 17 +++++--- libcontainer/generic_error.go | 9 ++++ libcontainer/init_linux.go | 24 ++++++++++ libcontainer/integration/exec_test.go | 5 +++ libcontainer/process_linux.go | 63 ++++++++++++++++++++++----- libcontainer/standard_init_linux.go | 10 ++++- 6 files changed, 111 insertions(+), 17 deletions(-) diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index c2d359ed..ae5db9cd 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -5,7 +5,6 @@ package libcontainer import ( "encoding/json" "fmt" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -226,21 +225,29 @@ func (l *LinuxFactory) StartInitialization() (err error) { // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() + var i initer defer func() { // if we have an error during the initialization of the container's init then send it back to the // parent process in the form of an initError. if err != nil { - // ensure that any data sent from the parent is consumed so it doesn't - // receive ECONNRESET when the child writes to the pipe. - ioutil.ReadAll(pipe) + if _, ok := i.(*linuxStandardInit); ok { + // Synchronisation only necessary for standard init. + if err := json.NewEncoder(pipe).Encode(procError); err != nil { + panic(err) + } + } if err := json.NewEncoder(pipe).Encode(newSystemError(err)); err != nil { panic(err) } + } else { + if err := json.NewEncoder(pipe).Encode(procStart); err != nil { + panic(err) + } } // ensure that this pipe is always closed pipe.Close() }() - i, err := newContainerInit(it, pipe) + i, err = newContainerInit(it, pipe) if err != nil { return err } diff --git a/libcontainer/generic_error.go b/libcontainer/generic_error.go index 6fbc2d75..e4872e2d 100644 --- a/libcontainer/generic_error.go +++ b/libcontainer/generic_error.go @@ -9,6 +9,15 @@ import ( "github.com/opencontainers/runc/libcontainer/stacktrace" ) +type syncType uint8 + +const ( + procReady syncType = iota + procError + procStart + procRun +) + var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} Code: {{.ECode}} {{if .Message }} diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index ddb11865..504bf732 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -5,6 +5,7 @@ package libcontainer import ( "encoding/json" "fmt" + "io" "io/ioutil" "net" "os" @@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) { }, nil case initStandard: return &linuxStandardInit{ + pipe: pipe, parentPid: syscall.Getppid(), config: config, }, nil @@ -140,6 +142,28 @@ func finalizeNamespace(config *initConfig) error { return nil } +// syncParentReady sends to the given pipe a JSON payload which indicates that +// the init is ready to Exec the child process. It then waits for the parent to +// indicate that it is cleared to Exec. +func syncParentReady(pipe io.ReadWriter) error { + // Tell parent. + if err := json.NewEncoder(pipe).Encode(procReady); err != nil { + return err + } + + // Wait for parent to give the all-clear. + var procSync syncType + if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { + if err == io.EOF { + return fmt.Errorf("parent closed synchronisation channel") + } + if procSync != procRun { + return fmt.Errorf("invalid synchronisation flag from parent") + } + } + return nil +} + // joinExistingNamespaces gets all the namespace paths specified for the container and // does a setns on the namespace fd so that the current process joins the namespace. func joinExistingNamespaces(namespaces []configs.Namespace) error { diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 0fc277ea..6be1dd60 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -587,6 +587,11 @@ func testPids(t *testing.T, systemd bool) { if err == nil { t.Fatalf("expected fork() to fail with restrictive pids limit") } + + // Minimal restrictions are not really supported, due to quirks in using Go + // due to the fact that it spawns random processes. While we do our best with + // late setting cgroup values, it's just too unreliable with very small pids.max. + // As such, we don't test that case. YMMV. } func TestRunWithKernelMemory(t *testing.T) { diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 5b32eb10..1a6dc124 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -5,6 +5,7 @@ package libcontainer import ( "encoding/json" "errors" + "fmt" "io" "os" "os/exec" @@ -86,6 +87,7 @@ func (p *setnsProcess) start() (err error) { if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { return newSystemError(err) } + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemError(err) } @@ -95,6 +97,7 @@ func (p *setnsProcess) start() (err error) { if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { return newSystemError(err) } + // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() return newSystemError(ierr) @@ -198,15 +201,11 @@ func (p *initProcess) start() (err error) { return newSystemError(err) } p.setExternalDescriptors(fds) - // Do this before syncing with child so that no children // can escape the cgroup if err := p.manager.Apply(p.pid()); err != nil { return newSystemError(err) } - if err := p.manager.Set(p.config.Config); err != nil { - return newSystemError(err) - } defer func() { if err != nil { // TODO: should not be the responsibility to call here @@ -232,13 +231,58 @@ func (p *initProcess) start() (err error) { if err := p.sendConfig(); err != nil { return newSystemError(err) } - // wait for the child process to fully complete and receive an error message - // if one was encoutered - var ierr *genericError - if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { + + var ( + procSync syncType + sentRun bool + ierr *genericError + ) + +loop: + for { + if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil { + if err == io.EOF { + break loop + } + return newSystemError(err) + } + + switch procSync { + case procStart: + break loop + case procReady: + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemError(err) + } + // Sync with child. + if err := json.NewEncoder(p.parentPipe).Encode(procRun); err != nil { + return newSystemError(err) + } + sentRun = true + case procError: + // wait for the child process to fully complete and receive an error message + // if one was encoutered + if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { + return newSystemError(err) + } + if ierr != nil { + break loop + } + // Programmer error. + panic("No error following JSON procError payload.") + default: + return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child")) + } + } + if !sentRun { + return newSystemError(fmt.Errorf("could not synchronise with container process")) + } + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemError(err) } + // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { + p.wait() return newSystemError(ierr) } return nil @@ -276,8 +320,7 @@ func (p *initProcess) sendConfig() error { if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { return err } - // shutdown writes for the parent side of the pipe - return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR) + return nil } func (p *initProcess) createNetworkInterfaces() error { diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index ec100578..d3b50863 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -3,6 +3,7 @@ package libcontainer import ( + "io" "os" "syscall" @@ -14,6 +15,7 @@ import ( ) type linuxStandardInit struct { + pipe io.ReadWriter parentPid int config *initConfig } @@ -50,7 +52,6 @@ func (l *linuxStandardInit) Init() error { if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil { return err } - label.Init() // InitializeMountNamespace() can be executed only for a new mount namespace if l.config.Config.Namespaces.Contains(configs.NEWNS) { @@ -75,7 +76,6 @@ func (l *linuxStandardInit) Init() error { return err } } - for _, path := range l.config.Config.ReadonlyPaths { if err := remountReadonly(path); err != nil { return err @@ -90,6 +90,12 @@ func (l *linuxStandardInit) Init() error { if err != nil { return err } + // Tell our parent that we're ready to Execv. This must be done before the + // Seccomp rules have been applied, because we need to be able to read and + // write to a socket. + if err := syncParentReady(l.pipe); err != nil { + return err + } if l.config.Config.Seccomp != nil { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { return err