diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go index 1a8202c7..b6fcc90f 100644 --- a/libcontainer/cgroups/fs/apply_raw.go +++ b/libcontainer/cgroups/fs/apply_raw.go @@ -23,6 +23,7 @@ var ( &MemoryGroup{}, &CpuGroup{}, &CpuacctGroup{}, + &PidsGroup{}, &BlkioGroup{}, &HugetlbGroup{}, &NetClsGroup{}, @@ -105,8 +106,6 @@ func (m *Manager) Apply(pid int) (err error) { return nil } - var c = m.Cgroups - d, err := getCgroupData(m.Cgroups, pid) if err != nil { return err @@ -135,13 +134,6 @@ func (m *Manager) Apply(pid int) (err error) { paths[sys.Name()] = p } m.Paths = paths - - if paths["cpu"] != "" { - if err := CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil { - return err - } - } - return nil } @@ -179,15 +171,34 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { } func (m *Manager) Set(container *configs.Config) error { - for name, path := range m.Paths { - sys, err := subsystems.Get(name) - if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { + for _, sys := range subsystems { + // We can't set this here, because after being applied, memcg doesn't + // allow a non-empty cgroup from having its limits changed. + if sys.Name() == "memory" { continue } + + // Generate fake cgroup data. + d, err := getCgroupData(container.Cgroups, -1) + if err != nil { + return err + } + // Get the path, but don't error out if the cgroup wasn't found. + path, err := d.path(sys.Name()) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + if err := sys.Set(path, container.Cgroups); err != nil { return err } } + + if m.Paths["cpu"] != "" { + if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { + return err + } + } return nil } diff --git a/libcontainer/cgroups/fs/blkio.go b/libcontainer/cgroups/fs/blkio.go index 518cb63f..a142cb99 100644 --- a/libcontainer/cgroups/fs/blkio.go +++ b/libcontainer/cgroups/fs/blkio.go @@ -22,15 +22,10 @@ func (s *BlkioGroup) Name() string { } func (s *BlkioGroup) Apply(d *cgroupData) error { - dir, err := d.join("blkio") + _, err := d.join("blkio") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/cpu.go b/libcontainer/cgroups/fs/cpu.go index ad5f427e..a4ef28a6 100644 --- a/libcontainer/cgroups/fs/cpu.go +++ b/libcontainer/cgroups/fs/cpu.go @@ -22,15 +22,10 @@ func (s *CpuGroup) Name() string { func (s *CpuGroup) Apply(d *cgroupData) error { // We always want to join the cpu group, to allow fair cpu scheduling // on a container basis - dir, err := d.join("cpu") + _, err := d.join("cpu") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/cpuset.go b/libcontainer/cgroups/fs/cpuset.go index 8daacfc6..ed100231 100644 --- a/libcontainer/cgroups/fs/cpuset.go +++ b/libcontainer/cgroups/fs/cpuset.go @@ -64,11 +64,6 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro if err := s.ensureParent(dir, root); err != nil { return err } - // the default values inherit from parent cgroup are already set in - // s.ensureParent, cover these if we have our own - if err := s.Set(dir, cgroup); err != nil { - return err - } // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil { diff --git a/libcontainer/cgroups/fs/devices.go b/libcontainer/cgroups/fs/devices.go index a9883eb4..a41ce801 100644 --- a/libcontainer/cgroups/fs/devices.go +++ b/libcontainer/cgroups/fs/devices.go @@ -15,17 +15,12 @@ func (s *DevicesGroup) Name() string { } func (s *DevicesGroup) Apply(d *cgroupData) error { - dir, err := d.join("devices") + _, err := d.join("devices") if err != nil { // We will return error even it's `not found` error, devices // cgroup is hard requirement for container's security. return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/freezer.go b/libcontainer/cgroups/fs/freezer.go index 6aaad4e0..e70dfe3b 100644 --- a/libcontainer/cgroups/fs/freezer.go +++ b/libcontainer/cgroups/fs/freezer.go @@ -19,15 +19,10 @@ func (s *FreezerGroup) Name() string { } func (s *FreezerGroup) Apply(d *cgroupData) error { - dir, err := d.join("freezer") + _, err := d.join("freezer") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/hugetlb.go b/libcontainer/cgroups/fs/hugetlb.go index ca106da4..2f972771 100644 --- a/libcontainer/cgroups/fs/hugetlb.go +++ b/libcontainer/cgroups/fs/hugetlb.go @@ -19,15 +19,10 @@ func (s *HugetlbGroup) Name() string { } func (s *HugetlbGroup) Apply(d *cgroupData) error { - dir, err := d.join("hugetlb") + _, err := d.join("hugetlb") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go index 6b9687ca..0329692a 100644 --- a/libcontainer/cgroups/fs/memory.go +++ b/libcontainer/cgroups/fs/memory.go @@ -32,7 +32,6 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { return err } } - if err := s.Set(path, d.config); err != nil { return err } diff --git a/libcontainer/cgroups/fs/net_cls.go b/libcontainer/cgroups/fs/net_cls.go index 63823731..8a4054ba 100644 --- a/libcontainer/cgroups/fs/net_cls.go +++ b/libcontainer/cgroups/fs/net_cls.go @@ -15,15 +15,10 @@ func (s *NetClsGroup) Name() string { } func (s *NetClsGroup) Apply(d *cgroupData) error { - dir, err := d.join("net_cls") + _, err := d.join("net_cls") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/net_prio.go b/libcontainer/cgroups/fs/net_prio.go index 0dabaae7..d0ab2af8 100644 --- a/libcontainer/cgroups/fs/net_prio.go +++ b/libcontainer/cgroups/fs/net_prio.go @@ -15,15 +15,10 @@ func (s *NetPrioGroup) Name() string { } func (s *NetPrioGroup) Apply(d *cgroupData) error { - dir, err := d.join("net_prio") + _, err := d.join("net_prio") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } diff --git a/libcontainer/cgroups/fs/pids.go b/libcontainer/cgroups/fs/pids.go new file mode 100644 index 00000000..96cbb896 --- /dev/null +++ b/libcontainer/cgroups/fs/pids.go @@ -0,0 +1,57 @@ +// +build linux + +package fs + +import ( + "fmt" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PidsGroup struct { +} + +func (s *PidsGroup) Name() string { + return "pids" +} + +func (s *PidsGroup) Apply(d *cgroupData) error { + _, err := d.join("pids") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if cgroup.Resources.PidsLimit > 0 { + limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10) + } + + if err := writeFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroup) Remove(d *cgroupData) error { + return removePath(d.path("pids")) +} + +func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { + value, err := getCgroupParamUint(path, "pids.current") + if err != nil { + return fmt.Errorf("failed to parse pids.current - %s", err) + } + + stats.PidsStats.Current = value + return nil +} diff --git a/libcontainer/cgroups/fs/pids_test.go b/libcontainer/cgroups/fs/pids_test.go new file mode 100644 index 00000000..06b11927 --- /dev/null +++ b/libcontainer/cgroups/fs/pids_test.go @@ -0,0 +1,83 @@ +// +build linux + +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +const ( + maxUnlimited = -1 + maxLimited = 1024 +) + +func TestPidsSetMax(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.max": "max", + }) + + helper.CgroupData.config.Resources.PidsLimit = maxLimited + pids := &PidsGroup{} + if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := getCgroupParamUint(helper.CgroupPath, "pids.max") + if err != nil { + t.Fatalf("Failed to parse pids.max - %s", err) + } + + if value != maxLimited { + t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value) + } +} + +func TestPidsSetUnlimited(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.max": strconv.Itoa(maxLimited), + }) + + helper.CgroupData.config.Resources.PidsLimit = maxUnlimited + pids := &PidsGroup{} + if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := getCgroupParamString(helper.CgroupPath, "pids.max") + if err != nil { + t.Fatalf("Failed to parse pids.max - %s", err) + } + + if value != "max" { + t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value) + } +} + +func TestPidsStats(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.current": strconv.Itoa(1337), + "pids.max": strconv.Itoa(maxLimited), + }) + + pids := &PidsGroup{} + stats := *cgroups.NewStats() + if err := pids.GetStats(helper.CgroupPath, &stats); err != nil { + t.Fatal(err) + } + + if stats.PidsStats.Current != 1337 { + t.Fatalf("Expected %d, got %d for pids.current", 1337, stats.PidsStats.Current) + } +} diff --git a/libcontainer/cgroups/stats.go b/libcontainer/cgroups/stats.go index bda32b20..74c65abf 100644 --- a/libcontainer/cgroups/stats.go +++ b/libcontainer/cgroups/stats.go @@ -49,6 +49,11 @@ type MemoryStats struct { Stats map[string]uint64 `json:"stats,omitempty"` } +type PidsStats struct { + // number of pids in the cgroup + Current uint64 `json:"current,omitempty"` +} + type BlkioStatEntry struct { Major uint64 `json:"major,omitempty"` Minor uint64 `json:"minor,omitempty"` @@ -80,6 +85,7 @@ type HugetlbStats struct { type Stats struct { CpuStats CpuStats `json:"cpu_stats,omitempty"` MemoryStats MemoryStats `json:"memory_stats,omitempty"` + PidsStats PidsStats `json:"pids_stats,omitempty"` BlkioStats BlkioStats `json:"blkio_stats,omitempty"` // the map is in the format "size of hugepage: stats of the hugepage" HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` diff --git a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go index 9a3987d4..f8d3a955 100644 --- a/libcontainer/cgroups/systemd/apply_systemd.go +++ b/libcontainer/cgroups/systemd/apply_systemd.go @@ -55,6 +55,7 @@ var subsystems = subsystemSet{ &fs.MemoryGroup{}, &fs.CpuGroup{}, &fs.CpuacctGroup{}, + &fs.PidsGroup{}, &fs.BlkioGroup{}, &fs.HugetlbGroup{}, &fs.PerfEventGroup{}, @@ -233,7 +234,7 @@ func (m *Manager) Apply(pid int) error { return err } - // we need to manually join the freezer, net_cls, net_prio and cpuset cgroup in systemd + // we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd // because it does not currently support it via the dbus api. if err := joinFreezer(c, pid); err != nil { return err @@ -246,6 +247,10 @@ func (m *Manager) Apply(pid int) error { return err } + if err := joinPids(c, pid); err != nil { + return err + } + if err := joinCpuset(c, pid); err != nil { return err } @@ -277,13 +282,6 @@ func (m *Manager) Apply(pid int) error { paths[s.Name()] = subsystemPath } m.Paths = paths - - if paths["cpu"] != "" { - if err := fs.CheckCpushares(paths["cpu"], c.Resources.CpuShares); err != nil { - return err - } - } - return nil } @@ -330,68 +328,43 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { } func joinCpu(c *configs.Cgroup, pid int) error { - path, err := getSubsystemPath(c, "cpu") + _, err := join(c, "cpu", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - if c.Resources.CpuQuota != 0 { - if err = writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(c.Resources.CpuQuota, 10)); err != nil { - return err - } - } - if c.Resources.CpuPeriod != 0 { - if err = writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(c.Resources.CpuPeriod, 10)); err != nil { - return err - } - } - if c.Resources.CpuRtPeriod != 0 { - if err = writeFile(path, "cpu.rt_period_us", strconv.FormatInt(c.Resources.CpuRtPeriod, 10)); err != nil { - return err - } - } - if c.Resources.CpuRtRuntime != 0 { - if err = writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(c.Resources.CpuRtRuntime, 10)); err != nil { - return err - } - } - return nil } func joinFreezer(c *configs.Cgroup, pid int) error { - path, err := join(c, "freezer", pid) + _, err := join(c, "freezer", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - freezer, err := subsystems.Get("freezer") - if err != nil { - return err - } - return freezer.Set(path, c) + return nil } func joinNetPrio(c *configs.Cgroup, pid int) error { - path, err := join(c, "net_prio", pid) + _, err := join(c, "net_prio", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - netPrio, err := subsystems.Get("net_prio") - if err != nil { - return err - } - return netPrio.Set(path, c) + return nil } func joinNetCls(c *configs.Cgroup, pid int) error { - path, err := join(c, "net_cls", pid) + _, err := join(c, "net_cls", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - netcls, err := subsystems.Get("net_cls") - if err != nil { + return nil +} + +func joinPids(c *configs.Cgroup, pid int) error { + _, err := join(c, "pids", pid) + if err != nil && !cgroups.IsNotFound(err) { return err } - return netcls.Set(path, c) + return nil } func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { @@ -466,16 +439,29 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { } func (m *Manager) Set(container *configs.Config) error { - for name, path := range m.Paths { - sys, err := subsystems.Get(name) - if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { + for _, sys := range subsystems { + // We can't set this here, because after being applied, memcg doesn't + // allow a non-empty cgroup from having its limits changed. + if sys.Name() == "memory" { continue } + + // Get the subsystem path, but don't error out for not found cgroups. + path, err := getSubsystemPath(container.Cgroups, sys.Name()) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + if err := sys.Set(path, container.Cgroups); err != nil { return err } } + if m.Paths["cpu"] != "" { + if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { + return err + } + } return nil } @@ -495,17 +481,13 @@ func getUnitName(c *configs.Cgroup) string { // because systemd will re-write the device settings if it needs to re-apply the cgroup context. // This happens at least for v208 when any sibling unit is started. func joinDevices(c *configs.Cgroup, pid int) error { - path, err := join(c, "devices", pid) + _, err := join(c, "devices", pid) // Even if it's `not found` error, we'll return err because devices cgroup // is hard requirement for container security. if err != nil { return err } - devices, err := subsystems.Get("devices") - if err != nil { - return err - } - return devices.Set(path, c) + return nil } func setKernelMemory(c *configs.Cgroup) error { @@ -518,9 +500,10 @@ func setKernelMemory(c *configs.Cgroup) error { return err } + // Shamelessly copied from fs.(*MemoryGroup).Set(). This doesn't get called + // by manager.Set, so we need to do it here. if c.Resources.KernelMemory > 0 { - err = writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(c.Resources.KernelMemory, 10)) - if err != nil { + if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(c.Resources.KernelMemory, 10)); err != nil { return err } } @@ -529,21 +512,27 @@ func setKernelMemory(c *configs.Cgroup) error { } func joinMemory(c *configs.Cgroup, pid int) error { - path, err := getSubsystemPath(c, "memory") + path, err := join(c, "memory", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - // -1 disables memoryswap - if c.Resources.MemorySwap > 0 { - err = writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Resources.MemorySwap, 10)) - if err != nil { + // Shamelessly copied from fs.(*MemoryGroup).Set(). This doesn't include the + // kernel memory limit (which is done in setKernelMemory). All of these are + // not set by manager.Set, we have do do it here. + + if c.Resources.Memory != 0 { + if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(c.Resources.Memory, 10)); err != nil { return err } } - if c.Resources.MemoryReservation > 0 { - err = writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Resources.MemoryReservation, 10)) - if err != nil { + if c.Resources.MemoryReservation != 0 { + if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Resources.MemoryReservation, 10)); err != nil { + return err + } + } + if c.Resources.MemorySwap > 0 { + if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Resources.MemorySwap, 10)); err != nil { return err } } @@ -552,10 +541,8 @@ func joinMemory(c *configs.Cgroup, pid int) error { return err } } - if c.Resources.MemorySwappiness >= 0 && c.Resources.MemorySwappiness <= 100 { - err = writeFile(path, "memory.swappiness", strconv.FormatInt(c.Resources.MemorySwappiness, 10)) - if err != nil { + if err := writeFile(path, "memory.swappiness", strconv.FormatInt(c.Resources.MemorySwappiness, 10)); err != nil { return err } } else if c.Resources.MemorySwappiness == -1 { @@ -585,68 +572,25 @@ func joinCpuset(c *configs.Cgroup, pid int) error { // expects device path instead of major minor numbers, which is also confusing // for users. So we use fs work around for now. func joinBlkio(c *configs.Cgroup, pid int) error { - path, err := getSubsystemPath(c, "blkio") + _, err := join(c, "blkio", pid) if err != nil { return err } - // systemd doesn't directly support this in the dbus properties - if c.Resources.BlkioLeafWeight != 0 { - if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(c.Resources.BlkioLeafWeight), 10)); err != nil { - return err - } - } - for _, wd := range c.Resources.BlkioWeightDevice { - if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil { - return err - } - if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { - return err - } - } - for _, td := range c.Resources.BlkioThrottleReadBpsDevice { - if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { - return err - } - } - for _, td := range c.Resources.BlkioThrottleWriteBpsDevice { - if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { - return err - } - } - for _, td := range c.Resources.BlkioThrottleReadIOPSDevice { - if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { - return err - } - } - for _, td := range c.Resources.BlkioThrottleWriteIOPSDevice { - if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { - return err - } - } - return nil } func joinHugetlb(c *configs.Cgroup, pid int) error { - path, err := join(c, "hugetlb", pid) + _, err := join(c, "hugetlb", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - hugetlb, err := subsystems.Get("hugetlb") - if err != nil { - return err - } - return hugetlb.Set(path, c) + return nil } func joinPerfEvent(c *configs.Cgroup, pid int) error { - path, err := join(c, "perf_event", pid) + _, err := join(c, "perf_event", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - perfEvent, err := subsystems.Get("perf_event") - if err != nil { - return err - } - return perfEvent.Set(path, c) + return nil } diff --git a/libcontainer/configs/cgroup_unix.go b/libcontainer/configs/cgroup_unix.go index ef781324..08b669a4 100644 --- a/libcontainer/configs/cgroup_unix.go +++ b/libcontainer/configs/cgroup_unix.go @@ -64,6 +64,9 @@ type Resources struct { // MEM to use CpusetMems string `json:"cpuset_mems"` + // Process limit; set <= `0' to disable limit. + PidsLimit int64 `json:"pids_limit"` + // Specifies per cgroup weight, range is from 10 to 1000. BlkioWeight uint16 `json:"blkio_weight"` diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index c2d359ed..ae5db9cd 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -5,7 +5,6 @@ package libcontainer import ( "encoding/json" "fmt" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -226,21 +225,29 @@ func (l *LinuxFactory) StartInitialization() (err error) { // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() + var i initer defer func() { // if we have an error during the initialization of the container's init then send it back to the // parent process in the form of an initError. if err != nil { - // ensure that any data sent from the parent is consumed so it doesn't - // receive ECONNRESET when the child writes to the pipe. - ioutil.ReadAll(pipe) + if _, ok := i.(*linuxStandardInit); ok { + // Synchronisation only necessary for standard init. + if err := json.NewEncoder(pipe).Encode(procError); err != nil { + panic(err) + } + } if err := json.NewEncoder(pipe).Encode(newSystemError(err)); err != nil { panic(err) } + } else { + if err := json.NewEncoder(pipe).Encode(procStart); err != nil { + panic(err) + } } // ensure that this pipe is always closed pipe.Close() }() - i, err := newContainerInit(it, pipe) + i, err = newContainerInit(it, pipe) if err != nil { return err } diff --git a/libcontainer/generic_error.go b/libcontainer/generic_error.go index 6fbc2d75..e4872e2d 100644 --- a/libcontainer/generic_error.go +++ b/libcontainer/generic_error.go @@ -9,6 +9,15 @@ import ( "github.com/opencontainers/runc/libcontainer/stacktrace" ) +type syncType uint8 + +const ( + procReady syncType = iota + procError + procStart + procRun +) + var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} Code: {{.ECode}} {{if .Message }} diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 5637e40c..57e53782 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -5,6 +5,7 @@ package libcontainer import ( "encoding/json" "fmt" + "io" "io/ioutil" "net" "os" @@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) { }, nil case initStandard: return &linuxStandardInit{ + pipe: pipe, parentPid: syscall.Getppid(), config: config, }, nil @@ -140,6 +142,28 @@ func finalizeNamespace(config *initConfig) error { return nil } +// syncParentReady sends to the given pipe a JSON payload which indicates that +// the init is ready to Exec the child process. It then waits for the parent to +// indicate that it is cleared to Exec. +func syncParentReady(pipe io.ReadWriter) error { + // Tell parent. + if err := json.NewEncoder(pipe).Encode(procReady); err != nil { + return err + } + + // Wait for parent to give the all-clear. + var procSync syncType + if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { + if err == io.EOF { + return fmt.Errorf("parent closed synchronisation channel") + } + if procSync != procRun { + return fmt.Errorf("invalid synchronisation flag from parent") + } + } + return nil +} + // joinExistingNamespaces gets all the namespace paths specified for the container and // does a setns on the namespace fd so that the current process joins the namespace. func joinExistingNamespaces(namespaces []configs.Namespace) error { diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 59c8afd8..6be1dd60 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -525,6 +525,75 @@ func testCpuShares(t *testing.T, systemd bool) { } } +func TestPids(t *testing.T) { + testPids(t, false) +} + +func TestPidsSystemd(t *testing.T) { + if !systemd.UseSystemd() { + t.Skip("Systemd is unsupported") + } + testPids(t, true) +} + +func testPids(t *testing.T, systemd bool) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(rootfs) + if systemd { + config.Cgroups.Parent = "system.slice" + } + config.Cgroups.Resources.PidsLimit = -1 + + // Running multiple processes. + _, ret, err := runContainer(config, "", "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true") + if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { + t.Skip("PIDs cgroup is unsupported") + } + ok(t, err) + + if ret != 0 { + t.Fatalf("expected fork() to succeed with no pids limit") + } + + // Enforce a permissive limit (shell + 6 * true + 3). + config.Cgroups.Resources.PidsLimit = 10 + _, ret, err = runContainer(config, "", "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true") + if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { + t.Skip("PIDs cgroup is unsupported") + } + ok(t, err) + + if ret != 0 { + t.Fatalf("expected fork() to succeed with permissive pids limit") + } + + // Enforce a restrictive limit (shell + 6 * true + 3). + config.Cgroups.Resources.PidsLimit = 10 + out, ret, err := runContainer(config, "", "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true") + if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { + t.Skip("PIDs cgroup is unsupported") + } + if err != nil && !strings.Contains(out.String(), "sh: can't fork") { + ok(t, err) + } + + if err == nil { + t.Fatalf("expected fork() to fail with restrictive pids limit") + } + + // Minimal restrictions are not really supported, due to quirks in using Go + // due to the fact that it spawns random processes. While we do our best with + // late setting cgroup values, it's just too unreliable with very small pids.max. + // As such, we don't test that case. YMMV. +} + func TestRunWithKernelMemory(t *testing.T) { testRunWithKernelMemory(t, false) } diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 114c71b3..1a6dc124 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -5,6 +5,7 @@ package libcontainer import ( "encoding/json" "errors" + "fmt" "io" "os" "os/exec" @@ -86,6 +87,7 @@ func (p *setnsProcess) start() (err error) { if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { return newSystemError(err) } + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemError(err) } @@ -95,6 +97,7 @@ func (p *setnsProcess) start() (err error) { if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { return newSystemError(err) } + // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() return newSystemError(ierr) @@ -198,7 +201,6 @@ func (p *initProcess) start() (err error) { return newSystemError(err) } p.setExternalDescriptors(fds) - // Do this before syncing with child so that no children // can escape the cgroup if err := p.manager.Apply(p.pid()); err != nil { @@ -229,13 +231,58 @@ func (p *initProcess) start() (err error) { if err := p.sendConfig(); err != nil { return newSystemError(err) } - // wait for the child process to fully complete and receive an error message - // if one was encoutered - var ierr *genericError - if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { + + var ( + procSync syncType + sentRun bool + ierr *genericError + ) + +loop: + for { + if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil { + if err == io.EOF { + break loop + } + return newSystemError(err) + } + + switch procSync { + case procStart: + break loop + case procReady: + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemError(err) + } + // Sync with child. + if err := json.NewEncoder(p.parentPipe).Encode(procRun); err != nil { + return newSystemError(err) + } + sentRun = true + case procError: + // wait for the child process to fully complete and receive an error message + // if one was encoutered + if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { + return newSystemError(err) + } + if ierr != nil { + break loop + } + // Programmer error. + panic("No error following JSON procError payload.") + default: + return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child")) + } + } + if !sentRun { + return newSystemError(fmt.Errorf("could not synchronise with container process")) + } + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemError(err) } + // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { + p.wait() return newSystemError(ierr) } return nil @@ -273,8 +320,7 @@ func (p *initProcess) sendConfig() error { if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { return err } - // shutdown writes for the parent side of the pipe - return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR) + return nil } func (p *initProcess) createNetworkInterfaces() error { diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index ec100578..d3b50863 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -3,6 +3,7 @@ package libcontainer import ( + "io" "os" "syscall" @@ -14,6 +15,7 @@ import ( ) type linuxStandardInit struct { + pipe io.ReadWriter parentPid int config *initConfig } @@ -50,7 +52,6 @@ func (l *linuxStandardInit) Init() error { if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil { return err } - label.Init() // InitializeMountNamespace() can be executed only for a new mount namespace if l.config.Config.Namespaces.Contains(configs.NEWNS) { @@ -75,7 +76,6 @@ func (l *linuxStandardInit) Init() error { return err } } - for _, path := range l.config.Config.ReadonlyPaths { if err := remountReadonly(path); err != nil { return err @@ -90,6 +90,12 @@ func (l *linuxStandardInit) Init() error { if err != nil { return err } + // Tell our parent that we're ready to Execv. This must be done before the + // Seccomp rules have been applied, because we need to be able to read and + // write to a socket. + if err := syncParentReady(l.pipe); err != nil { + return err + } if l.config.Config.Seccomp != nil { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { return err diff --git a/spec.go b/spec.go index fbe31bb2..6c5e4790 100644 --- a/spec.go +++ b/spec.go @@ -456,6 +456,7 @@ func createCgroupConfig(name string, spec *specs.LinuxRuntimeSpec, devices []*co c.Resources.CpuRtPeriod = r.CPU.RealtimePeriod c.Resources.CpusetCpus = r.CPU.Cpus c.Resources.CpusetMems = r.CPU.Mems + c.Resources.PidsLimit = r.Pids.Limit c.Resources.BlkioWeight = r.BlockIO.Weight c.Resources.BlkioLeafWeight = r.BlockIO.LeafWeight for _, wd := range r.BlockIO.WeightDevice {