Merge pull request #2113 from giuseppe/cgroupv2

libcontainer: initial support for cgroups v2
This commit is contained in:
Mrunal Patel 2019-09-05 13:14:29 -07:00 committed by GitHub
commit 92ac8e3f84
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 1244 additions and 27 deletions

View File

@ -18,7 +18,7 @@ import (
)
var (
subsystems = subsystemSet{
subsystemsLegacy = subsystemSet{
&CpusetGroup{},
&DevicesGroup{},
&MemoryGroup{},
@ -33,6 +33,14 @@ var (
&FreezerGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
}
subsystemsUnified = subsystemSet{
&CpusetGroupV2{},
&FreezerGroupV2{},
&CpuGroupV2{},
&MemoryGroupV2{},
&IOGroupV2{},
&PidsGroupV2{},
}
HugePageSizes, _ = cgroups.GetHugePageSize()
)
@ -129,6 +137,13 @@ func isIgnorableError(rootless bool, err error) bool {
return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
}
func (m *Manager) getSubsystems() subsystemSet {
if cgroups.IsCgroup2UnifiedMode() {
return subsystemsUnified
}
return subsystemsLegacy
}
func (m *Manager) Apply(pid int) (err error) {
if m.Cgroups == nil {
return nil
@ -158,7 +173,7 @@ func (m *Manager) Apply(pid int) (err error) {
return cgroups.EnterPid(m.Paths, pid)
}
for _, sys := range subsystems {
for _, sys := range m.getSubsystems() {
// TODO: Apply should, ideally, be reentrant or be broken up into a separate
// create and join phase so that the cgroup hierarchy for a container can be
// created then join consists of writing the process pids to cgroup.procs
@ -214,7 +229,7 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
defer m.mu.Unlock()
stats := cgroups.NewStats()
for name, path := range m.Paths {
sys, err := subsystems.Get(name)
sys, err := m.getSubsystems().Get(name)
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
continue
}
@ -237,7 +252,7 @@ func (m *Manager) Set(container *configs.Config) error {
}
paths := m.GetPaths()
for _, sys := range subsystems {
for _, sys := range m.getSubsystems() {
path := paths[sys.Name()]
if err := sys.Set(path, container.Cgroups); err != nil {
if m.Rootless && sys.Name() == "devices" {
@ -274,7 +289,7 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
dir := paths["freezer"]
prevState := m.Cgroups.Resources.Freezer
m.Cgroups.Resources.Freezer = state
freezer, err := subsystems.Get("freezer")
freezer, err := m.getSubsystems().Get("freezer")
if err != nil {
return err
}

View File

@ -0,0 +1,92 @@
// +build linux
package fs
import (
"bufio"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type CpuGroupV2 struct {
}
func (s *CpuGroupV2) Name() string {
return "cpu"
}
func (s *CpuGroupV2) Apply(d *cgroupData) error {
// We always want to join the cpu group, to allow fair cpu scheduling
// on a container basis
path, err := d.path("cpu")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return s.ApplyDir(path, d.config, d.pid)
}
func (s *CpuGroupV2) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error {
// This might happen if we have no cpu cgroup mounted.
// Just do nothing and don't fail.
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
return cgroups.WriteCgroupProc(path, pid)
}
func (s *CpuGroupV2) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuWeight != 0 {
if err := writeFile(path, "cpu.weight", strconv.FormatUint(cgroup.Resources.CpuWeight, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuMax != "" {
if err := writeFile(path, "cpu.max", cgroup.Resources.CpuMax); err != nil {
return err
}
}
return nil
}
func (s *CpuGroupV2) Remove(d *cgroupData) error {
return removePath(d.path("cpu"))
}
func (s *CpuGroupV2) GetStats(path string, stats *cgroups.Stats) error {
f, err := os.Open(filepath.Join(path, "cpu.stat"))
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := getCgroupParamKeyValue(sc.Text())
if err != nil {
return err
}
switch t {
case "usage_usec":
stats.CpuStats.CpuUsage.TotalUsage = v * 1000
case "user_usec":
stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000
case "system_usec":
stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000
}
}
return nil
}

View File

@ -0,0 +1,159 @@
// +build linux
package fs
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
)
type CpusetGroupV2 struct {
}
func (s *CpusetGroupV2) Name() string {
return "cpuset"
}
func (s *CpusetGroupV2) Apply(d *cgroupData) error {
dir, err := d.path("cpuset")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return s.ApplyDir(dir, d.config, d.pid)
}
func (s *CpusetGroupV2) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpusetCpus != "" {
if err := writeFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
return err
}
}
if cgroup.Resources.CpusetMems != "" {
if err := writeFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
return err
}
}
return nil
}
func (s *CpusetGroupV2) Remove(d *cgroupData) error {
return removePath(d.path("cpuset"))
}
func (s *CpusetGroupV2) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
func (s *CpusetGroupV2) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
// This might happen if we have no cpuset cgroup mounted.
// Just do nothing and don't fail.
if dir == "" {
return nil
}
mountInfo, err := ioutil.ReadFile("/proc/self/mountinfo")
if err != nil {
return err
}
root := filepath.Dir(cgroups.GetClosestMountpointAncestor(dir, string(mountInfo)))
// 'ensureParent' start with parent because we don't want to
// explicitly inherit from parent, it could conflict with
// 'cpuset.cpu_exclusive'.
if err := s.ensureParent(filepath.Dir(dir), root); err != nil {
return err
}
if err := os.MkdirAll(dir, 0755); err != nil {
return err
}
// We didn't inherit cpuset configs from parent, but we have
// to ensure cpuset configs are set before moving task into the
// cgroup.
// The logic is, if user specified cpuset configs, use these
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatibility.
if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
return cgroups.WriteCgroupProc(dir, pid)
}
func (s *CpusetGroupV2) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) {
if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus.effective")); err != nil {
return
}
if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems.effective")); err != nil {
return
}
return cpus, mems, nil
}
// ensureParent makes sure that the parent directory of current is created
// and populated with the proper cpus and mems files copied from
// it's parent.
func (s *CpusetGroupV2) ensureParent(current, root string) error {
parent := filepath.Dir(current)
if libcontainerUtils.CleanPath(parent) == root {
return nil
}
// Avoid infinite recursion.
if parent == current {
return fmt.Errorf("cpuset: cgroup parent path outside cgroup root")
}
if err := s.ensureParent(parent, root); err != nil {
return err
}
if err := os.MkdirAll(current, 0755); err != nil {
return err
}
return s.copyIfNeeded(current, parent)
}
// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// directory to the current directory if the file's contents are 0
func (s *CpusetGroupV2) copyIfNeeded(current, parent string) error {
var (
err error
currentCpus, currentMems []byte
parentCpus, parentMems []byte
)
if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil {
return err
}
if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil {
return err
}
if s.isEmpty(currentCpus) {
if err := writeFile(current, "cpuset.cpus", string(parentCpus)); err != nil {
return err
}
}
if s.isEmpty(currentMems) {
if err := writeFile(current, "cpuset.mems", string(parentMems)); err != nil {
return err
}
}
return nil
}
func (s *CpusetGroupV2) isEmpty(b []byte) bool {
return len(bytes.Trim(b, "\n")) == 0
}
func (s *CpusetGroupV2) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
if err := s.Set(path, cgroup); err != nil {
return err
}
return s.copyIfNeeded(path, filepath.Dir(path))
}

View File

@ -0,0 +1,74 @@
// +build linux
package fs
import (
"fmt"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type FreezerGroupV2 struct {
}
func (s *FreezerGroupV2) Name() string {
return "freezer"
}
func (s *FreezerGroupV2) Apply(d *cgroupData) error {
_, err := d.join("freezer")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *FreezerGroupV2) Set(path string, cgroup *configs.Cgroup) error {
var desiredState string
filename := "cgroup.freeze"
if cgroup.Resources.Freezer == configs.Frozen {
desiredState = "1"
} else {
desiredState = "0"
}
switch cgroup.Resources.Freezer {
case configs.Frozen, configs.Thawed:
for {
// In case this loop does not exit because it doesn't get the expected
// state, let's write again this state, hoping it's going to be properly
// set this time. Otherwise, this loop could run infinitely, waiting for
// a state change that would never happen.
if err := writeFile(path, filename, desiredState); err != nil {
return err
}
state, err := readFile(path, filename)
if err != nil {
return err
}
if strings.TrimSpace(state) == desiredState {
break
}
time.Sleep(1 * time.Millisecond)
}
case configs.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
}
return nil
}
func (s *FreezerGroupV2) Remove(d *cgroupData) error {
return removePath(d.path("freezer"))
}
func (s *FreezerGroupV2) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View File

@ -0,0 +1,191 @@
// +build linux
package fs
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type IOGroupV2 struct {
}
func (s *IOGroupV2) Name() string {
return "blkio"
}
func (s *IOGroupV2) Apply(d *cgroupData) error {
_, err := d.join("blkio")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *IOGroupV2) Set(path string, cgroup *configs.Cgroup) error {
cgroupsv2 := cgroups.IsCgroup2UnifiedMode()
if cgroup.Resources.BlkioWeight != 0 {
filename := "blkio.weight"
if cgroupsv2 {
filename = "io.bfq.weight"
}
if err := writeFile(path, filename, strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
return err
}
}
if cgroup.Resources.BlkioLeafWeight != 0 {
if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range cgroup.Resources.BlkioWeightDevice {
if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
return err
}
if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
return err
}
}
for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
if cgroupsv2 {
if err := writeFile(path, "io.max", td.StringName("rbps")); err != nil {
return err
}
} else {
if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
}
for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
if cgroupsv2 {
if err := writeFile(path, "io.max", td.StringName("wbps")); err != nil {
return err
}
} else {
if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
}
for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
if cgroupsv2 {
if err := writeFile(path, "io.max", td.StringName("riops")); err != nil {
return err
}
} else {
if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
}
for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
if cgroupsv2 {
if err := writeFile(path, "io.max", td.StringName("wiops")); err != nil {
return err
}
} else {
if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}
}
}
return nil
}
func (s *IOGroupV2) Remove(d *cgroupData) error {
return removePath(d.path("blkio"))
}
func readCgroup2MapFile(path string, name string) (map[string][]string, error) {
ret := map[string][]string{}
p := filepath.Join("/sys/fs/cgroup", path, name)
f, err := os.Open(p)
if err != nil {
if os.IsNotExist(err) {
return ret, nil
}
return nil, err
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := scanner.Text()
parts := strings.Fields(line)
if len(parts) < 2 {
continue
}
ret[parts[0]] = parts[1:]
}
if err := scanner.Err(); err != nil {
return nil, err
}
return ret, nil
}
func (s *IOGroupV2) getCgroupV2Stats(path string, stats *cgroups.Stats) error {
// more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
var ioServiceBytesRecursive []cgroups.BlkioStatEntry
values, err := readCgroup2MapFile(path, "io.stat")
if err != nil {
return err
}
for k, v := range values {
d := strings.Split(k, ":")
if len(d) != 2 {
continue
}
minor, err := strconv.ParseUint(d[0], 10, 0)
if err != nil {
return err
}
major, err := strconv.ParseUint(d[1], 10, 0)
if err != nil {
return err
}
for _, item := range v {
d := strings.Split(item, "=")
if len(d) != 2 {
continue
}
op := d[0]
// Accommodate the cgroup v1 naming
switch op {
case "rbytes":
op = "read"
case "wbytes":
op = "write"
}
value, err := strconv.ParseUint(d[1], 10, 0)
if err != nil {
return err
}
entry := cgroups.BlkioStatEntry{
Op: op,
Major: major,
Minor: minor,
Value: value,
}
ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry)
}
}
stats.BlkioStats = cgroups.BlkioStats{IoServiceBytesRecursive: ioServiceBytesRecursive}
return nil
}
func (s *IOGroupV2) GetStats(path string, stats *cgroups.Stats) error {
return s.getCgroupV2Stats(path, stats)
}

View File

@ -0,0 +1,164 @@
// +build linux
package fs
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type MemoryGroupV2 struct {
}
func (s *MemoryGroupV2) Name() string {
return "memory"
}
func (s *MemoryGroupV2) Apply(d *cgroupData) (err error) {
path, err := d.path("memory")
if err != nil && !cgroups.IsNotFound(err) {
return err
} else if path == "" {
return nil
}
if memoryAssigned(d.config) {
if _, err := os.Stat(path); os.IsNotExist(err) {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// Only enable kernel memory accouting when this cgroup
// is created by libcontainer, otherwise we might get
// error when people use `cgroupsPath` to join an existed
// cgroup whose kernel memory is not initialized.
if err := EnableKernelMemoryAccounting(path); err != nil {
return err
}
}
}
defer func() {
if err != nil {
os.RemoveAll(path)
}
}()
// We need to join memory cgroup after set memory limits, because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
_, err = d.join("memory")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func setMemoryAndSwapCgroups(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.MemorySwap != 0 {
if err := writeFile(path, "memory.swap.max", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
if cgroup.Resources.Memory != 0 {
if err := writeFile(path, "memory.max", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
}
return nil
}
func (s *MemoryGroupV2) Set(path string, cgroup *configs.Cgroup) error {
if err := setMemoryAndSwapCgroups(path, cgroup); err != nil {
return err
}
if cgroup.Resources.KernelMemory != 0 {
if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil {
return err
}
}
if cgroup.Resources.MemoryReservation != 0 {
if err := writeFile(path, "memory.high", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err
}
}
return nil
}
func (s *MemoryGroupV2) Remove(d *cgroupData) error {
return removePath(d.path("memory"))
}
func (s *MemoryGroupV2) GetStats(path string, stats *cgroups.Stats) error {
// Set stats from memory.stat.
statsFile, err := os.Open(filepath.Join(path, "memory.stat"))
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
defer statsFile.Close()
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := getCgroupParamKeyValue(sc.Text())
if err != nil {
return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err)
}
stats.MemoryStats.Stats[t] = v
}
stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]
memoryUsage, err := getMemoryDataV2(path, "")
if err != nil {
return err
}
stats.MemoryStats.Usage = memoryUsage
swapUsage, err := getMemoryDataV2(path, "swap")
if err != nil {
return err
}
stats.MemoryStats.SwapUsage = swapUsage
stats.MemoryStats.UseHierarchy = true
return nil
}
func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
memoryData := cgroups.MemoryData{}
moduleName := "memory"
if name != "" {
moduleName = strings.Join([]string{"memory", name}, ".")
}
usage := strings.Join([]string{moduleName, "current"}, ".")
limit := strings.Join([]string{moduleName, "max"}, ".")
value, err := getCgroupParamUint(path, usage)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err)
}
memoryData.Usage = value
value, err = getCgroupParamUint(path, limit)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
}
memoryData.Limit = value
return memoryData, nil
}

View File

@ -0,0 +1,107 @@
// +build linux
package fs
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
type PidsGroupV2 struct {
}
func (s *PidsGroupV2) Name() string {
return "pids"
}
func (s *PidsGroupV2) Apply(d *cgroupData) error {
_, err := d.join("pids")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *PidsGroupV2) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.PidsLimit != 0 {
// "max" is the fallback value.
limit := "max"
if cgroup.Resources.PidsLimit > 0 {
limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
}
if err := writeFile(path, "pids.max", limit); err != nil {
return err
}
}
return nil
}
func (s *PidsGroupV2) Remove(d *cgroupData) error {
return removePath(d.path("pids"))
}
func isNOTSUP(err error) bool {
switch err := err.(type) {
case *os.PathError:
return err.Err == unix.ENOTSUP
default:
return false
}
}
func (s *PidsGroupV2) GetStats(path string, stats *cgroups.Stats) error {
current, err := getCgroupParamUint(path, "pids.current")
if os.IsNotExist(err) {
// if the controller is not enabled, let's read the list
// PIDs (or threads if cgroup.threads is enabled)
contents, err := ioutil.ReadFile(filepath.Join(path, "cgroup.procs"))
if err != nil && isNOTSUP(err) {
contents, err = ioutil.ReadFile(filepath.Join(path, "cgroup.threads"))
}
if err != nil {
return err
}
pids := make(map[string]string)
for _, i := range strings.Split(string(contents), "\n") {
if i != "" {
pids[i] = i
}
}
stats.PidsStats.Current = uint64(len(pids))
stats.PidsStats.Limit = 0
return nil
}
if err != nil {
return fmt.Errorf("failed to parse pids.current - %s", err)
}
maxString, err := getCgroupParamString(path, "pids.max")
if err != nil {
return fmt.Errorf("failed to parse pids.max - %s", err)
}
// Default if pids.max == "max" is 0 -- which represents "no limit".
var max uint64
if maxString != "max" {
max, err = parseUint(maxString, 10, 64)
if err != nil {
return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max"))
}
}
stats.PidsStats.Current = current
stats.PidsStats.Limit = max
return nil
}

View File

@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"io/ioutil"
"math"
"path/filepath"
"strconv"
"strings"
@ -59,8 +60,12 @@ func getCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) {
if err != nil {
return 0, err
}
trimmed := strings.TrimSpace(string(contents))
if trimmed == "max" {
return math.MaxUint64, nil
}
res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64)
res, err := parseUint(trimmed, 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), fileName)
}

View File

@ -21,7 +21,7 @@ import (
"github.com/sirupsen/logrus"
)
type Manager struct {
type LegacyManager struct {
mu sync.Mutex
Cgroups *configs.Cgroup
Paths map[string]string
@ -49,7 +49,7 @@ func (s subsystemSet) Get(name string) (subsystem, error) {
return nil, errSubsystemDoesNotExist
}
var subsystems = subsystemSet{
var legacySubsystems = subsystemSet{
&fs.CpusetGroup{},
&fs.DevicesGroup{},
&fs.MemoryGroup{},
@ -119,15 +119,23 @@ func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]s
if !isRunningSystemd() {
return nil, fmt.Errorf("systemd not running on this host, can't use systemd as a cgroups.Manager")
}
if cgroups.IsCgroup2UnifiedMode() {
return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &UnifiedManager{
Cgroups: config,
Paths: paths,
}
}, nil
}
return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &Manager{
return &LegacyManager{
Cgroups: config,
Paths: paths,
}
}, nil
}
func (m *Manager) Apply(pid int) error {
func (m *LegacyManager) Apply(pid int) error {
var (
c = m.Cgroups
unitName = getUnitName(c)
@ -253,7 +261,7 @@ func (m *Manager) Apply(pid int) error {
}
paths := make(map[string]string)
for _, s := range subsystems {
for _, s := range legacySubsystems {
subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name())
if err != nil {
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
@ -268,7 +276,7 @@ func (m *Manager) Apply(pid int) error {
return nil
}
func (m *Manager) Destroy() error {
func (m *LegacyManager) Destroy() error {
if m.Cgroups.Paths != nil {
return nil
}
@ -282,7 +290,7 @@ func (m *Manager) Destroy() error {
return nil
}
func (m *Manager) GetPaths() map[string]string {
func (m *LegacyManager) GetPaths() map[string]string {
m.mu.Lock()
paths := m.Paths
m.mu.Unlock()
@ -294,6 +302,7 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
if err != nil {
return "", err
}
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
}
@ -304,7 +313,7 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
}
func joinCgroups(c *configs.Cgroup, pid int) error {
for _, sys := range subsystems {
for _, sys := range legacySubsystems {
name := sys.Name()
switch name {
case "name=systemd":
@ -399,14 +408,14 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
}
func (m *Manager) Freeze(state configs.FreezerState) error {
func (m *LegacyManager) Freeze(state configs.FreezerState) error {
path, err := getSubsystemPath(m.Cgroups, "freezer")
if err != nil {
return err
}
prevState := m.Cgroups.Resources.Freezer
m.Cgroups.Resources.Freezer = state
freezer, err := subsystems.Get("freezer")
freezer, err := legacySubsystems.Get("freezer")
if err != nil {
return err
}
@ -418,7 +427,7 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
return nil
}
func (m *Manager) GetPids() ([]int, error) {
func (m *LegacyManager) GetPids() ([]int, error) {
path, err := getSubsystemPath(m.Cgroups, "devices")
if err != nil {
return nil, err
@ -426,7 +435,7 @@ func (m *Manager) GetPids() ([]int, error) {
return cgroups.GetPids(path)
}
func (m *Manager) GetAllPids() ([]int, error) {
func (m *LegacyManager) GetAllPids() ([]int, error) {
path, err := getSubsystemPath(m.Cgroups, "devices")
if err != nil {
return nil, err
@ -434,12 +443,12 @@ func (m *Manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(path)
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
func (m *LegacyManager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
for name, path := range m.Paths {
sys, err := subsystems.Get(name)
sys, err := legacySubsystems.Get(name)
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
continue
}
@ -451,13 +460,13 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
return stats, nil
}
func (m *Manager) Set(container *configs.Config) error {
func (m *LegacyManager) Set(container *configs.Config) error {
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.Cgroups.Paths != nil {
return nil
}
for _, sys := range subsystems {
for _, sys := range legacySubsystems {
// Get the subsystem path, but don't error out for not found cgroups.
path, err := getSubsystemPath(container.Cgroups, sys.Name())
if err != nil && !cgroups.IsNotFound(err) {

View File

@ -0,0 +1,329 @@
// +build linux,!static_build
package systemd
import (
"fmt"
"io/ioutil"
"math"
"os"
"path/filepath"
"strings"
"sync"
"time"
systemdDbus "github.com/coreos/go-systemd/dbus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
)
type UnifiedManager struct {
mu sync.Mutex
Cgroups *configs.Cgroup
Paths map[string]string
}
var unifiedSubsystems = subsystemSet{
&fs.CpusetGroupV2{},
&fs.FreezerGroupV2{},
&fs.CpuGroupV2{},
&fs.MemoryGroupV2{},
&fs.IOGroupV2{},
&fs.PidsGroupV2{},
}
func (m *UnifiedManager) Apply(pid int) error {
var (
c = m.Cgroups
unitName = getUnitName(c)
slice = "system.slice"
properties []systemdDbus.Property
)
if c.Paths != nil {
paths := make(map[string]string)
for name, path := range c.Paths {
_, err := getSubsystemPath(m.Cgroups, name)
if err != nil {
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[name] = path
}
m.Paths = paths
return cgroups.EnterPid(m.Paths, pid)
}
if c.Parent != "" {
slice = c.Parent
}
properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
// if we create a slice, the parent is defined via a Wants=
if strings.HasSuffix(unitName, ".slice") {
properties = append(properties, systemdDbus.PropWants(slice))
} else {
// otherwise, we use Slice=
properties = append(properties, systemdDbus.PropSlice(slice))
}
// only add pid if its valid, -1 is used w/ general slice creation.
if pid != -1 {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}
// Check if we can delegate. This is only supported on systemd versions 218 and above.
if !strings.HasSuffix(unitName, ".slice") {
// Assume scopes always support delegation.
properties = append(properties, newProp("Delegate", true))
}
// Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties,
newProp("MemoryAccounting", true),
newProp("CPUAccounting", true),
newProp("BlockIOAccounting", true))
// Assume DefaultDependencies= will always work (the check for it was previously broken.)
properties = append(properties,
newProp("DefaultDependencies", false))
if c.Resources.Memory != 0 {
properties = append(properties,
newProp("MemoryLimit", uint64(c.Resources.Memory)))
}
if c.Resources.CpuShares != 0 {
properties = append(properties,
newProp("CPUShares", c.Resources.CpuShares))
}
// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
// corresponds to USEC_INFINITY in systemd
// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
// always setting a property value ensures we can apply a quota and remove it later
cpuQuotaPerSecUSec := uint64(math.MaxUint64)
if c.Resources.CpuQuota > 0 {
// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
// (integer percentage of CPU) internally. This means that if a fractional percent of
// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
if cpuQuotaPerSecUSec%10000 != 0 {
cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
}
}
properties = append(properties,
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
}
if c.Resources.BlkioWeight != 0 {
properties = append(properties,
newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
}
if c.Resources.PidsLimit > 0 {
properties = append(properties,
newProp("TasksAccounting", true),
newProp("TasksMax", uint64(c.Resources.PidsLimit)))
}
// We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup.
if c.Resources.KernelMemory != 0 {
if err := setKernelMemory(c); err != nil {
return err
}
}
statusChan := make(chan string, 1)
if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
select {
case <-statusChan:
case <-time.After(time.Second):
logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
}
} else if !isUnitExists(err) {
return err
}
if err := joinCgroupsV2(c, pid); err != nil {
return err
}
paths := make(map[string]string)
for _, s := range unifiedSubsystems {
subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name())
if err != nil {
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[s.Name()] = subsystemPath
}
m.Paths = paths
return nil
}
func (m *UnifiedManager) Destroy() error {
if m.Cgroups.Paths != nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
if err := cgroups.RemovePaths(m.Paths); err != nil {
return err
}
m.Paths = make(map[string]string)
return nil
}
func (m *UnifiedManager) GetPaths() map[string]string {
m.mu.Lock()
paths := m.Paths
m.mu.Unlock()
return paths
}
func createCgroupsv2Path(path string) (Err error) {
content, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
if err != nil {
return err
}
if !filepath.HasPrefix(path, "/sys/fs/cgroup") {
return fmt.Errorf("invalid cgroup path %s", path)
}
res := ""
for i, c := range strings.Split(strings.TrimSpace(string(content)), " ") {
if i == 0 {
res = fmt.Sprintf("+%s", c)
} else {
res = res + fmt.Sprintf(" +%s", c)
}
}
resByte := []byte(res)
current := "/sys/fs"
elements := strings.Split(path, "/")
for i, e := range elements[3:] {
current = filepath.Join(current, e)
if i > 0 {
if err := os.Mkdir(current, 0755); err != nil {
if !os.IsExist(err) {
return err
}
} else {
// If the directory was created, be sure it is not left around on errors.
defer func() {
if Err != nil {
os.Remove(current)
}
}()
}
}
if i < len(elements[3:])-1 {
if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), resByte, 0755); err != nil {
return err
}
}
}
return nil
}
func joinCgroupsV2(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "memory")
if err != nil {
return err
}
return createCgroupsv2Path(path)
}
func (m *UnifiedManager) Freeze(state configs.FreezerState) error {
path, err := getSubsystemPath(m.Cgroups, "freezer")
if err != nil {
return err
}
prevState := m.Cgroups.Resources.Freezer
m.Cgroups.Resources.Freezer = state
freezer, err := unifiedSubsystems.Get("freezer")
if err != nil {
return err
}
err = freezer.Set(path, m.Cgroups)
if err != nil {
m.Cgroups.Resources.Freezer = prevState
return err
}
return nil
}
func (m *UnifiedManager) GetPids() ([]int, error) {
path, err := getSubsystemPath(m.Cgroups, "devices")
if err != nil {
return nil, err
}
return cgroups.GetPids(path)
}
func (m *UnifiedManager) GetAllPids() ([]int, error) {
path, err := getSubsystemPath(m.Cgroups, "devices")
if err != nil {
return nil, err
}
return cgroups.GetAllPids(path)
}
func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
for name, path := range m.Paths {
sys, err := unifiedSubsystems.Get(name)
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
continue
}
if err := sys.GetStats(path, stats); err != nil {
return nil, err
}
}
return stats, nil
}
func (m *UnifiedManager) Set(container *configs.Config) error {
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.Cgroups.Paths != nil {
return nil
}
for _, sys := range unifiedSubsystems {
// Get the subsystem path, but don't error out for not found cgroups.
path, err := getSubsystemPath(container.Cgroups, sys.Name())
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := sys.Set(path, container.Cgroups); err != nil {
return err
}
}
if m.Paths["cpu"] != "" {
if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
return err
}
}
return nil
}

View File

@ -11,6 +11,8 @@ import (
"path/filepath"
"strconv"
"strings"
"sync"
"syscall"
"time"
units "github.com/docker/go-units"
@ -22,6 +24,11 @@ const (
CgroupProcesses = "cgroup.procs"
)
var (
isUnifiedOnce sync.Once
isUnified bool
)
// HugePageSizeUnitList is a list of the units used by the linux kernel when
// naming the HugePage control files.
// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
@ -29,6 +36,18 @@ const (
// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
func IsCgroup2UnifiedMode() bool {
isUnifiedOnce.Do(func() {
var st syscall.Statfs_t
if err := syscall.Statfs("/sys/fs/cgroup", &st); err != nil {
panic("cannot statfs cgroup root")
}
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
})
return isUnified
}
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem)
@ -49,6 +68,10 @@ func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string,
}
defer f.Close()
if IsCgroup2UnifiedMode() {
subsystem = ""
}
return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
}
@ -57,12 +80,12 @@ func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsyst
for scanner.Scan() {
txt := scanner.Text()
fields := strings.Fields(txt)
if len(fields) < 5 {
if len(fields) < 9 {
continue
}
if strings.HasPrefix(fields[4], cgroupPath) {
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if opt == subsystem {
if (subsystem == "" && fields[9] == "cgroup2") || opt == subsystem {
return fields[4], fields[3], nil
}
}
@ -76,6 +99,19 @@ func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsyst
}
func isSubsystemAvailable(subsystem string) bool {
if IsCgroup2UnifiedMode() {
controllers, err := GetAllSubsystems()
if err != nil {
return false
}
for _, c := range controllers {
if c == subsystem {
return true
}
}
return false
}
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return false
@ -120,7 +156,7 @@ func FindCgroupMountpointDir() (string, error) {
return "", fmt.Errorf("Found no fields post '-' in %q", text)
}
if postSeparatorFields[0] == "cgroup" {
if postSeparatorFields[0] == "cgroup" || postSeparatorFields[0] == "cgroup2" {
// Check that the mount is properly formatted.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
@ -193,6 +229,19 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
// GetCgroupMounts returns the mounts for the cgroup subsystems.
// all indicates whether to return just the first instance or all the mounts.
func GetCgroupMounts(all bool) ([]Mount, error) {
if IsCgroup2UnifiedMode() {
availableControllers, err := GetAllSubsystems()
if err != nil {
return nil, err
}
m := Mount{
Mountpoint: "/sys/fs/cgroup",
Root: "/sys/fs/cgroup",
Subsystems: availableControllers,
}
return []Mount{m}, nil
}
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return nil, err
@ -356,6 +405,9 @@ func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
}
func getControllerPath(subsystem string, cgroups map[string]string) (string, error) {
if IsCgroup2UnifiedMode() {
return "/", nil
}
if p, ok := cgroups[subsystem]; ok {
return p, nil

View File

@ -59,3 +59,8 @@ func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice {
func (td *ThrottleDevice) String() string {
return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)
}
// StringName formats the struct to be writable to the cgroup specific file
func (td *ThrottleDevice) StringName(name string) string {
return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate)
}

View File

@ -119,4 +119,12 @@ type Resources struct {
// Set class identifier for container's network packets
NetClsClassid uint32 `json:"net_cls_classid_u"`
// Used on cgroups v2:
// CpuWeight sets a proportional bandwidth limit.
CpuWeight uint64 `json:"cpu_weight"`
// CpuMax sets she maximum bandwidth limit (format: max period).
CpuMax string `json:"cpu_max"`
}

View File

@ -1814,7 +1814,14 @@ func (c *linuxContainer) isPaused() (bool, error) {
// A container doesn't have a freezer cgroup
return false, nil
}
data, err := ioutil.ReadFile(filepath.Join(fcg, "freezer.state"))
pausedState := "FROZEN"
filename := "freezer.state"
if cgroups.IsCgroup2UnifiedMode() {
filename = "cgroup.freeze"
pausedState = "1"
}
data, err := ioutil.ReadFile(filepath.Join(fcg, filename))
if err != nil {
// If freezer cgroup is not mounted, the container would just be not paused.
if os.IsNotExist(err) {
@ -1822,7 +1829,7 @@ func (c *linuxContainer) isPaused() (bool, error) {
}
return false, newSystemErrorWithCause(err, "checking if container is paused")
}
return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
return bytes.Equal(bytes.TrimSpace(data), []byte(pausedState)), nil
}
func (c *linuxContainer) currentState() (*State, error) {