2014-10-23 04:45:23 +08:00
|
|
|
// +build linux
|
|
|
|
|
|
|
|
package libcontainer
|
|
|
|
|
|
|
|
import (
|
|
|
|
"encoding/json"
|
2014-11-01 04:56:53 +08:00
|
|
|
"fmt"
|
2014-10-23 04:45:23 +08:00
|
|
|
"os"
|
|
|
|
"path/filepath"
|
2014-11-01 04:56:53 +08:00
|
|
|
"regexp"
|
2016-03-23 06:41:49 +08:00
|
|
|
"runtime/debug"
|
2015-04-09 05:14:51 +08:00
|
|
|
"strconv"
|
2014-10-31 06:08:28 +08:00
|
|
|
|
2018-09-04 09:02:18 +08:00
|
|
|
"github.com/cyphar/filepath-securejoin"
|
2015-06-22 10:29:59 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/configs/validate"
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
More information about Intel RDT/CAT can be found in section 17.17 of the
Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belong to this group (e.g., the
"<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
is in the root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
A valid L3 cache CBM is a *contiguous set of bits*, and the number of bits that
can be set is less than the max bit. The max number of bits in the CBM varies
among supported Intel Xeon platforms. In the Intel RDT "resource control"
filesystem layout, the CBM in a group should be a subset of the CBM in root. The
kernel will check whether it is valid when writing. E.g., 0xfffff in root
indicates that the max length of the CBM is 20 bits, which maps to the entire L3
cache capacity. Some valid CBM values to set in a group: 0xf, 0xf0, 0x3ff,
0x1f00, etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/intelrdt"
|
2017-11-08 22:25:07 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/mount"
|
2016-01-26 10:15:44 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
2017-05-10 05:38:27 +08:00
|
|
|
|
|
|
|
"golang.org/x/sys/unix"
|
2014-10-23 04:45:23 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
const (
	// stateFilename is the file name used for a container's state file.
	// (The code that reads and writes it is outside this part of the file.)
	stateFilename = "state.json"
	// execFifoFilename is the file name used for a container's exec FIFO.
	// (The code that creates and opens it is outside this part of the file.)
	execFifoFilename = "exec.fifo"
)

// idRegex matches the strings accepted as container IDs: one or more word
// characters (letters, digits, underscore), '+', '.' or '-'.
//
// The '-' is deliberately placed last in the character class so that it is a
// literal hyphen. Written as `+-\.` it forms a range from '+' to '.', which
// also admits ','.
var idRegex = regexp.MustCompile(`^[\w+.-]+$`)
|
2014-11-01 04:56:53 +08:00
|
|
|
|
2015-02-14 07:43:14 +08:00
|
|
|
// InitArgs returns an options func to configure a LinuxFactory with the
|
2016-07-06 21:58:30 +08:00
|
|
|
// provided init binary path and arguments.
|
2015-02-14 07:43:14 +08:00
|
|
|
func InitArgs(args ...string) func(*LinuxFactory) error {
|
2017-01-25 06:53:59 +08:00
|
|
|
return func(l *LinuxFactory) (err error) {
|
|
|
|
if len(args) > 0 {
|
|
|
|
// Resolve relative paths to ensure that its available
|
|
|
|
// after directory changes.
|
|
|
|
if args[0], err = filepath.Abs(args[0]); err != nil {
|
|
|
|
return newGenericError(err, ConfigInvalid)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-14 07:43:14 +08:00
|
|
|
l.InitArgs = args
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// SystemdCgroups is an options func to configure a LinuxFactory to return
|
|
|
|
// containers that use systemd to create and manage cgroups.
|
|
|
|
func SystemdCgroups(l *LinuxFactory) error {
|
2019-05-02 04:22:19 +08:00
|
|
|
systemdCgroupsManager, err := systemd.NewSystemdCgroupsManager()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
2015-02-14 07:43:14 +08:00
|
|
|
}
|
2019-05-02 04:22:19 +08:00
|
|
|
l.NewCgroupsManager = systemdCgroupsManager
|
2015-02-14 07:43:14 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2018-03-16 08:33:04 +08:00
|
|
|
// Cgroupfs is an options func to configure a LinuxFactory to return containers
|
|
|
|
// that use the native cgroups filesystem implementation to create and manage
|
|
|
|
// cgroups.
|
2015-02-14 07:43:14 +08:00
|
|
|
func Cgroupfs(l *LinuxFactory) error {
|
|
|
|
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
|
|
|
|
return &fs.Manager{
|
|
|
|
Cgroups: config,
|
|
|
|
Paths: paths,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2018-03-16 08:33:04 +08:00
|
|
|
// RootlessCgroupfs is an options func to configure a LinuxFactory to return
|
|
|
|
// containers that use the native cgroups filesystem implementation to create
|
|
|
|
// and manage cgroups. The difference between RootlessCgroupfs and Cgroupfs is
|
|
|
|
// that RootlessCgroupfs can transparently handle permission errors that occur
|
Disable rootless mode except RootlessCgMgr when executed as the root in userns
This PR decomposes `libcontainer/configs.Config.Rootless bool` into `RootlessEUID bool` and
`RootlessCgroups bool`, so as to make "runc-in-userns" to be more compatible with "rootful" runc.
`RootlessEUID` denotes that runc is being executed as a non-root user (euid != 0) in
the current user namespace. `RootlessEUID` is almost identical to the former `Rootless`
except cgroups stuff.
`RootlessCgroups` denotes that runc is unlikely to have the full access to cgroups.
`RootlessCgroups` is set to false if runc is executed as the root (euid == 0) in the initial namespace.
Otherwise `RootlessCgroups` is set to true.
(Hint: if `RootlessEUID` is true, `RootlessCgroups` becomes true as well)
When runc is executed as the root (euid == 0) in an user namespace (e.g. by Docker-in-LXD, Podman, Usernetes),
`RootlessEUID` is set to false but `RootlessCgroups` is set to true.
So, "runc-in-userns" behaves almost same as "rootful" runc except that cgroups errors are ignored.
This PR does not have any impact on CLI flags and `state.json`.
Note about CLI:
* Now `runc --rootless=(auto|true|false)` CLI flag is only used for setting `RootlessCgroups`.
* Now `runc spec --rootless` is only required when `RootlessEUID` is set to true.
For runc-in-userns, `runc spec` without `--rootless` should work, when sufficient numbers of
UID/GID are mapped.
Note about `$XDG_RUNTIME_DIR` (e.g. `/run/user/1000`):
* `$XDG_RUNTIME_DIR` is ignored if runc is being executed as the root (euid == 0) in the initial namespace, for backward compatibility.
(`/run/runc` is used)
* If runc is executed as the root (euid == 0) in an user namespace, `$XDG_RUNTIME_DIR` is honored if `$USER != "" && $USER != "root"`.
This allows unprivileged users to allow execute runc as the root in userns, without mounting writable `/run/runc`.
Note about `state.json`:
* `rootless` is set to true when `RootlessEUID == true && RootlessCgroups == true`.
Signed-off-by: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
2018-07-05 14:28:21 +08:00
|
|
|
// during rootless container (including euid=0 in userns) setup (while still allowing cgroup usage if
|
2018-03-16 08:33:04 +08:00
|
|
|
// they've been set up properly).
|
|
|
|
func RootlessCgroupfs(l *LinuxFactory) error {
|
|
|
|
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
|
|
|
|
return &fs.Manager{
|
|
|
|
Cgroups: config,
|
|
|
|
Rootless: true,
|
|
|
|
Paths: paths,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
More information about Intel RDT/CAT can be found in section 17.17 of the
Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belong to this group (e.g., the
"<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
is in the root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
A valid L3 cache CBM is a *contiguous set of bits*, and the number of bits that
can be set is less than the max bit. The max number of bits in the CBM varies
among supported Intel Xeon platforms. In the Intel RDT "resource control"
filesystem layout, the CBM in a group should be a subset of the CBM in root. The
kernel will check whether it is valid when writing. E.g., 0xfffff in root
indicates that the max length of the CBM is 20 bits, which maps to the entire L3
cache capacity. Some valid CBM values to set in a group: 0xf, 0xf0, 0x3ff,
0x1f00, etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
// IntelRdtfs is an options func to configure a LinuxFactory to return
|
|
|
|
// containers that use the Intel RDT "resource control" filesystem to
|
2018-10-16 12:37:41 +08:00
|
|
|
// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth).
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
func IntelRdtFs(l *LinuxFactory) error {
|
|
|
|
l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager {
|
|
|
|
return &intelrdt.IntelRdtManager{
|
|
|
|
Config: config,
|
|
|
|
Id: id,
|
|
|
|
Path: path,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-03-20 01:17:32 +08:00
|
|
|
// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
|
|
|
|
func TmpfsRoot(l *LinuxFactory) error {
|
|
|
|
mounted, err := mount.Mounted(l.Root)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if !mounted {
|
2017-05-10 05:38:27 +08:00
|
|
|
if err := unix.Mount("tmpfs", l.Root, "tmpfs", 0, ""); err != nil {
|
2015-03-20 01:17:32 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-07-07 10:58:55 +08:00
|
|
|
// CriuPath returns an option func to configure a LinuxFactory with the
|
|
|
|
// provided criupath
|
|
|
|
func CriuPath(criupath string) func(*LinuxFactory) error {
|
|
|
|
return func(l *LinuxFactory) error {
|
|
|
|
l.CriuPath = criupath
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-14 07:43:14 +08:00
|
|
|
// New returns a linux based container factory based in the root directory and
|
|
|
|
// configures the factory with the provided option funcs.
|
|
|
|
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
|
2014-12-15 23:05:11 +08:00
|
|
|
if root != "" {
|
|
|
|
if err := os.MkdirAll(root, 0700); err != nil {
|
|
|
|
return nil, newGenericError(err, SystemError)
|
|
|
|
}
|
2014-10-23 04:45:23 +08:00
|
|
|
}
|
2015-02-14 07:43:14 +08:00
|
|
|
l := &LinuxFactory{
|
|
|
|
Root: root,
|
2017-05-19 12:54:02 +08:00
|
|
|
InitPath: "/proc/self/exe",
|
|
|
|
InitArgs: []string{os.Args[0], "init"},
|
2015-02-14 07:43:14 +08:00
|
|
|
Validator: validate.New(),
|
2015-05-19 05:52:26 +08:00
|
|
|
CriuPath: "criu",
|
2015-02-14 07:43:14 +08:00
|
|
|
}
|
|
|
|
Cgroupfs(l)
|
|
|
|
for _, opt := range options {
|
2017-07-21 01:33:01 +08:00
|
|
|
if opt == nil {
|
|
|
|
continue
|
|
|
|
}
|
2015-02-14 07:43:14 +08:00
|
|
|
if err := opt(l); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return l, nil
|
2014-10-23 04:45:23 +08:00
|
|
|
}
|
|
|
|
|
2015-02-14 07:43:14 +08:00
|
|
|
// LinuxFactory implements the default factory interface for linux based systems.
|
|
|
|
type LinuxFactory struct {
|
|
|
|
// Root directory for the factory to store state.
|
|
|
|
Root string
|
|
|
|
|
2017-05-19 12:54:02 +08:00
|
|
|
// InitPath is the path for calling the init responsibilities for spawning
|
|
|
|
// a container.
|
|
|
|
InitPath string
|
|
|
|
|
2015-02-14 07:43:14 +08:00
|
|
|
// InitArgs are arguments for calling the init responsibilities for spawning
|
|
|
|
// a container.
|
|
|
|
InitArgs []string
|
|
|
|
|
2015-03-07 03:21:02 +08:00
|
|
|
// CriuPath is the path to the criu binary used for checkpoint and restore of
|
|
|
|
// containers.
|
|
|
|
CriuPath string
|
|
|
|
|
2017-07-21 01:33:01 +08:00
|
|
|
// New{u,g}uidmapPath is the path to the binaries used for mapping with
|
|
|
|
// rootless containers.
|
|
|
|
NewuidmapPath string
|
|
|
|
NewgidmapPath string
|
|
|
|
|
2015-02-14 07:43:14 +08:00
|
|
|
// Validator provides validation to container configurations.
|
|
|
|
Validator validate.Validator
|
|
|
|
|
|
|
|
// NewCgroupsManager returns an initialized cgroups manager for a single container.
|
|
|
|
NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
|
|
|
|
// NewIntelRdtManager returns an initialized Intel RDT manager for a single container.
|
|
|
|
NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager
|
2014-10-23 04:45:23 +08:00
|
|
|
}
|
|
|
|
|
2015-02-14 07:43:14 +08:00
|
|
|
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
|
|
|
|
if l.Root == "" {
|
2014-12-15 23:05:11 +08:00
|
|
|
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
|
|
|
|
}
|
2015-02-01 12:51:12 +08:00
|
|
|
if err := l.validateID(id); err != nil {
|
|
|
|
return nil, err
|
2014-11-04 01:42:20 +08:00
|
|
|
}
|
2015-02-14 07:43:14 +08:00
|
|
|
if err := l.Validator.Validate(config); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return nil, newGenericError(err, ConfigInvalid)
|
|
|
|
}
|
2018-09-04 09:02:18 +08:00
|
|
|
containerRoot, err := securejoin.SecureJoin(l.Root, id)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2015-02-01 12:51:12 +08:00
|
|
|
if _, err := os.Stat(containerRoot); err == nil {
|
2015-10-29 22:15:26 +08:00
|
|
|
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
|
2014-11-04 01:42:20 +08:00
|
|
|
} else if !os.IsNotExist(err) {
|
|
|
|
return nil, newGenericError(err, SystemError)
|
2014-11-01 04:56:53 +08:00
|
|
|
}
|
2016-06-14 08:21:28 +08:00
|
|
|
if err := os.MkdirAll(containerRoot, 0711); err != nil {
|
|
|
|
return nil, newGenericError(err, SystemError)
|
|
|
|
}
|
2017-07-12 04:27:01 +08:00
|
|
|
if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
|
2016-06-14 08:21:28 +08:00
|
|
|
return nil, newGenericError(err, SystemError)
|
|
|
|
}
|
2015-10-03 02:16:50 +08:00
|
|
|
c := &linuxContainer{
|
2014-12-15 23:05:11 +08:00
|
|
|
id: id,
|
|
|
|
root: containerRoot,
|
|
|
|
config: config,
|
2017-05-19 12:54:02 +08:00
|
|
|
initPath: l.InitPath,
|
2015-02-14 07:43:14 +08:00
|
|
|
initArgs: l.InitArgs,
|
2015-03-07 03:21:02 +08:00
|
|
|
criuPath: l.CriuPath,
|
2017-07-21 01:33:01 +08:00
|
|
|
newuidmapPath: l.NewuidmapPath,
|
|
|
|
newgidmapPath: l.NewgidmapPath,
|
2015-02-14 07:43:14 +08:00
|
|
|
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
|
2015-10-03 02:16:50 +08:00
|
|
|
}
|
2018-10-16 12:37:41 +08:00
|
|
|
if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
|
|
|
|
}
|
2015-10-03 02:16:50 +08:00
|
|
|
c.state = &stoppedState{c: c}
|
|
|
|
return c, nil
|
2014-10-23 04:45:23 +08:00
|
|
|
}
|
|
|
|
|
2015-02-14 07:43:14 +08:00
|
|
|
func (l *LinuxFactory) Load(id string) (Container, error) {
|
|
|
|
if l.Root == "" {
|
2014-12-15 23:05:11 +08:00
|
|
|
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
|
|
|
|
}
|
2018-08-31 11:17:42 +08:00
|
|
|
//when load, we need to check id is valid or not.
|
|
|
|
if err := l.validateID(id); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2018-09-04 09:02:18 +08:00
|
|
|
containerRoot, err := securejoin.SecureJoin(l.Root, id)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-08-26 01:23:35 +08:00
|
|
|
state, err := l.loadState(containerRoot, id)
|
2014-10-23 04:45:23 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2015-03-13 12:45:43 +08:00
|
|
|
r := &nonChildProcess{
|
2015-02-12 06:45:07 +08:00
|
|
|
processPid: state.InitProcessPid,
|
|
|
|
processStartTime: state.InitProcessStartTime,
|
2015-04-29 19:35:21 +08:00
|
|
|
fds: state.ExternalDescriptors,
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
2015-10-03 02:16:50 +08:00
|
|
|
c := &linuxContainer{
|
2016-07-05 08:24:13 +08:00
|
|
|
initProcess: r,
|
|
|
|
initProcessStartTime: state.InitProcessStartTime,
|
|
|
|
id: id,
|
|
|
|
config: &state.Config,
|
2017-05-19 12:54:02 +08:00
|
|
|
initPath: l.InitPath,
|
2016-07-05 08:24:13 +08:00
|
|
|
initArgs: l.InitArgs,
|
|
|
|
criuPath: l.CriuPath,
|
2017-07-21 01:33:01 +08:00
|
|
|
newuidmapPath: l.NewuidmapPath,
|
|
|
|
newgidmapPath: l.NewgidmapPath,
|
2016-07-05 08:24:13 +08:00
|
|
|
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
|
|
|
|
root: containerRoot,
|
|
|
|
created: state.Created,
|
2015-10-03 02:16:50 +08:00
|
|
|
}
|
2016-05-14 08:01:12 +08:00
|
|
|
c.state = &loadedState{c: c}
|
2016-01-22 08:48:05 +08:00
|
|
|
if err := c.refreshState(); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2018-10-16 12:37:41 +08:00
|
|
|
if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath)
|
|
|
|
}
|
2015-10-03 02:16:50 +08:00
|
|
|
return c, nil
|
2014-10-23 04:45:23 +08:00
|
|
|
}
|
|
|
|
|
2015-03-12 02:44:56 +08:00
|
|
|
func (l *LinuxFactory) Type() string {
|
|
|
|
return "libcontainer"
|
|
|
|
}
|
|
|
|
|
2015-02-01 12:51:12 +08:00
|
|
|
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
|
|
|
|
// This is a low level implementation detail of the reexec and should not be consumed externally
|
2015-04-09 05:14:51 +08:00
|
|
|
func (l *LinuxFactory) StartInitialization() (err error) {
|
2016-11-28 22:25:06 +08:00
|
|
|
var (
|
2017-08-24 15:37:26 +08:00
|
|
|
pipefd, fifofd int
|
2017-03-03 04:53:06 +08:00
|
|
|
consoleSocket *os.File
|
2016-11-28 22:25:06 +08:00
|
|
|
envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE")
|
2017-08-24 15:37:26 +08:00
|
|
|
envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD")
|
2017-03-03 04:53:06 +08:00
|
|
|
envConsole = os.Getenv("_LIBCONTAINER_CONSOLE")
|
2016-11-28 22:25:06 +08:00
|
|
|
)
|
2016-06-07 04:15:18 +08:00
|
|
|
|
2016-11-28 22:25:06 +08:00
|
|
|
// Get the INITPIPE.
|
|
|
|
pipefd, err = strconv.Atoi(envInitPipe)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
|
2015-04-09 05:14:51 +08:00
|
|
|
}
|
2016-11-28 22:25:06 +08:00
|
|
|
|
2015-02-07 04:48:57 +08:00
|
|
|
var (
|
|
|
|
pipe = os.NewFile(uintptr(pipefd), "pipe")
|
|
|
|
it = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
|
|
|
|
)
|
2017-01-25 07:07:19 +08:00
|
|
|
defer pipe.Close()
|
|
|
|
|
2017-08-24 15:37:26 +08:00
|
|
|
// Only init processes have FIFOFD.
|
|
|
|
fifofd = -1
|
2016-11-28 22:25:06 +08:00
|
|
|
if it == initStandard {
|
2017-08-24 15:37:26 +08:00
|
|
|
if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
|
|
|
|
return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
|
2016-11-28 22:25:06 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-03 04:53:06 +08:00
|
|
|
if envConsole != "" {
|
|
|
|
console, err := strconv.Atoi(envConsole)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err)
|
|
|
|
}
|
|
|
|
consoleSocket = os.NewFile(uintptr(console), "console-socket")
|
|
|
|
defer consoleSocket.Close()
|
|
|
|
}
|
|
|
|
|
2015-02-07 04:48:57 +08:00
|
|
|
// clear the current process's environment to clean any libcontainer
|
|
|
|
// specific env vars.
|
|
|
|
os.Clearenv()
|
2016-06-07 04:15:18 +08:00
|
|
|
|
2016-03-10 09:48:12 +08:00
|
|
|
defer func() {
|
|
|
|
// We have an error during the initialization of the container's init,
|
|
|
|
// send it back to the parent process in the form of an initError.
|
2016-06-06 18:26:35 +08:00
|
|
|
if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
|
|
|
|
fmt.Fprintln(os.Stderr, err)
|
|
|
|
return
|
2015-02-01 13:21:06 +08:00
|
|
|
}
|
2016-06-14 08:21:28 +08:00
|
|
|
if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
|
2016-10-18 06:54:51 +08:00
|
|
|
fmt.Fprintln(os.Stderr, err)
|
|
|
|
return
|
2016-03-10 09:48:12 +08:00
|
|
|
}
|
|
|
|
}()
|
2016-03-23 06:41:49 +08:00
|
|
|
defer func() {
|
|
|
|
if e := recover(); e != nil {
|
|
|
|
err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
|
|
|
|
}
|
|
|
|
}()
|
2017-01-25 07:07:19 +08:00
|
|
|
|
2017-08-24 15:37:26 +08:00
|
|
|
i, err := newContainerInit(it, pipe, consoleSocket, fifofd)
|
2016-03-10 09:48:12 +08:00
|
|
|
if err != nil {
|
|
|
|
return err
|
2015-02-01 13:21:06 +08:00
|
|
|
}
|
2017-01-25 07:07:19 +08:00
|
|
|
|
|
|
|
// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
|
2016-06-07 04:15:18 +08:00
|
|
|
return i.Init()
|
2015-02-01 12:51:12 +08:00
|
|
|
}
|
|
|
|
|
2016-08-26 01:23:35 +08:00
|
|
|
func (l *LinuxFactory) loadState(root, id string) (*State, error) {
|
2018-09-04 09:02:18 +08:00
|
|
|
stateFilePath, err := securejoin.SecureJoin(root, stateFilename)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
f, err := os.Open(stateFilePath)
|
2014-10-23 04:45:23 +08:00
|
|
|
if err != nil {
|
|
|
|
if os.IsNotExist(err) {
|
2016-09-28 18:37:19 +08:00
|
|
|
return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists)
|
2014-10-23 04:45:23 +08:00
|
|
|
}
|
|
|
|
return nil, newGenericError(err, SystemError)
|
|
|
|
}
|
|
|
|
defer f.Close()
|
2015-02-12 06:45:07 +08:00
|
|
|
var state *State
|
2014-10-23 04:45:23 +08:00
|
|
|
if err := json.NewDecoder(f).Decode(&state); err != nil {
|
|
|
|
return nil, newGenericError(err, SystemError)
|
|
|
|
}
|
|
|
|
return state, nil
|
|
|
|
}
|
2014-12-15 23:05:11 +08:00
|
|
|
|
2015-02-14 07:43:14 +08:00
|
|
|
func (l *LinuxFactory) validateID(id string) error {
|
2018-09-04 09:02:18 +08:00
|
|
|
if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) {
|
2015-10-29 22:15:26 +08:00
|
|
|
return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat)
|
2014-12-23 06:06:22 +08:00
|
|
|
}
|
2017-05-04 11:33:19 +08:00
|
|
|
|
2015-02-01 12:51:12 +08:00
|
|
|
return nil
|
2014-12-15 23:05:11 +08:00
|
|
|
}
|
2017-07-21 01:33:01 +08:00
|
|
|
|
|
|
|
// NewuidmapPath returns an option func to configure a LinuxFactory with the
|
|
|
|
// provided ..
|
|
|
|
func NewuidmapPath(newuidmapPath string) func(*LinuxFactory) error {
|
|
|
|
return func(l *LinuxFactory) error {
|
|
|
|
l.NewuidmapPath = newuidmapPath
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewgidmapPath returns an option func to configure a LinuxFactory with the
|
|
|
|
// provided ..
|
|
|
|
func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error {
|
|
|
|
return func(l *LinuxFactory) error {
|
|
|
|
l.NewgidmapPath = newgidmapPath
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|