2016-03-25 23:44:09 +08:00
|
|
|
// +build linux
|
|
|
|
|
2016-03-31 05:48:19 +08:00
|
|
|
// Package specconv implements conversion of specifications to libcontainer
// configurations.
|
|
|
|
package specconv
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"os"
|
|
|
|
"path/filepath"
|
|
|
|
"strings"
|
2016-03-30 02:14:59 +08:00
|
|
|
"time"
|
2016-03-25 23:44:09 +08:00
|
|
|
|
|
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/seccomp"
|
|
|
|
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
|
2016-04-13 04:35:51 +08:00
|
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
2017-05-10 05:38:27 +08:00
|
|
|
|
|
|
|
"golang.org/x/sys/unix"
|
2016-03-25 23:44:09 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
// wildcard is the value placed in a device's Major or Minor field by the
// allowedDevices entries that are meant to apply to any device number.
const wildcard = -1
|
|
|
|
|
2016-12-17 13:01:53 +08:00
|
|
|
var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{
|
2016-03-25 23:44:09 +08:00
|
|
|
specs.PIDNamespace: configs.NEWPID,
|
|
|
|
specs.NetworkNamespace: configs.NEWNET,
|
|
|
|
specs.MountNamespace: configs.NEWNS,
|
|
|
|
specs.UserNamespace: configs.NEWUSER,
|
|
|
|
specs.IPCNamespace: configs.NEWIPC,
|
|
|
|
specs.UTSNamespace: configs.NEWUTS,
|
|
|
|
}
|
|
|
|
|
|
|
|
var mountPropagationMapping = map[string]int{
|
2017-05-10 05:38:27 +08:00
|
|
|
"rprivate": unix.MS_PRIVATE | unix.MS_REC,
|
|
|
|
"private": unix.MS_PRIVATE,
|
|
|
|
"rslave": unix.MS_SLAVE | unix.MS_REC,
|
|
|
|
"slave": unix.MS_SLAVE,
|
|
|
|
"rshared": unix.MS_SHARED | unix.MS_REC,
|
|
|
|
"shared": unix.MS_SHARED,
|
|
|
|
"": unix.MS_PRIVATE | unix.MS_REC,
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
var allowedDevices = []*configs.Device{
|
|
|
|
// allow mknod for any device
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Major: wildcard,
|
|
|
|
Minor: wildcard,
|
|
|
|
Permissions: "m",
|
|
|
|
Allow: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'b',
|
|
|
|
Major: wildcard,
|
|
|
|
Minor: wildcard,
|
|
|
|
Permissions: "m",
|
|
|
|
Allow: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/null",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 3,
|
|
|
|
Permissions: "rwm",
|
|
|
|
Allow: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/random",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 8,
|
|
|
|
Permissions: "rwm",
|
|
|
|
Allow: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/full",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 7,
|
|
|
|
Permissions: "rwm",
|
|
|
|
Allow: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/tty",
|
|
|
|
Major: 5,
|
|
|
|
Minor: 0,
|
|
|
|
Permissions: "rwm",
|
|
|
|
Allow: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/zero",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 5,
|
|
|
|
Permissions: "rwm",
|
|
|
|
Allow: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/urandom",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 9,
|
|
|
|
Permissions: "rwm",
|
|
|
|
Allow: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Path: "/dev/console",
|
|
|
|
Type: 'c',
|
|
|
|
Major: 5,
|
|
|
|
Minor: 1,
|
|
|
|
Permissions: "rwm",
|
|
|
|
Allow: true,
|
|
|
|
},
|
|
|
|
// /dev/pts/ - pts namespaces are "coming soon"
|
|
|
|
{
|
|
|
|
Path: "",
|
|
|
|
Type: 'c',
|
|
|
|
Major: 136,
|
|
|
|
Minor: wildcard,
|
|
|
|
Permissions: "rwm",
|
|
|
|
Allow: true,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Path: "",
|
|
|
|
Type: 'c',
|
|
|
|
Major: 5,
|
|
|
|
Minor: 2,
|
|
|
|
Permissions: "rwm",
|
|
|
|
Allow: true,
|
|
|
|
},
|
|
|
|
// tuntap
|
|
|
|
{
|
|
|
|
Path: "",
|
|
|
|
Type: 'c',
|
|
|
|
Major: 10,
|
|
|
|
Minor: 200,
|
|
|
|
Permissions: "rwm",
|
|
|
|
Allow: true,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
2016-03-31 02:12:03 +08:00
|
|
|
type CreateOpts struct {
|
|
|
|
CgroupName string
|
|
|
|
UseSystemdCgroup bool
|
|
|
|
NoPivotRoot bool
|
2016-06-04 02:53:07 +08:00
|
|
|
NoNewKeyring bool
|
2016-03-31 02:12:03 +08:00
|
|
|
Spec *specs.Spec
|
2016-04-23 21:39:42 +08:00
|
|
|
Rootless bool
|
2016-03-31 02:12:03 +08:00
|
|
|
}
|
|
|
|
|
2016-03-25 23:44:09 +08:00
|
|
|
// CreateLibcontainerConfig creates a new libcontainer configuration from a
|
|
|
|
// given specification and a cgroup name
|
2016-03-31 02:12:03 +08:00
|
|
|
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
|
2016-03-25 23:44:09 +08:00
|
|
|
// runc's cwd will always be the bundle path
|
|
|
|
rcwd, err := os.Getwd()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
cwd, err := filepath.Abs(rcwd)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-03-31 02:12:03 +08:00
|
|
|
spec := opts.Spec
|
2017-07-13 00:00:49 +08:00
|
|
|
if spec.Root == nil {
|
|
|
|
return nil, fmt.Errorf("Root must be specified")
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
rootfsPath := spec.Root.Path
|
|
|
|
if !filepath.IsAbs(rootfsPath) {
|
|
|
|
rootfsPath = filepath.Join(cwd, rootfsPath)
|
|
|
|
}
|
2016-06-03 03:44:43 +08:00
|
|
|
labels := []string{}
|
|
|
|
for k, v := range spec.Annotations {
|
|
|
|
labels = append(labels, fmt.Sprintf("%s=%s", k, v))
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
config := &configs.Config{
|
2016-06-04 02:53:07 +08:00
|
|
|
Rootfs: rootfsPath,
|
|
|
|
NoPivotRoot: opts.NoPivotRoot,
|
|
|
|
Readonlyfs: spec.Root.Readonly,
|
|
|
|
Hostname: spec.Hostname,
|
|
|
|
Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)),
|
|
|
|
NoNewKeyring: opts.NoNewKeyring,
|
2016-04-23 21:39:42 +08:00
|
|
|
Rootless: opts.Rootless,
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
exists := false
|
|
|
|
for _, m := range spec.Mounts {
|
|
|
|
config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
|
|
|
|
}
|
|
|
|
if err := createDevices(spec, config); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if err := setupUserNamespace(spec, config); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-04-23 21:39:42 +08:00
|
|
|
c, err := createCgroupConfig(opts)
|
2016-03-25 23:44:09 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
config.Cgroups = c
|
2017-05-15 15:45:06 +08:00
|
|
|
// set linux-specific config
|
|
|
|
if spec.Linux != nil {
|
|
|
|
if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists {
|
|
|
|
return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation)
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, ns := range spec.Linux.Namespaces {
|
|
|
|
t, exists := namespaceMapping[ns.Type]
|
|
|
|
if !exists {
|
|
|
|
return nil, fmt.Errorf("namespace %q does not exist", ns)
|
|
|
|
}
|
|
|
|
if config.Namespaces.Contains(t) {
|
|
|
|
return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
|
|
|
|
}
|
|
|
|
config.Namespaces.Add(t, ns.Path)
|
|
|
|
}
|
2017-09-08 09:30:07 +08:00
|
|
|
if config.Namespaces.Contains(configs.NEWNET) {
|
|
|
|
config.Networks = []*configs.Network{
|
|
|
|
{
|
|
|
|
Type: "loopback",
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
config.MaskPaths = spec.Linux.MaskedPaths
|
|
|
|
config.ReadonlyPaths = spec.Linux.ReadonlyPaths
|
|
|
|
config.MountLabel = spec.Linux.MountLabel
|
|
|
|
config.Sysctl = spec.Linux.Sysctl
|
|
|
|
if spec.Linux.Seccomp != nil {
|
|
|
|
seccomp, err := setupSeccomp(spec.Linux.Seccomp)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
config.Seccomp = seccomp
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
}
|
2016-03-22 22:20:16 +08:00
|
|
|
if spec.Process.SelinuxLabel != "" {
|
|
|
|
config.ProcessLabel = spec.Process.SelinuxLabel
|
|
|
|
}
|
2017-06-02 07:17:21 +08:00
|
|
|
if spec.Process != nil && spec.Process.OOMScoreAdj != nil {
|
|
|
|
config.OomScoreAdj = *spec.Process.OOMScoreAdj
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2017-03-15 00:36:38 +08:00
|
|
|
if spec.Process.Capabilities != nil {
|
|
|
|
config.Capabilities = &configs.Capabilities{
|
|
|
|
Bounding: spec.Process.Capabilities.Bounding,
|
|
|
|
Effective: spec.Process.Capabilities.Effective,
|
|
|
|
Permitted: spec.Process.Capabilities.Permitted,
|
|
|
|
Inheritable: spec.Process.Capabilities.Inheritable,
|
|
|
|
Ambient: spec.Process.Capabilities.Ambient,
|
|
|
|
}
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
createHooks(spec, config)
|
|
|
|
config.Version = specs.Version
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
if spec.Linux.IntelRdt != nil {
|
|
|
|
config.IntelRdt = &configs.IntelRdt{}
|
|
|
|
if spec.Linux.IntelRdt.L3CacheSchema != "" {
|
|
|
|
config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema
|
|
|
|
}
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
return config, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
|
2016-05-27 03:36:29 +08:00
|
|
|
flags, pgflags, data, ext := parseMountOptions(m.Options)
|
2016-03-25 23:44:09 +08:00
|
|
|
source := m.Source
|
|
|
|
if m.Type == "bind" {
|
|
|
|
if !filepath.IsAbs(source) {
|
|
|
|
source = filepath.Join(cwd, m.Source)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return &configs.Mount{
|
|
|
|
Device: m.Type,
|
|
|
|
Source: source,
|
|
|
|
Destination: m.Destination,
|
|
|
|
Data: data,
|
|
|
|
Flags: flags,
|
|
|
|
PropagationFlags: pgflags,
|
2016-05-27 03:36:29 +08:00
|
|
|
Extensions: ext,
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-04-23 21:39:42 +08:00
|
|
|
func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
|
|
|
|
var (
|
|
|
|
myCgroupPath string
|
|
|
|
|
|
|
|
spec = opts.Spec
|
|
|
|
useSystemdCgroup = opts.UseSystemdCgroup
|
|
|
|
name = opts.CgroupName
|
|
|
|
)
|
2016-03-25 23:44:09 +08:00
|
|
|
|
|
|
|
c := &configs.Cgroup{
|
|
|
|
Resources: &configs.Resources{},
|
|
|
|
}
|
|
|
|
|
2017-03-15 00:36:38 +08:00
|
|
|
if spec.Linux != nil && spec.Linux.CgroupsPath != "" {
|
|
|
|
myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath)
|
2016-03-25 23:44:09 +08:00
|
|
|
if useSystemdCgroup {
|
2017-03-15 00:36:38 +08:00
|
|
|
myCgroupPath = spec.Linux.CgroupsPath
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if useSystemdCgroup {
|
|
|
|
if myCgroupPath == "" {
|
|
|
|
c.Parent = "system.slice"
|
|
|
|
c.ScopePrefix = "runc"
|
|
|
|
c.Name = name
|
|
|
|
} else {
|
|
|
|
// Parse the path from expected "slice:prefix:name"
|
|
|
|
// for e.g. "system.slice:docker:1234"
|
|
|
|
parts := strings.Split(myCgroupPath, ":")
|
|
|
|
if len(parts) != 3 {
|
|
|
|
return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups")
|
|
|
|
}
|
|
|
|
c.Parent = parts[0]
|
|
|
|
c.ScopePrefix = parts[1]
|
|
|
|
c.Name = parts[2]
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if myCgroupPath == "" {
|
2016-08-30 14:12:15 +08:00
|
|
|
c.Name = name
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
c.Path = myCgroupPath
|
|
|
|
}
|
|
|
|
|
2016-04-23 21:39:42 +08:00
|
|
|
// In rootless containers, any attempt to make cgroup changes will fail.
|
|
|
|
// libcontainer will validate this and we shouldn't add any cgroup options
|
|
|
|
// the user didn't specify.
|
|
|
|
if !opts.Rootless {
|
|
|
|
c.Resources.AllowedDevices = allowedDevices
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
if spec.Linux != nil {
|
|
|
|
r := spec.Linux.Resources
|
|
|
|
if r == nil {
|
|
|
|
return c, nil
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
for i, d := range spec.Linux.Resources.Devices {
|
|
|
|
var (
|
|
|
|
t = "a"
|
|
|
|
major = int64(-1)
|
|
|
|
minor = int64(-1)
|
|
|
|
)
|
|
|
|
if d.Type != "" {
|
|
|
|
t = d.Type
|
|
|
|
}
|
|
|
|
if d.Major != nil {
|
|
|
|
major = *d.Major
|
|
|
|
}
|
|
|
|
if d.Minor != nil {
|
|
|
|
minor = *d.Minor
|
|
|
|
}
|
|
|
|
if d.Access == "" {
|
|
|
|
return nil, fmt.Errorf("device access at %d field cannot be empty", i)
|
|
|
|
}
|
|
|
|
dt, err := stringToCgroupDeviceRune(t)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
dd := &configs.Device{
|
|
|
|
Type: dt,
|
|
|
|
Major: major,
|
|
|
|
Minor: minor,
|
|
|
|
Permissions: d.Access,
|
|
|
|
Allow: d.Allow,
|
|
|
|
}
|
|
|
|
c.Resources.Devices = append(c.Resources.Devices, dd)
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
if r.Memory != nil {
|
|
|
|
if r.Memory.Limit != nil {
|
|
|
|
c.Resources.Memory = *r.Memory.Limit
|
|
|
|
}
|
|
|
|
if r.Memory.Reservation != nil {
|
|
|
|
c.Resources.MemoryReservation = *r.Memory.Reservation
|
|
|
|
}
|
|
|
|
if r.Memory.Swap != nil {
|
|
|
|
c.Resources.MemorySwap = *r.Memory.Swap
|
|
|
|
}
|
|
|
|
if r.Memory.Kernel != nil {
|
|
|
|
c.Resources.KernelMemory = *r.Memory.Kernel
|
|
|
|
}
|
|
|
|
if r.Memory.KernelTCP != nil {
|
|
|
|
c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
|
|
|
|
}
|
|
|
|
if r.Memory.Swappiness != nil {
|
|
|
|
c.Resources.MemorySwappiness = r.Memory.Swappiness
|
|
|
|
}
|
|
|
|
if r.Memory.DisableOOMKiller != nil {
|
|
|
|
c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
if r.CPU != nil {
|
|
|
|
if r.CPU.Shares != nil {
|
|
|
|
c.Resources.CpuShares = *r.CPU.Shares
|
|
|
|
}
|
|
|
|
if r.CPU.Quota != nil {
|
|
|
|
c.Resources.CpuQuota = *r.CPU.Quota
|
|
|
|
}
|
|
|
|
if r.CPU.Period != nil {
|
|
|
|
c.Resources.CpuPeriod = *r.CPU.Period
|
|
|
|
}
|
|
|
|
if r.CPU.RealtimeRuntime != nil {
|
|
|
|
c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime
|
|
|
|
}
|
|
|
|
if r.CPU.RealtimePeriod != nil {
|
|
|
|
c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod
|
|
|
|
}
|
|
|
|
if r.CPU.Cpus != "" {
|
|
|
|
c.Resources.CpusetCpus = r.CPU.Cpus
|
|
|
|
}
|
|
|
|
if r.CPU.Mems != "" {
|
|
|
|
c.Resources.CpusetMems = r.CPU.Mems
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
if r.Pids != nil {
|
|
|
|
c.Resources.PidsLimit = r.Pids.Limit
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
if r.BlockIO != nil {
|
|
|
|
if r.BlockIO.Weight != nil {
|
|
|
|
c.Resources.BlkioWeight = *r.BlockIO.Weight
|
|
|
|
}
|
|
|
|
if r.BlockIO.LeafWeight != nil {
|
|
|
|
c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight
|
|
|
|
}
|
|
|
|
if r.BlockIO.WeightDevice != nil {
|
|
|
|
for _, wd := range r.BlockIO.WeightDevice {
|
|
|
|
var weight, leafWeight uint16
|
|
|
|
if wd.Weight != nil {
|
|
|
|
weight = *wd.Weight
|
|
|
|
}
|
|
|
|
if wd.LeafWeight != nil {
|
|
|
|
leafWeight = *wd.LeafWeight
|
|
|
|
}
|
|
|
|
weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight)
|
|
|
|
c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice)
|
2016-06-11 09:11:20 +08:00
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
}
|
|
|
|
if r.BlockIO.ThrottleReadBpsDevice != nil {
|
|
|
|
for _, td := range r.BlockIO.ThrottleReadBpsDevice {
|
|
|
|
rate := td.Rate
|
|
|
|
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
|
|
|
|
c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice)
|
2016-06-11 09:11:20 +08:00
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
if r.BlockIO.ThrottleWriteBpsDevice != nil {
|
|
|
|
for _, td := range r.BlockIO.ThrottleWriteBpsDevice {
|
|
|
|
rate := td.Rate
|
|
|
|
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
|
|
|
|
c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice)
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
if r.BlockIO.ThrottleReadIOPSDevice != nil {
|
|
|
|
for _, td := range r.BlockIO.ThrottleReadIOPSDevice {
|
|
|
|
rate := td.Rate
|
|
|
|
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
|
|
|
|
c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice)
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
if r.BlockIO.ThrottleWriteIOPSDevice != nil {
|
|
|
|
for _, td := range r.BlockIO.ThrottleWriteIOPSDevice {
|
|
|
|
rate := td.Rate
|
|
|
|
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
|
|
|
|
c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice)
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
for _, l := range r.HugepageLimits {
|
|
|
|
c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{
|
|
|
|
Pagesize: l.Pagesize,
|
|
|
|
Limit: l.Limit,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
if r.Network != nil {
|
|
|
|
if r.Network.ClassID != nil {
|
|
|
|
c.Resources.NetClsClassid = *r.Network.ClassID
|
|
|
|
}
|
|
|
|
for _, m := range r.Network.Priorities {
|
|
|
|
c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
|
|
|
|
Interface: m.Name,
|
|
|
|
Priority: int64(m.Priority),
|
|
|
|
})
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
if !opts.Rootless {
|
|
|
|
// append the default allowed devices to the end of the list
|
|
|
|
c.Resources.Devices = append(c.Resources.Devices, allowedDevices...)
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
return c, nil
|
|
|
|
}
|
|
|
|
|
2017-02-10 14:39:40 +08:00
|
|
|
// stringToCgroupDeviceRune converts the device type of a cgroup device rule
// ("a", "b" or "c") into its rune form. Any other string, including the
// empty string, yields an error.
func stringToCgroupDeviceRune(s string) (rune, error) {
	switch s {
	case "a":
		return 'a', nil
	case "b":
		return 'b', nil
	case "c":
		return 'c', nil
	default:
		return 0, fmt.Errorf("invalid cgroup device type %q", s)
	}
}
|
|
|
|
|
|
|
|
// stringToDeviceRune converts the type of a device node listed in the spec
// ("p", "u", "b" or "c") into its rune form. Any other string, including
// the empty string, yields an error.
func stringToDeviceRune(s string) (rune, error) {
	switch s {
	case "p":
		return 'p', nil
	case "u":
		return 'u', nil
	case "b":
		return 'b', nil
	case "c":
		return 'c', nil
	default:
		return 0, fmt.Errorf("invalid device type %q", s)
	}
}
|
|
|
|
|
|
|
|
func createDevices(spec *specs.Spec, config *configs.Config) error {
|
|
|
|
// add whitelisted devices
|
|
|
|
config.Devices = []*configs.Device{
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/null",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 3,
|
|
|
|
FileMode: 0666,
|
|
|
|
Uid: 0,
|
|
|
|
Gid: 0,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/random",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 8,
|
|
|
|
FileMode: 0666,
|
|
|
|
Uid: 0,
|
|
|
|
Gid: 0,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/full",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 7,
|
|
|
|
FileMode: 0666,
|
|
|
|
Uid: 0,
|
|
|
|
Gid: 0,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/tty",
|
|
|
|
Major: 5,
|
|
|
|
Minor: 0,
|
|
|
|
FileMode: 0666,
|
|
|
|
Uid: 0,
|
|
|
|
Gid: 0,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/zero",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 5,
|
|
|
|
FileMode: 0666,
|
|
|
|
Uid: 0,
|
|
|
|
Gid: 0,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/urandom",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 9,
|
|
|
|
FileMode: 0666,
|
|
|
|
Uid: 0,
|
|
|
|
Gid: 0,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
// merge in additional devices from the spec
|
2017-05-15 15:45:06 +08:00
|
|
|
if spec.Linux != nil {
|
|
|
|
for _, d := range spec.Linux.Devices {
|
|
|
|
var uid, gid uint32
|
|
|
|
var filemode os.FileMode = 0666
|
2016-10-14 01:32:28 +08:00
|
|
|
|
2017-05-15 15:45:06 +08:00
|
|
|
if d.UID != nil {
|
|
|
|
uid = *d.UID
|
|
|
|
}
|
|
|
|
if d.GID != nil {
|
|
|
|
gid = *d.GID
|
|
|
|
}
|
|
|
|
dt, err := stringToDeviceRune(d.Type)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if d.FileMode != nil {
|
|
|
|
filemode = *d.FileMode
|
|
|
|
}
|
|
|
|
device := &configs.Device{
|
|
|
|
Type: dt,
|
|
|
|
Path: d.Path,
|
|
|
|
Major: d.Major,
|
|
|
|
Minor: d.Minor,
|
|
|
|
FileMode: filemode,
|
|
|
|
Uid: uid,
|
|
|
|
Gid: gid,
|
|
|
|
}
|
|
|
|
config.Devices = append(config.Devices, device)
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
|
2016-12-17 13:01:53 +08:00
|
|
|
create := func(m specs.LinuxIDMapping) configs.IDMap {
|
2016-03-25 23:44:09 +08:00
|
|
|
return configs.IDMap{
|
|
|
|
HostID: int(m.HostID),
|
|
|
|
ContainerID: int(m.ContainerID),
|
|
|
|
Size: int(m.Size),
|
|
|
|
}
|
|
|
|
}
|
2017-05-15 15:45:06 +08:00
|
|
|
if spec.Linux != nil {
|
|
|
|
if len(spec.Linux.UIDMappings) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
for _, m := range spec.Linux.UIDMappings {
|
|
|
|
config.UidMappings = append(config.UidMappings, create(m))
|
|
|
|
}
|
|
|
|
for _, m := range spec.Linux.GIDMappings {
|
|
|
|
config.GidMappings = append(config.GidMappings, create(m))
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2017-03-18 01:32:16 +08:00
|
|
|
rootUID, err := config.HostRootUID()
|
2016-03-25 23:44:09 +08:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-03-18 01:32:16 +08:00
|
|
|
rootGID, err := config.HostRootGID()
|
2016-03-25 23:44:09 +08:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
for _, node := range config.Devices {
|
|
|
|
node.Uid = uint32(rootUID)
|
|
|
|
node.Gid = uint32(rootGID)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// parseMountOptions parses the string and returns the flags, propagation
|
|
|
|
// flags and any mount data that it contains.
|
2016-05-27 03:36:29 +08:00
|
|
|
func parseMountOptions(options []string) (int, []int, string, int) {
|
2016-03-25 23:44:09 +08:00
|
|
|
var (
|
2016-05-27 03:36:29 +08:00
|
|
|
flag int
|
|
|
|
pgflag []int
|
|
|
|
data []string
|
|
|
|
extFlags int
|
2016-03-25 23:44:09 +08:00
|
|
|
)
|
|
|
|
flags := map[string]struct {
|
|
|
|
clear bool
|
|
|
|
flag int
|
|
|
|
}{
|
2017-05-27 00:13:43 +08:00
|
|
|
"acl": {false, unix.MS_POSIXACL},
|
2017-05-10 05:38:27 +08:00
|
|
|
"async": {true, unix.MS_SYNCHRONOUS},
|
|
|
|
"atime": {true, unix.MS_NOATIME},
|
|
|
|
"bind": {false, unix.MS_BIND},
|
2016-03-25 23:44:09 +08:00
|
|
|
"defaults": {false, 0},
|
2017-05-10 05:38:27 +08:00
|
|
|
"dev": {true, unix.MS_NODEV},
|
|
|
|
"diratime": {true, unix.MS_NODIRATIME},
|
|
|
|
"dirsync": {false, unix.MS_DIRSYNC},
|
|
|
|
"exec": {true, unix.MS_NOEXEC},
|
2017-05-27 00:13:43 +08:00
|
|
|
"iversion": {false, unix.MS_I_VERSION},
|
|
|
|
"lazytime": {false, unix.MS_LAZYTIME},
|
|
|
|
"loud": {true, unix.MS_SILENT},
|
2017-05-10 05:38:27 +08:00
|
|
|
"mand": {false, unix.MS_MANDLOCK},
|
2017-05-27 00:13:43 +08:00
|
|
|
"noacl": {true, unix.MS_POSIXACL},
|
2017-05-10 05:38:27 +08:00
|
|
|
"noatime": {false, unix.MS_NOATIME},
|
|
|
|
"nodev": {false, unix.MS_NODEV},
|
|
|
|
"nodiratime": {false, unix.MS_NODIRATIME},
|
|
|
|
"noexec": {false, unix.MS_NOEXEC},
|
2017-05-27 00:13:43 +08:00
|
|
|
"noiversion": {true, unix.MS_I_VERSION},
|
|
|
|
"nolazytime": {true, unix.MS_LAZYTIME},
|
2017-05-10 05:38:27 +08:00
|
|
|
"nomand": {true, unix.MS_MANDLOCK},
|
|
|
|
"norelatime": {true, unix.MS_RELATIME},
|
|
|
|
"nostrictatime": {true, unix.MS_STRICTATIME},
|
|
|
|
"nosuid": {false, unix.MS_NOSUID},
|
|
|
|
"rbind": {false, unix.MS_BIND | unix.MS_REC},
|
|
|
|
"relatime": {false, unix.MS_RELATIME},
|
|
|
|
"remount": {false, unix.MS_REMOUNT},
|
|
|
|
"ro": {false, unix.MS_RDONLY},
|
|
|
|
"rw": {true, unix.MS_RDONLY},
|
2017-05-27 00:13:43 +08:00
|
|
|
"silent": {false, unix.MS_SILENT},
|
2017-05-10 05:38:27 +08:00
|
|
|
"strictatime": {false, unix.MS_STRICTATIME},
|
|
|
|
"suid": {true, unix.MS_NOSUID},
|
|
|
|
"sync": {false, unix.MS_SYNCHRONOUS},
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2016-09-06 23:41:40 +08:00
|
|
|
propagationFlags := map[string]int{
|
2017-05-10 05:38:27 +08:00
|
|
|
"private": unix.MS_PRIVATE,
|
|
|
|
"shared": unix.MS_SHARED,
|
|
|
|
"slave": unix.MS_SLAVE,
|
|
|
|
"unbindable": unix.MS_UNBINDABLE,
|
|
|
|
"rprivate": unix.MS_PRIVATE | unix.MS_REC,
|
|
|
|
"rshared": unix.MS_SHARED | unix.MS_REC,
|
|
|
|
"rslave": unix.MS_SLAVE | unix.MS_REC,
|
|
|
|
"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2016-05-27 03:36:29 +08:00
|
|
|
extensionFlags := map[string]struct {
|
|
|
|
clear bool
|
|
|
|
flag int
|
|
|
|
}{
|
|
|
|
"tmpcopyup": {false, configs.EXT_COPYUP},
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
for _, o := range options {
|
|
|
|
// If the option does not exist in the flags table or the flag
|
|
|
|
// is not supported on the platform,
|
|
|
|
// then it is a data value for a specific fs type
|
|
|
|
if f, exists := flags[o]; exists && f.flag != 0 {
|
|
|
|
if f.clear {
|
|
|
|
flag &= ^f.flag
|
|
|
|
} else {
|
|
|
|
flag |= f.flag
|
|
|
|
}
|
2016-09-06 23:41:40 +08:00
|
|
|
} else if f, exists := propagationFlags[o]; exists && f != 0 {
|
|
|
|
pgflag = append(pgflag, f)
|
2016-05-27 03:36:29 +08:00
|
|
|
} else if f, exists := extensionFlags[o]; exists && f.flag != 0 {
|
|
|
|
if f.clear {
|
|
|
|
extFlags &= ^f.flag
|
|
|
|
} else {
|
|
|
|
extFlags |= f.flag
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
} else {
|
|
|
|
data = append(data, o)
|
|
|
|
}
|
|
|
|
}
|
2016-05-27 03:36:29 +08:00
|
|
|
return flag, pgflag, strings.Join(data, ","), extFlags
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
|
2016-12-17 13:01:53 +08:00
|
|
|
func setupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) {
|
2016-03-25 23:44:09 +08:00
|
|
|
if config == nil {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// No default action specified, no syscalls listed, assume seccomp disabled
|
|
|
|
if config.DefaultAction == "" && len(config.Syscalls) == 0 {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
newConfig := new(configs.Seccomp)
|
|
|
|
newConfig.Syscalls = []*configs.Syscall{}
|
|
|
|
|
|
|
|
if len(config.Architectures) > 0 {
|
|
|
|
newConfig.Architectures = []string{}
|
|
|
|
for _, arch := range config.Architectures {
|
|
|
|
newArch, err := seccomp.ConvertStringToArch(string(arch))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
newConfig.Architectures = append(newConfig.Architectures, newArch)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Convert default action from string representation
|
|
|
|
newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
newConfig.DefaultAction = newDefaultAction
|
|
|
|
|
|
|
|
// Loop through all syscall blocks and convert them to libcontainer format
|
|
|
|
for _, call := range config.Syscalls {
|
|
|
|
newAction, err := seccomp.ConvertStringToAction(string(call.Action))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2017-03-15 00:36:38 +08:00
|
|
|
for _, name := range call.Names {
|
|
|
|
newCall := configs.Syscall{
|
|
|
|
Name: name,
|
|
|
|
Action: newAction,
|
|
|
|
Args: []*configs.Arg{},
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
2017-03-15 00:36:38 +08:00
|
|
|
// Loop through all the arguments of the syscall and convert them
|
|
|
|
for _, arg := range call.Args {
|
|
|
|
newOp, err := seccomp.ConvertStringToOperator(string(arg.Op))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
|
2017-03-15 00:36:38 +08:00
|
|
|
newArg := configs.Arg{
|
|
|
|
Index: arg.Index,
|
|
|
|
Value: arg.Value,
|
|
|
|
ValueTwo: arg.ValueTwo,
|
|
|
|
Op: newOp,
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
|
2017-03-15 00:36:38 +08:00
|
|
|
newCall.Args = append(newCall.Args, &newArg)
|
|
|
|
}
|
|
|
|
newConfig.Syscalls = append(newConfig.Syscalls, &newCall)
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return newConfig, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func createHooks(rspec *specs.Spec, config *configs.Config) {
|
|
|
|
config.Hooks = &configs.Hooks{}
|
2017-03-15 00:36:38 +08:00
|
|
|
if rspec.Hooks != nil {
|
|
|
|
|
|
|
|
for _, h := range rspec.Hooks.Prestart {
|
|
|
|
cmd := createCommandHook(h)
|
|
|
|
config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd))
|
|
|
|
}
|
|
|
|
for _, h := range rspec.Hooks.Poststart {
|
|
|
|
cmd := createCommandHook(h)
|
|
|
|
config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd))
|
|
|
|
}
|
|
|
|
for _, h := range rspec.Hooks.Poststop {
|
|
|
|
cmd := createCommandHook(h)
|
|
|
|
config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd))
|
|
|
|
}
|
2016-03-25 23:44:09 +08:00
|
|
|
}
|
|
|
|
}
|
2016-03-30 02:14:59 +08:00
|
|
|
|
|
|
|
func createCommandHook(h specs.Hook) configs.Command {
|
|
|
|
cmd := configs.Command{
|
|
|
|
Path: h.Path,
|
|
|
|
Args: h.Args,
|
|
|
|
Env: h.Env,
|
|
|
|
}
|
|
|
|
if h.Timeout != nil {
|
|
|
|
d := time.Duration(*h.Timeout) * time.Second
|
|
|
|
cmd.Timeout = &d
|
|
|
|
}
|
|
|
|
return cmd
|
|
|
|
}
|