2014-12-17 17:12:23 +08:00
package configs
2014-07-09 01:17:05 +08:00
2015-09-11 08:57:31 +08:00
import (
"bytes"
"encoding/json"
2016-03-30 02:14:59 +08:00
"fmt"
2015-09-11 08:57:31 +08:00
"os/exec"
2016-03-30 02:14:59 +08:00
"time"
2016-03-03 23:32:59 +08:00
2016-12-19 23:38:56 +08:00
"github.com/opencontainers/runtime-spec/specs-go"
2017-07-19 22:28:59 +08:00
"github.com/sirupsen/logrus"
2015-09-11 08:57:31 +08:00
)
2015-02-01 11:56:27 +08:00
// Rlimit is a single resource limit entry: a limit type together with its
// hard and soft values.
type Rlimit struct {
	// Type selects the resource the limit applies to.
	// NOTE(review): presumably one of the RLIMIT_* constants — confirm against callers.
	Type int ` json:"type" `
	// Hard is the hard value of the limit.
	Hard uint64 ` json:"hard" `
	// Soft is the soft value of the limit.
	Soft uint64 ` json:"soft" `
}
2015-02-01 11:56:27 +08:00
// IDMap is one UID or GID mapping entry used with user namespaces.
type IDMap struct {
	// ContainerID is the ID on the container side of the mapping.
	ContainerID int ` json:"container_id" `
	// HostID is the ID on the host side of the mapping.
	HostID int ` json:"host_id" `
	// Size is the size of the mapping.
	// NOTE(review): presumably the number of consecutive IDs mapped — confirm.
	Size int ` json:"size" `
}
2015-06-30 02:12:54 +08:00
// Seccomp represents syscall restrictions
2015-09-23 22:52:53 +08:00
// By default, only the native architecture of the kernel is allowed to be used
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
2015-05-30 06:24:18 +08:00
type Seccomp struct {
2015-06-30 02:12:54 +08:00
DefaultAction Action ` json:"default_action" `
2015-09-23 22:52:53 +08:00
Architectures [ ] string ` json:"architectures" `
2015-06-30 02:12:54 +08:00
Syscalls [ ] * Syscall ` json:"syscalls" `
2015-05-30 06:24:18 +08:00
}
2016-04-12 16:12:23 +08:00
// Action is what Seccomp does when a rule matches.
type Action int

// Seccomp actions. Numbering starts at 1, so the zero value of Action does
// not correspond to any named action.
const (
	Kill = Action(iota + 1)
	Errno
	Trap
	Allow
	Trace
)
2016-04-12 16:12:23 +08:00
// Operator is the comparison applied when a Seccomp rule matches a syscall
// argument.
type Operator int

// Seccomp comparison operators. Numbering starts at 1, so the zero value of
// Operator does not correspond to any named operator.
const (
	EqualTo = Operator(iota + 1)
	NotEqualTo
	GreaterThan
	GreaterThanOrEqualTo
	LessThan
	LessThanOrEqualTo
	MaskEqualTo
)
2016-04-12 16:12:23 +08:00
// Arg is a rule to match a specific syscall argument in Seccomp
2015-05-30 06:24:18 +08:00
type Arg struct {
2015-06-30 02:12:54 +08:00
Index uint ` json:"index" `
Value uint64 ` json:"value" `
ValueTwo uint64 ` json:"value_two" `
Op Operator ` json:"op" `
2015-05-30 06:24:18 +08:00
}
2016-04-12 16:12:23 +08:00
// Syscall is a rule to match a syscall in Seccomp
2015-05-30 06:24:18 +08:00
type Syscall struct {
2015-06-30 02:12:54 +08:00
Name string ` json:"name" `
2015-05-30 06:24:18 +08:00
Action Action ` json:"action" `
Args [ ] * Arg ` json:"args" `
2015-05-23 07:10:20 +08:00
}
2015-05-14 06:42:16 +08:00
// TODO(Windows): many of these fields should be split into the parts that are
// common across platforms and the parts that are platform specific.
2014-07-09 01:17:05 +08:00
// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
2015-02-01 11:56:27 +08:00
// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs.
// This is a common option when the container is running in ramdisk.
2015-02-12 08:45:23 +08:00
NoPivotRoot bool ` json:"no_pivot_root" `
2015-02-01 11:56:27 +08:00
2015-02-07 10:50:11 +08:00
// ParentDeathSignal specifies the signal that is sent to the container's process in the case
// that the parent process dies.
2015-02-12 08:45:23 +08:00
ParentDeathSignal int ` json:"parent_death_signal" `
2015-02-07 10:50:11 +08:00
2015-02-04 09:44:58 +08:00
// Rootfs is the path to a directory containing the container's root filesystem.
2015-02-12 08:45:23 +08:00
Rootfs string ` json:"rootfs" `
2015-02-04 09:44:58 +08:00
// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
// bind mounts are writable.
2015-02-12 08:45:23 +08:00
Readonlyfs bool ` json:"readonlyfs" `
2015-02-01 11:56:27 +08:00
2015-10-02 05:03:02 +08:00
// RootPropagation specifies the mount propagation flags to be applied to /.
RootPropagation int ` json:"rootPropagation" `
2015-04-10 22:45:04 +08:00
2015-02-01 11:56:27 +08:00
// Mounts specify additional source and destination paths that will be mounted inside the container's
// rootfs and mount namespace if specified.
2015-02-12 08:45:23 +08:00
Mounts [ ] * Mount ` json:"mounts" `
2015-02-01 11:56:27 +08:00
// Devices are the device nodes that should be automatically created within the container upon
// container start. Note, make sure that the node is marked as allowed in the cgroup as well!
2015-02-12 08:45:23 +08:00
Devices [ ] * Device ` json:"devices" `
2015-02-01 11:56:27 +08:00
2015-02-12 08:45:23 +08:00
// MountLabel is the label used for the container's mounts.
// NOTE(review): presumably an SELinux mount label — confirm against callers.
MountLabel string ` json:"mount_label" `
2014-07-09 01:17:05 +08:00
// Hostname optionally sets the container's hostname if provided.
2015-02-12 08:45:23 +08:00
Hostname string ` json:"hostname" `
2014-07-09 01:17:05 +08:00
// Namespaces specifies the container's namespaces that it should set up when cloning the init process.
// If a namespace is not provided that namespace is shared from the container's parent process.
2015-02-12 08:45:23 +08:00
Namespaces Namespaces ` json:"namespaces" `
2014-07-09 01:17:05 +08:00
// Capabilities specify the capabilities to keep when executing the process inside the container.
2017-03-15 00:36:38 +08:00
// All capabilities not specified will be dropped from the process's capability mask.
Capabilities * Capabilities ` json:"capabilities" `
2014-07-09 01:17:05 +08:00
// Networks specifies the container's network setup to be created.
2015-02-12 08:45:23 +08:00
Networks [ ] * Network ` json:"networks" `
2014-07-09 01:17:05 +08:00
// Routes can be specified to create entries in the route table as the container is started.
2015-02-12 08:45:23 +08:00
Routes [ ] * Route ` json:"routes" `
2014-07-09 01:17:05 +08:00
// Cgroups specifies specific cgroup settings for the various subsystems that the container is
// placed into to limit the resources the container has available.
2015-02-12 08:45:23 +08:00
Cgroups * Cgroup ` json:"cgroups" `
2014-07-09 01:17:05 +08:00
// AppArmorProfile specifies the profile to apply to the process running in the container and is
// changed at the time the process is exec'd.
2016-03-04 02:44:33 +08:00
AppArmorProfile string ` json:"apparmor_profile,omitempty" `
2014-07-09 01:17:05 +08:00
// ProcessLabel specifies the label to apply to the process running in the container. It is
// commonly used by selinux.
2016-03-04 02:44:33 +08:00
ProcessLabel string ` json:"process_label,omitempty" `
2014-07-09 01:17:05 +08:00
2014-11-27 02:16:53 +08:00
// Rlimits specifies the resource limits, such as max open files, to set in the container.
// If Rlimits are not set, the container will inherit rlimits from the parent process.
2016-03-11 06:35:16 +08:00
Rlimits [ ] Rlimit ` json:"rlimits,omitempty" `
2015-01-27 20:54:19 +08:00
2015-08-27 07:37:24 +08:00
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
// for a process. Valid values are in the range [-1000, 1000], where processes with
2018-03-16 08:54:47 +08:00
// higher scores are preferred for being killed. If it is unset then we don't touch the current
// value.
2015-08-27 07:37:24 +08:00
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
2018-03-16 08:54:47 +08:00
OomScoreAdj * int ` json:"oom_score_adj,omitempty" `
2015-08-27 07:37:24 +08:00
2015-01-27 20:54:19 +08:00
// UidMappings is an array of User ID mappings for User Namespaces.
2015-02-12 08:45:23 +08:00
UidMappings [ ] IDMap ` json:"uid_mappings" `
2015-01-27 20:54:19 +08:00
// GidMappings is an array of Group ID mappings for User Namespaces.
2015-02-12 08:45:23 +08:00
GidMappings [ ] IDMap ` json:"gid_mappings" `
2015-02-13 08:23:05 +08:00
// MaskPaths specifies paths within the container's rootfs to mask over with a bind
// mount pointing to /dev/null as to prevent reads of the file.
MaskPaths [ ] string ` json:"mask_paths" `
// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
// so that writes to these files are prevented.
ReadonlyPaths [ ] string ` json:"readonly_paths" `
2015-04-23 10:17:30 +08:00
2015-07-07 07:18:08 +08:00
// Sysctl is a map of properties and their values. It is the equivalent of using
2015-04-23 10:17:30 +08:00
// sysctl -w my.property.name value in Linux.
2015-07-07 07:18:08 +08:00
Sysctl map [ string ] string ` json:"sysctl" `
2015-05-23 07:10:20 +08:00
2015-05-30 06:24:18 +08:00
// Seccomp allows actions to be taken whenever a syscall is made within the container.
2015-06-30 02:12:54 +08:00
// A number of rules are given, each having an action to be taken if a syscall matches it.
// A default action to be taken if no rules match is also given.
2015-05-30 06:24:18 +08:00
Seccomp * Seccomp ` json:"seccomp" `
2015-09-11 08:57:31 +08:00
2016-02-16 19:54:58 +08:00
// NoNewPrivileges controls whether processes in the container can gain additional privileges.
2016-03-04 02:44:33 +08:00
NoNewPrivileges bool ` json:"no_new_privileges,omitempty" `
2016-02-16 19:54:58 +08:00
2015-09-11 08:57:31 +08:00
// Hooks are a collection of actions to perform at various container lifecycle events.
2016-03-03 23:32:59 +08:00
// CommandHooks are serialized to JSON, but other hooks are not.
// The field carries no json tag, so encoding/json uses the key "Hooks".
Hooks * Hooks
2015-09-24 08:13:00 +08:00
// Version is the version of opencontainer specification that is supported.
Version string ` json:"version" `
2016-02-25 02:45:20 +08:00
// Labels are user defined metadata that is stored in the config and populated on the state.
Labels [ ] string ` json:"labels" `
2016-06-04 02:53:07 +08:00
// NoNewKeyring will not allocate a new session keyring for the container. It will use the
// caller's keyring in this case.
NoNewKeyring bool ` json:"no_new_keyring" `
2016-04-23 21:39:42 +08:00
2018-10-16 12:37:41 +08:00
// IntelRdt specifies settings for Intel RDT group that the container is placed into
// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available.
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
IntelRdt * IntelRdt ` json:"intel_rdt,omitempty" `
Disable rootless mode except RootlessCgMgr when executed as the root in userns
This PR decomposes `libcontainer/configs.Config.Rootless bool` into `RootlessEUID bool` and
`RootlessCgroups bool`, so as to make "runc-in-userns" to be more compatible with "rootful" runc.
`RootlessEUID` denotes that runc is being executed as a non-root user (euid != 0) in
the current user namespace. `RootlessEUID` is almost identical to the former `Rootless`
except cgroups stuff.
`RootlessCgroups` denotes that runc is unlikely to have the full access to cgroups.
`RootlessCgroups` is set to false if runc is executed as the root (euid == 0) in the initial namespace.
Otherwise `RootlessCgroups` is set to true.
(Hint: if `RootlessEUID` is true, `RootlessCgroups` becomes true as well)
When runc is executed as the root (euid == 0) in an user namespace (e.g. by Docker-in-LXD, Podman, Usernetes),
`RootlessEUID` is set to false but `RootlessCgroups` is set to true.
So, "runc-in-userns" behaves almost same as "rootful" runc except that cgroups errors are ignored.
This PR does not have any impact on CLI flags and `state.json`.
Note about CLI:
* Now `runc --rootless=(auto|true|false)` CLI flag is only used for setting `RootlessCgroups`.
* Now `runc spec --rootless` is only required when `RootlessEUID` is set to true.
For runc-in-userns, `runc spec` without `--rootless` should work, when sufficient numbers of
UID/GID are mapped.
Note about `$XDG_RUNTIME_DIR` (e.g. `/run/user/1000`):
* `$XDG_RUNTIME_DIR` is ignored if runc is being executed as the root (euid == 0) in the initial namespace, for backward compatibility.
(`/run/runc` is used)
* If runc is executed as the root (euid == 0) in an user namespace, `$XDG_RUNTIME_DIR` is honored if `$USER != "" && $USER != "root"`.
This allows unprivileged users to allow execute runc as the root in userns, without mounting writable `/run/runc`.
Note about `state.json`:
* `rootless` is set to true when `RootlessEUID == true && RootlessCgroups == true`.
Signed-off-by: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
2018-07-05 14:28:21 +08:00
// RootlessEUID is set when the runc was launched with non-zero EUID.
// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
// When RootlessEUID is set, runc creates a new userns for the container.
// (config.json needs to contain userns settings)
RootlessEUID bool ` json:"rootless_euid,omitempty" `
// RootlessCgroups is set when unlikely to have the full access to cgroups.
// When RootlessCgroups is set, cgroups errors are ignored.
RootlessCgroups bool ` json:"rootless_cgroups,omitempty" `
2015-09-11 08:57:31 +08:00
}
type Hooks struct {
// Prestart commands are executed after the container namespaces are created,
// but before the user supplied command is executed from init.
Prestart [ ] Hook
2015-11-07 07:02:50 +08:00
// Poststart commands are executed after the container init process starts.
Poststart [ ] Hook
2015-09-12 01:28:25 +08:00
// Poststop commands are executed after the container init process exits.
2015-09-11 08:57:31 +08:00
Poststop [ ] Hook
}
2017-03-15 00:36:38 +08:00
// Capabilities holds the capability sets configured for a container process,
// each expressed as a list of strings.
type Capabilities struct {
	// Bounding is the set of capabilities checked by the kernel.
	Bounding []string

	// Effective is the set of capabilities checked by the kernel.
	Effective []string

	// Inheritable is the set of capabilities preserved across execve.
	Inheritable []string

	// Permitted is the limiting superset for the effective capabilities.
	Permitted []string

	// Ambient is the ambient set of capabilities that are kept.
	Ambient []string
}
2016-03-03 23:32:59 +08:00
// UnmarshalJSON implements json.Unmarshaler. Every hook list in b is decoded
// as a list of CommandHook values and stored on hooks as Hook values. A list
// that is absent or empty leaves the corresponding field nil.
func (hooks *Hooks) UnmarshalJSON(b []byte) error {
	var decoded struct {
		Prestart, Poststart, Poststop []CommandHook
	}
	err := json.Unmarshal(b, &decoded)
	if err != nil {
		return err
	}

	// asHooks widens a slice of CommandHook into a slice of Hook.
	asHooks := func(cmds []CommandHook) []Hook {
		var out []Hook
		for i := range cmds {
			out = append(out, cmds[i])
		}
		return out
	}

	hooks.Prestart = asHooks(decoded.Prestart)
	hooks.Poststart = asHooks(decoded.Poststart)
	hooks.Poststop = asHooks(decoded.Poststop)
	return nil
}
// MarshalJSON implements json.Marshaler. Only hooks whose dynamic type is
// CommandHook are written; any other hook is skipped with a logged warning.
// The output is an object with the keys "prestart", "poststart" and
// "poststop".
func (hooks Hooks) MarshalJSON() ([]byte, error) {
	// commandsOnly keeps the CommandHook entries of all and warns about the rest.
	commandsOnly := func(all []Hook) []CommandHook {
		var kept []CommandHook
		for _, h := range all {
			cmd, ok := h.(CommandHook)
			if !ok {
				logrus.Warnf("cannot serialize hook of type %T, skipping", h)
				continue
			}
			kept = append(kept, cmd)
		}
		return kept
	}

	payload := map[string]interface{}{
		"prestart":  commandsOnly(hooks.Prestart),
		"poststart": commandsOnly(hooks.Poststart),
		"poststop":  commandsOnly(hooks.Poststop),
	}
	return json.Marshal(payload)
}
2015-09-11 08:57:31 +08:00
// HookState is the payload provided to a hook on execution.
2016-12-19 23:38:56 +08:00
type HookState specs . State
2015-09-11 08:57:31 +08:00
type Hook interface {
// Run executes the hook with the provided state.
2015-09-12 01:28:25 +08:00
Run ( HookState ) error
2015-09-11 08:57:31 +08:00
}
2016-04-12 16:12:23 +08:00
// NewFunctionHook will call the provided function when the hook is run.
2015-09-12 01:28:25 +08:00
func NewFunctionHook ( f func ( HookState ) error ) FuncHook {
return FuncHook {
2015-09-11 09:15:00 +08:00
run : f ,
2015-09-11 08:57:31 +08:00
}
}
type FuncHook struct {
2015-09-12 01:28:25 +08:00
run func ( HookState ) error
2015-09-11 09:15:00 +08:00
}
2015-09-12 01:28:25 +08:00
func ( f FuncHook ) Run ( s HookState ) error {
2015-09-11 09:15:00 +08:00
return f . run ( s )
2015-09-11 08:57:31 +08:00
}
// Command describes an external program to execute as a hook.
type Command struct {
	// Path is the path of the program to run.
	Path string ` json:"path" `
	// Args holds the argument list passed to the program.
	Args []string ` json:"args" `
	// Env holds the environment of the program.
	Env []string ` json:"env" `
	// Dir is the directory associated with the command.
	// NOTE(review): presumably the intended working directory — confirm how callers use it.
	Dir string ` json:"dir" `
	// Timeout, when non-nil, bounds how long the program may run.
	Timeout *time.Duration ` json:"timeout" `
}
2016-04-12 16:12:23 +08:00
// NewCommandHook will execute the provided command when the hook is run.
2015-09-12 01:28:25 +08:00
func NewCommandHook ( cmd Command ) CommandHook {
return CommandHook {
2015-09-11 08:57:31 +08:00
Command : cmd ,
}
}
// CommandHook is a hook backed by an external command. It embeds Command and
// so exposes Command's fields and methods.
type CommandHook struct {
	Command
}
2015-09-12 01:28:25 +08:00
func ( c Command ) Run ( s HookState ) error {
2015-09-11 08:57:31 +08:00
b , err := json . Marshal ( s )
if err != nil {
return err
}
2016-08-09 01:14:11 +08:00
var stdout , stderr bytes . Buffer
2015-09-11 08:57:31 +08:00
cmd := exec . Cmd {
2016-08-09 01:14:11 +08:00
Path : c . Path ,
Args : c . Args ,
Env : c . Env ,
Stdin : bytes . NewReader ( b ) ,
Stdout : & stdout ,
Stderr : & stderr ,
}
if err := cmd . Start ( ) ; err != nil {
return err
2015-09-11 08:57:31 +08:00
}
2016-03-30 02:14:59 +08:00
errC := make ( chan error , 1 )
go func ( ) {
2016-08-09 01:14:11 +08:00
err := cmd . Wait ( )
2016-04-09 02:02:44 +08:00
if err != nil {
2016-08-09 01:14:11 +08:00
err = fmt . Errorf ( "error running hook: %v, stdout: %s, stderr: %s" , err , stdout . String ( ) , stderr . String ( ) )
2016-04-09 02:02:44 +08:00
}
errC <- err
2016-03-30 02:14:59 +08:00
} ( )
2016-08-09 01:14:11 +08:00
var timerCh <- chan time . Time
2016-03-30 02:14:59 +08:00
if c . Timeout != nil {
2016-08-09 01:14:11 +08:00
timer := time . NewTimer ( * c . Timeout )
defer timer . Stop ( )
timerCh = timer . C
}
select {
case err := <- errC :
return err
case <- timerCh :
cmd . Process . Kill ( )
cmd . Wait ( )
return fmt . Errorf ( "hook ran past specified timeout of %.1fs" , c . Timeout . Seconds ( ) )
2016-03-30 02:14:59 +08:00
}
2014-07-09 01:17:05 +08:00
}