2016-04-23 21:39:42 +08:00
|
|
|
package specconv
|
|
|
|
|
|
|
|
import (
|
2016-05-09 19:26:11 +08:00
|
|
|
"os"
|
|
|
|
"strings"
|
2016-04-23 21:39:42 +08:00
|
|
|
|
|
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
|
|
)
|
|
|
|
|
2016-05-09 19:26:11 +08:00
|
|
|
// Example returns an example spec file, with many options set so a user can
|
|
|
|
// see what a standard spec file looks like.
|
|
|
|
func Example() *specs.Spec {
|
2016-04-23 21:39:42 +08:00
|
|
|
return &specs.Spec{
|
|
|
|
Version: specs.Version,
|
2017-07-13 00:00:49 +08:00
|
|
|
Root: &specs.Root{
|
2016-04-23 21:39:42 +08:00
|
|
|
Path: "rootfs",
|
|
|
|
Readonly: true,
|
|
|
|
},
|
2017-06-02 07:17:21 +08:00
|
|
|
Process: &specs.Process{
|
2016-04-23 21:39:42 +08:00
|
|
|
Terminal: true,
|
|
|
|
User: specs.User{},
|
|
|
|
Args: []string{
|
|
|
|
"sh",
|
|
|
|
},
|
|
|
|
Env: []string{
|
|
|
|
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
|
|
|
|
"TERM=xterm",
|
|
|
|
},
|
|
|
|
Cwd: "/",
|
|
|
|
NoNewPrivileges: true,
|
|
|
|
Capabilities: &specs.LinuxCapabilities{
|
|
|
|
Bounding: []string{
|
|
|
|
"CAP_AUDIT_WRITE",
|
|
|
|
"CAP_KILL",
|
|
|
|
"CAP_NET_BIND_SERVICE",
|
|
|
|
},
|
|
|
|
Permitted: []string{
|
|
|
|
"CAP_AUDIT_WRITE",
|
|
|
|
"CAP_KILL",
|
|
|
|
"CAP_NET_BIND_SERVICE",
|
|
|
|
},
|
|
|
|
Inheritable: []string{
|
|
|
|
"CAP_AUDIT_WRITE",
|
|
|
|
"CAP_KILL",
|
|
|
|
"CAP_NET_BIND_SERVICE",
|
|
|
|
},
|
|
|
|
Ambient: []string{
|
|
|
|
"CAP_AUDIT_WRITE",
|
|
|
|
"CAP_KILL",
|
|
|
|
"CAP_NET_BIND_SERVICE",
|
|
|
|
},
|
|
|
|
Effective: []string{
|
|
|
|
"CAP_AUDIT_WRITE",
|
|
|
|
"CAP_KILL",
|
|
|
|
"CAP_NET_BIND_SERVICE",
|
|
|
|
},
|
|
|
|
},
|
2017-07-13 00:00:49 +08:00
|
|
|
Rlimits: []specs.POSIXRlimit{
|
2016-04-23 21:39:42 +08:00
|
|
|
{
|
|
|
|
Type: "RLIMIT_NOFILE",
|
|
|
|
Hard: uint64(1024),
|
|
|
|
Soft: uint64(1024),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
Hostname: "runc",
|
|
|
|
Mounts: []specs.Mount{
|
|
|
|
{
|
|
|
|
Destination: "/proc",
|
|
|
|
Type: "proc",
|
|
|
|
Source: "proc",
|
|
|
|
Options: nil,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Destination: "/dev",
|
|
|
|
Type: "tmpfs",
|
|
|
|
Source: "tmpfs",
|
|
|
|
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Destination: "/dev/pts",
|
|
|
|
Type: "devpts",
|
|
|
|
Source: "devpts",
|
|
|
|
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Destination: "/dev/shm",
|
|
|
|
Type: "tmpfs",
|
|
|
|
Source: "shm",
|
|
|
|
Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Destination: "/dev/mqueue",
|
|
|
|
Type: "mqueue",
|
|
|
|
Source: "mqueue",
|
|
|
|
Options: []string{"nosuid", "noexec", "nodev"},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Destination: "/sys",
|
|
|
|
Type: "sysfs",
|
|
|
|
Source: "sysfs",
|
|
|
|
Options: []string{"nosuid", "noexec", "nodev", "ro"},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Destination: "/sys/fs/cgroup",
|
|
|
|
Type: "cgroup",
|
|
|
|
Source: "cgroup",
|
|
|
|
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
Linux: &specs.Linux{
|
|
|
|
MaskedPaths: []string{
|
2019-08-26 11:25:56 +08:00
|
|
|
"/proc/acpi",
|
|
|
|
"/proc/asound",
|
2016-04-23 21:39:42 +08:00
|
|
|
"/proc/kcore",
|
2019-08-26 11:25:56 +08:00
|
|
|
"/proc/keys",
|
2016-04-23 21:39:42 +08:00
|
|
|
"/proc/latency_stats",
|
|
|
|
"/proc/timer_list",
|
|
|
|
"/proc/timer_stats",
|
|
|
|
"/proc/sched_debug",
|
|
|
|
"/sys/firmware",
|
2017-11-05 01:38:14 +08:00
|
|
|
"/proc/scsi",
|
2016-04-23 21:39:42 +08:00
|
|
|
},
|
|
|
|
ReadonlyPaths: []string{
|
|
|
|
"/proc/bus",
|
|
|
|
"/proc/fs",
|
|
|
|
"/proc/irq",
|
|
|
|
"/proc/sys",
|
|
|
|
"/proc/sysrq-trigger",
|
|
|
|
},
|
|
|
|
Resources: &specs.LinuxResources{
|
|
|
|
Devices: []specs.LinuxDeviceCgroup{
|
|
|
|
{
|
|
|
|
Allow: false,
|
|
|
|
Access: "rwm",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
Namespaces: []specs.LinuxNamespace{
|
|
|
|
{
|
|
|
|
Type: "pid",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: "network",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: "ipc",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: "uts",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: "mount",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
2016-05-09 19:26:11 +08:00
|
|
|
|
2018-02-24 05:56:38 +08:00
|
|
|
// ToRootless converts the given spec file into one that should work with
|
Disable rootless mode except RootlessCgMgr when executed as the root in userns
This PR decomposes `libcontainer/configs.Config.Rootless bool` into `RootlessEUID bool` and
`RootlessCgroups bool`, so as to make "runc-in-userns" more compatible with "rootful" runc.
`RootlessEUID` denotes that runc is being executed as a non-root user (euid != 0) in
the current user namespace. `RootlessEUID` is almost identical to the former `Rootless`
except cgroups stuff.
`RootlessCgroups` denotes that runc is unlikely to have the full access to cgroups.
`RootlessCgroups` is set to false if runc is executed as the root (euid == 0) in the initial namespace.
Otherwise `RootlessCgroups` is set to true.
(Hint: if `RootlessEUID` is true, `RootlessCgroups` becomes true as well)
When runc is executed as the root (euid == 0) in a user namespace (e.g. by Docker-in-LXD, Podman, Usernetes),
`RootlessEUID` is set to false but `RootlessCgroups` is set to true.
So, "runc-in-userns" behaves almost the same as "rootful" runc except that cgroups errors are ignored.
This PR does not have any impact on CLI flags and `state.json`.
Note about CLI:
* Now `runc --rootless=(auto|true|false)` CLI flag is only used for setting `RootlessCgroups`.
* Now `runc spec --rootless` is only required when `RootlessEUID` is set to true.
For runc-in-userns, `runc spec` without `--rootless` should work, when sufficient numbers of
UID/GID are mapped.
Note about `$XDG_RUNTIME_DIR` (e.g. `/run/user/1000`):
* `$XDG_RUNTIME_DIR` is ignored if runc is being executed as the root (euid == 0) in the initial namespace, for backward compatibility.
(`/run/runc` is used)
* If runc is executed as the root (euid == 0) in a user namespace, `$XDG_RUNTIME_DIR` is honored if `$USER != "" && $USER != "root"`.
This allows unprivileged users to execute runc as the root in userns, without mounting writable `/run/runc`.
Note about `state.json`:
* `rootless` is set to true when `RootlessEUID == true && RootlessCgroups == true`.
Signed-off-by: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
2018-07-05 14:28:21 +08:00
|
|
|
// rootless containers (euid != 0), by removing incompatible options and adding others that
|
2018-02-24 05:56:38 +08:00
|
|
|
// are needed.
|
2016-05-09 19:26:11 +08:00
|
|
|
func ToRootless(spec *specs.Spec) {
|
|
|
|
var namespaces []specs.LinuxNamespace
|
|
|
|
|
|
|
|
// Remove networkns from the spec.
|
|
|
|
for _, ns := range spec.Linux.Namespaces {
|
|
|
|
switch ns.Type {
|
|
|
|
case specs.NetworkNamespace, specs.UserNamespace:
|
|
|
|
// Do nothing.
|
|
|
|
default:
|
|
|
|
namespaces = append(namespaces, ns)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Add userns to the spec.
|
|
|
|
namespaces = append(namespaces, specs.LinuxNamespace{
|
|
|
|
Type: specs.UserNamespace,
|
|
|
|
})
|
|
|
|
spec.Linux.Namespaces = namespaces
|
|
|
|
|
|
|
|
// Add mappings for the current user.
|
|
|
|
spec.Linux.UIDMappings = []specs.LinuxIDMapping{{
|
|
|
|
HostID: uint32(os.Geteuid()),
|
|
|
|
ContainerID: 0,
|
|
|
|
Size: 1,
|
|
|
|
}}
|
|
|
|
spec.Linux.GIDMappings = []specs.LinuxIDMapping{{
|
|
|
|
HostID: uint32(os.Getegid()),
|
|
|
|
ContainerID: 0,
|
|
|
|
Size: 1,
|
|
|
|
}}
|
|
|
|
|
|
|
|
// Fix up mounts.
|
|
|
|
var mounts []specs.Mount
|
|
|
|
for _, mount := range spec.Mounts {
|
|
|
|
// Ignore all mounts that are under /sys.
|
|
|
|
if strings.HasPrefix(mount.Destination, "/sys") {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Remove all gid= and uid= mappings.
|
|
|
|
var options []string
|
|
|
|
for _, option := range mount.Options {
|
|
|
|
if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") {
|
|
|
|
options = append(options, option)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mount.Options = options
|
|
|
|
mounts = append(mounts, mount)
|
|
|
|
}
|
|
|
|
// Add the sysfs mount as an rbind.
|
|
|
|
mounts = append(mounts, specs.Mount{
|
|
|
|
Source: "/sys",
|
|
|
|
Destination: "/sys",
|
|
|
|
Type: "none",
|
|
|
|
Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
|
|
|
|
})
|
|
|
|
spec.Mounts = mounts
|
|
|
|
|
|
|
|
// Remove cgroup settings.
|
|
|
|
spec.Linux.Resources = nil
|
|
|
|
}
|