2016-04-13 05:41:09 +08:00
|
|
|
// +build linux
|
|
|
|
|
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2016-09-03 01:31:54 +08:00
|
|
|
"net"
|
2016-04-13 05:41:09 +08:00
|
|
|
"os"
|
2017-09-06 20:13:47 +08:00
|
|
|
"os/exec"
|
2016-04-13 05:41:09 +08:00
|
|
|
"path/filepath"
|
2016-06-10 18:35:13 +08:00
|
|
|
"strconv"
|
2016-04-13 05:41:09 +08:00
|
|
|
|
|
|
|
"github.com/opencontainers/runc/libcontainer"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
|
2017-03-15 00:36:38 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
More information about Intel RDT/CAT can be found in section 17.17
of the Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belong to this group (e.g., the
"<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
is in the root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/intelrdt"
|
2016-04-13 05:41:09 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/specconv"
|
2017-03-03 04:53:06 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
2016-04-13 05:41:09 +08:00
|
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
2019-04-03 12:08:06 +08:00
|
|
|
selinux "github.com/opencontainers/selinux/go-selinux"
|
2017-05-11 23:06:37 +08:00
|
|
|
|
2017-07-19 22:28:59 +08:00
|
|
|
"github.com/coreos/go-systemd/activation"
|
2019-02-28 16:44:37 +08:00
|
|
|
"github.com/pkg/errors"
|
2017-07-19 22:28:59 +08:00
|
|
|
"github.com/sirupsen/logrus"
|
|
|
|
"github.com/urfave/cli"
|
2017-05-11 23:06:37 +08:00
|
|
|
"golang.org/x/sys/unix"
|
2016-04-13 05:41:09 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
// errEmptyID is returned when the first command-line argument, which is used
// as the container id, is empty.
var errEmptyID = errors.New("container id cannot be empty")
|
|
|
|
|
|
|
|
// loadFactory returns the configured factory instance for execing containers.
|
|
|
|
func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
|
|
|
|
root := context.GlobalString("root")
|
|
|
|
abs, err := filepath.Abs(root)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2017-09-06 20:13:47 +08:00
|
|
|
|
|
|
|
// We default to cgroupfs, and can only use systemd if the system is a
|
|
|
|
// systemd box.
|
2016-04-13 05:41:09 +08:00
|
|
|
cgroupManager := libcontainer.Cgroupfs
|
Disable rootless mode except RootlessCgMgr when executed as the root in userns
This PR decomposes `libcontainer/configs.Config.Rootless bool` into `RootlessEUID bool` and
`RootlessCgroups bool`, so as to make "runc-in-userns" to be more compatible with "rootful" runc.
`RootlessEUID` denotes that runc is being executed as a non-root user (euid != 0) in
the current user namespace. `RootlessEUID` is almost identical to the former `Rootless`
except cgroups stuff.
`RootlessCgroups` denotes that runc is unlikely to have the full access to cgroups.
`RootlessCgroups` is set to false if runc is executed as the root (euid == 0) in the initial namespace.
Otherwise `RootlessCgroups` is set to true.
(Hint: if `RootlessEUID` is true, `RootlessCgroups` becomes true as well)
When runc is executed as the root (euid == 0) in an user namespace (e.g. by Docker-in-LXD, Podman, Usernetes),
`RootlessEUID` is set to false but `RootlessCgroups` is set to true.
So, "runc-in-userns" behaves almost same as "rootful" runc except that cgroups errors are ignored.
This PR does not have any impact on CLI flags and `state.json`.
Note about CLI:
* Now `runc --rootless=(auto|true|false)` CLI flag is only used for setting `RootlessCgroups`.
* Now `runc spec --rootless` is only required when `RootlessEUID` is set to true.
For runc-in-userns, `runc spec` without `--rootless` should work, when sufficient numbers of
UID/GID are mapped.
Note about `$XDG_RUNTIME_DIR` (e.g. `/run/user/1000`):
* `$XDG_RUNTIME_DIR` is ignored if runc is being executed as the root (euid == 0) in the initial namespace, for backward compatibility.
(`/run/runc` is used)
* If runc is executed as the root (euid == 0) in an user namespace, `$XDG_RUNTIME_DIR` is honored if `$USER != "" && $USER != "root"`.
This allows unprivileged users to allow execute runc as the root in userns, without mounting writable `/run/runc`.
Note about `state.json`:
* `rootless` is set to true when `RootlessEUID == true && RootlessCgroups == true`.
Signed-off-by: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
2018-07-05 14:28:21 +08:00
|
|
|
rootlessCg, err := shouldUseRootlessCgroupManager(context)
|
2018-05-30 10:25:43 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
Disable rootless mode except RootlessCgMgr when executed as the root in userns
This PR decomposes `libcontainer/configs.Config.Rootless bool` into `RootlessEUID bool` and
`RootlessCgroups bool`, so as to make "runc-in-userns" to be more compatible with "rootful" runc.
`RootlessEUID` denotes that runc is being executed as a non-root user (euid != 0) in
the current user namespace. `RootlessEUID` is almost identical to the former `Rootless`
except cgroups stuff.
`RootlessCgroups` denotes that runc is unlikely to have the full access to cgroups.
`RootlessCgroups` is set to false if runc is executed as the root (euid == 0) in the initial namespace.
Otherwise `RootlessCgroups` is set to true.
(Hint: if `RootlessEUID` is true, `RootlessCgroups` becomes true as well)
When runc is executed as the root (euid == 0) in an user namespace (e.g. by Docker-in-LXD, Podman, Usernetes),
`RootlessEUID` is set to false but `RootlessCgroups` is set to true.
So, "runc-in-userns" behaves almost same as "rootful" runc except that cgroups errors are ignored.
This PR does not have any impact on CLI flags and `state.json`.
Note about CLI:
* Now `runc --rootless=(auto|true|false)` CLI flag is only used for setting `RootlessCgroups`.
* Now `runc spec --rootless` is only required when `RootlessEUID` is set to true.
For runc-in-userns, `runc spec` without `--rootless` should work, when sufficient numbers of
UID/GID are mapped.
Note about `$XDG_RUNTIME_DIR` (e.g. `/run/user/1000`):
* `$XDG_RUNTIME_DIR` is ignored if runc is being executed as the root (euid == 0) in the initial namespace, for backward compatibility.
(`/run/runc` is used)
* If runc is executed as the root (euid == 0) in an user namespace, `$XDG_RUNTIME_DIR` is honored if `$USER != "" && $USER != "root"`.
This allows unprivileged users to allow execute runc as the root in userns, without mounting writable `/run/runc`.
Note about `state.json`:
* `rootless` is set to true when `RootlessEUID == true && RootlessCgroups == true`.
Signed-off-by: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
2018-07-05 14:28:21 +08:00
|
|
|
if rootlessCg {
|
2018-03-16 08:33:04 +08:00
|
|
|
cgroupManager = libcontainer.RootlessCgroupfs
|
|
|
|
}
|
2016-04-13 05:41:09 +08:00
|
|
|
if context.GlobalBool("systemd-cgroup") {
|
|
|
|
if systemd.UseSystemd() {
|
|
|
|
cgroupManager = libcontainer.SystemdCgroups
|
|
|
|
} else {
|
|
|
|
return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
|
|
|
|
}
|
|
|
|
}
|
2017-07-21 01:33:01 +08:00
|
|
|
|
|
|
|
intelRdtManager := libcontainer.IntelRdtFs
|
2018-10-16 12:37:41 +08:00
|
|
|
if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() {
|
2017-07-21 01:33:01 +08:00
|
|
|
intelRdtManager = nil
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
}
|
2017-07-21 01:33:01 +08:00
|
|
|
|
2017-09-06 20:13:47 +08:00
|
|
|
// We resolve the paths for {newuidmap,newgidmap} from the context of runc,
|
|
|
|
// to avoid doing a path lookup in the nsexec context. TODO: The binary
|
|
|
|
// names are not currently configurable.
|
|
|
|
newuidmap, err := exec.LookPath("newuidmap")
|
|
|
|
if err != nil {
|
|
|
|
newuidmap = ""
|
|
|
|
}
|
|
|
|
newgidmap, err := exec.LookPath("newgidmap")
|
|
|
|
if err != nil {
|
|
|
|
newgidmap = ""
|
|
|
|
}
|
|
|
|
|
2017-07-21 01:33:01 +08:00
|
|
|
return libcontainer.New(abs, cgroupManager, intelRdtManager,
|
|
|
|
libcontainer.CriuPath(context.GlobalString("criu")),
|
2017-09-06 20:13:47 +08:00
|
|
|
libcontainer.NewuidmapPath(newuidmap),
|
|
|
|
libcontainer.NewgidmapPath(newgidmap))
|
2016-04-13 05:41:09 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// getContainer returns the specified container instance by loading it from state
|
|
|
|
// with the default factory.
|
|
|
|
func getContainer(context *cli.Context) (libcontainer.Container, error) {
|
|
|
|
id := context.Args().First()
|
|
|
|
if id == "" {
|
|
|
|
return nil, errEmptyID
|
|
|
|
}
|
|
|
|
factory, err := loadFactory(context)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return factory.Load(id)
|
|
|
|
}
|
|
|
|
|
|
|
|
func fatalf(t string, v ...interface{}) {
|
|
|
|
fatal(fmt.Errorf(t, v...))
|
|
|
|
}
|
|
|
|
|
|
|
|
func getDefaultImagePath(context *cli.Context) string {
|
|
|
|
cwd, err := os.Getwd()
|
|
|
|
if err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
return filepath.Join(cwd, "checkpoint")
|
|
|
|
}
|
|
|
|
|
|
|
|
// newProcess returns a new libcontainer Process with the arguments from the
|
|
|
|
// spec and stdio from the current process.
|
2019-04-19 22:36:52 +08:00
|
|
|
func newProcess(p specs.Process, init bool, logLevel string) (*libcontainer.Process, error) {
|
2016-04-13 05:41:09 +08:00
|
|
|
lp := &libcontainer.Process{
|
|
|
|
Args: p.Args,
|
|
|
|
Env: p.Env,
|
|
|
|
// TODO: fix libcontainer's API to better support uid/gid in a typesafe way.
|
|
|
|
User: fmt.Sprintf("%d:%d", p.User.UID, p.User.GID),
|
|
|
|
Cwd: p.Cwd,
|
|
|
|
Label: p.SelinuxLabel,
|
|
|
|
NoNewPrivileges: &p.NoNewPrivileges,
|
|
|
|
AppArmorProfile: p.ApparmorProfile,
|
2018-06-02 03:56:13 +08:00
|
|
|
Init: init,
|
2019-04-19 22:36:52 +08:00
|
|
|
LogLevel: logLevel,
|
2016-04-13 05:41:09 +08:00
|
|
|
}
|
2017-09-26 21:39:46 +08:00
|
|
|
|
|
|
|
if p.ConsoleSize != nil {
|
|
|
|
lp.ConsoleWidth = uint16(p.ConsoleSize.Width)
|
|
|
|
lp.ConsoleHeight = uint16(p.ConsoleSize.Height)
|
|
|
|
}
|
|
|
|
|
2017-03-15 00:36:38 +08:00
|
|
|
if p.Capabilities != nil {
|
|
|
|
lp.Capabilities = &configs.Capabilities{}
|
|
|
|
lp.Capabilities.Bounding = p.Capabilities.Bounding
|
|
|
|
lp.Capabilities.Effective = p.Capabilities.Effective
|
|
|
|
lp.Capabilities.Inheritable = p.Capabilities.Inheritable
|
|
|
|
lp.Capabilities.Permitted = p.Capabilities.Permitted
|
|
|
|
lp.Capabilities.Ambient = p.Capabilities.Ambient
|
|
|
|
}
|
2016-06-10 18:35:13 +08:00
|
|
|
for _, gid := range p.User.AdditionalGids {
|
|
|
|
lp.AdditionalGroups = append(lp.AdditionalGroups, strconv.FormatUint(uint64(gid), 10))
|
|
|
|
}
|
2016-04-13 05:41:09 +08:00
|
|
|
for _, rlimit := range p.Rlimits {
|
|
|
|
rl, err := createLibContainerRlimit(rlimit)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
lp.Rlimits = append(lp.Rlimits, rl)
|
|
|
|
}
|
|
|
|
return lp, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func destroy(container libcontainer.Container) {
|
|
|
|
if err := container.Destroy(); err != nil {
|
|
|
|
logrus.Error(err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-06-03 23:29:34 +08:00
|
|
|
// setupIO modifies the given process config according to the options.
|
2017-03-03 04:53:06 +08:00
|
|
|
func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, detach bool, sockpath string) (*tty, error) {
|
2016-04-13 05:41:09 +08:00
|
|
|
if createTTY {
|
2016-06-03 23:29:34 +08:00
|
|
|
process.Stdin = nil
|
|
|
|
process.Stdout = nil
|
|
|
|
process.Stderr = nil
|
2017-03-03 04:53:06 +08:00
|
|
|
t := &tty{}
|
|
|
|
if !detach {
|
|
|
|
parent, child, err := utils.NewSockPair("console")
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
process.ConsoleSocket = child
|
|
|
|
t.postStart = append(t.postStart, parent, child)
|
|
|
|
t.consoleC = make(chan error, 1)
|
|
|
|
go func() {
|
|
|
|
if err := t.recvtty(process, parent); err != nil {
|
|
|
|
t.consoleC <- err
|
|
|
|
}
|
|
|
|
t.consoleC <- nil
|
|
|
|
}()
|
|
|
|
} else {
|
|
|
|
// the caller of runc will handle receiving the console master
|
|
|
|
conn, err := net.Dial("unix", sockpath)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
uc, ok := conn.(*net.UnixConn)
|
|
|
|
if !ok {
|
|
|
|
return nil, fmt.Errorf("casting to UnixConn failed")
|
|
|
|
}
|
|
|
|
t.postStart = append(t.postStart, uc)
|
|
|
|
socket, err := uc.File()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
t.postStart = append(t.postStart, socket)
|
|
|
|
process.ConsoleSocket = socket
|
|
|
|
}
|
|
|
|
return t, nil
|
2016-04-13 05:41:09 +08:00
|
|
|
}
|
2017-03-02 02:12:54 +08:00
|
|
|
// when runc will detach the caller provides the stdio to runc via runc's 0,1,2
|
|
|
|
// and the container's process inherits runc's stdio.
|
2016-04-13 05:41:09 +08:00
|
|
|
if detach {
|
2017-03-02 02:12:54 +08:00
|
|
|
if err := inheritStdio(process); err != nil {
|
2016-04-13 05:41:09 +08:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return &tty{}, nil
|
|
|
|
}
|
2017-03-02 02:12:54 +08:00
|
|
|
return setupProcessPipes(process, rootuid, rootgid)
|
2016-04-13 05:41:09 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// createPidFile creates a file with the processes pid inside it atomically
|
|
|
|
// it creates a temp file with the paths filename + '.' infront of it
|
|
|
|
// then renames the file
|
|
|
|
func createPidFile(path string, process *libcontainer.Process) error {
|
|
|
|
pid, err := process.Pid()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
var (
|
|
|
|
tmpDir = filepath.Dir(path)
|
|
|
|
tmpName = filepath.Join(tmpDir, fmt.Sprintf(".%s", filepath.Base(path)))
|
|
|
|
)
|
|
|
|
f, err := os.OpenFile(tmpName, os.O_RDWR|os.O_CREATE|os.O_EXCL|os.O_SYNC, 0666)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
_, err = fmt.Fprintf(f, "%d", pid)
|
|
|
|
f.Close()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
return os.Rename(tmpName, path)
|
|
|
|
}
|
|
|
|
|
|
|
|
func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
|
Disable rootless mode except RootlessCgMgr when executed as the root in userns
This PR decomposes `libcontainer/configs.Config.Rootless bool` into `RootlessEUID bool` and
`RootlessCgroups bool`, so as to make "runc-in-userns" to be more compatible with "rootful" runc.
`RootlessEUID` denotes that runc is being executed as a non-root user (euid != 0) in
the current user namespace. `RootlessEUID` is almost identical to the former `Rootless`
except cgroups stuff.
`RootlessCgroups` denotes that runc is unlikely to have the full access to cgroups.
`RootlessCgroups` is set to false if runc is executed as the root (euid == 0) in the initial namespace.
Otherwise `RootlessCgroups` is set to true.
(Hint: if `RootlessEUID` is true, `RootlessCgroups` becomes true as well)
When runc is executed as the root (euid == 0) in an user namespace (e.g. by Docker-in-LXD, Podman, Usernetes),
`RootlessEUID` is set to false but `RootlessCgroups` is set to true.
So, "runc-in-userns" behaves almost same as "rootful" runc except that cgroups errors are ignored.
This PR does not have any impact on CLI flags and `state.json`.
Note about CLI:
* Now `runc --rootless=(auto|true|false)` CLI flag is only used for setting `RootlessCgroups`.
* Now `runc spec --rootless` is only required when `RootlessEUID` is set to true.
For runc-in-userns, `runc spec` without `--rootless` should work, when sufficient numbers of
UID/GID are mapped.
Note about `$XDG_RUNTIME_DIR` (e.g. `/run/user/1000`):
* `$XDG_RUNTIME_DIR` is ignored if runc is being executed as the root (euid == 0) in the initial namespace, for backward compatibility.
(`/run/runc` is used)
* If runc is executed as the root (euid == 0) in an user namespace, `$XDG_RUNTIME_DIR` is honored if `$USER != "" && $USER != "root"`.
This allows unprivileged users to allow execute runc as the root in userns, without mounting writable `/run/runc`.
Note about `state.json`:
* `rootless` is set to true when `RootlessEUID == true && RootlessCgroups == true`.
Signed-off-by: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
2018-07-05 14:28:21 +08:00
|
|
|
rootlessCg, err := shouldUseRootlessCgroupManager(context)
|
2018-01-13 15:39:28 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-04-13 05:41:09 +08:00
|
|
|
config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
|
|
|
|
CgroupName: id,
|
|
|
|
UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
|
|
|
|
NoPivotRoot: context.Bool("no-pivot"),
|
2016-06-04 02:53:07 +08:00
|
|
|
NoNewKeyring: context.Bool("no-new-keyring"),
|
2016-04-13 05:41:09 +08:00
|
|
|
Spec: spec,
|
Disable rootless mode except RootlessCgMgr when executed as the root in userns
This PR decomposes `libcontainer/configs.Config.Rootless bool` into `RootlessEUID bool` and
`RootlessCgroups bool`, so as to make "runc-in-userns" to be more compatible with "rootful" runc.
`RootlessEUID` denotes that runc is being executed as a non-root user (euid != 0) in
the current user namespace. `RootlessEUID` is almost identical to the former `Rootless`
except cgroups stuff.
`RootlessCgroups` denotes that runc is unlikely to have the full access to cgroups.
`RootlessCgroups` is set to false if runc is executed as the root (euid == 0) in the initial namespace.
Otherwise `RootlessCgroups` is set to true.
(Hint: if `RootlessEUID` is true, `RootlessCgroups` becomes true as well)
When runc is executed as the root (euid == 0) in an user namespace (e.g. by Docker-in-LXD, Podman, Usernetes),
`RootlessEUID` is set to false but `RootlessCgroups` is set to true.
So, "runc-in-userns" behaves almost same as "rootful" runc except that cgroups errors are ignored.
This PR does not have any impact on CLI flags and `state.json`.
Note about CLI:
* Now `runc --rootless=(auto|true|false)` CLI flag is only used for setting `RootlessCgroups`.
* Now `runc spec --rootless` is only required when `RootlessEUID` is set to true.
For runc-in-userns, `runc spec` without `--rootless` should work, when sufficient numbers of
UID/GID are mapped.
Note about `$XDG_RUNTIME_DIR` (e.g. `/run/user/1000`):
* `$XDG_RUNTIME_DIR` is ignored if runc is being executed as the root (euid == 0) in the initial namespace, for backward compatibility.
(`/run/runc` is used)
* If runc is executed as the root (euid == 0) in an user namespace, `$XDG_RUNTIME_DIR` is honored if `$USER != "" && $USER != "root"`.
This allows unprivileged users to allow execute runc as the root in userns, without mounting writable `/run/runc`.
Note about `state.json`:
* `rootless` is set to true when `RootlessEUID == true && RootlessCgroups == true`.
Signed-off-by: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
2018-07-05 14:28:21 +08:00
|
|
|
RootlessEUID: os.Geteuid() != 0,
|
|
|
|
RootlessCgroups: rootlessCg,
|
2016-04-13 05:41:09 +08:00
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
factory, err := loadFactory(context)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return factory.Create(id, config)
|
|
|
|
}
|
|
|
|
|
|
|
|
type runner struct {
|
2018-06-02 03:56:13 +08:00
|
|
|
init bool
|
2016-04-13 05:41:09 +08:00
|
|
|
enableSubreaper bool
|
|
|
|
shouldDestroy bool
|
|
|
|
detach bool
|
|
|
|
listenFDs []*os.File
|
2016-12-14 00:21:36 +08:00
|
|
|
preserveFDs int
|
2016-04-13 05:41:09 +08:00
|
|
|
pidFile string
|
2016-09-03 01:31:54 +08:00
|
|
|
consoleSocket string
|
2016-04-13 05:41:09 +08:00
|
|
|
container libcontainer.Container
|
2017-03-02 15:48:00 +08:00
|
|
|
action CtAct
|
2017-02-03 21:17:34 +08:00
|
|
|
notifySocket *notifySocket
|
2017-03-02 15:48:00 +08:00
|
|
|
criuOpts *libcontainer.CriuOpts
|
2019-04-19 22:36:52 +08:00
|
|
|
logLevel string
|
2016-04-13 05:41:09 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
func (r *runner) run(config *specs.Process) (int, error) {
|
2019-04-10 23:24:43 +08:00
|
|
|
var err error
|
|
|
|
defer func() {
|
|
|
|
if err != nil {
|
|
|
|
r.destroy()
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
if err = r.checkTerminal(config); err != nil {
|
2017-03-03 04:53:06 +08:00
|
|
|
return -1, err
|
|
|
|
}
|
2019-04-19 22:36:52 +08:00
|
|
|
process, err := newProcess(*config, r.init, r.logLevel)
|
2016-04-13 05:41:09 +08:00
|
|
|
if err != nil {
|
|
|
|
return -1, err
|
|
|
|
}
|
|
|
|
if len(r.listenFDs) > 0 {
|
|
|
|
process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
|
|
|
|
process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
|
|
|
|
}
|
2016-12-14 00:21:36 +08:00
|
|
|
baseFd := 3 + len(process.ExtraFiles)
|
|
|
|
for i := baseFd; i < baseFd+r.preserveFDs; i++ {
|
2019-04-10 23:24:43 +08:00
|
|
|
_, err = os.Stat(fmt.Sprintf("/proc/self/fd/%d", i))
|
2019-02-28 16:44:37 +08:00
|
|
|
if err != nil {
|
|
|
|
return -1, errors.Wrapf(err, "please check that preserved-fd %d (of %d) is present", i-baseFd, r.preserveFDs)
|
|
|
|
}
|
2016-12-14 00:21:36 +08:00
|
|
|
process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i)))
|
|
|
|
}
|
2017-03-18 01:32:16 +08:00
|
|
|
rootuid, err := r.container.Config().HostRootUID()
|
2016-04-13 05:41:09 +08:00
|
|
|
if err != nil {
|
|
|
|
return -1, err
|
|
|
|
}
|
2017-03-18 01:32:16 +08:00
|
|
|
rootgid, err := r.container.Config().HostRootGID()
|
2016-04-23 21:39:38 +08:00
|
|
|
if err != nil {
|
|
|
|
return -1, err
|
|
|
|
}
|
2017-03-03 04:53:06 +08:00
|
|
|
var (
|
2017-03-02 15:48:00 +08:00
|
|
|
detach = r.detach || (r.action == CT_ACT_CREATE)
|
2017-03-03 04:53:06 +08:00
|
|
|
)
|
2016-06-03 23:29:34 +08:00
|
|
|
// Setting up IO is a two stage process. We need to modify process to deal
|
|
|
|
// with detaching containers, and then we get a tty after the container has
|
|
|
|
// started.
|
2017-02-03 21:17:34 +08:00
|
|
|
handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
|
2017-03-03 04:53:06 +08:00
|
|
|
tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
|
2016-06-03 23:29:34 +08:00
|
|
|
if err != nil {
|
|
|
|
return -1, err
|
|
|
|
}
|
2017-01-26 09:02:00 +08:00
|
|
|
defer tty.Close()
|
2017-03-02 15:48:00 +08:00
|
|
|
|
|
|
|
switch r.action {
|
|
|
|
case CT_ACT_CREATE:
|
|
|
|
err = r.container.Start(process)
|
|
|
|
case CT_ACT_RESTORE:
|
|
|
|
err = r.container.Restore(process, r.criuOpts)
|
|
|
|
case CT_ACT_RUN:
|
|
|
|
err = r.container.Run(process)
|
|
|
|
default:
|
|
|
|
panic("Unknown action")
|
|
|
|
}
|
|
|
|
if err != nil {
|
2016-04-13 05:41:09 +08:00
|
|
|
return -1, err
|
|
|
|
}
|
2019-04-10 23:24:43 +08:00
|
|
|
if err = tty.waitConsole(); err != nil {
|
2017-03-03 04:53:06 +08:00
|
|
|
r.terminate(process)
|
|
|
|
return -1, err
|
2016-09-03 01:31:54 +08:00
|
|
|
}
|
2017-01-07 08:21:23 +08:00
|
|
|
if err = tty.ClosePostStart(); err != nil {
|
2016-04-13 05:41:09 +08:00
|
|
|
r.terminate(process)
|
|
|
|
return -1, err
|
|
|
|
}
|
|
|
|
if r.pidFile != "" {
|
2017-01-07 08:21:23 +08:00
|
|
|
if err = createPidFile(r.pidFile, process); err != nil {
|
2016-04-13 05:41:09 +08:00
|
|
|
r.terminate(process)
|
|
|
|
return -1, err
|
|
|
|
}
|
|
|
|
}
|
2017-02-09 19:27:12 +08:00
|
|
|
status, err := handler.forward(process, tty, detach)
|
2016-04-13 05:41:09 +08:00
|
|
|
if err != nil {
|
|
|
|
r.terminate(process)
|
|
|
|
}
|
2017-02-09 19:27:12 +08:00
|
|
|
if detach {
|
|
|
|
return 0, nil
|
|
|
|
}
|
2016-04-13 05:41:09 +08:00
|
|
|
r.destroy()
|
|
|
|
return status, err
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *runner) destroy() {
|
|
|
|
if r.shouldDestroy {
|
|
|
|
destroy(r.container)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *runner) terminate(p *libcontainer.Process) {
|
2017-05-11 23:06:37 +08:00
|
|
|
_ = p.Signal(unix.SIGKILL)
|
2017-01-07 08:21:23 +08:00
|
|
|
_, _ = p.Wait()
|
2016-04-13 05:41:09 +08:00
|
|
|
}
|
|
|
|
|
2017-03-03 04:53:06 +08:00
|
|
|
func (r *runner) checkTerminal(config *specs.Process) error {
|
2017-03-02 15:48:00 +08:00
|
|
|
detach := r.detach || (r.action == CT_ACT_CREATE)
|
2017-03-03 04:53:06 +08:00
|
|
|
// Check command-line for sanity.
|
|
|
|
if detach && config.Terminal && r.consoleSocket == "" {
|
|
|
|
return fmt.Errorf("cannot allocate tty if runc will detach without setting console socket")
|
|
|
|
}
|
|
|
|
if (!detach || !config.Terminal) && r.consoleSocket != "" {
|
|
|
|
return fmt.Errorf("cannot use console socket if runc will not detach or allocate tty")
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-04-13 05:41:09 +08:00
|
|
|
func validateProcessSpec(spec *specs.Process) error {
|
|
|
|
if spec.Cwd == "" {
|
|
|
|
return fmt.Errorf("Cwd property must not be empty")
|
|
|
|
}
|
|
|
|
if !filepath.IsAbs(spec.Cwd) {
|
|
|
|
return fmt.Errorf("Cwd must be an absolute path")
|
|
|
|
}
|
|
|
|
if len(spec.Args) == 0 {
|
|
|
|
return fmt.Errorf("args must not be empty")
|
|
|
|
}
|
2019-04-03 12:08:06 +08:00
|
|
|
if spec.SelinuxLabel != "" && !selinux.GetEnabled() {
|
|
|
|
return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported")
|
|
|
|
}
|
2016-04-13 05:41:09 +08:00
|
|
|
return nil
|
|
|
|
}
|
2016-05-26 02:24:26 +08:00
|
|
|
|
2017-03-02 15:48:00 +08:00
|
|
|
// CtAct identifies the action a runner performs on its container.
type CtAct uint8

// Supported container actions. Values start at 1, so the zero value of CtAct
// does not name any action.
const (
	// CT_ACT_CREATE starts the process via the container's Start method and
	// implies detaching.
	CT_ACT_CREATE CtAct = iota + 1
	// CT_ACT_RUN starts the process via the container's Run method.
	CT_ACT_RUN
	// CT_ACT_RESTORE restores the process via the container's Restore method.
	CT_ACT_RESTORE
)
|
|
|
|
|
|
|
|
func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) {
|
2016-05-26 02:24:26 +08:00
|
|
|
id := context.Args().First()
|
|
|
|
if id == "" {
|
|
|
|
return -1, errEmptyID
|
|
|
|
}
|
2017-02-03 21:17:34 +08:00
|
|
|
|
|
|
|
notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id)
|
|
|
|
if notifySocket != nil {
|
2018-05-26 00:04:06 +08:00
|
|
|
if err := notifySocket.setupSpec(context, spec); err != nil {
|
|
|
|
return -1, err
|
|
|
|
}
|
2017-02-03 21:17:34 +08:00
|
|
|
}
|
|
|
|
|
2016-05-26 02:24:26 +08:00
|
|
|
container, err := createContainer(context, id, spec)
|
|
|
|
if err != nil {
|
|
|
|
return -1, err
|
|
|
|
}
|
2017-02-03 21:17:34 +08:00
|
|
|
|
|
|
|
if notifySocket != nil {
|
2018-05-26 00:04:06 +08:00
|
|
|
err := notifySocket.setupSocketDirectory()
|
2017-08-17 17:41:19 +08:00
|
|
|
if err != nil {
|
|
|
|
return -1, err
|
|
|
|
}
|
2018-05-26 00:04:06 +08:00
|
|
|
if action == CT_ACT_RUN {
|
|
|
|
err := notifySocket.bindSocket()
|
|
|
|
if err != nil {
|
|
|
|
return -1, err
|
|
|
|
}
|
|
|
|
}
|
2017-02-03 21:17:34 +08:00
|
|
|
}
|
|
|
|
|
2016-05-26 02:24:26 +08:00
|
|
|
// Support on-demand socket activation by passing file descriptors into the container init process.
|
|
|
|
listenFDs := []*os.File{}
|
|
|
|
if os.Getenv("LISTEN_FDS") != "" {
|
|
|
|
listenFDs = activation.Files(false)
|
|
|
|
}
|
2019-04-19 22:36:52 +08:00
|
|
|
|
|
|
|
logLevel := "info"
|
|
|
|
if context.GlobalBool("debug") {
|
|
|
|
logLevel = "debug"
|
|
|
|
}
|
|
|
|
|
2016-05-26 02:24:26 +08:00
|
|
|
r := &runner{
|
|
|
|
enableSubreaper: !context.Bool("no-subreaper"),
|
|
|
|
shouldDestroy: true,
|
|
|
|
container: container,
|
|
|
|
listenFDs: listenFDs,
|
2017-02-03 21:17:34 +08:00
|
|
|
notifySocket: notifySocket,
|
2016-09-03 01:31:54 +08:00
|
|
|
consoleSocket: context.String("console-socket"),
|
2016-10-20 17:57:37 +08:00
|
|
|
detach: context.Bool("detach"),
|
2016-05-26 02:24:26 +08:00
|
|
|
pidFile: context.String("pid-file"),
|
2016-12-14 00:21:36 +08:00
|
|
|
preserveFDs: context.Int("preserve-fds"),
|
2017-03-02 15:48:00 +08:00
|
|
|
action: action,
|
|
|
|
criuOpts: criuOpts,
|
2018-06-02 03:56:13 +08:00
|
|
|
init: true,
|
2019-04-19 22:36:52 +08:00
|
|
|
logLevel: logLevel,
|
2016-05-26 02:24:26 +08:00
|
|
|
}
|
2017-06-02 07:17:21 +08:00
|
|
|
return r.run(spec.Process)
|
2016-05-26 02:24:26 +08:00
|
|
|
}
|