2015-02-07 13:12:27 +08:00
|
|
|
// +build linux
|
|
|
|
|
|
|
|
package libcontainer
|
|
|
|
|
|
|
|
import (
|
|
|
|
"encoding/json"
|
2015-03-28 01:50:32 +08:00
|
|
|
"errors"
|
2020-05-19 19:46:31 +08:00
|
|
|
"fmt"
|
2015-02-07 13:12:27 +08:00
|
|
|
"io"
|
|
|
|
"os"
|
|
|
|
"os/exec"
|
2015-03-19 11:22:21 +08:00
|
|
|
"path/filepath"
|
|
|
|
"strconv"
|
2015-02-07 13:12:27 +08:00
|
|
|
|
2015-06-22 10:29:59 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
2020-05-19 19:46:31 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
|
2015-09-11 08:57:31 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
More information about Intel RDT/CAT can be found in section 17.17
of the Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belong to this group (e.g., the
"<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
is in the root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
A valid L3 cache CBM is a *contiguous set of bits*, and the number of bits that
can be set is less than the max bit. The max number of bits in the CBM varies
among supported Intel Xeon platforms. In the Intel RDT "resource control"
filesystem layout, the CBM in a group should be a subset of the CBM in root.
The kernel will check if it is valid when writing. E.g., 0xfffff in root
indicates the max CBM length is 20 bits, which maps to the entire L3 cache
capacity. Some valid CBM values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00, etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/intelrdt"
|
2019-04-19 22:36:52 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/logs"
|
2015-06-22 10:29:59 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/system"
|
2016-01-26 10:15:44 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
2017-05-10 05:38:27 +08:00
|
|
|
|
2020-05-19 19:46:31 +08:00
|
|
|
"github.com/sirupsen/logrus"
|
2017-05-10 05:38:27 +08:00
|
|
|
"golang.org/x/sys/unix"
|
2015-02-07 13:12:27 +08:00
|
|
|
)
|
|
|
|
|
2016-11-14 20:54:17 +08:00
|
|
|
// createCgroupns is the synchronisation value for cgroup namespace setup.
// The same constant is defined in nsexec.c as "CREATECGROUPNS".
const createCgroupns = 0x80
|
|
|
|
|
2015-02-07 13:12:27 +08:00
|
|
|
// parentProcess is the set of operations used to drive a process from the
// parent side. Both setnsProcess and initProcess provide these methods.
type parentProcess interface {
	// pid returns the pid for the running process.
	pid() int

	// start starts the process execution.
	start() error

	// terminate sends a SIGKILL to the process and waits for the exit.
	terminate() error

	// wait waits on the process returning the process state.
	wait() (*os.ProcessState, error)

	// startTime returns the process start time.
	startTime() (uint64, error)

	// signal delivers the given signal to the process.
	signal(os.Signal) error

	// externalDescriptors returns the descriptor list most recently stored
	// with setExternalDescriptors.
	externalDescriptors() []string

	// setExternalDescriptors replaces the stored descriptor list with fds.
	setExternalDescriptors(fds []string)

	// forwardChildLogs starts forwarding of the logs written by the child.
	forwardChildLogs()
}
|
|
|
|
|
2019-04-23 22:02:31 +08:00
|
|
|
// filePair holds the two ends of a file-backed channel shared between the
// parent and the child it spawns.
type filePair struct {
	// parent is the end kept by the parent process.
	parent *os.File
	// child is the end handed to the child; the parent closes its copy
	// once the child command has been started.
	child *os.File
}
|
|
|
|
|
|
|
|
type setnsProcess struct {
|
Disable rootless mode except RootlessCgMgr when executed as the root in userns
This PR decomposes `libcontainer/configs.Config.Rootless bool` into `RootlessEUID bool` and
`RootlessCgroups bool`, so as to make "runc-in-userns" to be more compatible with "rootful" runc.
`RootlessEUID` denotes that runc is being executed as a non-root user (euid != 0) in
the current user namespace. `RootlessEUID` is almost identical to the former `Rootless`
except cgroups stuff.
`RootlessCgroups` denotes that runc is unlikely to have the full access to cgroups.
`RootlessCgroups` is set to false if runc is executed as the root (euid == 0) in the initial namespace.
Otherwise `RootlessCgroups` is set to true.
(Hint: if `RootlessEUID` is true, `RootlessCgroups` becomes true as well)
When runc is executed as the root (euid == 0) in an user namespace (e.g. by Docker-in-LXD, Podman, Usernetes),
`RootlessEUID` is set to false but `RootlessCgroups` is set to true.
So, "runc-in-userns" behaves almost same as "rootful" runc except that cgroups errors are ignored.
This PR does not have any impact on CLI flags and `state.json`.
Note about CLI:
* Now `runc --rootless=(auto|true|false)` CLI flag is only used for setting `RootlessCgroups`.
* Now `runc spec --rootless` is only required when `RootlessEUID` is set to true.
For runc-in-userns, `runc spec` without `--rootless` should work, when sufficient numbers of
UID/GID are mapped.
Note about `$XDG_RUNTIME_DIR` (e.g. `/run/user/1000`):
* `$XDG_RUNTIME_DIR` is ignored if runc is being executed as the root (euid == 0) in the initial namespace, for backward compatibility.
(`/run/runc` is used)
* If runc is executed as the root (euid == 0) in an user namespace, `$XDG_RUNTIME_DIR` is honored if `$USER != "" && $USER != "root"`.
This allows unprivileged users to allow execute runc as the root in userns, without mounting writable `/run/runc`.
Note about `state.json`:
* `rootless` is set to true when `RootlessEUID == true && RootlessCgroups == true`.
Signed-off-by: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
2018-07-05 14:28:21 +08:00
|
|
|
cmd *exec.Cmd
|
2019-04-23 22:02:31 +08:00
|
|
|
messageSockPair filePair
|
|
|
|
logFilePair filePair
|
Disable rootless mode except RootlessCgMgr when executed as the root in userns
This PR decomposes `libcontainer/configs.Config.Rootless bool` into `RootlessEUID bool` and
`RootlessCgroups bool`, so as to make "runc-in-userns" to be more compatible with "rootful" runc.
`RootlessEUID` denotes that runc is being executed as a non-root user (euid != 0) in
the current user namespace. `RootlessEUID` is almost identical to the former `Rootless`
except cgroups stuff.
`RootlessCgroups` denotes that runc is unlikely to have the full access to cgroups.
`RootlessCgroups` is set to false if runc is executed as the root (euid == 0) in the initial namespace.
Otherwise `RootlessCgroups` is set to true.
(Hint: if `RootlessEUID` is true, `RootlessCgroups` becomes true as well)
When runc is executed as the root (euid == 0) in an user namespace (e.g. by Docker-in-LXD, Podman, Usernetes),
`RootlessEUID` is set to false but `RootlessCgroups` is set to true.
So, "runc-in-userns" behaves almost same as "rootful" runc except that cgroups errors are ignored.
This PR does not have any impact on CLI flags and `state.json`.
Note about CLI:
* Now `runc --rootless=(auto|true|false)` CLI flag is only used for setting `RootlessCgroups`.
* Now `runc spec --rootless` is only required when `RootlessEUID` is set to true.
For runc-in-userns, `runc spec` without `--rootless` should work, when sufficient numbers of
UID/GID are mapped.
Note about `$XDG_RUNTIME_DIR` (e.g. `/run/user/1000`):
* `$XDG_RUNTIME_DIR` is ignored if runc is being executed as the root (euid == 0) in the initial namespace, for backward compatibility.
(`/run/runc` is used)
* If runc is executed as the root (euid == 0) in an user namespace, `$XDG_RUNTIME_DIR` is honored if `$USER != "" && $USER != "root"`.
This allows unprivileged users to allow execute runc as the root in userns, without mounting writable `/run/runc`.
Note about `state.json`:
* `rootless` is set to true when `RootlessEUID == true && RootlessCgroups == true`.
Signed-off-by: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
2018-07-05 14:28:21 +08:00
|
|
|
cgroupPaths map[string]string
|
|
|
|
rootlessCgroups bool
|
|
|
|
intelRdtPath string
|
|
|
|
config *initConfig
|
|
|
|
fds []string
|
|
|
|
process *Process
|
|
|
|
bootstrapData io.Reader
|
2020-05-19 19:46:31 +08:00
|
|
|
initProcessPid int
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
|
2017-06-15 06:38:45 +08:00
|
|
|
func (p *setnsProcess) startTime() (uint64, error) {
|
|
|
|
stat, err := system.Stat(p.pid())
|
|
|
|
return stat.StartTime, err
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
|
2015-03-28 01:50:32 +08:00
|
|
|
func (p *setnsProcess) signal(sig os.Signal) error {
|
2020-04-19 07:05:10 +08:00
|
|
|
s, ok := sig.(unix.Signal)
|
2015-03-28 01:50:32 +08:00
|
|
|
if !ok {
|
|
|
|
return errors.New("os: unsupported signal type")
|
|
|
|
}
|
2017-05-10 05:38:27 +08:00
|
|
|
return unix.Kill(p.pid(), s)
|
2015-02-07 14:33:10 +08:00
|
|
|
}
|
|
|
|
|
2015-02-07 13:12:27 +08:00
|
|
|
func (p *setnsProcess) start() (err error) {
|
2019-04-23 22:02:31 +08:00
|
|
|
defer p.messageSockPair.parent.Close()
|
2015-10-17 23:14:26 +08:00
|
|
|
err = p.cmd.Start()
|
2018-08-04 01:11:20 +08:00
|
|
|
// close the write-side of the pipes (controlled by child)
|
2019-04-23 22:02:31 +08:00
|
|
|
p.messageSockPair.child.Close()
|
|
|
|
p.logFilePair.child.Close()
|
2015-10-17 23:14:26 +08:00
|
|
|
if err != nil {
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCause(err, "starting setns process")
|
2015-10-17 23:14:26 +08:00
|
|
|
}
|
|
|
|
if p.bootstrapData != nil {
|
2019-04-23 22:02:31 +08:00
|
|
|
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
|
2015-10-17 23:14:26 +08:00
|
|
|
}
|
|
|
|
}
|
2015-02-23 17:26:43 +08:00
|
|
|
if err = p.execSetns(); err != nil {
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCause(err, "executing setns process")
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
2017-09-15 17:39:35 +08:00
|
|
|
if len(p.cgroupPaths) > 0 {
|
Disable rootless mode except RootlessCgMgr when executed as the root in userns
This PR decomposes `libcontainer/configs.Config.Rootless bool` into `RootlessEUID bool` and
`RootlessCgroups bool`, so as to make "runc-in-userns" to be more compatible with "rootful" runc.
`RootlessEUID` denotes that runc is being executed as a non-root user (euid != 0) in
the current user namespace. `RootlessEUID` is almost identical to the former `Rootless`
except cgroups stuff.
`RootlessCgroups` denotes that runc is unlikely to have the full access to cgroups.
`RootlessCgroups` is set to false if runc is executed as the root (euid == 0) in the initial namespace.
Otherwise `RootlessCgroups` is set to true.
(Hint: if `RootlessEUID` is true, `RootlessCgroups` becomes true as well)
When runc is executed as the root (euid == 0) in an user namespace (e.g. by Docker-in-LXD, Podman, Usernetes),
`RootlessEUID` is set to false but `RootlessCgroups` is set to true.
So, "runc-in-userns" behaves almost same as "rootful" runc except that cgroups errors are ignored.
This PR does not have any impact on CLI flags and `state.json`.
Note about CLI:
* Now `runc --rootless=(auto|true|false)` CLI flag is only used for setting `RootlessCgroups`.
* Now `runc spec --rootless` is only required when `RootlessEUID` is set to true.
For runc-in-userns, `runc spec` without `--rootless` should work, when sufficient numbers of
UID/GID are mapped.
Note about `$XDG_RUNTIME_DIR` (e.g. `/run/user/1000`):
* `$XDG_RUNTIME_DIR` is ignored if runc is being executed as the root (euid == 0) in the initial namespace, for backward compatibility.
(`/run/runc` is used)
* If runc is executed as the root (euid == 0) in an user namespace, `$XDG_RUNTIME_DIR` is honored if `$USER != "" && $USER != "root"`.
This allows unprivileged users to allow execute runc as the root in userns, without mounting writable `/run/runc`.
Note about `state.json`:
* `rootless` is set to true when `RootlessEUID == true && RootlessCgroups == true`.
Signed-off-by: Akihiro Suda <suda.akihiro@lab.ntt.co.jp>
2018-07-05 14:28:21 +08:00
|
|
|
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups {
|
2020-05-19 19:46:31 +08:00
|
|
|
// On cgroup v2 + nesting + domain controllers, EnterPid may fail with EBUSY.
|
|
|
|
// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
|
|
|
|
// Try to join the cgroup of InitProcessPid.
|
|
|
|
if cgroups.IsCgroup2UnifiedMode() {
|
|
|
|
initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
|
|
|
|
initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
|
|
|
|
if initCgErr == nil {
|
|
|
|
if initCgPath, ok := initCg[""]; ok {
|
|
|
|
initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
|
|
|
|
logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)",
|
|
|
|
p.pid(), p.cgroupPaths, err, initCg, initCgDirpath)
|
|
|
|
// NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
|
|
|
|
err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
|
|
|
|
}
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
}
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
if p.intelRdtPath != "" {
|
|
|
|
// if Intel RDT "resource control" filesystem path exists
|
|
|
|
_, err := os.Stat(p.intelRdtPath)
|
|
|
|
if err == nil {
|
|
|
|
if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
|
|
|
|
return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-03-25 23:03:30 +08:00
|
|
|
// set rlimits, this has to be done here because we lose permissions
|
|
|
|
// to raise the limits once we enter a user-namespace
|
|
|
|
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCause(err, "setting rlimits for process")
|
2016-03-25 23:03:30 +08:00
|
|
|
}
|
2019-04-23 22:02:31 +08:00
|
|
|
if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil {
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCause(err, "writing config to pipe")
|
2016-03-22 06:33:17 +08:00
|
|
|
}
|
2015-12-17 17:16:34 +08:00
|
|
|
|
2019-04-23 22:02:31 +08:00
|
|
|
ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
|
2016-06-06 18:26:35 +08:00
|
|
|
switch sync.Type {
|
|
|
|
case procReady:
|
|
|
|
// This shouldn't happen.
|
|
|
|
panic("unexpected procReady in setns")
|
|
|
|
case procHooks:
|
|
|
|
// This shouldn't happen.
|
|
|
|
panic("unexpected procHooks in setns")
|
|
|
|
default:
|
2020-05-17 08:20:44 +08:00
|
|
|
return newSystemError(errors.New("invalid JSON payload from child"))
|
2016-06-06 18:26:35 +08:00
|
|
|
}
|
|
|
|
})
|
|
|
|
|
2019-04-23 22:02:31 +08:00
|
|
|
if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCause(err, "calling shutdown on init pipe")
|
2015-02-28 07:55:53 +08:00
|
|
|
}
|
2015-12-17 17:16:34 +08:00
|
|
|
// Must be done after Shutdown so the child will exit and we can wait for it.
|
2015-02-28 07:55:53 +08:00
|
|
|
if ierr != nil {
|
2015-10-06 07:38:27 +08:00
|
|
|
p.wait()
|
2016-04-19 02:37:26 +08:00
|
|
|
return ierr
|
2015-02-28 07:55:53 +08:00
|
|
|
}
|
2015-02-07 13:12:27 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// execSetns runs the process that executes C code to perform the setns calls
|
|
|
|
// because setns support requires the C process to fork off a child and perform the setns
|
|
|
|
// before the go runtime boots, we wait on the process to die and receive the child's pid
|
|
|
|
// over the provided pipe.
|
2015-02-23 17:26:43 +08:00
|
|
|
func (p *setnsProcess) execSetns() error {
|
2015-02-07 13:12:27 +08:00
|
|
|
status, err := p.cmd.Process.Wait()
|
|
|
|
if err != nil {
|
2015-02-23 17:26:43 +08:00
|
|
|
p.cmd.Wait()
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCause(err, "waiting on setns process to finish")
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
if !status.Success() {
|
2015-02-23 17:26:43 +08:00
|
|
|
p.cmd.Wait()
|
|
|
|
return newSystemError(&exec.ExitError{ProcessState: status})
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
var pid *pid
|
2019-04-23 22:02:31 +08:00
|
|
|
if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
|
2015-02-23 17:26:43 +08:00
|
|
|
p.cmd.Wait()
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCause(err, "reading pid from init pipe")
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
2017-05-06 19:34:32 +08:00
|
|
|
|
|
|
|
// Clean up the zombie parent process
|
2019-03-21 10:24:04 +08:00
|
|
|
// On Unix systems FindProcess always succeeds.
|
|
|
|
firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
|
2017-05-06 19:34:32 +08:00
|
|
|
|
|
|
|
// Ignore the error in case the child has already been reaped for any reason
|
|
|
|
_, _ = firstChildProcess.Wait()
|
|
|
|
|
2015-02-23 17:26:43 +08:00
|
|
|
process, err := os.FindProcess(pid.Pid)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
p.cmd.Process = process
|
2015-11-07 08:49:06 +08:00
|
|
|
p.process.ops = p
|
2015-02-23 17:26:43 +08:00
|
|
|
return nil
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// terminate sends a SIGKILL to the forked process for the setns routine then waits to
|
2016-10-12 07:22:48 +08:00
|
|
|
// avoid the process becoming a zombie.
|
2015-02-07 13:12:27 +08:00
|
|
|
func (p *setnsProcess) terminate() error {
|
2015-04-23 02:30:42 +08:00
|
|
|
if p.cmd.Process == nil {
|
|
|
|
return nil
|
|
|
|
}
|
2015-02-23 17:26:43 +08:00
|
|
|
err := p.cmd.Process.Kill()
|
2015-02-07 13:12:27 +08:00
|
|
|
if _, werr := p.wait(); err == nil {
|
|
|
|
err = werr
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
func (p *setnsProcess) wait() (*os.ProcessState, error) {
|
2015-02-23 17:26:43 +08:00
|
|
|
err := p.cmd.Wait()
|
|
|
|
|
2015-08-12 22:37:34 +08:00
|
|
|
// Return actual ProcessState even on Wait error
|
|
|
|
return p.cmd.ProcessState, err
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
func (p *setnsProcess) pid() int {
|
2015-02-23 17:26:43 +08:00
|
|
|
return p.cmd.Process.Pid
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
|
2015-04-29 19:52:17 +08:00
|
|
|
func (p *setnsProcess) externalDescriptors() []string {
|
2015-04-29 04:54:03 +08:00
|
|
|
return p.fds
|
|
|
|
}
|
|
|
|
|
2015-04-29 19:52:17 +08:00
|
|
|
func (p *setnsProcess) setExternalDescriptors(newFds []string) {
|
2015-04-29 04:54:03 +08:00
|
|
|
p.fds = newFds
|
2015-04-29 03:13:57 +08:00
|
|
|
}
|
|
|
|
|
2019-04-04 19:57:28 +08:00
|
|
|
func (p *setnsProcess) forwardChildLogs() {
|
2019-04-23 22:02:31 +08:00
|
|
|
go logs.ForwardLogs(p.logFilePair.parent)
|
2018-08-04 01:11:20 +08:00
|
|
|
}
|
|
|
|
|
2015-02-07 13:12:27 +08:00
|
|
|
type initProcess struct {
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
cmd *exec.Cmd
|
2019-04-23 22:02:31 +08:00
|
|
|
messageSockPair filePair
|
|
|
|
logFilePair filePair
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
config *initConfig
|
|
|
|
manager cgroups.Manager
|
|
|
|
intelRdtManager intelrdt.Manager
|
|
|
|
container *linuxContainer
|
|
|
|
fds []string
|
|
|
|
process *Process
|
|
|
|
bootstrapData io.Reader
|
|
|
|
sharePidns bool
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
func (p *initProcess) pid() int {
|
|
|
|
return p.cmd.Process.Pid
|
|
|
|
}
|
|
|
|
|
2015-04-29 19:52:17 +08:00
|
|
|
func (p *initProcess) externalDescriptors() []string {
|
2015-04-29 03:13:57 +08:00
|
|
|
return p.fds
|
|
|
|
}
|
|
|
|
|
2016-11-14 20:54:17 +08:00
|
|
|
// getChildPid receives the final child's pid over the provided pipe.
|
|
|
|
func (p *initProcess) getChildPid() (int, error) {
|
|
|
|
var pid pid
|
2019-04-23 22:02:31 +08:00
|
|
|
if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
|
2016-11-14 20:54:17 +08:00
|
|
|
p.cmd.Wait()
|
|
|
|
return -1, err
|
|
|
|
}
|
2019-03-21 10:24:04 +08:00
|
|
|
|
|
|
|
// Clean up the zombie parent process
|
|
|
|
// On Unix systems FindProcess always succeeds.
|
|
|
|
firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
|
|
|
|
|
|
|
|
// Ignore the error in case the child has already been reaped for any reason
|
|
|
|
_, _ = firstChildProcess.Wait()
|
|
|
|
|
2016-11-14 20:54:17 +08:00
|
|
|
return pid.Pid, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (p *initProcess) waitForChildExit(childPid int) error {
|
2015-09-14 08:40:43 +08:00
|
|
|
status, err := p.cmd.Process.Wait()
|
|
|
|
if err != nil {
|
|
|
|
p.cmd.Wait()
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if !status.Success() {
|
|
|
|
p.cmd.Wait()
|
|
|
|
return &exec.ExitError{ProcessState: status}
|
|
|
|
}
|
2017-05-06 19:34:32 +08:00
|
|
|
|
2016-11-14 20:54:17 +08:00
|
|
|
process, err := os.FindProcess(childPid)
|
2015-09-14 08:40:43 +08:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
p.cmd.Process = process
|
2016-05-20 08:28:58 +08:00
|
|
|
p.process.ops = p
|
2015-09-14 08:40:43 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2020-03-10 00:29:03 +08:00
|
|
|
func (p *initProcess) start() (retErr error) {
|
2019-04-23 22:02:31 +08:00
|
|
|
defer p.messageSockPair.parent.Close()
|
2015-09-14 08:40:43 +08:00
|
|
|
err := p.cmd.Start()
|
2015-11-07 08:49:06 +08:00
|
|
|
p.process.ops = p
|
2018-08-04 01:11:20 +08:00
|
|
|
// close the write-side of the pipes (controlled by child)
|
2019-04-23 22:02:31 +08:00
|
|
|
p.messageSockPair.child.Close()
|
|
|
|
p.logFilePair.child.Close()
|
2015-02-07 13:12:27 +08:00
|
|
|
if err != nil {
|
2015-11-07 08:49:06 +08:00
|
|
|
p.process.ops = nil
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCause(err, "starting init process command")
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
2020-03-10 00:29:03 +08:00
|
|
|
defer func() {
|
|
|
|
if retErr != nil {
|
|
|
|
p.manager.Destroy()
|
|
|
|
if p.intelRdtManager != nil {
|
|
|
|
p.intelRdtManager.Destroy()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
2016-04-26 00:19:39 +08:00
|
|
|
// Do this before syncing with child so that no children can escape the
|
|
|
|
// cgroup. We don't need to worry about not doing this and not being root
|
|
|
|
// because we'd be using the rootless cgroup manager in that case.
|
|
|
|
if err := p.manager.Apply(p.pid()); err != nil {
|
|
|
|
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
if p.intelRdtManager != nil {
|
|
|
|
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
|
|
|
|
return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
|
|
|
|
}
|
|
|
|
}
|
2019-04-23 22:02:31 +08:00
|
|
|
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
|
2017-09-07 23:27:33 +08:00
|
|
|
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
|
|
|
|
}
|
2016-11-14 20:54:17 +08:00
|
|
|
childPid, err := p.getChildPid()
|
|
|
|
if err != nil {
|
|
|
|
return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
|
2017-09-07 23:27:33 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Save the standard descriptor names before the container process
|
|
|
|
// can potentially move them (e.g., via dup2()). If we don't do this now,
|
|
|
|
// we won't know at checkpoint time which file descriptor to look up.
|
2016-11-14 20:54:17 +08:00
|
|
|
fds, err := getPipeFds(childPid)
|
2017-09-07 23:27:33 +08:00
|
|
|
if err != nil {
|
2016-11-14 20:54:17 +08:00
|
|
|
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
|
2017-09-07 23:27:33 +08:00
|
|
|
}
|
|
|
|
p.setExternalDescriptors(fds)
|
2020-06-02 10:48:53 +08:00
|
|
|
|
2016-11-14 20:54:17 +08:00
|
|
|
// Now it's time to setup cgroup namesapce
|
|
|
|
if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
|
2019-04-23 22:02:31 +08:00
|
|
|
if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil {
|
2016-11-14 20:54:17 +08:00
|
|
|
return newSystemErrorWithCause(err, "sending synchronization value to init process")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Wait for our first child to exit
|
|
|
|
if err := p.waitForChildExit(childPid); err != nil {
|
|
|
|
return newSystemErrorWithCause(err, "waiting for our first child to exit")
|
|
|
|
}
|
|
|
|
|
2015-02-07 13:12:27 +08:00
|
|
|
if err := p.createNetworkInterfaces(); err != nil {
|
2016-10-11 16:38:15 +08:00
|
|
|
return newSystemErrorWithCause(err, "creating network interfaces")
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
if err := p.sendConfig(); err != nil {
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCause(err, "sending config to init process")
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
2015-12-17 17:16:34 +08:00
|
|
|
var (
|
2016-02-17 18:20:06 +08:00
|
|
|
sentRun bool
|
|
|
|
sentResume bool
|
2015-12-17 17:16:34 +08:00
|
|
|
)
|
|
|
|
|
2019-04-23 22:02:31 +08:00
|
|
|
ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
|
2016-06-06 18:26:35 +08:00
|
|
|
switch sync.Type {
|
2015-12-17 17:16:34 +08:00
|
|
|
case procReady:
|
2016-03-25 23:03:30 +08:00
|
|
|
// set rlimits, this has to be done here because we lose permissions
|
|
|
|
// to raise the limits once we enter a user-namespace
|
|
|
|
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCause(err, "setting rlimits for ready process")
|
2016-03-25 23:03:30 +08:00
|
|
|
}
|
2016-02-03 09:27:44 +08:00
|
|
|
// call prestart hooks
|
|
|
|
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
|
2016-12-20 19:21:10 +08:00
|
|
|
// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
|
|
|
|
if err := p.manager.Set(p.config.Config); err != nil {
|
|
|
|
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
|
|
|
|
}
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
if p.intelRdtManager != nil {
|
|
|
|
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
|
|
|
|
return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
|
|
|
|
}
|
|
|
|
}
|
2016-12-20 19:21:10 +08:00
|
|
|
|
2016-02-03 09:27:44 +08:00
|
|
|
if p.config.Config.Hooks != nil {
|
libcontainer: Set 'status' in hook stdin
Finish off the work started in a344b2d6 (sync up `HookState` with OCI
spec `State`, 2016-12-19, #1201).
And drop HookState, since there's no need for a local alias for
specs.State.
Also set c.initProcess in newInitProcess to support OCIState calls
from within initProcess.start(). I think the cyclic references
between linuxContainer and initProcess are unfortunate, but didn't
want to address that here.
I've also left the timing of the Prestart hooks alone, although the
spec calls for them to happen before start (not as part of creation)
[1,2]. Once the timing gets fixed we can drop the
initProcessStartTime hacks which initProcess.start currently needs.
I'm not sure why we trigger the prestart hooks in response to both
procReady and procHooks. But we've had two prestart rounds in
initProcess.start since 2f276498 (Move pre-start hooks after container
mounts, 2016-02-17, #568). I've left that alone too.
I really think we should have len() guards to avoid computing the
state when .Hooks is non-nil but the particular phase we're looking at
is empty. Aleksa, however, is adamantly against them [3] citing a
risk of sloppy copy/pastes causing the hook slice being len-guarded to
diverge from the hook slice being iterated over within the guard. I
think that ort of thing is very lo-risk, because:
* We shouldn't be copy/pasting this, right? DRY for the win :).
* There's only ever a few lines between the guard and the guarded
loop. That makes broken copy/pastes easy to catch in review.
* We should have test coverage for these. Guarding with the wrong
slice is certainly not the only thing you can break with a sloppy
copy/paste.
But I'm not a maintainer ;).
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.0/config.md#prestart
[2]: https://github.com/opencontainers/runc/issues/1710
[3]: https://github.com/opencontainers/runc/pull/1741#discussion_r233331570
Signed-off-by: W. Trevor King <wking@tremily.us>
2018-02-26 06:47:41 +08:00
|
|
|
s, err := p.container.currentOCIState()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
2016-02-03 09:27:44 +08:00
|
|
|
}
|
libcontainer: Set 'status' in hook stdin
Finish off the work started in a344b2d6 (sync up `HookState` with OCI
spec `State`, 2016-12-19, #1201).
And drop HookState, since there's no need for a local alias for
specs.State.
Also set c.initProcess in newInitProcess to support OCIState calls
from within initProcess.start(). I think the cyclic references
between linuxContainer and initProcess are unfortunate, but didn't
want to address that here.
I've also left the timing of the Prestart hooks alone, although the
spec calls for them to happen before start (not as part of creation)
[1,2]. Once the timing gets fixed we can drop the
initProcessStartTime hacks which initProcess.start currently needs.
I'm not sure why we trigger the prestart hooks in response to both
procReady and procHooks. But we've had two prestart rounds in
initProcess.start since 2f276498 (Move pre-start hooks after container
mounts, 2016-02-17, #568). I've left that alone too.
I really think we should have len() guards to avoid computing the
state when .Hooks is non-nil but the particular phase we're looking at
is empty. Aleksa, however, is adamantly against them [3] citing a
risk of sloppy copy/pastes causing the hook slice being len-guarded to
diverge from the hook slice being iterated over within the guard. I
think that ort of thing is very lo-risk, because:
* We shouldn't be copy/pasting this, right? DRY for the win :).
* There's only ever a few lines between the guard and the guarded
loop. That makes broken copy/pastes easy to catch in review.
* We should have test coverage for these. Guarding with the wrong
slice is certainly not the only thing you can break with a sloppy
copy/paste.
But I'm not a maintainer ;).
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.0/config.md#prestart
[2]: https://github.com/opencontainers/runc/issues/1710
[3]: https://github.com/opencontainers/runc/pull/1741#discussion_r233331570
Signed-off-by: W. Trevor King <wking@tremily.us>
2018-02-26 06:47:41 +08:00
|
|
|
// initProcessStartTime hasn't been set yet.
|
|
|
|
s.Pid = p.cmd.Process.Pid
|
|
|
|
s.Status = "creating"
|
2016-04-19 02:37:26 +08:00
|
|
|
for i, hook := range p.config.Config.Hooks.Prestart {
|
2016-02-03 09:27:44 +08:00
|
|
|
if err := hook.Run(s); err != nil {
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
|
2016-02-03 09:27:44 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2015-12-17 17:16:34 +08:00
|
|
|
// Sync with child.
|
2019-04-23 22:02:31 +08:00
|
|
|
if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
|
2016-06-03 23:29:34 +08:00
|
|
|
return newSystemErrorWithCause(err, "writing syncT 'run'")
|
2015-12-17 17:16:34 +08:00
|
|
|
}
|
|
|
|
sentRun = true
|
2016-02-17 18:20:06 +08:00
|
|
|
case procHooks:
|
2016-12-20 19:21:10 +08:00
|
|
|
// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
|
|
|
|
if err := p.manager.Set(p.config.Config); err != nil {
|
|
|
|
return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
|
|
|
|
}
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
if p.intelRdtManager != nil {
|
|
|
|
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
|
|
|
|
return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
|
|
|
|
}
|
|
|
|
}
|
2016-02-17 18:20:06 +08:00
|
|
|
if p.config.Config.Hooks != nil {
|
libcontainer: Set 'status' in hook stdin
Finish off the work started in a344b2d6 (sync up `HookState` with OCI
spec `State`, 2016-12-19, #1201).
And drop HookState, since there's no need for a local alias for
specs.State.
Also set c.initProcess in newInitProcess to support OCIState calls
from within initProcess.start(). I think the cyclic references
between linuxContainer and initProcess are unfortunate, but didn't
want to address that here.
I've also left the timing of the Prestart hooks alone, although the
spec calls for them to happen before start (not as part of creation)
[1,2]. Once the timing gets fixed we can drop the
initProcessStartTime hacks which initProcess.start currently needs.
I'm not sure why we trigger the prestart hooks in response to both
procReady and procHooks. But we've had two prestart rounds in
initProcess.start since 2f276498 (Move pre-start hooks after container
mounts, 2016-02-17, #568). I've left that alone too.
I really think we should have len() guards to avoid computing the
state when .Hooks is non-nil but the particular phase we're looking at
is empty. Aleksa, however, is adamantly against them [3] citing a
risk of sloppy copy/pastes causing the hook slice being len-guarded to
diverge from the hook slice being iterated over within the guard. I
think that ort of thing is very lo-risk, because:
* We shouldn't be copy/pasting this, right? DRY for the win :).
* There's only ever a few lines between the guard and the guarded
loop. That makes broken copy/pastes easy to catch in review.
* We should have test coverage for these. Guarding with the wrong
slice is certainly not the only thing you can break with a sloppy
copy/paste.
But I'm not a maintainer ;).
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.0/config.md#prestart
[2]: https://github.com/opencontainers/runc/issues/1710
[3]: https://github.com/opencontainers/runc/pull/1741#discussion_r233331570
Signed-off-by: W. Trevor King <wking@tremily.us>
2018-02-26 06:47:41 +08:00
|
|
|
s, err := p.container.currentOCIState()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
2016-02-17 18:20:06 +08:00
|
|
|
}
|
libcontainer: Set 'status' in hook stdin
Finish off the work started in a344b2d6 (sync up `HookState` with OCI
spec `State`, 2016-12-19, #1201).
And drop HookState, since there's no need for a local alias for
specs.State.
Also set c.initProcess in newInitProcess to support OCIState calls
from within initProcess.start(). I think the cyclic references
between linuxContainer and initProcess are unfortunate, but didn't
want to address that here.
I've also left the timing of the Prestart hooks alone, although the
spec calls for them to happen before start (not as part of creation)
[1,2]. Once the timing gets fixed we can drop the
initProcessStartTime hacks which initProcess.start currently needs.
I'm not sure why we trigger the prestart hooks in response to both
procReady and procHooks. But we've had two prestart rounds in
initProcess.start since 2f276498 (Move pre-start hooks after container
mounts, 2016-02-17, #568). I've left that alone too.
I really think we should have len() guards to avoid computing the
state when .Hooks is non-nil but the particular phase we're looking at
is empty. Aleksa, however, is adamantly against them [3] citing a
risk of sloppy copy/pastes causing the hook slice being len-guarded to
diverge from the hook slice being iterated over within the guard. I
think that ort of thing is very lo-risk, because:
* We shouldn't be copy/pasting this, right? DRY for the win :).
* There's only ever a few lines between the guard and the guarded
loop. That makes broken copy/pastes easy to catch in review.
* We should have test coverage for these. Guarding with the wrong
slice is certainly not the only thing you can break with a sloppy
copy/paste.
But I'm not a maintainer ;).
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.0/config.md#prestart
[2]: https://github.com/opencontainers/runc/issues/1710
[3]: https://github.com/opencontainers/runc/pull/1741#discussion_r233331570
Signed-off-by: W. Trevor King <wking@tremily.us>
2018-02-26 06:47:41 +08:00
|
|
|
// initProcessStartTime hasn't been set yet.
|
|
|
|
s.Pid = p.cmd.Process.Pid
|
|
|
|
s.Status = "creating"
|
2016-04-19 02:37:26 +08:00
|
|
|
for i, hook := range p.config.Config.Hooks.Prestart {
|
2016-02-17 18:20:06 +08:00
|
|
|
if err := hook.Run(s); err != nil {
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
|
2016-02-17 18:20:06 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Sync with child.
|
2019-04-23 22:02:31 +08:00
|
|
|
if err := writeSync(p.messageSockPair.parent, procResume); err != nil {
|
2016-06-03 23:29:34 +08:00
|
|
|
return newSystemErrorWithCause(err, "writing syncT 'resume'")
|
2016-02-17 18:20:06 +08:00
|
|
|
}
|
|
|
|
sentResume = true
|
2015-12-17 17:16:34 +08:00
|
|
|
default:
|
2020-05-17 08:20:44 +08:00
|
|
|
return newSystemError(errors.New("invalid JSON payload from child"))
|
2015-12-17 17:16:34 +08:00
|
|
|
}
|
2016-06-06 18:26:35 +08:00
|
|
|
|
|
|
|
return nil
|
|
|
|
})
|
|
|
|
|
2015-12-17 17:16:34 +08:00
|
|
|
if !sentRun {
|
2016-09-14 15:55:46 +08:00
|
|
|
return newSystemErrorWithCause(ierr, "container init")
|
2015-12-17 17:16:34 +08:00
|
|
|
}
|
2016-02-17 18:20:06 +08:00
|
|
|
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
|
2020-05-17 08:20:44 +08:00
|
|
|
return newSystemError(errors.New("could not synchronise after executing prestart hooks with container process"))
|
2016-02-17 18:20:06 +08:00
|
|
|
}
|
2019-04-23 22:02:31 +08:00
|
|
|
if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
|
2016-04-19 02:37:26 +08:00
|
|
|
return newSystemErrorWithCause(err, "shutting down init pipe")
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
2016-06-06 18:26:35 +08:00
|
|
|
|
2015-12-17 17:16:34 +08:00
|
|
|
// Must be done after Shutdown so the child will exit and we can wait for it.
|
2015-02-07 13:12:27 +08:00
|
|
|
if ierr != nil {
|
2015-12-17 17:16:34 +08:00
|
|
|
p.wait()
|
2016-04-19 02:37:26 +08:00
|
|
|
return ierr
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (p *initProcess) wait() (*os.ProcessState, error) {
|
2015-02-23 17:26:43 +08:00
|
|
|
err := p.cmd.Wait()
|
2015-02-07 13:12:27 +08:00
|
|
|
if err != nil {
|
2015-02-26 03:45:53 +08:00
|
|
|
return p.cmd.ProcessState, err
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
// we should kill all processes in cgroup when init is died if we use host PID namespace
|
2015-09-14 08:40:43 +08:00
|
|
|
if p.sharePidns {
|
2017-05-10 05:38:27 +08:00
|
|
|
signalAllProcesses(p.manager, unix.SIGKILL)
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
2015-02-23 17:26:43 +08:00
|
|
|
return p.cmd.ProcessState, nil
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
func (p *initProcess) terminate() error {
|
|
|
|
if p.cmd.Process == nil {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
err := p.cmd.Process.Kill()
|
|
|
|
if _, werr := p.wait(); err == nil {
|
|
|
|
err = werr
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2017-06-15 06:38:45 +08:00
|
|
|
func (p *initProcess) startTime() (uint64, error) {
|
|
|
|
stat, err := system.Stat(p.pid())
|
|
|
|
return stat.StartTime, err
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
func (p *initProcess) sendConfig() error {
|
2016-01-28 09:58:30 +08:00
|
|
|
// send the config to the container's init process, we don't use JSON Encode
|
|
|
|
// here because there might be a problem in JSON decoder in some cases, see:
|
|
|
|
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
|
2019-04-23 22:02:31 +08:00
|
|
|
return utils.WriteJSON(p.messageSockPair.parent, p.config)
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
func (p *initProcess) createNetworkInterfaces() error {
|
|
|
|
for _, config := range p.config.Config.Networks {
|
2015-02-10 07:16:27 +08:00
|
|
|
strategy, err := getStrategy(config.Type)
|
2015-02-07 13:12:27 +08:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2015-02-11 03:51:45 +08:00
|
|
|
n := &network{
|
|
|
|
Network: *config,
|
|
|
|
}
|
|
|
|
if err := strategy.create(n, p.pid()); err != nil {
|
2015-02-07 13:12:27 +08:00
|
|
|
return err
|
|
|
|
}
|
2015-02-11 03:51:45 +08:00
|
|
|
p.config.Networks = append(p.config.Networks, n)
|
2015-02-07 13:12:27 +08:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-03-28 01:50:32 +08:00
|
|
|
func (p *initProcess) signal(sig os.Signal) error {
|
2020-04-19 07:05:10 +08:00
|
|
|
s, ok := sig.(unix.Signal)
|
2015-03-28 01:50:32 +08:00
|
|
|
if !ok {
|
|
|
|
return errors.New("os: unsupported signal type")
|
|
|
|
}
|
2017-05-10 05:38:27 +08:00
|
|
|
return unix.Kill(p.pid(), s)
|
2015-02-07 14:33:10 +08:00
|
|
|
}
|
2015-04-29 04:54:03 +08:00
|
|
|
|
2015-04-29 19:52:17 +08:00
|
|
|
func (p *initProcess) setExternalDescriptors(newFds []string) {
|
2015-04-29 04:54:03 +08:00
|
|
|
p.fds = newFds
|
|
|
|
}
|
|
|
|
|
2019-04-04 19:57:28 +08:00
|
|
|
func (p *initProcess) forwardChildLogs() {
|
2019-04-23 22:02:31 +08:00
|
|
|
go logs.ForwardLogs(p.logFilePair.parent)
|
2018-08-04 01:11:20 +08:00
|
|
|
}
|
|
|
|
|
2015-04-29 19:52:17 +08:00
|
|
|
// getPipeFds resolves the /proc/<pid>/fd symlinks for descriptors 0, 1 and 2
// and returns their targets in a three-element slice indexed by descriptor
// number. An entry is left empty when reading its link fails with a permission
// error. Any other error aborts the scan and is returned together with the
// entries collected so far.
func getPipeFds(pid int) ([]string, error) {
	const stdioCount = 3

	fdDir := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
	targets := make([]string, stdioCount)
	for fd := range targets {
		// XXX: This breaks if the path is not a valid symlink (which can
		// happen in certain particularly unlucky mount namespace setups).
		link, err := os.Readlink(filepath.Join(fdDir, strconv.Itoa(fd)))
		switch {
		case err == nil:
			targets[fd] = link
		case os.IsPermission(err):
			// Ignore permission errors, for rootless containers and other
			// non-dumpable processes. If we can't get the fd for a particular
			// file, there's not much we can do.
		default:
			return targets, err
		}
	}
	return targets, nil
}
|
2015-12-16 04:12:29 +08:00
|
|
|
|
2016-06-03 23:29:34 +08:00
|
|
|
// InitializeIO creates pipes for use with the process's stdio and returns the
|
|
|
|
// opposite side for each. Do not use this if you want to have a pseudoterminal
|
|
|
|
// set up for you by libcontainer (TODO: fix that too).
|
|
|
|
// TODO: This is mostly unnecessary, and should be handled by clients.
|
2016-04-23 21:39:38 +08:00
|
|
|
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
|
2015-12-16 04:12:29 +08:00
|
|
|
var fds []uintptr
|
|
|
|
i = &IO{}
|
|
|
|
// cleanup in case of an error
|
|
|
|
defer func() {
|
|
|
|
if err != nil {
|
|
|
|
for _, fd := range fds {
|
2017-05-10 05:38:27 +08:00
|
|
|
unix.Close(int(fd))
|
2015-12-16 04:12:29 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
// STDIN
|
|
|
|
r, w, err := os.Pipe()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
fds = append(fds, r.Fd(), w.Fd())
|
|
|
|
p.Stdin, i.Stdin = r, w
|
|
|
|
// STDOUT
|
|
|
|
if r, w, err = os.Pipe(); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
fds = append(fds, r.Fd(), w.Fd())
|
|
|
|
p.Stdout, i.Stdout = w, r
|
|
|
|
// STDERR
|
|
|
|
if r, w, err = os.Pipe(); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
fds = append(fds, r.Fd(), w.Fd())
|
|
|
|
p.Stderr, i.Stderr = w, r
|
2018-09-07 11:37:40 +08:00
|
|
|
// change ownership of the pipes in case we are in a user namespace
|
2015-12-16 04:12:29 +08:00
|
|
|
for _, fd := range fds {
|
2017-05-10 05:38:27 +08:00
|
|
|
if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
|
2015-12-16 04:12:29 +08:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return i, nil
|
|
|
|
}
|