2015-02-07 04:48:57 +08:00
|
|
|
// +build linux
|
|
|
|
|
|
|
|
package libcontainer
|
|
|
|
|
|
|
|
import (
|
2016-02-23 04:36:12 +08:00
|
|
|
"fmt"
|
2015-03-06 06:33:13 +08:00
|
|
|
"os"
|
2016-05-14 07:54:16 +08:00
|
|
|
"os/exec"
|
2017-05-10 05:38:27 +08:00
|
|
|
"syscall" //only for Exec
|
2015-02-07 04:48:57 +08:00
|
|
|
|
2015-06-22 10:29:59 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/apparmor"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
2016-01-21 07:12:25 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/keys"
|
2015-06-30 02:12:54 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/seccomp"
|
2015-06-22 10:29:59 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/system"
|
2017-03-23 08:21:19 +08:00
|
|
|
"github.com/opencontainers/selinux/go-selinux/label"
|
2017-05-10 05:38:27 +08:00
|
|
|
|
|
|
|
"golang.org/x/sys/unix"
|
2015-02-07 04:48:57 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
type linuxStandardInit struct {
|
2017-03-03 04:53:06 +08:00
|
|
|
pipe *os.File
|
|
|
|
consoleSocket *os.File
|
|
|
|
parentPid int
|
2017-08-24 15:37:26 +08:00
|
|
|
fifoFd int
|
2017-03-03 04:53:06 +08:00
|
|
|
config *initConfig
|
2015-02-07 04:48:57 +08:00
|
|
|
}
|
|
|
|
|
2016-02-23 04:36:12 +08:00
|
|
|
func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
|
|
|
|
var newperms uint32
|
|
|
|
|
|
|
|
if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
|
2017-08-24 14:59:01 +08:00
|
|
|
// With user ns we need 'other' search permissions.
|
2016-02-23 04:36:12 +08:00
|
|
|
newperms = 0x8
|
|
|
|
} else {
|
2017-08-24 14:59:01 +08:00
|
|
|
// Without user ns we need 'UID' search permissions.
|
2016-02-23 04:36:12 +08:00
|
|
|
newperms = 0x80000
|
|
|
|
}
|
|
|
|
|
2017-08-24 14:59:01 +08:00
|
|
|
// Create a unique per session container name that we can join in setns;
|
|
|
|
// However, other containers can also join it.
|
2016-02-23 04:36:12 +08:00
|
|
|
return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
|
|
|
|
}
|
|
|
|
|
2016-06-07 04:15:18 +08:00
|
|
|
func (l *linuxStandardInit) Init() error {
|
2016-06-04 02:53:07 +08:00
|
|
|
if !l.config.Config.NoNewKeyring {
|
|
|
|
ringname, keepperms, newperms := l.getSessionRingParams()
|
2016-02-23 04:36:12 +08:00
|
|
|
|
2017-08-24 14:59:01 +08:00
|
|
|
// Do not inherit the parent's session keyring.
|
2016-07-25 06:41:57 +08:00
|
|
|
sessKeyId, err := keys.JoinSessionKeyring(ringname)
|
2016-06-04 02:53:07 +08:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-08-24 14:59:01 +08:00
|
|
|
// Make session keyring searcheable.
|
2016-07-25 06:41:57 +08:00
|
|
|
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
|
2016-06-04 02:53:07 +08:00
|
|
|
return err
|
|
|
|
}
|
2016-01-21 07:12:25 +08:00
|
|
|
}
|
|
|
|
|
2015-02-11 03:51:45 +08:00
|
|
|
if err := setupNetwork(l.config); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := setupRoute(l.config.Config); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-03-02 02:19:07 +08:00
|
|
|
|
2015-02-07 04:48:57 +08:00
|
|
|
label.Init()
|
2016-06-03 23:29:34 +08:00
|
|
|
|
|
|
|
// prepareRootfs() can be executed only for a new mount namespace.
|
2015-02-07 04:48:57 +08:00
|
|
|
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
|
2016-06-03 23:29:34 +08:00
|
|
|
if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2016-06-03 23:29:34 +08:00
|
|
|
|
|
|
|
// Set up the console. This has to be done *before* we finalize the rootfs,
|
|
|
|
// but *after* we've given the user the chance to set up all of the mounts
|
|
|
|
// they wanted.
|
|
|
|
if l.config.CreateConsole {
|
2017-03-03 04:53:06 +08:00
|
|
|
if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
|
2016-06-03 23:29:34 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := system.Setctty(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Finish the rootfs setup.
|
|
|
|
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
|
|
|
|
if err := finalizeRootfs(l.config.Config); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-07 04:48:57 +08:00
|
|
|
if hostname := l.config.Config.Hostname; hostname != "" {
|
2017-05-10 05:38:27 +08:00
|
|
|
if err := unix.Sethostname([]byte(hostname)); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2016-03-04 02:44:33 +08:00
|
|
|
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
2016-03-04 02:44:33 +08:00
|
|
|
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
2015-04-23 10:17:30 +08:00
|
|
|
|
2015-07-07 07:18:08 +08:00
|
|
|
for key, value := range l.config.Config.Sysctl {
|
2015-04-23 10:17:30 +08:00
|
|
|
if err := writeSystemProperty(key, value); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2015-02-13 08:23:05 +08:00
|
|
|
for _, path := range l.config.Config.ReadonlyPaths {
|
Split the code for remounting mount points and mounting paths.
A remount of a mount point must include all the current flags or
these will be cleared:
```
The mountflags and data arguments should match the values used in the
original mount() call, except for those parameters that are being
deliberately changed.
```
The current code does not do this; the bug manifests in the specified
flags for `/dev` being lost on remount read only at present. As we
need to specify flags, split the code path for this from remounting
paths which are not mount points, as these can only inherit the
existing flags of the path, and these cannot be changed.
In the bind case, remove extra flags from the bind remount. A bind
mount can only be remounted read only, no other flags can be set,
all other flags are inherited from the parent. From the man page:
```
Since Linux 2.6.26, this flag can also be used to make an existing
bind mount read-only by specifying mountflags as:
MS_REMOUNT | MS_BIND | MS_RDONLY
Note that only the MS_RDONLY setting of the bind mount can be changed
in this manner.
```
MS_REC can only be set on the original bind, so move this. See note
in man page on bind mounts:
```
The remaining bits in the mountflags argument are also ignored, with
the exception of MS_REC.
```
Signed-off-by: Justin Cormack <justin.cormack@docker.com>
2016-12-11 10:25:02 +08:00
|
|
|
if err := readonlyPath(path); err != nil {
|
2015-02-13 08:23:05 +08:00
|
|
|
return err
|
2015-02-10 06:42:21 +08:00
|
|
|
}
|
2015-02-13 08:23:05 +08:00
|
|
|
}
|
|
|
|
for _, path := range l.config.Config.MaskPaths {
|
2016-09-23 15:02:10 +08:00
|
|
|
if err := maskPath(path); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
pdeath, err := system.GetParentDeathSignal()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-03-04 02:44:33 +08:00
|
|
|
if l.config.NoNewPrivileges {
|
2017-07-13 21:29:10 +08:00
|
|
|
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
|
2016-02-16 19:55:26 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2015-12-17 17:16:34 +08:00
|
|
|
// Tell our parent that we're ready to Execv. This must be done before the
|
|
|
|
// Seccomp rules have been applied, because we need to be able to read and
|
|
|
|
// write to a socket.
|
|
|
|
if err := syncParentReady(l.pipe); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-04-27 23:15:58 +08:00
|
|
|
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
|
|
|
|
// do this before dropping capabilities; otherwise do it as late as possible
|
|
|
|
// just before execve so as few syscalls take place after it as possible.
|
|
|
|
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
|
2015-06-30 02:12:54 +08:00
|
|
|
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2015-02-10 05:11:57 +08:00
|
|
|
if err := finalizeNamespace(l.config); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
// finalizeNamespace can change user/group which clears the parent death
|
|
|
|
// signal, so we restore it here.
|
|
|
|
if err := pdeath.Restore(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-08-24 14:59:01 +08:00
|
|
|
// Compare the parent from the initial start of the init process and make
|
|
|
|
// sure that it did not change. if the parent changes that means it died
|
|
|
|
// and we were reparented to something else so we should just kill ourself
|
|
|
|
// and not cause problems for someone else.
|
2017-05-10 05:38:27 +08:00
|
|
|
if unix.Getppid() != l.parentPid {
|
|
|
|
return unix.Kill(unix.Getpid(), unix.SIGKILL)
|
2015-02-07 04:48:57 +08:00
|
|
|
}
|
2017-08-24 14:59:01 +08:00
|
|
|
// Check for the arg before waiting to make sure it exists and it is
|
|
|
|
// returned as a create time error.
|
2016-05-14 07:54:16 +08:00
|
|
|
name, err := exec.LookPath(l.config.Args[0])
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-08-24 14:59:01 +08:00
|
|
|
// Close the pipe to signal that we have completed our init.
|
2016-05-14 07:54:16 +08:00
|
|
|
l.pipe.Close()
|
2017-08-24 15:37:26 +08:00
|
|
|
// Wait for the FIFO to be opened on the other side before exec-ing the
|
|
|
|
// user process. We open it through /proc/self/fd/$fd, because the fd that
|
|
|
|
// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
|
|
|
|
// re-open an O_PATH fd through /proc.
|
|
|
|
fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
|
2016-06-07 04:15:18 +08:00
|
|
|
if err != nil {
|
2017-08-24 14:59:01 +08:00
|
|
|
return newSystemErrorWithCause(err, "open exec fifo")
|
2016-06-07 04:15:18 +08:00
|
|
|
}
|
2017-05-10 05:38:27 +08:00
|
|
|
if _, err := unix.Write(fd, []byte("0")); err != nil {
|
2016-06-14 08:21:28 +08:00
|
|
|
return newSystemErrorWithCause(err, "write 0 exec fifo")
|
2016-06-07 04:15:18 +08:00
|
|
|
}
|
2017-08-24 14:59:01 +08:00
|
|
|
// Close the O_PATH fifofd fd before exec because the kernel resets
|
|
|
|
// dumpable in the wrong order. This has been fixed in newer kernels, but
|
|
|
|
// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
|
|
|
|
// N.B. the core issue itself (passing dirfds to the host filesystem) has
|
|
|
|
// since been resolved.
|
|
|
|
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
|
|
|
|
unix.Close(l.fifoFd)
|
|
|
|
// Set seccomp as close to execve as possible, so as few syscalls take
|
|
|
|
// place afterward (reducing the amount of syscalls that users need to
|
|
|
|
// enable in their seccomp profiles).
|
2016-06-07 04:15:18 +08:00
|
|
|
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
|
|
|
|
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
2016-06-14 08:21:28 +08:00
|
|
|
return newSystemErrorWithCause(err, "init seccomp")
|
2016-06-07 04:15:18 +08:00
|
|
|
}
|
|
|
|
}
|
2016-06-14 08:21:28 +08:00
|
|
|
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
|
|
|
|
return newSystemErrorWithCause(err, "exec user process")
|
|
|
|
}
|
|
|
|
return nil
|
2015-02-07 04:48:57 +08:00
|
|
|
}
|