2015-02-07 04:48:57 +08:00
|
|
|
// +build linux
|
|
|
|
|
|
|
|
package libcontainer
|
|
|
|
|
|
|
|
import (
|
2016-02-23 04:36:12 +08:00
|
|
|
"fmt"
|
2015-03-06 06:33:13 +08:00
|
|
|
"os"
|
2016-05-14 07:54:16 +08:00
|
|
|
"os/exec"
|
2015-02-07 04:48:57 +08:00
|
|
|
"syscall"
|
|
|
|
|
2015-06-22 10:29:59 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/apparmor"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
2016-01-21 07:12:25 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/keys"
|
2015-06-22 10:29:59 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/label"
|
2015-06-30 02:12:54 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/seccomp"
|
2015-06-22 10:29:59 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/system"
|
2015-02-07 04:48:57 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
type linuxStandardInit struct {
|
2016-06-03 23:29:34 +08:00
|
|
|
pipe *os.File
|
2016-06-07 04:15:18 +08:00
|
|
|
parentPid int
|
|
|
|
stateDirFD int
|
|
|
|
config *initConfig
|
2015-02-07 04:48:57 +08:00
|
|
|
}
|
|
|
|
|
2016-02-23 04:36:12 +08:00
|
|
|
func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
|
|
|
|
var newperms uint32
|
|
|
|
|
|
|
|
if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
|
|
|
|
// with user ns we need 'other' search permissions
|
|
|
|
newperms = 0x8
|
|
|
|
} else {
|
|
|
|
// without user ns we need 'UID' search permissions
|
|
|
|
newperms = 0x80000
|
|
|
|
}
|
|
|
|
|
|
|
|
// create a unique per session container name that we can
|
|
|
|
// join in setns; however, other containers can also join it
|
|
|
|
return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
|
|
|
|
}
|
|
|
|
|
2016-02-16 19:55:26 +08:00
|
|
|
// PR_SET_NO_NEW_PRIVS is the prctl option that Init passes to system.Prctl
// when NoNewPrivileges is requested. It isn't exposed in Golang, so we define
// it ourselves, copying the value from the kernel.
const PR_SET_NO_NEW_PRIVS = 0x26
|
|
|
|
|
2016-06-07 04:15:18 +08:00
|
|
|
func (l *linuxStandardInit) Init() error {
|
2016-06-04 02:53:07 +08:00
|
|
|
if !l.config.Config.NoNewKeyring {
|
|
|
|
ringname, keepperms, newperms := l.getSessionRingParams()
|
2016-02-23 04:36:12 +08:00
|
|
|
|
2016-06-04 02:53:07 +08:00
|
|
|
// do not inherit the parent's session keyring
|
2016-07-25 06:41:57 +08:00
|
|
|
sessKeyId, err := keys.JoinSessionKeyring(ringname)
|
2016-06-04 02:53:07 +08:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
// make session keyring searcheable
|
2016-07-25 06:41:57 +08:00
|
|
|
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
|
2016-06-04 02:53:07 +08:00
|
|
|
return err
|
|
|
|
}
|
2016-01-21 07:12:25 +08:00
|
|
|
}
|
|
|
|
|
2015-02-11 03:51:45 +08:00
|
|
|
if err := setupNetwork(l.config); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := setupRoute(l.config.Config); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-03-02 02:19:07 +08:00
|
|
|
|
2015-02-07 04:48:57 +08:00
|
|
|
label.Init()
|
2016-06-03 23:29:34 +08:00
|
|
|
|
|
|
|
// prepareRootfs() can be executed only for a new mount namespace.
|
2015-02-07 04:48:57 +08:00
|
|
|
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
|
2016-06-03 23:29:34 +08:00
|
|
|
if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2016-06-03 23:29:34 +08:00
|
|
|
|
|
|
|
// Set up the console. This has to be done *before* we finalize the rootfs,
|
|
|
|
// but *after* we've given the user the chance to set up all of the mounts
|
|
|
|
// they wanted.
|
|
|
|
if l.config.CreateConsole {
|
|
|
|
if err := setupConsole(l.pipe, l.config, true); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := system.Setctty(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Finish the rootfs setup.
|
|
|
|
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
|
|
|
|
if err := finalizeRootfs(l.config.Config); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-07 04:48:57 +08:00
|
|
|
if hostname := l.config.Config.Hostname; hostname != "" {
|
|
|
|
if err := syscall.Sethostname([]byte(hostname)); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2016-03-04 02:44:33 +08:00
|
|
|
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
2016-03-04 02:44:33 +08:00
|
|
|
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
2015-04-23 10:17:30 +08:00
|
|
|
|
2015-07-07 07:18:08 +08:00
|
|
|
for key, value := range l.config.Config.Sysctl {
|
2015-04-23 10:17:30 +08:00
|
|
|
if err := writeSystemProperty(key, value); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2015-02-13 08:23:05 +08:00
|
|
|
for _, path := range l.config.Config.ReadonlyPaths {
|
Split the code for remounting mount points and mounting paths.
A remount of a mount point must include all the current flags or
these will be cleared:
```
The mountflags and data arguments should match the values used in the
original mount() call, except for those parameters that are being
deliberately changed.
```
The current code does not do this; the bug manifests in the specified
flags for `/dev` being lost on remount read only at present. As we
need to specify flags, split the code path for this from remounting
paths which are not mount points, as these can only inherit the
existing flags of the path, and these cannot be changed.
In the bind case, remove extra flags from the bind remount. A bind
mount can only be remounted read only, no other flags can be set,
all other flags are inherited from the parent. From the man page:
```
Since Linux 2.6.26, this flag can also be used to make an existing
bind mount read-only by specifying mountflags as:
MS_REMOUNT | MS_BIND | MS_RDONLY
Note that only the MS_RDONLY setting of the bind mount can be changed
in this manner.
```
MS_REC can only be set on the original bind, so move this. See note
in man page on bind mounts:
```
The remaining bits in the mountflags argument are also ignored, with
the exception of MS_REC.
```
Signed-off-by: Justin Cormack <justin.cormack@docker.com>
2016-12-11 10:25:02 +08:00
|
|
|
if err := readonlyPath(path); err != nil {
|
2015-02-13 08:23:05 +08:00
|
|
|
return err
|
2015-02-10 06:42:21 +08:00
|
|
|
}
|
2015-02-13 08:23:05 +08:00
|
|
|
}
|
|
|
|
for _, path := range l.config.Config.MaskPaths {
|
2016-09-23 15:02:10 +08:00
|
|
|
if err := maskPath(path); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
pdeath, err := system.GetParentDeathSignal()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-03-04 02:44:33 +08:00
|
|
|
if l.config.NoNewPrivileges {
|
2016-02-16 19:55:26 +08:00
|
|
|
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2015-12-17 17:16:34 +08:00
|
|
|
// Tell our parent that we're ready to Execv. This must be done before the
|
|
|
|
// Seccomp rules have been applied, because we need to be able to read and
|
|
|
|
// write to a socket.
|
|
|
|
if err := syncParentReady(l.pipe); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-04-27 23:15:58 +08:00
|
|
|
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
|
|
|
|
// do this before dropping capabilities; otherwise do it as late as possible
|
|
|
|
// just before execve so as few syscalls take place after it as possible.
|
|
|
|
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
|
2015-06-30 02:12:54 +08:00
|
|
|
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2015-02-10 05:11:57 +08:00
|
|
|
if err := finalizeNamespace(l.config); err != nil {
|
2015-02-07 04:48:57 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
// finalizeNamespace can change user/group which clears the parent death
|
|
|
|
// signal, so we restore it here.
|
|
|
|
if err := pdeath.Restore(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-10-12 07:22:48 +08:00
|
|
|
// compare the parent from the initial start of the init process and make sure that it did not change.
|
2016-07-13 23:32:38 +08:00
|
|
|
// if the parent changes that means it died and we were reparented to something else so we should
|
2015-04-03 04:55:55 +08:00
|
|
|
// just kill ourself and not cause problems for someone else.
|
|
|
|
if syscall.Getppid() != l.parentPid {
|
2015-02-07 04:48:57 +08:00
|
|
|
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
|
|
|
|
}
|
2016-05-14 07:54:16 +08:00
|
|
|
// check for the arg before waiting to make sure it exists and it is returned
|
2016-06-07 04:15:18 +08:00
|
|
|
// as a create time error.
|
2016-05-14 07:54:16 +08:00
|
|
|
name, err := exec.LookPath(l.config.Args[0])
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-06-07 04:15:18 +08:00
|
|
|
// close the pipe to signal that we have completed our init.
|
2016-05-14 07:54:16 +08:00
|
|
|
l.pipe.Close()
|
2016-06-07 04:15:18 +08:00
|
|
|
// wait for the fifo to be opened on the other side before
|
|
|
|
// exec'ing the users process.
|
|
|
|
fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
|
|
|
|
if err != nil {
|
2016-06-14 08:21:28 +08:00
|
|
|
return newSystemErrorWithCause(err, "openat exec fifo")
|
2016-06-07 04:15:18 +08:00
|
|
|
}
|
|
|
|
if _, err := syscall.Write(fd, []byte("0")); err != nil {
|
2016-06-14 08:21:28 +08:00
|
|
|
return newSystemErrorWithCause(err, "write 0 exec fifo")
|
2016-06-07 04:15:18 +08:00
|
|
|
}
|
|
|
|
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
|
|
|
|
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
2016-06-14 08:21:28 +08:00
|
|
|
return newSystemErrorWithCause(err, "init seccomp")
|
2016-06-07 04:15:18 +08:00
|
|
|
}
|
|
|
|
}
|
2016-06-14 08:21:28 +08:00
|
|
|
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
|
|
|
|
return newSystemErrorWithCause(err, "exec user process")
|
|
|
|
}
|
|
|
|
return nil
|
2015-02-07 04:48:57 +08:00
|
|
|
}
|