runc/libcontainer/setns_init_linux.go

// +build linux

package libcontainer

import (
	"fmt"
	"os"
	"runtime"

	"github.com/opencontainers/runc/libcontainer/apparmor"
	"github.com/opencontainers/runc/libcontainer/keys"
	"github.com/opencontainers/runc/libcontainer/seccomp"
	"github.com/opencontainers/runc/libcontainer/system"
	"github.com/opencontainers/selinux/go-selinux/label"
	"github.com/pkg/errors"

	"golang.org/x/sys/unix"
)

// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
	// pipe is not referenced by any method visible in this file.
	// NOTE(review): presumably the pipe shared with the parent runc
	// process; confirm against the code that constructs this struct.
	pipe          *os.File
	// consoleSocket is handed to setupConsole by Init when
	// config.CreateConsole is set.
	consoleSocket *os.File
	// config holds the settings Init reads for the new process: container
	// ID, keyring, console, no-new-privileges, SELinux label, seccomp,
	// AppArmor profile and the argument vector to execute.
	config        *initConfig
}

// getSessionRingName builds the name of the session keyring for this
// container: the fixed "_ses." prefix followed by the configured container ID.
func (l *linuxSetnsInit) getSessionRingName() string {
	containerID := l.config.ContainerId
	return fmt.Sprintf("_ses.%s", containerID)
}

// Init prepares the calling process to run as an additional process inside an
// existing container and then executes l.config.Args.
//
// The steps run in a fixed order, which matters for security:
//  1. join a new session keyring (unless NoNewKeyring is set);
//  2. set up the console and controlling terminal (if CreateConsole is set);
//  3. set no_new_privs (if NoNewPrivileges is set);
//  4. set the SELinux process label;
//  5. install the seccomp filter early when NoNewPrivileges is NOT set;
//  6. finalize the namespace;
//  7. apply the AppArmor profile;
//  8. install the seccomp filter late when NoNewPrivileges IS set;
//  9. exec the configured command with the current environment.
//
// It returns an error if Args is empty or if any step fails; an ENOSYS
// failure while joining the session keyring is tolerated.
func (l *linuxSetnsInit) Init() error {
	// Fail with a clear error instead of an index-out-of-range panic at the
	// Execv call below, and do so before any process state is modified.
	if len(l.config.Args) == 0 {
		return errors.New("no command to execute: process args are empty")
	}

	// SELinux labels are tied to the thread, so stay on one OS thread for
	// the whole initialization.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	if !l.config.Config.NoNewKeyring {
		// Do not inherit the parent's session keyring.
		if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
			// Same justification as in standard_init_linux.go as to why we
			// don't bail on ENOSYS.
			//
			// TODO(cyphar): And we should have logging here too.
			if errors.Cause(err) != unix.ENOSYS {
				return errors.Wrap(err, "join session keyring")
			}
		}
	}
	if l.config.CreateConsole {
		if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
			return err
		}
		if err := system.Setctty(); err != nil {
			return err
		}
	}
	if l.config.NoNewPrivileges {
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return err
		}
	}
	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
		return err
	}
	// Reset the process label on the way out; the result is ignored (best
	// effort). NOTE(review): presumably this only runs on failure paths,
	// since a successful exec should not return -- confirm in system.Execv.
	defer label.SetProcessLabel("")
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return err
		}
	}
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return err
	}
	// Set seccomp as close to execve as possible, so as few syscalls take
	// place afterward (reducing the amount of syscalls that users need to
	// enable in their seccomp profiles).
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return newSystemErrorWithCause(err, "init seccomp")
		}
	}
	// Args[0] is the command to run; the full slice becomes its argv.
	return system.Execv(l.config.Args[0], l.config.Args, os.Environ())
}
Refactor init actions into separate types Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2015-02-07 04:48:57 +08:00			`// +build linux`

			`package libcontainer`

			`import (`
Create unique session key name for every container Create a unique session key name for every container. Use the pattern _ses.<postfix> with postfix being the container's Id. This patch does not prevent containers from joining each other's session keyring. Signed-off-by: Stefan Berger <stefanb@linux.vnet.ibm.com> 2016-02-23 04:36:12 +08:00			`"fmt"`
Pass os.Environ() as environment to process from init. Replacement of #418 Signed-off-by: Alexander Morozov <lk4d4@docker.com> 2015-03-06 06:33:13 +08:00			`"os"`
SELinux labels are tied to the thread We need to lock the threads for the SetProcessLabel to work, should also call SetProcessLabel("") after the container starts to go back to the default SELinux behaviour. Once you call SetProcessLabel, then any process executed by runc will run with this label, even if the process is for setup rather then the container. It is always safest to call the SELinux calls just before the exec of the container, so that other processes do not get started with the incorrect label. Signed-off-by: Daniel J Walsh <dwalsh@redhat.com> 2018-06-08 01:52:01 +08:00			`"runtime"`
Pass os.Environ() as environment to process from init. Replacement of #418 Signed-off-by: Alexander Morozov <lk4d4@docker.com> 2015-03-06 06:33:13 +08:00
Update import paths for new repository Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2015-06-22 10:29:59 +08:00			`"github.com/opencontainers/runc/libcontainer/apparmor"`
Create a new session key for every container Create a new session key ring '_ses' for every container. This avoids sharing the key structure with the process that created the container and the container inherits from. This patch fixes it init and exec. Signed-off-by: Stefan Berger <stefanb@linux.vnet.ibm.com> 2016-01-21 07:12:25 +08:00			`"github.com/opencontainers/runc/libcontainer/keys"`
Convert Seccomp support to use Libseccomp This removes the existing, native Go seccomp filter generation and replaces it with Libseccomp. Libseccomp is a C library which provides architecture independent generation of Seccomp filters for the Linux kernel. This adds a dependency on v2.2.1 or above of Libseccomp. Signed-off-by: Matthew Heon <mheon@redhat.com> 2015-06-30 02:12:54 +08:00			`"github.com/opencontainers/runc/libcontainer/seccomp"`
Update import paths for new repository Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2015-06-22 10:29:59 +08:00			`"github.com/opencontainers/runc/libcontainer/system"`
Use opencontainers/selinux package It has been split out into a separate project. Signed-off-by: Qiang Huang <h.huangqiang@huawei.com> 2017-03-23 08:21:19 +08:00			`"github.com/opencontainers/selinux/go-selinux/label"`
keyring: handle ENOSYS with keyctl(KEYCTL_JOIN_SESSION_KEYRING) While all modern kernels (and I do mean _all_ of them -- this syscall was added in 2.6.10 before git had begun development!) have support for this syscall, LXC has a default seccomp profile that returns ENOSYS for this syscall. For most syscalls this would be a deal-breaker, and our use of session keyrings is security-based there are a few mitigating factors that make this change not-completely-insane: * We already have a flag that disables the use of session keyrings (for older kernels that had system-wide keyring limits and so on). So disabling it is not a new idea. * While the primary justification of using session keys is security-based, it's more of a security-by-obscurity protection. The main defense keyrings have is VFS credentials -- which is something that users already have better security tools for (setuid(2) and user namespaces). * Given the security justification you might argue that we shouldn't silently ignore this. However, the only way for the kernel to return -ENOSYS is either being ridiculously old (at which point we wouldn't work anyway) or that there is a seccomp profile in place blocking it. Given that the seccomp profile (if malicious) could very easily just return 0 or a silly return code (or something even more clever with seccomp-bpf) and trick us without this patch, there isn't much of a significant change in how much seccomp can trick us with or without this patch. Given all of that over-analysis, I'm pretty convinced there isn't a security problem in this very specific case and it will help out the ChromeOS folks by allowing Docker to run inside their LXC container setup. I'd be happy to be proven wrong. Ref: https://bugs.chromium.org/p/chromium/issues/detail?id=860565 Signed-off-by: Aleksa Sarai <asarai@suse.de> 2018-09-17 19:38:30 +08:00			`"github.com/pkg/errors"`
Use Prctl() from x/sys/unix instead of own wrapper Use unix.Prctl() instead of reimplementing it as system.Prctl(). Signed-off-by: Tobias Klauser <tklauser@distanz.ch> 2017-06-07 21:03:15 +08:00
			`"golang.org/x/sys/unix"`
Refactor init actions into separate types Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2015-02-07 04:48:57 +08:00			`)`

			`// linuxSetnsInit performs the container's initialization for running a new process`
			`// inside an existing container.`
			`type linuxSetnsInit struct {`
Add separate console socket Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2017-03-03 04:53:06 +08:00			`pipe *os.File`
			`consoleSocket *os.File`
			`config *initConfig`
Refactor init actions into separate types Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2015-02-07 04:48:57 +08:00			`}`

Create unique session key name for every container Create a unique session key name for every container. Use the pattern _ses.<postfix> with postfix being the container's Id. This patch does not prevent containers from joining each other's session keyring. Signed-off-by: Stefan Berger <stefanb@linux.vnet.ibm.com> 2016-02-23 04:36:12 +08:00			`func (l *linuxSetnsInit) getSessionRingName() string {`
			`return fmt.Sprintf("_ses.%s", l.config.ContainerId)`
			`}`

Use fifo for create/start This removes the use of a signal handler and SIGCONT to signal the init process to exec the users process. Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2016-06-07 04:15:18 +08:00			`func (l *linuxSetnsInit) Init() error {`
SELinux labels are tied to the thread We need to lock the threads for the SetProcessLabel to work, should also call SetProcessLabel("") after the container starts to go back to the default SELinux behaviour. Once you call SetProcessLabel, then any process executed by runc will run with this label, even if the process is for setup rather then the container. It is always safest to call the SELinux calls just before the exec of the container, so that other processes do not get started with the incorrect label. Signed-off-by: Daniel J Walsh <dwalsh@redhat.com> 2018-06-08 01:52:01 +08:00			`runtime.LockOSThread()`
			`defer runtime.UnlockOSThread()`

Add option to disable new session keys This adds an `--no-new-keyring` flag to run and create so that a new session keyring is not created for the container and the calling processes keyring is inherited. Fixes #818 Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2016-06-04 02:53:07 +08:00			`if !l.config.Config.NoNewKeyring {`
keyring: handle ENOSYS with keyctl(KEYCTL_JOIN_SESSION_KEYRING) While all modern kernels (and I do mean _all_ of them -- this syscall was added in 2.6.10 before git had begun development!) have support for this syscall, LXC has a default seccomp profile that returns ENOSYS for this syscall. For most syscalls this would be a deal-breaker, and our use of session keyrings is security-based there are a few mitigating factors that make this change not-completely-insane: * We already have a flag that disables the use of session keyrings (for older kernels that had system-wide keyring limits and so on). So disabling it is not a new idea. * While the primary justification of using session keys is security-based, it's more of a security-by-obscurity protection. The main defense keyrings have is VFS credentials -- which is something that users already have better security tools for (setuid(2) and user namespaces). * Given the security justification you might argue that we shouldn't silently ignore this. However, the only way for the kernel to return -ENOSYS is either being ridiculously old (at which point we wouldn't work anyway) or that there is a seccomp profile in place blocking it. Given that the seccomp profile (if malicious) could very easily just return 0 or a silly return code (or something even more clever with seccomp-bpf) and trick us without this patch, there isn't much of a significant change in how much seccomp can trick us with or without this patch. Given all of that over-analysis, I'm pretty convinced there isn't a security problem in this very specific case and it will help out the ChromeOS folks by allowing Docker to run inside their LXC container setup. I'd be happy to be proven wrong. Ref: https://bugs.chromium.org/p/chromium/issues/detail?id=860565 Signed-off-by: Aleksa Sarai <asarai@suse.de> 2018-09-17 19:38:30 +08:00			`// Do not inherit the parent's session keyring.`
libcontainer: rename keyctl package to keys This prevents the goimports tool from removing the libcontainer/keys import line because the package name is different from the folder name. Signed-off-by: Guilherme Rezende <guilhermebr@gmail.com> 2016-07-25 06:41:57 +08:00			`if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {`
keyring: handle ENOSYS with keyctl(KEYCTL_JOIN_SESSION_KEYRING) While all modern kernels (and I do mean _all_ of them -- this syscall was added in 2.6.10 before git had begun development!) have support for this syscall, LXC has a default seccomp profile that returns ENOSYS for this syscall. For most syscalls this would be a deal-breaker, and our use of session keyrings is security-based there are a few mitigating factors that make this change not-completely-insane: * We already have a flag that disables the use of session keyrings (for older kernels that had system-wide keyring limits and so on). So disabling it is not a new idea. * While the primary justification of using session keys is security-based, it's more of a security-by-obscurity protection. The main defense keyrings have is VFS credentials -- which is something that users already have better security tools for (setuid(2) and user namespaces). * Given the security justification you might argue that we shouldn't silently ignore this. However, the only way for the kernel to return -ENOSYS is either being ridiculously old (at which point we wouldn't work anyway) or that there is a seccomp profile in place blocking it. Given that the seccomp profile (if malicious) could very easily just return 0 or a silly return code (or something even more clever with seccomp-bpf) and trick us without this patch, there isn't much of a significant change in how much seccomp can trick us with or without this patch. Given all of that over-analysis, I'm pretty convinced there isn't a security problem in this very specific case and it will help out the ChromeOS folks by allowing Docker to run inside their LXC container setup. I'd be happy to be proven wrong. Ref: https://bugs.chromium.org/p/chromium/issues/detail?id=860565 Signed-off-by: Aleksa Sarai <asarai@suse.de> 2018-09-17 19:38:30 +08:00			`// Same justification as in standart_init_linux.go as to why we`
			`// don't bail on ENOSYS.`
			`//`
			`// TODO(cyphar): And we should have logging here too.`
			`if errors.Cause(err) != unix.ENOSYS {`
			`return errors.Wrap(err, "join session keyring")`
			`}`
Add option to disable new session keys This adds an `--no-new-keyring` flag to run and create so that a new session keyring is not created for the container and the calling processes keyring is inherited. Fixes #818 Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2016-06-04 02:53:07 +08:00			`}`
Create a new session key for every container Create a new session key ring '_ses' for every container. This avoids sharing the key structure with the process that created the container and the container inherits from. This patch fixes it init and exec. Signed-off-by: Stefan Berger <stefanb@linux.vnet.ibm.com> 2016-01-21 07:12:25 +08:00			`}`
*: console rewrite This implements {createTTY, detach} and all of the combinations and negations of the two that were previously implemented. There are some valid questions about out-of-OCI-scope topics like !createTTY and how things should be handled (why do we dup the current stdio to the process, and how is that not a security issue). However, these will be dealt with in a separate patchset. In order to allow for late console setup, split setupRootfs into the "preparation" section where all of the mounts are created and the "finalize" section where we pivot_root and set things as ro. In between the two we can set up all of the console mountpoints and symlinks we need. We use two-stage synchronisation to ensures that when the syscalls are reordered in a suboptimal way, an out-of-place read() on the parentPipe will not gobble the ancilliary information. This patch is part of the console rewrite patchset. Signed-off-by: Aleksa Sarai <asarai@suse.de> 2016-06-03 23:29:34 +08:00			`if l.config.CreateConsole {`
Add separate console socket Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2017-03-03 04:53:06 +08:00			`if err := setupConsole(l.consoleSocket, l.config, false); err != nil {`
*: console rewrite This implements {createTTY, detach} and all of the combinations and negations of the two that were previously implemented. There are some valid questions about out-of-OCI-scope topics like !createTTY and how things should be handled (why do we dup the current stdio to the process, and how is that not a security issue). However, these will be dealt with in a separate patchset. In order to allow for late console setup, split setupRootfs into the "preparation" section where all of the mounts are created and the "finalize" section where we pivot_root and set things as ro. In between the two we can set up all of the console mountpoints and symlinks we need. We use two-stage synchronisation to ensures that when the syscalls are reordered in a suboptimal way, an out-of-place read() on the parentPipe will not gobble the ancilliary information. This patch is part of the console rewrite patchset. Signed-off-by: Aleksa Sarai <asarai@suse.de> 2016-06-03 23:29:34 +08:00			`return err`
			`}`
			`if err := system.Setctty(); err != nil {`
			`return err`
			`}`
			`}`
Add support for process overrides of settings This commit adds support to libcontainer to allow caps, no new privs, apparmor, and selinux process label to the process struct so that it can be used together of override the base settings on the container config per individual process. Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2016-03-04 02:44:33 +08:00			`if l.config.NoNewPrivileges {`
libcontainer: use PR_SET_NO_NEW_PRIVS from x/sys/unix Use PR_SET_NO_NEW_PRIVS defined in golang.org/x/sys/unix instead of manually defining it. Signed-off-by: Tobias Klauser <tklauser@distanz.ch> 2017-07-13 21:29:10 +08:00			`if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {`
Implement NoNewPrivileges support in libcontainer Signed-off-by: Mrunal Patel <mrunalp@gmail.com> 2016-02-16 19:55:26 +08:00			`return err`
			`}`
			`}`
SELinux labels are tied to the thread We need to lock the threads for the SetProcessLabel to work, should also call SetProcessLabel("") after the container starts to go back to the default SELinux behaviour. Once you call SetProcessLabel, then any process executed by runc will run with this label, even if the process is for setup rather then the container. It is always safest to call the SELinux calls just before the exec of the container, so that other processes do not get started with the incorrect label. Signed-off-by: Daniel J Walsh <dwalsh@redhat.com> 2018-06-08 01:52:01 +08:00			`if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {`
			`return err`
			`}`
			`defer label.SetProcessLabel("")`
setns init: delay seccomp as late as possible This mirrors the standard_init_linux.go seccomp code, which only applies seccomp early if NoNewPrivileges is enabled. Otherwise it's done immediately before execve to reduce the amount of syscalls necessary for users to enable in their seccomp profiles. Signed-off-by: Aleksa Sarai <asarai@suse.de> 2017-08-24 15:00:50 +08:00			`// Without NoNewPrivileges seccomp is a privileged operation, so we need to`
			`// do this before dropping capabilities; otherwise do it as late as possible`
			`// just before execve so as few syscalls take place after it as possible.`
			`if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {`
Convert Seccomp support to use Libseccomp This removes the existing, native Go seccomp filter generation and replaces it with Libseccomp. Libseccomp is a C library which provides architecture independent generation of Seccomp filters for the Linux kernel. This adds a dependency on v2.2.1 or above of Libseccomp. Signed-off-by: Matthew Heon <mheon@redhat.com> 2015-06-30 02:12:54 +08:00			`if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {`
			`return err`
			`}`
			`}`
Refactor init actions into separate types Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2015-02-07 04:48:57 +08:00			`if err := finalizeNamespace(l.config); err != nil {`
			`return err`
			`}`
Add support for process overrides of settings This commit adds support to libcontainer to allow caps, no new privs, apparmor, and selinux process label to the process struct so that it can be used together of override the base settings on the container config per individual process. Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2016-03-04 02:44:33 +08:00			`if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {`
Refactor init actions into separate types Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2015-02-07 04:48:57 +08:00			`return err`
			`}`
setns init: delay seccomp as late as possible This mirrors the standard_init_linux.go seccomp code, which only applies seccomp early if NoNewPrivileges is enabled. Otherwise it's done immediately before execve to reduce the amount of syscalls necessary for users to enable in their seccomp profiles. Signed-off-by: Aleksa Sarai <asarai@suse.de> 2017-08-24 15:00:50 +08:00			`// Set seccomp as close to execve as possible, so as few syscalls take`
			`// place afterward (reducing the amount of syscalls that users need to`
			`// enable in their seccomp profiles).`
			`if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {`
			`if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {`
			`return newSystemErrorWithCause(err, "init seccomp")`
			`}`
			`}`
Pass os.Environ() as environment to process from init. Replacement of #418 Signed-off-by: Alexander Morozov <lk4d4@docker.com> 2015-03-06 06:33:13 +08:00			`return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())`
Refactor init actions into separate types Signed-off-by: Michael Crosby <crosbymichael@gmail.com> 2015-02-07 04:48:57 +08:00			`}`