Merge pull request #977 from cyphar/nsenter-userns-ordering

nsenter: guarantee correct user namespace ordering
This commit is contained in:
Qiang Huang 2016-10-26 16:45:15 +08:00 committed by GitHub
commit 157a96a428
5 changed files with 575 additions and 220 deletions

View File

@ -22,8 +22,8 @@ var (
supportedNamespaces = make(map[NamespaceType]bool)
)
// nsToFile converts the namespace type to its filename
func nsToFile(ns NamespaceType) string {
// NsName converts the namespace type to its filename
func NsName(ns NamespaceType) string {
switch ns {
case NEWNET:
return "net"
@ -50,7 +50,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
if ok {
return supported
}
nsFile := nsToFile(ns)
nsFile := NsName(ns)
// if the namespace type is unknown, just return false
if nsFile == "" {
return false
@ -84,7 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
if n.Path != "" {
return n.Path
}
return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type))
return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
}
func (n *Namespaces) Remove(t NamespaceType) bool {

View File

@ -1223,16 +1223,22 @@ func (c *linuxContainer) currentState() (*State, error) {
// can setns in order.
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
paths := []string{}
nsTypes := []configs.NamespaceType{
order := []configs.NamespaceType{
// The user namespace *must* be done first.
configs.NEWUSER,
configs.NEWIPC,
configs.NEWUTS,
configs.NEWNET,
configs.NEWPID,
configs.NEWNS,
}
// join userns if the init process explicitly requires NEWUSER
if c.config.Namespaces.Contains(configs.NEWUSER) {
nsTypes = append(nsTypes, configs.NEWUSER)
// Remove namespaces that we don't need to join.
var nsTypes []configs.NamespaceType
for _, ns := range order {
if c.config.Namespaces.Contains(ns) {
nsTypes = append(nsTypes, ns)
}
}
for _, nsType := range nsTypes {
if p, ok := namespaces[nsType]; ok && p != "" {
@ -1249,7 +1255,7 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp
if strings.ContainsRune(p, ',') {
return nil, newSystemError(fmt.Errorf("invalid path %s", p))
}
paths = append(paths, p)
paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(nsType), p))
}
}
return paths, nil

View File

@ -0,0 +1,32 @@
/*
 * Fallback definitions of the CLONE_NEW* namespace flags.
 *
 * <sched.h> is included first; any flag it does not provide is then defined
 * here, so code including this header can use every CLONE_NEW* name below
 * regardless of what the system header declares.
 */
#ifndef NSENTER_NAMESPACE_H
#define NSENTER_NAMESPACE_H
/* Defined before including <sched.h>, unless the includer already set it. */
#ifndef _GNU_SOURCE
# define _GNU_SOURCE
#endif
#include <sched.h>
/* All of these are taken from include/uapi/linux/sched.h */
#ifndef CLONE_NEWNS
# define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#endif
#ifndef CLONE_NEWCGROUP
# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#endif
#ifndef CLONE_NEWUTS
# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
#endif
#ifndef CLONE_NEWIPC
# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
#endif
#ifndef CLONE_NEWUSER
# define CLONE_NEWUSER 0x10000000 /* New user namespace */
#endif
#ifndef CLONE_NEWPID
# define CLONE_NEWPID 0x20000000 /* New pid namespace */
#endif
#ifndef CLONE_NEWNET
# define CLONE_NEWNET 0x40000000 /* New network namespace */
#endif
#endif /* NSENTER_NAMESPACE_H */

View File

@ -29,7 +29,7 @@ func TestNsenterValidPaths(t *testing.T) {
namespaces := []string{
// join pid ns of the current process
fmt.Sprintf("/proc/%d/ns/pid", os.Getpid()),
fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()),
}
cmd := &exec.Cmd{
Path: os.Args[0],
@ -87,7 +87,47 @@ func TestNsenterInvalidPaths(t *testing.T) {
namespaces := []string{
// join pid ns of the current process
fmt.Sprintf("/proc/%d/ns/pid", -1),
fmt.Sprintf("pid:/proc/%d/ns/pid", -1),
}
cmd := &exec.Cmd{
Path: os.Args[0],
Args: args,
ExtraFiles: []*os.File{child},
Env: []string{"_LIBCONTAINER_INITPIPE=3"},
}
if err := cmd.Start(); err != nil {
t.Fatal(err)
}
// write cloneFlags
r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
r.AddData(&libcontainer.Int32msg{
Type: libcontainer.CloneFlagsAttr,
Value: uint32(syscall.CLONE_NEWNET),
})
r.AddData(&libcontainer.Bytemsg{
Type: libcontainer.NsPathsAttr,
Value: []byte(strings.Join(namespaces, ",")),
})
if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
t.Fatal(err)
}
if err := cmd.Wait(); err == nil {
t.Fatalf("nsenter exits with a zero exit status")
}
}
func TestNsenterIncorrectPathType(t *testing.T) {
args := []string{"nsenter-exec"}
parent, child, err := newPipe()
if err != nil {
t.Fatalf("failed to create pipe %v", err)
}
namespaces := []string{
// join pid ns of the current process
fmt.Sprintf("net:/proc/%d/ns/pid", os.Getpid()),
}
cmd := &exec.Cmd{
Path: os.Args[0],

View File

@ -11,6 +11,7 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
@ -23,27 +24,51 @@
#include <linux/netlink.h>
#include <linux/types.h>
#define SYNC_VAL 0x42
#define JUMP_VAL 0x43
/* Get all of the CLONE_NEW* flags. */
#include "namespace.h"
/*
 * Synchronisation values.
 *
 * These are written and read as raw `enum sync_t` values on the sync socket
 * shared by the nsexec stages. The *_PLS values are requests sent by a child
 * to its parent; the *_ACK values are the parent's replies and are never
 * expected to arrive at the parent itself.
 */
enum sync_t {
SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
/* XXX: This doesn't help with segfaults and other such issues. */
SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code (an int) follows. */
};
/* longjmp() arguments. */
#define JUMP_PARENT 0x00
#define JUMP_CHILD 0xA0
#define JUMP_INIT 0xA1
/* JSON buffer. */
#define JSON_MAX 4096
/* Assume the stack grows down, so arguments should be above it. */
struct clone_arg {
struct clone_t {
/*
* Reserve some space for clone() to locate arguments
* and retcode in this place
*/
char stack[4096] __attribute__ ((aligned(16)));
char stack_ptr[0];
/* There are two children. This is used to select which child's code to execute. */
jmp_buf *env;
int jmpval;
};
struct nlconfig_t {
char *data;
uint32_t cloneflags;
char *uidmap;
int uidmap_len;
size_t uidmap_len;
char *gidmap;
int gidmap_len;
size_t gidmap_len;
char *namespaces;
size_t namespaces_len;
uint8_t is_setgroup;
int consolefd;
};
@ -81,80 +106,24 @@ int setns(int fd, int nstype)
}
#endif
/* XXX: This is ugly. */
static int syncfd = -1;
/* TODO(cyphar): Fix this so it correctly deals with syncT. */
#define bail(fmt, ...) \
do { \
fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
exit(__COUNTER__ + 1); \
#define bail(fmt, ...) \
do { \
int ret = __COUNTER__ + 1; \
fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
if (syncfd >= 0) { \
enum sync_t s = SYNC_ERR; \
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \
fprintf(stderr, "nsenter: failed: write(s)"); \
if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \
fprintf(stderr, "nsenter: failed: write(ret)"); \
} \
exit(ret); \
} while(0)
/*
 * Entry point handed to clone(2). It never returns: it jumps straight back to
 * the setjmp() point stored in the clone argument, delivering JUMP_VAL.
 */
static int child_func(void *arg)
{
	struct clone_arg *args = arg;

	longjmp(*args->env, JUMP_VAL);
}
/*
 * Clones a child that shares our parent (CLONE_PARENT) and resumes at the
 * setjmp() point saved in env.
 *
 * env:   jump buffer the child longjmp()s to (via child_func).
 * flags: extra clone(2) flags (the namespace flags) to apply to the child.
 *
 * Returns the child's pid, or a negative value if clone(2) failed. Bails out
 * if the unshare(2) fallback fails.
 */
static int clone_parent(jmp_buf *env, int flags) __attribute__ ((noinline));
static int clone_parent(jmp_buf *env, int flags)
{
	int child;
	struct clone_arg ca = {
		.env = env,
	};

	child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, &ca);
	/*
	 * On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so we have
	 * to unshare(2) before clone(2) in order to do this. This was fixed in
	 * upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e.
	 *
	 * As far as we're aware, the last mainline kernel which had this bug was
	 * Linux 3.12. However, we cannot comment on which kernels the broken patch
	 * was backported to.
	 *
	 * errno is only meaningful when clone(2) actually failed. Checking it
	 * unconditionally could act on a stale EINVAL left by an earlier call and
	 * create a second child after the first clone had already succeeded.
	 */
	if (child < 0 && errno == EINVAL) {
		if (unshare(flags) < 0)
			bail("unable to unshare namespaces");
		child = clone(child_func, ca.stack_ptr, SIGCHLD | CLONE_PARENT, &ca);
	}
	return child;
}
/*
 * Looks up the init pipe fd in the _LIBCONTAINER_INITPIPE environment
 * variable. That pipe is used to read the bootstrap data and to tell the
 * parent what the new pid is once the environment has been set up.
 *
 * Returns -1 when the variable is unset or empty, and bails out when its
 * value is not a cleanly parsed base-10 integer.
 */
static int initpipe(void)
{
	int fd;
	char *end;
	const char *value = getenv("_LIBCONTAINER_INITPIPE");

	if (!value || value[0] == '\0')
		return -1;

	/* Reset errno so a failure reported by strtol() can be told apart. */
	errno = 0;
	fd = strtol(value, &end, 10);
	if (*end != '\0' || errno != 0)
		bail("unable to parse _LIBCONTAINER_INITPIPE");

	return fd;
}
/*
 * Reads a uint32_t, in host byte order, from the first four bytes of buf.
 *
 * The value is copied out with memcpy() rather than by dereferencing a cast
 * pointer, so the read is well-defined whatever the alignment of buf.
 */
static uint32_t readint32(char *buf)
{
	uint32_t value;

	memcpy(&value, buf, sizeof(value));
	return value;
}
/* Returns the first byte of buf as an unsigned 8-bit value. */
static uint8_t readint8(char *buf)
{
	return (uint8_t) buf[0];
}
static int write_file(char *data, size_t data_len, char *pathfmt, ...)
{
int fd, len, ret = 0;
@ -184,18 +153,28 @@ out:
return ret;
}
#define SETGROUPS_ALLOW "allow"
#define SETGROUPS_DENY "deny"
enum policy_t {
SETGROUPS_DEFAULT = 0,
SETGROUPS_ALLOW,
SETGROUPS_DENY,
};
/* This *must* be called before we touch gid_map. */
static void update_setgroups(int pid, bool setgroup)
static void update_setgroups(int pid, enum policy_t setgroup)
{
char *policy;
if (setgroup)
policy = SETGROUPS_ALLOW;
else
policy = SETGROUPS_DENY;
switch (setgroup) {
case SETGROUPS_ALLOW:
policy = "allow";
break;
case SETGROUPS_DENY:
policy = "deny";
break;
case SETGROUPS_DEFAULT:
/* Nothing to do. */
return;
}
if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
/*
@ -225,44 +204,76 @@ static void update_gidmap(int pid, char *map, int map_len)
bail("failed to update /proc/%d/gid_map", pid);
}
#define JSON_MAX 4096
static void start_child(int pipenum, jmp_buf *env, int syncpipe[2], struct nlconfig_t *config)
/* A dummy function that just jumps to the given jumpval. */
static int child_func(void *arg) __attribute__ ((noinline));
static int child_func(void *arg)
{
int len, childpid;
char buf[JSON_MAX];
uint8_t syncval;
struct clone_t *ca = (struct clone_t *)arg;
longjmp(*ca->env, ca->jmpval);
}
/*
* We must fork to actually enter the PID namespace, and use
* CLONE_PARENT so that the child init can have the right parent
* (the bootstrap process). Also so we don't need to forward the
* child's exit code or resend its death signal.
*/
childpid = clone_parent(env, config->cloneflags);
if (childpid < 0)
bail("unable to fork");
static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
static int clone_parent(jmp_buf *env, int jmpval)
{
struct clone_t ca = {
.env = env,
.jmpval = jmpval,
};
/* Update setgroups, uid_map and gid_map for the process if provided. */
if (config->is_setgroup)
update_setgroups(childpid, true);
update_uidmap(childpid, config->uidmap, config->uidmap_len);
update_gidmap(childpid, config->gidmap, config->gidmap_len);
return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
}
/* Send the sync signal to the child. */
close(syncpipe[0]);
syncval = SYNC_VAL;
if (write(syncpipe[1], &syncval, sizeof(syncval)) != sizeof(syncval))
bail("failed to write sync byte to child");
/*
* Gets the init pipe fd from the environment, which is used to read the
* bootstrap data and tell the parent what the new pid is after we finish
* setting up the environment.
*/
static int initpipe(void)
{
int pipenum;
char *initpipe, *endptr;
/* Send the child pid back to our parent */
len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", childpid);
if (len < 0 || write(pipenum, buf, len) != len) {
kill(childpid, SIGKILL);
bail("unable to send a child pid");
}
initpipe = getenv("_LIBCONTAINER_INITPIPE");
if (initpipe == NULL || *initpipe == '\0')
return -1;
exit(0);
pipenum = strtol(initpipe, &endptr, 10);
if (*endptr != '\0')
bail("unable to parse _LIBCONTAINER_INITPIPE");
return pipenum;
}
/*
 * Translates a namespace name (e.g. "net" or "pid") into the matching
 * CLONE_NEW* flag. Names that are not recognised yield 0.
 */
static int nsflag(char *name)
{
	static const struct {
		const char *name;
		int flag;
	} known[] = {
		{"cgroup", CLONE_NEWCGROUP},
		{"ipc", CLONE_NEWIPC},
		{"mnt", CLONE_NEWNS},
		{"net", CLONE_NEWNET},
		{"pid", CLONE_NEWPID},
		{"user", CLONE_NEWUSER},
		{"uts", CLONE_NEWUTS},
	};
	size_t i;

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (strcmp(name, known[i].name) == 0)
			return known[i].flag;
	}

	/* If we don't recognise a name, fallback to 0. */
	return 0;
}
/*
 * Reads a uint32_t, in host byte order, from the first four bytes of buf.
 *
 * The value is copied out with memcpy() rather than by dereferencing a cast
 * pointer, so the read is well-defined whatever the alignment of buf.
 */
static uint32_t readint32(char *buf)
{
	uint32_t value;

	memcpy(&value, buf, sizeof(value));
	return value;
}
/* Returns the first byte of buf as an unsigned 8-bit value. */
static uint8_t readint8(char *buf)
{
	return (uint8_t) buf[0];
}
static void nl_parse(int fd, struct nlconfig_t *config)
@ -309,66 +320,17 @@ static void nl_parse(int fd, struct nlconfig_t *config)
break;
case CONSOLE_PATH_ATTR:
/*
* The context in which this is done (before or after we
* join the other namespaces) will affect how the path
* resolution of the console works. This order is not
* decided here, but rather in container_linux.go. We just
* follow the order given by the netlink message.
* We open the console here because we currently evaluate console
* paths from the *host* namespaces.
*/
config->consolefd = open(current, O_RDWR);
if (config->consolefd < 0)
bail("failed to open console %s", current);
break;
case NS_PATHS_ATTR:{
/*
* Open each namespace path and setns it in the
* order provided to us. We currently don't have
* any context for what kind of namespace we're
* joining, so just blindly do it.
*/
char *saveptr = NULL;
char *ns = strtok_r(current, ",", &saveptr);
int *fds = NULL, num = 0, i;
char **paths = NULL;
if (!ns || !strlen(current))
bail("ns paths are empty");
/*
* We have to open the file descriptors first, since after
* we join the mnt namespace we might no longer be able to
* access the paths.
*/
do {
int fd;
/* Resize fds. */
num++;
fds = realloc(fds, num * sizeof(int));
paths = realloc(paths, num * sizeof(char *));
fd = open(ns, O_RDONLY);
if (fd < 0)
bail("failed to open %s", ns);
fds[num - 1] = fd;
paths[num - 1] = ns;
} while ((ns = strtok_r(NULL, ",", &saveptr)) != NULL);
for (i = 0; i < num; i++) {
int fd = fds[i];
char *path = paths[i];
if (setns(fd, 0) < 0)
bail("failed to setns to %s", path);
close(fd);
}
free(fds);
free(paths);
break;
}
case NS_PATHS_ATTR:
config->namespaces = current;
config->namespaces_len = payload_len;
break;
case UIDMAP_ATTR:
config->uidmap = current;
config->uidmap_len = payload_len;
@ -393,6 +355,71 @@ void nl_free(struct nlconfig_t *config)
free(config->data);
}
/*
 * Joins every namespace named in nslist, in the order given.
 *
 * nslist is a comma-separated list of "type:path" entries (for example
 * "pid:/proc/123/ns/pid"). The buffer is modified in place while it is being
 * tokenised. All paths are opened before the first setns(2) call, and each
 * entry's type is translated with nsflag() and passed to setns(2) as the
 * expected namespace type.
 *
 * Any failure (empty list, entry without a ':', open/realloc/setns error)
 * is reported through bail(), which does not return.
 *
 * NOTE(review): an unrecognised type makes nsflag() return 0, so that entry
 * is passed to setns(2) with nstype 0 -- confirm this fallback is intended.
 */
void join_namespaces(char *nslist)
{
	int num = 0, i;
	char *saveptr = NULL;
	char *namespace = strtok_r(nslist, ",", &saveptr);
	struct namespace_t {
		int fd;
		int ns;
		char path[PATH_MAX];
	} *namespaces = NULL;

	if (!namespace || !strlen(namespace) || !strlen(nslist))
		bail("ns paths are empty");

	/*
	 * We have to open the file descriptors first, since after
	 * we join the mnt namespace we might no longer be able to
	 * access the paths.
	 */
	do {
		int fd;
		char *path;
		struct namespace_t *ns;

		/* Resize the namespace array. */
		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
		if (!namespaces)
			bail("failed to reallocate namespace array");
		ns = &namespaces[num - 1];

		/* Split 'ns:path'. */
		path = strchr(namespace, ':');
		if (!path)
			bail("failed to parse %s", namespace);
		*path++ = '\0';

		/*
		 * Report the path on failure: `namespace` was just cut at the
		 * ':' and now holds only the type name.
		 */
		fd = open(path, O_RDONLY);
		if (fd < 0)
			bail("failed to open %s", path);

		ns->fd = fd;
		ns->ns = nsflag(namespace);
		/* strncpy() does not terminate on truncation, so do it by hand. */
		strncpy(ns->path, path, PATH_MAX - 1);
		ns->path[PATH_MAX - 1] = '\0';
	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);

	/*
	 * The ordering in which we join namespaces is important. We should
	 * always join the user namespace *first*. This is all guaranteed
	 * from the container_linux.go side of this, so we're just going to
	 * follow the order given to us.
	 */
	for (i = 0; i < num; i++) {
		/* Use a pointer: each element embeds a PATH_MAX-sized buffer. */
		struct namespace_t *ns = &namespaces[i];

		if (setns(ns->fd, ns->ns) < 0)
			bail("failed to setns to %s", ns->path);

		close(ns->fd);
	}

	free(namespaces);
}
void nsexec(void)
{
int pipenum;
@ -413,61 +440,311 @@ void nsexec(void)
/* clone(2) flags are mandatory. */
if (config.cloneflags == -1)
bail("missing clone_flags");
bail("missing cloneflags");
/* Pipe so we can tell the child when we've finished setting up. */
if (pipe(syncpipe) < 0)
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0)
bail("failed to setup sync pipe between parent and child");
/* Set up the jump point. */
if (setjmp(env) == JUMP_VAL) {
/*
* We're inside the child now, having jumped from the
* start_child() code after forking in the parent.
*/
uint8_t s = 0;
int consolefd = config.consolefd;
/* TODO: Currently we aren't dealing with child deaths properly. */
/* Close the writing side of pipe. */
close(syncpipe[1]);
/*
* Okay, so this is quite annoying.
*
* In order to make sure that we deal with older kernels (when CLONE_NEWUSER
* wasn't guaranteed to be done first if you specify multiple namespaces in
* a clone(2) invocation) as well as with certain usecases like rootless
* containers, we cannot just dump all of the cloneflags into clone(2).
* However, if we unshare(2) the user namespace *before* we clone(2), then
* all hell breaks loose.
*
* The parent no longer has permissions to do many things (unshare(2) drops
* all capabilities in your old namespace), and the container cannot be set
* up to have more than one {uid,gid} mapping. This is obviously less than
* ideal. In order to fix this, we have to first clone(2) and then unshare.
*
* Unfortunately, it's not as simple as that. We have to fork to enter the
* PID namespace (the PID namespace only applies to children). Since we'll
* have to double-fork, this clone_parent() call won't be able to get the
* PID of the _actual_ init process (without doing more synchronisation than
* I can deal with at the moment). So we'll just get the parent to send it
* for us, the only job of this process is to update
* /proc/pid/{setgroups,uid_map,gid_map}.
*
* And as a result of the above, we also need to setns(2) in the first child
* because if we join a PID namespace in the topmost parent then our child
* will be in that namespace (and it will not be able to give us a PID value
* that makes sense without resorting to sending things with cmsg).
*
* This also deals with an older issue caused by dumping cloneflags into
* clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
* we have to unshare(2) before clone(2) in order to do this. This was fixed
* in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
* introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
* aware, the last mainline kernel which had this bug was Linux 3.12.
* However, we cannot comment on which kernels the broken patch was
* backported to.
*
* -- Aleksa "what has my life come to?" Sarai
*/
/* Sync with parent. */
if (read(syncpipe[0], &s, sizeof(s)) != sizeof(s) || s != SYNC_VAL)
bail("failed to read sync byte from parent");
switch (setjmp(env)) {
/*
* Stage 0: We're in the parent. Our job is just to create a new child
* (stage 1: JUMP_CHILD) process and write its uid_map and
* gid_map. That process will go on to create a new process, then
* it will send us its PID which we will send to the bootstrap
* process.
*/
case JUMP_PARENT: {
int len;
pid_t child;
char buf[JSON_MAX];
if (setsid() < 0)
bail("setsid failed");
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);
if (setuid(0) < 0)
bail("setuid failed");
/* Start the process of getting a container. */
child = clone_parent(&env, JUMP_CHILD);
if (child < 0)
bail("unable to fork: child_func");
if (setgid(0) < 0)
bail("setgid failed");
/* State machine for synchronisation with the children. */
while (true) {
enum sync_t s;
if (setgroups(0, NULL) < 0)
bail("setgroups failed");
/* This doesn't need to be global, we're in the parent. */
int syncfd = syncpipe[1];
if (consolefd != -1) {
if (ioctl(consolefd, TIOCSCTTY, 0) < 0)
bail("ioctl TIOCSCTTY failed");
if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO)
bail("failed to dup stdin");
if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO)
bail("failed to dup stdout");
if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO)
bail("failed to dup stderr");
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with child: next state");
switch (s) {
case SYNC_ERR: {
/* We have to mirror the error code of the child. */
int ret;
if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
bail("failed to sync with child: read(error code)");
exit(ret);
}
break;
case SYNC_USERMAP_PLS:
/* Enable setgroups(2) if we've been asked to. */
if (config.is_setgroup)
update_setgroups(child, SETGROUPS_ALLOW);
/* Set up mappings. */
update_uidmap(child, config.uidmap, config.uidmap_len);
update_gidmap(child, config.gidmap, config.gidmap_len);
s = SYNC_USERMAP_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
}
break;
case SYNC_USERMAP_ACK:
/* We should _never_ receive acks. */
kill(child, SIGKILL);
bail("failed to sync with child: unexpected SYNC_USERMAP_ACK");
break;
case SYNC_RECVPID_PLS: {
pid_t old = child;
/* Get the init_func pid. */
if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
kill(old, SIGKILL);
bail("failed to sync with child: read(childpid)");
}
/* Send ACK. */
s = SYNC_RECVPID_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(old, SIGKILL);
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
}
}
/* Leave the loop. */
goto out;
case SYNC_RECVPID_ACK:
/* We should _never_ receive acks. */
kill(child, SIGKILL);
bail("failed to sync with child: unexpected SYNC_RECVPID_ACK");
break;
}
}
out:
/* Send the init_func pid back to our parent. */
len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
}
if (write(pipenum, buf, len) != len) {
kill(child, SIGKILL);
bail("unable to send child pid to bootstrapper");
}
exit(0);
}
/* Free netlink data. */
nl_free(&config);
/*
* Stage 1: We're in the first child process. Our job is to join any
* provided user namespaces in the netlink payload. If we've been
* asked to CLONE_NEWUSER, we will unshare the user namespace and
* ask our parent (stage 0) to set up our user mappings for us.
* Then, we unshare the rest of the requested namespaces and
* create a new child (stage 2: JUMP_INIT). We then send the
* child's PID to our parent (stage 0).
*/
case JUMP_CHILD: {
pid_t child;
enum sync_t s;
/* Finish executing, let the Go runtime take over. */
return;
/* We're in a child and thus need to tell the parent if we die. */
syncfd = syncpipe[0];
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
/*
* We need to setns first. We cannot do this earlier (in stage 0)
* because of the fact that we forked to get here (the PID of
* [stage 2: JUMP_INIT] would be meaningless). We could send it
* using cmsg(3) but that's just annoying.
*/
if (config.namespaces)
join_namespaces(config.namespaces);
/*
* Unshare all of the namespaces. Now, it should be noted that this
* ordering might break in the future (especially with rootless
* containers). But for now, it's not possible to split this into
* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
*
* We also can't be sure if the current kernel supports
* clone(CLONE_PARENT | CLONE_NEWPID), so we'll just do it the long
* way anyway.
*/
if (unshare(config.cloneflags) < 0)
bail("failed to unshare namespaces");
/*
* Deal with user namespaces first. They are quite special, as they
* affect our ability to unshare other namespaces and are used as
* context for privilege checks.
*/
if (config.cloneflags & CLONE_NEWUSER) {
/*
* We don't have the privileges to do any mapping here (see the
* clone_parent rant). So signal our parent to hook us up.
*/
s = SYNC_USERMAP_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
/* ... wait for mapping ... */
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
if (s != SYNC_USERMAP_ACK)
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
}
/* TODO: What about non-namespace clone flags that we're dropping here? */
child = clone_parent(&env, JUMP_INIT);
if (child < 0)
bail("unable to fork: init_func");
/* Send the child to our parent, which knows what it's doing. */
s = SYNC_RECVPID_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
}
if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
kill(child, SIGKILL);
bail("failed to sync with parent: write(childpid)");
}
/* ... wait for parent to get the pid ... */
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
}
if (s != SYNC_RECVPID_ACK) {
kill(child, SIGKILL);
bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
}
/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
exit(0);
}
/*
* Stage 2: We're the final child process, and the only process that will
* actually return to the Go runtime. Our job is to just do the
* final cleanup steps and then return to the Go runtime to allow
* init_linux.go to run.
*/
case JUMP_INIT: {
/*
* We're inside the final child now, having jumped here through
* clone_parent() after being forked by the stage 1 (JUMP_CHILD) process.
*/
int consolefd = config.consolefd;
/* We're in a child and thus need to tell the parent if we die. */
syncfd = syncpipe[0];
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long) "runc:[1:INIT]", 0, 0, 0);
if (setsid() < 0)
bail("setsid failed");
if (setuid(0) < 0)
bail("setuid failed");
if (setgid(0) < 0)
bail("setgid failed");
if (setgroups(0, NULL) < 0)
bail("setgroups failed");
if (consolefd != -1) {
if (ioctl(consolefd, TIOCSCTTY, 0) < 0)
bail("ioctl TIOCSCTTY failed");
if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO)
bail("failed to dup stdin");
if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO)
bail("failed to dup stdout");
if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO)
bail("failed to dup stderr");
}
/* Close sync pipes. */
close(syncpipe[0]);
close(syncpipe[1]);
/* Free netlink data. */
nl_free(&config);
/* Finish executing, let the Go runtime take over. */
return;
}
default:
bail("unexpected jump value");
break;
}
/* Run the parent code. */
start_child(pipenum, &env, syncpipe, &config);
/* Should never be reached. */
bail("should never be reached");
}