Merge pull request #977 from cyphar/nsenter-userns-ordering

nsenter: guarantee correct user namespace ordering
2016-10-26 16:45:15 +08:00 · 2016-10-26 16:45:15 +08:00 · 157a96a428
parent d6b68e8a60 e3cd191acc
commit 157a96a428
5 changed files with 575 additions and 220 deletions
--- a/libcontainer/configs/namespaces_unix.go
+++ b/libcontainer/configs/namespaces_unix.go
@ -22,8 +22,8 @@ var (
 	supportedNamespaces = make(map[NamespaceType]bool)
 )

-// nsToFile converts the namespace type to its filename
-func nsToFile(ns NamespaceType) string {
+// NsName converts the namespace type to its filename
+func NsName(ns NamespaceType) string {
 	switch ns {
 	case NEWNET:
 		return "net"
@ -50,7 +50,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
 	if ok {
 		return supported
 	}
-	nsFile := nsToFile(ns)
+	nsFile := NsName(ns)
 	// if the namespace type is unknown, just return false
 	if nsFile == "" {
 		return false
@ -84,7 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
 	if n.Path != "" {
 		return n.Path
 	}
-	return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type))
+	return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
 }

 func (n *Namespaces) Remove(t NamespaceType) bool {
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@ -1223,16 +1223,22 @@ func (c *linuxContainer) currentState() (*State, error) {
 // can setns in order.
 func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
 	paths := []string{}
-	nsTypes := []configs.NamespaceType{
+	order := []configs.NamespaceType{
+		// The user namespace *must* be done first.
+		configs.NEWUSER,
 		configs.NEWIPC,
 		configs.NEWUTS,
 		configs.NEWNET,
 		configs.NEWPID,
 		configs.NEWNS,
 	}
-	// join userns if the init process explicitly requires NEWUSER
-	if c.config.Namespaces.Contains(configs.NEWUSER) {
-		nsTypes = append(nsTypes, configs.NEWUSER)
+
+	// Remove namespaces that we don't need to join.
+	var nsTypes []configs.NamespaceType
+	for _, ns := range order {
+		if c.config.Namespaces.Contains(ns) {
+			nsTypes = append(nsTypes, ns)
+		}
 	}
 	for _, nsType := range nsTypes {
 		if p, ok := namespaces[nsType]; ok && p != "" {
@ -1249,7 +1255,7 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp
 			if strings.ContainsRune(p, ',') {
 				return nil, newSystemError(fmt.Errorf("invalid path %s", p))
 			}
-			paths = append(paths, p)
+			paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(nsType), p))
 		}
 	}
 	return paths, nil
--- a/libcontainer/nsenter/namespace.h
+++ b/libcontainer/nsenter/namespace.h
@ -0,0 +1,32 @@
+#ifndef NSENTER_NAMESPACE_H
+#define NSENTER_NAMESPACE_H
+
+#ifndef _GNU_SOURCE
+#	define _GNU_SOURCE
+#endif
+#include <sched.h>
+
+/* All of these are taken from include/uapi/linux/sched.h */
+#ifndef CLONE_NEWNS
+#	define CLONE_NEWNS 0x00020000 /* New mount namespace group */
+#endif
+#ifndef CLONE_NEWCGROUP
+#	define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
+#endif
+#ifndef CLONE_NEWUTS
+#	define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
+#endif
+#ifndef CLONE_NEWIPC
+#	define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
+#endif
+#ifndef CLONE_NEWUSER
+#	define CLONE_NEWUSER 0x10000000 /* New user namespace */
+#endif
+#ifndef CLONE_NEWPID
+#	define CLONE_NEWPID 0x20000000 /* New pid namespace */
+#endif
+#ifndef CLONE_NEWNET
+#	define CLONE_NEWNET 0x40000000 /* New network namespace */
+#endif
+
+#endif /* NSENTER_NAMESPACE_H */
--- a/libcontainer/nsenter/nsenter_test.go
+++ b/libcontainer/nsenter/nsenter_test.go
@ -29,7 +29,7 @@ func TestNsenterValidPaths(t *testing.T) {

 	namespaces := []string{
 		// join pid ns of the current process
-		fmt.Sprintf("/proc/%d/ns/pid", os.Getpid()),
+		fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()),
 	}
 	cmd := &exec.Cmd{
 		Path:       os.Args[0],
@ -87,7 +87,47 @@ func TestNsenterInvalidPaths(t *testing.T) {

 	namespaces := []string{
 		// join pid ns of the current process
-		fmt.Sprintf("/proc/%d/ns/pid", -1),
+		fmt.Sprintf("pid:/proc/%d/ns/pid", -1),
+	}
+	cmd := &exec.Cmd{
+		Path:       os.Args[0],
+		Args:       args,
+		ExtraFiles: []*os.File{child},
+		Env:        []string{"_LIBCONTAINER_INITPIPE=3"},
+	}
+
+	if err := cmd.Start(); err != nil {
+		t.Fatal(err)
+	}
+	// write cloneFlags
+	r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
+	r.AddData(&libcontainer.Int32msg{
+		Type:  libcontainer.CloneFlagsAttr,
+		Value: uint32(syscall.CLONE_NEWNET),
+	})
+	r.AddData(&libcontainer.Bytemsg{
+		Type:  libcontainer.NsPathsAttr,
+		Value: []byte(strings.Join(namespaces, ",")),
+	})
+	if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := cmd.Wait(); err == nil {
+		t.Fatalf("nsenter exits with a zero exit status")
+	}
+}
+
+func TestNsenterIncorrectPathType(t *testing.T) {
+	args := []string{"nsenter-exec"}
+	parent, child, err := newPipe()
+	if err != nil {
+		t.Fatalf("failed to create pipe %v", err)
+	}
+
+	namespaces := []string{
+		// try to join the pid ns of the current process, mislabelled as "net"
+		fmt.Sprintf("net:/proc/%d/ns/pid", os.Getpid()),
 	}
 	cmd := &exec.Cmd{
 		Path:       os.Args[0],
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@ -11,6 +11,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdbool.h>
 #include <string.h>
 #include <unistd.h>

@ -23,27 +24,51 @@
 #include <linux/netlink.h>
 #include <linux/types.h>

-#define SYNC_VAL 0x42
-#define JUMP_VAL 0x43
+/* Get all of the CLONE_NEW* flags. */
+#include "namespace.h"
+
+/* Synchronisation values. */
+enum sync_t {
+	SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
+	SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
+	SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
+	SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
+
+	/* XXX: This doesn't help with segfaults and other such issues. */
+	SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
+};
+
+/* longjmp() arguments. */
+#define JUMP_PARENT 0x00
+#define JUMP_CHILD  0xA0
+#define JUMP_INIT   0xA1
+
+/* JSON buffer. */
+#define JSON_MAX 4096

 /* Assume the stack grows down, so arguments should be above it. */
-struct clone_arg {
+struct clone_t {
 	/*
 	 * Reserve some space for clone() to locate arguments
 	 * and retcode in this place
 	 */
 	char stack[4096] __attribute__ ((aligned(16)));
 	char stack_ptr[0];
+
+	/* There are two children. This is used to select which code each one executes. */
 	jmp_buf *env;
+	int jmpval;
 };

 struct nlconfig_t {
 	char *data;
 	uint32_t cloneflags;
 	char *uidmap;
-	int uidmap_len;
+	size_t uidmap_len;
 	char *gidmap;
-	int gidmap_len;
+	size_t gidmap_len;
+	char *namespaces;
+	size_t namespaces_len;
 	uint8_t is_setgroup;
 	int consolefd;
 };
@ -81,80 +106,24 @@ int setns(int fd, int nstype)
 }
 #endif

+/* XXX: This is ugly. */
+static int syncfd = -1;
+
 /* TODO(cyphar): Fix this so it correctly deals with syncT. */
-#define bail(fmt, ...)							\
-	do {								\
-		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
-		exit(__COUNTER__ + 1);					\
+#define bail(fmt, ...)								\
+	do {									\
+		int ret = __COUNTER__ + 1;					\
+		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__);	\
+		if (syncfd >= 0) {						\
+			enum sync_t s = SYNC_ERR;				\
+			if (write(syncfd, &s, sizeof(s)) != sizeof(s))		\
+				fprintf(stderr, "nsenter: failed: write(s)");	\
+			if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret))	\
+				fprintf(stderr, "nsenter: failed: write(ret)");	\
+		}								\
+		exit(ret);							\
 	} while(0)

-static int child_func(void *arg)
-{
-	struct clone_arg *ca = (struct clone_arg *)arg;
-	longjmp(*ca->env, JUMP_VAL);
-}
-
-static int clone_parent(jmp_buf *env, int flags) __attribute__ ((noinline));
-static int clone_parent(jmp_buf *env, int flags)
-{
-	int child;
-	struct clone_arg ca = {
-		.env = env,
-	};
-
-	child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, &ca);
-
-	/*
-	 * On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so we have
-	 * to unshare(2) before clone(2) in order to do this. This was fixed in
-	 * upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
-	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e.
-	 *
-	 * As far as we're aware, the last mainline kernel which had this bug was
-	 * Linux 3.12. However, we cannot comment on which kernels the broken patch
-	 * was backported to.
-	 */
-	if (errno == EINVAL) {
-		if (unshare(flags) < 0)
-			bail("unable to unshare namespaces");
-		child = clone(child_func, ca.stack_ptr, SIGCHLD | CLONE_PARENT, &ca);
-	}
-
-	return child;
-}
-
-/*
- * Gets the init pipe fd from the environment, which is used to read the
- * bootstrap data and tell the parent what the new pid is after we finish
- * setting up the environment.
- */
-static int initpipe(void)
-{
-	int pipenum;
-	char *initpipe, *endptr;
-
-	initpipe = getenv("_LIBCONTAINER_INITPIPE");
-	if (initpipe == NULL || *initpipe == '\0')
-		return -1;
-
-	errno = 0;
-	pipenum = strtol(initpipe, &endptr, 10);
-	if (errno != 0 || *endptr != '\0')
-		bail("unable to parse _LIBCONTAINER_INITPIPE");
-
-	return pipenum;
-}
-
-static uint32_t readint32(char *buf)
-{
-	return *(uint32_t *) buf;
-}
-
-static uint8_t readint8(char *buf)
-{
-	return *(uint8_t *) buf;
-}
-
 static int write_file(char *data, size_t data_len, char *pathfmt, ...)
 {
 	int fd, len, ret = 0;
@ -184,18 +153,28 @@ out:
 	return ret;
 }

-#define SETGROUPS_ALLOW "allow"
-#define SETGROUPS_DENY  "deny"
+enum policy_t {
+	SETGROUPS_DEFAULT = 0,
+	SETGROUPS_ALLOW,
+	SETGROUPS_DENY,
+};

 /* This *must* be called before we touch gid_map. */
-static void update_setgroups(int pid, bool setgroup)
+static void update_setgroups(int pid, enum policy_t setgroup)
 {
 	char *policy;

-	if (setgroup)
-		policy = SETGROUPS_ALLOW;
-	else
-		policy = SETGROUPS_DENY;
+	switch (setgroup) {
+		case SETGROUPS_ALLOW:
+			policy = "allow";
+			break;
+		case SETGROUPS_DENY:
+			policy = "deny";
+			break;
+		case SETGROUPS_DEFAULT:
+			/* Nothing to do. */
+			return;
+	}

 	if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
 		/*
@ -225,44 +204,76 @@ static void update_gidmap(int pid, char *map, int map_len)
 		bail("failed to update /proc/%d/gid_map", pid);
 }

-#define JSON_MAX 4096
-
-static void start_child(int pipenum, jmp_buf *env, int syncpipe[2], struct nlconfig_t *config)
+/* A dummy function that just jumps to the given jumpval. */
+static int child_func(void *arg) __attribute__ ((noinline));
+static int child_func(void *arg)
 {
-	int len, childpid;
-	char buf[JSON_MAX];
-	uint8_t syncval;
+	struct clone_t *ca = (struct clone_t *)arg;
+	longjmp(*ca->env, ca->jmpval);
+}

-	/*
-	 * We must fork to actually enter the PID namespace, and use
-	 * CLONE_PARENT so that the child init can have the right parent
-	 * (the bootstrap process). Also so we don't need to forward the
-	 * child's exit code or resend its death signal.
-	 */
-	childpid = clone_parent(env, config->cloneflags);
-	if (childpid < 0)
-		bail("unable to fork");
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
+static int clone_parent(jmp_buf *env, int jmpval)
+{
+	struct clone_t ca = {
+		.env    = env,
+		.jmpval = jmpval,
+	};

-	/* Update setgroups, uid_map and gid_map for the process if provided. */
-	if (config->is_setgroup)
-		update_setgroups(childpid, true);
-	update_uidmap(childpid, config->uidmap, config->uidmap_len);
-	update_gidmap(childpid, config->gidmap, config->gidmap_len);
+	return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
+}

-	/* Send the sync signal to the child. */
-	close(syncpipe[0]);
-	syncval = SYNC_VAL;
-	if (write(syncpipe[1], &syncval, sizeof(syncval)) != sizeof(syncval))
-		bail("failed to write sync byte to child");
+/*
+ * Gets the init pipe fd from the environment, which is used to read the
+ * bootstrap data and tell the parent what the new pid is after we finish
+ * setting up the environment.
+ */
+static int initpipe(void)
+{
+	int pipenum;
+	char *initpipe, *endptr;

-	/* Send the child pid back to our parent */
-	len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", childpid);
-	if (len < 0 || write(pipenum, buf, len) != len) {
-		kill(childpid, SIGKILL);
-		bail("unable to send a child pid");
-	}
+	initpipe = getenv("_LIBCONTAINER_INITPIPE");
+	if (initpipe == NULL || *initpipe == '\0')
+		return -1;

-	exit(0);
+	pipenum = strtol(initpipe, &endptr, 10);
+	if (*endptr != '\0')
+		bail("unable to parse _LIBCONTAINER_INITPIPE");
+
+	return pipenum;
+}
+
+/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
+static int nsflag(char *name)
+{
+	if (!strcmp(name, "cgroup"))
+		return CLONE_NEWCGROUP;
+	else if (!strcmp(name, "ipc"))
+		return CLONE_NEWIPC;
+	else if (!strcmp(name, "mnt"))
+		return CLONE_NEWNS;
+	else if (!strcmp(name, "net"))
+		return CLONE_NEWNET;
+	else if (!strcmp(name, "pid"))
+		return CLONE_NEWPID;
+	else if (!strcmp(name, "user"))
+		return CLONE_NEWUSER;
+	else if (!strcmp(name, "uts"))
+		return CLONE_NEWUTS;
+
+	/* If we don't recognise a name, fall back to 0. */
+	return 0;
+}
+
+static uint32_t readint32(char *buf)
+{
+	return *(uint32_t *) buf;
+}
+
+static uint8_t readint8(char *buf)
+{
+	return *(uint8_t *) buf;
 }

 static void nl_parse(int fd, struct nlconfig_t *config)
@ -309,66 +320,17 @@ static void nl_parse(int fd, struct nlconfig_t *config)
 			break;
 		case CONSOLE_PATH_ATTR:
 			/*
-			 * The context in which this is done (before or after we
-			 * join the other namespaces) will affect how the path
-			 * resolution of the console works. This order is not
-			 * decided here, but rather in container_linux.go. We just
-			 * follow the order given by the netlink message.
+			 * We open the console here because we currently evaluate console
+			 * paths from the *host* namespaces.
 			 */
 			config->consolefd = open(current, O_RDWR);
 			if (config->consolefd < 0)
 				bail("failed to open console %s", current);
 			break;
-		case NS_PATHS_ATTR:{
-				/*
-				 * Open each namespace path and setns it in the
-				 * order provided to us. We currently don't have
-				 * any context for what kind of namespace we're
-				 * joining, so just blindly do it.
-				 */
-				char *saveptr = NULL;
-				char *ns = strtok_r(current, ",", &saveptr);
-				int *fds = NULL, num = 0, i;
-				char **paths = NULL;
-
-				if (!ns || !strlen(current))
-					bail("ns paths are empty");
-
-				/*
-				 * We have to open the file descriptors first, since after
-				 * we join the mnt namespace we might no longer be able to
-				 * access the paths.
-				 */
-				do {
-					int fd;
-
-					/* Resize fds. */
-					num++;
-					fds = realloc(fds, num * sizeof(int));
-					paths = realloc(paths, num * sizeof(char *));
-
-					fd = open(ns, O_RDONLY);
-					if (fd < 0)
-						bail("failed to open %s", ns);
-
-					fds[num - 1] = fd;
-					paths[num - 1] = ns;
-				} while ((ns = strtok_r(NULL, ",", &saveptr)) != NULL);
-
-				for (i = 0; i < num; i++) {
-					int fd = fds[i];
-					char *path = paths[i];
-
-					if (setns(fd, 0) < 0)
-						bail("failed to setns to %s", path);
-
-					close(fd);
-				}
-
-				free(fds);
-				free(paths);
-				break;
-			}
+		case NS_PATHS_ATTR:
+			config->namespaces = current;
+			config->namespaces_len = payload_len;
+			break;
 		case UIDMAP_ATTR:
 			config->uidmap = current;
 			config->uidmap_len = payload_len;
@ -393,6 +355,71 @@ void nl_free(struct nlconfig_t *config)
 	free(config->data);
 }

+void join_namespaces(char *nslist)
+{
+	int num = 0, i;
+	char *saveptr = NULL;
+	char *namespace = strtok_r(nslist, ",", &saveptr);
+	struct namespace_t {
+		int fd;
+		int ns;
+		char type[PATH_MAX];
+		char path[PATH_MAX];
+	} *namespaces = NULL;
+
+	if (!namespace || !strlen(namespace) || !strlen(nslist))
+		bail("ns paths are empty");
+
+	/*
+	 * We have to open the file descriptors first, since after
+	 * we join the mnt namespace we might no longer be able to
+	 * access the paths.
+	 */
+	do {
+		int fd;
+		char *path;
+		struct namespace_t *ns;
+
+		/* Resize the namespace array. */
+		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
+		if (!namespaces)
+			bail("failed to reallocate namespace array");
+		ns = &namespaces[num - 1];
+
+		/* Split 'ns:path'. */
+		path = strstr(namespace, ":");
+		if (!path)
+			bail("failed to parse %s", namespace);
+		*path++ = '\0';
+
+		fd = open(path, O_RDONLY);
+		if (fd < 0)
+			bail("failed to open %s", namespace);
+
+		ns->fd = fd;
+		ns->ns = nsflag(namespace);
+		strncpy(ns->path, path, PATH_MAX);
+	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
+
+	/*
+	 * The ordering in which we join namespaces is important. We should
+	 * always join the user namespace *first*. This is all guaranteed
+	 * from the container_linux.go side of this, so we're just going to
+	 * follow the order given to us.
+	 */
+
+	for (i = 0; i < num; i++) {
+		struct namespace_t ns = namespaces[i];
+
+		if (setns(ns.fd, ns.ns) < 0)
+			bail("failed to setns to %s", ns.path);
+
+		close(ns.fd);
+	}
+
+	free(namespaces);
+}
+
 void nsexec(void)
 {
 	int pipenum;
@ -413,61 +440,311 @@ void nsexec(void)

 	/* clone(2) flags are mandatory. */
 	if (config.cloneflags == -1)
-		bail("missing clone_flags");
+		bail("missing cloneflags");

 	/* Pipe so we can tell the child when we've finished setting up. */
-	if (pipe(syncpipe) < 0)
+	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0)
 		bail("failed to setup sync pipe between parent and child");

-	/* Set up the jump point. */
-	if (setjmp(env) == JUMP_VAL) {
-		/*
-		 * We're inside the child now, having jumped from the
-		 * start_child() code after forking in the parent.
-		 */
-		uint8_t s = 0;
-		int consolefd = config.consolefd;
+	/* TODO: Currently we aren't dealing with child deaths properly. */

-		/* Close the writing side of pipe. */
-		close(syncpipe[1]);
+	/*
+	 * Okay, so this is quite annoying.
+	 *
+	 * In order to make sure that we deal with older kernels (when CLONE_NEWUSER
+	 * wasn't guaranteed to be done first if you specify multiple namespaces in
+	 * a clone(2) invocation) as well as with certain usecases like rootless
+	 * containers, we cannot just dump all of the cloneflags into clone(2).
+	 * However, if we unshare(2) the user namespace *before* we clone(2), then
+	 * all hell breaks loose.
+	 *
+	 * The parent no longer has permissions to do many things (unshare(2) drops
+	 * all capabilities in your old namespace), and the container cannot be set
+	 * up to have more than one {uid,gid} mapping. This is obviously less than
+	 * ideal. In order to fix this, we have to first clone(2) and then unshare.
+	 *
+	 * Unfortunately, it's not as simple as that. We have to fork to enter the
+	 * PID namespace (the PID namespace only applies to children). Since we'll
+	 * have to double-fork, this clone_parent() call won't be able to get the
+	 * PID of the _actual_ init process (without doing more synchronisation than
+	 * I can deal with at the moment). So we'll just get the parent to send it
+	 * for us, the only job of this process is to update
+	 * /proc/pid/{setgroups,uid_map,gid_map}.
+	 *
+	 * And as a result of the above, we also need to setns(2) in the first child
+	 * because if we join a PID namespace in the topmost parent then our child
+	 * will be in that namespace (and it will not be able to give us a PID value
+	 * that makes sense without resorting to sending things with cmsg).
+	 *
+	 * This also deals with an older issue caused by dumping cloneflags into
+	 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
+	 * we have to unshare(2) before clone(2) in order to do this. This was fixed
+	 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
+	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
+	 * aware, the last mainline kernel which had this bug was Linux 3.12.
+	 * However, we cannot comment on which kernels the broken patch was
+	 * backported to.
+	 *
+	 * -- Aleksa "what has my life come to?" Sarai
+	 */

-		/* Sync with parent. */
-		if (read(syncpipe[0], &s, sizeof(s)) != sizeof(s) || s != SYNC_VAL)
-			bail("failed to read sync byte from parent");
+	switch (setjmp(env)) {
+	/*
+	 * Stage 0: We're in the parent. Our job is just to create a new child
+	 *          (stage 1: JUMP_CHILD) process and write its uid_map and
+	 *          gid_map. That process will go on to create a new process, then
+	 *          it will send us its PID which we will send to the bootstrap
+	 *          process.
+	 */
+	case JUMP_PARENT: {
+			int len;
+			pid_t child;
+			char buf[JSON_MAX];

-		if (setsid() < 0)
-			bail("setsid failed");
+			/* For debugging. */
+			prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);

-		if (setuid(0) < 0)
-			bail("setuid failed");
+			/* Start the process of getting a container. */
+			child = clone_parent(&env, JUMP_CHILD);
+			if (child < 0)
+				bail("unable to fork: child_func");

-		if (setgid(0) < 0)
-			bail("setgid failed");
+			/* State machine for synchronisation with the children. */
+			while (true) {
+				enum sync_t s;

-		if (setgroups(0, NULL) < 0)
-			bail("setgroups failed");
+				/* This doesn't need to be global, we're in the parent. */
+				int syncfd = syncpipe[1];

-		if (consolefd != -1) {
-			if (ioctl(consolefd, TIOCSCTTY, 0) < 0)
-				bail("ioctl TIOCSCTTY failed");
-			if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO)
-				bail("failed to dup stdin");
-			if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO)
-				bail("failed to dup stdout");
-			if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO)
-				bail("failed to dup stderr");
+				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+					bail("failed to sync with child: next state");
+
+				switch (s) {
+				case SYNC_ERR: {
+						/* We have to mirror the error code of the child. */
+						int ret;
+
+						if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
+							bail("failed to sync with child: read(error code)");
+
+						exit(ret);
+					}
+					break;
+				case SYNC_USERMAP_PLS:
+					/* Enable setgroups(2) if we've been asked to. */
+					if (config.is_setgroup)
+						update_setgroups(child, SETGROUPS_ALLOW);
+
+					/* Set up mappings. */
+					update_uidmap(child, config.uidmap, config.uidmap_len);
+					update_gidmap(child, config.gidmap, config.gidmap_len);
+
+					s = SYNC_USERMAP_ACK;
+					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+						kill(child, SIGKILL);
+						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
+					}
+					break;
+				case SYNC_USERMAP_ACK:
+					/* We should _never_ receive acks. */
+					kill(child, SIGKILL);
+					bail("failed to sync with child: unexpected SYNC_USERMAP_ACK");
+					break;
+				case SYNC_RECVPID_PLS: {
+						pid_t old = child;
+
+						/* Get the init_func pid. */
+						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
+							kill(old, SIGKILL);
+							bail("failed to sync with child: read(childpid)");
+						}
+
+						/* Send ACK. */
+						s = SYNC_RECVPID_ACK;
+						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+							kill(old, SIGKILL);
+							kill(child, SIGKILL);
+							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
+						}
+					}
+
+					/* Leave the loop. */
+					goto out;
+				case SYNC_RECVPID_ACK:
+					/* We should _never_ receive acks. */
+					kill(child, SIGKILL);
+					bail("failed to sync with child: unexpected SYNC_RECVPID_ACK");
+					break;
+				}
+			}
+
+		out:
+			/* Send the init_func pid back to our parent. */
+			len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
+			if (len < 0) {
+				kill(child, SIGKILL);
+				bail("unable to generate JSON for child pid");
+			}
+			if (write(pipenum, buf, len) != len) {
+				kill(child, SIGKILL);
+				bail("unable to send child pid to bootstrapper");
+			}
+
+			exit(0);
 		}

-		/* Free netlink data. */
-		nl_free(&config);
+	/*
+	 * Stage 1: We're in the first child process. Our job is to join any
+	 *          provided namespaces in the netlink payload. If we've been
+	 *          asked to CLONE_NEWUSER, we will unshare the user namespace and
+	 *          ask our parent (stage 0) to set up our user mappings for us.
+	 *          Then, we unshare the rest of the requested namespaces and
+	 *          create a new child (stage 2: JUMP_INIT).  We then send the
+	 *          child's PID to our parent (stage 0).
+	 */
+	case JUMP_CHILD: {
+			pid_t child;
+			enum sync_t s;

-		/* Finish executing, let the Go runtime take over. */
-		return;
+			/* We're in a child and thus need to tell the parent if we die. */
+			syncfd = syncpipe[0];
+
+			/* For debugging. */
+			prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
+
+			/*
+			 * We need to setns first. We cannot do this earlier (in stage 0)
+			 * because of the fact that we forked to get here (the PID of
+			 * [stage 2: JUMP_INIT] would be meaningless). We could send it
+			 * using cmsg(3) but that's just annoying.
+			 */
+			if (config.namespaces)
+				join_namespaces(config.namespaces);
+
+			/*
+			 * Unshare all of the namespaces. Now, it should be noted that this
+			 * ordering might break in the future (especially with rootless
+			 * containers). But for now, it's not possible to split this into
+			 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
+			 *
+			 * We also can't be sure if the current kernel supports
+			 * clone(CLONE_PARENT | CLONE_NEWPID), so we'll just do it the long
+			 * way anyway.
+			 */
+			if (unshare(config.cloneflags) < 0)
+				bail("failed to unshare namespaces");
+
+			/*
+			 * Deal with user namespaces first. They are quite special, as they
+			 * affect our ability to unshare other namespaces and are used as
+			 * context for privilege checks.
+			 */
+			if (config.cloneflags & CLONE_NEWUSER) {
+				/*
+				 * We don't have the privileges to do any mapping here (see the
+				 * clone_parent rant). So signal our parent to hook us up.
+				 */
+
+				s = SYNC_USERMAP_PLS;
+				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
+					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
+
+				/* ... wait for mapping ... */
+
+				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
+				if (s != SYNC_USERMAP_ACK)
+					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
+			}
+
+			/* TODO: What about non-namespace clone flags that we're dropping here? */
+			child = clone_parent(&env, JUMP_INIT);
+			if (child < 0)
+				bail("unable to fork: init_func");
+
+			/* Send the child's PID to our parent, which knows what it's doing. */
+			s = SYNC_RECVPID_PLS;
+			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+				kill(child, SIGKILL);
+				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
+			}
+			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
+				kill(child, SIGKILL);
+				bail("failed to sync with parent: write(childpid)");
+			}
+
+			/* ... wait for parent to get the pid ... */
+
+			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
+				kill(child, SIGKILL);
+				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
+			}
+			if (s != SYNC_RECVPID_ACK) {
+				kill(child, SIGKILL);
+				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
+			}
+
+			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
+			exit(0);
+		}
+
+	/*
+	 * Stage 2: We're the final child process, and the only process that will
+	 *          actually return to the Go runtime. Our job is to just do the
+	 *          final cleanup steps and then return to the Go runtime to allow
+	 *          init_linux.go to run.
+	 */
+	case JUMP_INIT: {
+			/*
+			 * We're inside the final child now, having jumped here from
+			 * clone_parent() after being forked by stage 1 (JUMP_CHILD).
+			 */
+			int consolefd = config.consolefd;
+
+			/* We're in a child and thus need to tell the parent if we die. */
+			syncfd = syncpipe[0];
+
+			/* For debugging. */
+			prctl(PR_SET_NAME, (unsigned long) "runc:[1:INIT]", 0, 0, 0);
+
+			if (setsid() < 0)
+				bail("setsid failed");
+
+			if (setuid(0) < 0)
+				bail("setuid failed");
+
+			if (setgid(0) < 0)
+				bail("setgid failed");
+
+			if (setgroups(0, NULL) < 0)
+				bail("setgroups failed");
+
+			if (consolefd != -1) {
+				if (ioctl(consolefd, TIOCSCTTY, 0) < 0)
+					bail("ioctl TIOCSCTTY failed");
+				if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO)
+					bail("failed to dup stdin");
+				if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO)
+					bail("failed to dup stdout");
+				if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO)
+					bail("failed to dup stderr");
+			}
+
+			/* Close sync pipes. */
+			close(syncpipe[0]);
+			close(syncpipe[1]);
+
+			/* Free netlink data. */
+			nl_free(&config);
+
+			/* Finish executing, let the Go runtime take over. */
+			return;
+		}
+	default:
+		bail("unexpected jump value");
+		break;
 	}

-	/* Run the parent code. */
-	start_child(pipenum, &env, syncpipe, &config);
-
 	/* Should never be reached. */
 	bail("should never be reached");
 }