diff --git a/libcontainer/configs/namespaces_unix.go b/libcontainer/configs/namespaces_unix.go index b9c820d0..8beba9d3 100644 --- a/libcontainer/configs/namespaces_unix.go +++ b/libcontainer/configs/namespaces_unix.go @@ -22,8 +22,8 @@ var ( supportedNamespaces = make(map[NamespaceType]bool) ) -// nsToFile converts the namespace type to its filename -func nsToFile(ns NamespaceType) string { +// NsName converts the namespace type to its filename +func NsName(ns NamespaceType) string { switch ns { case NEWNET: return "net" @@ -50,7 +50,7 @@ func IsNamespaceSupported(ns NamespaceType) bool { if ok { return supported } - nsFile := nsToFile(ns) + nsFile := NsName(ns) // if the namespace type is unknown, just return false if nsFile == "" { return false @@ -84,7 +84,7 @@ func (n *Namespace) GetPath(pid int) string { if n.Path != "" { return n.Path } - return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type)) + return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type)) } func (n *Namespaces) Remove(t NamespaceType) bool { diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 29c8b343..4ba2735d 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -1223,16 +1223,22 @@ func (c *linuxContainer) currentState() (*State, error) { // can setns in order. func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { paths := []string{} - nsTypes := []configs.NamespaceType{ + order := []configs.NamespaceType{ + // The user namespace *must* be done first. + configs.NEWUSER, configs.NEWIPC, configs.NEWUTS, configs.NEWNET, configs.NEWPID, configs.NEWNS, } - // join userns if the init process explicitly requires NEWUSER - if c.config.Namespaces.Contains(configs.NEWUSER) { - nsTypes = append(nsTypes, configs.NEWUSER) + + // Remove namespaces that we don't need to join. 
+ var nsTypes []configs.NamespaceType + for _, ns := range order { + if c.config.Namespaces.Contains(ns) { + nsTypes = append(nsTypes, ns) + } } for _, nsType := range nsTypes { if p, ok := namespaces[nsType]; ok && p != "" { @@ -1249,7 +1255,7 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp if strings.ContainsRune(p, ',') { return nil, newSystemError(fmt.Errorf("invalid path %s", p)) } - paths = append(paths, p) + paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(nsType), p)) } } return paths, nil diff --git a/libcontainer/nsenter/namespace.h b/libcontainer/nsenter/namespace.h new file mode 100644 index 00000000..9e9bdca0 --- /dev/null +++ b/libcontainer/nsenter/namespace.h @@ -0,0 +1,32 @@ +#ifndef NSENTER_NAMESPACE_H +#define NSENTER_NAMESPACE_H + +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif +#include + +/* All of these are taken from include/uapi/linux/sched.h */ +#ifndef CLONE_NEWNS +# define CLONE_NEWNS 0x00020000 /* New mount namespace group */ +#endif +#ifndef CLONE_NEWCGROUP +# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ +#endif +#ifndef CLONE_NEWUTS +# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#endif +#ifndef CLONE_NEWIPC +# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ +#endif +#ifndef CLONE_NEWUSER +# define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#endif +#ifndef CLONE_NEWPID +# define CLONE_NEWPID 0x20000000 /* New pid namespace */ +#endif +#ifndef CLONE_NEWNET +# define CLONE_NEWNET 0x40000000 /* New network namespace */ +#endif + +#endif /* NSENTER_NAMESPACE_H */ diff --git a/libcontainer/nsenter/nsenter_test.go b/libcontainer/nsenter/nsenter_test.go index 598e80b5..98b026f7 100644 --- a/libcontainer/nsenter/nsenter_test.go +++ b/libcontainer/nsenter/nsenter_test.go @@ -29,7 +29,7 @@ func TestNsenterValidPaths(t *testing.T) { namespaces := []string{ // join pid ns of the current process - fmt.Sprintf("/proc/%d/ns/pid", os.Getpid()), + 
fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()), } cmd := &exec.Cmd{ Path: os.Args[0], @@ -87,7 +87,47 @@ func TestNsenterInvalidPaths(t *testing.T) { namespaces := []string{ // join pid ns of the current process - fmt.Sprintf("/proc/%d/ns/pid", -1), + fmt.Sprintf("pid:/proc/%d/ns/pid", -1), + } + cmd := &exec.Cmd{ + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + } + + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // write cloneFlags + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: uint32(syscall.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { + t.Fatal(err) + } + + if err := cmd.Wait(); err == nil { + t.Fatalf("nsenter exits with a zero exit status") + } +} + +func TestNsenterIncorrectPathType(t *testing.T) { + args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed to create pipe %v", err) + } + + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("net:/proc/%d/ns/pid", os.Getpid()), } cmd := &exec.Cmd{ Path: os.Args[0], diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index b93f827b..93265c26 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -23,27 +24,51 @@ #include #include -#define SYNC_VAL 0x42 -#define JUMP_VAL 0x43 +/* Get all of the CLONE_NEW* flags. */ +#include "namespace.h" + +/* Synchronisation values. */ +enum sync_t { + SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ + SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ + SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. 
*/ + SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ + + /* XXX: This doesn't help with segfaults and other such issues. */ + SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ +}; + +/* longjmp() arguments. */ +#define JUMP_PARENT 0x00 +#define JUMP_CHILD 0xA0 +#define JUMP_INIT 0xA1 + +/* JSON buffer. */ +#define JSON_MAX 4096 /* Assume the stack grows down, so arguments should be above it. */ -struct clone_arg { +struct clone_t { /* * Reserve some space for clone() to locate arguments * and retcode in this place */ char stack[4096] __attribute__ ((aligned(16))); char stack_ptr[0]; + + /* There's two children. This is used to execute the different code. */ jmp_buf *env; + int jmpval; }; struct nlconfig_t { char *data; uint32_t cloneflags; char *uidmap; - int uidmap_len; + size_t uidmap_len; char *gidmap; - int gidmap_len; + size_t gidmap_len; + char *namespaces; + size_t namespaces_len; uint8_t is_setgroup; int consolefd; }; @@ -81,80 +106,24 @@ int setns(int fd, int nstype) } #endif +/* XXX: This is ugly. */ +static int syncfd = -1; + /* TODO(cyphar): Fix this so it correctly deals with syncT. */ -#define bail(fmt, ...) \ - do { \ - fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \ - exit(__COUNTER__ + 1); \ +#define bail(fmt, ...) 
\ + do { \ + int ret = __COUNTER__ + 1; \ + fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \ + if (syncfd >= 0) { \ + enum sync_t s = SYNC_ERR; \ + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \ + fprintf(stderr, "nsenter: failed: write(s)"); \ + if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \ + fprintf(stderr, "nsenter: failed: write(ret)"); \ + } \ + exit(ret); \ } while(0) -static int child_func(void *arg) -{ - struct clone_arg *ca = (struct clone_arg *)arg; - longjmp(*ca->env, JUMP_VAL); -} - -static int clone_parent(jmp_buf *env, int flags) __attribute__ ((noinline)); -static int clone_parent(jmp_buf *env, int flags) -{ - int child; - struct clone_arg ca = { - .env = env, - }; - - child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, &ca); - - /* - * On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so we have - * to unshare(2) before clone(2) in order to do this. This was fixed in - * upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was - * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. - * - * As far as we're aware, the last mainline kernel which had this bug was - * Linux 3.12. However, we cannot comment on which kernels the broken patch - * was backported to. - */ - if (errno == EINVAL) { - if (unshare(flags) < 0) - bail("unable to unshare namespaces"); - child = clone(child_func, ca.stack_ptr, SIGCHLD | CLONE_PARENT, &ca); - } - - return child; -} - -/* - * Gets the init pipe fd from the environment, which is used to read the - * bootstrap data and tell the parent what the new pid is after we finish - * setting up the environment. 
- */ -static int initpipe(void) -{ - int pipenum; - char *initpipe, *endptr; - - initpipe = getenv("_LIBCONTAINER_INITPIPE"); - if (initpipe == NULL || *initpipe == '\0') - return -1; - - errno = 0; - pipenum = strtol(initpipe, &endptr, 10); - if (errno != 0 || *endptr != '\0') - bail("unable to parse _LIBCONTAINER_INITPIPE"); - - return pipenum; -} - -static uint32_t readint32(char *buf) -{ - return *(uint32_t *) buf; -} - -static uint8_t readint8(char *buf) -{ - return *(uint8_t *) buf; -} - static int write_file(char *data, size_t data_len, char *pathfmt, ...) { int fd, len, ret = 0; @@ -184,18 +153,28 @@ out: return ret; } -#define SETGROUPS_ALLOW "allow" -#define SETGROUPS_DENY "deny" +enum policy_t { + SETGROUPS_DEFAULT = 0, + SETGROUPS_ALLOW, + SETGROUPS_DENY, +}; /* This *must* be called before we touch gid_map. */ -static void update_setgroups(int pid, bool setgroup) +static void update_setgroups(int pid, enum policy_t setgroup) { char *policy; - if (setgroup) - policy = SETGROUPS_ALLOW; - else - policy = SETGROUPS_DENY; + switch (setgroup) { + case SETGROUPS_ALLOW: + policy = "allow"; + break; + case SETGROUPS_DENY: + policy = "deny"; + break; + case SETGROUPS_DEFAULT: + /* Nothing to do. */ + return; + } if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { /* @@ -225,44 +204,76 @@ static void update_gidmap(int pid, char *map, int map_len) bail("failed to update /proc/%d/gid_map", pid); } -#define JSON_MAX 4096 - -static void start_child(int pipenum, jmp_buf *env, int syncpipe[2], struct nlconfig_t *config) +/* A dummy function that just jumps to the given jumpval. 
*/ +static int child_func(void *arg) __attribute__ ((noinline)); +static int child_func(void *arg) { - int len, childpid; - char buf[JSON_MAX]; - uint8_t syncval; + struct clone_t *ca = (struct clone_t *)arg; + longjmp(*ca->env, ca->jmpval); +} - /* - * We must fork to actually enter the PID namespace, and use - * CLONE_PARENT so that the child init can have the right parent - * (the bootstrap process). Also so we don't need to forward the - * child's exit code or resend its death signal. - */ - childpid = clone_parent(env, config->cloneflags); - if (childpid < 0) - bail("unable to fork"); +static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); +static int clone_parent(jmp_buf *env, int jmpval) +{ + struct clone_t ca = { + .env = env, + .jmpval = jmpval, + }; - /* Update setgroups, uid_map and gid_map for the process if provided. */ - if (config->is_setgroup) - update_setgroups(childpid, true); - update_uidmap(childpid, config->uidmap, config->uidmap_len); - update_gidmap(childpid, config->gidmap, config->gidmap_len); + return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); +} - /* Send the sync signal to the child. */ - close(syncpipe[0]); - syncval = SYNC_VAL; - if (write(syncpipe[1], &syncval, sizeof(syncval)) != sizeof(syncval)) - bail("failed to write sync byte to child"); +/* + * Gets the init pipe fd from the environment, which is used to read the + * bootstrap data and tell the parent what the new pid is after we finish + * setting up the environment. 
+ */ +static int initpipe(void) +{ + int pipenum; + char *initpipe, *endptr; - /* Send the child pid back to our parent */ - len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", childpid); - if (len < 0 || write(pipenum, buf, len) != len) { - kill(childpid, SIGKILL); - bail("unable to send a child pid"); - } + initpipe = getenv("_LIBCONTAINER_INITPIPE"); + if (initpipe == NULL || *initpipe == '\0') + return -1; - exit(0); + pipenum = strtol(initpipe, &endptr, 10); + if (*endptr != '\0') + bail("unable to parse _LIBCONTAINER_INITPIPE"); + + return pipenum; +} + +/* Returns the clone(2) flag for a namespace, given the name of a namespace. */ +static int nsflag(char *name) +{ + if (!strcmp(name, "cgroup")) + return CLONE_NEWCGROUP; + else if (!strcmp(name, "ipc")) + return CLONE_NEWIPC; + else if (!strcmp(name, "mnt")) + return CLONE_NEWNS; + else if (!strcmp(name, "net")) + return CLONE_NEWNET; + else if (!strcmp(name, "pid")) + return CLONE_NEWPID; + else if (!strcmp(name, "user")) + return CLONE_NEWUSER; + else if (!strcmp(name, "uts")) + return CLONE_NEWUTS; + + /* If we don't recognise a name, fallback to 0. */ + return 0; +} + +static uint32_t readint32(char *buf) +{ + return *(uint32_t *) buf; +} + +static uint8_t readint8(char *buf) +{ + return *(uint8_t *) buf; } static void nl_parse(int fd, struct nlconfig_t *config) @@ -309,66 +320,17 @@ static void nl_parse(int fd, struct nlconfig_t *config) break; case CONSOLE_PATH_ATTR: /* - * The context in which this is done (before or after we - * join the other namespaces) will affect how the path - * resolution of the console works. This order is not - * decided here, but rather in container_linux.go. We just - * follow the order given by the netlink message. + * We open the console here because we currently evaluate console + * paths from the *host* namespaces. 
*/ config->consolefd = open(current, O_RDWR); if (config->consolefd < 0) bail("failed to open console %s", current); break; - case NS_PATHS_ATTR:{ - /* - * Open each namespace path and setns it in the - * order provided to us. We currently don't have - * any context for what kind of namespace we're - * joining, so just blindly do it. - */ - char *saveptr = NULL; - char *ns = strtok_r(current, ",", &saveptr); - int *fds = NULL, num = 0, i; - char **paths = NULL; - - if (!ns || !strlen(current)) - bail("ns paths are empty"); - - /* - * We have to open the file descriptors first, since after - * we join the mnt namespace we might no longer be able to - * access the paths. - */ - do { - int fd; - - /* Resize fds. */ - num++; - fds = realloc(fds, num * sizeof(int)); - paths = realloc(paths, num * sizeof(char *)); - - fd = open(ns, O_RDONLY); - if (fd < 0) - bail("failed to open %s", ns); - - fds[num - 1] = fd; - paths[num - 1] = ns; - } while ((ns = strtok_r(NULL, ",", &saveptr)) != NULL); - - for (i = 0; i < num; i++) { - int fd = fds[i]; - char *path = paths[i]; - - if (setns(fd, 0) < 0) - bail("failed to setns to %s", path); - - close(fd); - } - - free(fds); - free(paths); - break; - } + case NS_PATHS_ATTR: + config->namespaces = current; + config->namespaces_len = payload_len; + break; case UIDMAP_ATTR: config->uidmap = current; config->uidmap_len = payload_len; @@ -393,6 +355,71 @@ void nl_free(struct nlconfig_t *config) free(config->data); } +void join_namespaces(char *nslist) +{ + int num = 0, i; + char *saveptr = NULL; + char *namespace = strtok_r(nslist, ",", &saveptr); + struct namespace_t { + int fd; + int ns; + char type[PATH_MAX]; + char path[PATH_MAX]; + } *namespaces = NULL; + + if (!namespace || !strlen(namespace) || !strlen(nslist)) + bail("ns paths are empty"); + + /* + * We have to open the file descriptors first, since after + * we join the mnt namespace we might no longer be able to + * access the paths. 
+ */ + do { + int fd; + char *path; + struct namespace_t *ns; + + /* Resize the namespace array. */ + namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); + if (!namespaces) + bail("failed to reallocate namespace array"); + ns = &namespaces[num - 1]; + + /* Split 'ns:path'. */ + path = strstr(namespace, ":"); + if (!path) + bail("failed to parse %s", namespace); + *path++ = '\0'; + + fd = open(path, O_RDONLY); + if (fd < 0) + bail("failed to open %s", namespace); + + ns->fd = fd; + ns->ns = nsflag(namespace); + strncpy(ns->path, path, PATH_MAX); + } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); + + /* + * The ordering in which we join namespaces is important. We should + * always join the user namespace *first*. This is all guaranteed + * from the container_linux.go side of this, so we're just going to + * follow the order given to us. + */ + + for (i = 0; i < num; i++) { + struct namespace_t ns = namespaces[i]; + + if (setns(ns.fd, ns.ns) < 0) + bail("failed to setns to %s", ns.path); + + close(ns.fd); + } + + free(namespaces); +} + void nsexec(void) { int pipenum; @@ -413,61 +440,311 @@ void nsexec(void) /* clone(2) flags are mandatory. */ if (config.cloneflags == -1) - bail("missing clone_flags"); + bail("missing cloneflags"); /* Pipe so we can tell the child when we've finished setting up. */ - if (pipe(syncpipe) < 0) + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0) bail("failed to setup sync pipe between parent and child"); - /* Set up the jump point. */ - if (setjmp(env) == JUMP_VAL) { - /* - * We're inside the child now, having jumped from the - * start_child() code after forking in the parent. - */ - uint8_t s = 0; - int consolefd = config.consolefd; + /* TODO: Currently we aren't dealing with child deaths properly. */ - /* Close the writing side of pipe. */ - close(syncpipe[1]); + /* + * Okay, so this is quite annoying. 
+ * + * In order to make sure that we deal with older kernels (when CLONE_NEWUSER + * wasn't guaranteed to be done first if you specify multiple namespaces in + * a clone(2) invocation) as well as with certain use cases like rootless + * containers, we cannot just dump all of the cloneflags into clone(2). + * However, if we unshare(2) the user namespace *before* we clone(2), then + * all hell breaks loose. + * + * The parent no longer has permissions to do many things (unshare(2) drops + * all capabilities in your old namespace), and the container cannot be set + * up to have more than one {uid,gid} mapping. This is obviously less than + * ideal. In order to fix this, we have to first clone(2) and then unshare. + * + * Unfortunately, it's not as simple as that. We have to fork to enter the + * PID namespace (the PID namespace only applies to children). Since we'll + * have to double-fork, this clone_parent() call won't be able to get the + * PID of the _actual_ init process (without doing more synchronisation than + * I can deal with at the moment). So we'll just get the parent to send it + * for us; the only job of this process is to update + * /proc/pid/{setgroups,uid_map,gid_map}. + * + * And as a result of the above, we also need to setns(2) in the first child + * because if we join a PID namespace in the topmost parent then our child + * will be in that namespace (and it will not be able to give us a PID value + * that makes sense without resorting to sending things with cmsg). + * + * This also deals with an older issue caused by dumping cloneflags into + * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so + * we have to unshare(2) before clone(2) in order to do this. This was fixed + * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was + * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're + * aware, the last mainline kernel which had this bug was Linux 3.12. 
+ * However, we cannot comment on which kernels the broken patch was + * backported to. + * + * -- Aleksa "what has my life come to?" Sarai + */ - /* Sync with parent. */ - if (read(syncpipe[0], &s, sizeof(s)) != sizeof(s) || s != SYNC_VAL) - bail("failed to read sync byte from parent"); + switch (setjmp(env)) { + /* + * Stage 0: We're in the parent. Our job is just to create a new child + * (stage 1: JUMP_CHILD) process and write its uid_map and + * gid_map. That process will go on to create a new process, then + * it will send us its PID which we will send to the bootstrap + * process. + */ + case JUMP_PARENT: { + int len; + pid_t child; + char buf[JSON_MAX]; - if (setsid() < 0) - bail("setsid failed"); + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0); - if (setuid(0) < 0) - bail("setuid failed"); + /* Start the process of getting a container. */ + child = clone_parent(&env, JUMP_CHILD); + if (child < 0) + bail("unable to fork: child_func"); - if (setgid(0) < 0) - bail("setgid failed"); + /* State machine for synchronisation with the children. */ + while (true) { + enum sync_t s; - if (setgroups(0, NULL) < 0) - bail("setgroups failed"); + /* This doesn't need to be global, we're in the parent. */ + int syncfd = syncpipe[1]; - if (consolefd != -1) { - if (ioctl(consolefd, TIOCSCTTY, 0) < 0) - bail("ioctl TIOCSCTTY failed"); - if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) - bail("failed to dup stdin"); - if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) - bail("failed to dup stdout"); - if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO) - bail("failed to dup stderr"); + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with child: next state"); + + switch (s) { + case SYNC_ERR: { + /* We have to mirror the error code of the child. 
*/ + int ret; + + if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) + bail("failed to sync with child: read(error code)"); + + exit(ret); + } + break; + case SYNC_USERMAP_PLS: + /* Enable setgroups(2) if we've been asked to. */ + if (config.is_setgroup) + update_setgroups(child, SETGROUPS_ALLOW); + + /* Set up mappings. */ + update_uidmap(child, config.uidmap, config.uidmap_len); + update_gidmap(child, config.gidmap, config.gidmap_len); + + s = SYNC_USERMAP_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); + } + break; + case SYNC_USERMAP_ACK: + /* We should _never_ receive acks. */ + kill(child, SIGKILL); + bail("failed to sync with child: unexpected SYNC_USERMAP_ACK"); + break; + case SYNC_RECVPID_PLS: { + pid_t old = child; + + /* Get the init_func pid. */ + if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(old, SIGKILL); + bail("failed to sync with child: read(childpid)"); + } + + /* Send ACK. */ + s = SYNC_RECVPID_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(old, SIGKILL); + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); + } + } + + /* Leave the loop. */ + goto out; + case SYNC_RECVPID_ACK: + /* We should _never_ receive acks. */ + kill(child, SIGKILL); + bail("failed to sync with child: unexpected SYNC_RECVPID_ACK"); + break; + } + } + + out: + /* Send the init_func pid back to our parent. */ + len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child); + if (len < 0) { + kill(child, SIGKILL); + bail("unable to generate JSON for child pid"); + } + if (write(pipenum, buf, len) != len) { + kill(child, SIGKILL); + bail("unable to send child pid to bootstrapper"); + } + + exit(0); } - /* Free netlink data. */ - nl_free(&config); + /* + * Stage 1: We're in the first child process. Our job is to join any + * provided user namespaces in the netlink payload. 
If we've been + * asked to CLONE_NEWUSER, we will unshare the user namespace and + * ask our parent (stage 0) to set up our user mappings for us. + * Then, we unshare the rest of the requested namespaces and + * create a new child (stage 2: JUMP_INIT). We then send the + * child's PID to our parent (stage 0). + */ + case JUMP_CHILD: { + pid_t child; + enum sync_t s; - /* Finish executing, let the Go runtime take over. */ - return; + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = syncpipe[0]; + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0); + + /* + * We need to setns first. We cannot do this earlier (in stage 0) + * because of the fact that we forked to get here (the PID of + * [stage 2: JUMP_INIT]) would be meaningless). We could send it + * using cmsg(3) but that's just annoying. + */ + if (config.namespaces) + join_namespaces(config.namespaces); + + /* + * Unshare all of the namespaces. Now, it should be noted that this + * ordering might break in the future (especially with rootless + * containers). But for now, it's not possible to split this into + * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. + * + * We also can't be sure if the current kernel supports + * clone(CLONE_PARENT | CLONE_NEWPID), so we'll just do it the long + * way anyway. + */ + if (unshare(config.cloneflags) < 0) + bail("failed to unshare namespaces"); + + /* + * Deal with user namespaces first. They are quite special, as they + * affect our ability to unshare other namespaces and are used as + * context for privilege checks. + */ + if (config.cloneflags & CLONE_NEWUSER) { + /* + * We don't have the privileges to do any mapping here (see the + * clone_parent rant). So signal our parent to hook us up. + */ + + s = SYNC_USERMAP_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); + + /* ... wait for mapping ... 
*/ + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); + if (s != SYNC_USERMAP_ACK) + bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); + } + + /* TODO: What about non-namespace clone flags that we're dropping here? */ + child = clone_parent(&env, JUMP_INIT); + if (child < 0) + bail("unable to fork: init_func"); + + /* Send the child to our parent, which knows what it's doing. */ + s = SYNC_RECVPID_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); + } + if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(childpid)"); + } + + /* ... wait for parent to get the pid ... */ + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); + } + if (s != SYNC_RECVPID_ACK) { + kill(child, SIGKILL); + bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); + } + + /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ + exit(0); + } + + /* + * Stage 2: We're the final child process, and the only process that will + * actually return to the Go runtime. Our job is to just do the + * final cleanup steps and then return to the Go runtime to allow + * init_linux.go to run. + */ + case JUMP_INIT: { + /* + * We're inside the final child now, having jumped here from + * clone_parent() after the stage 1 child forked us. + */ + int consolefd = config.consolefd; + + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = syncpipe[0]; + + /* For debugging. 
*/ + prctl(PR_SET_NAME, (unsigned long) "runc:[1:INIT]", 0, 0, 0); + + if (setsid() < 0) + bail("setsid failed"); + + if (setuid(0) < 0) + bail("setuid failed"); + + if (setgid(0) < 0) + bail("setgid failed"); + + if (setgroups(0, NULL) < 0) + bail("setgroups failed"); + + if (consolefd != -1) { + if (ioctl(consolefd, TIOCSCTTY, 0) < 0) + bail("ioctl TIOCSCTTY failed"); + if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) + bail("failed to dup stdin"); + if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) + bail("failed to dup stdout"); + if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO) + bail("failed to dup stderr"); + } + + /* Close sync pipes. */ + close(syncpipe[0]); + close(syncpipe[1]); + + /* Free netlink data. */ + nl_free(&config); + + /* Finish executing, let the Go runtime take over. */ + return; + } + default: + bail("unexpected jump value"); + break; } - /* Run the parent code. */ - start_child(pipenum, &env, syncpipe, &config); - /* Should never be reached. */ bail("should never be reached"); }