|
|
|
@ -11,6 +11,7 @@
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <stdbool.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include <unistd.h>
|
|
|
|
|
|
|
|
|
@ -23,27 +24,51 @@
|
|
|
|
|
#include <linux/netlink.h>
|
|
|
|
|
#include <linux/types.h>
|
|
|
|
|
|
|
|
|
|
#define SYNC_VAL 0x42
|
|
|
|
|
#define JUMP_VAL 0x43
|
|
|
|
|
/* Get all of the CLONE_NEW* flags. */
|
|
|
|
|
#include "namespace.h"
|
|
|
|
|
|
|
|
|
|
/*
 * Synchronisation values.
 *
 * Each value is written to / read from the sync fd as a whole `enum sync_t`
 * object. The *_PLS values are requests written by a child process; the
 * matching *_ACK values are the replies written by the parent. SYNC_ERR is
 * emitted by bail() and is followed on the wire by the int exit code.
 */
|
|
|
|
|
enum sync_t {
|
|
|
|
|
SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
|
|
|
|
|
SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
|
|
|
|
|
SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
|
|
|
|
|
SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
|
|
|
|
|
|
|
|
|
|
|
/* XXX: This doesn't help with segfaults and other such issues. */
|
|
|
|
|
SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* longjmp() arguments. */
|
|
|
|
|
#define JUMP_PARENT 0x00
|
|
|
|
|
#define JUMP_CHILD 0xA0
|
|
|
|
|
#define JUMP_INIT 0xA1
|
|
|
|
|
|
|
|
|
|
/* JSON buffer. */
|
|
|
|
|
#define JSON_MAX 4096
|
|
|
|
|
|
|
|
|
|
/*
 * Argument blocks handed to clone(2).
 *
 * Both layouts assume the stack grows down: `stack` is the scratch area the
 * cloned child runs on and `stack_ptr` (a zero-length marker placed directly
 * after it) is the address passed to clone(2) as the top of that stack. The
 * remaining members sit above the stack and carry the arguments.
 *
 * The two tags were fused into a single unterminated declaration; they are
 * declared separately here because both are referenced by code in this file.
 */

/* Used by the clone_parent()/child_func() pair that always jumps with JUMP_VAL. */
struct clone_arg {
	/*
	 * Reserve some space for clone() to locate arguments
	 * and retcode in this place
	 */
	char stack[4096] __attribute__ ((aligned(16)));
	char stack_ptr[0];

	/* Jump buffer the child longjmp()s back into. */
	jmp_buf *env;
};

/* Used by the clone_parent()/child_func() pair that takes an explicit jump value. */
struct clone_t {
	/*
	 * Reserve some space for clone() to locate arguments
	 * and retcode in this place
	 */
	char stack[4096] __attribute__ ((aligned(16)));
	char stack_ptr[0];

	/* There are two children. This is used to execute the different code. */
	jmp_buf *env;
	int jmpval;
};
|
|
|
|
|
|
|
|
|
|
/*
 * Bootstrap configuration filled in by nl_parse() from the netlink message.
 *
 * The string members point at payload data rather than being separate
 * allocations; nl_free() releases `data` only.
 *
 * The length members were declared twice (once as int, once as size_t);
 * only the size_t declarations are kept, matching namespaces_len.
 */
struct nlconfig_t {
	char *data;		/* Buffer released by nl_free(). */
	uint32_t cloneflags;	/* Flags later handed to unshare(2). */
	char *uidmap;		/* Contents passed to update_uidmap(). */
	size_t uidmap_len;
	char *gidmap;		/* Contents passed to update_gidmap(). */
	size_t gidmap_len;
	char *namespaces;	/* Comma-separated list passed to join_namespaces(). */
	size_t namespaces_len;
	uint8_t is_setgroup;	/* Non-zero: the parent enables setgroups for the child. */
	int consolefd;		/* Console fd opened from the console path attribute. */
};
|
|
|
|
@ -81,80 +106,24 @@ int setns(int fd, int nstype)
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
/* XXX: This is ugly. */
|
|
|
|
|
static int syncfd = -1;
|
|
|
|
|
|
|
|
|
|
/* TODO(cyphar): Fix this so it correctly deals with syncT. */
|
|
|
|
|
/*
 * Print "nsenter: <message>: <strerror(errno)>" to stderr and terminate.
 *
 * Every expansion gets its own exit status (__COUNTER__ + 1), so the status
 * identifies which bail() site fired. When `syncfd` (whatever variable of
 * that name is in scope at the call site) is non-negative, SYNC_ERR followed
 * by the exit status is written to it first so the peer process can mirror
 * the failure; a failed write is reported but does not stop the exit.
 *
 * The temporaries carry a bail_ prefix so they cannot capture identifiers
 * used in the caller's format arguments.
 */
#define bail(fmt, ...) \
	do { \
		int bail_ret_ = __COUNTER__ + 1; \
		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
		if (syncfd >= 0) { \
			enum sync_t bail_sync_ = SYNC_ERR; \
			if (write(syncfd, &bail_sync_, sizeof(bail_sync_)) != sizeof(bail_sync_)) \
				fprintf(stderr, "nsenter: failed: write(s)\n"); \
			if (write(syncfd, &bail_ret_, sizeof(bail_ret_)) != sizeof(bail_ret_)) \
				fprintf(stderr, "nsenter: failed: write(ret)\n"); \
		} \
		exit(bail_ret_); \
	} while(0)
|
|
|
|
|
|
|
|
|
|
static int child_func(void *arg)
|
|
|
|
|
{
|
|
|
|
|
struct clone_arg *ca = (struct clone_arg *)arg;
|
|
|
|
|
longjmp(*ca->env, JUMP_VAL);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int clone_parent(jmp_buf *env, int flags) __attribute__ ((noinline));
|
|
|
|
|
static int clone_parent(jmp_buf *env, int flags)
|
|
|
|
|
{
|
|
|
|
|
int child;
|
|
|
|
|
struct clone_arg ca = {
|
|
|
|
|
.env = env,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, &ca);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so we have
|
|
|
|
|
* to unshare(2) before clone(2) in order to do this. This was fixed in
|
|
|
|
|
* upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
|
|
|
|
|
* introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e.
|
|
|
|
|
*
|
|
|
|
|
* As far as we're aware, the last mainline kernel which had this bug was
|
|
|
|
|
* Linux 3.12. However, we cannot comment on which kernels the broken patch
|
|
|
|
|
* was backported to.
|
|
|
|
|
*/
|
|
|
|
|
if (errno == EINVAL) {
|
|
|
|
|
if (unshare(flags) < 0)
|
|
|
|
|
bail("unable to unshare namespaces");
|
|
|
|
|
child = clone(child_func, ca.stack_ptr, SIGCHLD | CLONE_PARENT, &ca);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return child;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Gets the init pipe fd from the environment, which is used to read the
 * bootstrap data and tell the parent what the new pid is after we finish
 * setting up the environment.
 *
 * Returns -1 when _LIBCONTAINER_INITPIPE is unset or empty. Bails out when
 * the value is not a plain decimal number, or when it is not representable
 * as a non-negative int (previously such values were silently truncated, and
 * a negative value was indistinguishable from "not set").
 */
static int initpipe(void)
{
	long parsed;
	char *value, *endptr;

	value = getenv("_LIBCONTAINER_INITPIPE");
	if (value == NULL || *value == '\0')
		return -1;

	errno = 0;
	parsed = strtol(value, &endptr, 10);
	/* The round-trip through int rejects anything that would be truncated. */
	if (errno == 0 && (parsed < 0 || parsed != (long)(int)parsed))
		errno = ERANGE;
	if (errno != 0 || *endptr != '\0')
		bail("unable to parse _LIBCONTAINER_INITPIPE");

	return (int)parsed;
}
|
|
|
|
|
|
|
|
|
|
/*
 * Read a native-endian 32-bit unsigned integer from the first four bytes at
 * buf.
 *
 * The bytes are copied out with memcpy() instead of dereferencing a cast
 * pointer: buf is a byte pointer that need not be 4-byte aligned, and
 * accessing char storage through a uint32_t lvalue violates the aliasing
 * rules.
 */
static uint32_t readint32(char *buf)
{
	uint32_t value;

	memcpy(&value, buf, sizeof(value));
	return value;
}
|
|
|
|
|
|
|
|
|
|
/* Return the byte at buf, reinterpreted as an unsigned 8-bit value. */
static uint8_t readint8(char *buf)
{
	const uint8_t *bytes = (const uint8_t *)buf;

	return bytes[0];
}
|
|
|
|
|
|
|
|
|
|
static int write_file(char *data, size_t data_len, char *pathfmt, ...)
|
|
|
|
|
{
|
|
|
|
|
int fd, len, ret = 0;
|
|
|
|
@ -184,18 +153,28 @@ out:
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#define SETGROUPS_ALLOW "allow"
|
|
|
|
|
#define SETGROUPS_DENY "deny"
|
|
|
|
|
/*
 * setgroups policy selector taken by update_setgroups(), which translates it
 * into the string written to /proc/<pid>/setgroups.
 */
enum policy_t {
|
|
|
|
|
SETGROUPS_DEFAULT = 0, /* Nothing is written. */
|
|
|
|
|
SETGROUPS_ALLOW, /* Writes "allow". */
|
|
|
|
|
SETGROUPS_DENY, /* Writes "deny". */
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* This *must* be called before we touch gid_map. */
|
|
|
|
|
static void update_setgroups(int pid, bool setgroup)
|
|
|
|
|
static void update_setgroups(int pid, enum policy_t setgroup)
|
|
|
|
|
{
|
|
|
|
|
char *policy;
|
|
|
|
|
|
|
|
|
|
if (setgroup)
|
|
|
|
|
policy = SETGROUPS_ALLOW;
|
|
|
|
|
else
|
|
|
|
|
policy = SETGROUPS_DENY;
|
|
|
|
|
switch (setgroup) {
|
|
|
|
|
case SETGROUPS_ALLOW:
|
|
|
|
|
policy = "allow";
|
|
|
|
|
break;
|
|
|
|
|
case SETGROUPS_DENY:
|
|
|
|
|
policy = "deny";
|
|
|
|
|
break;
|
|
|
|
|
case SETGROUPS_DEFAULT:
|
|
|
|
|
/* Nothing to do. */
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
|
|
|
|
|
/*
|
|
|
|
@ -225,44 +204,76 @@ static void update_gidmap(int pid, char *map, int map_len)
|
|
|
|
|
bail("failed to update /proc/%d/gid_map", pid);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#define JSON_MAX 4096
|
|
|
|
|
|
|
|
|
|
static void start_child(int pipenum, jmp_buf *env, int syncpipe[2], struct nlconfig_t *config)
|
|
|
|
|
/* A dummy function that just jumps to the given jumpval. */
|
|
|
|
|
static int child_func(void *arg) __attribute__ ((noinline));
|
|
|
|
|
static int child_func(void *arg)
|
|
|
|
|
{
|
|
|
|
|
int len, childpid;
|
|
|
|
|
char buf[JSON_MAX];
|
|
|
|
|
uint8_t syncval;
|
|
|
|
|
struct clone_t *ca = (struct clone_t *)arg;
|
|
|
|
|
longjmp(*ca->env, ca->jmpval);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We must fork to actually enter the PID namespace, and use
|
|
|
|
|
* CLONE_PARENT so that the child init can have the right parent
|
|
|
|
|
* (the bootstrap process). Also so we don't need to forward the
|
|
|
|
|
* child's exit code or resend its death signal.
|
|
|
|
|
*/
|
|
|
|
|
childpid = clone_parent(env, config->cloneflags);
|
|
|
|
|
if (childpid < 0)
|
|
|
|
|
bail("unable to fork");
|
|
|
|
|
static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
|
|
|
|
|
static int clone_parent(jmp_buf *env, int jmpval)
|
|
|
|
|
{
|
|
|
|
|
struct clone_t ca = {
|
|
|
|
|
.env = env,
|
|
|
|
|
.jmpval = jmpval,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/* Update setgroups, uid_map and gid_map for the process if provided. */
|
|
|
|
|
if (config->is_setgroup)
|
|
|
|
|
update_setgroups(childpid, true);
|
|
|
|
|
update_uidmap(childpid, config->uidmap, config->uidmap_len);
|
|
|
|
|
update_gidmap(childpid, config->gidmap, config->gidmap_len);
|
|
|
|
|
return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Send the sync signal to the child. */
|
|
|
|
|
close(syncpipe[0]);
|
|
|
|
|
syncval = SYNC_VAL;
|
|
|
|
|
if (write(syncpipe[1], &syncval, sizeof(syncval)) != sizeof(syncval))
|
|
|
|
|
bail("failed to write sync byte to child");
|
|
|
|
|
/*
|
|
|
|
|
* Gets the init pipe fd from the environment, which is used to read the
|
|
|
|
|
* bootstrap data and tell the parent what the new pid is after we finish
|
|
|
|
|
* setting up the environment.
|
|
|
|
|
*/
|
|
|
|
|
static int initpipe(void)
|
|
|
|
|
{
|
|
|
|
|
int pipenum;
|
|
|
|
|
char *initpipe, *endptr;
|
|
|
|
|
|
|
|
|
|
/* Send the child pid back to our parent */
|
|
|
|
|
len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", childpid);
|
|
|
|
|
if (len < 0 || write(pipenum, buf, len) != len) {
|
|
|
|
|
kill(childpid, SIGKILL);
|
|
|
|
|
bail("unable to send a child pid");
|
|
|
|
|
}
|
|
|
|
|
initpipe = getenv("_LIBCONTAINER_INITPIPE");
|
|
|
|
|
if (initpipe == NULL || *initpipe == '\0')
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
|
|
exit(0);
|
|
|
|
|
pipenum = strtol(initpipe, &endptr, 10);
|
|
|
|
|
if (*endptr != '\0')
|
|
|
|
|
bail("unable to parse _LIBCONTAINER_INITPIPE");
|
|
|
|
|
|
|
|
|
|
return pipenum;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Translate a namespace name ("cgroup", "ipc", "mnt", "net", "pid", "user",
 * "uts") into its CLONE_NEW* flag. Names that are not recognised map to 0.
 */
static int nsflag(char *name)
{
	static const struct {
		const char *label;
		int flag;
	} known[] = {
		{ "cgroup", CLONE_NEWCGROUP },
		{ "ipc",    CLONE_NEWIPC },
		{ "mnt",    CLONE_NEWNS },
		{ "net",    CLONE_NEWNET },
		{ "pid",    CLONE_NEWPID },
		{ "user",   CLONE_NEWUSER },
		{ "uts",    CLONE_NEWUTS },
	};
	size_t idx;

	for (idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++) {
		if (strcmp(name, known[idx].label) == 0)
			return known[idx].flag;
	}

	/* Unknown name. */
	return 0;
}
|
|
|
|
|
|
|
|
|
|
/*
 * Read a native-endian 32-bit unsigned integer from the first four bytes at
 * buf.
 *
 * The bytes are copied out with memcpy() instead of dereferencing a cast
 * pointer: buf is a byte pointer that need not be 4-byte aligned, and
 * accessing char storage through a uint32_t lvalue violates the aliasing
 * rules.
 */
static uint32_t readint32(char *buf)
{
	uint32_t value;

	memcpy(&value, buf, sizeof(value));
	return value;
}
|
|
|
|
|
|
|
|
|
|
/* Return the byte at buf, reinterpreted as an unsigned 8-bit value. */
static uint8_t readint8(char *buf)
{
	const uint8_t *bytes = (const uint8_t *)buf;

	return bytes[0];
}
|
|
|
|
|
|
|
|
|
|
static void nl_parse(int fd, struct nlconfig_t *config)
|
|
|
|
@ -309,66 +320,17 @@ static void nl_parse(int fd, struct nlconfig_t *config)
|
|
|
|
|
break;
|
|
|
|
|
case CONSOLE_PATH_ATTR:
|
|
|
|
|
/*
|
|
|
|
|
* The context in which this is done (before or after we
|
|
|
|
|
* join the other namespaces) will affect how the path
|
|
|
|
|
* resolution of the console works. This order is not
|
|
|
|
|
* decided here, but rather in container_linux.go. We just
|
|
|
|
|
* follow the order given by the netlink message.
|
|
|
|
|
* We open the console here because we currently evaluate console
|
|
|
|
|
* paths from the *host* namespaces.
|
|
|
|
|
*/
|
|
|
|
|
config->consolefd = open(current, O_RDWR);
|
|
|
|
|
if (config->consolefd < 0)
|
|
|
|
|
bail("failed to open console %s", current);
|
|
|
|
|
break;
|
|
|
|
|
case NS_PATHS_ATTR:{
|
|
|
|
|
/*
|
|
|
|
|
* Open each namespace path and setns it in the
|
|
|
|
|
* order provided to us. We currently don't have
|
|
|
|
|
* any context for what kind of namespace we're
|
|
|
|
|
* joining, so just blindly do it.
|
|
|
|
|
*/
|
|
|
|
|
char *saveptr = NULL;
|
|
|
|
|
char *ns = strtok_r(current, ",", &saveptr);
|
|
|
|
|
int *fds = NULL, num = 0, i;
|
|
|
|
|
char **paths = NULL;
|
|
|
|
|
|
|
|
|
|
if (!ns || !strlen(current))
|
|
|
|
|
bail("ns paths are empty");
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We have to open the file descriptors first, since after
|
|
|
|
|
* we join the mnt namespace we might no longer be able to
|
|
|
|
|
* access the paths.
|
|
|
|
|
*/
|
|
|
|
|
do {
|
|
|
|
|
int fd;
|
|
|
|
|
|
|
|
|
|
/* Resize fds. */
|
|
|
|
|
num++;
|
|
|
|
|
fds = realloc(fds, num * sizeof(int));
|
|
|
|
|
paths = realloc(paths, num * sizeof(char *));
|
|
|
|
|
|
|
|
|
|
fd = open(ns, O_RDONLY);
|
|
|
|
|
if (fd < 0)
|
|
|
|
|
bail("failed to open %s", ns);
|
|
|
|
|
|
|
|
|
|
fds[num - 1] = fd;
|
|
|
|
|
paths[num - 1] = ns;
|
|
|
|
|
} while ((ns = strtok_r(NULL, ",", &saveptr)) != NULL);
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < num; i++) {
|
|
|
|
|
int fd = fds[i];
|
|
|
|
|
char *path = paths[i];
|
|
|
|
|
|
|
|
|
|
if (setns(fd, 0) < 0)
|
|
|
|
|
bail("failed to setns to %s", path);
|
|
|
|
|
|
|
|
|
|
close(fd);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
free(fds);
|
|
|
|
|
free(paths);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case NS_PATHS_ATTR:
|
|
|
|
|
config->namespaces = current;
|
|
|
|
|
config->namespaces_len = payload_len;
|
|
|
|
|
break;
|
|
|
|
|
case UIDMAP_ATTR:
|
|
|
|
|
config->uidmap = current;
|
|
|
|
|
config->uidmap_len = payload_len;
|
|
|
|
@ -393,6 +355,71 @@ void nl_free(struct nlconfig_t *config)
|
|
|
|
|
free(config->data);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void join_namespaces(char *nslist)
|
|
|
|
|
{
|
|
|
|
|
int num = 0, i;
|
|
|
|
|
char *saveptr = NULL;
|
|
|
|
|
char *namespace = strtok_r(nslist, ",", &saveptr);
|
|
|
|
|
struct namespace_t {
|
|
|
|
|
int fd;
|
|
|
|
|
int ns;
|
|
|
|
|
char type[PATH_MAX];
|
|
|
|
|
char path[PATH_MAX];
|
|
|
|
|
} *namespaces = NULL;
|
|
|
|
|
|
|
|
|
|
if (!namespace || !strlen(namespace) || !strlen(nslist))
|
|
|
|
|
bail("ns paths are empty");
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We have to open the file descriptors first, since after
|
|
|
|
|
* we join the mnt namespace we might no longer be able to
|
|
|
|
|
* access the paths.
|
|
|
|
|
*/
|
|
|
|
|
do {
|
|
|
|
|
int fd;
|
|
|
|
|
char *path;
|
|
|
|
|
struct namespace_t *ns;
|
|
|
|
|
|
|
|
|
|
/* Resize the namespace array. */
|
|
|
|
|
namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
|
|
|
|
|
if (!namespaces)
|
|
|
|
|
bail("failed to reallocate namespace array");
|
|
|
|
|
ns = &namespaces[num - 1];
|
|
|
|
|
|
|
|
|
|
/* Split 'ns:path'. */
|
|
|
|
|
path = strstr(namespace, ":");
|
|
|
|
|
if (!path)
|
|
|
|
|
bail("failed to parse %s", namespace);
|
|
|
|
|
*path++ = '\0';
|
|
|
|
|
|
|
|
|
|
fd = open(path, O_RDONLY);
|
|
|
|
|
if (fd < 0)
|
|
|
|
|
bail("failed to open %s", namespace);
|
|
|
|
|
|
|
|
|
|
ns->fd = fd;
|
|
|
|
|
ns->ns = nsflag(namespace);
|
|
|
|
|
strncpy(ns->path, path, PATH_MAX);
|
|
|
|
|
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* The ordering in which we join namespaces is important. We should
|
|
|
|
|
* always join the user namespace *first*. This is all guaranteed
|
|
|
|
|
* from the container_linux.go side of this, so we're just going to
|
|
|
|
|
* follow the order given to us.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < num; i++) {
|
|
|
|
|
struct namespace_t ns = namespaces[i];
|
|
|
|
|
|
|
|
|
|
if (setns(ns.fd, ns.ns) < 0)
|
|
|
|
|
bail("failed to setns to %s", ns.path);
|
|
|
|
|
|
|
|
|
|
close(ns.fd);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
free(namespaces);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void nsexec(void)
|
|
|
|
|
{
|
|
|
|
|
int pipenum;
|
|
|
|
@ -413,61 +440,311 @@ void nsexec(void)
|
|
|
|
|
|
|
|
|
|
/* clone(2) flags are mandatory. */
|
|
|
|
|
if (config.cloneflags == -1)
|
|
|
|
|
bail("missing clone_flags");
|
|
|
|
|
bail("missing cloneflags");
|
|
|
|
|
|
|
|
|
|
/* Pipe so we can tell the child when we've finished setting up. */
|
|
|
|
|
if (pipe(syncpipe) < 0)
|
|
|
|
|
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0)
|
|
|
|
|
bail("failed to setup sync pipe between parent and child");
|
|
|
|
|
|
|
|
|
|
/* Set up the jump point. */
|
|
|
|
|
if (setjmp(env) == JUMP_VAL) {
|
|
|
|
|
/*
|
|
|
|
|
* We're inside the child now, having jumped from the
|
|
|
|
|
* start_child() code after forking in the parent.
|
|
|
|
|
*/
|
|
|
|
|
uint8_t s = 0;
|
|
|
|
|
int consolefd = config.consolefd;
|
|
|
|
|
/* TODO: Currently we aren't dealing with child deaths properly. */
|
|
|
|
|
|
|
|
|
|
/* Close the writing side of pipe. */
|
|
|
|
|
close(syncpipe[1]);
|
|
|
|
|
/*
|
|
|
|
|
* Okay, so this is quite annoying.
|
|
|
|
|
*
|
|
|
|
|
* In order to make sure that we deal with older kernels (when CLONE_NEWUSER
|
|
|
|
|
* wasn't guaranteed to be done first if you specify multiple namespaces in
|
|
|
|
|
* a clone(2) invocation) as well as with certain usecases like rootless
|
|
|
|
|
* containers, we cannot just dump all of the cloneflags into clone(2).
|
|
|
|
|
* However, if we unshare(2) the user namespace *before* we clone(2), then
|
|
|
|
|
* all hell breaks loose.
|
|
|
|
|
*
|
|
|
|
|
* The parent no longer has permissions to do many things (unshare(2) drops
|
|
|
|
|
* all capabilities in your old namespace), and the container cannot be set
|
|
|
|
|
* up to have more than one {uid,gid} mapping. This is obviously less than
|
|
|
|
|
* ideal. In order to fix this, we have to first clone(2) and then unshare.
|
|
|
|
|
*
|
|
|
|
|
* Unfortunately, it's not as simple as that. We have to fork to enter the
|
|
|
|
|
* PID namespace (the PID namespace only applies to children). Since we'll
|
|
|
|
|
* have to double-fork, this clone_parent() call won't be able to get the
|
|
|
|
|
* PID of the _actual_ init process (without doing more synchronisation than
|
|
|
|
|
* I can deal with at the moment). So we'll just get the parent to send it
|
|
|
|
|
* for us, the only job of this process is to update
|
|
|
|
|
* /proc/pid/{setgroups,uid_map,gid_map}.
|
|
|
|
|
*
|
|
|
|
|
* And as a result of the above, we also need to setns(2) in the first child
|
|
|
|
|
* because if we join a PID namespace in the topmost parent then our child
|
|
|
|
|
* will be in that namespace (and it will not be able to give us a PID value
|
|
|
|
|
* that makes sense without resorting to sending things with cmsg).
|
|
|
|
|
*
|
|
|
|
|
* This also deals with an older issue caused by dumping cloneflags into
|
|
|
|
|
* clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
|
|
|
|
|
* we have to unshare(2) before clone(2) in order to do this. This was fixed
|
|
|
|
|
* in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
|
|
|
|
|
* introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
|
|
|
|
|
* aware, the last mainline kernel which had this bug was Linux 3.12.
|
|
|
|
|
* However, we cannot comment on which kernels the broken patch was
|
|
|
|
|
* backported to.
|
|
|
|
|
*
|
|
|
|
|
* -- Aleksa "what has my life come to?" Sarai
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/* Sync with parent. */
|
|
|
|
|
if (read(syncpipe[0], &s, sizeof(s)) != sizeof(s) || s != SYNC_VAL)
|
|
|
|
|
bail("failed to read sync byte from parent");
|
|
|
|
|
switch (setjmp(env)) {
|
|
|
|
|
/*
|
|
|
|
|
* Stage 0: We're in the parent. Our job is just to create a new child
|
|
|
|
|
* (stage 1: JUMP_CHILD) process and write its uid_map and
|
|
|
|
|
* gid_map. That process will go on to create a new process, then
|
|
|
|
|
* it will send us its PID which we will send to the bootstrap
|
|
|
|
|
* process.
|
|
|
|
|
*/
|
|
|
|
|
case JUMP_PARENT: {
|
|
|
|
|
int len;
|
|
|
|
|
pid_t child;
|
|
|
|
|
char buf[JSON_MAX];
|
|
|
|
|
|
|
|
|
|
if (setsid() < 0)
|
|
|
|
|
bail("setsid failed");
|
|
|
|
|
/* For debugging. */
|
|
|
|
|
prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);
|
|
|
|
|
|
|
|
|
|
if (setuid(0) < 0)
|
|
|
|
|
bail("setuid failed");
|
|
|
|
|
/* Start the process of getting a container. */
|
|
|
|
|
child = clone_parent(&env, JUMP_CHILD);
|
|
|
|
|
if (child < 0)
|
|
|
|
|
bail("unable to fork: child_func");
|
|
|
|
|
|
|
|
|
|
if (setgid(0) < 0)
|
|
|
|
|
bail("setgid failed");
|
|
|
|
|
/* State machine for synchronisation with the children. */
|
|
|
|
|
while (true) {
|
|
|
|
|
enum sync_t s;
|
|
|
|
|
|
|
|
|
|
if (setgroups(0, NULL) < 0)
|
|
|
|
|
bail("setgroups failed");
|
|
|
|
|
/* This doesn't need to be global, we're in the parent. */
|
|
|
|
|
int syncfd = syncpipe[1];
|
|
|
|
|
|
|
|
|
|
if (consolefd != -1) {
|
|
|
|
|
if (ioctl(consolefd, TIOCSCTTY, 0) < 0)
|
|
|
|
|
bail("ioctl TIOCSCTTY failed");
|
|
|
|
|
if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO)
|
|
|
|
|
bail("failed to dup stdin");
|
|
|
|
|
if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO)
|
|
|
|
|
bail("failed to dup stdout");
|
|
|
|
|
if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO)
|
|
|
|
|
bail("failed to dup stderr");
|
|
|
|
|
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
|
|
|
|
|
bail("failed to sync with child: next state");
|
|
|
|
|
|
|
|
|
|
switch (s) {
|
|
|
|
|
case SYNC_ERR: {
|
|
|
|
|
/* We have to mirror the error code of the child. */
|
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
|
|
if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
|
|
|
|
|
bail("failed to sync with child: read(error code)");
|
|
|
|
|
|
|
|
|
|
exit(ret);
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case SYNC_USERMAP_PLS:
|
|
|
|
|
/* Enable setgroups(2) if we've been asked to. */
|
|
|
|
|
if (config.is_setgroup)
|
|
|
|
|
update_setgroups(child, SETGROUPS_ALLOW);
|
|
|
|
|
|
|
|
|
|
/* Set up mappings. */
|
|
|
|
|
update_uidmap(child, config.uidmap, config.uidmap_len);
|
|
|
|
|
update_gidmap(child, config.gidmap, config.gidmap_len);
|
|
|
|
|
|
|
|
|
|
s = SYNC_USERMAP_ACK;
|
|
|
|
|
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
|
|
|
|
kill(child, SIGKILL);
|
|
|
|
|
bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case SYNC_USERMAP_ACK:
|
|
|
|
|
/* We should _never_ receive acks. */
|
|
|
|
|
kill(child, SIGKILL);
|
|
|
|
|
bail("failed to sync with child: unexpected SYNC_USERMAP_ACK");
|
|
|
|
|
break;
|
|
|
|
|
case SYNC_RECVPID_PLS: {
|
|
|
|
|
pid_t old = child;
|
|
|
|
|
|
|
|
|
|
/* Get the init_func pid. */
|
|
|
|
|
if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
|
|
|
|
|
kill(old, SIGKILL);
|
|
|
|
|
bail("failed to sync with child: read(childpid)");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Send ACK. */
|
|
|
|
|
s = SYNC_RECVPID_ACK;
|
|
|
|
|
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
|
|
|
|
kill(old, SIGKILL);
|
|
|
|
|
kill(child, SIGKILL);
|
|
|
|
|
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Leave the loop. */
|
|
|
|
|
goto out;
|
|
|
|
|
case SYNC_RECVPID_ACK:
|
|
|
|
|
/* We should _never_ receive acks. */
|
|
|
|
|
kill(child, SIGKILL);
|
|
|
|
|
bail("failed to sync with child: unexpected SYNC_RECVPID_ACK");
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
out:
|
|
|
|
|
/* Send the init_func pid back to our parent. */
|
|
|
|
|
len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
|
|
|
|
|
if (len < 0) {
|
|
|
|
|
kill(child, SIGKILL);
|
|
|
|
|
bail("unable to generate JSON for child pid");
|
|
|
|
|
}
|
|
|
|
|
if (write(pipenum, buf, len) != len) {
|
|
|
|
|
kill(child, SIGKILL);
|
|
|
|
|
bail("unable to send child pid to bootstrapper");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
exit(0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Free netlink data. */
|
|
|
|
|
nl_free(&config);
|
|
|
|
|
/*
|
|
|
|
|
* Stage 1: We're in the first child process. Our job is to join any
|
|
|
|
|
* provided user namespaces in the netlink payload. If we've been
|
|
|
|
|
* asked to CLONE_NEWUSER, we will unshare the user namespace and
|
|
|
|
|
* ask our parent (stage 0) to set up our user mappings for us.
|
|
|
|
|
* Then, we unshare the rest of the requested namespaces and
|
|
|
|
|
* create a new child (stage 2: JUMP_INIT). We then send the
|
|
|
|
|
* child's PID to our parent (stage 0).
|
|
|
|
|
*/
|
|
|
|
|
case JUMP_CHILD: {
|
|
|
|
|
pid_t child;
|
|
|
|
|
enum sync_t s;
|
|
|
|
|
|
|
|
|
|
/* Finish executing, let the Go runtime take over. */
|
|
|
|
|
return;
|
|
|
|
|
/* We're in a child and thus need to tell the parent if we die. */
|
|
|
|
|
syncfd = syncpipe[0];
|
|
|
|
|
|
|
|
|
|
/* For debugging. */
|
|
|
|
|
prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We need to setns first. We cannot do this earlier (in stage 0)
|
|
|
|
|
* because of the fact that we forked to get here (the PID of
|
|
|
|
|
* [stage 2: JUMP_INIT]) would be meaningless). We could send it
|
|
|
|
|
* using cmsg(3) but that's just annoying.
|
|
|
|
|
*/
|
|
|
|
|
if (config.namespaces)
|
|
|
|
|
join_namespaces(config.namespaces);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Unshare all of the namespaces. Now, it should be noted that this
|
|
|
|
|
* ordering might break in the future (especially with rootless
|
|
|
|
|
* containers). But for now, it's not possible to split this into
|
|
|
|
|
* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
|
|
|
|
|
*
|
|
|
|
|
* We also can't be sure if the current kernel supports
|
|
|
|
|
* clone(CLONE_PARENT | CLONE_NEWPID), so we'll just do it the long
|
|
|
|
|
* way anyway.
|
|
|
|
|
*/
|
|
|
|
|
if (unshare(config.cloneflags) < 0)
|
|
|
|
|
bail("failed to unshare namespaces");
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Deal with user namespaces first. They are quite special, as they
|
|
|
|
|
* affect our ability to unshare other namespaces and are used as
|
|
|
|
|
* context for privilege checks.
|
|
|
|
|
*/
|
|
|
|
|
if (config.cloneflags & CLONE_NEWUSER) {
|
|
|
|
|
/*
|
|
|
|
|
* We don't have the privileges to do any mapping here (see the
|
|
|
|
|
* clone_parent rant). So signal our parent to hook us up.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
s = SYNC_USERMAP_PLS;
|
|
|
|
|
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
|
|
|
|
|
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
|
|
|
|
|
|
|
|
|
|
/* ... wait for mapping ... */
|
|
|
|
|
|
|
|
|
|
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
|
|
|
|
|
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
|
|
|
|
|
if (s != SYNC_USERMAP_ACK)
|
|
|
|
|
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* TODO: What about non-namespace clone flags that we're dropping here? */
|
|
|
|
|
child = clone_parent(&env, JUMP_INIT);
|
|
|
|
|
if (child < 0)
|
|
|
|
|
bail("unable to fork: init_func");
|
|
|
|
|
|
|
|
|
|
/* Send the child to our parent, which knows what it's doing. */
|
|
|
|
|
s = SYNC_RECVPID_PLS;
|
|
|
|
|
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
|
|
|
|
kill(child, SIGKILL);
|
|
|
|
|
bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
|
|
|
|
|
}
|
|
|
|
|
if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
|
|
|
|
|
kill(child, SIGKILL);
|
|
|
|
|
bail("failed to sync with parent: write(childpid)");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* ... wait for parent to get the pid ... */
|
|
|
|
|
|
|
|
|
|
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
|
|
|
|
kill(child, SIGKILL);
|
|
|
|
|
bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
|
|
|
|
|
}
|
|
|
|
|
if (s != SYNC_RECVPID_ACK) {
|
|
|
|
|
kill(child, SIGKILL);
|
|
|
|
|
bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
|
|
|
|
|
exit(0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Stage 2: We're the final child process, and the only process that will
|
|
|
|
|
* actually return to the Go runtime. Our job is to just do the
|
|
|
|
|
* final cleanup steps and then return to the Go runtime to allow
|
|
|
|
|
* init_linux.go to run.
|
|
|
|
|
*/
|
|
|
|
|
case JUMP_INIT: {
|
|
|
|
|
/*
|
|
|
|
|
* We're inside the child now, having jumped from the
|
|
|
|
|
* start_child() code after forking in the parent.
|
|
|
|
|
*/
|
|
|
|
|
int consolefd = config.consolefd;
|
|
|
|
|
|
|
|
|
|
/* We're in a child and thus need to tell the parent if we die. */
|
|
|
|
|
syncfd = syncpipe[0];
|
|
|
|
|
|
|
|
|
|
/* For debugging. */
|
|
|
|
|
prctl(PR_SET_NAME, (unsigned long) "runc:[1:INIT]", 0, 0, 0);
|
|
|
|
|
|
|
|
|
|
if (setsid() < 0)
|
|
|
|
|
bail("setsid failed");
|
|
|
|
|
|
|
|
|
|
if (setuid(0) < 0)
|
|
|
|
|
bail("setuid failed");
|
|
|
|
|
|
|
|
|
|
if (setgid(0) < 0)
|
|
|
|
|
bail("setgid failed");
|
|
|
|
|
|
|
|
|
|
if (setgroups(0, NULL) < 0)
|
|
|
|
|
bail("setgroups failed");
|
|
|
|
|
|
|
|
|
|
if (consolefd != -1) {
|
|
|
|
|
if (ioctl(consolefd, TIOCSCTTY, 0) < 0)
|
|
|
|
|
bail("ioctl TIOCSCTTY failed");
|
|
|
|
|
if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO)
|
|
|
|
|
bail("failed to dup stdin");
|
|
|
|
|
if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO)
|
|
|
|
|
bail("failed to dup stdout");
|
|
|
|
|
if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO)
|
|
|
|
|
bail("failed to dup stderr");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Close sync pipes. */
|
|
|
|
|
close(syncpipe[0]);
|
|
|
|
|
close(syncpipe[1]);
|
|
|
|
|
|
|
|
|
|
/* Free netlink data. */
|
|
|
|
|
nl_free(&config);
|
|
|
|
|
|
|
|
|
|
/* Finish executing, let the Go runtime take over. */
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
default:
|
|
|
|
|
bail("unexpected jump value");
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Run the parent code. */
|
|
|
|
|
start_child(pipenum, &env, syncpipe, &config);
|
|
|
|
|
|
|
|
|
|
/* Should never be reached. */
|
|
|
|
|
bail("should never be reached");
|
|
|
|
|
}
|
|
|
|
|