#define _GNU_SOURCE
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <limits.h>
#include <sched.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/types.h>

#include <linux/limits.h>
#include <linux/netlink.h>
#include <linux/types.h>

/* Byte the parent writes to the sync pipe once the child may proceed. */
#define SYNC_VAL 0x42
/* Value handed to longjmp() by the cloned child; setjmp() returns it. */
#define JUMP_VAL 0x43

/* Assume the stack grows down, so arguments should be above it. */
struct clone_arg {
	/*
	 * Reserve some space for clone() to locate arguments
	 * and retcode in this place
	 */
	char stack[4096] __attribute__ ((aligned(16)));
	/* Zero-length marker for the top of the stack handed to clone(). */
	char stack_ptr[0];
	jmp_buf *env;
};

/* Configuration decoded from the netlink bootstrap message. */
struct nsenter_config {
	uint32_t cloneflags;
	char *uidmap;
	int uidmap_len;
	char *gidmap;
	int gidmap_len;
	uint8_t is_setgroup;
	int consolefd;
};

/* Sentinel stored in nsenter_config.cloneflags until the attribute is seen. */
#define CLONE_FLAGS_UNSET ((uint32_t)-1)

/*
 * List of netlink message types sent to us as part of bootstrapping the init.
 * These constants are defined in libcontainer/message_linux.go.
 */
#define INIT_MSG		62000
#define CLONE_FLAGS_ATTR	27281
#define CONSOLE_PATH_ATTR	27282
#define NS_PATHS_ATTR		27283
#define UIDMAP_ATTR		27284
#define GIDMAP_ATTR		27285
#define SETGROUP_ATTR		27286

/*
 * Use the raw syscall for versions of glibc which don't include a function for
 * it, namely (glibc 2.12).
 */
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
#	define _GNU_SOURCE
#	include "syscall.h"
#	if !defined(SYS_setns) && defined(__NR_setns)
#		define SYS_setns __NR_setns
#	endif

#ifndef SYS_setns
#	error "setns(2) syscall not supported by glibc version"
#endif

int setns(int fd, int nstype)
{
	return syscall(SYS_setns, fd, nstype);
}
#endif

/*
 * Print "nsenter: <message>: <strerror(errno)>" to stderr and exit. Each
 * expansion exits with a distinct non-zero status (__COUNTER__ + 1).
 */
/* TODO(cyphar): Fix this so it correctly deals with syncT. */
#define bail(fmt, ...)								\
	do {									\
		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__);	\
		exit(__COUNTER__ + 1);						\
	} while(0)

/*
 * Entry point of the cloned child: jump straight back to the setjmp() point
 * recorded in the clone_arg. Never returns.
 */
static int child_func(void *arg)
{
	struct clone_arg *ca = (struct clone_arg *)arg;

	longjmp(*ca->env, JUMP_VAL);
}

/*
 * clone(2) a child that shares our parent (CLONE_PARENT) and resumes at the
 * setjmp() point stored in env. Returns the child pid, or a negative value
 * with errno set on failure.
 */
static int clone_parent(jmp_buf *env, int flags) __attribute__ ((noinline));
static int clone_parent(jmp_buf *env, int flags)
{
	int child;
	struct clone_arg ca = {
		.env = env,
	};

	child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, &ca);

	/*
	 * On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so we have
	 * to unshare(2) before clone(2) in order to do this. This was fixed in
	 * upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e.
	 *
	 * As far as we're aware, the last mainline kernel which had this bug was
	 * Linux 3.12. However, we cannot comment on which kernels the broken patch
	 * was backported to.
	 *
	 * errno is only meaningful when clone() actually failed; a stale EINVAL
	 * from an earlier call must not trigger the fallback.
	 */
	if (child < 0 && errno == EINVAL) {
		if (unshare(flags) < 0)
			bail("unable to unshare namespaces");
		child = clone(child_func, ca.stack_ptr, SIGCHLD | CLONE_PARENT, &ca);
	}

	return child;
}

/*
 * Gets the init pipe fd from the environment, which is used to read the
 * bootstrap data and tell the parent what the new pid is after we finish
 * setting up the environment.
 *
 * Returns -1 when _LIBCONTAINER_INITPIPE is unset or empty; bails when the
 * value is not a decimal integer that fits in an int.
 */
static int initpipe(void)
{
	long pipenum;
	char *value, *endptr;

	value = getenv("_LIBCONTAINER_INITPIPE");
	if (value == NULL || *value == '\0')
		return -1;

	errno = 0;
	pipenum = strtol(value, &endptr, 10);
	if (errno != 0 || *endptr != '\0')
		bail("unable to parse _LIBCONTAINER_INITPIPE");
	if (pipenum < INT_MIN || pipenum > INT_MAX) {
		errno = ERANGE;
		bail("unable to parse _LIBCONTAINER_INITPIPE");
	}

	return (int)pipenum;
}

/* Read a native-endian uint32_t from a possibly unaligned buffer. */
static uint32_t readint32(char *buf)
{
	uint32_t value;

	memcpy(&value, buf, sizeof(value));
	return value;
}

/* Read a single byte from the buffer as an unsigned value. */
static uint8_t readint8(char *buf)
{
	return *(uint8_t *) buf;
}

/*
 * Write data_len bytes of data to the file named by the printf-style
 * pathfmt. The file must already exist (it is opened O_RDWR, not created).
 *
 * Returns 0 on success and -1 on failure. On failure errno describes the
 * first error that occurred: it is not clobbered by the cleanup close(2),
 * so callers may inspect it (e.g. for ENOENT).
 */
static int write_file(const char *data, size_t data_len, const char *pathfmt, ...)
{
	int fd, len, saved_errno;
	ssize_t written;
	char path[PATH_MAX];
	va_list ap;

	va_start(ap, pathfmt);
	len = vsnprintf(path, PATH_MAX, pathfmt, ap);
	va_end(ap);

	if (len < 0)
		return -1;
	if (len >= PATH_MAX) {
		/* The formatted path was truncated; refuse to open a wrong file. */
		errno = ENAMETOOLONG;
		return -1;
	}

	fd = open(path, O_RDWR);
	if (fd < 0)
		return -1;

	written = write(fd, data, data_len);
	if (written < 0 || (size_t)written != data_len) {
		saved_errno = errno;
		close(fd);
		errno = saved_errno;
		return -1;
	}

	close(fd);
	return 0;
}

#define SETGROUPS_ALLOW "allow"
#define SETGROUPS_DENY  "deny"

/*
 * Write the setgroups policy ("allow" or "deny") for the given pid.
 * This *must* be called before we touch gid_map.
 */
static void update_setgroups(int pid, bool setgroup)
{
	const char *policy;

	if (setgroup)
		policy = SETGROUPS_ALLOW;
	else
		policy = SETGROUPS_DENY;

	if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
		/*
		 * If the kernel is too old to support /proc/pid/setgroups,
		 * open(2) or write(2) will return ENOENT. This is fine.
		 */
		if (errno != ENOENT)
			bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
	}
}

/* Write the uid mapping for pid; a NULL or empty map is a no-op. */
static void update_uidmap(int pid, char *map, int map_len)
{
	if (map == NULL || map_len <= 0)
		return;

	if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0)
		bail("failed to update /proc/%d/uid_map", pid);
}

/* Write the gid mapping for pid; a NULL or empty map is a no-op. */
static void update_gidmap(int pid, char *map, int map_len)
{
	if (map == NULL || map_len <= 0)
		return;

	if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0)
		bail("failed to update /proc/%d/gid_map", pid);
}

#define JSON_MAX 4096

/*
 * Parent side of the bootstrap: clone the child, apply the id mappings to
 * it, release it through the sync pipe and report its pid as JSON on
 * pipenum. Always terminates the calling process (exit(0) or bail()).
 */
static void start_child(int pipenum, jmp_buf *env, int syncpipe[2], struct nsenter_config *config)
{
	int len, childpid;
	char buf[JSON_MAX];
	uint8_t syncval;

	/*
	 * We must fork to actually enter the PID namespace, and use
	 * CLONE_PARENT so that the child init can have the right parent
	 * (the bootstrap process). Also so we don't need to forward the
	 * child's exit code or resend its death signal.
	 */
	childpid = clone_parent(env, config->cloneflags);
	if (childpid < 0)
		bail("unable to fork");

	/* Update setgroups, uid_map and gid_map for the process if provided. */
	if (config->is_setgroup)
		update_setgroups(childpid, true);
	update_uidmap(childpid, config->uidmap, config->uidmap_len);
	update_gidmap(childpid, config->gidmap, config->gidmap_len);

	/* Send the sync signal to the child. */
	close(syncpipe[0]);
	syncval = SYNC_VAL;
	if (write(syncpipe[1], &syncval, sizeof(syncval)) != sizeof(syncval))
		bail("failed to write sync byte to child");

	/* Send the child pid back to our parent */
	len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", childpid);
	if (len < 0 || len >= JSON_MAX || write(pipenum, buf, len) != len) {
		kill(childpid, SIGKILL);
		bail("unable to send a child pid");
	}

	exit(0);
}

/*
 * Bail unless the payload contains a NUL terminator, so that it can be
 * used safely as a C string.
 */
static void require_string_payload(const char *payload, size_t payload_len, int type)
{
	if (memchr(payload, '\0', payload_len) == NULL) {
		errno = EINVAL;
		bail("netlink attribute %d is not NUL-terminated", type);
	}
}

/*
 * Open every namespace path in the comma-separated list and setns(2) into
 * each one, in the order given. The list is tokenised in place.
 */
static void join_namespaces(char *nslist)
{
	char *saveptr = NULL;
	char *ns = strtok_r(nslist, ",", &saveptr);
	int *fds = NULL, num = 0, i;
	char **paths = NULL;

	if (!ns || !strlen(nslist))
		bail("ns paths are empty");

	/*
	 * We have to open the file descriptors first, since after
	 * we join the mnt namespace we might no longer be able to
	 * access the paths.
	 */
	do {
		int fd;
		int *new_fds;
		char **new_paths;

		/* Grow both arrays by one slot. */
		num++;
		new_fds = realloc(fds, num * sizeof(*fds));
		if (new_fds == NULL)
			bail("failed to allocate namespace fd list");
		fds = new_fds;

		new_paths = realloc(paths, num * sizeof(*paths));
		if (new_paths == NULL)
			bail("failed to allocate namespace path list");
		paths = new_paths;

		fd = open(ns, O_RDONLY);
		if (fd < 0)
			bail("failed to open %s", ns);

		fds[num - 1] = fd;
		paths[num - 1] = ns;
	} while ((ns = strtok_r(NULL, ",", &saveptr)) != NULL);

	for (i = 0; i < num; i++) {
		/*
		 * We currently don't have any context for what kind of
		 * namespace we're joining, so just blindly do it.
		 */
		if (setns(fds[i], 0) < 0)
			bail("failed to setns to %s", paths[i]);
		close(fds[i]);
	}

	free(fds);
	free(paths);
}

/*
 * Decode the netlink attributes in data[0..data_size) into an
 * nsenter_config. Attributes are handled in message order; the console and
 * namespace-path attributes take effect immediately (the console is opened,
 * the namespaces are joined). The uidmap/gidmap pointers in the result
 * point into data, which must therefore outlive the returned config.
 *
 * cloneflags is CLONE_FLAGS_UNSET and consolefd is -1 when the
 * corresponding attribute is absent. Malformed input causes a bail().
 */
static struct nsenter_config process_nl_attributes(int pipenum, char *data, int data_size)
{
	char *current = data;
	char *end = data + data_size;
	struct nsenter_config config = { 0 };

	(void)pipenum;

	config.cloneflags = CLONE_FLAGS_UNSET;
	config.consolefd = -1;

	while (current < end) {
		struct nlattr attr;
		size_t remaining = (size_t)(end - current);
		size_t payload_len;

		if (remaining < NLA_HDRLEN) {
			errno = EINVAL;
			bail("truncated netlink attribute header");
		}

		/* Copy the header out: the buffer is not guaranteed to be aligned. */
		memcpy(&attr, current, sizeof(attr));
		if (attr.nla_len < NLA_HDRLEN || attr.nla_len > remaining) {
			errno = EINVAL;
			bail("invalid netlink attribute length %d", attr.nla_len);
		}
		payload_len = attr.nla_len - NLA_HDRLEN;

		/* Advance to payload. */
		current += NLA_HDRLEN;

		/* Handle payload. */
		switch (attr.nla_type) {
		case CLONE_FLAGS_ATTR:
			if (payload_len < sizeof(uint32_t)) {
				errno = EINVAL;
				bail("invalid netlink attribute length %d", attr.nla_len);
			}
			config.cloneflags = readint32(current);
			break;
		case CONSOLE_PATH_ATTR:
			/*
			 * The context in which this is done (before or after we
			 * join the other namespaces) will affect how the path
			 * resolution of the console works. This order is not
			 * decided here, but rather in container_linux.go. We just
			 * follow the order given by the netlink message.
			 */
			require_string_payload(current, payload_len, attr.nla_type);
			config.consolefd = open(current, O_RDWR);
			if (config.consolefd < 0)
				bail("failed to open console %s", current);
			break;
		case NS_PATHS_ATTR:
			/*
			 * Open each namespace path and setns it in the
			 * order provided to us.
			 */
			require_string_payload(current, payload_len, attr.nla_type);
			join_namespaces(current);
			break;
		case UIDMAP_ATTR:
			config.uidmap = current;
			config.uidmap_len = payload_len;
			break;
		case GIDMAP_ATTR:
			config.gidmap = current;
			config.gidmap_len = payload_len;
			break;
		case SETGROUP_ATTR:
			if (payload_len < sizeof(uint8_t)) {
				errno = EINVAL;
				bail("invalid netlink attribute length %d", attr.nla_len);
			}
			config.is_setgroup = readint8(current);
			break;
		default:
			bail("unknown netlink message type %d", attr.nla_type);
		}

		/* Skip the payload plus its alignment padding. */
		current += NLA_ALIGN(payload_len);
	}

	return config;
}

/*
 * Bootstrap entry point. When _LIBCONTAINER_INITPIPE is not set this
 * returns immediately. Otherwise it reads the netlink bootstrap message
 * from the init pipe, applies it, and clones a child: the original process
 * reports the child's pid on the pipe and exits, while the child finishes
 * its setup (session, ids, console) and returns to the caller.
 */
void nsexec(void)
{
	int pipenum;

	/*
	 * If we don't have an init pipe, just return to the go routine.
	 * We'll only get an init pipe for start or exec.
	 */
	pipenum = initpipe();
	if (pipenum == -1)
		return;

	/* Retrieve the netlink header. */
	int len;
	struct nlmsghdr hdr;

	len = read(pipenum, &hdr, NLMSG_HDRLEN);
	if (len != NLMSG_HDRLEN)
		bail("invalid netlink header length %d", len);

	if (hdr.nlmsg_type == NLMSG_ERROR)
		bail("failed to read netlink message");

	if (hdr.nlmsg_type != INIT_MSG)
		bail("unexpected msg type %d", hdr.nlmsg_type);

	/* A length shorter than the header would make the payload size wrap. */
	if (hdr.nlmsg_len < NLMSG_HDRLEN) {
		errno = EINVAL;
		bail("invalid netlink message length %u", hdr.nlmsg_len);
	}

	/* Retrieve data. */
	size_t size = NLMSG_PAYLOAD(&hdr, 0);
	char *data = alloca(size);

	len = read(pipenum, data, size);
	if (len < 0 || (size_t)len != size)
		bail("failed to read netlink payload, %d != %zu", len, size);

	jmp_buf env;
	int syncpipe[2];
	struct nsenter_config config = process_nl_attributes(pipenum, data, size);

	/* clone(2) flags are mandatory. */
	if (config.cloneflags == CLONE_FLAGS_UNSET) {
		errno = EINVAL;
		bail("missing clone_flags");
	}

	/* Pipe so we can tell the child when we've finished setting up. */
	if (pipe(syncpipe) < 0)
		bail("failed to setup sync pipe between parent and child");

	/* Set up the jump point. */
	if (setjmp(env) == JUMP_VAL) {
		/*
		 * We're inside the child now, having jumped from the
		 * start_child() code after forking in the parent.
		 */
		uint8_t s = 0;
		int consolefd = config.consolefd;

		/* Close the writing side of pipe. */
		close(syncpipe[1]);

		/* Sync with parent. */
		if (read(syncpipe[0], &s, sizeof(s)) != sizeof(s) || s != SYNC_VAL)
			bail("failed to read sync byte from parent");

		if (setsid() < 0)
			bail("setsid failed");

		if (setuid(0) < 0)
			bail("setuid failed");

		if (setgid(0) < 0)
			bail("setgid failed");

		if (setgroups(0, NULL) < 0)
			bail("setgroups failed");

		if (consolefd != -1) {
			if (ioctl(consolefd, TIOCSCTTY, 0) < 0)
				bail("ioctl TIOCSCTTY failed");
			if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO)
				bail("failed to dup stdin");
			if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO)
				bail("failed to dup stdout");
			if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO)
				bail("failed to dup stderr");
		}

		/* Finish executing, let the Go runtime take over. */
		return;
	}

	/* Run the parent code. */
	start_child(pipenum, &env, syncpipe, &config);

	/* Should never be reached. */
	bail("should never be reached");
}