Refactor nsexec

Split nsexec into smaller routines to make it more readable.

Signed-off-by: Kenfe-Mickael Laventure <mickael.laventure@gmail.com>
This commit is contained in:
Kenfe-Mickael Laventure 2016-01-14 17:08:45 -08:00
parent 002b6c2fe8
commit 08c3c6ebe2
1 changed file with 249 additions and 206 deletions

View File

@ -13,6 +13,7 @@
#include <string.h> #include <string.h>
#include <sys/ioctl.h> #include <sys/ioctl.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h> #include <unistd.h>
#include <bits/sockaddr.h> #include <bits/sockaddr.h>
@ -21,50 +22,71 @@
#include <stdint.h> #include <stdint.h>
#include <sys/socket.h> #include <sys/socket.h>
/* All arguments should be above stack, because it grows down */ // All arguments should be above the stack because it grows down
struct clone_arg { struct clone_arg {
/* /*
* Reserve some space for clone() to locate arguments * Reserve some space for clone() to locate arguments
* and retcode in this place * and retcode in this place
*/ */
char stack[4096] __attribute__ ((aligned(16))); char stack[4096] __attribute__((aligned(16)));
char stack_ptr[0]; char stack_ptr[0];
jmp_buf *env; jmp_buf *env;
}; };
#define pr_perror(fmt, ...) fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__) struct nsenter_config {
uint32_t cloneflags;
char *uidmap;
int uidmap_len;
char *gidmap;
int gidmap_len;
uint8_t is_setgroup;
};
// list of known message types we want to send to bootstrap program
// These are defined in libcontainer/message_linux.go
#define INIT_MSG 62000
#define CLONE_FLAGS_ATTR 27281
#define CONSOLE_PATH_ATTR 27282
#define NS_PATHS_ATTR 27283
#define UIDMAP_ATTR 27284
#define GIDMAP_ATTR 27285
#define SETGROUP_ATTR 27286
// Use raw setns syscall for versions of glibc that don't include it
// (namely glibc-2.12)
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
#define _GNU_SOURCE
#include "syscall.h"
#if defined(__NR_setns) && !defined(SYS_setns)
#define SYS_setns __NR_setns
#endif
#ifdef SYS_setns
int setns(int fd, int nstype)
{
return syscall(SYS_setns, fd, nstype);
}
#endif
#endif
#define pr_perror(fmt, ...) \
fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__)
static int child_func(void *_arg) static int child_func(void *_arg)
{ {
struct clone_arg *arg = (struct clone_arg *)_arg; struct clone_arg *arg = (struct clone_arg *)_arg;
longjmp(*arg->env, 1); longjmp(*arg->env, 1);
} }
// Use raw setns syscall for versions of glibc that don't include it (namely glibc-2.12) static int clone_parent(jmp_buf *env, int flags) __attribute__((noinline));
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 static int clone_parent(jmp_buf *env, int flags)
#define _GNU_SOURCE
#include "syscall.h"
#if defined(__NR_setns) && !defined(SYS_setns)
#define SYS_setns __NR_setns
#endif
#ifdef SYS_setns
int setns(int fd, int nstype)
{
return syscall(SYS_setns, fd, nstype);
}
#endif
#endif
static int clone_parent(jmp_buf * env, int flags) __attribute__ ((noinline));
static int clone_parent(jmp_buf * env, int flags)
{ {
struct clone_arg ca; struct clone_arg ca;
int child; int child;
ca.env = env; ca.env = env;
child = child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags,
clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, &ca);
&ca);
return child; return child;
} }
@ -72,8 +94,9 @@ static int clone_parent(jmp_buf * env, int flags)
// write pid to after nsexec finishes setting up the environment. // write pid to after nsexec finishes setting up the environment.
static int get_init_pipe() static int get_init_pipe()
{ {
char buf[PATH_MAX], *initpipe; char buf[PATH_MAX];
int pipenum = -1; char *initpipe;
int pipenum = -1;
initpipe = getenv("_LIBCONTAINER_INITPIPE"); initpipe = getenv("_LIBCONTAINER_INITPIPE");
if (initpipe == NULL) { if (initpipe == NULL) {
@ -94,7 +117,8 @@ static int get_init_pipe()
// argument is a comma-separated string of namespace paths. // argument is a comma-separated string of namespace paths.
static int num_namespaces(char *nspaths) static int num_namespaces(char *nspaths)
{ {
int size = 0, i = 0; int i;
int size = 0;
for (i = 0; nspaths[i]; i++) { for (i = 0; nspaths[i]; i++) {
if (nspaths[i] == ',') { if (nspaths[i] == ',') {
@ -107,100 +131,154 @@ static int num_namespaces(char *nspaths)
static uint32_t readint32(char *buf) static uint32_t readint32(char *buf)
{ {
return *(uint32_t *) buf; return *(uint32_t *)buf;
} }
static uint8_t readint8(char *buf) static uint8_t readint8(char *buf)
{ {
return *(uint8_t *) buf; return *(uint8_t *)buf;
} }
static void writedata(int fd, char *data, int start, int len) static void update_process_idmap(char *pathfmt, int pid, char *map, int map_len)
{ {
int written = 0; char buf[PATH_MAX];
while (written < len) { int len;
size_t nbyte, i; int fd;
if ((len - written) < 1024) {
nbyte = len - written; len = snprintf(buf, sizeof(buf), pathfmt, pid);
} else { if (len < 0) {
nbyte = 1024; pr_perror("failed to construct '%s' for %d", pathfmt, pid);
} exit(1);
i = write(fd, data + start + written, nbyte);
if (i == -1) {
pr_perror("failed to write data to %d", fd);
exit(1);
}
written += i;
} }
fd = open(buf, O_RDWR);
if (fd == -1) {
pr_perror("failed to open %s", buf);
exit(1);
}
len = write(fd, map, map_len);
if (len == -1) {
pr_perror("failed to write to %s", buf);
exit(1);
} else if (len != map_len) {
fprintf(stderr, "Failed to write data to %s (%d/%d)",
buf, len, map_len);
exit(1);
}
close(fd);
} }
// list of known message types we want to send to bootstrap program static void update_process_uidmap(int pid, char *map, int map_len)
// These are defined in libcontainer/message_linux.go
#define INIT_MSG 62000
#define CLONE_FLAGS_ATTR 27281
#define CONSOLE_PATH_ATTR 27282
#define NS_PATHS_ATTR 27283
#define UIDMAP_ATTR 27284
#define GIDMAP_ATTR 27285
#define SETGROUP_ATTR 27286
void nsexec()
{ {
jmp_buf env; if ((map == NULL) || (map_len <= 0)) {
int pipenum;
// if we dont have init pipe, then just return to the parent
pipenum = get_init_pipe();
if (pipenum == -1) {
return; return;
} }
// Retrieve the netlink header
struct nlmsghdr nl_msg_hdr;
int len;
if ((len = read(pipenum, &nl_msg_hdr, NLMSG_HDRLEN)) != NLMSG_HDRLEN) { update_process_idmap("/proc/%d/uid_map", pid, map, map_len);
pr_perror("Failed to read netlink header, got %d instead of %d", }
len, NLMSG_HDRLEN);
static void update_process_gidmap(int pid, uint8_t is_setgroup, char *map, int map_len)
{
if ((map == NULL) || (map_len <= 0)) {
return;
}
if (is_setgroup == 1) {
int fd;
int len;
char buf[PATH_MAX];
len = snprintf(buf, sizeof(buf), "/proc/%d/setgroups", pid);
if (len < 0) {
pr_perror("failed to get setgroups path for %d", pid);
exit(1);
}
fd = open(buf, O_RDWR);
if (fd == -1) {
pr_perror("failed to open %s", buf);
exit(1);
}
if (write(fd, "allow", 5) != 5) {
// If the kernel is too old to support
// /proc/PID/setgroups, write will return
// ENOENT; this is OK.
if (errno != ENOENT) {
pr_perror("failed to write allow to %s", buf);
exit(1);
}
}
close(fd);
}
update_process_idmap("/proc/%d/gid_map", pid, map, map_len);
}
static void start_child(int pipenum, jmp_buf *env, int syncpipe[2],
struct nsenter_config *config)
{
int len;
int childpid;
char buf[PATH_MAX];
uint8_t syncbyte = 1;
// We must fork to actually enter the PID namespace, use CLONE_PARENT
// so the child can have the right parent, and we don't need to forward
// the child's exit code or resend its death signal.
childpid = clone_parent(env, config->cloneflags);
if (childpid < 0) {
pr_perror("Unable to fork");
exit(1); exit(1);
} }
if (nl_msg_hdr.nlmsg_type == NLMSG_ERROR) { // update uid_map and gid_map for the child process if they
pr_perror("failed to read netlink message"); // were provided
exit(1); update_process_uidmap(childpid, config->uidmap, config->uidmap_len);
}
if (nl_msg_hdr.nlmsg_type != INIT_MSG) {
pr_perror("unexpected msg type %d", nl_msg_hdr.nlmsg_type);
exit(1);
}
// Retrieve data
int nl_total_size = NLMSG_PAYLOAD(&nl_msg_hdr, 0);
char data[nl_total_size];
if ((len = read(pipenum, data, nl_total_size)) != nl_total_size) { update_process_gidmap(childpid, config->is_setgroup, config->gidmap, config->gidmap_len);
pr_perror
("Failed to read netlink payload, got %d instead of %d", // Send the sync signal to the child
len, nl_total_size); close(syncpipe[0]);
syncbyte = 1;
if (write(syncpipe[1], &syncbyte, 1) != 1) {
pr_perror("failed to write sync byte to child");
exit(1); exit(1);
} }
// Process the passed attributes
int start = 0;
uint32_t cloneflags = -1;
uint8_t is_setgroup = 0;
int consolefd = -1;
int uidmap_start = -1, uidmap_len = -1;
int gidmap_start = -1, gidmap_len = -1;
int payload_len;
struct nlattr *nlattr;
while (start < nl_total_size) { // Send the child pid back to our parent
len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", childpid);
if ((len < 0) || (write(pipenum, buf, len) != len)) {
pr_perror("Unable to send a child pid");
kill(childpid, SIGKILL);
exit(1);
}
exit(0);
}
static void process_nl_attributes(int pipenum, char *data, int data_size)
{
jmp_buf env;
struct nsenter_config config = {0};
struct nlattr *nlattr;
int payload_len;
int start = 0;
int consolefd = -1;
int syncpipe[2] = {-1, -1};
while (start < data_size) {
nlattr = (struct nlattr *)(data + start); nlattr = (struct nlattr *)(data + start);
start += NLA_HDRLEN; start += NLA_HDRLEN;
payload_len = nlattr->nla_len - NLA_HDRLEN; payload_len = nlattr->nla_len - NLA_HDRLEN;
if (nlattr->nla_type == CLONE_FLAGS_ATTR) { if (nlattr->nla_type == CLONE_FLAGS_ATTR) {
cloneflags = readint32(data + start); config.cloneflags = readint32(data + start);
} else if (nlattr->nla_type == CONSOLE_PATH_ATTR) { } else if (nlattr->nla_type == CONSOLE_PATH_ATTR) {
// get the console path before setns because it may change mnt namespace // get the console path before setns because it may
// change mnt namespace
consolefd = open(data + start, O_RDWR); consolefd = open(data + start, O_RDWR);
if (consolefd < 0) { if (consolefd < 0) {
pr_perror("Failed to open console %s", pr_perror("Failed to open console %s",
@ -208,24 +286,20 @@ void nsexec()
exit(1); exit(1);
} }
} else if (nlattr->nla_type == NS_PATHS_ATTR) { } else if (nlattr->nla_type == NS_PATHS_ATTR) {
char nspaths[payload_len + 1]; // if custom namespaces are required, open all
// descriptors and perform setns on them
strncpy(nspaths, data + start, payload_len); int i;
nspaths[payload_len] = '\0'; int nslen = num_namespaces(data + start);
int fds[nslen];
// if custom namespaces are required, open all descriptors and perform char *nslist[nslen];
// setns on them char *ns;
int nslen = num_namespaces(nspaths); char *saveptr;
int fds[nslen];
char *nslist[nslen];
int i;
char *ns, *saveptr;
for (i = 0; i < nslen; i++) { for (i = 0; i < nslen; i++) {
char *str = NULL; char *str = NULL;
if (i == 0) { if (i == 0) {
str = nspaths; str = data + start;
} }
ns = strtok_r(str, ",", &saveptr); ns = strtok_r(str, ",", &saveptr);
if (ns == NULL) { if (ns == NULL) {
@ -241,22 +315,21 @@ void nsexec()
for (i = 0; i < nslen; i++) { for (i = 0; i < nslen; i++) {
if (setns(fds[i], 0) != 0) { if (setns(fds[i], 0) != 0) {
pr_perror("Failed to setns to %s", pr_perror("Failed to setns to %s", nslist[i]);
nslist[i]);
exit(1); exit(1);
} }
close(fds[i]); close(fds[i]);
} }
} else if (nlattr->nla_type == UIDMAP_ATTR) { } else if (nlattr->nla_type == UIDMAP_ATTR) {
uidmap_len = payload_len; config.uidmap = data + start;
uidmap_start = start; config.uidmap_len = payload_len;
} else if (nlattr->nla_type == GIDMAP_ATTR) { } else if (nlattr->nla_type == GIDMAP_ATTR) {
gidmap_len = payload_len; config.gidmap = data + start;
gidmap_start = start; config.gidmap_len = payload_len;
} else if (nlattr->nla_type == SETGROUP_ATTR) { } else if (nlattr->nla_type == SETGROUP_ATTR) {
is_setgroup = readint8(data + start); config.is_setgroup = readint8(data + start);
} else { } else {
pr_perror("unknown netlink message type %d", pr_perror("Unknown netlink message type %d",
nlattr->nla_type); nlattr->nla_type);
exit(1); exit(1);
} }
@ -265,30 +338,30 @@ void nsexec()
} }
// required clone_flags to be passed // required clone_flags to be passed
if (cloneflags == -1) { if (config.cloneflags == -1) {
pr_perror("missing clone_flags"); pr_perror("Missing clone_flags");
exit(1); exit(1);
} }
// prepare sync pipe between parent and child. We need this to let the child // prepare sync pipe between parent and child. We need this to let the
// child
// know that the parent has finished setting up // know that the parent has finished setting up
int syncpipe[2] = { -1, -1 };
if (pipe(syncpipe) != 0) { if (pipe(syncpipe) != 0) {
pr_perror("failed to setup sync pipe between parent and child"); pr_perror("Failed to setup sync pipe between parent and child");
exit(1); exit(1);
}; }
if (setjmp(env) == 1) { if (setjmp(env) == 1) {
// Child // Child
uint8_t s; uint8_t s = 0;
// close the writing side of pipe // close the writing side of pipe
close(syncpipe[1]); close(syncpipe[1]);
// sync with parent // sync with parent
if (read(syncpipe[0], &s, 1) != 1 || s != 1) { if ((read(syncpipe[0], &s, 1) != 1) || (s != 1)) {
pr_perror("failed to read sync byte from parent"); pr_perror("Failed to read sync byte from parent");
exit(1); exit(1);
}; }
if (setsid() == -1) { if (setsid() == -1) {
pr_perror("setsid failed"); pr_perror("setsid failed");
@ -301,95 +374,65 @@ void nsexec()
exit(1); exit(1);
} }
if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) { if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) {
pr_perror("Failed to dup 0"); pr_perror("Failed to dup stdin");
exit(1); exit(1);
} }
if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) { if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) {
pr_perror("Failed to dup 1"); pr_perror("Failed to dup stdout");
exit(1); exit(1);
} }
if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO) { if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO) {
pr_perror("Failed to dup 2"); pr_perror("Failed to dup stderr");
exit(1); exit(1);
} }
} }
// Finish executing, let the Go runtime take over. // Finish executing, let the Go runtime take over.
return; return;
} }
// Parent // Parent
start_child(pipenum, &env, syncpipe, &config);
// We must fork to actually enter the PID namespace, use CLONE_PARENT }
// so the child can have the right parent, and we don't need to forward
// the child's exit code or resend its death signal. void nsexec(void)
int child = clone_parent(&env, cloneflags); {
if (child < 0) { int pipenum;
pr_perror("Unable to fork");
exit(1); // if we dont have init pipe, then just return to the parent
} pipenum = get_init_pipe();
// if uid_map and gid_map were specified, writes the data to /proc files if (pipenum == -1) {
if (uidmap_start > 0 && uidmap_len > 0) { return;
char buf[PATH_MAX]; }
if (snprintf(buf, sizeof(buf), "/proc/%d/uid_map", child) < 0) {
pr_perror("failed to construct uid_map file for %d", // Retrieve the netlink header
child); struct nlmsghdr nl_msg_hdr;
exit(1); int len;
}
if ((len = read(pipenum, &nl_msg_hdr, NLMSG_HDRLEN)) != NLMSG_HDRLEN) {
int fd = open(buf, O_RDWR); pr_perror("Invalid netlink header length %d", len);
writedata(fd, data, uidmap_start, uidmap_len); exit(1);
} }
if (gidmap_start > 0 && gidmap_len > 0) { if (nl_msg_hdr.nlmsg_type == NLMSG_ERROR) {
if (is_setgroup == 1) { pr_perror("Failed to read netlink message");
char buf[PATH_MAX]; exit(1);
if (snprintf }
(buf, sizeof(buf), "/proc/%d/setgroups",
child) < 0) { if (nl_msg_hdr.nlmsg_type != INIT_MSG) {
pr_perror pr_perror("Unexpected msg type %d", nl_msg_hdr.nlmsg_type);
("failed to construct setgroups file for %d", exit(1);
child); }
exit(1);
} // Retrieve data
int nl_total_size = NLMSG_PAYLOAD(&nl_msg_hdr, 0);
int fd = open(buf, O_RDWR); char data[nl_total_size];
if (write(fd, "allow", 5) != 5) {
// If the kernel is too old to support /proc/PID/setgroups, if ((len = read(pipenum, data, nl_total_size)) != nl_total_size) {
// write will return ENOENT; this is OK. pr_perror("Failed to read netlink payload, %d != %d", len,
if (errno != ENOENT) { nl_total_size);
pr_perror("failed to write allow to %s", exit(1);
buf); }
exit(1);
} process_nl_attributes(pipenum, data, nl_total_size);
}
}
// write gid mappings
char buf[PATH_MAX];
if (snprintf(buf, sizeof(buf), "/proc/%d/gid_map", child) < 0) {
pr_perror("failed to construct gid_map file for %d",
child);
exit(1);
}
int fd = open(buf, O_RDWR);
writedata(fd, data, gidmap_start, gidmap_len);
}
// Send the sync signal to the child
close(syncpipe[0]);
uint8_t s = 1;
if (write(syncpipe[1], &s, 1) != 1) {
pr_perror("failed to write sync byte to child");
exit(1);
};
// parent to finish the bootstrap process
char child_data[PATH_MAX];
len =
snprintf(child_data, sizeof(child_data), "{ \"pid\" : %d }\n",
child);
if (write(pipenum, child_data, len) != len) {
pr_perror("Unable to send a child pid");
kill(child, SIGKILL);
exit(1);
}
exit(0);
} }