Merge pull request #1237 from hqhq/fix_sync_race

Fix race condition when sync with child and grandchild
This commit is contained in:
Michael Crosby 2017-02-20 17:16:43 -08:00 committed by GitHub
commit 8438b26e9f
1 changed files with 75 additions and 35 deletions

View File

@ -33,7 +33,8 @@ enum sync_t {
SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
SYNC_CHILD_READY = 0x44, /* The grandchild is ready to return. */ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
/* XXX: This doesn't help with segfaults and other such issues. */ /* XXX: This doesn't help with segfaults and other such issues. */
SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
@ -413,7 +414,7 @@ void nsexec(void)
{ {
int pipenum; int pipenum;
jmp_buf env; jmp_buf env;
int syncpipe[2]; int sync_child_pipe[2], sync_grandchild_pipe[2];
struct nlconfig_t config = {0}; struct nlconfig_t config = {0};
/* /*
@ -433,9 +434,16 @@ void nsexec(void)
nl_parse(pipenum, &config); nl_parse(pipenum, &config);
/* Pipe so we can tell the child when we've finished setting up. */ /* Pipe so we can tell the child when we've finished setting up. */
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0) if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
bail("failed to setup sync pipe between parent and child"); bail("failed to setup sync pipe between parent and child");
/*
* We need a new socketpair to sync with grandchild so we don't have
* race condition with child.
*/
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
bail("failed to setup sync pipe between parent and grandchild");
/* TODO: Currently we aren't dealing with child deaths properly. */ /* TODO: Currently we aren't dealing with child deaths properly. */
/* /*
@ -494,9 +502,10 @@ void nsexec(void)
* process. * process.
*/ */
case JUMP_PARENT: { case JUMP_PARENT: {
int len, ready = 0; int len;
pid_t child; pid_t child;
char buf[JSON_MAX]; char buf[JSON_MAX];
bool ready = false;
/* For debugging. */ /* For debugging. */
prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0); prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);
@ -513,26 +522,23 @@ void nsexec(void)
* ready, so we can receive all possible error codes * ready, so we can receive all possible error codes
* generated by children. * generated by children.
*/ */
while (ready < 2) { while (!ready) {
enum sync_t s; enum sync_t s;
int ret;
/* This doesn't need to be global, we're in the parent. */ syncfd = sync_child_pipe[1];
int syncfd = syncpipe[1]; close(sync_child_pipe[0]);
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with child: next state"); bail("failed to sync with child: next state");
switch (s) { switch (s) {
case SYNC_ERR: { case SYNC_ERR:
/* We have to mirror the error code of the child. */ /* We have to mirror the error code of the child. */
int ret;
if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret)) if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
bail("failed to sync with child: read(error code)"); bail("failed to sync with child: read(error code)");
exit(ret); exit(ret);
}
break;
case SYNC_USERMAP_PLS: case SYNC_USERMAP_PLS:
/* Enable setgroups(2) if we've been asked to. */ /* Enable setgroups(2) if we've been asked to. */
if (config.is_setgroup) if (config.is_setgroup)
@ -548,11 +554,6 @@ void nsexec(void)
bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
} }
break; break;
case SYNC_USERMAP_ACK:
/* We should _never_ receive acks. */
kill(child, SIGKILL);
bail("failed to sync with child: unexpected SYNC_USERMAP_ACK");
break;
case SYNC_RECVPID_PLS: { case SYNC_RECVPID_PLS: {
pid_t old = child; pid_t old = child;
@ -570,20 +571,46 @@ void nsexec(void)
bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
} }
} }
ready++;
break;
case SYNC_RECVPID_ACK:
/* We should _never_ receive acks. */
kill(child, SIGKILL);
bail("failed to sync with child: unexpected SYNC_RECVPID_ACK");
break; break;
case SYNC_CHILD_READY: case SYNC_CHILD_READY:
ready++; ready = true;
break; break;
default: default:
bail("unexpected sync value"); bail("unexpected sync value: %u", s);
}
}
/* Now sync with grandchild. */
ready = false;
while (!ready) {
enum sync_t s;
int ret;
syncfd = sync_grandchild_pipe[1];
close(sync_grandchild_pipe[0]);
s = SYNC_GRANDCHILD;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_GRANDCHILD)");
}
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with child: next state");
switch (s) {
case SYNC_ERR:
/* We have to mirror the error code of the child. */
if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
bail("failed to sync with child: read(error code)");
exit(ret);
case SYNC_CHILD_READY:
ready = true;
break; break;
default:
bail("unexpected sync value: %u", s);
} }
} }
@ -615,7 +642,8 @@ void nsexec(void)
enum sync_t s; enum sync_t s;
/* We're in a child and thus need to tell the parent if we die. */ /* We're in a child and thus need to tell the parent if we die. */
syncfd = syncpipe[0]; syncfd = sync_child_pipe[0];
close(sync_child_pipe[1]);
/* For debugging. */ /* For debugging. */
prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0); prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
@ -700,6 +728,12 @@ void nsexec(void)
bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
} }
s = SYNC_CHILD_READY;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with parent: write(SYNC_CHILD_READY)");
}
/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
exit(0); exit(0);
} }
@ -718,11 +752,19 @@ void nsexec(void)
enum sync_t s; enum sync_t s;
/* We're in a child and thus need to tell the parent if we die. */ /* We're in a child and thus need to tell the parent if we die. */
syncfd = syncpipe[0]; syncfd = sync_grandchild_pipe[0];
close(sync_grandchild_pipe[1]);
close(sync_child_pipe[0]);
close(sync_child_pipe[1]);
/* For debugging. */ /* For debugging. */
prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0); prctl(PR_SET_NAME, (unsigned long) "runc:[2:INIT]", 0, 0, 0);
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
if (s != SYNC_GRANDCHILD)
bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
if (setsid() < 0) if (setsid() < 0)
bail("setsid failed"); bail("setsid failed");
@ -740,8 +782,7 @@ void nsexec(void)
bail("failed to sync with patent: write(SYNC_CHILD_READY)"); bail("failed to sync with patent: write(SYNC_CHILD_READY)");
/* Close sync pipes. */ /* Close sync pipes. */
close(syncpipe[0]); close(sync_grandchild_pipe[0]);
close(syncpipe[1]);
/* Free netlink data. */ /* Free netlink data. */
nl_free(&config); nl_free(&config);
@ -751,7 +792,6 @@ void nsexec(void)
} }
default: default:
bail("unexpected jump value"); bail("unexpected jump value");
break;
} }
/* Should never be reached. */ /* Should never be reached. */