nsenter: remove a proxy process
Currently nsexec() creates a proxy process in order to enter a pid namespace. This is undesirable, because the proxy then has to forward the exit code and signals. Instead, we can use CLONE_PARENT to fork a process that has the right parent. Signed-off-by: Andrey Vagin <avagin@openvz.org>
This commit is contained in:
parent
11b2dab1c5
commit
82367938b7
|
@ -16,6 +16,10 @@ import (
|
|||
"github.com/docker/libcontainer/system"
|
||||
)
|
||||
|
||||
// pid is the JSON message that ExecIn decodes from the parent pipe to learn
// the ID of the process it then looks up, places into cgroups and returns.
type pid struct {
|
||||
// Pid is the process ID carried by the message.
Pid int `json:"Pid"`
|
||||
}
|
||||
|
||||
// ExecIn reexec's cmd with _LIBCONTAINER_INITPID=PID so that it is able to run the
|
||||
// setns code in a single threaded environment joining the existing containers' namespaces.
|
||||
func ExecIn(args []string, env []string, console string, cmd *exec.Cmd, container *configs.Config, state *configs.State) (int, error) {
|
||||
|
@ -36,13 +40,38 @@ func ExecIn(args []string, env []string, console string, cmd *exec.Cmd, containe
|
|||
}
|
||||
child.Close()
|
||||
|
||||
s, err := cmd.Process.Wait()
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
if !s.Success() {
|
||||
return -1, &exec.ExitError{s}
|
||||
}
|
||||
|
||||
decoder := json.NewDecoder(parent)
|
||||
var pid *pid
|
||||
|
||||
if err := decoder.Decode(&pid); err != nil {
|
||||
return -1, err
|
||||
}
|
||||
|
||||
p, err := os.FindProcess(pid.Pid)
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
|
||||
terminate := func(terr error) (int, error) {
|
||||
// TODO: log the errors for kill and wait
|
||||
cmd.Process.Kill()
|
||||
cmd.Wait()
|
||||
p.Kill()
|
||||
p.Wait()
|
||||
return -1, terr
|
||||
}
|
||||
|
||||
// Enter cgroups.
|
||||
if err := EnterCgroups(state, pid.Pid); err != nil {
|
||||
return terminate(err)
|
||||
}
|
||||
|
||||
encoder := json.NewEncoder(parent)
|
||||
|
||||
if err := encoder.Encode(container); err != nil {
|
||||
|
@ -58,12 +87,7 @@ func ExecIn(args []string, env []string, console string, cmd *exec.Cmd, containe
|
|||
return terminate(err)
|
||||
}
|
||||
|
||||
// Enter cgroups.
|
||||
if err := EnterCgroups(state, cmd.Process.Pid); err != nil {
|
||||
return terminate(err)
|
||||
}
|
||||
|
||||
return cmd.Process.Pid, nil
|
||||
return pid.Pid, nil
|
||||
}
|
||||
|
||||
// Finalize entering into a container and execute a specified command
|
||||
|
|
|
@ -3,7 +3,9 @@
|
|||
package nsenter
|
||||
|
||||
/*
|
||||
__attribute__((constructor)) init() {
|
||||
#cgo CFLAGS: -Wall
|
||||
extern void nsexec();
|
||||
void __attribute__((constructor)) init() {
|
||||
nsexec();
|
||||
}
|
||||
*/
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
#define _GNU_SOURCE
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
|
@ -11,13 +12,32 @@
|
|||
#include <sys/stat.h>
|
||||
#include <fcntl.h>
|
||||
#include <signal.h>
|
||||
#include <setjmp.h>
|
||||
#include <sched.h>
|
||||
#include <signal.h>
|
||||
|
||||
/* All arguments should be above stack, because it grows down */
|
||||
/*
 * clone_arg bundles the child stack and the jump target that clone_parent()
 * passes to clone() and that child_func() receives as its argument.
 *
 * NOTE(review): stack[] is declared with 8-byte alignment; confirm this
 * satisfies the stack alignment expected on every target architecture.
 */
struct clone_arg {
|
||||
/*
|
||||
* Reserve some space for clone() to locate arguments
|
||||
* and retcode in this place
|
||||
*/
|
||||
char stack[4096] __attribute__((aligned (8)));
|
||||
/* Zero-length marker placed right after stack[]; clone_parent() passes it to clone() as the child stack pointer. */
char stack_ptr[0];
|
||||
/* Jump buffer that child_func() longjmp()s to. */
jmp_buf *env;
|
||||
};
|
||||
|
||||
/*
 * child_func is the entry point handed to clone() by clone_parent().
 * _arg is the struct clone_arg supplied as clone()'s argument. The function
 * jumps to the saved jmp_buf with value 1, so the matching setjmp() call
 * returns 1; it has no return statement of its own.
 */
static int child_func(void *_arg)
|
||||
{
|
||||
struct clone_arg *arg = (struct clone_arg *) _arg;
|
||||
/* Resume at the setjmp() point recorded in *arg->env. */
longjmp(*arg->env, 1);
|
||||
}
|
||||
|
||||
#define pr_perror(fmt, ...) fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__)
|
||||
|
||||
// Use raw setns syscall for versions of glibc that don't include it (namely glibc-2.12)
|
||||
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
|
||||
#define _GNU_SOURCE
|
||||
#include <sched.h>
|
||||
#include "syscall.h"
|
||||
#ifdef SYS_setns
|
||||
int setns(int fd, int nstype)
|
||||
|
@ -27,12 +47,25 @@ int setns(int fd, int nstype)
|
|||
#endif
|
||||
#endif
|
||||
|
||||
/*
 * clone_parent creates a child process with clone() using
 * CLONE_PARENT | SIGCHLD. The child starts in child_func(), which
 * longjmp()s to *env, so the caller must have called setjmp() on env
 * beforehand. The result of clone() is returned unchanged; nsexec() treats
 * a negative value as a failure.
 *
 * Declared noinline; the reason is not stated here.
 */
static int clone_parent(jmp_buf *env) __attribute__ ((noinline));
|
||||
static int clone_parent(jmp_buf *env)
|
||||
{
|
||||
struct clone_arg ca;
|
||||
int child;
|
||||
|
||||
/* Hand the jump target to child_func() through the clone argument. */
ca.env = env;
|
||||
/* ca.stack_ptr sits just past ca.stack, which serves as the child stack. */
child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
|
||||
|
||||
return child;
|
||||
}
|
||||
|
||||
void nsexec()
|
||||
{
|
||||
char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt" };
|
||||
const int num = sizeof(namespaces) / sizeof(char *);
|
||||
jmp_buf env;
|
||||
char buf[PATH_MAX], *val;
|
||||
int child, i, tfd;
|
||||
int i, tfd, child, len;
|
||||
pid_t pid;
|
||||
|
||||
val = getenv("_LIBCONTAINER_INITPID");
|
||||
|
@ -78,31 +111,24 @@ void nsexec()
|
|||
close(fd);
|
||||
}
|
||||
|
||||
child = fork();
|
||||
if (setjmp(env) == 1) {
|
||||
// Finish executing, let the Go runtime take over.
|
||||
return;
|
||||
}
|
||||
|
||||
child = clone_parent(&env);
|
||||
if (child < 0) {
|
||||
pr_perror("Unable to fork");
|
||||
exit(1);
|
||||
}
|
||||
// We must fork to actually enter the PID namespace.
|
||||
if (child == 0) {
|
||||
// Finish executing, let the Go runtime take over.
|
||||
return;
|
||||
} else {
|
||||
// Parent, wait for the child.
|
||||
int status = 0;
|
||||
if (waitpid(child, &status, 0) == -1) {
|
||||
pr_perror("Failed to waitpid");
|
||||
exit(1);
|
||||
}
|
||||
// Forward the child's exit code or re-send its death signal.
|
||||
if (WIFEXITED(status)) {
|
||||
exit(WEXITSTATUS(status));
|
||||
} else if (WIFSIGNALED(status)) {
|
||||
kill(getpid(), WTERMSIG(status));
|
||||
}
|
||||
|
||||
len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", child);
|
||||
|
||||
if (write(3, buf, len) != len) {
|
||||
pr_perror("Unable to send a child pid");
|
||||
kill(child, SIGKILL);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
return;
|
||||
exit(0);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue