diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 8756877d..d31a3435 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -23,6 +23,7 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/criurpc" "github.com/opencontainers/runc/libcontainer/utils" + "github.com/syndtr/gocapability/capability" "github.com/vishvananda/netlink/nl" ) @@ -268,37 +269,40 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. } func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { - t := "_LIBCONTAINER_INITTYPE=" + string(initStandard) - cloneFlags := c.config.Namespaces.CloneFlags() - if cloneFlags&syscall.CLONE_NEWUSER != 0 { - if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil { - // user mappings are not supported - return nil, err - } - enableSetgroups(cmd.SysProcAttr) - // Default to root user when user namespaces are enabled. 
-		if cmd.SysProcAttr.Credential == nil {
-			cmd.SysProcAttr.Credential = &syscall.Credential{}
+	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
+	nsMaps := make(map[configs.NamespaceType]string)
+	for _, ns := range c.config.Namespaces {
+		if ns.Path != "" {
+			nsMaps[ns.Type] = ns.Path
 		}
 	}
-	cmd.Env = append(cmd.Env, t)
-	cmd.SysProcAttr.Cloneflags = cloneFlags
+	sharePidns := c.config.Namespaces.CloneFlags()&syscall.CLONE_NEWPID == 0 // no new pid ns is cloned: host pid ns or one joined by path
+	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "")
+	if err != nil {
+		return nil, err
+	}
 	return &initProcess{
-		cmd:        cmd,
-		childPipe:  childPipe,
-		parentPipe: parentPipe,
-		manager:    c.cgroupManager,
-		config:     c.newInitConfig(p),
-		container:  c,
-		process:    p,
+		cmd:           cmd,
+		childPipe:     childPipe,
+		parentPipe:    parentPipe,
+		manager:       c.cgroupManager,
+		config:        c.newInitConfig(p),
+		container:     c,
+		process:       p,
+		bootstrapData: data,
+		sharePidns:    sharePidns,
 	}, nil
 }
 
 func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
 	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
+	state, err := c.currentState()
+	if err != nil {
+		return nil, newSystemError(err)
+	}
 	// for setns process, we dont have to set cloneflags as the process namespaces
 	// will only be set via setns syscall
-	data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
+	data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath)
 	if err != nil {
 		return nil, err
 	}
@@ -1069,28 +1073,6 @@ func (c *linuxContainer) currentState() (*State, error) {
 	return state, nil
 }
 
-// bootstrapData encodes the necessary data in netlink binary format as a io.Reader.
-// Consumer can write the data to a bootstrap program such as one that uses
-// nsenter package to bootstrap the container's init process correctly, i.e. with
-// correct namespaces, uid/gid mapping etc.
-func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) { - // create the netlink message - r := nl.NewNetlinkRequest(int(InitMsg), 0) - // write pid - r.AddData(&Int32msg{ - Type: PidAttr, - Value: uint32(pid), - }) - // write console path - if consolePath != "" { - r.AddData(&Bytemsg{ - Type: ConsolePathAttr, - Value: []byte(consolePath), - }) - } - return bytes.NewReader(r.Serialize()), nil -} - // orderNamespacePaths sorts namespace paths into a list of paths that we // can setns in order. func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { @@ -1126,3 +1108,92 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp } return paths, nil } + +func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { + data := bytes.NewBuffer(nil) + for _, im := range idMap { + line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) + if _, err := data.WriteString(line); err != nil { + return nil, err + } + } + return data.Bytes(), nil +} + +// bootstrapData encodes the necessary data in netlink binary format +// as a io.Reader. +// Consumer can write the data to a bootstrap program +// such as one that uses nsenter package to bootstrap the container's +// init process correctly, i.e. with correct namespaces, uid/gid +// mapping etc. 
+func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) {
+	// create the netlink message
+	r := nl.NewNetlinkRequest(int(InitMsg), 0)
+
+	// write cloneFlags
+	r.AddData(&Int32msg{
+		Type:  CloneFlagsAttr,
+		Value: uint32(cloneFlags),
+	})
+
+	// write console path
+	if consolePath != "" {
+		r.AddData(&Bytemsg{
+			Type:  ConsolePathAttr,
+			Value: []byte(consolePath),
+		})
+	}
+
+	// write custom namespace paths
+	if len(nsMaps) > 0 {
+		nsPaths, err := c.orderNamespacePaths(nsMaps)
+		if err != nil {
+			return nil, err
+		}
+		r.AddData(&Bytemsg{
+			Type:  NsPathsAttr,
+			Value: []byte(strings.Join(nsPaths, ",")),
+		})
+	}
+
+	// write uid/gid mappings only when we are not joining an existing user ns
+	_, joinExistingUser := nsMaps[configs.NEWUSER]
+	if !joinExistingUser {
+		// write uid mappings
+		if len(c.config.UidMappings) > 0 {
+			b, err := encodeIDMapping(c.config.UidMappings)
+			if err != nil {
+				return nil, err
+			}
+			r.AddData(&Bytemsg{
+				Type:  UidmapAttr,
+				Value: b,
+			})
+		}
+
+		// write gid mappings
+		if len(c.config.GidMappings) > 0 {
+			b, err := encodeIDMapping(c.config.GidMappings)
+			if err != nil {
+				return nil, err
+			}
+			r.AddData(&Bytemsg{
+				Type:  GidmapAttr,
+				Value: b,
+			})
+			// check if we have CAP_SETGID to setgroup properly
+			pid, err := capability.NewPid(os.Getpid())
+			if err != nil {
+				return nil, err
+			}
+			if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
+				r.AddData(&Boolmsg{
+					Type:  SetgroupAttr,
+					Value: true,
+				})
+			}
+		}
+	}
+
+	return bytes.NewReader(r.Serialize()), nil
+}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index 8abe9191..dd641e87 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -185,25 +185,6 @@ func syncParentHooks(pipe io.ReadWriter) error {
 	return nil
 }
 
-// joinExistingNamespaces gets all the namespace paths specified for the container and
-// does a setns on the namespace fd so that the
current process joins the namespace. -func joinExistingNamespaces(namespaces []configs.Namespace) error { - for _, ns := range namespaces { - if ns.Path != "" { - f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0) - if err != nil { - return err - } - err = system.Setns(f.Fd(), uintptr(ns.Syscall())) - f.Close() - if err != nil { - return err - } - } - } - return nil -} - // setupUser changes the groups, gid, and uid for the user inside the container func setupUser(config *initConfig) error { // Set up defaults. diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 0c3301f2..16630133 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -12,8 +12,12 @@ import ( // The number is randomly chosen to not conflict with known netlink types const ( InitMsg uint16 = 62000 - PidAttr uint16 = 27281 + CloneFlagsAttr uint16 = 27281 ConsolePathAttr uint16 = 27282 + NsPathsAttr uint16 = 27283 + UidmapAttr uint16 = 27284 + GidmapAttr uint16 = 27285 + SetgroupAttr uint16 = 27286 // When syscall.NLA_HDRLEN is in gccgo, take this out. 
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1) ) @@ -60,3 +64,25 @@ func (msg *Bytemsg) Serialize() []byte { func (msg *Bytemsg) Len() int { return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated } + +type Boolmsg struct { + Type uint16 + Value bool +} + +func (msg *Boolmsg) Serialize() []byte { + buf := make([]byte, msg.Len()) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(msg.Len())) + native.PutUint16(buf[2:4], msg.Type) + if msg.Value { + buf[4] = 1 + } else { + buf[4] = 0 + } + return buf +} + +func (msg *Boolmsg) Len() int { + return syscall_NLA_HDRLEN + 1 +} diff --git a/libcontainer/nsenter/nsenter_test.go b/libcontainer/nsenter/nsenter_test.go index 976ae6bb..f7b12be9 100644 --- a/libcontainer/nsenter/nsenter_test.go +++ b/libcontainer/nsenter/nsenter_test.go @@ -3,7 +3,9 @@ package nsenter import ( "bytes" "encoding/json" + "fmt" "io" + "io/ioutil" "os" "os/exec" "strings" @@ -18,35 +20,51 @@ type pid struct { Pid int `json:"Pid"` } -func TestNsenterAlivePid(t *testing.T) { +func TestNsenterValidPaths(t *testing.T) { args := []string{"nsenter-exec"} parent, child, err := newPipe() if err != nil { t.Fatalf("failed to create pipe %v", err) } + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("/proc/%d/ns/pid", os.Getpid()), + } cmd := &exec.Cmd{ Path: os.Args[0], Args: args, ExtraFiles: []*os.File{child}, - Env: []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + Stdout: os.Stdout, + Stderr: os.Stderr, } if err := cmd.Start(); err != nil { t.Fatalf("nsenter failed to start %v", err) } + // write cloneFlags r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) r.AddData(&libcontainer.Int32msg{ - Type: libcontainer.PidAttr, - Value: uint32(os.Getpid()), + Type: libcontainer.CloneFlagsAttr, + Value: uint32(syscall.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: 
libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), }) if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { t.Fatal(err) } + decoder := json.NewDecoder(parent) var pid *pid if err := decoder.Decode(&pid); err != nil { + dir, _ := ioutil.ReadDir(fmt.Sprintf("/proc/%d/ns", os.Getpid())) + for _, d := range dir { + t.Log(d.Name()) + } t.Fatalf("%v", err) } @@ -60,70 +78,43 @@ func TestNsenterAlivePid(t *testing.T) { p.Wait() } -func TestNsenterInvalidPid(t *testing.T) { +func TestNsenterInvalidPaths(t *testing.T) { args := []string{"nsenter-exec"} parent, child, err := newPipe() if err != nil { t.Fatalf("failed to create pipe %v", err) } + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("/proc/%d/ns/pid", -1), + } cmd := &exec.Cmd{ Path: os.Args[0], Args: args, ExtraFiles: []*os.File{child}, - Env: []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, } if err := cmd.Start(); err != nil { - t.Fatal("nsenter exits with a zero exit status") + t.Fatal(err) } + // write cloneFlags r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) r.AddData(&libcontainer.Int32msg{ - Type: libcontainer.PidAttr, - Value: 0, + Type: libcontainer.CloneFlagsAttr, + Value: uint32(syscall.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), }) if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { t.Fatal(err) } if err := cmd.Wait(); err == nil { - t.Fatal("nsenter exits with a zero exit status") - } -} - -func TestNsenterDeadPid(t *testing.T) { - deadCmd := exec.Command("true") - if err := deadCmd.Run(); err != nil { - t.Fatal(err) - } - args := []string{"nsenter-exec"} - parent, child, err := newPipe() - if err != nil { - t.Fatalf("failed to create pipe %v", err) - } - - cmd := &exec.Cmd{ - Path: os.Args[0], - Args: args, - ExtraFiles: []*os.File{child}, 
- Env: []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"}, - } - - if err := cmd.Start(); err != nil { - t.Fatal("nsenter exits with a zero exit status") - } - - r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) - r.AddData(&libcontainer.Int32msg{ - Type: libcontainer.PidAttr, - Value: uint32(deadCmd.Process.Pid), - }) - if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { - t.Fatal(err) - } - - if err := cmd.Wait(); err == nil { - t.Fatal("nsenter exits with a zero exit status") + t.Fatalf("nsenter exits with a zero exit status") } } diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 6634afc4..286c653c 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -4,7 +4,6 @@ #include #include #include - #include #include #include @@ -16,6 +15,14 @@ #include #include #include +#include +#include +#include + +// netlink related +#include +#include +#include #include #include @@ -57,166 +64,246 @@ int setns(int fd, int nstype) #endif #endif -static int clone_parent(jmp_buf * env) __attribute__ ((noinline)); -static int clone_parent(jmp_buf * env) +static int clone_parent(jmp_buf * env, int flags) __attribute__ ((noinline)); +static int clone_parent(jmp_buf * env, int flags) { struct clone_arg ca; int child; ca.env = env; - child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); - + child = + clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, + &ca); return child; } +// get init pipe from the parent. It's used to read bootstrap data, and to +// write pid to after nsexec finishes setting up the environment. 
+static int get_init_pipe() +{ + char buf[PATH_MAX], *initpipe; + int pipenum = -1; + + initpipe = getenv("_LIBCONTAINER_INITPIPE"); + if (initpipe == NULL) { + return -1; + } + + pipenum = atoi(initpipe); + snprintf(buf, sizeof(buf), "%d", pipenum); + if (strcmp(initpipe, buf)) { + pr_perror("Unable to parse _LIBCONTAINER_INITPIPE"); + exit(1); + } + + return pipenum; +} + +// num_namespaces returns the number of additional namespaces to setns. The +// argument is a comma-separated string of namespace paths. +static int num_namespaces(char *nspaths) +{ + int size = 0, i = 0; + + for (i = 0; nspaths[i]; i++) { + if (nspaths[i] == ',') { + size += 1; + } + } + + return size + 1; +} + static uint32_t readint32(char *buf) { return *(uint32_t *) buf; } +static uint8_t readint8(char *buf) +{ + return *(uint8_t *) buf; +} + +static void writedata(int fd, char *data, int start, int len) +{ + int written = 0; + while (written < len) { + size_t nbyte, i; + if ((len - written) < 1024) { + nbyte = len - written; + } else { + nbyte = 1024; + } + i = write(fd, data + start + written, nbyte); + if (i == -1) { + pr_perror("failed to write data to %d", fd); + exit(1); + } + written += i; + } +} + // list of known message types we want to send to bootstrap program // These are defined in libcontainer/message_linux.go -#define INIT_MSG 62000 -#define PID_ATTR 27281 -#define CONSOLE_PATH_ATTR 27282 +#define INIT_MSG 62000 +#define CLONE_FLAGS_ATTR 27281 +#define CONSOLE_PATH_ATTR 27282 +#define NS_PATHS_ATTR 27283 +#define UIDMAP_ATTR 27284 +#define GIDMAP_ATTR 27285 +#define SETGROUP_ATTR 27286 void nsexec() { - char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt", "user" }; - const int num = sizeof(namespaces) / sizeof(char *); jmp_buf env; - char buf[PATH_MAX], *val; - int i, tfd, self_tfd, child, n, len, pipenum, consolefd = -1; - pid_t pid = 0; + int pipenum; - // if we dont have INITTYPE or this is the init process, skip the bootstrap process - val = 
getenv("_LIBCONTAINER_INITTYPE"); - if (val == NULL || strcmp(val, "standard") == 0) { + // if we dont have init pipe, then just return to the parent + pipenum = get_init_pipe(); + if (pipenum == -1) { return; } - if (strcmp(val, "setns") != 0) { - pr_perror("Invalid inittype %s", val); + // Retrieve the netlink header + struct nlmsghdr nl_msg_hdr; + int len; + + if ((len = read(pipenum, &nl_msg_hdr, NLMSG_HDRLEN)) != NLMSG_HDRLEN) { + pr_perror("Failed to read netlink header, got %d instead of %d", + len, NLMSG_HDRLEN); exit(1); } - val = getenv("_LIBCONTAINER_INITPIPE"); - if (val == NULL) { - pr_perror("Child pipe not found"); + if (nl_msg_hdr.nlmsg_type == NLMSG_ERROR) { + pr_perror("failed to read netlink message"); exit(1); } - pipenum = atoi(val); - snprintf(buf, sizeof(buf), "%d", pipenum); - if (strcmp(val, buf)) { - pr_perror("Unable to parse _LIBCONTAINER_INITPIPE"); + if (nl_msg_hdr.nlmsg_type != INIT_MSG) { + pr_perror("unexpected msg type %d", nl_msg_hdr.nlmsg_type); exit(1); } + // Retrieve data + int nl_total_size = NLMSG_PAYLOAD(&nl_msg_hdr, 0); + char data[nl_total_size]; - char nlbuf[NLMSG_HDRLEN]; - struct nlmsghdr *nh; - if ((n = read(pipenum, nlbuf, NLMSG_HDRLEN)) != NLMSG_HDRLEN) { - pr_perror("Failed to read netlink header, got %d", n); + if ((len = read(pipenum, data, nl_total_size)) != nl_total_size) { + pr_perror + ("Failed to read netlink payload, got %d instead of %d", + len, nl_total_size); exit(1); } - - nh = (struct nlmsghdr *)nlbuf; - if (nh->nlmsg_type == NLMSG_ERROR) { - pr_perror("Invalid netlink header message"); - exit(1); - } - if (nh->nlmsg_type != INIT_MSG) { - pr_perror("Unexpected netlink message type %d", nh->nlmsg_type); - exit(1); - } - // read the netlink payload - len = NLMSG_PAYLOAD(nh, 0); - char data[len]; - if ((n = read(pipenum, data, len)) != len) { - pr_perror("Failed to read netlink payload, got %d", n); - exit(1); - } - + // Process the passed attributes int start = 0; - struct nlattr *attr; - while (start < 
len) { - int payload_len; - attr = (struct nlattr *)((void *)data + start); + uint32_t cloneflags = -1; + uint8_t is_setgroup = 0; + int consolefd = -1; + int uidmap_start = -1, uidmap_len = -1; + int gidmap_start = -1, gidmap_len = -1; + int payload_len; + struct nlattr *nlattr; + + while (start < nl_total_size) { + nlattr = (struct nlattr *)(data + start); start += NLA_HDRLEN; - payload_len = attr->nla_len - NLA_HDRLEN; - switch (attr->nla_type) { - case PID_ATTR: - pid = (pid_t) readint32(data + start); - break; - case CONSOLE_PATH_ATTR: - consolefd = open((char *)data + start, O_RDWR); + payload_len = nlattr->nla_len - NLA_HDRLEN; + + if (nlattr->nla_type == CLONE_FLAGS_ATTR) { + cloneflags = readint32(data + start); + } else if (nlattr->nla_type == CONSOLE_PATH_ATTR) { + // get the console path before setns because it may change mnt namespace + consolefd = open(data + start, O_RDWR); if (consolefd < 0) { - pr_perror("Failed to open console %s", (char *)data + start); + pr_perror("Failed to open console %s", + data + start); exit(1); } - break; + } else if (nlattr->nla_type == NS_PATHS_ATTR) { + char nspaths[payload_len + 1]; + + strncpy(nspaths, data + start, payload_len); + nspaths[payload_len] = '\0'; + + // if custom namespaces are required, open all descriptors and perform + // setns on them + int nslen = num_namespaces(nspaths); + int fds[nslen]; + char *nslist[nslen]; + int i; + char *ns, *saveptr; + + for (i = 0; i < nslen; i++) { + char *str = NULL; + + if (i == 0) { + str = nspaths; + } + ns = strtok_r(str, ",", &saveptr); + if (ns == NULL) { + break; + } + fds[i] = open(ns, O_RDONLY); + if (fds[i] == -1) { + pr_perror("Failed to open %s", ns); + exit(1); + } + nslist[i] = ns; + } + + for (i = 0; i < nslen; i++) { + if (setns(fds[i], 0) != 0) { + pr_perror("Failed to setns to %s", + nslist[i]); + exit(1); + } + close(fds[i]); + } + } else if (nlattr->nla_type == UIDMAP_ATTR) { + uidmap_len = payload_len; + uidmap_start = start; + } else if 
(nlattr->nla_type == GIDMAP_ATTR) { + gidmap_len = payload_len; + gidmap_start = start; + } else if (nlattr->nla_type == SETGROUP_ATTR) { + is_setgroup = readint8(data + start); + } else { + pr_perror("unknown netlink message type %d", + nlattr->nla_type); + exit(1); } + start += NLA_ALIGN(payload_len); } - // required pid to be passed - if (pid == 0) { - pr_perror("missing pid"); + // required clone_flags to be passed + if (cloneflags == -1) { + pr_perror("missing clone_flags"); exit(1); } - - /* Check that the specified process exists */ - snprintf(buf, PATH_MAX - 1, "/proc/%d/ns", pid); - tfd = open(buf, O_DIRECTORY | O_RDONLY); - if (tfd == -1) { - pr_perror("Failed to open \"%s\"", buf); + // prepare sync pipe between parent and child. We need this to let the child + // know that the parent has finished setting up + int syncpipe[2] = { -1, -1 }; + if (pipe(syncpipe) != 0) { + pr_perror("failed to setup sync pipe between parent and child"); exit(1); - } - - self_tfd = open("/proc/self/ns", O_DIRECTORY | O_RDONLY); - if (self_tfd == -1) { - pr_perror("Failed to open /proc/self/ns"); - exit(1); - } - - for (i = 0; i < num; i++) { - struct stat st; - struct stat self_st; - int fd; - - /* Symlinks on all namespaces exist for dead processes, but they can't be opened */ - if (fstatat(tfd, namespaces[i], &st, 0) == -1) { - // Ignore nonexistent namespaces. - if (errno == ENOENT) - continue; - } - - /* Skip namespaces we're already part of */ - if (fstatat(self_tfd, namespaces[i], &self_st, 0) != -1 && st.st_ino == self_st.st_ino) { - continue; - } - - fd = openat(tfd, namespaces[i], O_RDONLY); - if (fd == -1) { - pr_perror("Failed to open ns file %s for ns %s", buf, namespaces[i]); - exit(1); - } - // Set the namespace. 
- if (setns(fd, 0) == -1) { - pr_perror("Failed to setns for %s", namespaces[i]); - exit(1); - } - close(fd); - } - - close(self_tfd); - close(tfd); + }; if (setjmp(env) == 1) { // Child + uint8_t s; + + // close the writing side of pipe + close(syncpipe[1]); + + // sync with parent + if (read(syncpipe[0], &s, 1) != 1 || s != 1) { + pr_perror("failed to read sync byte from parent"); + exit(1); + }; if (setsid() == -1) { pr_perror("setsid failed"); exit(1); } + if (consolefd != -1) { if (ioctl(consolefd, TIOCSCTTY, 0) == -1) { pr_perror("ioctl TIOCSCTTY failed"); @@ -243,19 +330,75 @@ void nsexec() // We must fork to actually enter the PID namespace, use CLONE_PARENT // so the child can have the right parent, and we don't need to forward // the child's exit code or resend its death signal. - child = clone_parent(&env); + int child = clone_parent(&env, cloneflags); if (child < 0) { pr_perror("Unable to fork"); exit(1); } + // if uid_map and gid_map were specified, writes the data to /proc files + if (uidmap_start > 0 && uidmap_len > 0) { + char buf[PATH_MAX]; + if (snprintf(buf, sizeof(buf), "/proc/%d/uid_map", child) < 0) { + pr_perror("failed to construct uid_map file for %d", + child); + exit(1); + } - len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", child); + int fd = open(buf, O_RDWR); + writedata(fd, data, uidmap_start, uidmap_len); + } - if (write(pipenum, buf, len) != len) { + if (gidmap_start > 0 && gidmap_len > 0) { + if (is_setgroup == 1) { + char buf[PATH_MAX]; + if (snprintf + (buf, sizeof(buf), "/proc/%d/setgroups", + child) < 0) { + pr_perror + ("failed to construct setgroups file for %d", + child); + exit(1); + } + + int fd = open(buf, O_RDWR); + if (write(fd, "allow", 5) != 5) { + // If the kernel is too old to support /proc/PID/setgroups, + // write will return ENOENT; this is OK. 
+ if (errno != ENOENT) { + pr_perror("failed to write allow to %s", + buf); + exit(1); + } + } + } + // write gid mappings + char buf[PATH_MAX]; + if (snprintf(buf, sizeof(buf), "/proc/%d/gid_map", child) < 0) { + pr_perror("failed to construct gid_map file for %d", + child); + exit(1); + } + + int fd = open(buf, O_RDWR); + writedata(fd, data, gidmap_start, gidmap_len); + } + // Send the sync signal to the child + close(syncpipe[0]); + uint8_t s = 1; + if (write(syncpipe[1], &s, 1) != 1) { + pr_perror("failed to write sync byte to child"); + exit(1); + }; + + // parent to finish the bootstrap process + char child_data[PATH_MAX]; + len = + snprintf(child_data, sizeof(child_data), "{ \"pid\" : %d }\n", + child); + if (write(pipenum, child_data, len) != len) { pr_perror("Unable to send a child pid"); kill(child, SIGKILL); exit(1); } - exit(0); } diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index de2d5f00..1a4d4b04 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -167,14 +167,16 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) { } type initProcess struct { - cmd *exec.Cmd - parentPipe *os.File - childPipe *os.File - config *initConfig - manager cgroups.Manager - container *linuxContainer - fds []string - process *Process + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + config *initConfig + manager cgroups.Manager + container *linuxContainer + fds []string + process *Process + bootstrapData io.Reader + sharePidns bool } func (p *initProcess) pid() int { @@ -185,15 +187,49 @@ func (p *initProcess) externalDescriptors() []string { return p.fds } -func (p *initProcess) start() (err error) { +// execSetns runs the process that executes C code to perform the setns calls +// because setns support requires the C process to fork off a child and perform the setns +// before the go runtime boots, we wait on the process to die and receive the child's pid +// over the provided pipe. 
+// This is called by initProcess.start function +func (p *initProcess) execSetns() error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return err + } + if !status.Success() { + p.cmd.Wait() + return &exec.ExitError{ProcessState: status} + } + var pid *pid + if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { + p.cmd.Wait() + return err + } + process, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + p.cmd.Process = process + return nil +} + +func (p *initProcess) start() error { defer p.parentPipe.Close() - err = p.cmd.Start() + err := p.cmd.Start() p.process.ops = p p.childPipe.Close() if err != nil { p.process.ops = nil return newSystemError(err) } + if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { + return err + } + if err := p.execSetns(); err != nil { + return newSystemError(err) + } // Save the standard descriptor names before the container process // can potentially move them (e.g., via dup2()). If we don't do this now, // we won't know at checkpoint time which file descriptor to look up. 
@@ -317,7 +353,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) { return p.cmd.ProcessState, err } // we should kill all processes in cgroup when init is died if we use host PID namespace - if p.cmd.SysProcAttr.Cloneflags&syscall.CLONE_NEWPID == 0 { + if p.sharePidns { killCgroupProcesses(p.manager) } return p.cmd.ProcessState, nil diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 27ad8caf..935b2eea 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -55,10 +55,6 @@ func (l *linuxStandardInit) Init() error { return err } - // join any namespaces via a path to the namespace fd if provided - if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil { - return err - } var console *linuxConsole if l.config.Console != "" { console = newConsoleFromPath(l.config.Console) @@ -66,9 +62,6 @@ func (l *linuxStandardInit) Init() error { return err } } - if _, err := syscall.Setsid(); err != nil { - return err - } if console != nil { if err := system.Setctty(); err != nil { return err