diff --git a/libcontainer/configs/namespaces_unix.go b/libcontainer/configs/namespaces_unix.go index 7bc90854..b9c820d0 100644 --- a/libcontainer/configs/namespaces_unix.go +++ b/libcontainer/configs/namespaces_unix.go @@ -2,7 +2,11 @@ package configs -import "fmt" +import ( + "fmt" + "os" + "sync" +) const ( NEWNET NamespaceType = "NEWNET" @@ -13,6 +17,51 @@ const ( NEWUSER NamespaceType = "NEWUSER" ) +var ( + nsLock sync.Mutex + supportedNamespaces = make(map[NamespaceType]bool) +) + +// nsToFile converts the namespace type to its filename +func nsToFile(ns NamespaceType) string { + switch ns { + case NEWNET: + return "net" + case NEWNS: + return "mnt" + case NEWPID: + return "pid" + case NEWIPC: + return "ipc" + case NEWUSER: + return "user" + case NEWUTS: + return "uts" + } + return "" +} + +// IsNamespaceSupported returns whether a namespace is available or +// not +func IsNamespaceSupported(ns NamespaceType) bool { + nsLock.Lock() + defer nsLock.Unlock() + supported, ok := supportedNamespaces[ns] + if ok { + return supported + } + nsFile := nsToFile(ns) + // if the namespace type is unknown, just return false + if nsFile == "" { + return false + } + _, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile)) + // a namespace is supported if it exists and we have permissions to read it + supported = err == nil + supportedNamespaces[ns] = supported + return supported +} + func NamespaceTypes() []NamespaceType { return []NamespaceType{ NEWNET, @@ -35,26 +84,7 @@ func (n *Namespace) GetPath(pid int) string { if n.Path != "" { return n.Path } - return fmt.Sprintf("/proc/%d/ns/%s", pid, n.file()) -} - -func (n *Namespace) file() string { - file := "" - switch n.Type { - case NEWNET: - file = "net" - case NEWNS: - file = "mnt" - case NEWPID: - file = "pid" - case NEWIPC: - file = "ipc" - case NEWUSER: - file = "user" - case NEWUTS: - file = "uts" - } - return file + return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type)) } func (n *Namespaces) Remove(t NamespaceType) bool { @@ -87,3 +117,11 @@ func (n *Namespaces) index(t NamespaceType) int { func (n *Namespaces) Contains(t NamespaceType) bool { return n.index(t) != -1 } + +func (n *Namespaces) PathOf(t NamespaceType) string { + i := n.index(t) + if i == -1 { + return "" + } + return (*n)[i].Path +} diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 253ef00a..d31a3435 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -23,6 +23,7 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/criurpc" "github.com/opencontainers/runc/libcontainer/utils" + "github.com/syndtr/gocapability/capability" "github.com/vishvananda/netlink/nl" ) @@ -268,37 +269,40 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. } func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { - t := "_LIBCONTAINER_INITTYPE=" + string(initStandard) - cloneFlags := c.config.Namespaces.CloneFlags() - if cloneFlags&syscall.CLONE_NEWUSER != 0 { - if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil { - // user mappings are not supported - return nil, err - } - enableSetgroups(cmd.SysProcAttr) - // Default to root user when user namespaces are enabled. - if cmd.SysProcAttr.Credential == nil { - cmd.SysProcAttr.Credential = &syscall.Credential{} + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) + nsMaps := make(map[configs.NamespaceType]string) + for _, ns := range c.config.Namespaces { + if ns.Path != "" { + nsMaps[ns.Type] = ns.Path } } - cmd.Env = append(cmd.Env, t) - cmd.SysProcAttr.Cloneflags = cloneFlags + _, sharePidns := nsMaps[configs.NEWPID] + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "") + if err != nil { + return nil, err + } return &initProcess{ - cmd: cmd, - childPipe: childPipe, - parentPipe: parentPipe, - manager: c.cgroupManager, - config: c.newInitConfig(p), - container: c, - process: p, + cmd: cmd, + childPipe: childPipe, + parentPipe: parentPipe, + manager: c.cgroupManager, + config: c.newInitConfig(p), + container: c, + process: p, + bootstrapData: data, + sharePidns: sharePidns, }, nil } func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) + state, err := c.currentState() + if err != nil { + return nil, newSystemError(err) + } // for setns process, we dont have to set cloneflags as the process namespaces // will only be set via setns syscall - data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath) + data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath) if err != nil { return nil, err } @@ -1069,18 +1073,69 @@ func (c *linuxContainer) currentState() (*State, error) { return state, nil } -// bootstrapData encodes the necessary data in netlink binary format as a io.Reader. -// Consumer can write the data to a bootstrap program such as one that uses -// nsenter package to bootstrap the container's init process correctly, i.e. with -// correct namespaces, uid/gid mapping etc. -func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) { +// orderNamespacePaths sorts namespace paths into a list of paths that we +// can setns in order. +func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { + paths := []string{} + nsTypes := []configs.NamespaceType{ + configs.NEWIPC, + configs.NEWUTS, + configs.NEWNET, + configs.NEWPID, + configs.NEWNS, + } + // join userns if the init process explicitly requires NEWUSER + if c.config.Namespaces.Contains(configs.NEWUSER) { + nsTypes = append(nsTypes, configs.NEWUSER) + } + for _, nsType := range nsTypes { + if p, ok := namespaces[nsType]; ok && p != "" { + // check if the requested namespace is supported + if !configs.IsNamespaceSupported(nsType) { + return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType)) + } + // only set to join this namespace if it exists + if _, err := os.Lstat(p); err != nil { + return nil, newSystemError(err) + } + // do not allow namespace path with comma as we use it to separate + // the namespace paths + if strings.ContainsRune(p, ',') { + return nil, newSystemError(fmt.Errorf("invalid path %s", p)) + } + paths = append(paths, p) + } + } + return paths, nil +} + +func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { + data := bytes.NewBuffer(nil) + for _, im := range idMap { + line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) + if _, err := data.WriteString(line); err != nil { + return nil, err + } + } + return data.Bytes(), nil +} + +// bootstrapData encodes the necessary data in netlink binary format +// as a io.Reader. +// Consumer can write the data to a bootstrap program +// such as one that uses nsenter package to bootstrap the container's +// init process correctly, i.e. with correct namespaces, uid/gid +// mapping etc. +func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) { // create the netlink message r := nl.NewNetlinkRequest(int(InitMsg), 0) - // write pid + + // write cloneFlags r.AddData(&Int32msg{ - Type: PidAttr, - Value: uint32(pid), + Type: CloneFlagsAttr, + Value: uint32(cloneFlags), }) + // write console path if consolePath != "" { r.AddData(&Bytemsg{ @@ -1088,5 +1143,57 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath Value: []byte(consolePath), }) } + + // write custom namespace paths + if len(nsMaps) > 0 { + nsPaths, err := c.orderNamespacePaths(nsMaps) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: NsPathsAttr, + Value: []byte(strings.Join(nsPaths, ",")), + }) + } + + // write namespace paths only when we are not joining an existing user ns + _, joinExistingUser := nsMaps[configs.NEWUSER] + if !joinExistingUser { + // write uid mappings + if len(c.config.UidMappings) > 0 { + b, err := encodeIDMapping(c.config.UidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: UidmapAttr, + Value: b, + }) + } + + // write gid mappings + if len(c.config.GidMappings) > 0 { + b, err := encodeIDMapping(c.config.UidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: GidmapAttr, + Value: b, + }) + // check if we have CAP_SETGID to setgroup properly + pid, err := capability.NewPid(os.Getpid()) + if err != nil { + return nil, err + } + if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) + } + } + } + return bytes.NewReader(r.Serialize()), nil } diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 8abe9191..dd641e87 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -185,25 +185,6 @@ func syncParentHooks(pipe io.ReadWriter) error { return nil } -// joinExistingNamespaces gets all the namespace paths specified for the container and -// does a setns on the namespace fd so that the current process joins the namespace. -func joinExistingNamespaces(namespaces []configs.Namespace) error { - for _, ns := range namespaces { - if ns.Path != "" { - f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0) - if err != nil { - return err - } - err = system.Setns(f.Fd(), uintptr(ns.Syscall())) - f.Close() - if err != nil { - return err - } - } - } - return nil -} - // setupUser changes the groups, gid, and uid for the user inside the container func setupUser(config *initConfig) error { // Set up defaults. diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index d7f07e95..22a9afbf 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -2,10 +2,12 @@ package integration import ( "bytes" + "fmt" "io/ioutil" "os" "os/exec" "path/filepath" + "reflect" "strconv" "strings" "syscall" @@ -1209,7 +1211,6 @@ func TestRootfsPropagationSlaveMount(t *testing.T) { defer stdinW2.Close() ok(t, err) - // Wait for process stdinW2.Close() waitProcess(pconfig2, t) stdinW.Close() @@ -1375,3 +1376,195 @@ func TestPIDHost(t *testing.T) { t.Fatalf("ipc link not equal to host link %q %q", actual, l) } } + +func TestInitJoinPID(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + // Execute a long-running container + container1, err := newContainer(newTemplateConfig(rootfs)) + ok(t, err) + defer container1.Destroy() + + stdinR1, stdinW1, err := os.Pipe() + ok(t, err) + init1 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR1, + } + err = container1.Start(init1) + stdinR1.Close() + defer stdinW1.Close() + ok(t, err) + + // get the state of the first container + state1, err := container1.State() + ok(t, err) + pidns1 := state1.NamespacePaths[configs.NEWPID] + + // Start a container inside the existing pidns but with different cgroups + config2 := newTemplateConfig(rootfs) + config2.Namespaces.Add(configs.NEWPID, pidns1) + config2.Cgroups.Path = "integration/test2" + container2, err := newContainerWithName("testCT2", config2) + ok(t, err) + defer container2.Destroy() + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + init2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR2, + } + err = container2.Start(init2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + // get the state of the second container + state2, err := container2.State() + ok(t, err) + + ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state1.InitProcessPid)) + ok(t, err) + ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state2.InitProcessPid)) + ok(t, err) + if ns1 != ns2 { + t.Errorf("pidns(%s), wanted %s", ns2, ns1) + } + + // check that namespaces are not the same + if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) { + t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths, + state1.NamespacePaths) + } + // check that pidns is joined correctly. The initial container process list + // should contain the second container's init process + buffers := newStdBuffers() + ps := &libcontainer.Process{ + Cwd: "/", + Args: []string{"ps"}, + Env: standardEnvironment, + Stdout: buffers.Stdout, + } + err = container1.Start(ps) + ok(t, err) + waitProcess(ps, t) + + // Stop init processes one by one. Stop the second container should + // not stop the first. + stdinW2.Close() + waitProcess(init2, t) + stdinW1.Close() + waitProcess(init1, t) + + out := strings.TrimSpace(buffers.Stdout.String()) + // output of ps inside the initial PID namespace should have + // 1 line of header, + // 2 lines of init processes, + // 1 line of ps process + if len(strings.Split(out, "\n")) != 4 { + t.Errorf("unexpected running process, output %q", out) + } +} + +func TestInitJoinNetworkAndUser(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + // Execute a long-running container + config1 := newTemplateConfig(rootfs) + config1.UidMappings = []configs.IDMap{{0, 0, 1000}} + config1.GidMappings = []configs.IDMap{{0, 0, 1000}} + config1.Namespaces = append(config1.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + container1, err := newContainer(config1) + ok(t, err) + defer container1.Destroy() + + stdinR1, stdinW1, err := os.Pipe() + ok(t, err) + init1 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR1, + } + err = container1.Start(init1) + stdinR1.Close() + defer stdinW1.Close() + ok(t, err) + + // get the state of the first container + state1, err := container1.State() + ok(t, err) + netns1 := state1.NamespacePaths[configs.NEWNET] + userns1 := state1.NamespacePaths[configs.NEWUSER] + + // Start a container inside the existing pidns but with different cgroups + rootfs2, err := newRootfs() + ok(t, err) + defer remove(rootfs2) + + config2 := newTemplateConfig(rootfs2) + config2.UidMappings = []configs.IDMap{{0, 0, 1000}} + config2.GidMappings = []configs.IDMap{{0, 0, 1000}} + config2.Namespaces.Add(configs.NEWNET, netns1) + config2.Namespaces.Add(configs.NEWUSER, userns1) + config2.Cgroups.Path = "integration/test2" + container2, err := newContainerWithName("testCT2", config2) + ok(t, err) + defer container2.Destroy() + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + init2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR2, + } + err = container2.Start(init2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + // get the state of the second container + state2, err := container2.State() + ok(t, err) + + for _, ns := range []string{"net", "user"} { + ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state1.InitProcessPid, ns)) + ok(t, err) + ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state2.InitProcessPid, ns)) + ok(t, err) + if ns1 != ns2 { + t.Errorf("%s(%s), wanted %s", ns, ns2, ns1) + } + } + + // check that namespaces are not the same + if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) { + t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths, + state1.NamespacePaths) + } + // Stop init processes one by one. Stop the second container should + // not stop the first. + stdinW2.Close() + waitProcess(init2, t) + stdinW1.Close() + waitProcess(init1, t) +} diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go index 8327362f..98dd2ac0 100644 --- a/libcontainer/integration/execin_test.go +++ b/libcontainer/integration/execin_test.go @@ -2,6 +2,7 @@ package integration import ( "bytes" + "fmt" "io" "os" "os/exec" @@ -12,6 +13,7 @@ import ( "time" "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/configs" ) func TestExecIn(t *testing.T) { @@ -404,3 +406,62 @@ func TestExecInOomScoreAdj(t *testing.T) { t.Fatalf("expected oomScoreAdj to be %d, got %s", config.OomScoreAdj, oomScoreAdj) } } + +func TestExecInUserns(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + config.UidMappings = []configs.IDMap{{0, 0, 1000}} + config.GidMappings = []configs.IDMap{{0, 0, 1000}} + config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + } + err = container.Start(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + initPID, err := process.Pid() + ok(t, err) + initUserns, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/user", initPID)) + ok(t, err) + + buffers := newStdBuffers() + process2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"readlink", "/proc/self/ns/user"}, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + }, + Stdout: buffers.Stdout, + Stderr: os.Stderr, + } + err = container.Start(process2) + ok(t, err) + waitProcess(process2, t) + stdinW.Close() + waitProcess(process, t) + + if out := strings.TrimSpace(buffers.Stdout.String()); out != initUserns { + t.Errorf("execin userns(%s), wanted %s", out, initUserns) + } +} diff --git a/libcontainer/integration/utils_test.go b/libcontainer/integration/utils_test.go index 3dcd0bb1..e2ca10e0 100644 --- a/libcontainer/integration/utils_test.go +++ b/libcontainer/integration/utils_test.go @@ -92,13 +92,15 @@ func copyBusybox(dest string) error { } func newContainer(config *configs.Config) (libcontainer.Container, error) { - f := factory + return newContainerWithName("testCT", config) +} +func newContainerWithName(name string, config *configs.Config) (libcontainer.Container, error) { + f := factory if config.Cgroups != nil && config.Cgroups.Parent == "system.slice" { f = systemdFactory } - - return f.Create("testCT", config) + return f.Create(name, config) } // runContainer runs the container with the specific config and arguments diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 0c3301f2..16630133 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -12,8 +12,12 @@ import ( // The number is randomly chosen to not conflict with known netlink types const ( InitMsg uint16 = 62000 - PidAttr uint16 = 27281 + CloneFlagsAttr uint16 = 27281 ConsolePathAttr uint16 = 27282 + NsPathsAttr uint16 = 27283 + UidmapAttr uint16 = 27284 + GidmapAttr uint16 = 27285 + SetgroupAttr uint16 = 27286 // When syscall.NLA_HDRLEN is in gccgo, take this out. syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1) ) @@ -60,3 +64,25 @@ func (msg *Bytemsg) Serialize() []byte { func (msg *Bytemsg) Len() int { return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated } + +type Boolmsg struct { + Type uint16 + Value bool +} + +func (msg *Boolmsg) Serialize() []byte { + buf := make([]byte, msg.Len()) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(msg.Len())) + native.PutUint16(buf[2:4], msg.Type) + if msg.Value { + buf[4] = 1 + } else { + buf[4] = 0 + } + return buf +} + +func (msg *Boolmsg) Len() int { + return syscall_NLA_HDRLEN + 1 +} diff --git a/libcontainer/nsenter/nsenter_test.go b/libcontainer/nsenter/nsenter_test.go index 976ae6bb..f7b12be9 100644 --- a/libcontainer/nsenter/nsenter_test.go +++ b/libcontainer/nsenter/nsenter_test.go @@ -3,7 +3,9 @@ package nsenter import ( "bytes" "encoding/json" + "fmt" "io" + "io/ioutil" "os" "os/exec" "strings" @@ -18,35 +20,51 @@ type pid struct { Pid int `json:"Pid"` } -func TestNsenterAlivePid(t *testing.T) { +func TestNsenterValidPaths(t *testing.T) { args := []string{"nsenter-exec"} parent, child, err := newPipe() if err != nil { t.Fatalf("failed to create pipe %v", err) } + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("/proc/%d/ns/pid", os.Getpid()), + } cmd := &exec.Cmd{ Path: os.Args[0], Args: args, ExtraFiles: []*os.File{child}, - Env: []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + Stdout: os.Stdout, + Stderr: os.Stderr, } if err := cmd.Start(); err != nil { t.Fatalf("nsenter failed to start %v", err) } + // write cloneFlags r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) r.AddData(&libcontainer.Int32msg{ - Type: libcontainer.PidAttr, - Value: uint32(os.Getpid()), + Type: libcontainer.CloneFlagsAttr, + Value: uint32(syscall.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), }) if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { t.Fatal(err) } + decoder := json.NewDecoder(parent) var pid *pid if err := decoder.Decode(&pid); err != nil { + dir, _ := ioutil.ReadDir(fmt.Sprintf("/proc/%d/ns", os.Getpid())) + for _, d := range dir { + t.Log(d.Name()) + } t.Fatalf("%v", err) } @@ -60,70 +78,43 @@ func TestNsenterAlivePid(t *testing.T) { p.Wait() } -func TestNsenterInvalidPid(t *testing.T) { +func TestNsenterInvalidPaths(t *testing.T) { args := []string{"nsenter-exec"} parent, child, err := newPipe() if err != nil { t.Fatalf("failed to create pipe %v", err) } + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("/proc/%d/ns/pid", -1), + } cmd := &exec.Cmd{ Path: os.Args[0], Args: args, ExtraFiles: []*os.File{child}, - Env: []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, } if err := cmd.Start(); err != nil { - t.Fatal("nsenter exits with a zero exit status") + t.Fatal(err) } + // write cloneFlags r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) r.AddData(&libcontainer.Int32msg{ - Type: libcontainer.PidAttr, - Value: 0, + Type: libcontainer.CloneFlagsAttr, + Value: uint32(syscall.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), }) if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { t.Fatal(err) } if err := cmd.Wait(); err == nil { - t.Fatal("nsenter exits with a zero exit status") - } -} - -func TestNsenterDeadPid(t *testing.T) { - deadCmd := exec.Command("true") - if err := deadCmd.Run(); err != nil { - t.Fatal(err) - } - args := []string{"nsenter-exec"} - parent, child, err := newPipe() - if err != nil { - t.Fatalf("failed to create pipe %v", err) - } - - cmd := &exec.Cmd{ - Path: os.Args[0], - Args: args, - ExtraFiles: []*os.File{child}, - Env: []string{"_LIBCONTAINER_INITTYPE=setns", "_LIBCONTAINER_INITPIPE=3"}, - } - - if err := cmd.Start(); err != nil { - t.Fatal("nsenter exits with a zero exit status") - } - - r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) - r.AddData(&libcontainer.Int32msg{ - Type: libcontainer.PidAttr, - Value: uint32(deadCmd.Process.Pid), - }) - if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { - t.Fatal(err) - } - - if err := cmd.Wait(); err == nil { - t.Fatal("nsenter exits with a zero exit status") + t.Fatalf("nsenter exits with a zero exit status") } } diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 6634afc4..555e7245 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -1,21 +1,20 @@ #define _GNU_SOURCE -#include -#include -#include +#include #include -#include - -#include -#include -#include -#include -#include -#include #include -#include -#include +#include +#include #include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -23,239 +22,417 @@ #include #include -/* All arguments should be above stack, because it grows down */ +// All arguments should be above the stack because it grows down struct clone_arg { /* * Reserve some space for clone() to locate arguments * and retcode in this place */ - char stack[4096] __attribute__ ((aligned(16))); - char stack_ptr[0]; + char stack[4096] __attribute__((aligned(16))); + char stack_ptr[0]; jmp_buf *env; }; -#define pr_perror(fmt, ...) fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__) - -static int child_func(void *_arg) -{ - struct clone_arg *arg = (struct clone_arg *)_arg; - longjmp(*arg->env, 1); -} - -// Use raw setns syscall for versions of glibc that don't include it (namely glibc-2.12) -#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 -#define _GNU_SOURCE -#include "syscall.h" -#if defined(__NR_setns) && !defined(SYS_setns) -#define SYS_setns __NR_setns -#endif -#ifdef SYS_setns -int setns(int fd, int nstype) -{ - return syscall(SYS_setns, fd, nstype); -} -#endif -#endif - -static int clone_parent(jmp_buf * env) __attribute__ ((noinline)); -static int clone_parent(jmp_buf * env) -{ - struct clone_arg ca; - int child; - - ca.env = env; - child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); - - return child; -} - -static uint32_t readint32(char *buf) -{ - return *(uint32_t *) buf; -} +struct nsenter_config { + uint32_t cloneflags; + char *uidmap; + int uidmap_len; + char *gidmap; + int gidmap_len; + uint8_t is_setgroup; +}; // list of known message types we want to send to bootstrap program // These are defined in libcontainer/message_linux.go -#define INIT_MSG 62000 -#define PID_ATTR 27281 -#define CONSOLE_PATH_ATTR 27282 +#define INIT_MSG 62000 +#define CLONE_FLAGS_ATTR 27281 +#define CONSOLE_PATH_ATTR 27282 +#define NS_PATHS_ATTR 27283 +#define UIDMAP_ATTR 27284 +#define GIDMAP_ATTR 27285 +#define SETGROUP_ATTR 27286 -void nsexec() +// Use raw setns syscall for versions of glibc that don't include it +// (namely glibc-2.12) +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 + #define _GNU_SOURCE + #include "syscall.h" + #if defined(__NR_setns) && !defined(SYS_setns) + #define SYS_setns __NR_setns + #endif + + #ifdef SYS_setns + int setns(int fd, int nstype) + { + return syscall(SYS_setns, fd, nstype); + } + #endif +#endif + +#define pr_perror(fmt, ...) \ + fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__) + +static int child_func(void *_arg) { - char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt", "user" }; - const int num = sizeof(namespaces) / sizeof(char *); - jmp_buf env; - char buf[PATH_MAX], *val; - int i, tfd, self_tfd, child, n, len, pipenum, consolefd = -1; - pid_t pid = 0; + struct clone_arg *arg = (struct clone_arg *)_arg; + longjmp(*arg->env, 1); +} - // if we dont have INITTYPE or this is the init process, skip the bootstrap process - val = getenv("_LIBCONTAINER_INITTYPE"); - if (val == NULL || strcmp(val, "standard") == 0) { - return; - } - if (strcmp(val, "setns") != 0) { - pr_perror("Invalid inittype %s", val); - exit(1); +static int clone_parent(jmp_buf *env, int flags) __attribute__((noinline)); +static int clone_parent(jmp_buf *env, int flags) +{ + struct clone_arg ca; + int child; + + ca.env = env; + child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, + &ca); + return child; +} + +// get init pipe from the parent. It's used to read bootstrap data, and to +// write pid to after nsexec finishes setting up the environment. +static int get_init_pipe() +{ + char buf[PATH_MAX]; + char *initpipe; + int pipenum = -1; + + initpipe = getenv("_LIBCONTAINER_INITPIPE"); + if (initpipe == NULL) { + return -1; } - val = getenv("_LIBCONTAINER_INITPIPE"); - if (val == NULL) { - pr_perror("Child pipe not found"); - exit(1); - } - pipenum = atoi(val); + pipenum = atoi(initpipe); snprintf(buf, sizeof(buf), "%d", pipenum); - if (strcmp(val, buf)) { + if (strcmp(initpipe, buf)) { pr_perror("Unable to parse _LIBCONTAINER_INITPIPE"); exit(1); } - char nlbuf[NLMSG_HDRLEN]; - struct nlmsghdr *nh; - if ((n = read(pipenum, nlbuf, NLMSG_HDRLEN)) != NLMSG_HDRLEN) { - pr_perror("Failed to read netlink header, got %d", n); + return pipenum; +} + +// num_namespaces returns the number of additional namespaces to setns. The +// argument is a comma-separated string of namespace paths. +static int num_namespaces(char *nspaths) +{ + int i; + int size = 0; + + for (i = 0; nspaths[i]; i++) { + if (nspaths[i] == ',') { + size += 1; + } + } + + return size + 1; +} + +static uint32_t readint32(char *buf) +{ + return *(uint32_t *)buf; +} + +static uint8_t readint8(char *buf) +{ + return *(uint8_t *)buf; +} + +static void update_process_idmap(char *pathfmt, int pid, char *map, int map_len) +{ + char buf[PATH_MAX]; + int len; + int fd; + + len = snprintf(buf, sizeof(buf), pathfmt, pid); + if (len < 0) { + pr_perror("failed to construct '%s' for %d", pathfmt, pid); exit(1); } - nh = (struct nlmsghdr *)nlbuf; - if (nh->nlmsg_type == NLMSG_ERROR) { - pr_perror("Invalid netlink header message"); - exit(1); - } - if (nh->nlmsg_type != INIT_MSG) { - pr_perror("Unexpected netlink message type %d", nh->nlmsg_type); - exit(1); - } - // read the netlink payload - len = NLMSG_PAYLOAD(nh, 0); - char data[len]; - if ((n = read(pipenum, data, len)) != len) { - pr_perror("Failed to read netlink payload, got %d", n); + fd = open(buf, O_RDWR); + if (fd == -1) { + pr_perror("failed to open %s", buf); exit(1); } - int start = 0; - struct nlattr *attr; - while (start < len) { - int payload_len; - attr = (struct nlattr *)((void *)data + start); - start += NLA_HDRLEN; - payload_len = attr->nla_len - NLA_HDRLEN; - switch (attr->nla_type) { - case PID_ATTR: - pid = (pid_t) readint32(data + start); - break; - case CONSOLE_PATH_ATTR: - consolefd = open((char *)data + start, O_RDWR); - if (consolefd < 0) { - pr_perror("Failed to open console %s", (char *)data + start); + len = write(fd, map, map_len); + if (len == -1) { + pr_perror("failed to write to %s", buf); + exit(1); + } else if (len != map_len) { + fprintf(stderr, "Failed to write data to %s (%d/%d)", + buf, len, map_len); + exit(1); + } + + close(fd); +} + +static void update_process_uidmap(int pid, char *map, int map_len) +{ + if ((map == NULL) || (map_len <= 0)) { + return; + } + + update_process_idmap("/proc/%d/uid_map", pid, map, map_len); +} + +static void update_process_gidmap(int pid, uint8_t is_setgroup, char *map, int map_len) +{ + if ((map == NULL) || (map_len <= 0)) { + return; + } + + if (is_setgroup == 1) { + int fd; + int len; + char buf[PATH_MAX]; + + len = snprintf(buf, sizeof(buf), "/proc/%d/setgroups", pid); + if (len < 0) { + pr_perror("failed to get setgroups path for %d", pid); + exit(1); + } + + fd = open(buf, O_RDWR); + if (fd == -1) { + pr_perror("failed to open %s", buf); + exit(1); + } + if (write(fd, "allow", 5) != 5) { + // If the kernel is too old to support + // /proc/PID/setgroups, write will return + // ENOENT; this is OK. + if (errno != ENOENT) { + pr_perror("failed to write allow to %s", buf); exit(1); } - break; - } - start += NLA_ALIGN(payload_len); - } - - // required pid to be passed - if (pid == 0) { - pr_perror("missing pid"); - exit(1); - } - - /* Check that the specified process exists */ - snprintf(buf, PATH_MAX - 1, "/proc/%d/ns", pid); - tfd = open(buf, O_DIRECTORY | O_RDONLY); - if (tfd == -1) { - pr_perror("Failed to open \"%s\"", buf); - exit(1); - } - - self_tfd = open("/proc/self/ns", O_DIRECTORY | O_RDONLY); - if (self_tfd == -1) { - pr_perror("Failed to open /proc/self/ns"); - exit(1); - } - - for (i = 0; i < num; i++) { - struct stat st; - struct stat self_st; - int fd; - - /* Symlinks on all namespaces exist for dead processes, but they can't be opened */ - if (fstatat(tfd, namespaces[i], &st, 0) == -1) { - // Ignore nonexistent namespaces. - if (errno == ENOENT) - continue; - } - - /* Skip namespaces we're already part of */ - if (fstatat(self_tfd, namespaces[i], &self_st, 0) != -1 && st.st_ino == self_st.st_ino) { - continue; - } - - fd = openat(tfd, namespaces[i], O_RDONLY); - if (fd == -1) { - pr_perror("Failed to open ns file %s for ns %s", buf, namespaces[i]); - exit(1); - } - // Set the namespace. - if (setns(fd, 0) == -1) { - pr_perror("Failed to setns for %s", namespaces[i]); - exit(1); } close(fd); } - close(self_tfd); - close(tfd); + update_process_idmap("/proc/%d/gid_map", pid, map, map_len); +} + + +static void start_child(int pipenum, jmp_buf *env, int syncpipe[2], + struct nsenter_config *config) +{ + int len; + int childpid; + char buf[PATH_MAX]; + uint8_t syncbyte = 1; + + // We must fork to actually enter the PID namespace, use CLONE_PARENT + // so the child can have the right parent, and we don't need to forward + // the child's exit code or resend its death signal. + childpid = clone_parent(env, config->cloneflags); + if (childpid < 0) { + pr_perror("Unable to fork"); + exit(1); + } + + // update uid_map and gid_map for the child process if they + // were provided + update_process_uidmap(childpid, config->uidmap, config->uidmap_len); + + update_process_gidmap(childpid, config->is_setgroup, config->gidmap, config->gidmap_len); + + // Send the sync signal to the child + close(syncpipe[0]); + syncbyte = 1; + if (write(syncpipe[1], &syncbyte, 1) != 1) { + pr_perror("failed to write sync byte to child"); + exit(1); + } + + // Send the child pid back to our parent + len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", childpid); + if ((len < 0) || (write(pipenum, buf, len) != len)) { + pr_perror("Unable to send a child pid"); + kill(childpid, SIGKILL); + exit(1); + } + + exit(0); +} + +static void process_nl_attributes(int pipenum, char *data, int data_size) +{ + jmp_buf env; + struct nsenter_config config = {0}; + struct nlattr *nlattr; + int payload_len; + int start = 0; + int consolefd = -1; + int syncpipe[2] = {-1, -1}; + + while (start < data_size) { + nlattr = (struct nlattr *)(data + start); + start += NLA_HDRLEN; + payload_len = nlattr->nla_len - NLA_HDRLEN; + + if (nlattr->nla_type == CLONE_FLAGS_ATTR) { + config.cloneflags = readint32(data + start); + } else if (nlattr->nla_type == CONSOLE_PATH_ATTR) { + // get the console path before setns because it may + // change mnt namespace + consolefd = open(data + start, O_RDWR); + if (consolefd < 0) { + pr_perror("Failed to open console %s", + data + start); + exit(1); + } + } else if (nlattr->nla_type == NS_PATHS_ATTR) { + // if custom namespaces are required, open all + // descriptors and perform setns on them + int i; + int nslen = num_namespaces(data + start); + int fds[nslen]; + char *nslist[nslen]; + char *ns; + char *saveptr; + + for (i = 0; i < nslen; i++) { + char *str = NULL; + + if (i == 0) { + str = data + start; + } + ns = strtok_r(str, ",", &saveptr); + if (ns == NULL) { + break; + } + fds[i] = open(ns, O_RDONLY); + if (fds[i] == -1) { + pr_perror("Failed to open %s", ns); + exit(1); + } + nslist[i] = ns; + } + + for (i = 0; i < nslen; i++) { + if (setns(fds[i], 0) != 0) { + pr_perror("Failed to setns to %s", nslist[i]); + exit(1); + } + close(fds[i]); + } + } else if (nlattr->nla_type == UIDMAP_ATTR) { + config.uidmap = data + start; + config.uidmap_len = payload_len; + } else if (nlattr->nla_type == GIDMAP_ATTR) { + config.gidmap = data + start; + config.gidmap_len = payload_len; + } else if (nlattr->nla_type == SETGROUP_ATTR) { + config.is_setgroup = readint8(data + start); + } else { + pr_perror("Unknown netlink message type %d", + nlattr->nla_type); + exit(1); + } + + start += NLA_ALIGN(payload_len); + } + + // required clone_flags to be passed + if (config.cloneflags == -1) { + pr_perror("Missing clone_flags"); + exit(1); + } + // prepare sync pipe between parent and child. We need this to let the + // child + // know that the parent has finished setting up + if (pipe(syncpipe) != 0) { + pr_perror("Failed to setup sync pipe between parent and child"); + exit(1); + } if (setjmp(env) == 1) { // Child + uint8_t s = 0; + + // close the writing side of pipe + close(syncpipe[1]); + + // sync with parent + if ((read(syncpipe[0], &s, 1) != 1) || (s != 1)) { + pr_perror("Failed to read sync byte from parent"); + exit(1); + } if (setsid() == -1) { pr_perror("setsid failed"); exit(1); } + if (consolefd != -1) { if (ioctl(consolefd, TIOCSCTTY, 0) == -1) { pr_perror("ioctl TIOCSCTTY failed"); exit(1); } if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) { - pr_perror("Failed to dup 0"); + pr_perror("Failed to dup stdin"); exit(1); } if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) { - pr_perror("Failed to dup 1"); + pr_perror("Failed to dup stdout"); exit(1); } if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO) { - pr_perror("Failed to dup 2"); + pr_perror("Failed to dup stderr"); exit(1); } } + // Finish executing, let the Go runtime take over. return; } + // Parent - - // We must fork to actually enter the PID namespace, use CLONE_PARENT - // so the child can have the right parent, and we don't need to forward - // the child's exit code or resend its death signal. - child = clone_parent(&env); - if (child < 0) { - pr_perror("Unable to fork"); - exit(1); - } - - len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", child); - - if (write(pipenum, buf, len) != len) { - pr_perror("Unable to send a child pid"); - kill(child, SIGKILL); - exit(1); - } - - exit(0); + start_child(pipenum, &env, syncpipe, &config); +} + +void nsexec(void) +{ + int pipenum; + + // if we dont have init pipe, then just return to the parent + pipenum = get_init_pipe(); + if (pipenum == -1) { + return; + } + + // Retrieve the netlink header + struct nlmsghdr nl_msg_hdr; + int len; + + if ((len = read(pipenum, &nl_msg_hdr, NLMSG_HDRLEN)) != NLMSG_HDRLEN) { + pr_perror("Invalid netlink header length %d", len); + exit(1); + } + + if (nl_msg_hdr.nlmsg_type == NLMSG_ERROR) { + pr_perror("Failed to read netlink message"); + exit(1); + } + + if (nl_msg_hdr.nlmsg_type != INIT_MSG) { + pr_perror("Unexpected msg type %d", nl_msg_hdr.nlmsg_type); + exit(1); + } + + // Retrieve data + int nl_total_size = NLMSG_PAYLOAD(&nl_msg_hdr, 0); + char data[nl_total_size]; + + if ((len = read(pipenum, data, nl_total_size)) != nl_total_size) { + pr_perror("Failed to read netlink payload, %d != %d", len, + nl_total_size); + exit(1); + } + + process_nl_attributes(pipenum, data, nl_total_size); } diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index de2d5f00..ebae030b 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -167,14 +167,16 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) { } type initProcess struct { - cmd *exec.Cmd - parentPipe *os.File - childPipe *os.File - config *initConfig - manager cgroups.Manager - container *linuxContainer - fds []string - process *Process + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + config *initConfig + manager cgroups.Manager + container *linuxContainer + fds []string + process *Process + bootstrapData io.Reader + sharePidns bool } func (p *initProcess) pid() int { @@ -185,15 +187,49 @@ func (p *initProcess) externalDescriptors() []string { return p.fds } -func (p *initProcess) start() (err error) { +// execSetns runs the process that executes C code to perform the setns calls +// because setns support requires the C process to fork off a child and perform the setns +// before the go runtime boots, we wait on the process to die and receive the child's pid +// over the provided pipe. +// This is called by initProcess.start function +func (p *initProcess) execSetns() error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return err + } + if !status.Success() { + p.cmd.Wait() + return &exec.ExitError{ProcessState: status} + } + var pid *pid + if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { + p.cmd.Wait() + return err + } + process, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + p.cmd.Process = process + return nil +} + +func (p *initProcess) start() error { defer p.parentPipe.Close() - err = p.cmd.Start() + err := p.cmd.Start() p.process.ops = p p.childPipe.Close() if err != nil { p.process.ops = nil return newSystemError(err) } + if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { + return err + } + if err := p.execSetns(); err != nil { + return newSystemError(err) + } // Save the standard descriptor names before the container process // can potentially move them (e.g., via dup2()). If we don't do this now, // we won't know at checkpoint time which file descriptor to look up. @@ -213,21 +249,6 @@ func (p *initProcess) start() (err error) { p.manager.Destroy() } }() - if !p.config.Config.Namespaces.Contains(configs.NEWNS) { - if p.config.Config.Hooks != nil { - s := configs.HookState{ - Version: p.container.config.Version, - ID: p.container.id, - Pid: p.pid(), - Root: p.config.Config.Rootfs, - } - for _, hook := range p.config.Config.Hooks.Prestart { - if err := hook.Run(s); err != nil { - return newSystemError(err) - } - } - } - } if err := p.createNetworkInterfaces(); err != nil { return newSystemError(err) } @@ -255,6 +276,22 @@ loop: if err := p.manager.Set(p.config.Config); err != nil { return newSystemError(err) } + // call prestart hooks + if !p.config.Config.Namespaces.Contains(configs.NEWNS) { + if p.config.Config.Hooks != nil { + s := configs.HookState{ + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Root: p.config.Config.Rootfs, + } + for _, hook := range p.config.Config.Hooks.Prestart { + if err := hook.Run(s); err != nil { + return newSystemError(err) + } + } + } + } // Sync with child. if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil { return newSystemError(err) @@ -317,7 +354,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) { return p.cmd.ProcessState, err } // we should kill all processes in cgroup when init is died if we use host PID namespace - if p.cmd.SysProcAttr.Cloneflags&syscall.CLONE_NEWPID == 0 { + if p.sharePidns { killCgroupProcesses(p.manager) } return p.cmd.ProcessState, nil diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 27ad8caf..935b2eea 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -55,10 +55,6 @@ func (l *linuxStandardInit) Init() error { return err } - // join any namespaces via a path to the namespace fd if provided - if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil { - return err - } var console *linuxConsole if l.config.Console != "" { console = newConsoleFromPath(l.config.Console) @@ -66,9 +62,6 @@ func (l *linuxStandardInit) Init() error { return err } } - if _, err := syscall.Setsid(); err != nil { - return err - } if console != nil { if err := system.Setctty(); err != nil { return err diff --git a/spec.go b/spec.go index d75c28cf..a4d7c4c8 100644 --- a/spec.go +++ b/spec.go @@ -593,7 +593,10 @@ func setupUserNamespace(spec *specs.LinuxSpec, config *configs.Config) error { if len(spec.Linux.UIDMappings) == 0 { return nil } - config.Namespaces.Add(configs.NEWUSER, "") + // do not override the specified user namespace path + if config.Namespaces.PathOf(configs.NEWUSER) == "" { + config.Namespaces.Add(configs.NEWUSER, "") + } create := func(m specs.IDMapping) configs.IDMap { return configs.IDMap{ HostID: int(m.HostID),