From 5dd6caf6cfbaabeb7dcef187cd4418c955fc334d Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 1 Oct 2015 17:03:02 -0400 Subject: [PATCH 1/6] Replace config.Privatefs with config.RootPropagation Right now config.Privatefs is a boolean which determines whether / is applied with propagation flag syscall.MS_PRIVATE | syscall.MS_REC or not. Soon we want to represent other propagation states like private, [r]slave, and [r]shared. So either we can introduce more boolean variables or keep track of propagation flags in an integer variable. Keeping an integer variable is more versatile and can allow various kinds of propagation flags to be specified. So replace Privatefs with RootPropagation, which is an integer. Note, this will require changes in docker. Instead of setting Privatefs to true, they will need to set: config.RootPropagation = syscall.MS_PRIVATE | syscall.MS_REC Signed-off-by: Vivek Goyal --- libcontainer/configs/config.go | 4 ++-- libcontainer/rootfs_linux.go | 4 ++-- spec.go | 10 +++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index 4b298e1e..7fd311db 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -92,8 +92,8 @@ type Config struct { // bind mounts are writtable. Readonlyfs bool `json:"readonlyfs"` - // Privatefs will mount the container's rootfs as private where mount points from the parent will not propogate - Privatefs bool `json:"privatefs"` + // Specifies the mount propagation flags to be applied to /. 
+ RootPropagation int `json:"rootPropagation"` // Mounts specify additional source and destination paths that will be mounted inside the container's // rootfs and mount namespace if specified diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index f9566975..f8fc90fd 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -422,8 +422,8 @@ func mknodDevice(dest string, node *configs.Device) error { func prepareRoot(config *configs.Config) error { flag := syscall.MS_SLAVE | syscall.MS_REC - if config.Privatefs { - flag = syscall.MS_PRIVATE | syscall.MS_REC + if config.RootPropagation != 0 { + flag = config.RootPropagation } if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil { return err diff --git a/spec.go b/spec.go index 8604121b..90876d7c 100644 --- a/spec.go +++ b/spec.go @@ -329,11 +329,11 @@ func createLibcontainerConfig(cgroupName string, spec *specs.LinuxSpec, rspec *s rootfsPath = filepath.Join(cwd, rootfsPath) } config := &configs.Config{ - Rootfs: rootfsPath, - Capabilities: spec.Linux.Capabilities, - Readonlyfs: spec.Root.Readonly, - Hostname: spec.Hostname, - Privatefs: true, + Rootfs: rootfsPath, + Capabilities: spec.Linux.Capabilities, + Readonlyfs: spec.Root.Readonly, + Hostname: spec.Hostname, + RootPropagation: syscall.MS_PRIVATE | syscall.MS_REC, } for _, ns := range rspec.Linux.Namespaces { t, exists := namespaceMapping[ns.Type] From f6fadd2ffe454ad9ef62511d65448f0b584907f0 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 1 Oct 2015 17:03:02 -0400 Subject: [PATCH 2/6] Start parsing rootfsPropagation and make it effective The spec introduced a new field rootfsPropagation. Right now that field is not parsed by runc and it does not take effect. Start parsing it and, for now, allow only a limited set of propagation flags. More can be opened up as new use cases show up. We are applying propagation flags on / and not on rootfs. So ideally we should introduce another field in the spec, say rootPropagation. 
For now I am parsing rootfsPropagation. Once we agree on design, we can discuss if we need another field in spec or not. Signed-off-by: Vivek Goyal --- spec.go | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/spec.go b/spec.go index 90876d7c..6d5a6ca6 100644 --- a/spec.go +++ b/spec.go @@ -280,6 +280,16 @@ var namespaceMapping = map[specs.NamespaceType]configs.NamespaceType{ specs.UTSNamespace: configs.NEWUTS, } +var mountPropagationMapping = map[string]int{ + "rprivate": syscall.MS_PRIVATE | syscall.MS_REC, + "private": syscall.MS_PRIVATE, + "rslave": syscall.MS_SLAVE | syscall.MS_REC, + "slave": syscall.MS_SLAVE, + "rshared": syscall.MS_SHARED | syscall.MS_REC, + "shared": syscall.MS_SHARED, + "": syscall.MS_PRIVATE | syscall.MS_REC, +} + // loadSpec loads the specification from the provided path. // If the path is empty then the default path will be "config.json" func loadSpec(cPath, rPath string) (spec *specs.LinuxSpec, rspec *specs.LinuxRuntimeSpec, err error) { @@ -329,12 +339,17 @@ func createLibcontainerConfig(cgroupName string, spec *specs.LinuxSpec, rspec *s rootfsPath = filepath.Join(cwd, rootfsPath) } config := &configs.Config{ - Rootfs: rootfsPath, - Capabilities: spec.Linux.Capabilities, - Readonlyfs: spec.Root.Readonly, - Hostname: spec.Hostname, - RootPropagation: syscall.MS_PRIVATE | syscall.MS_REC, + Rootfs: rootfsPath, + Capabilities: spec.Linux.Capabilities, + Readonlyfs: spec.Root.Readonly, + Hostname: spec.Hostname, } + + exists := false + if config.RootPropagation, exists = mountPropagationMapping[rspec.Linux.RootfsPropagation]; !exists { + return nil, fmt.Errorf("rootfsPropagation=%v is not supported", rspec.Linux.RootfsPropagation) + } + for _, ns := range rspec.Linux.Namespaces { t, exists := namespaceMapping[ns.Type] if !exists { From 23ec72a4261e38d6b44aa62a21cbdfc5423e46c1 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 1 Oct 2015 17:03:02 -0400 Subject: [PATCH 3/6] Make parent mount of 
container root private if it is shared. pivot_root() imposes a bunch of restrictions and fails if they are not met. The parent mount of the container root cannot be shared, otherwise pivot_root() will fail. So far the parent could not be shared, as we marked everything either private or slave. But now we have introduced new propagation modes where the parent mount of the container rootfs could be shared, and pivot_root() will fail. So check if the parent mount is shared and, if so, make it private. This will make sure pivot_root() works. It will also make sure that when we bind mount the container rootfs, it does not propagate to the parent mount namespace. Otherwise cleanup becomes a problem. Signed-off-by: Vivek Goyal --- libcontainer/rootfs_linux.go | 76 ++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index f8fc90fd..c93d7849 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -13,6 +13,7 @@ import ( "syscall" "time" + "github.com/docker/docker/pkg/mount" "github.com/docker/docker/pkg/symlink" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" @@ -420,6 +421,76 @@ func mknodDevice(dest string, node *configs.Device) error { return syscall.Chown(dest, int(node.Uid), int(node.Gid)) } +func getMountInfo(mountinfo []*mount.MountInfo, dir string) *mount.MountInfo { + for _, m := range mountinfo { + if m.Mountpoint == dir { + return m + } + } + return nil +} + +// Get the parent mount point of directory passed in as argument. Also return +// optional fields. 
+func getParentMount(rootfs string) (string, string, error) { + var path string + + mountinfos, err := mount.GetMounts() + if err != nil { + return "", "", err + } + + mountinfo := getMountInfo(mountinfos, rootfs) + if mountinfo != nil { + return rootfs, mountinfo.Optional, nil + } + + path = rootfs + for { + path = filepath.Dir(path) + + mountinfo = getMountInfo(mountinfos, path) + if mountinfo != nil { + return path, mountinfo.Optional, nil + } + + if path == "/" { + break + } + } + + // If we are here, we did not find parent mount. Something is wrong. + return "", "", fmt.Errorf("Could not find parent mount of %s", rootfs) +} + +// Make parent mount private if it was shared +func rootfsParentMountPrivate(config *configs.Config) error { + sharedMount := false + + parentMount, optionalOpts, err := getParentMount(config.Rootfs) + if err != nil { + return err + } + + optsSplit := strings.Split(optionalOpts, " ") + for _, opt := range optsSplit { + if strings.HasPrefix(opt, "shared:") { + sharedMount = true + break + } + } + + // Make parent mount PRIVATE if it was shared. It is needed for two + // reasons. First of all pivot_root() will fail if parent mount is + // shared. Secondly when we bind mount rootfs it will propagate to + // parent namespace and we don't want that to happen. 
+ if sharedMount { + return syscall.Mount("", parentMount, "", syscall.MS_PRIVATE, "") + } + + return nil +} + func prepareRoot(config *configs.Config) error { flag := syscall.MS_SLAVE | syscall.MS_REC if config.RootPropagation != 0 { @@ -428,6 +499,11 @@ func prepareRoot(config *configs.Config) error { if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil { return err } + + if err := rootfsParentMountPrivate(config); err != nil { + return err + } + return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "") } From da8d776c088b943a380e4a061f226a946ccf4c27 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 1 Oct 2015 17:03:02 -0400 Subject: [PATCH 4/6] Make pivotDir rprivate pivotDir is the directory where the pivot_root() call puts the old root. We unmount pivotDir and delete it. Previously we were always making / either rslave or rprivate. That meant that pivotDir could never have mounts which would be shared with the parent mount namespace. That also meant that unmounting pivotDir was safe: none of the unmounts would propagate to the parent namespace and unmount things which we did not want to. But now the user can specify that private, shared, or slave be applied on /. That means some of the mounts we inherited from the parent could be shared, and that also means that if we umount pivotDir, those mounts will get unmounted in the parent too. That's not what we want. Instead, make pivotDir rprivate so that unmounts don't propagate back to the parent. 
Signed-off-by: Vivek Goyal --- libcontainer/rootfs_linux.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index c93d7849..147c93d6 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -545,6 +545,13 @@ func pivotRoot(rootfs, pivotBaseDir string) error { } // path to pivot dir now changed, update pivotDir = filepath.Join(pivotBaseDir, filepath.Base(pivotDir)) + + // Make pivotDir rprivate to make sure any of the unmounts don't + // propagate to parent. + if err := syscall.Mount("", pivotDir, "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil { + return err + } + if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { return fmt.Errorf("unmount pivot_root dir %s", err) } From 175e4b8aecdc1c54f1bfa6e327501dbe70baaf07 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 1 Oct 2015 17:03:02 -0400 Subject: [PATCH 5/6] exec_test.go: Test cases for rootfsPropagation=rslave test case to test rootfsPropagation=rslave Signed-off-by: Vivek Goyal --- libcontainer/integration/exec_test.go | 124 ++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 04fbe651..c1e793c0 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -1014,3 +1014,127 @@ func TestSTDIOPermissions(t *testing.T) { t.Fatalf("stderr should equal be equal %q %q", actual, "hi") } } + +func unmountOp(path string) error { + if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil { + return err + } + return nil +} + +// Launch container with rootfsPropagation in rslave mode. Also +// bind mount a volume /mnt1host at /mnt1cont at the time of launch. 
Now do +// another mount on host (/mnt1host/mnt2host) and this new mount should +// propagate to container (/mnt1cont/mnt2host) +func TestRootfsPropagationSlaveMount(t *testing.T) { + var mountPropagated bool + var dir1cont string + var dir2cont string + + dir1cont = "/root/mnt1cont" + + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + + config.RootPropagation = syscall.MS_SLAVE | syscall.MS_REC + + // Bind mount a volume + dir1host, err := ioutil.TempDir("", "mnt1host") + ok(t, err) + defer os.RemoveAll(dir1host) + + // Make this dir a "shared" mount point. This will make sure a + // slave relation ship can be established in container. + err = syscall.Mount(dir1host, dir1host, "bind", syscall.MS_BIND|syscall.MS_REC, "") + ok(t, err) + err = syscall.Mount("", dir1host, "", syscall.MS_SHARED|syscall.MS_REC, "") + ok(t, err) + defer unmountOp(dir1host) + + config.Mounts = append(config.Mounts, &configs.Mount{ + Source: dir1host, + Destination: dir1cont, + Device: "bind", + Flags: syscall.MS_BIND | syscall.MS_REC}) + + // TODO: systemd specific processing + f := factory + + container, err := f.Create("testSlaveMount", config) + ok(t, err) + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + pconfig := &libcontainer.Process{ + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + } + + err = container.Start(pconfig) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + // Create mnt1host/mnt2host and bind mount itself on top of it. This + // should be visible in container. + dir2host, err := ioutil.TempDir(dir1host, "mnt2host") + ok(t, err) + defer os.RemoveAll(dir2host) + + err = syscall.Mount(dir2host, dir2host, "bind", syscall.MS_BIND, "") + defer unmountOp(dir2host) + ok(t, err) + + // Run "cat /proc/self/mountinfo" in container and look at mount points. 
+ var stdout2 bytes.Buffer + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + + pconfig2 := &libcontainer.Process{ + Args: []string{"cat", "/proc/self/mountinfo"}, + Env: standardEnvironment, + Stdin: stdinR2, + Stdout: &stdout2, + } + + err = container.Start(pconfig2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + // Wait for process + stdinW2.Close() + waitProcess(pconfig2, t) + stdinW.Close() + waitProcess(pconfig, t) + + mountPropagated = false + dir2cont = filepath.Join(dir1cont, filepath.Base(dir2host)) + + propagationInfo := string(stdout2.Bytes()) + lines := strings.Split(propagationInfo, "\n") + for _, l := range lines { + linefields := strings.Split(l, " ") + if len(linefields) < 5 { + continue + } + + if linefields[4] == dir2cont { + mountPropagated = true + break + } + } + + if mountPropagated != true { + t.Fatalf("Mount on host %s did not propagate in container at %s\n", dir2host, dir2cont) + } +} From 6a851e1195e1c5d699248eea65f2b4b8dc7dd8d4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 1 Oct 2015 17:03:02 -0400 Subject: [PATCH 6/6] exec_test.go: Test case for rootfsPropagation="private" A test case to test rootfsPropagation="private" and making sure shared volumes work. Signed-off-by: Vivek Goyal --- libcontainer/integration/exec_test.go | 114 +++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 1 deletion(-) diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index c1e793c0..267c0731 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -4,6 +4,7 @@ import ( "bytes" "io/ioutil" "os" + "os/exec" "path/filepath" "strconv" "strings" @@ -1049,7 +1050,7 @@ func TestRootfsPropagationSlaveMount(t *testing.T) { defer os.RemoveAll(dir1host) // Make this dir a "shared" mount point. This will make sure a - // slave relation ship can be established in container. + // slave relationship can be established in container. 
err = syscall.Mount(dir1host, dir1host, "bind", syscall.MS_BIND|syscall.MS_REC, "") ok(t, err) err = syscall.Mount("", dir1host, "", syscall.MS_SHARED|syscall.MS_REC, "") @@ -1138,3 +1139,114 @@ func TestRootfsPropagationSlaveMount(t *testing.T) { t.Fatalf("Mount on host %s did not propagate in container at %s\n", dir2host, dir2cont) } } + +// Launch container with rootfsPropagation 0 so no propagation flags are +// applied. Also bind mount a volume /mnt1host at /mnt1cont at the time of +// launch. Now do a mount in container (/mnt1cont/mnt2cont) and this new +// mount should propagate to host (/mnt1host/mnt2cont) + +func TestRootfsPropagationSharedMount(t *testing.T) { + var dir1cont string + var dir2cont string + + dir1cont = "/root/mnt1cont" + + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(rootfs) + config.RootPropagation = syscall.MS_PRIVATE + + // Bind mount a volume + dir1host, err := ioutil.TempDir("", "mnt1host") + ok(t, err) + defer os.RemoveAll(dir1host) + + // Make this dir a "shared" mount point. This will make sure a + // shared relationship can be established in container. 
+ err = syscall.Mount(dir1host, dir1host, "bind", syscall.MS_BIND|syscall.MS_REC, "") + ok(t, err) + err = syscall.Mount("", dir1host, "", syscall.MS_SHARED|syscall.MS_REC, "") + ok(t, err) + defer unmountOp(dir1host) + + config.Mounts = append(config.Mounts, &configs.Mount{ + Source: dir1host, + Destination: dir1cont, + Device: "bind", + Flags: syscall.MS_BIND | syscall.MS_REC}) + + // TODO: systemd specific processing + f := factory + + container, err := f.Create("testSharedMount", config) + ok(t, err) + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + pconfig := &libcontainer.Process{ + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + } + + err = container.Start(pconfig) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + // Create mnt1host/mnt2cont. This will become visible inside container + // at mnt1cont/mnt2cont. Bind mount itself on top of it. This + // should be visible on host now. + dir2host, err := ioutil.TempDir(dir1host, "mnt2cont") + ok(t, err) + defer os.RemoveAll(dir2host) + + dir2cont = filepath.Join(dir1cont, filepath.Base(dir2host)) + + // Mount something in container and see if it is visible on host. + var stdout2 bytes.Buffer + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + + // Provide CAP_SYS_ADMIN + processCaps := append(config.Capabilities, "CAP_SYS_ADMIN") + + pconfig2 := &libcontainer.Process{ + Args: []string{"mount", "--bind", dir2cont, dir2cont}, + Env: standardEnvironment, + Stdin: stdinR2, + Stdout: &stdout2, + Capabilities: processCaps, + } + + err = container.Start(pconfig2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + // Wait for process + stdinW2.Close() + waitProcess(pconfig2, t) + stdinW.Close() + waitProcess(pconfig, t) + + defer unmountOp(dir2host) + + // Check if mount is visible on host or not. 
+ out, err := exec.Command("findmnt", "-n", "-f", "-oTARGET", dir2host).CombinedOutput() + outtrim := strings.TrimSpace(string(out)) + if err != nil { + t.Logf("findmnt error %q: %q", err, outtrim) + } + + if string(outtrim) != dir2host { + t.Fatalf("Mount in container on %s did not propagate to host on %s. finmnt output=%s", dir2cont, dir2host, outtrim) + } +}