diff --git a/mount/init.go b/mount/init.go index 735970cd..cc3ce215 100644 --- a/mount/init.go +++ b/mount/init.go @@ -11,7 +11,6 @@ import ( "github.com/dotcloud/docker/pkg/label" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/mount/nodes" - "github.com/dotcloud/docker/pkg/libcontainer/security/restrict" "github.com/dotcloud/docker/pkg/system" ) @@ -51,11 +50,6 @@ func InitializeMountNamespace(rootfs, console string, container *libcontainer.Co if err := nodes.CopyN(rootfs, nodes.DefaultNodes); err != nil { return fmt.Errorf("copy dev nodes %s", err) } - if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" { - if err := restrict.Restrict(rootfs, restrictionPath); err != nil { - return fmt.Errorf("restrict %s", err) - } - } if err := SetupPtmx(rootfs, console, container.Context["mount_label"]); err != nil { return err } @@ -124,10 +118,11 @@ func setupBindmounts(rootfs string, bindMounts libcontainer.Mounts) error { } // TODO: this is crappy right now and should be cleaned up with a better way of handling system and -// standard bind mounts allowing them to be more dymanic +// standard bind mounts allowing them to be more dynamic func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mount { systemMounts := []mount{ {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, + {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, } if len(mounts.OfType("devtmpfs")) == 1 { @@ -138,8 +133,5 @@ func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mo mount{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)}, ) - if len(mounts.OfType("sysfs")) == 1 { - systemMounts = append(systemMounts, mount{source: "sysfs", path: 
filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}) - } return systemMounts } diff --git a/nsinit/init.go b/nsinit/init.go index faec12af..bafb877c 100644 --- a/nsinit/init.go +++ b/nsinit/init.go @@ -16,6 +16,7 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/mount" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/libcontainer/security/capabilities" + "github.com/dotcloud/docker/pkg/libcontainer/security/restrict" "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/user" @@ -68,18 +69,25 @@ func Init(container *libcontainer.Container, uncleanRootfs, consolePath string, if err := system.Sethostname(container.Hostname); err != nil { return fmt.Errorf("sethostname %s", err) } - if err := FinalizeNamespace(container); err != nil { - return fmt.Errorf("finalize namespace %s", err) - } runtime.LockOSThread() + if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" { + if err := restrict.Restrict("/", restrictionPath); err != nil { + return err + } + } + if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil { return err } if err := label.SetProcessLabel(container.Context["process_label"]); err != nil { return fmt.Errorf("set process label %s", err) } + + if err := FinalizeNamespace(container); err != nil { + return fmt.Errorf("finalize namespace %s", err) + } return system.Execv(args[0], args[0:], container.Env) } diff --git a/security/restrict/restrict.go b/security/restrict/restrict.go index 291d6ca5..8c08ea18 100644 --- a/security/restrict/restrict.go +++ b/security/restrict/restrict.go @@ -9,43 +9,67 @@ import ( "github.com/dotcloud/docker/pkg/system" ) -const flags = syscall.MS_BIND | syscall.MS_REC | syscall.MS_RDONLY - -var restrictions = map[string]string{ - // dirs - "/proc/sys": "", - "/proc/irq": "", - "/proc/acpi": "", - - // files - 
"/proc/sysrq-trigger": "/dev/null", - "/proc/kcore": "/dev/null", +// "restrictions" are container paths (files, directories, whatever) that have to be masked. +// maskPath is a "safe" path to be mounted over maskedPath. It can take two special values: +// - if it is "", then nothing is mounted; +// - if it is "EMPTY", then an empty directory is mounted instead. +// If remountRO is true then the maskedPath is remounted read-only (regardless of whether a maskPath was used). +type restriction struct { + maskedPath string + maskPath string + remountRO bool } -// Restrict locks down access to many areas of proc -// by using the asumption that the user does not have mount caps to -// revert the changes made here -func Restrict(rootfs, empty string) error { - for dest, source := range restrictions { - dest = filepath.Join(rootfs, dest) +var restrictions = []restriction{ + {"/proc", "", true}, + {"/sys", "", true}, + {"/proc/kcore", "/dev/null", false}, +} - // we don't have a "/dev/null" for dirs so have the requester pass a dir - // for us to bind mount - switch source { - case "": - source = empty - default: - source = filepath.Join(rootfs, source) - } - if err := system.Mount(source, dest, "bind", flags, ""); err != nil { - if os.IsNotExist(err) { - continue +// This has to be called while the container still has CAP_SYS_ADMIN (to be able to perform mounts). +// However, afterwards, CAP_SYS_ADMIN should be dropped (otherwise the user will be able to revert those changes). +// "empty" should be the path to an empty directory. 
+func Restrict(rootfs, empty string) error { + for _, restriction := range restrictions { + dest := filepath.Join(rootfs, restriction.maskedPath) + if restriction.maskPath != "" { + var source string + if restriction.maskPath == "EMPTY" { + source = empty + } else { + source = filepath.Join(rootfs, restriction.maskPath) + } + if err := system.Mount(source, dest, "", syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("unable to bind-mount %s over %s: %s", source, dest, err) } - return fmt.Errorf("unable to mount %s over %s %s", source, dest, err) } - if err := system.Mount("", dest, "bind", flags|syscall.MS_REMOUNT, ""); err != nil { - return fmt.Errorf("unable to mount %s over %s %s", source, dest, err) + if restriction.remountRO { + if err := system.Mount("", dest, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil { + return fmt.Errorf("unable to remount %s readonly: %s", dest, err) + } } } + + // This weird trick will allow us to mount /proc read-only, while being able to use AppArmor. + // This is because apparently, loading an AppArmor profile requires write access to /proc/1/attr. + // So we do another mount of procfs, ensure it's writable, and bind-mount a subset of it. 
+ tmpProcPath := filepath.Join(rootfs, ".proc") + if err := os.Mkdir(tmpProcPath, 0700); err != nil { + return fmt.Errorf("unable to create temporary proc mountpoint %s: %s", tmpProcPath, err) + } + if err := system.Mount("proc", tmpProcPath, "proc", 0, ""); err != nil { + return fmt.Errorf("unable to mount proc on temporary proc mountpoint: %s", err) + } + if err := system.Mount("proc", tmpProcPath, "", syscall.MS_REMOUNT, ""); err != nil { + return fmt.Errorf("unable to remount proc read-write: %s", err) + } + rwAttrPath := filepath.Join(rootfs, ".proc", "1", "attr") + roAttrPath := filepath.Join(rootfs, "proc", "1", "attr") + if err := system.Mount(rwAttrPath, roAttrPath, "", syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("unable to bind-mount %s on %s: %s", rwAttrPath, roAttrPath, err) + } + if err := system.Unmount(tmpProcPath, 0); err != nil { + return fmt.Errorf("unable to unmount temporary proc filesystem: %s", err) + } return nil }