diff --git a/Dockerfile b/Dockerfile index 614e5979..0771c808 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,5 @@ -FROM crosbymichael/golang +FROM golang:1.4 -RUN apt-get update && apt-get install -y gcc make RUN go get golang.org/x/tools/cmd/cover ENV GOPATH $GOPATH:/go/src/github.com/docker/libcontainer/vendor diff --git a/MAINTAINERS b/MAINTAINERS index 7295c603..52351317 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2,4 +2,5 @@ Michael Crosby (@crosbymichael) Rohit Jnagal (@rjnagal) Victor Marmol (@vmarmol) Mrunal Patel (@mrunalp) +Alexandr Morozov (@LK4D4) update-vendor.sh: Tianon Gravi (@tianon) diff --git a/Makefile b/Makefile index bc7f1785..c7cec0b9 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,13 @@ all: - docker build -t docker/libcontainer . + docker build -t dockercore/libcontainer . test: # we need NET_ADMIN for the netlink tests and SYS_ADMIN for mounting - docker run --rm -it --privileged docker/libcontainer + docker run --rm -it --privileged dockercore/libcontainer sh: - docker run --rm -it --privileged -w /busybox docker/libcontainer nsinit exec sh + docker run --rm -it --privileged -w /busybox dockercore/libcontainer nsinit exec sh GO_PACKAGES = $(shell find . -not \( -wholename ./vendor -prune -o -wholename ./.git -prune \) -name '*.go' -print0 | xargs -0n1 dirname | sort -u) @@ -23,3 +23,5 @@ direct-build: direct-install: go install -v $(GO_PACKAGES) +local: + go test -v diff --git a/ROADMAP.md b/ROADMAP.md index 08deb9ad..f5903535 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -13,4 +13,8 @@ Our goal is to make libcontainer run everywhere, but currently libcontainer requ ## Cross-architecture support -Our goal is to make libcontainer run everywhere. However currently libcontainer only runs on x86_64 systems. We plan on expanding architecture support, so that libcontainer containers can be created and used on more architectures. +Our goal is to make libcontainer run everywhere. 
Recently libcontainer has +expanded from its initial support for x86_64 systems to include POWER (ppc64 +little and big endian variants), IBM System z (s390x 64-bit), and ARM. We plan +to continue expanding architecture support such that libcontainer containers +can be created and used on more architectures. diff --git a/SPEC.md b/SPEC.md index f5afaadc..d83d758d 100644 --- a/SPEC.md +++ b/SPEC.md @@ -318,4 +318,29 @@ a container. | Resume | Resume all processes inside the container if paused | | Exec | Execute a new process inside of the container ( requires setns ) | +### Execute a new process inside of a running container. +A user can execute a new process inside of a running container. Any binaries to be +executed must be accessible within the container's rootfs. + +The started process will run inside the container's rootfs. Any changes +made by the process to the container's filesystem will persist after the +process has finished executing. + +The started process will join all the container's existing namespaces. When the +container is paused, the process will also be paused and will resume when +the container is unpaused. The started process will only run when the container's +primary process (PID 1) is running, and will not be restarted when the container +is restarted. + +#### Planned additions + +The started process will have its own cgroups nested inside the container's +cgroups. This is used for process tracking and optionally resource allocation +handling for the new process. The freezer cgroup is required; the rest of the cgroups +are optional. The process executor must place its pid inside the correct +cgroups before starting the process. This is done so that no child processes or +threads can escape the cgroups. + +When the process is stopped, the process executor will try (in a best-effort way) +to stop all its children and remove the sub-cgroups. 
diff --git a/cgroups/cgroups.go b/cgroups/cgroups.go index 894c8125..60b1135a 100644 --- a/cgroups/cgroups.go +++ b/cgroups/cgroups.go @@ -77,6 +77,8 @@ type Cgroup struct { CpuQuota int64 `json:"cpu_quota,omitempty"` // CPU hardcap limit (in usecs). Allowed cpu time in a given period. CpuPeriod int64 `json:"cpu_period,omitempty"` // CPU period to be used for hardcapping (in usecs). 0 to use system default. CpusetCpus string `json:"cpuset_cpus,omitempty"` // CPU to use + CpusetMems string `json:"cpuset_mems,omitempty"` // MEM to use + BlkioWeight int64 `json:"blkio_weight,omitempty"` // Specifies per cgroup weight, range is from 10 to 1000. Freezer FreezerState `json:"freezer,omitempty"` // set the freeze value for the process Slice string `json:"slice,omitempty"` // Parent slice to use for systemd } diff --git a/cgroups/fs/apply_raw.go b/cgroups/fs/apply_raw.go index 930738bf..11d35d7a 100644 --- a/cgroups/fs/apply_raw.go +++ b/cgroups/fs/apply_raw.go @@ -124,7 +124,7 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { stats := cgroups.NewStats() for name, path := range m.Paths { sys, ok := subsystems[name] - if !ok { + if !ok || !cgroups.PathExists(path) { continue } if err := sys.GetStats(path, stats); err != nil { diff --git a/cgroups/fs/blkio.go b/cgroups/fs/blkio.go index ce824d56..b64e4684 100644 --- a/cgroups/fs/blkio.go +++ b/cgroups/fs/blkio.go @@ -15,11 +15,17 @@ type BlkioGroup struct { } func (s *BlkioGroup) Set(d *data) error { - // we just want to join this group even though we don't set anything - if _, err := d.join("blkio"); err != nil && !cgroups.IsNotFound(err) { + dir, err := d.join("blkio") + if err != nil && !cgroups.IsNotFound(err) { return err } + if d.c.BlkioWeight != 0 { + if err := writeFile(dir, "blkio.weight", strconv.FormatInt(d.c.BlkioWeight, 10)); err != nil { + return err + } + } + return nil } diff --git a/cgroups/fs/cpuset.go b/cgroups/fs/cpuset.go index 54d2ed57..ff67a53e 100644 --- a/cgroups/fs/cpuset.go +++ 
b/cgroups/fs/cpuset.go @@ -18,7 +18,7 @@ func (s *CpusetGroup) Set(d *data) error { if err != nil { return err } - return s.SetDir(dir, d.c.CpusetCpus, d.pid) + return s.SetDir(dir, d.c.CpusetCpus, d.c.CpusetMems, d.pid) } func (s *CpusetGroup) Remove(d *data) error { @@ -29,7 +29,7 @@ func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } -func (s *CpusetGroup) SetDir(dir, value string, pid int) error { +func (s *CpusetGroup) SetDir(dir, cpus string, mems string, pid int) error { if err := s.ensureParent(dir); err != nil { return err } @@ -40,10 +40,15 @@ func (s *CpusetGroup) SetDir(dir, value string, pid int) error { return err } - // If we don't use --cpuset, the default cpuset.cpus is set in - // s.ensureParent, otherwise, use the value we set - if value != "" { - if err := writeFile(dir, "cpuset.cpus", value); err != nil { + // If we don't use --cpuset-xxx, the default value inherit from parent cgroup + // is set in s.ensureParent, otherwise, use the value we set + if cpus != "" { + if err := writeFile(dir, "cpuset.cpus", cpus); err != nil { + return err + } + } + if mems != "" { + if err := writeFile(dir, "cpuset.mems", mems); err != nil { return err } } diff --git a/cgroups/fs/memory.go b/cgroups/fs/memory.go index 3f9647c2..01713fd7 100644 --- a/cgroups/fs/memory.go +++ b/cgroups/fs/memory.go @@ -38,12 +38,17 @@ func (s *MemoryGroup) Set(d *data) error { } } // By default, MemorySwap is set to twice the size of RAM. - // If you want to omit MemorySwap, set it to `-1'. - if d.c.MemorySwap != -1 { + // If you want to omit MemorySwap, set it to '-1'. 
+ if d.c.MemorySwap == 0 { if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(d.c.Memory*2, 10)); err != nil { return err } } + if d.c.MemorySwap > 0 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(d.c.MemorySwap, 10)); err != nil { + return err + } + } } return nil } diff --git a/cgroups/fs/stats_util_test.go b/cgroups/fs/stats_util_test.go index 1a9e590f..c55ba938 100644 --- a/cgroups/fs/stats_util_test.go +++ b/cgroups/fs/stats_util_test.go @@ -53,7 +53,7 @@ func expectBlkioStatsEquals(t *testing.T, expected, actual cgroups.BlkioStats) { } if err := blkioStatEntryEquals(expected.IoMergedRecursive, actual.IoMergedRecursive); err != nil { - log.Printf("blkio IoMergedRecursive do not match - %s vs %s\n", expected.IoMergedRecursive, actual.IoMergedRecursive) + log.Printf("blkio IoMergedRecursive do not match - %v vs %v\n", expected.IoMergedRecursive, actual.IoMergedRecursive) t.Fail() } @@ -90,4 +90,8 @@ func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats) t.Fail() } } + if expected.Failcnt != actual.Failcnt { + log.Printf("Expected memory failcnt %d but found %d\n", expected.Failcnt, actual.Failcnt) + t.Fail() + } } diff --git a/cgroups/systemd/apply_systemd.go b/cgroups/systemd/apply_systemd.go index 05b97444..7143a595 100644 --- a/cgroups/systemd/apply_systemd.go +++ b/cgroups/systemd/apply_systemd.go @@ -118,6 +118,11 @@ func (m *Manager) Apply(pid int) error { newProp("CPUShares", uint64(c.CpuShares))) } + if c.BlkioWeight != 0 { + properties = append(properties, + newProp("BlockIOWeight", uint64(c.BlkioWeight))) + } + if _, err := theConn.StartTransientUnit(unitName, "replace", properties...); err != nil { return err } @@ -330,5 +335,5 @@ func joinCpuset(c *cgroups.Cgroup, pid int) error { s := &fs.CpusetGroup{} - return s.SetDir(path, c.CpusetCpus, pid) + return s.SetDir(path, c.CpusetCpus, c.CpusetMems, pid) } diff --git a/cgroups/utils.go b/cgroups/utils.go index 
224a20b9..a360904c 100644 --- a/cgroups/utils.go +++ b/cgroups/utils.go @@ -9,6 +9,7 @@ import ( "path/filepath" "strconv" "strings" + "time" "github.com/docker/docker/pkg/mount" ) @@ -173,7 +174,7 @@ func ParseCgroupFile(subsystem string, r io.Reader) (string, error) { return "", NewNotFoundError(subsystem) } -func pathExists(path string) bool { +func PathExists(path string) bool { if _, err := os.Stat(path); err != nil { return false } @@ -182,7 +183,7 @@ func pathExists(path string) bool { func EnterPid(cgroupPaths map[string]string, pid int) error { for _, path := range cgroupPaths { - if pathExists(path) { + if PathExists(path) { if err := ioutil.WriteFile(filepath.Join(path, "cgroup.procs"), []byte(strconv.Itoa(pid)), 0700); err != nil { return err @@ -193,13 +194,30 @@ func EnterPid(cgroupPaths map[string]string, pid int) error { } // RemovePaths iterates over the provided paths removing them. -// If an error is encountered the removal proceeds and the first error is -// returned to ensure a partial removal is not possible. +// Removal is attempted up to five times, with an increasing delay between +// tries. If some cgroups still exist after the last attempt, an error naming +// the remaining paths is returned. 
func RemovePaths(paths map[string]string) (err error) { - for _, path := range paths { - if rerr := os.RemoveAll(path); err == nil { - err = rerr + delay := 10 * time.Millisecond + for i := 0; i < 5; i++ { + if i != 0 { + time.Sleep(delay) + delay *= 2 + } + for s, p := range paths { + os.RemoveAll(p) + // TODO: here probably should be logging + _, err := os.Stat(p) + // We need this strange way of checking cgroups existence because + // RemoveAll almost always returns error, even on already removed + // cgroups + if os.IsNotExist(err) { + delete(paths, s) + } + } + if len(paths) == 0 { + return nil } } - return err + return fmt.Errorf("Failed to remove paths: %s", paths) } diff --git a/configs/config.go b/configs/config.go index ab40b2b4..d1e03f61 100644 --- a/configs/config.go +++ b/configs/config.go @@ -10,11 +10,55 @@ type MountConfig mount.MountConfig type Network network.Network +type NamespaceType string + +const ( + NEWNET NamespaceType = "NEWNET" + NEWPID NamespaceType = "NEWPID" + NEWNS NamespaceType = "NEWNS" + NEWUTS NamespaceType = "NEWUTS" + NEWIPC NamespaceType = "NEWIPC" + NEWUSER NamespaceType = "NEWUSER" +) + // Namespace defines configuration for each namespace. It specifies an // alternate path that is able to be joined via setns. type Namespace struct { - Name string `json:"name"` - Path string `json:"path,omitempty"` + Type NamespaceType `json:"type"` + Path string `json:"path,omitempty"` +} + +type Namespaces []Namespace + +func (n *Namespaces) Remove(t NamespaceType) bool { + i := n.index(t) + if i == -1 { + return false + } + *n = append((*n)[:i], (*n)[i+1:]...) 
+ return true +} + +func (n *Namespaces) Add(t NamespaceType, path string) { + i := n.index(t) + if i == -1 { + *n = append(*n, Namespace{Type: t, Path: path}) + return + } + (*n)[i].Path = path +} + +func (n *Namespaces) index(t NamespaceType) int { + for i, ns := range *n { + if ns.Type == t { + return i + } + } + return -1 +} + +func (n *Namespaces) Contains(t NamespaceType) bool { + return n.index(t) != -1 } // Config defines configuration options for executing a process inside a contained environment. @@ -45,7 +89,7 @@ type Config struct { // Namespaces specifies the container's namespaces that it should setup when cloning the init process // If a namespace is not provided that namespace is shared from the container's parent process - Namespaces []Namespace `json:"namespaces,omitempty"` + Namespaces Namespaces `json:"namespaces,omitempty"` // Capabilities specify the capabilities to keep when executing the process inside the container // All capbilities not specified will be dropped from the processes capability mask @@ -76,6 +120,15 @@ type Config struct { // Rlimits specifies the resource limits, such as max open files, to set in the container // If Rlimits are not set, the container will inherit rlimits from the parent process Rlimits []Rlimit `json:"rlimits,omitempty"` + + // AdditionalGroups specifies the gids that should be added to supplementary groups + // in addition to those that the user belongs to. 
+ AdditionalGroups []int `json:"additional_groups,omitempty"` + // UidMappings is an array of User ID mappings for User Namespaces + UidMappings []IDMap `json:"uid_mappings,omitempty"` + + // GidMappings is an array of Group ID mappings for User Namespaces + GidMappings []IDMap `json:"gid_mappings,omitempty"` } // Routes can be specified to create entries in the route table as the container is started @@ -104,3 +157,10 @@ type Rlimit struct { Hard uint64 `json:"hard,omitempty"` Soft uint64 `json:"soft,omitempty"` } + +// IDMap represents UID/GID Mappings for User Namespaces. +type IDMap struct { + ContainerID int `json:"container_id,omitempty"` + HostID int `json:"host_id,omitempty"` + Size int `json:"size,omitempty"` +} diff --git a/configs/config_test.go b/configs/config_test.go index f698e3d9..d64066c4 100644 --- a/configs/config_test.go +++ b/configs/config_test.go @@ -64,12 +64,12 @@ func TestConfigJsonFormat(t *testing.T) { t.Fail() } - if getNamespaceIndex(container, "NEWNET") == -1 { + if !container.Namespaces.Contains(NEWNET) { t.Log("namespaces should contain NEWNET") t.Fail() } - if getNamespaceIndex(container, "NEWUSER") != -1 { + if container.Namespaces.Contains(NEWUSER) { t.Log("namespaces should not contain NEWUSER") t.Fail() } @@ -159,11 +159,14 @@ func TestSelinuxLabels(t *testing.T) { } } -func getNamespaceIndex(config *Config, name string) int { - for i, v := range config.Namespaces { - if v.Name == name { - return i - } +func TestRemoveNamespace(t *testing.T) { + ns := Namespaces{ + {Type: NEWNET}, + } + if !ns.Remove(NEWNET) { + t.Fatal("NEWNET was not removed") + } + if len(ns) != 0 { + t.Fatalf("namespaces should have 0 items but reports %d", len(ns)) } - return -1 } diff --git a/console/console.go b/console/console.go index 438e6704..69af70c1 100644 --- a/console/console.go +++ b/console/console.go @@ -13,7 +13,7 @@ import ( ) // Setup initializes the proper /dev/console inside the rootfs path -func Setup(rootfs, consolePath, mountLabel 
string) error { +func Setup(rootfs, consolePath, mountLabel string, hostRootUid, hostRootGid int) error { oldMask := syscall.Umask(0000) defer syscall.Umask(oldMask) @@ -21,7 +21,7 @@ func Setup(rootfs, consolePath, mountLabel string) error { return err } - if err := os.Chown(consolePath, 0, 0); err != nil { + if err := os.Chown(consolePath, hostRootUid, hostRootGid); err != nil { return err } diff --git a/integration/exec_test.go b/integration/exec_test.go index 8132e25b..9ec617d1 100644 --- a/integration/exec_test.go +++ b/integration/exec_test.go @@ -71,7 +71,7 @@ func TestIPCPrivate(t *testing.T) { } if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l { - t.Fatalf("ipc link should be private to the conatiner but equals host %q %q", actual, l) + t.Fatalf("ipc link should be private to the container but equals host %q %q", actual, l) } } @@ -92,8 +92,7 @@ func TestIPCHost(t *testing.T) { } config := newTemplateConfig(rootfs) - i := getNamespaceIndex(config, "NEWIPC") - config.Namespaces = append(config.Namespaces[:i], config.Namespaces[i+1:]...) 
+ config.Namespaces.Remove(configs.NEWIPC) buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc") if err != nil { t.Fatal(err) @@ -125,8 +124,7 @@ func TestIPCJoinPath(t *testing.T) { } config := newTemplateConfig(rootfs) - i := getNamespaceIndex(config, "NEWIPC") - config.Namespaces[i].Path = "/proc/1/ns/ipc" + config.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipc") buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc") if err != nil { @@ -154,12 +152,11 @@ func TestIPCBadPath(t *testing.T) { defer remove(rootfs) config := newTemplateConfig(rootfs) - i := getNamespaceIndex(config, "NEWIPC") - config.Namespaces[i].Path = "/proc/1/ns/ipcc" + config.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipcc") _, _, err = runContainer(config, "", "true") if err == nil { - t.Fatal("container succeded with bad ipc path") + t.Fatal("container succeeded with bad ipc path") } } @@ -184,15 +181,6 @@ func TestRlimit(t *testing.T) { } } -func getNamespaceIndex(config *configs.Config, name string) int { - for i, v := range config.Namespaces { - if v.Name == name { - return i - } - } - return -1 -} - func newTestRoot() (string, error) { dir, err := ioutil.TempDir("", "libcontainer") if err != nil { diff --git a/integration/template_test.go b/integration/template_test.go index 834099d2..372cc695 100644 --- a/integration/template_test.go +++ b/integration/template_test.go @@ -32,13 +32,13 @@ func newTemplateConfig(rootfs string) *configs.Config { "KILL", "AUDIT_WRITE", }, - Namespaces: []configs.Namespace{ - {Name: "NEWNS"}, - {Name: "NEWUTS"}, - {Name: "NEWIPC"}, - {Name: "NEWPID"}, - {Name: "NEWNET"}, - }, + Namespaces: configs.Namespaces([]configs.Namespace{ + {Type: configs.NEWNS}, + {Type: configs.NEWUTS}, + {Type: configs.NEWIPC}, + {Type: configs.NEWPID}, + {Type: configs.NEWNET}, + }), Cgroups: &cgroups.Cgroup{ Name: "test", Parent: "integration", diff --git a/linux_factory.go b/linux_factory.go index 542331d4..b88a66de 
100644 --- a/linux_factory.go +++ b/linux_factory.go @@ -168,10 +168,11 @@ func (l *linuxFactory) loadContainerState(root string) (*configs.State, error) { func (f *linuxFactory) StartInitialization(pipefd uintptr) (err error) { pipe := os.NewFile(uintptr(pipefd), "pipe") + setupUserns := os.Getenv("_LIBCONTAINER_USERNS") pid := os.Getenv("_LIBCONTAINER_INITPID") - if pid != "" { + if pid != "" && setupUserns == "" { return namespaces.InitIn(pipe) } - return namespaces.Init(pipe) + return namespaces.Init(pipe, setupUserns != "") } diff --git a/mount/init.go b/mount/init.go index a2c3d520..91a27294 100644 --- a/mount/init.go +++ b/mount/init.go @@ -25,7 +25,7 @@ type mount struct { // InitializeMountNamespace sets up the devices, mount points, and filesystems for use inside a // new mount namespace. -func InitializeMountNamespace(rootfs, console string, sysReadonly bool, mountConfig *MountConfig) error { +func InitializeMountNamespace(rootfs, console string, sysReadonly bool, hostRootUid, hostRootGid int, mountConfig *MountConfig) error { var ( err error flag = syscall.MS_PRIVATE @@ -58,14 +58,17 @@ func InitializeMountNamespace(rootfs, console string, sysReadonly bool, mountCon return fmt.Errorf("create device nodes %s", err) } - if err := SetupPtmx(rootfs, console, mountConfig.MountLabel); err != nil { + if err := SetupPtmx(rootfs, console, mountConfig.MountLabel, hostRootUid, hostRootGid); err != nil { return err } // stdin, stdout and stderr could be pointing to /dev/null from parent namespace. // Re-open them inside this namespace. - if err := reOpenDevNull(rootfs); err != nil { - return fmt.Errorf("Failed to reopen /dev/null %s", err) + // FIXME: Need to fix this for user namespaces. 
+ if hostRootUid == 0 { + if err := reOpenDevNull(rootfs); err != nil { + return fmt.Errorf("Failed to reopen /dev/null %s", err) + } } if err := setupDevSymlinks(rootfs); err != nil { @@ -79,7 +82,7 @@ func InitializeMountNamespace(rootfs, console string, sysReadonly bool, mountCon if mountConfig.NoPivotRoot { err = MsMoveRoot(rootfs) } else { - err = PivotRoot(rootfs) + err = PivotRoot(rootfs, mountConfig.PivotDir) } if err != nil { diff --git a/mount/mount_config.go b/mount/mount_config.go index eef9b8ce..f19465e6 100644 --- a/mount/mount_config.go +++ b/mount/mount_config.go @@ -13,6 +13,11 @@ type MountConfig struct { // This is a common option when the container is running in ramdisk NoPivotRoot bool `json:"no_pivot_root,omitempty"` + // PivotDir allows a custom directory inside the container's root filesystem to be used as pivot, when NoPivotRoot is not set. + // When a custom PivotDir is not set, a temporary dir inside the root filesystem will be used. The pivot dir needs to be writeable. + // This is required when using read-only root filesystems. In these cases, a read/writeable path can be (bind) mounted somewhere inside the root filesystem to act as pivot. 
+ PivotDir string `json:"pivot_dir,omitempty"` + // ReadonlyFs will remount the container's rootfs as readonly where only externally mounted // bind mounts are writtable ReadonlyFs bool `json:"readonly_fs,omitempty"` diff --git a/mount/pivotroot.go b/mount/pivotroot.go index a88ed4a8..acc3be24 100644 --- a/mount/pivotroot.go +++ b/mount/pivotroot.go @@ -10,8 +10,15 @@ import ( "syscall" ) -func PivotRoot(rootfs string) error { - pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root") +func PivotRoot(rootfs, pivotBaseDir string) error { + if pivotBaseDir == "" { + pivotBaseDir = "/" + } + tmpDir := filepath.Join(rootfs, pivotBaseDir) + if err := os.MkdirAll(tmpDir, 0755); err != nil { + return fmt.Errorf("can't create tmp dir %s, error %v", tmpDir, err) + } + pivotDir, err := ioutil.TempDir(tmpDir, ".pivot_root") if err != nil { return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err) } @@ -25,7 +32,7 @@ func PivotRoot(rootfs string) error { } // path to pivot dir now changed, update - pivotDir = filepath.Join("/", filepath.Base(pivotDir)) + pivotDir = filepath.Join(pivotBaseDir, filepath.Base(pivotDir)) if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { return fmt.Errorf("unmount pivot_root dir %s", err) } diff --git a/mount/ptmx.go b/mount/ptmx.go index c316481a..5b558775 100644 --- a/mount/ptmx.go +++ b/mount/ptmx.go @@ -10,7 +10,7 @@ import ( "github.com/docker/libcontainer/console" ) -func SetupPtmx(rootfs, consolePath, mountLabel string) error { +func SetupPtmx(rootfs, consolePath, mountLabel string, hostRootUid, hostRootGid int) error { ptmx := filepath.Join(rootfs, "dev/ptmx") if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { return err @@ -21,7 +21,7 @@ func SetupPtmx(rootfs, consolePath, mountLabel string) error { } if consolePath != "" { - if err := console.Setup(rootfs, consolePath, mountLabel); err != nil { + if err := console.Setup(rootfs, consolePath, mountLabel, hostRootUid, hostRootGid); err != nil 
{ return err } } diff --git a/namespaces/create.go b/namespaces/create.go deleted file mode 100644 index 30de84ce..00000000 --- a/namespaces/create.go +++ /dev/null @@ -1,10 +0,0 @@ -package namespaces - -import ( - "os" - "os/exec" - - "github.com/docker/libcontainer/configs" -) - -type CreateCommand func(container *configs.Config, console, dataPath, init string, childPipe *os.File, args []string) *exec.Cmd diff --git a/namespaces/exec.go b/namespaces/exec.go index 68c3a2be..0c0f6cf6 100644 --- a/namespaces/exec.go +++ b/namespaces/exec.go @@ -4,6 +4,7 @@ package namespaces import ( "encoding/json" + "fmt" "io" "os" "os/exec" @@ -15,6 +16,99 @@ import ( "github.com/docker/libcontainer/system" ) +const ( + EXIT_SIGNAL_OFFSET = 128 +) + +func executeSetupCmd(args []string, ppid int, container *configs.Config, process *processArgs, networkState *network.NetworkState) error { + command := exec.Command(args[0], args[1:]...) + + parent, child, err := newInitPipe() + if err != nil { + return err + } + defer parent.Close() + command.ExtraFiles = []*os.File{child} + + command.Dir = container.RootFs + command.Env = append(command.Env, + fmt.Sprintf("_LIBCONTAINER_INITPID=%d", ppid), + fmt.Sprintf("_LIBCONTAINER_USERNS=1")) + + err = command.Start() + child.Close() + if err != nil { + return err + } + + s, err := command.Process.Wait() + if err != nil { + return err + } + if !s.Success() { + return &exec.ExitError{s} + } + + decoder := json.NewDecoder(parent) + var pid *pid + + if err := decoder.Decode(&pid); err != nil { + return err + } + + p, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + + terminate := func(terr error) error { + // TODO: log the errors for kill and wait + p.Kill() + p.Wait() + return terr + } + + encoder := json.NewEncoder(parent) + + if err := encoder.Encode(container); err != nil { + return terminate(err) + } + + if err := encoder.Encode(process); err != nil { + return terminate(err) + } + + // send the state to the container's 
init process then shutdown writes for the parent + if err := encoder.Encode(networkState); err != nil { + return terminate(err) + } + + // shutdown writes for the parent side of the pipe + if err := syscall.Shutdown(int(parent.Fd()), syscall.SHUT_WR); err != nil { + return terminate(err) + } + + // wait for the child process to fully complete and receive an error message + // if one was encountered + var ierr *initError + if err := decoder.Decode(&ierr); err != nil && err != io.EOF { + return terminate(err) + } + if ierr != nil { + return ierr + } + + s, err = p.Wait() + if err != nil { + return err + } + if !s.Success() { + return &exec.ExitError{s} + } + + return nil +} + // TODO(vishh): This is part of the libcontainer API and it does much more than just namespaces related work. // Move this to libcontainer package. // Exec performs setup outside of a namespace so that a container can be @@ -32,16 +126,35 @@ func Exec(args []string, env []string, console string, command *exec.Cmd, contai command.Dir = container.RootFs command.SysProcAttr.Cloneflags = uintptr(GetNamespaceFlags(container.Namespaces)) + if container.Namespaces.Contains(configs.NEWUSER) { + AddUidGidMappings(command.SysProcAttr, container) + + // Default to root user when user namespaces are enabled. 
+ if command.SysProcAttr.Credential == nil { + command.SysProcAttr.Credential = &syscall.Credential{} + } + } + if err := command.Start(); err != nil { child.Close() return err } child.Close() + wait := func() (*os.ProcessState, error) { + ps, err := command.Process.Wait() + // we should kill all processes in cgroup when init is died if we use + // host PID namespace + if !container.Namespaces.Contains(configs.NEWPID) { + killAllPids(cgroupManager) + } + return ps, err + } + terminate := func(terr error) error { // TODO: log the errors for kill and wait command.Process.Kill() - command.Wait() + wait() return terr } @@ -81,6 +194,14 @@ func Exec(args []string, env []string, console string, command *exec.Cmd, contai if err := InitializeNetworking(container, command.Process.Pid, &networkState); err != nil { return terminate(err) } + + // Start the setup process to setup the init process + if container.Namespaces.Contains(configs.NEWUSER) { + if err = executeSetupCmd(command.Args, command.Process.Pid, container, &process, &networkState); err != nil { + return terminate(err) + } + } + // send the state to the container's init process then shutdown writes for the parent if err := encoder.Encode(networkState); err != nil { return terminate(err) @@ -108,6 +229,101 @@ func Exec(args []string, env []string, console string, command *exec.Cmd, contai return nil } +// killAllPids iterates over all of the container's processes +// sending a SIGKILL to each process. 
+func killAllPids(m cgroups.Manager) error { + var ( + procs []*os.Process + ) + m.Freeze(cgroups.Frozen) + pids, err := m.GetPids() + if err != nil { + return err + } + for _, pid := range pids { + // TODO: log err without aborting if we are unable to find + // a single PID + if p, err := os.FindProcess(pid); err == nil { + procs = append(procs, p) + p.Kill() + } + } + m.Freeze(cgroups.Thawed) + for _, p := range procs { + p.Wait() + } + return err +} + +// Utility function that gets a host ID for a container ID from user namespace map +// if that ID is present in the map. +func hostIDFromMapping(containerID int, uMap []configs.IDMap) (int, bool) { + for _, m := range uMap { + if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) { + hostID := m.HostID + (containerID - m.ContainerID) + return hostID, true + } + } + return -1, false +} + +// Gets the root gid for the process on host which could be non-zero +// when user namespaces are enabled. +func GetHostRootGid(container *configs.Config) (int, error) { + if container.Namespaces.Contains(configs.NEWUSER) { + if container.GidMappings == nil { + return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.") + } + hostRootGid, found := hostIDFromMapping(0, container.GidMappings) + if !found { + return -1, fmt.Errorf("User namespaces enabled, but no root user mapping found.") + } + return hostRootGid, nil + } + + // Return default root gid 0 + return 0, nil +} + +// Gets the root uid for the process on host which could be non-zero +// when user namespaces are enabled. 
+func GetHostRootUid(container *configs.Config) (int, error) { + if container.Namespaces.Contains(configs.NEWUSER) { + if container.UidMappings == nil { + return -1, fmt.Errorf("User namespaces enabled, but no user mappings found.") + } + hostRootUid, found := hostIDFromMapping(0, container.UidMappings) + if !found { + return -1, fmt.Errorf("User namespaces enabled, but no root user mapping found.") + } + return hostRootUid, nil + } + + // Return default root uid 0 + return 0, nil +} + +// Converts IDMap to SysProcIDMap array and adds it to SysProcAttr. +func AddUidGidMappings(sys *syscall.SysProcAttr, container *configs.Config) { + if container.UidMappings != nil { + sys.UidMappings = make([]syscall.SysProcIDMap, len(container.UidMappings)) + for i, um := range container.UidMappings { + sys.UidMappings[i].ContainerID = um.ContainerID + sys.UidMappings[i].HostID = um.HostID + sys.UidMappings[i].Size = um.Size + } + } + + if container.GidMappings != nil { + sys.GidMappings = make([]syscall.SysProcIDMap, len(container.GidMappings)) + for i, gm := range container.GidMappings { + sys.GidMappings[i].ContainerID = gm.ContainerID + sys.GidMappings[i].HostID = gm.HostID + sys.GidMappings[i].Size = gm.Size + } + } +} + // InitializeNetworking creates the container's network stack outside of the namespace and moves // interfaces into the container's net namespaces if necessary func InitializeNetworking(container *configs.Config, nspid int, networkState *network.NetworkState) error { diff --git a/namespaces/execin.go b/namespaces/execin.go index cff1cca7..514ba999 100644 --- a/namespaces/execin.go +++ b/namespaces/execin.go @@ -8,12 +8,16 @@ import ( "io/ioutil" "os" "os/exec" + "syscall" "github.com/docker/libcontainer/apparmor" "github.com/docker/libcontainer/cgroups" "github.com/docker/libcontainer/configs" "github.com/docker/libcontainer/label" + "github.com/docker/libcontainer/mount" + "github.com/docker/libcontainer/network" "github.com/docker/libcontainer/system" + 
"github.com/docker/libcontainer/utils" ) type pid struct { @@ -140,6 +144,10 @@ func FinalizeSetns(container *configs.Config) error { return err } + if err := setupRlimits(container); err != nil { + return fmt.Errorf("setup rlimits %s", err) + } + if err := FinalizeNamespace(container); err != nil { return err } @@ -157,6 +165,68 @@ func FinalizeSetns(container *configs.Config) error { return nil } +// SetupContainer is run to setup mounts and networking related operations +// for a user namespace enabled process as a user namespace root doesn't +// have permissions to perform these operations. +// The setup process joins all the namespaces of user namespace enabled init +// except the user namespace, so it run as root in the root user namespace +// to perform these operations. +func SetupContainer(container *configs.Config, networkState *network.NetworkState, consolePath string) error { + rootfs, err := utils.ResolveRootfs(container.RootFs) + if err != nil { + return err + } + + // clear the current processes env and replace it with the environment + // defined on the container + if err := LoadContainerEnvironment(container); err != nil { + return err + } + + cloneFlags := GetNamespaceFlags(container.Namespaces) + + if (cloneFlags & syscall.CLONE_NEWNET) == 0 { + if len(container.Networks) != 0 || len(container.Routes) != 0 { + return fmt.Errorf("unable to apply network parameters without network namespace") + } + } else { + if err := setupNetwork(container, networkState); err != nil { + return fmt.Errorf("setup networking %s", err) + } + if err := setupRoute(container); err != nil { + return fmt.Errorf("setup route %s", err) + } + } + + label.Init() + + hostRootUid, err := GetHostRootUid(container) + if err != nil { + return fmt.Errorf("failed to get hostRootUid %s", err) + } + + hostRootGid, err := GetHostRootGid(container) + if err != nil { + return fmt.Errorf("failed to get hostRootGid %s", err) + } + + // InitializeMountNamespace() can be executed only for a 
new mount namespace + if (cloneFlags & syscall.CLONE_NEWNS) == 0 { + if container.MountConfig != nil { + return fmt.Errorf("mount config is set without mount namespace") + } + } else if err := mount.InitializeMountNamespace(rootfs, + consolePath, + container.RestrictSys, + hostRootUid, + hostRootGid, + (*mount.MountConfig)(container.MountConfig)); err != nil { + return fmt.Errorf("setup mount namespace %s", err) + } + + return nil +} + func EnterCgroups(state *configs.State, pid int) error { return cgroups.EnterPid(state.CgroupPaths, pid) } diff --git a/namespaces/init.go b/namespaces/init.go index 441b3c34..01f72114 100644 --- a/namespaces/init.go +++ b/namespaces/init.go @@ -37,7 +37,7 @@ type processArgs struct { // and other options required for the new container. // The caller of Init function has to ensure that the go runtime is locked to an OS thread // (using runtime.LockOSThread) else system calls like setns called within Init may not work as intended. -func Init(pipe *os.File) (err error) { +func Init(pipe *os.File, setupUserns bool) (err error) { defer func() { // if we have an error during the initialization of the container's init then send it back to the // parent process in the form of an initError. 
@@ -72,6 +72,29 @@ func Init(pipe *os.File) (err error) { return err } + // We always read this as it is a way to sync with the parent as well + var networkState *network.NetworkState + if err := decoder.Decode(&networkState); err != nil { + return err + } + + if setupUserns { + err = SetupContainer(container, networkState, process.ConsolePath) + if err == nil { + os.Exit(0) + } else { + os.Exit(1) + } + } + + if container.Namespaces.Contains(configs.NEWUSER) { + return initUserNs(container, uncleanRootfs, process, networkState) + } else { + return initDefault(container, uncleanRootfs, process, networkState) + } +} + +func initDefault(container *configs.Config, uncleanRootfs string, process *processArgs, networkState *network.NetworkState) (err error) { rootfs, err := utils.ResolveRootfs(uncleanRootfs) if err != nil { return err @@ -83,11 +106,6 @@ func Init(pipe *os.File) (err error) { return err } - // We always read this as it is a way to sync with the parent as well - var networkState *network.NetworkState - if err := decoder.Decode(&networkState); err != nil { - return err - } // join any namespaces via a path to the namespace fd if provided if err := joinExistingNamespaces(container.Namespaces); err != nil { return err @@ -106,11 +124,19 @@ func Init(pipe *os.File) (err error) { } } - if err := setupNetwork(container, networkState); err != nil { - return fmt.Errorf("setup networking %s", err) - } - if err := setupRoute(container); err != nil { - return fmt.Errorf("setup route %s", err) + cloneFlags := GetNamespaceFlags(container.Namespaces) + + if (cloneFlags & syscall.CLONE_NEWNET) == 0 { + if len(container.Networks) != 0 || len(container.Routes) != 0 { + return fmt.Errorf("unable to apply network parameters without network namespace") + } + } else { + if err := setupNetwork(container, networkState); err != nil { + return fmt.Errorf("setup networking %s", err) + } + if err := setupRoute(container); err != nil { + return fmt.Errorf("setup route %s", err) + } 
} if err := setupRlimits(container); err != nil { @@ -119,14 +145,24 @@ func Init(pipe *os.File) (err error) { label.Init() - if err := mount.InitializeMountNamespace(rootfs, + // InitializeMountNamespace() can be executed only for a new mount namespace + if (cloneFlags & syscall.CLONE_NEWNS) == 0 { + if container.MountConfig != nil { + return fmt.Errorf("mount config is set without mount namespace") + } + } else if err := mount.InitializeMountNamespace(rootfs, process.ConsolePath, container.RestrictSys, + 0, // Default Root Uid + 0, // Default Root Gid (*mount.MountConfig)(container.MountConfig)); err != nil { return fmt.Errorf("setup mount namespace %s", err) } if container.Hostname != "" { + if (cloneFlags & syscall.CLONE_NEWUTS) == 0 { + return fmt.Errorf("unable to set the hostname without UTS namespace") + } if err := syscall.Sethostname([]byte(container.Hostname)); err != nil { return fmt.Errorf("unable to sethostname %q: %s", container.Hostname, err) } @@ -142,6 +178,88 @@ func Init(pipe *os.File) (err error) { // TODO: (crosbymichael) make this configurable at the Config level if container.RestrictSys { + if (cloneFlags & syscall.CLONE_NEWNS) == 0 { + return fmt.Errorf("unable to restrict access to kernel files without mount namespace") + } + if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil { + return err + } + } + + pdeathSignal, err := system.GetParentDeathSignal() + if err != nil { + return fmt.Errorf("get parent death signal %s", err) + } + + if err := FinalizeNamespace(container); err != nil { + return fmt.Errorf("finalize namespace %s", err) + } + + // FinalizeNamespace can change user/group which clears the parent death + // signal, so we restore it here. 
+ if err := RestoreParentDeathSignal(pdeathSignal); err != nil { + return fmt.Errorf("restore parent death signal %s", err) + } + + return system.Execv(process.Args[0], process.Args[0:], process.Env) +} + +func initUserNs(container *configs.Config, uncleanRootfs string, process *processArgs, networkState *network.NetworkState) (err error) { + // clear the current processes env and replace it with the environment + // defined on the container + if err := LoadContainerEnvironment(container); err != nil { + return err + } + + // join any namespaces via a path to the namespace fd if provided + if err := joinExistingNamespaces(container.Namespaces); err != nil { + return err + } + if process.ConsolePath != "" { + if err := console.OpenAndDup("/dev/console"); err != nil { + return err + } + } + if _, err := syscall.Setsid(); err != nil { + return fmt.Errorf("setsid %s", err) + } + if process.ConsolePath != "" { + if err := system.Setctty(); err != nil { + return fmt.Errorf("setctty %s", err) + } + } + + if container.WorkingDir == "" { + container.WorkingDir = "/" + } + + if err := setupRlimits(container); err != nil { + return fmt.Errorf("setup rlimits %s", err) + } + + cloneFlags := GetNamespaceFlags(container.Namespaces) + + if container.Hostname != "" { + if (cloneFlags & syscall.CLONE_NEWUTS) == 0 { + return fmt.Errorf("unable to set the hostname without UTS namespace") + } + if err := syscall.Sethostname([]byte(container.Hostname)); err != nil { + return fmt.Errorf("unable to sethostname %q: %s", container.Hostname, err) + } + } + + if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil { + return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err) + } + + if err := label.SetProcessLabel(container.ProcessLabel); err != nil { + return fmt.Errorf("set process label %s", err) + } + + if container.RestrictSys { + if (cloneFlags & syscall.CLONE_NEWNS) == 0 { + return fmt.Errorf("unable to restrict access to kernel files without mount 
namespace") + } if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil { return err } @@ -194,7 +312,7 @@ func RestoreParentDeathSignal(old int) error { } // SetupUser changes the groups, gid, and uid for the user inside the container -func SetupUser(u string) error { +func SetupUser(container *configs.Config) error { // Set up defaults. defaultExecUser := user.ExecUser{ Uid: syscall.Getuid(), @@ -202,22 +320,24 @@ func SetupUser(u string) error { Home: "/", } - passwdFile, err := user.GetPasswdFile() + passwdPath, err := user.GetPasswdPath() if err != nil { return err } - groupFile, err := user.GetGroupFile() + groupPath, err := user.GetGroupPath() if err != nil { return err } - execUser, err := user.GetExecUserFile(u, &defaultExecUser, passwdFile, groupFile) + execUser, err := user.GetExecUserPath(container.User, &defaultExecUser, passwdPath, groupPath) if err != nil { return fmt.Errorf("get supplementary groups %s", err) } - if err := syscall.Setgroups(execUser.Sgids); err != nil { + suppGroups := append(execUser.Sgids, container.AdditionalGroups...) 
+ + if err := syscall.Setgroups(suppGroups); err != nil { return fmt.Errorf("setgroups %s", err) } @@ -297,7 +417,7 @@ func FinalizeNamespace(container *configs.Config) error { return fmt.Errorf("set keep caps %s", err) } - if err := SetupUser(container.User); err != nil { + if err := SetupUser(container); err != nil { return fmt.Errorf("setup user %s", err) } @@ -342,7 +462,7 @@ func joinExistingNamespaces(namespaces []configs.Namespace) error { if err != nil { return err } - err = system.Setns(f.Fd(), uintptr(namespaceInfo[ns.Name])) + err = system.Setns(f.Fd(), uintptr(namespaceInfo[ns.Type])) f.Close() if err != nil { return err diff --git a/namespaces/utils.go b/namespaces/utils.go index 4aa590fd..978a02d8 100644 --- a/namespaces/utils.go +++ b/namespaces/utils.go @@ -17,13 +17,13 @@ func (i initError) Error() string { return i.Message } -var namespaceInfo = map[string]int{ - "NEWNET": syscall.CLONE_NEWNET, - "NEWNS": syscall.CLONE_NEWNS, - "NEWUSER": syscall.CLONE_NEWUSER, - "NEWIPC": syscall.CLONE_NEWIPC, - "NEWUTS": syscall.CLONE_NEWUTS, - "NEWPID": syscall.CLONE_NEWPID, +var namespaceInfo = map[configs.NamespaceType]int{ + configs.NEWNET: syscall.CLONE_NEWNET, + configs.NEWNS: syscall.CLONE_NEWNS, + configs.NEWUSER: syscall.CLONE_NEWUSER, + configs.NEWIPC: syscall.CLONE_NEWIPC, + configs.NEWUTS: syscall.CLONE_NEWUTS, + configs.NEWPID: syscall.CLONE_NEWPID, } // New returns a newly initialized Pipe for communication between processes @@ -36,10 +36,13 @@ func newInitPipe() (parent *os.File, child *os.File, err error) { } // GetNamespaceFlags parses the container's Namespaces options to set the correct -// flags on clone, unshare, and setns -func GetNamespaceFlags(namespaces []configs.Namespace) (flag int) { +// flags on clone, unshare. This functions returns flags only for new namespaces. 
+func GetNamespaceFlags(namespaces configs.Namespaces) (flag int) { for _, v := range namespaces { - flag |= namespaceInfo[v.Name] + if v.Path != "" { + continue + } + flag |= namespaceInfo[v.Type] } return flag } diff --git a/netlink/netlink_linux.go b/netlink/netlink_linux.go index 1bf70430..3cc3cc94 100644 --- a/netlink/netlink_linux.go +++ b/netlink/netlink_linux.go @@ -522,11 +522,10 @@ func NetworkSetMacAddress(iface *net.Interface, macaddr string) error { var ( MULTICAST byte = 0x1 - LOCALOUI byte = 0x2 ) - if hwaddr[0]&0x1 == MULTICAST || hwaddr[0]&0x2 != LOCALOUI { - return fmt.Errorf("Incorrect Local MAC Address specified: %s", macaddr) + if hwaddr[0]&0x1 == MULTICAST { + return fmt.Errorf("Multicast MAC Address is not supported: %s", macaddr) } wb := newNetlinkRequest(syscall.RTM_SETLINK, syscall.NLM_F_ACK) diff --git a/network/network.go b/network/network.go index ba8f6f74..40b25b13 100644 --- a/network/network.go +++ b/network/network.go @@ -88,6 +88,18 @@ func SetInterfaceIp(name string, rawIp string) error { return netlink.NetworkLinkAddIp(iface, ip, ipNet) } +func DeleteInterfaceIp(name string, rawIp string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + ip, ipNet, err := net.ParseCIDR(rawIp) + if err != nil { + return err + } + return netlink.NetworkLinkDelIp(iface, ip, ipNet) +} + func SetMtu(name string, mtu int) error { iface, err := net.InterfaceByName(name) if err != nil { diff --git a/cgroups/fs/notify_linux.go b/notify_linux.go similarity index 54% rename from cgroups/fs/notify_linux.go rename to notify_linux.go index d92063ba..059ce513 100644 --- a/cgroups/fs/notify_linux.go +++ b/notify_linux.go @@ -1,33 +1,30 @@ // +build linux -package fs +package libcontainer import ( "fmt" + "github.com/docker/libcontainer/configs" + "io/ioutil" "os" "path/filepath" "syscall" - - "github.com/docker/libcontainer/cgroups" ) -// NotifyOnOOM sends signals on the returned channel when the cgroup reaches -// its memory 
limit. The channel is closed when the cgroup is removed. -func NotifyOnOOM(c *cgroups.Cgroup) (<-chan struct{}, error) { - d, err := getCgroupData(c, 0) +const oomCgroupName = "memory" + +// NotifyOnOOM returns channel on which you can expect event about OOM, +// if process died without OOM this channel will be closed. +// s is current *libcontainer.State for container. +func NotifyOnOOM(s *configs.State) (<-chan struct{}, error) { + dir := s.CgroupPaths[oomCgroupName] + if dir == "" { + return nil, fmt.Errorf("There is no path for %q in state", oomCgroupName) + } + oomControl, err := os.Open(filepath.Join(dir, "memory.oom_control")) if err != nil { return nil, err } - - return notifyOnOOM(d) -} - -func notifyOnOOM(d *data) (<-chan struct{}, error) { - dir, err := d.path("memory") - if err != nil { - return nil, err - } - fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0) if syserr != 0 { return nil, syserr @@ -35,48 +32,32 @@ func notifyOnOOM(d *data) (<-chan struct{}, error) { eventfd := os.NewFile(fd, "eventfd") - oomControl, err := os.Open(filepath.Join(dir, "memory.oom_control")) - if err != nil { - eventfd.Close() - return nil, err - } - - var ( - eventControlPath = filepath.Join(dir, "cgroup.event_control") - data = fmt.Sprintf("%d %d", eventfd.Fd(), oomControl.Fd()) - ) - - if err := writeFile(dir, "cgroup.event_control", data); err != nil { + eventControlPath := filepath.Join(dir, "cgroup.event_control") + data := fmt.Sprintf("%d %d", eventfd.Fd(), oomControl.Fd()) + if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil { eventfd.Close() oomControl.Close() return nil, err } - ch := make(chan struct{}) - go func() { defer func() { close(ch) eventfd.Close() oomControl.Close() }() - buf := make([]byte, 8) - for { if _, err := eventfd.Read(buf); err != nil { return } - // When a cgroup is destroyed, an event is sent to eventfd. // So if the control path is gone, return instead of notifying. 
if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) { return } - ch <- struct{}{} } }() - return ch, nil } diff --git a/cgroups/fs/notify_linux_test.go b/notify_linux_test.go similarity index 66% rename from cgroups/fs/notify_linux_test.go rename to notify_linux_test.go index a11880cb..8a3026ed 100644 --- a/cgroups/fs/notify_linux_test.go +++ b/notify_linux_test.go @@ -1,38 +1,50 @@ // +build linux -package fs +package libcontainer import ( "encoding/binary" "fmt" + "io/ioutil" + "os" + "path/filepath" "syscall" "testing" "time" + + "github.com/docker/libcontainer/configs" ) func TestNotifyOnOOM(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - - helper.writeFileContents(map[string]string{ - "memory.oom_control": "", - "cgroup.event_control": "", - }) - + memoryPath, err := ioutil.TempDir("", "testnotifyoom-") + if err != nil { + t.Fatal(err) + } + oomPath := filepath.Join(memoryPath, "memory.oom_control") + eventPath := filepath.Join(memoryPath, "cgroup.event_control") + if err := ioutil.WriteFile(oomPath, []byte{}, 0700); err != nil { + t.Fatal(err) + } + if err := ioutil.WriteFile(eventPath, []byte{}, 0700); err != nil { + t.Fatal(err) + } var eventFd, oomControlFd int - - ooms, err := notifyOnOOM(helper.CgroupData) + st := &configs.State{ + CgroupPaths: map[string]string{ + "memory": memoryPath, + }, + } + ooms, err := NotifyOnOOM(st) if err != nil { t.Fatal("expected no error, got:", err) } - memoryPath, _ := helper.CgroupData.path("memory") - data, err := readFile(memoryPath, "cgroup.event_control") + data, err := ioutil.ReadFile(eventPath) if err != nil { t.Fatal("couldn't read event control file:", err) } - if _, err := fmt.Sscanf(data, "%d %d", &eventFd, &oomControlFd); err != nil { + if _, err := fmt.Sscanf(string(data), "%d %d", &eventFd, &oomControlFd); err != nil { t.Fatalf("invalid control data %q: %s", data, err) } @@ -62,7 +74,9 @@ func TestNotifyOnOOM(t *testing.T) { // simulate what happens when a cgroup 
is destroyed by cleaning up and then // writing to the eventfd. - helper.cleanup() + if err := os.RemoveAll(memoryPath); err != nil { + t.Fatal(err) + } if _, err := syscall.Write(efd, buf); err != nil { t.Fatal("unable to write to eventfd:", err) } diff --git a/nsinit/main.go b/nsinit/main.go index d1e4bf1e..2de7bc3e 100644 --- a/nsinit/main.go +++ b/nsinit/main.go @@ -26,11 +26,12 @@ func main() { app.Before = preload app.Commands = []cli.Command{ + configCommand, execCommand, initCommand, - statsCommand, - configCommand, + oomCommand, pauseCommand, + statsCommand, unpauseCommand, } diff --git a/nsinit/oom.go b/nsinit/oom.go new file mode 100644 index 00000000..f7a333d4 --- /dev/null +++ b/nsinit/oom.go @@ -0,0 +1,29 @@ +package main + +import ( + "log" + + "github.com/codegangsta/cli" + "github.com/docker/libcontainer" + "github.com/docker/libcontainer/configs" +) + +var oomCommand = cli.Command{ + Name: "oom", + Usage: "display oom notifications for a container", + Action: oomAction, +} + +func oomAction(context *cli.Context) { + state, err := configs.GetState(dataPath) + if err != nil { + log.Fatal(err) + } + n, err := libcontainer.NotifyOnOOM(state) + if err != nil { + log.Fatal(err) + } + for range n { + log.Printf("OOM notification received") + } +} diff --git a/sample_configs/apparmor.json b/sample_configs/apparmor.json index 50421ec8..96f73cb7 100644 --- a/sample_configs/apparmor.json +++ b/sample_configs/apparmor.json @@ -177,11 +177,11 @@ ], "hostname": "koye", "namespaces": [ - {"name":"NEWIPC"}, - {"name": "NEWNET"}, - {"name": "NEWNS"}, - {"name": "NEWPID"}, - {"name": "NEWUTS"} + {"type":"NEWIPC"}, + {"type": "NEWNET"}, + {"type": "NEWNS"}, + {"type": "NEWPID"}, + {"type": "NEWUTS"} ], "networks": [ { diff --git a/sample_configs/attach_to_bridge.json b/sample_configs/attach_to_bridge.json index 9b190293..e5c03a7e 100644 --- a/sample_configs/attach_to_bridge.json +++ b/sample_configs/attach_to_bridge.json @@ -176,11 +176,11 @@ ], "hostname": "koye", 
"namespaces": [ - {"name": "NEWIPC"}, - {"name": "NEWNET"}, - {"name": "NEWNS"}, - {"name": "NEWPID"}, - {"name": "NEWUTS"} + {"type": "NEWIPC"}, + {"type": "NEWNET"}, + {"type": "NEWNS"}, + {"type": "NEWPID"}, + {"type": "NEWUTS"} ], "networks": [ { diff --git a/sample_configs/host-pid.json b/sample_configs/host-pid.json new file mode 100644 index 00000000..f47af930 --- /dev/null +++ b/sample_configs/host-pid.json @@ -0,0 +1,200 @@ +{ + "capabilities": [ + "CHOWN", + "DAC_OVERRIDE", + "FOWNER", + "MKNOD", + "NET_RAW", + "SETGID", + "SETUID", + "SETFCAP", + "SETPCAP", + "NET_BIND_SERVICE", + "SYS_CHROOT", + "KILL" + ], + "cgroups": { + "allowed_devices": [ + { + "cgroup_permissions": "m", + "major_number": -1, + "minor_number": -1, + "type": 99 + }, + { + "cgroup_permissions": "m", + "major_number": -1, + "minor_number": -1, + "type": 98 + }, + { + "cgroup_permissions": "rwm", + "major_number": 5, + "minor_number": 1, + "path": "/dev/console", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 4, + "path": "/dev/tty0", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 4, + "minor_number": 1, + "path": "/dev/tty1", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 136, + "minor_number": -1, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 5, + "minor_number": 2, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 10, + "minor_number": 200, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 3, + "path": "/dev/null", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 5, + "path": "/dev/zero", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 7, + "path": "/dev/full", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 5, + "path": "/dev/tty", + 
"type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 9, + "path": "/dev/urandom", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 8, + "path": "/dev/random", + "type": 99 + } + ], + "name": "docker-koye", + "parent": "docker" + }, + "restrict_sys": true, + "mount_config": { + "device_nodes": [ + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 3, + "path": "/dev/null", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 5, + "path": "/dev/zero", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 7, + "path": "/dev/full", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 5, + "path": "/dev/tty", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 9, + "path": "/dev/urandom", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 8, + "path": "/dev/random", + "type": 99 + } + ], + "mounts": [ + { + "type": "tmpfs", + "destination": "/tmp" + } + ] + }, + "environment": [ + "HOME=/", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "HOSTNAME=koye", + "TERM=xterm" + ], + "hostname": "koye", + "namespaces": [ + {"type": "NEWIPC"}, + {"type": "NEWNET"}, + {"type": "NEWNS"}, + {"type": "NEWUTS"} + ], + "networks": [ + { + "address": "127.0.0.1/0", + "gateway": "localhost", + "mtu": 1500, + "type": "loopback" + } + ], + "tty": true, + "user": "daemon" +} diff --git a/sample_configs/minimal.json b/sample_configs/minimal.json index 720be64f..01de4674 100644 --- a/sample_configs/minimal.json +++ b/sample_configs/minimal.json @@ -182,11 +182,11 @@ ], "hostname": "koye", "namespaces": [ - {"name": "NEWIPC"}, - {"name": 
"NEWNET"}, - {"name": "NEWNS"}, - {"name": "NEWPID"}, - {"name": "NEWUTS"} + {"type": "NEWIPC"}, + {"type": "NEWNET"}, + {"type": "NEWNS"}, + {"type": "NEWPID"}, + {"type": "NEWUTS"} ], "networks": [ { diff --git a/sample_configs/route_source_address_selection.json b/sample_configs/route_source_address_selection.json index f403996d..9c62045a 100644 --- a/sample_configs/route_source_address_selection.json +++ b/sample_configs/route_source_address_selection.json @@ -176,11 +176,11 @@ ], "hostname": "koye", "namespaces": [ - {"name": "NEWIPC"}, - {"name": "NEWNET"}, - {"name": "NEWNS"}, - {"name": "NEWPID"}, - {"name": "NEWUTS"} + {"type": "NEWIPC"}, + {"type": "NEWNET"}, + {"type": "NEWNS"}, + {"type": "NEWPID"}, + {"type": "NEWUTS"} ], "networks": [ { diff --git a/sample_configs/selinux.json b/sample_configs/selinux.json index cfb83e09..15556488 100644 --- a/sample_configs/selinux.json +++ b/sample_configs/selinux.json @@ -178,11 +178,11 @@ ], "hostname": "koye", "namespaces": [ - {"name": "NEWIPC"}, - {"name": "NEWNET"}, - {"name": "NEWNS"}, - {"name": "NEWPID"}, - {"name": "NEWUTS"} + {"type": "NEWIPC"}, + {"type": "NEWNET"}, + {"type": "NEWNS"}, + {"type": "NEWPID"}, + {"type": "NEWUTS"} ], "networks": [ { diff --git a/sample_configs/userns.json b/sample_configs/userns.json new file mode 100644 index 00000000..8c9c841f --- /dev/null +++ b/sample_configs/userns.json @@ -0,0 +1,251 @@ +{ + "capabilities": [ + "CHOWN", + "DAC_OVERRIDE", + "FOWNER", + "MKNOD", + "NET_RAW", + "SETGID", + "SETUID", + "SETFCAP", + "SETPCAP", + "NET_BIND_SERVICE", + "SYS_CHROOT", + "KILL" + ], + "cgroups": { + "allowed_devices": [ + { + "cgroup_permissions": "m", + "major_number": -1, + "minor_number": -1, + "type": 99 + }, + { + "cgroup_permissions": "m", + "major_number": -1, + "minor_number": -1, + "type": 98 + }, + { + "cgroup_permissions": "rwm", + "major_number": 5, + "minor_number": 1, + "path": "/dev/console", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 
4, + "path": "/dev/tty0", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 4, + "minor_number": 1, + "path": "/dev/tty1", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 136, + "minor_number": -1, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 5, + "minor_number": 2, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "major_number": 10, + "minor_number": 200, + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 3, + "path": "/dev/null", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 5, + "path": "/dev/zero", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 7, + "path": "/dev/full", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 5, + "path": "/dev/tty", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 9, + "path": "/dev/urandom", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 8, + "path": "/dev/random", + "type": 99 + } + ], + "name": "docker-koye", + "parent": "docker" + }, + "restrict_sys": true, + "mount_config": { + "device_nodes": [ + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 3, + "path": "/dev/null", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 5, + "path": "/dev/zero", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 7, + "path": "/dev/full", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 5, + "path": "/dev/tty", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + 
"minor_number": 9, + "path": "/dev/urandom", + "type": 99 + }, + { + "cgroup_permissions": "rwm", + "file_mode": 438, + "major_number": 1, + "minor_number": 8, + "path": "/dev/random", + "type": 99 + } + ], + "mounts": [ + { + "type": "tmpfs", + "destination": "/tmp" + } + ] + }, + "environment": [ + "HOME=/", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "HOSTNAME=koye", + "TERM=xterm" + ], + "hostname": "koye", + "namespaces": [ + {"type": "NEWIPC"}, + {"type": "NEWNET"}, + {"type": "NEWNS"}, + {"type": "NEWPID"}, + {"type": "NEWUTS"}, + {"type": "NEWUSER"} + ], + "networks": [ + { + "address": "127.0.0.1/0", + "gateway": "localhost", + "mtu": 1500, + "type": "loopback" + }, + { + "address": "172.17.0.9/16", + "gateway": "172.17.42.1", + "bridge": "docker0", + "veth_prefix": "veth", + "mtu": 1500, + "type": "veth" + } + ], + "tty": true, + "user": "root", + "uid_mappings": [ + { + "container_id": 0, + "host_id": 1000, + "size": 1 + }, + { + "container_id": 1, + "host_id": 1, + "size": 999 + }, + { + "container_id": 1001, + "host_id": 1001, + "size": 9000 + } + ], + "gid_mappings": [ + { + "container_id": 0, + "host_id": 1000, + "size": 1 + }, + { + "container_id": 1, + "host_id": 1, + "size": 999 + }, + { + "container_id": 1001, + "host_id": 1001, + "size": 9000 + } + ], + "rlimits": [ + { + "type": 7, + "hard": 999, + "soft": 999 + } + ] +} diff --git a/user/MAINTAINERS b/user/MAINTAINERS index 18e05a30..edbe2006 100644 --- a/user/MAINTAINERS +++ b/user/MAINTAINERS @@ -1 +1,2 @@ Tianon Gravi (@tianon) +Aleksa Sarai (@cyphar) diff --git a/user/lookup_unix.go b/user/lookup_unix.go index 409c114e..758b734c 100644 --- a/user/lookup_unix.go +++ b/user/lookup_unix.go @@ -9,22 +9,22 @@ import ( // Unix-specific path to the passwd and group formatted files. 
const ( - unixPasswdFile = "/etc/passwd" - unixGroupFile = "/etc/group" + unixPasswdPath = "/etc/passwd" + unixGroupPath = "/etc/group" ) -func GetPasswdFile() (string, error) { - return unixPasswdFile, nil +func GetPasswdPath() (string, error) { + return unixPasswdPath, nil } func GetPasswd() (io.ReadCloser, error) { - return os.Open(unixPasswdFile) + return os.Open(unixPasswdPath) } -func GetGroupFile() (string, error) { - return unixGroupFile, nil +func GetGroupPath() (string, error) { + return unixGroupPath, nil } func GetGroup() (io.ReadCloser, error) { - return os.Open(unixGroupFile) + return os.Open(unixGroupPath) } diff --git a/user/lookup_unsupported.go b/user/lookup_unsupported.go index 0f15c57d..72179488 100644 --- a/user/lookup_unsupported.go +++ b/user/lookup_unsupported.go @@ -4,7 +4,7 @@ package user import "io" -func GetPasswdFile() (string, error) { +func GetPasswdPath() (string, error) { return "", ErrUnsupported } @@ -12,7 +12,7 @@ func GetPasswd() (io.ReadCloser, error) { return nil, ErrUnsupported } -func GetGroupFile() (string, error) { +func GetGroupPath() (string, error) { return "", ErrUnsupported } diff --git a/user/user.go b/user/user.go index 69387f2e..d7439f12 100644 --- a/user/user.go +++ b/user/user.go @@ -197,11 +197,11 @@ type ExecUser struct { Home string } -// GetExecUserFile is a wrapper for GetExecUser. It reads data from each of the +// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the // given file paths and uses that data as the arguments to GetExecUser. If the // files cannot be opened for any reason, the error is ignored and a nil // io.Reader is passed instead. -func GetExecUserFile(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) { +func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) { passwd, err := os.Open(passwdPath) if err != nil { passwd = nil