diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index a808740f..30514da3 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -46,6 +46,10 @@ "ImportPath": "github.com/golang/protobuf/proto", "Rev": "f7137ae6b19afbfd61a94b746fda3b3fe0491874" }, + { + "ImportPath": "github.com/opencontainers/specs", + "Rev": "89fbfc172945b685f28205bdd1bef2b738bc0b62" + }, { "ImportPath": "github.com/syndtr/gocapability/capability", "Rev": "e55e5833692b49e49a0073ad5baf7803f21bebf4" diff --git a/Godeps/_workspace/src/github.com/opencontainers/specs/README.md b/Godeps/_workspace/src/github.com/opencontainers/specs/README.md new file mode 100644 index 00000000..a3ae9587 --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/specs/README.md @@ -0,0 +1,45 @@ +# Open Container Specifications + +This project is where the Open Container Project Specifications are written. This is a work in progress. We should have a first draft by end of July 2015. + +Table of Contents + +- [Filesystem Bundle](bundle.md) +- [Container Configuration](config.md) + - [Linux Specific Configuration](config-linux.md) +- [Runtime and Lifecycle](runtime.md) + +# The 5 principles of Standard Containers + +Define a unit of software delivery called a Standard Container. The goal of a Standard Container is to encapsulate a software component and all its dependencies in a format that is self-describing and portable, so that any compliant runtime can run it without extra dependencies, regardless of the underlying machine and the contents of the container. + +The specification for Standard Containers is straightforward. It mostly defines 1) a file format, 2) a set of standard operations, and 3) an execution environment. + +A great analogy for this is the shipping container. Just like how Standard Containers are a fundamental unit of software delivery, shipping containers are a fundamental unit of physical delivery. + +## 1. 
Standard operations + +Just like shipping containers, Standard Containers define a set of STANDARD OPERATIONS. Shipping containers can be lifted, stacked, locked, loaded, unloaded and labelled. Similarly, Standard Containers can be created, started, and stopped using standard container tools (what this spec is about); copied and snapshotted using standard filesystem tools; and downloaded and uploaded using standard network tools. + +## 2. Content-agnostic + +Just like shipping containers, Standard Containers are CONTENT-AGNOSTIC: all standard operations have the same effect regardless of the contents. A shipping container will be stacked in exactly the same way whether it contains Vietnamese powder coffee or spare Maserati parts. Similarly, Standard Containers are started or uploaded in the same way whether they contain a postgres database, a php application with its dependencies and application server, or Java build artifacts. + +## 3. Infrastructure-agnostic + +Both types of containers are INFRASTRUCTURE-AGNOSTIC: they can be transported to thousands of facilities around the world, and manipulated by a wide variety of equipment. A shipping container can be packed in a factory in Ukraine, transported by truck to the nearest routing center, stacked onto a train, loaded into a German boat by an Australian-built crane, stored in a warehouse at a US facility, etc. Similarly, a standard container can be bundled on my laptop, uploaded to S3, downloaded, run and snapshotted by a build server at Equinix in Virginia, uploaded to 10 staging servers in a home-made Openstack cluster, then sent to 30 production instances across 3 EC2 regions. + +## 4. Designed for automation + +Because they offer the same standard operations regardless of content and infrastructure, Standard Containers, just like their physical counterparts, are extremely well-suited for automation. In fact, you could say automation is their secret weapon. 
+ +Many things that once required time-consuming and error-prone human effort can now be programmed. Before shipping containers, a bag of powder coffee was hauled, dragged, dropped, rolled and stacked by 10 different people in 10 different locations by the time it reached its destination. 1 out of 50 disappeared. 1 out of 20 was damaged. The process was slow, inefficient and cost a fortune - and was entirely different depending on the facility and the type of goods. + +Similarly, before Standard Containers, by the time a software component ran in production, it had been individually built, configured, bundled, documented, patched, vendored, templated, tweaked and instrumented by 10 different people on 10 different computers. Builds failed, libraries conflicted, mirrors crashed, post-it notes were lost, logs were misplaced, cluster updates were half-broken. The process was slow, inefficient and cost a fortune - and was entirely different depending on the language and infrastructure provider. + +## 5. Industrial-grade delivery + +There are 17 million shipping containers in existence, packed with every physical good imaginable. Every single one of them can be loaded onto the same boats, by the same cranes, in the same facilities, and sent anywhere in the World with incredible efficiency. It is embarrassing to think that a 30 ton shipment of coffee can safely travel half-way across the World in *less time* than it takes a software team to deliver its code from one datacenter to another sitting 10 miles away. + +With Standard Containers we can put an end to that embarrassment, by making INDUSTRIAL-GRADE DELIVERY of software a reality. 
+ diff --git a/Godeps/_workspace/src/github.com/opencontainers/specs/bundle.md b/Godeps/_workspace/src/github.com/opencontainers/specs/bundle.md new file mode 100644 index 00000000..897ad4db --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/specs/bundle.md @@ -0,0 +1,31 @@ +# Bundle Container Format + +This section defines a format for encoding a container as a *bundle* - a directory organized in a certain way, and containing all the necessary data and metadata for any compliant runtime to perform all standard operations against it. See also [OS X application bundles](http://en.wikipedia.org/wiki/Bundle_%28OS_X%29) for a similar use of the term *bundle*. + +The format does not define distribution. In other words, it only specifies how a container must be stored on a local filesystem, for consumption by a runtime. It does not specify how to transfer a container between computers, how to discover containers, or assign names or versions to them. Any distribution method capable of preserving the original layout of a container, as specified here, is considered compliant. + +A standard container bundle is made of the following 3 parts: + +- A top-level directory holding everything else +- One or more content directories +- A configuration file + +# Directory layout + +A Standard Container bundle is a directory containing all the content needed to load and run a container. This includes its configuration file, content directories, and cryptographic signatures. The main property of this directory layout is that it can be moved as a unit to another machine and run the same container. + +One or more *content directories* may be adjacent to the configuration file. This at least includes the root filesystem (referenced in the configuration by the *rootfs* field) and other related content (signatures, other configs, etc.). The interpretation of these resources is specified in the configuration. + +``` +/ +! +-- config.json +! +--- rootfs1 +! 
+--- rootfs2 +``` + +The syntax and semantics for config.json are described in this specification. + +One or more content directories can be specified as root file systems for containers. They COULD be called rootfs..10^100 but SHALL be called whatever you want. diff --git a/Godeps/_workspace/src/github.com/opencontainers/specs/config-linux.md b/Godeps/_workspace/src/github.com/opencontainers/specs/config-linux.md new file mode 100644 index 00000000..1ee02f31 --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/specs/config-linux.md @@ -0,0 +1,124 @@ +# Linux + +The Linux container specification uses various kernel features like namespaces, +cgroups, capabilities, LSM, and file system jails to fulfill the spec. +Additional information is needed for Linux over the default spec configuration +in order to configure these various kernel features. + +## Linux namespaces + +A namespace wraps a global system resource in an abstraction that makes it +appear to the processes within the namespace that they have their own isolated +instance of the global resource. Changes to the global resource are visible to +other processes that are members of the namespace, but are invisible to other +processes. For more information, see [the man page](http://man7.org/linux/man-pages/man7/namespaces.7.html) + +Namespaces are specified in the spec as an array of entries. Each entry has a +type field with possible values described below and an optional path element. +If a path is specified, that particular file is used to join that type of namespace. + +```json + "namespaces": [ + { + "type": "pid", + "path": "/proc/1234/ns/pid" + }, + { + "type": "net", + "path": "/var/run/netns/neta" + }, + { + "type": "mnt", + }, + { + "type": "ipc", + }, + { + "type": "uts", + }, + { + "type": "user", + }, + ] +``` + +#### Namespace types + +* **pid** processes inside the container will only be able to see other processes inside the same container. 
+* **network** the container will have it's own network stack. +* **mnt** the container will have an isolated mount table. +* **ipc** processes inside the container will only be able to communicate to other processes inside the same +container via system level IPC. +* **uts** the container will be able to have it's own hostname and domain name. +* **user** the container will be able to remap user and group IDs from the host to local users and groups +within the container. + +### Access to devices + +Devices is an array specifying the list of devices from the host to make available in the container. +By providing a device name within the list the runtime should look up the same device on the host's `/dev` +and collect information about the device node so that it can be recreated for the container. The runtime +should not only create the device inside the container but ensure that the root user inside +the container has access rights for the device. + +```json + "devices": [ + "null", + "random", + "full", + "tty", + "zero", + "urandom" + ] +``` + +## Linux control groups + +Also known as cgroups, they are used to restrict resource usage for a container and handle +device access. cgroups provide controls to restrict cpu, memory, IO, and network for +the container. For more information, see the [kernel cgroups documentation](https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt) + +## Linux capabilities + +Capabilities is an array that specifies Linux capabilities that can be provided to the process +inside the container. Valid values are the string after `CAP_` for capabilities defined +in [the man page](http://man7.org/linux/man-pages/man7/capabilities.7.html) + +```json + "capabilities": [ + "AUDIT_WRITE", + "KILL", + "NET_BIND_SERVICE" + ] +``` + +## Linux sysctl + +sysctl allows kernel parameters to be modified at runtime for the container. 
+For more information, see [the man page](http://man7.org/linux/man-pages/man8/sysctl.8.html) + +``` + "sysctl": { + "net.ipv4.ip_forward": "1", + "net.core.somaxconn": "256" + } +``` + +## Linux rlimits + +``` + "rlimits": [ + { + "type": "RLIMIT_NPROC", + "soft": 1024, + "hard": 102400 + } + ] +``` + +rlimits allow setting resource limits. The type is from the values defined in [the man page](http://man7.org/linux/man-pages/man2/setrlimit.2.html). The kernel enforces the soft limit for a resource while the hard limit acts as a ceiling for that value that could be set by an unprivileged process. + +## Security + +**TODO:** security profiles + diff --git a/Godeps/_workspace/src/github.com/opencontainers/specs/config.md b/Godeps/_workspace/src/github.com/opencontainers/specs/config.md new file mode 100644 index 00000000..9fc1b430 --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/specs/config.md @@ -0,0 +1,152 @@ +# Configuration file + +The container’s top-level directory MUST contain a configuration file called `config.json`. +For now the schema is defined in [spec.go](https://github.com/opencontainers/runc/blob/master/spec.go) and [spec_linux.go](https://github.com/opencontainers/runc/blob/master/spec_linux.go), this will be moved to a JSON schema overtime. + +The configuration file contains metadata necessary to implement standard operations against the container. +This includes the process to run, environment variables to inject, sandboxing features to use, etc. + +Below is a detailed description of each field defined in the configuration format. + +## Manifest version + +* **version** (string, required) must be in SemVer v2.0.0 format and specifies the version of the OCF specification with which the container bundle complies. The Open Container spec follows semantic versioning and retains forward and backward compatibility within major versions. 
For example, if an implementation is compliant with version 1.0.1 of the spec, it is compatible with the complete 1.x series. + +*Example* + +```json + "version": "0.1.0" +``` + +## Root Configuration + +Each container has exactly one *root filesystem*, specified in the *root* object: + +* **path** (string, required) Specifies the path to the root filesystem for the container, relative to the path where the manifest is. A directory MUST exist at the relative path declared by the field. +* **readonly** (bool, optional) If true then the root filesystem MUST be read-only inside the container. Defaults to false. + +*Example* + +```json +"root": { + "path": "rootfs", + "readonly": true +} +``` + +## Mount Configuration + +Additional filesystems can be declared as "mounts", specified in the *mounts* array. The parameters are similar to the ones in Linux mount system call. [http://linux.die.net/man/2/mount](http://linux.die.net/man/2/mount) + +* **type** (string, required) Linux, *filesystemtype* argument supported by the kernel are listed in */proc/filesystems* (e.g., "minix", "ext2", "ext3", "jfs", "xfs", "reiserfs", "msdos", "proc", "nfs", "iso9660"). Windows: ntfs +* **source** (string, required) a device name, but can also be a directory name or a dummy. Windows, the volume name that is the target of the mount point. \\?\Volume\{GUID}\ (on Windows source is called target) +* **destination** (string, required) where the source filesystem is mounted relative to the container rootfs. +* **options** (string, optional) in the fstab format [https://wiki.archlinux.org/index.php/Fstab](https://wiki.archlinux.org/index.php/Fstab). 
+ +*Example (Linux)* + +```json +"mounts": [ + { + "type": "proc", + "source": "proc", + "destination": "/proc", + "options": "" + }, + { + "type": "tmpfs", + "source": "tmpfs", + "destination": "/dev", + "options": "nosuid,strictatime,mode=755,size=65536k" + }, + { + "type": "devpts", + "source": "devpts", + "destination": "/dev/pts", + "options": "nosuid,noexec,newinstance,ptmxmode=0666,mode=0620,gid=5" + }, + { + "type": "bind", + "source": "/volumes/testing", + "destination": "/data", + "options": "rbind,rw" + } +] +``` + +*Example (Windows)* + +```json +"mounts": [ + { + "type": "ntfs", + "source": "\\?\Volume\{2eca078d-5cbc-43d3-aff8-7e8511f60d0e}\", + "destination": "C:\Users\crosbymichael\My Fancy Mount Point\", + "options": "" + } +] +``` + +See links for details about [mountvol](http://ss64.com/nt/mountvol.html) and [SetVolumeMountPoint](https://msdn.microsoft.com/en-us/library/windows/desktop/aa365561(v=vs.85).aspx) in Windows. + +## Process configuration + +* **terminal** (bool, optional) specifies whether you want a terminal attached to that process. Defaults to false. +* **cwd** (string, optional) is the working directory that will be set for the executable. +* **env** (array of strings, optional) contains a list of variables that will be set in the process's environment prior to execution. Elements in the array are specified as Strings in the form "KEY=value". The left hand side must consist solely of letters, digits, and underscores '_' as outlined in [IEEE Std 1003.1-2001](http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap08.html). +* **args** (string, required) executable to launch and any flags as an array. The executable is the first element and must be available at the given path inside of the rootfs. If the executable path is not an absolute path then the search $PATH is interpreted to find the executable. 
+ +The user for the process is a platform-specific structure that allows specific control over which user the process runs as. +For Linux-based systems the user structure has the following fields: + +* **uid** (int, required) specifies the user id. +* **gid** (int, required) specifies the group id. +* **additionalGids** (array of ints, optional) specifies additional group ids to be added to the process. + +*Example (Linux)* + +```json +"process": { + "terminal": true, + "user": { + "uid": 1, + "gid": 1, + "additionalGids": [] + }, + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "", + "args": [ + "sh" + ] +} +``` + + +## Hostname + +* **hostname** (string, optional) as it is accessible to processes running inside. + +*Example* + +```json +"hostname": "mrsdalloway" +``` + +## Platform-specific configuration + +* **os** (string, required) specifies the operating system family this image must run on. Values for arch must be in the list specified by the Go Language document for [$GOOS](https://golang.org/doc/install/source#environment). +* **arch** (string, required) specifies the instruction set for which the binaries in the image have been compiled. Values for arch must be in the list specified by the Go Language document for [$GOARCH](https://golang.org/doc/install/source#environment). + +```json +"platform": { + "os": "linux", + "arch": "amd64" +} +``` + +Interpretation of the platform section of the JSON file is used to find which platform specific section may be available in the document. For example if `os` is set to `linux` then the `linux` JSON object SHOULD be found in the `config.json`. 
+ + diff --git a/Godeps/_workspace/src/github.com/opencontainers/specs/runtime.md b/Godeps/_workspace/src/github.com/opencontainers/specs/runtime.md new file mode 100644 index 00000000..c5583089 --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/specs/runtime.md @@ -0,0 +1,17 @@ +# Runtime and Lifecycle + +## Lifecycle + +### Create + +Creates the container: file system, namespaces, cgroups, capabilities. + +### Start (process) + +Runs a process in a container. Can be invoked several times. + +### Stop (process) + +Not sure we need that from oc cli. Process is killed from the outside. + +This event needs to be captured by oc to run onstop event handlers. diff --git a/Godeps/_workspace/src/github.com/opencontainers/specs/spec.go b/Godeps/_workspace/src/github.com/opencontainers/specs/spec.go new file mode 100644 index 00000000..30b3c1b9 --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/specs/spec.go @@ -0,0 +1,63 @@ +package specs + +// Spec is the base configuration for the container. It specifies platform +// independent configuration. +type Spec struct { + // Version is the version of the specification that is supported. + Version string `json:"version"` + // Platform is the host information for OS and Arch. + Platform Platform `json:"platform"` + // Process is the container's main process. + Process Process `json:"process"` + // Root is the root information for the container's filesystem. + Root Root `json:"root"` + // Hostname is the containers host name. + Hostname string `json:"hostname"` + // Mounts profile configuration for adding mounts to the container's filesystem. + Mounts []Mount `json:"mounts"` +} + +// Mount specifies a mount for a container. +type Mount struct { + // Type specifies the mount kind. + Type string `json:"type"` + // Source specifies the source path of the mount. In the case of bind mounts on + // linux based systems this would be the file on the host. 
+ Source string `json:"source"` + // Destination is the path where the mount will be placed relative to the container's root. + Destination string `json:"destination"` + // Options are fstab style mount options. + Options string `json:"options"` +} + +// Process contains information to start a specific application inside the container. +type Process struct { + // Terminal creates an interactive terminal for the container. + Terminal bool `json:"terminal"` + // User specifies user information for the process. + User User `json:"user"` + // Args specifies the binary and arguments for the application to execute. + Args []string `json:"args"` + // Env populates the process environment for the process. + Env []string `json:"env"` + // Cwd is the current working directory for the process and must be + // relative to the container's root. + Cwd string `json:"cwd"` +} + +// Root contains information about the container's root filesystem on the host. +type Root struct { + // Path is the absolute path to the container's root filesystem. + Path string `json:"path"` + // Readonly makes the root filesystem for the container readonly before the process is executed. + Readonly bool `json:"readonly"` +} + +// Platform specifies OS and arch information for the host system that the container +// is created for. +type Platform struct { + // OS is the operating system. + OS string `json:"os"` + // Arch is the architecture + Arch string `json:"arch"` +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/specs/spec_linux.go b/Godeps/_workspace/src/github.com/opencontainers/specs/spec_linux.go new file mode 100644 index 00000000..207f5c88 --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/specs/spec_linux.go @@ -0,0 +1,148 @@ +// +build linux + +package specs + +// LinuxSpec is the full specification for linux containers. +type LinuxSpec struct { + Spec + // Linux is platform specific configuration for linux based containers. 
+ Linux Linux `json:"linux"` +} + +// Linux contains platform specific configuration for linux based containers. +type Linux struct { + // UidMapping specifies user mappings for supporting user namespaces on linux. + UidMappings []IDMapping `json:"uidMappings"` + // UidMapping specifies group mappings for supporting user namespaces on linux. + GidMappings []IDMapping `json:"gidMappings"` + // Rlimits specifies rlimit options to apply to the container's process. + Rlimits []Rlimit `json:"rlimits"` + // SystemProperties are a set of key value pairs that are set for the container on start. + SystemProperties map[string]string `json:"systemProperties"` + // Resources contain cgroup information for handling resource constraints + // for the container. + Resources Resources `json:"resources"` + // Namespaces contains the namespaces that are created and/or joined by the container. + Namespaces []Namespace `json:"namespaces"` + // Capabilities are linux capabilities that are kept for the container. + Capabilities []string `json:"capabilities"` + // Devices are a list of device nodes that are created and enabled for the container. + Devices []string `json:"devices"` +} + +// User specifies linux specific user and group information for the container's +// main process. +type User struct { + // Uid is the user id. + Uid int32 `json:"uid"` + // Gid is the group id. + Gid int32 `json:"gid"` + // AdditionalGids are additional group ids set the the container's process. + AdditionalGids []int32 `json:"additionalGids"` +} + +// Namespace is the configuration for a linux namespace. +type Namespace struct { + // Type is the type of linux namespace. + Type string `json:"type"` + // Path is a path to an existing namespace persisted on disk that can be joined + // and is of the same type. + Path string `json:"path"` +} + +// IDMapping specifies uid/gid mappings. +type IDMapping struct { + // From is the uid/gid of the host user or group. 
+ From int32 `json:"from"` + // To is the uid/gid of the container's user or group. + To int32 `json:"to"` + // Count is how many uid/gids to map after To. + Count int32 `json:"count"` +} + +// Rlimit type and restrictions. +type Rlimit struct { + // Type of the rlimit to set. + Type int `json:"type"` + // Hard is the hard limit for the specified type. + Hard uint64 `json:"hard"` + // Soft is the soft limit for the specified type. + Soft uint64 `json:"soft"` +} + +type HugepageLimit struct { + Pagesize string `json:"pageSize"` + Limit int `json:"limit"` +} + +type InterfacePriority struct { + // Name is the name of the network interface. + Name string `json:"name"` + // Priority for the interface. + Priority int64 `json:"priority"` +} + +type BlockIO struct { + // Specifies per cgroup weight, range is from 10 to 1000. + Weight int64 `json:"blkioWeight"` + // Weight per cgroup per device, can override BlkioWeight. + WeightDevice string `json:"blkioWeightDevice"` + // IO read rate limit per cgroup per device, bytes per second. + ThrottleReadBpsDevice string `json:"blkioThrottleReadBpsDevice"` + // IO write rate limit per cgroup per divice, bytes per second. + ThrottleWriteBpsDevice string `json:"blkioThrottleWriteBpsDevice"` + // IO read rate limit per cgroup per device, IO per second. + ThrottleReadIOpsDevice string `json:"blkioThrottleReadIopsDevice"` + // IO write rate limit per cgroup per device, IO per second. + ThrottleWriteIOpsDevice string `json:"blkioThrottleWriteIopsDevice"` +} + +type Memory struct { + // Memory limit (in bytes) + Limit int64 `json:"limit"` + // Memory reservation or soft_limit (in bytes) + Reservation int64 `json:"reservation"` + // Total memory usage (memory + swap); set `-1' to disable swap + Swap int64 `json:"swap"` + // Kernel memory limit (in bytes) + Kernel int64 `json:"kernel"` +} + +type CPU struct { + // CPU shares (relative weight vs. other cgroups with cpu shares). 
+ Shares int64 `json:"shares"` + // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + Quota int64 `json:"quota"` + // CPU period to be used for hardcapping (in usecs). 0 to use system default. + Period int64 `json:"period"` + // How many time CPU will use in realtime scheduling (in usecs). + RealtimeRuntime int64 `json:"realtimeRuntime"` + // CPU period to be used for realtime scheduling (in usecs). + RealtimePeriod int64 `json:"realtimePeriod"` + // CPU to use within the cpuset. + Cpus string `json:"cpus"` + // MEM to use within the cpuset. + Mems string `json:"mems"` +} + +type Network struct { + // Set class identifier for container's network packets. + ClassID string `json:"classId"` + // Set priority of network traffic for container. + Priorities []InterfacePriority `json:"priorities"` +} + +type Resources struct { + // DisableOOMKiller disables the OOM killer for out of memory conditions. + DisableOOMKiller bool `json:"disableOOMKiller"` + // Memory restriction configuration. + Memory Memory `json:"memory"` + // CPU resource restriction configuration. + CPU CPU `json:"cpu"` + // BlockIO restriction configuration. + BlockIO BlockIO `json:"blockIO"` + // Hugetlb limit (in bytes) + HugepageLimits []HugepageLimit `json:"hugepageLimits"` + // Network restriction configuration. + Network Network `json:"network"` +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/specs/version.go b/Godeps/_workspace/src/github.com/opencontainers/specs/version.go new file mode 100644 index 00000000..f6cb6a92 --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/specs/version.go @@ -0,0 +1,4 @@ +package specs + +// Version is the specification version that the package types support. 
+const Version = "pre-draft" diff --git a/README.md b/README.md index 9ec62397..94f31079 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,13 @@ to have a v1 of the spec out within a quick timeframe of a few weeks, ~July 2015 so the `runc` config format will be constantly changing until the spec is finalized. However, we encourage you to try out the tool and give feedback. +### OCF + +How does `runc` integrate with the Open Container Format? `runc` depends on the types +specified in the [specs](https://github.com/opencontainers/specs) repository. Whenever +the specification is updated and ready to be versioned, `runc` will update its dependency +on the specs repository and support the updated spec. + ### Building: ```bash @@ -42,14 +49,18 @@ user named `daemon` defined within that file-system. ```json { - "version": "0.1.1", + "version": "pre-draft", "platform": { "os": "linux", "arch": "amd64" }, "process": { "terminal": true, - "user": "daemon", + "user": { + "uid": 0, + "gid": 0, + "additionalGids": null + }, "args": [ "sh" ], @@ -103,21 +114,61 @@ user named `daemon` defined within that file-system. 
} ], "linux": { + "uidMappings": null, + "gidMappings": null, + "rlimits": null, + "systemProperties": null, + "resources": { + "disableOOMKiller": false, + "memory": { + "limit": 0, + "reservation": 0, + "swap": 0, + "kernel": 0 + }, + "cpu": { + "shares": 0, + "quota": 0, + "period": 0, + "realtimeRuntime": 0, + "realtimePeriod": 0, + "cpus": "", + "mems": "" + }, + "blockIO": { + "blkioWeight": 0, + "blkioWeightDevice": "", + "blkioThrottleReadBpsDevice": "", + "blkioThrottleWriteBpsDevice": "", + "blkioThrottleReadIopsDevice": "", + "blkioThrottleWriteIopsDevice": "" + }, + "hugepageLimits": null, + "network": { + "classId": "", + "priorities": null + } + }, "namespaces": [ { - "type": "process" + "type": "process", + "path": "" }, { - "type": "network" + "type": "network", + "path": "" }, { - "type": "mount" + "type": "ipc", + "path": "" }, { - "type": "ipc" + "type": "uts", + "path": "" }, { - "type": "uts" + "type": "mount", + "path": "" } ], "capabilities": [ diff --git a/libcontainer/apparmor/apparmor.go b/libcontainer/apparmor/apparmor.go index 18cedf6a..22c17f52 100644 --- a/libcontainer/apparmor/apparmor.go +++ b/libcontainer/apparmor/apparmor.go @@ -12,6 +12,7 @@ import ( "unsafe" ) +// IsEnabled returns true if apparmor is enabled for the host. func IsEnabled() bool { if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" { if _, err = os.Stat("/sbin/apparmor_parser"); err == nil { @@ -22,13 +23,14 @@ func IsEnabled() bool { return false } +// ApplyProfile will apply the profile with the specified name to the process after +// the next exec. 
func ApplyProfile(name string) error { if name == "" { return nil } cName := C.CString(name) defer C.free(unsafe.Pointer(cName)) - if _, err := C.aa_change_onexec(cName); err != nil { return err } diff --git a/libcontainer/apparmor/gen.go b/libcontainer/apparmor/gen.go deleted file mode 100644 index 653bf34d..00000000 --- a/libcontainer/apparmor/gen.go +++ /dev/null @@ -1,83 +0,0 @@ -// +build linux - -package apparmor - -import ( - "io" - "os" - "text/template" -) - -type data struct { - Name string - Imports []string - InnerImports []string -} - -const baseTemplate = ` -{{range $value := .Imports}} -{{$value}} -{{end}} - -profile {{.Name}} flags=(attach_disconnected,mediate_deleted) { -{{range $value := .InnerImports}} - {{$value}} -{{end}} - - network, - capability, - file, - umount, - - deny @{PROC}/sys/fs/** wklx, - deny @{PROC}/sysrq-trigger rwklx, - deny @{PROC}/mem rwklx, - deny @{PROC}/kmem rwklx, - deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx, - deny @{PROC}/sys/kernel/*/** wklx, - - deny mount, - - deny /sys/[^f]*/** wklx, - deny /sys/f[^s]*/** wklx, - deny /sys/fs/[^c]*/** wklx, - deny /sys/fs/c[^g]*/** wklx, - deny /sys/fs/cg[^r]*/** wklx, - deny /sys/firmware/efi/efivars/** rwklx, - deny /sys/kernel/security/** rwklx, -} -` - -func generateProfile(out io.Writer) error { - compiled, err := template.New("apparmor_profile").Parse(baseTemplate) - if err != nil { - return err - } - data := &data{ - Name: "docker-default", - } - if tunablesExists() { - data.Imports = append(data.Imports, "#include ") - } else { - data.Imports = append(data.Imports, "@{PROC}=/proc/") - } - if abstractionsExists() { - data.InnerImports = append(data.InnerImports, "#include ") - } - if err := compiled.Execute(out, data); err != nil { - return err - } - return nil -} - -// check if the tunables/global exist -func tunablesExists() bool { - _, err := os.Stat("/etc/apparmor.d/tunables/global") - return err == nil -} - -// check if abstractions/base exist -func abstractionsExists() 
bool { - _, err := os.Stat("/etc/apparmor.d/abstractions/base") - return err == nil -} diff --git a/libcontainer/apparmor/setup.go b/libcontainer/apparmor/setup.go deleted file mode 100644 index 2df21268..00000000 --- a/libcontainer/apparmor/setup.go +++ /dev/null @@ -1,46 +0,0 @@ -// +build linux - -package apparmor - -import ( - "fmt" - "os" - "os/exec" - "path" -) - -const ( - DefaultProfilePath = "/etc/apparmor.d/docker" -) - -func InstallDefaultProfile() error { - if !IsEnabled() { - return nil - } - - // Make sure /etc/apparmor.d exists - if err := os.MkdirAll(path.Dir(DefaultProfilePath), 0755); err != nil { - return err - } - - f, err := os.OpenFile(DefaultProfilePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) - if err != nil { - return err - } - if err := generateProfile(f); err != nil { - f.Close() - return err - } - f.Close() - - cmd := exec.Command("/sbin/apparmor_parser", "-r", "-W", "docker") - // to use the parser directly we have to make sure we are in the correct - // dir with the profile - cmd.Dir = "/etc/apparmor.d" - - output, err := cmd.CombinedOutput() - if err != nil { - return fmt.Errorf("Error loading docker apparmor profile: %s (%s)", err, output) - } - return nil -} diff --git a/main.go b/main.go index 72958a76..baf7a0dc 100644 --- a/main.go +++ b/main.go @@ -8,7 +8,7 @@ import ( ) const ( - version = "0.1.1" + version = "0.2" usage = `Open Container Project runtime runc is a command line client for running applications packaged according to the Open Container Format (OCF) and is diff --git a/main_unsupported.go b/main_unsupported.go index 7cb1e9ab..837324c6 100644 --- a/main_unsupported.go +++ b/main_unsupported.go @@ -15,6 +15,7 @@ var ( checkpointCommand cli.Command eventsCommand cli.Command restoreCommand cli.Command + specCommand cli.Command ) func runAction(*cli.Context) { diff --git a/restore.go b/restore.go index 8170bfca..10ad7911 100644 --- a/restore.go +++ b/restore.go @@ -12,6 +12,7 @@ import ( 
"github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/specs" ) var restoreCommand = cli.Command{ @@ -46,7 +47,7 @@ var restoreCommand = cli.Command{ }, } -func restoreContainer(context *cli.Context, spec *Spec, config *configs.Config, imagePath string) (code int, err error) { +func restoreContainer(context *cli.Context, spec *specs.LinuxSpec, config *configs.Config, imagePath string) (code int, err error) { rootuid := 0 factory, err := loadFactory(context) if err != nil { diff --git a/run.go b/run.go index 80d2b748..07012ee8 100644 --- a/run.go +++ b/run.go @@ -10,6 +10,7 @@ import ( "github.com/Sirupsen/logrus" "github.com/codegangsta/cli" "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/specs" ) func init() { @@ -24,7 +25,7 @@ func init() { } } -func execContainer(context *cli.Context, spec *Spec) (int, error) { +func execContainer(context *cli.Context, spec *specs.LinuxSpec) (int, error) { config, err := createLibcontainerConfig(spec) if err != nil { return -1, err diff --git a/spec.go b/spec.go index 42f80c89..4a31f02d 100644 --- a/spec.go +++ b/spec.go @@ -1,112 +1,120 @@ +// +build linux + package main import ( "encoding/json" "fmt" + "os" + "path/filepath" "runtime" + "strings" + "syscall" "github.com/Sirupsen/logrus" "github.com/codegangsta/cli" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/specs" ) -const cpuQuotaMultiplyer = 100000 - -type Mount struct { - Type string `json:"type"` - Source string `json:"source"` - Destination string `json:"destination"` - Options string `json:"options"` -} - -type Process struct { - Terminal bool `json:"terminal"` - User string `json:"user"` - Args []string `json:"args"` - Env []string `json:"env"` - Cwd string 
`json:"cwd"` -} - -type Root struct { - Path string `json:"path"` - Readonly bool `json:"readonly"` -} - -type Platform struct { - OS string `json:"os"` - Arch string `json:"arch"` -} - -type PortableSpec struct { - Version string `json:"version"` - Platform Platform `json:"platform"` - Process Process `json:"process"` - Root Root `json:"root"` - Hostname string `json:"hostname"` - Mounts []Mount `json:"mounts"` -} - var specCommand = cli.Command{ Name: "spec", Usage: "create a new specification file", Action: func(context *cli.Context) { - spec := PortableSpec{ - Version: version, - Platform: Platform{ - OS: runtime.GOOS, - Arch: runtime.GOARCH, - }, - Root: Root{ - Path: "rootfs", - Readonly: true, - }, - Process: Process{ - Terminal: true, - User: "daemon", - Args: []string{ - "sh", + spec := specs.LinuxSpec{ + Spec: specs.Spec{ + Version: specs.Version, + Platform: specs.Platform{ + OS: runtime.GOOS, + Arch: runtime.GOARCH, }, - Env: []string{ - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm", + Root: specs.Root{ + Path: "rootfs", + Readonly: true, + }, + Process: specs.Process{ + Terminal: true, + User: specs.User{}, + Args: []string{ + "sh", + }, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + }, + Hostname: "shell", + Mounts: []specs.Mount{ + { + Type: "proc", + Source: "proc", + Destination: "/proc", + Options: "", + }, + { + Type: "tmpfs", + Source: "tmpfs", + Destination: "/dev", + Options: "nosuid,strictatime,mode=755,size=65536k", + }, + { + Type: "devpts", + Source: "devpts", + Destination: "/dev/pts", + Options: "nosuid,noexec,newinstance,ptmxmode=0666,mode=0620,gid=5", + }, + { + Type: "tmpfs", + Source: "shm", + Destination: "/dev/shm", + Options: "nosuid,noexec,nodev,mode=1777,size=65536k", + }, + { + Type: "mqueue", + Source: "mqueue", + Destination: "/dev/mqueue", + Options: "nosuid,noexec,nodev", + }, + { + Type: "sysfs", + Source: "sysfs", + 
Destination: "/sys", + Options: "nosuid,noexec,nodev", + }, }, }, - Hostname: "shell", - Mounts: []Mount{ - { - Type: "proc", - Source: "proc", - Destination: "/proc", - Options: "", + Linux: specs.Linux{ + Namespaces: []specs.Namespace{ + { + Type: "process", + }, + { + Type: "network", + }, + { + Type: "ipc", + }, + { + Type: "uts", + }, + { + Type: "mount", + }, }, - { - Type: "tmpfs", - Source: "tmpfs", - Destination: "/dev", - Options: "nosuid,strictatime,mode=755,size=65536k", + Capabilities: []string{ + "AUDIT_WRITE", + "KILL", + "NET_BIND_SERVICE", }, - { - Type: "devpts", - Source: "devpts", - Destination: "/dev/pts", - Options: "nosuid,noexec,newinstance,ptmxmode=0666,mode=0620,gid=5", - }, - { - Type: "tmpfs", - Source: "shm", - Destination: "/dev/shm", - Options: "nosuid,noexec,nodev,mode=1777,size=65536k", - }, - { - Type: "mqueue", - Source: "mqueue", - Destination: "/dev/mqueue", - Options: "nosuid,noexec,nodev", - }, - { - Type: "sysfs", - Source: "sysfs", - Destination: "/sys", - Options: "nosuid,noexec,nodev", + Devices: []string{ + "null", + "random", + "full", + "tty", + "zero", + "urandom", }, }, } @@ -117,3 +125,266 @@ var specCommand = cli.Command{ fmt.Printf("%s", data) }, } + +var namespaceMapping = map[string]configs.NamespaceType{ + "process": configs.NEWPID, + "network": configs.NEWNET, + "mount": configs.NEWNS, + "user": configs.NEWUSER, + "ipc": configs.NEWIPC, + "uts": configs.NEWUTS, +} + +// loadSpec loads the specification from the provided path. 
+// If the path is empty then the default path will be "config.json" +func loadSpec(path string) (*specs.LinuxSpec, error) { + if path == "" { + path = "config.json" + } + f, err := os.Open(path) + if err != nil { + if os.IsNotExist(err) { + return nil, fmt.Errorf("JSON specification file for %s not found", path) + } + return nil, err + } + defer f.Close() + var s *specs.LinuxSpec + if err := json.NewDecoder(f).Decode(&s); err != nil { + return nil, err + } + return s, checkSpecVersion(s) +} + +// checkSpecVersion makes sure that the spec version matches runc's while we are in the initial +// development period. It is better to hard fail than have missing fields or options in the spec. +func checkSpecVersion(s *specs.LinuxSpec) error { + if s.Version != specs.Version { + return fmt.Errorf("spec version is not compatible with implemented version %q: spec %q", specs.Version, s.Version) + } + return nil +} + +func createLibcontainerConfig(spec *specs.LinuxSpec) (*configs.Config, error) { + cwd, err := os.Getwd() + if err != nil { + return nil, err + } + rootfsPath := spec.Root.Path + if !filepath.IsAbs(rootfsPath) { + rootfsPath = filepath.Join(cwd, rootfsPath) + } + config := &configs.Config{ + Rootfs: rootfsPath, + Capabilities: spec.Linux.Capabilities, + Readonlyfs: spec.Root.Readonly, + Hostname: spec.Hostname, + Privatefs: true, + } + for _, ns := range spec.Linux.Namespaces { + t, exists := namespaceMapping[ns.Type] + if !exists { + return nil, fmt.Errorf("namespace %q does not exist", ns) + } + config.Namespaces.Add(t, ns.Path) + } + for _, m := range spec.Mounts { + config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m)) + } + if err := createDevices(spec, config); err != nil { + return nil, err + } + if err := setupUserNamespace(spec, config); err != nil { + return nil, err + } + c, err := createCgroupConfig(spec, config.Devices) + if err != nil { + return nil, err + } + config.Cgroups = c + if config.Readonlyfs { + setReadonly(config) + 
config.MaskPaths = []string{ + "/proc/kcore", + } + config.ReadonlyPaths = []string{ + "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", + } + } + return config, nil +} + +func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount { + flags, data := parseMountOptions(m.Options) + source := m.Source + if m.Type == "bind" { + if !filepath.IsAbs(source) { + source = filepath.Join(cwd, m.Source) + } + } + return &configs.Mount{ + Device: m.Type, + Source: source, + Destination: m.Destination, + Data: data, + Flags: flags, + } +} + +func createCgroupConfig(spec *specs.LinuxSpec, devices []*configs.Device) (*configs.Cgroup, error) { + myCgroupPath, err := cgroups.GetThisCgroupDir("devices") + if err != nil { + return nil, err + } + c := &configs.Cgroup{ + Name: getDefaultID(), + Parent: myCgroupPath, + AllowedDevices: append(devices, allowedDevices...), + } + r := spec.Linux.Resources + c.MemoryReservation = r.Memory.Reservation + c.MemorySwap = r.Memory.Swap + c.KernelMemory = r.Memory.Kernel + c.CpuShares = r.CPU.Shares + c.CpuQuota = r.CPU.Quota + c.CpuPeriod = r.CPU.Period + c.CpuRtRuntime = r.CPU.RealtimeRuntime + c.CpuRtPeriod = r.CPU.RealtimePeriod + c.CpusetCpus = r.CPU.Cpus + c.CpusetMems = r.CPU.Mems + c.BlkioThrottleReadBpsDevice = r.BlockIO.ThrottleReadBpsDevice + c.BlkioThrottleWriteBpsDevice = r.BlockIO.ThrottleWriteBpsDevice + c.BlkioThrottleReadIOpsDevice = r.BlockIO.ThrottleReadIOpsDevice + c.BlkioThrottleWriteIOpsDevice = r.BlockIO.ThrottleWriteIOpsDevice + c.BlkioWeight = r.BlockIO.Weight + c.BlkioWeightDevice = r.BlockIO.WeightDevice + for _, l := range r.HugepageLimits { + c.HugetlbLimit = append(c.HugetlbLimit, &configs.HugepageLimit{ + Pagesize: l.Pagesize, + Limit: l.Limit, + }) + } + c.OomKillDisable = r.DisableOOMKiller + c.NetClsClassid = r.Network.ClassID + for _, m := range r.Network.Priorities { + c.NetPrioIfpriomap = append(c.NetPrioIfpriomap, &configs.IfPrioMap{ + Interface: m.Name, + Priority: m.Priority, + }) + } 
+ return c, nil +} + +func createDevices(spec *specs.LinuxSpec, config *configs.Config) error { + for _, name := range spec.Linux.Devices { + d, err := devices.DeviceFromPath(filepath.Join("/dev", name), "rwm") + if err != nil { + return err + } + config.Devices = append(config.Devices, d) + } + return nil +} + +func setReadonly(config *configs.Config) { + for _, m := range config.Mounts { + if m.Device == "sysfs" { + m.Flags |= syscall.MS_RDONLY + } + } +} + +func setupUserNamespace(spec *specs.LinuxSpec, config *configs.Config) error { + if len(spec.Linux.UidMappings) == 0 { + return nil + } + config.Namespaces.Add(configs.NEWUSER, "") + create := func(m specs.IDMapping) configs.IDMap { + return configs.IDMap{ + ContainerID: int(m.From), + HostID: int(m.To), + Size: int(m.Count), + } + } + for _, m := range spec.Linux.UidMappings { + config.UidMappings = append(config.UidMappings, create(m)) + } + for _, m := range spec.Linux.GidMappings { + config.GidMappings = append(config.GidMappings, create(m)) + } + rootUid, err := config.HostUID() + if err != nil { + return err + } + rootGid, err := config.HostGID() + if err != nil { + return err + } + for _, node := range config.Devices { + node.Uid = uint32(rootUid) + node.Gid = uint32(rootGid) + } + return nil +} + +// parseMountOptions parses the string and returns the flags and any mount data that +// it contains. 
+func parseMountOptions(options string) (int, string) { + var ( + flag int + data []string + ) + flags := map[string]struct { + clear bool + flag int + }{ + "defaults": {false, 0}, + "ro": {false, syscall.MS_RDONLY}, + "rw": {true, syscall.MS_RDONLY}, + "suid": {true, syscall.MS_NOSUID}, + "nosuid": {false, syscall.MS_NOSUID}, + "dev": {true, syscall.MS_NODEV}, + "nodev": {false, syscall.MS_NODEV}, + "exec": {true, syscall.MS_NOEXEC}, + "noexec": {false, syscall.MS_NOEXEC}, + "sync": {false, syscall.MS_SYNCHRONOUS}, + "async": {true, syscall.MS_SYNCHRONOUS}, + "dirsync": {false, syscall.MS_DIRSYNC}, + "remount": {false, syscall.MS_REMOUNT}, + "mand": {false, syscall.MS_MANDLOCK}, + "nomand": {true, syscall.MS_MANDLOCK}, + "atime": {true, syscall.MS_NOATIME}, + "noatime": {false, syscall.MS_NOATIME}, + "diratime": {true, syscall.MS_NODIRATIME}, + "nodiratime": {false, syscall.MS_NODIRATIME}, + "bind": {false, syscall.MS_BIND}, + "rbind": {false, syscall.MS_BIND | syscall.MS_REC}, + "unbindable": {false, syscall.MS_UNBINDABLE}, + "runbindable": {false, syscall.MS_UNBINDABLE | syscall.MS_REC}, + "private": {false, syscall.MS_PRIVATE}, + "rprivate": {false, syscall.MS_PRIVATE | syscall.MS_REC}, + "shared": {false, syscall.MS_SHARED}, + "rshared": {false, syscall.MS_SHARED | syscall.MS_REC}, + "slave": {false, syscall.MS_SLAVE}, + "rslave": {false, syscall.MS_SLAVE | syscall.MS_REC}, + "relatime": {false, syscall.MS_RELATIME}, + "norelatime": {true, syscall.MS_RELATIME}, + "strictatime": {false, syscall.MS_STRICTATIME}, + "nostrictatime": {true, syscall.MS_STRICTATIME}, + } + for _, o := range strings.Split(options, ",") { + // If the option does not exist in the flags table or the flag + // is not supported on the platform, + // then it is a data value for a specific fs type + if f, exists := flags[o]; exists && f.flag != 0 { + if f.clear { + flag &= ^f.flag + } else { + flag |= f.flag + } + } else { + data = append(data, o) + } + } + return flag, strings.Join(data, 
",") +} diff --git a/spec_linux.go b/spec_linux.go deleted file mode 100644 index 6725e9c4..00000000 --- a/spec_linux.go +++ /dev/null @@ -1,370 +0,0 @@ -// +build linux - -package main - -import ( - "encoding/json" - "fmt" - "os" - "path/filepath" - "strings" - "syscall" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/devices" -) - -type Spec struct { - PortableSpec - Linux Linux `json:"linux"` -} - -type Linux struct { - UserMapping map[string]UserMapping `json:"userMapping"` - Rlimits []Rlimit `json:"rlimits"` - SystemProperties map[string]string `json:"systemProperties"` - Resources *Resources `json:"resources"` - Namespaces []Namespace `json:"namespaces"` - Capabilities []string `json:"capabilities"` - Devices []string `json:"devices"` -} - -type Namespace struct { - Type string `json:"type"` - Path string `json:"path"` -} - -type UserMapping struct { - From int `json:"from"` - To int `json:"to"` - Count int `json:"count"` -} - -type Rlimit struct { - Type int `json:"type"` - Hard uint64 `json:"hard"` - Soft uint64 `json:"soft"` -} - -type HugepageLimit struct { - Pagesize string `json:"pageSize"` - Limit int `json:"limit"` -} - -type IfPrioMap struct { - Interface string `json:"interface"` - Priority int64 `json:"priority"` -} - -type Resources struct { - // Memory limit (in bytes) - MemoryLimit int64 `json:"memoryLimit"` - // Memory reservation or soft_limit (in bytes) - MemoryReservation int64 `json:"memoryReservation"` - // Total memory usage (memory + swap); set `-1' to disable swap - MemorySwap int64 `json:"memorySwap"` - // Kernel memory limit (in bytes) - KernelMemory int64 `json:"kernelMemory"` - // CPU shares (relative weight vs. other containers) - CpuShares int64 `json:"cpuShares"` - // CPU hardcap limit (in usecs). Allowed cpu time in a given period. 
- CpuQuota int64 `json:"cpuQuota"` - // CPU period to be used for hardcapping (in usecs). 0 to use system default. - CpuPeriod int64 `json:"cpuPeriod"` - // How many time CPU will use in realtime scheduling (in usecs). - CpuRtRuntime int64 `json:"cpuQuota"` - // CPU period to be used for realtime scheduling (in usecs). - CpuRtPeriod int64 `json:"cpuPeriod"` - // CPU to use - CpusetCpus string `json:"cpusetCpus"` - // MEM to use - CpusetMems string `json:"cpusetMems"` - // IO read rate limit per cgroup per device, bytes per second. - BlkioThrottleReadBpsDevice string `json:"blkioThrottleReadBpsDevice"` - // IO write rate limit per cgroup per divice, bytes per second. - BlkioThrottleWriteBpsDevice string `json:"blkioThrottleWriteBpsDevice"` - // IO read rate limit per cgroup per device, IO per second. - BlkioThrottleReadIOpsDevice string `json:"blkioThrottleReadIopsDevice"` - // IO write rate limit per cgroup per device, IO per second. - BlkioThrottleWriteIOpsDevice string `json:"blkioThrottleWriteIopsDevice"` - // Specifies per cgroup weight, range is from 10 to 1000. - BlkioWeight int64 `json:"blkioWeight"` - // Weight per cgroup per device, can override BlkioWeight. - BlkioWeightDevice string `json:"blkioWeightDevice"` - // Hugetlb limit (in bytes) - HugetlbLimit []*HugepageLimit `json:"hugetlbLimit"` - // Whether to disable OOM Killer - DisableOOMKiller bool `json:"disableOOMKiller"` - // Set priority of network traffic for container - NetPrioIfpriomap []*IfPrioMap `json:"netPrioIfpriomap"` - // Set class identifier for container's network packets - NetClsClassid string `json:"netClsClassid"` -} - -var namespaceMapping = map[string]configs.NamespaceType{ - "process": configs.NEWPID, - "network": configs.NEWNET, - "mount": configs.NEWNS, - "user": configs.NEWUSER, - "ipc": configs.NEWIPC, - "uts": configs.NEWUTS, -} - -// loadSpec loads the specification from the provided path. 
-// If the path is empty then the default path will be "config.json" -func loadSpec(path string) (*Spec, error) { - if path == "" { - path = "config.json" - } - f, err := os.Open(path) - if err != nil { - if os.IsNotExist(err) { - return nil, fmt.Errorf("JSON specification file for %s not found", path) - } - return nil, err - } - defer f.Close() - var s *Spec - if err := json.NewDecoder(f).Decode(&s); err != nil { - return nil, err - } - return s, checkSpecVersion(s) -} - -// checkSpecVersion makes sure that the spec version matches runc's while we are in the initial -// development period. It is better to hard fail than have missing fields or options in the spec. -func checkSpecVersion(s *Spec) error { - if s.Version != version { - return fmt.Errorf("spec version is not compatible with runc version %q: spec %q", version, s.Version) - } - return nil -} - -func createLibcontainerConfig(spec *Spec) (*configs.Config, error) { - cwd, err := os.Getwd() - if err != nil { - return nil, err - } - rootfsPath := spec.Root.Path - if !filepath.IsAbs(rootfsPath) { - rootfsPath = filepath.Join(cwd, rootfsPath) - } - config := &configs.Config{ - Rootfs: rootfsPath, - Capabilities: spec.Linux.Capabilities, - Readonlyfs: spec.Root.Readonly, - Hostname: spec.Hostname, - Privatefs: true, - } - for _, ns := range spec.Linux.Namespaces { - t, exists := namespaceMapping[ns.Type] - if !exists { - return nil, fmt.Errorf("namespace %q does not exist", ns) - } - config.Namespaces.Add(t, ns.Path) - } - for _, m := range spec.Mounts { - config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m)) - } - if err := createDevices(spec, config); err != nil { - return nil, err - } - if err := setupUserNamespace(spec, config); err != nil { - return nil, err - } - c, err := createCgroupConfig(spec, config.Devices) - if err != nil { - return nil, err - } - config.Cgroups = c - if config.Readonlyfs { - setReadonly(config) - config.MaskPaths = []string{ - "/proc/kcore", - } - 
config.ReadonlyPaths = []string{ - "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", - } - } - return config, nil -} - -func createLibcontainerMount(cwd string, m Mount) *configs.Mount { - flags, data := parseMountOptions(m.Options) - source := m.Source - if m.Type == "bind" { - if !filepath.IsAbs(source) { - source = filepath.Join(cwd, m.Source) - } - } - return &configs.Mount{ - Device: m.Type, - Source: source, - Destination: m.Destination, - Data: data, - Flags: flags, - } -} - -func createCgroupConfig(spec *Spec, devices []*configs.Device) (*configs.Cgroup, error) { - myCgroupPath, err := cgroups.GetThisCgroupDir("devices") - if err != nil { - return nil, err - } - c := &configs.Cgroup{ - Name: getDefaultID(), - Parent: myCgroupPath, - AllowedDevices: append(devices, allowedDevices...), - MemorySwap: -1, - MemorySwappiness: -1, - } - if r := spec.Linux.Resources; r != nil { - c.MemoryReservation = r.MemoryReservation - c.MemorySwap = r.MemorySwap - c.KernelMemory = r.KernelMemory - c.CpuShares = r.CpuShares - c.CpuQuota = r.CpuQuota - c.CpuPeriod = r.CpuPeriod - c.CpuRtRuntime = r.CpuRtRuntime - c.CpuRtPeriod = r.CpuRtPeriod - c.CpusetCpus = r.CpusetCpus - c.CpusetMems = r.CpusetMems - c.BlkioThrottleReadBpsDevice = r.BlkioThrottleReadBpsDevice - c.BlkioThrottleWriteBpsDevice = r.BlkioThrottleWriteBpsDevice - c.BlkioThrottleReadIOpsDevice = r.BlkioThrottleReadIOpsDevice - c.BlkioThrottleWriteIOpsDevice = r.BlkioThrottleWriteIOpsDevice - c.BlkioWeight = r.BlkioWeight - c.BlkioWeightDevice = r.BlkioWeightDevice - for _, l := range r.HugetlbLimit { - c.HugetlbLimit = append(c.HugetlbLimit, &configs.HugepageLimit{ - Pagesize: l.Pagesize, - Limit: l.Limit, - }) - } - c.OomKillDisable = r.DisableOOMKiller - for _, m := range r.NetPrioIfpriomap { - c.NetPrioIfpriomap = append(c.NetPrioIfpriomap, &configs.IfPrioMap{ - Interface: m.Interface, - Priority: m.Priority, - }) - } - c.NetClsClassid = r.NetClsClassid - } - return c, nil -} - -func 
createDevices(spec *Spec, config *configs.Config) error { - for _, name := range spec.Linux.Devices { - d, err := devices.DeviceFromPath(filepath.Join("/dev", name), "rwm") - if err != nil { - return err - } - config.Devices = append(config.Devices, d) - } - return nil -} - -func setReadonly(config *configs.Config) { - for _, m := range config.Mounts { - if m.Device == "sysfs" { - m.Flags |= syscall.MS_RDONLY - } - } -} - -func getCPUQuota(cpus float64) int64 { - return int64(cpus * cpuQuotaMultiplyer) -} - -func setupUserNamespace(spec *Spec, config *configs.Config) error { - if len(spec.Linux.UserMapping) == 0 { - return nil - } - config.Namespaces.Add(configs.NEWUSER, "") - mappings := make(map[string][]configs.IDMap) - for k, v := range spec.Linux.UserMapping { - mappings[k] = append(mappings[k], configs.IDMap{ - ContainerID: v.From, - HostID: v.To, - Size: v.Count, - }) - } - config.UidMappings = mappings["uid"] - config.GidMappings = mappings["gid"] - rootUid, err := config.HostUID() - if err != nil { - return err - } - rootGid, err := config.HostGID() - if err != nil { - return err - } - for _, node := range config.Devices { - node.Uid = uint32(rootUid) - node.Gid = uint32(rootGid) - } - return nil -} - -// parseMountOptions parses the string and returns the flags and any mount data that -// it contains. 
-func parseMountOptions(options string) (int, string) { - var ( - flag int - data []string - ) - flags := map[string]struct { - clear bool - flag int - }{ - "defaults": {false, 0}, - "ro": {false, syscall.MS_RDONLY}, - "rw": {true, syscall.MS_RDONLY}, - "suid": {true, syscall.MS_NOSUID}, - "nosuid": {false, syscall.MS_NOSUID}, - "dev": {true, syscall.MS_NODEV}, - "nodev": {false, syscall.MS_NODEV}, - "exec": {true, syscall.MS_NOEXEC}, - "noexec": {false, syscall.MS_NOEXEC}, - "sync": {false, syscall.MS_SYNCHRONOUS}, - "async": {true, syscall.MS_SYNCHRONOUS}, - "dirsync": {false, syscall.MS_DIRSYNC}, - "remount": {false, syscall.MS_REMOUNT}, - "mand": {false, syscall.MS_MANDLOCK}, - "nomand": {true, syscall.MS_MANDLOCK}, - "atime": {true, syscall.MS_NOATIME}, - "noatime": {false, syscall.MS_NOATIME}, - "diratime": {true, syscall.MS_NODIRATIME}, - "nodiratime": {false, syscall.MS_NODIRATIME}, - "bind": {false, syscall.MS_BIND}, - "rbind": {false, syscall.MS_BIND | syscall.MS_REC}, - "unbindable": {false, syscall.MS_UNBINDABLE}, - "runbindable": {false, syscall.MS_UNBINDABLE | syscall.MS_REC}, - "private": {false, syscall.MS_PRIVATE}, - "rprivate": {false, syscall.MS_PRIVATE | syscall.MS_REC}, - "shared": {false, syscall.MS_SHARED}, - "rshared": {false, syscall.MS_SHARED | syscall.MS_REC}, - "slave": {false, syscall.MS_SLAVE}, - "rslave": {false, syscall.MS_SLAVE | syscall.MS_REC}, - "relatime": {false, syscall.MS_RELATIME}, - "norelatime": {true, syscall.MS_RELATIME}, - "strictatime": {false, syscall.MS_STRICTATIME}, - "nostrictatime": {true, syscall.MS_STRICTATIME}, - } - for _, o := range strings.Split(options, ",") { - // If the option does not exist in the flags table or the flag - // is not supported on the platform, - // then it is a data value for a specific fs type - if f, exists := flags[o]; exists && f.flag != 0 { - if f.clear { - flag &= ^f.flag - } else { - flag |= f.flag - } - } else { - data = append(data, o) - } - } - return flag, strings.Join(data, 
",") -} diff --git a/utils.go b/utils.go index 7153c4d4..5c35edc6 100644 --- a/utils.go +++ b/utils.go @@ -10,6 +10,7 @@ import ( "github.com/codegangsta/cli" "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/specs" ) const wildcard = -1 @@ -162,11 +163,12 @@ func getDefaultImagePath(context *cli.Context) string { // newProcess returns a new libcontainer Process with the arguments from the // spec and stdio from the current process. -func newProcess(p Process) *libcontainer.Process { +func newProcess(p specs.Process) *libcontainer.Process { return &libcontainer.Process{ - Args: p.Args, - Env: p.Env, - User: p.User, + Args: p.Args, + Env: p.Env, + // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. + User: fmt.Sprintf("%d:%d", p.User.Uid, p.User.Gid), Cwd: p.Cwd, Stdin: os.Stdin, Stdout: os.Stdout,