runc/spec.go

844 lines
22 KiB
Go

// +build linux
package main
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"syscall"
"github.com/codegangsta/cli"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/seccomp"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/specs/specs-go"
)
var specCommand = cli.Command{
Name: "spec",
Usage: "create a new specification file",
ArgsUsage: "",
Description: `The spec command creates the new specification file named "` + specConfig + `" for
the bundle.
The spec generated is just a starter file. Editing of the spec is required to
achieve desired results. For example, the newly generated spec includes an args
parameter that is initially set to call the "sh" command when the container is
started. Calling "sh" may work for an ubuntu container or busybox, but will not
work for containers that do not include the "sh" program.
EXAMPLE:
To run docker's hello-world container one needs to set the args parameter
in the spec to call hello. This can be done using the sed command or a text
editor. The following commands create a bundle for hello-world, change the
default args parameter in the spec from "sh" to "/hello", then run the hello
command in a new hello-world container named container1:
mkdir hello
cd hello
docker pull hello-world
docker export $(docker create hello-world) > hello-world.tar
mkdir rootfs
tar -C rootfs -xf hello-world.tar
runc spec
sed -i 's;"sh";"/hello";' ` + specConfig + `
runc start container1
In the start command above, "container1" is the name for the instance of the
container that you are starting. The name you provide for the container instance
must be unique on your host.
When starting a container through runc, runc needs root privilege. If not
already running as root, you can use sudo to give runc root privilege. For
example: "sudo runc start container1" will give runc root privilege to start the
container on your host.`,
Flags: []cli.Flag{
cli.StringFlag{
Name: "bundle, b",
Value: "",
Usage: "path to the root of the bundle directory",
},
},
Action: func(context *cli.Context) {
spec := specs.Spec{
Version: specs.Version,
Platform: specs.Platform{
OS: runtime.GOOS,
Arch: runtime.GOARCH,
},
Root: specs.Root{
Path: "rootfs",
Readonly: true,
},
Process: specs.Process{
Terminal: true,
User: specs.User{},
Args: []string{
"sh",
},
Env: []string{
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm",
},
Cwd: "/",
NoNewPrivileges: true,
Capabilities: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Rlimits: []specs.Rlimit{
{
Type: "RLIMIT_NOFILE",
Hard: uint64(1024),
Soft: uint64(1024),
},
},
},
Hostname: "runc",
Mounts: []specs.Mount{
{
Destination: "/proc",
Type: "proc",
Source: "proc",
Options: nil,
},
{
Destination: "/dev",
Type: "tmpfs",
Source: "tmpfs",
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
},
{
Destination: "/dev/pts",
Type: "devpts",
Source: "devpts",
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
},
{
Destination: "/dev/shm",
Type: "tmpfs",
Source: "shm",
Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
},
{
Destination: "/dev/mqueue",
Type: "mqueue",
Source: "mqueue",
Options: []string{"nosuid", "noexec", "nodev"},
},
{
Destination: "/sys",
Type: "sysfs",
Source: "sysfs",
Options: []string{"nosuid", "noexec", "nodev", "ro"},
},
{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "cgroup",
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
},
},
Linux: specs.Linux{
Resources: &specs.Resources{
Devices: []specs.DeviceCgroup{
{
Allow: false,
Access: sPtr("rwm"),
},
},
},
Namespaces: []specs.Namespace{
{
Type: "pid",
},
{
Type: "network",
},
{
Type: "ipc",
},
{
Type: "uts",
},
{
Type: "mount",
},
},
},
}
checkNoFile := func(name string) error {
_, err := os.Stat(name)
if err == nil {
return fmt.Errorf("File %s exists. Remove it first", name)
}
if !os.IsNotExist(err) {
return err
}
return nil
}
bundle := context.String("bundle")
if bundle != "" {
if err := os.Chdir(bundle); err != nil {
fatal(err)
}
}
if err := checkNoFile(specConfig); err != nil {
fatal(err)
}
data, err := json.MarshalIndent(&spec, "", "\t")
if err != nil {
fatal(err)
}
if err := ioutil.WriteFile(specConfig, data, 0666); err != nil {
fatal(err)
}
},
}
func sPtr(s string) *string { return &s }
func rPtr(r rune) *rune { return &r }
func iPtr(i int64) *int64 { return &i }
func u32Ptr(i int64) *uint32 { u := uint32(i); return &u }
func fmPtr(i int64) *os.FileMode { fm := os.FileMode(i); return &fm }
var namespaceMapping = map[specs.NamespaceType]configs.NamespaceType{
specs.PIDNamespace: configs.NEWPID,
specs.NetworkNamespace: configs.NEWNET,
specs.MountNamespace: configs.NEWNS,
specs.UserNamespace: configs.NEWUSER,
specs.IPCNamespace: configs.NEWIPC,
specs.UTSNamespace: configs.NEWUTS,
}
var mountPropagationMapping = map[string]int{
"rprivate": syscall.MS_PRIVATE | syscall.MS_REC,
"private": syscall.MS_PRIVATE,
"rslave": syscall.MS_SLAVE | syscall.MS_REC,
"slave": syscall.MS_SLAVE,
"rshared": syscall.MS_SHARED | syscall.MS_REC,
"shared": syscall.MS_SHARED,
"": syscall.MS_PRIVATE | syscall.MS_REC,
}
// loadSpec loads the specification from the provided path.
// If the path is empty then the default path will be "config.json"
func loadSpec(cPath string) (spec *specs.Spec, err error) {
cf, err := os.Open(cPath)
if err != nil {
if os.IsNotExist(err) {
return nil, fmt.Errorf("JSON specification file %s not found", cPath)
}
return nil, err
}
defer cf.Close()
if err = json.NewDecoder(cf).Decode(&spec); err != nil {
return nil, err
}
return spec, validateProcessSpec(&spec.Process)
}
func createLibcontainerConfig(cgroupName string, useSystemdCgroup bool, spec *specs.Spec) (*configs.Config, error) {
// runc's cwd will always be the bundle path
rcwd, err := os.Getwd()
if err != nil {
return nil, err
}
cwd, err := filepath.Abs(rcwd)
if err != nil {
return nil, err
}
rootfsPath := spec.Root.Path
if !filepath.IsAbs(rootfsPath) {
rootfsPath = filepath.Join(cwd, rootfsPath)
}
config := &configs.Config{
Rootfs: rootfsPath,
Readonlyfs: spec.Root.Readonly,
Hostname: spec.Hostname,
Labels: []string{
"bundle=" + cwd,
},
}
exists := false
if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists {
return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation)
}
for _, ns := range spec.Linux.Namespaces {
t, exists := namespaceMapping[ns.Type]
if !exists {
return nil, fmt.Errorf("namespace %q does not exist", ns)
}
config.Namespaces.Add(t, ns.Path)
}
if config.Namespaces.Contains(configs.NEWNET) {
config.Networks = []*configs.Network{
{
Type: "loopback",
},
}
}
for _, m := range spec.Mounts {
config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
}
if err := createDevices(spec, config); err != nil {
return nil, err
}
if err := setupUserNamespace(spec, config); err != nil {
return nil, err
}
c, err := createCgroupConfig(cgroupName, useSystemdCgroup, spec)
if err != nil {
return nil, err
}
config.Cgroups = c
// set extra path masking for libcontainer for the various unsafe places in proc
config.MaskPaths = maskedPaths
config.ReadonlyPaths = readonlyPaths
if spec.Linux.Seccomp != nil {
seccomp, err := setupSeccomp(spec.Linux.Seccomp)
if err != nil {
return nil, err
}
config.Seccomp = seccomp
}
config.Sysctl = spec.Linux.Sysctl
if oomScoreAdj := spec.Linux.Resources.OOMScoreAdj; oomScoreAdj != nil {
config.OomScoreAdj = *oomScoreAdj
}
for _, g := range spec.Process.User.AdditionalGids {
config.AdditionalGroups = append(config.AdditionalGroups, strconv.FormatUint(uint64(g), 10))
}
createHooks(spec, config)
config.Version = specs.Version
return config, nil
}
func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
flags, pgflags, data := parseMountOptions(m.Options)
source := m.Source
if m.Type == "bind" {
if !filepath.IsAbs(source) {
source = filepath.Join(cwd, m.Source)
}
}
return &configs.Mount{
Device: m.Type,
Source: source,
Destination: m.Destination,
Data: data,
Flags: flags,
PropagationFlags: pgflags,
}
}
func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*configs.Cgroup, error) {
var (
err error
myCgroupPath string
)
c := &configs.Cgroup{
Resources: &configs.Resources{},
}
if spec.Linux.CgroupsPath != nil {
myCgroupPath = libcontainerUtils.CleanPath(*spec.Linux.CgroupsPath)
}
if useSystemdCgroup {
if myCgroupPath == "" {
c.Parent = "system.slice"
c.ScopePrefix = "runc"
c.Name = name
} else {
// Parse the path from expected "slice:prefix:name"
// for e.g. "system.slice:docker:1234"
parts := strings.Split(myCgroupPath, ":")
if len(parts) != 3 {
return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups")
}
c.Parent = parts[0]
c.ScopePrefix = parts[1]
c.Name = parts[2]
}
} else {
if myCgroupPath == "" {
myCgroupPath, err = cgroups.GetThisCgroupDir("devices")
if err != nil {
return nil, err
}
myCgroupPath = filepath.Join(myCgroupPath, name)
}
c.Path = myCgroupPath
}
c.Resources.AllowedDevices = allowedDevices
r := spec.Linux.Resources
if r == nil {
return c, nil
}
for i, d := range spec.Linux.Resources.Devices {
var (
t = "a"
major = int64(-1)
minor = int64(-1)
)
if d.Type != nil {
t = *d.Type
}
if d.Major != nil {
major = *d.Major
}
if d.Minor != nil {
minor = *d.Minor
}
if d.Access == nil || *d.Access == "" {
return nil, fmt.Errorf("device access at %d field cannot be empty", i)
}
dt, err := stringToDeviceRune(t)
if err != nil {
return nil, err
}
dd := &configs.Device{
Type: dt,
Major: major,
Minor: minor,
Permissions: *d.Access,
Allow: d.Allow,
}
c.Resources.Devices = append(c.Resources.Devices, dd)
}
// append the default allowed devices to the end of the list
c.Resources.Devices = append(c.Resources.Devices, allowedDevices...)
if r.Memory != nil {
if r.Memory.Limit != nil {
c.Resources.Memory = int64(*r.Memory.Limit)
}
if r.Memory.Reservation != nil {
c.Resources.MemoryReservation = int64(*r.Memory.Reservation)
}
if r.Memory.Swap != nil {
c.Resources.MemorySwap = int64(*r.Memory.Swap)
}
if r.Memory.Kernel != nil {
c.Resources.KernelMemory = int64(*r.Memory.Kernel)
}
if r.Memory.KernelTCP != nil {
c.Resources.KernelMemoryTCP = int64(*r.Memory.KernelTCP)
}
if r.Memory.Swappiness != nil {
swappiness := int64(*r.Memory.Swappiness)
c.Resources.MemorySwappiness = &swappiness
}
}
if r.CPU != nil {
if r.CPU.Shares != nil {
c.Resources.CpuShares = int64(*r.CPU.Shares)
}
if r.CPU.Quota != nil {
c.Resources.CpuQuota = int64(*r.CPU.Quota)
}
if r.CPU.Period != nil {
c.Resources.CpuPeriod = int64(*r.CPU.Period)
}
if r.CPU.RealtimeRuntime != nil {
c.Resources.CpuRtRuntime = int64(*r.CPU.RealtimeRuntime)
}
if r.CPU.RealtimePeriod != nil {
c.Resources.CpuRtPeriod = int64(*r.CPU.RealtimePeriod)
}
if r.CPU.Cpus != nil {
c.Resources.CpusetCpus = *r.CPU.Cpus
}
if r.CPU.Mems != nil {
c.Resources.CpusetMems = *r.CPU.Mems
}
}
if r.Pids != nil {
c.Resources.PidsLimit = *r.Pids.Limit
}
if r.BlockIO != nil {
if r.BlockIO.Weight != nil {
c.Resources.BlkioWeight = *r.BlockIO.Weight
}
if r.BlockIO.LeafWeight != nil {
c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight
}
if r.BlockIO.WeightDevice != nil {
for _, wd := range r.BlockIO.WeightDevice {
weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, *wd.Weight, *wd.LeafWeight)
c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice)
}
}
if r.BlockIO.ThrottleReadBpsDevice != nil {
for _, td := range r.BlockIO.ThrottleReadBpsDevice {
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate)
c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleWriteBpsDevice != nil {
for _, td := range r.BlockIO.ThrottleWriteBpsDevice {
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate)
c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleReadIOPSDevice != nil {
for _, td := range r.BlockIO.ThrottleReadIOPSDevice {
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate)
c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleWriteIOPSDevice != nil {
for _, td := range r.BlockIO.ThrottleWriteIOPSDevice {
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, *td.Rate)
c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice)
}
}
}
for _, l := range r.HugepageLimits {
c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{
Pagesize: *l.Pagesize,
Limit: *l.Limit,
})
}
if r.DisableOOMKiller != nil {
c.Resources.OomKillDisable = *r.DisableOOMKiller
}
if r.Network != nil {
if r.Network.ClassID != nil {
c.Resources.NetClsClassid = string(*r.Network.ClassID)
}
for _, m := range r.Network.Priorities {
c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
Interface: m.Name,
Priority: int64(m.Priority),
})
}
}
return c, nil
}
func stringToDeviceRune(s string) (rune, error) {
switch s {
case "a":
return 'a', nil
case "b":
return 'b', nil
case "c":
return 'c', nil
default:
return 0, fmt.Errorf("invalid device type %q", s)
}
}
func createDevices(spec *specs.Spec, config *configs.Config) error {
// add whitelisted devices
config.Devices = []*configs.Device{
{
Type: 'c',
Path: "/dev/null",
Major: 1,
Minor: 3,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/random",
Major: 1,
Minor: 8,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/full",
Major: 1,
Minor: 7,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/tty",
Major: 5,
Minor: 0,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/zero",
Major: 1,
Minor: 5,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/urandom",
Major: 1,
Minor: 9,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
}
// merge in additional devices from the spec
for _, d := range spec.Linux.Devices {
var uid, gid uint32
if d.UID != nil {
uid = *d.UID
}
if d.GID != nil {
gid = *d.GID
}
dt, err := stringToDeviceRune(d.Type)
if err != nil {
return err
}
device := &configs.Device{
Type: dt,
Path: d.Path,
Major: d.Major,
Minor: d.Minor,
FileMode: *d.FileMode,
Uid: uid,
Gid: gid,
}
config.Devices = append(config.Devices, device)
}
return nil
}
func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
if len(spec.Linux.UIDMappings) == 0 {
return nil
}
// do not override the specified user namespace path
if config.Namespaces.PathOf(configs.NEWUSER) == "" {
config.Namespaces.Add(configs.NEWUSER, "")
}
create := func(m specs.IDMapping) configs.IDMap {
return configs.IDMap{
HostID: int(m.HostID),
ContainerID: int(m.ContainerID),
Size: int(m.Size),
}
}
for _, m := range spec.Linux.UIDMappings {
config.UidMappings = append(config.UidMappings, create(m))
}
for _, m := range spec.Linux.GIDMappings {
config.GidMappings = append(config.GidMappings, create(m))
}
rootUID, err := config.HostUID()
if err != nil {
return err
}
rootGID, err := config.HostGID()
if err != nil {
return err
}
for _, node := range config.Devices {
node.Uid = uint32(rootUID)
node.Gid = uint32(rootGID)
}
return nil
}
func createLibContainerRlimit(rlimit specs.Rlimit) (configs.Rlimit, error) {
rl, err := strToRlimit(rlimit.Type)
if err != nil {
return configs.Rlimit{}, err
}
return configs.Rlimit{
Type: rl,
Hard: uint64(rlimit.Hard),
Soft: uint64(rlimit.Soft),
}, nil
}
// parseMountOptions parses the string and returns the flags, propagation
// flags and any mount data that it contains.
func parseMountOptions(options []string) (int, []int, string) {
var (
flag int
pgflag []int
data []string
)
flags := map[string]struct {
clear bool
flag int
}{
"async": {true, syscall.MS_SYNCHRONOUS},
"atime": {true, syscall.MS_NOATIME},
"bind": {false, syscall.MS_BIND},
"defaults": {false, 0},
"dev": {true, syscall.MS_NODEV},
"diratime": {true, syscall.MS_NODIRATIME},
"dirsync": {false, syscall.MS_DIRSYNC},
"exec": {true, syscall.MS_NOEXEC},
"mand": {false, syscall.MS_MANDLOCK},
"noatime": {false, syscall.MS_NOATIME},
"nodev": {false, syscall.MS_NODEV},
"nodiratime": {false, syscall.MS_NODIRATIME},
"noexec": {false, syscall.MS_NOEXEC},
"nomand": {true, syscall.MS_MANDLOCK},
"norelatime": {true, syscall.MS_RELATIME},
"nostrictatime": {true, syscall.MS_STRICTATIME},
"nosuid": {false, syscall.MS_NOSUID},
"rbind": {false, syscall.MS_BIND | syscall.MS_REC},
"relatime": {false, syscall.MS_RELATIME},
"remount": {false, syscall.MS_REMOUNT},
"ro": {false, syscall.MS_RDONLY},
"rw": {true, syscall.MS_RDONLY},
"strictatime": {false, syscall.MS_STRICTATIME},
"suid": {true, syscall.MS_NOSUID},
"sync": {false, syscall.MS_SYNCHRONOUS},
}
propagationFlags := map[string]struct {
clear bool
flag int
}{
"private": {false, syscall.MS_PRIVATE},
"shared": {false, syscall.MS_SHARED},
"slave": {false, syscall.MS_SLAVE},
"unbindable": {false, syscall.MS_UNBINDABLE},
"rprivate": {false, syscall.MS_PRIVATE | syscall.MS_REC},
"rshared": {false, syscall.MS_SHARED | syscall.MS_REC},
"rslave": {false, syscall.MS_SLAVE | syscall.MS_REC},
"runbindable": {false, syscall.MS_UNBINDABLE | syscall.MS_REC},
}
for _, o := range options {
// If the option does not exist in the flags table or the flag
// is not supported on the platform,
// then it is a data value for a specific fs type
if f, exists := flags[o]; exists && f.flag != 0 {
if f.clear {
flag &= ^f.flag
} else {
flag |= f.flag
}
} else if f, exists := propagationFlags[o]; exists && f.flag != 0 {
pgflag = append(pgflag, f.flag)
} else {
data = append(data, o)
}
}
return flag, pgflag, strings.Join(data, ",")
}
func setupSeccomp(config *specs.Seccomp) (*configs.Seccomp, error) {
if config == nil {
return nil, nil
}
// No default action specified, no syscalls listed, assume seccomp disabled
if config.DefaultAction == "" && len(config.Syscalls) == 0 {
return nil, nil
}
newConfig := new(configs.Seccomp)
newConfig.Syscalls = []*configs.Syscall{}
if len(config.Architectures) > 0 {
newConfig.Architectures = []string{}
for _, arch := range config.Architectures {
newArch, err := seccomp.ConvertStringToArch(string(arch))
if err != nil {
return nil, err
}
newConfig.Architectures = append(newConfig.Architectures, newArch)
}
}
// Convert default action from string representation
newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction))
if err != nil {
return nil, err
}
newConfig.DefaultAction = newDefaultAction
// Loop through all syscall blocks and convert them to libcontainer format
for _, call := range config.Syscalls {
newAction, err := seccomp.ConvertStringToAction(string(call.Action))
if err != nil {
return nil, err
}
newCall := configs.Syscall{
Name: call.Name,
Action: newAction,
Args: []*configs.Arg{},
}
// Loop through all the arguments of the syscall and convert them
for _, arg := range call.Args {
newOp, err := seccomp.ConvertStringToOperator(string(arg.Op))
if err != nil {
return nil, err
}
newArg := configs.Arg{
Index: arg.Index,
Value: arg.Value,
ValueTwo: arg.ValueTwo,
Op: newOp,
}
newCall.Args = append(newCall.Args, &newArg)
}
newConfig.Syscalls = append(newConfig.Syscalls, &newCall)
}
return newConfig, nil
}
func createHooks(rspec *specs.Spec, config *configs.Config) {
config.Hooks = &configs.Hooks{}
for _, h := range rspec.Hooks.Prestart {
cmd := configs.Command{
Path: h.Path,
Args: h.Args,
Env: h.Env,
}
config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.Poststart {
cmd := configs.Command{
Path: h.Path,
Args: h.Args,
Env: h.Env,
}
config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.Poststop {
cmd := configs.Command{
Path: h.Path,
Args: h.Args,
Env: h.Env,
}
config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd))
}
}