// +build linux package main import ( "encoding/json" "fmt" "os" "path/filepath" "strings" "syscall" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" ) type UserMapping struct { From int `json:"from"` To int `json:"to"` Count int `json:"count"` } type Rlimit struct { Type int `json:"type"` Hard uint64 `json:"hard"` Soft uint64 `json:"soft"` } type HugepageLimit struct { Pagesize string `json:"pageSize"` Limit int `json:"limit"` } type IfPrioMap struct { Interface string `json:"interface"` Priority int64 `json:"priority"` } type Resources struct { // Memory reservation or soft_limit (in bytes) MemoryReservation int64 `json:"memoryReservation"` // Total memory usage (memory + swap); set `-1' to disable swap MemorySwap int64 `json:"memorySwap"` // Kernel memory limit (in bytes) KernelMemory int64 `json:"kernelMemory"` // CPU shares (relative weight vs. other containers) CpuShares int64 `json:"cpuShares"` // CPU hardcap limit (in usecs). Allowed cpu time in a given period. CpuQuota int64 `json:"cpuQuota"` // CPU period to be used for hardcapping (in usecs). 0 to use system default. CpuPeriod int64 `json:"cpuPeriod"` // How many time CPU will use in realtime scheduling (in usecs). CpuRtRuntime int64 `json:"cpuQuota"` // CPU period to be used for realtime scheduling (in usecs). CpuRtPeriod int64 `json:"cpuPeriod"` // CPU to use CpusetCpus string `json:"cpusetCpus"` // MEM to use CpusetMems string `json:"cpusetMems"` // IO read rate limit per cgroup per device, bytes per second. BlkioThrottleReadBpsDevice string `json:"blkioThrottleReadBpsDevice"` // IO write rate limit per cgroup per divice, bytes per second. BlkioThrottleWriteBpsDevice string `json:"blkioThrottleWriteBpsDevice"` // IO read rate limit per cgroup per device, IO per second. BlkioThrottleReadIOpsDevice string `json:"blkioThrottleReadIopsDevice"` // IO write rate limit per cgroup per device, IO per second. BlkioThrottleWriteIOpsDevice string `json:"blkioThrottleWriteIopsDevice"` // Specifies per cgroup weight, range is from 10 to 1000. BlkioWeight int64 `json:"blkioWeight"` // Weight per cgroup per device, can override BlkioWeight. BlkioWeightDevice string `json:"blkioWeightDevice"` // Hugetlb limit (in bytes) HugetlbLimit []*HugepageLimit `json:"hugetlbLimit"` // Whether to disable OOM Killer DisableOOMKiller bool `json:"disableOOMKiller"` // Set priority of network traffic for container NetPrioIfpriomap []*IfPrioMap `json:"netPrioIfpriomap"` // Set class identifier for container's network packets NetClsClassid string `json:"netClsClassid"` } type LinuxSpec struct { PortableSpec UserMapping map[string]UserMapping `json:"userMapping"` Rlimits []Rlimit `json:"rlimits"` SystemProperties map[string]string `json:"systemProperties"` Resources *Resources `json:"resources"` } var namespaceMapping = map[string]configs.NamespaceType{ "process": configs.NEWPID, "network": configs.NEWNET, "mount": configs.NEWNS, "user": configs.NEWUSER, "ipc": configs.NEWIPC, "uts": configs.NEWUTS, } // loadSpec loads the specification from the provided path. // If the path is empty then the default path will be "container.json" func loadSpec(path string) (*LinuxSpec, error) { if path == "" { path = "container.json" } f, err := os.Open(path) if err != nil { if os.IsNotExist(err) { return nil, fmt.Errorf("JSON specification file for %s not found", path) } return nil, err } defer f.Close() var s *LinuxSpec if err := json.NewDecoder(f).Decode(&s); err != nil { return nil, err } return s, nil } func createLibcontainerConfig(spec *LinuxSpec) (*configs.Config, error) { cwd, err := os.Getwd() if err != nil { return nil, err } config := &configs.Config{ Capabilities: spec.Capabilities, Rootfs: filepath.Join(cwd, spec.Root.Path), Readonlyfs: spec.Root.Readonly, Hostname: spec.Hostname, Privatefs: true, } for _, ns := range spec.Namespaces { t, exists := namespaceMapping[ns.Type] if !exists { return nil, fmt.Errorf("namespace %q does not exist", ns) } config.Namespaces.Add(t, ns.Path) } for _, m := range spec.Mounts { config.Mounts = append(config.Mounts, createLibcontianerMount(cwd, m)) } if err := createDevices(spec, config); err != nil { return nil, err } if err := setupUserNamespace(spec, config); err != nil { return nil, err } c, err := createCgroupConfig(spec, config.Devices) if err != nil { return nil, err } config.Cgroups = c if config.Readonlyfs { setReadonly(config) config.MaskPaths = []string{ "/proc/kcore", } config.ReadonlyPaths = []string{ "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", } } return config, nil } func createLibcontianerMount(cwd string, m Mount) *configs.Mount { flags, data := parseMountOptions(m.Options) source := m.Source if m.Type == "bind" { if !filepath.IsAbs(source) { source = filepath.Join(cwd, m.Source) } } return &configs.Mount{ Device: m.Type, Source: source, Destination: m.Destination, Data: data, Flags: flags, } } func createCgroupConfig(spec *LinuxSpec, devices []*configs.Device) (*configs.Cgroup, error) { myCgroupPath, err := cgroups.GetThisCgroupDir("devices") if err != nil { return nil, err } c := &configs.Cgroup{ Name: getDefaultID(), Parent: myCgroupPath, AllowedDevices: append(devices, allowedDevices...), CpuQuota: getCPUQuota(spec.Cpus), Memory: spec.Memory * 1024 * 1024, MemorySwap: -1, } if r := spec.Resources; r != nil { c.MemoryReservation = r.MemoryReservation c.MemorySwap = r.MemorySwap c.KernelMemory = r.KernelMemory c.CpuShares = r.CpuShares c.CpuQuota = r.CpuQuota c.CpuPeriod = r.CpuPeriod c.CpuRtRuntime = r.CpuRtRuntime c.CpuRtPeriod = r.CpuRtPeriod c.CpusetCpus = r.CpusetCpus c.CpusetMems = r.CpusetMems c.BlkioThrottleReadBpsDevice = r.BlkioThrottleReadBpsDevice c.BlkioThrottleWriteBpsDevice = r.BlkioThrottleWriteBpsDevice c.BlkioThrottleReadIOpsDevice = r.BlkioThrottleReadIOpsDevice c.BlkioThrottleWriteIOpsDevice = r.BlkioThrottleWriteIOpsDevice c.BlkioWeight = r.BlkioWeight c.BlkioWeightDevice = r.BlkioWeightDevice for _, l := range r.HugetlbLimit { c.HugetlbLimit = append(c.HugetlbLimit, &configs.HugepageLimit{ Pagesize: l.Pagesize, Limit: l.Limit, }) } c.OomKillDisable = r.DisableOOMKiller for _, m := range r.NetPrioIfpriomap { c.NetPrioIfpriomap = append(c.NetPrioIfpriomap, &configs.IfPrioMap{ Interface: m.Interface, Priority: m.Priority, }) } c.NetClsClassid = r.NetClsClassid } return c, nil } func createDevices(spec *LinuxSpec, config *configs.Config) error { for _, name := range spec.Devices { d, err := devices.DeviceFromPath(filepath.Join("/dev", name), "rwm") if err != nil { return err } config.Devices = append(config.Devices, d) } return nil } func setReadonly(config *configs.Config) { for _, m := range config.Mounts { if m.Device == "sysfs" { m.Flags |= syscall.MS_RDONLY } } } func getCPUQuota(cpus float64) int64 { return int64(cpus * cpuQuotaMultiplyer) } func setupUserNamespace(spec *LinuxSpec, config *configs.Config) error { if len(spec.UserMapping) == 0 { return nil } config.Namespaces.Add(configs.NEWUSER, "") mappings := make(map[string][]configs.IDMap) for k, v := range spec.UserMapping { mappings[k] = append(mappings[k], configs.IDMap{ ContainerID: v.From, HostID: v.To, Size: v.Count, }) } config.UidMappings = mappings["uid"] config.GidMappings = mappings["gid"] rootUid, err := config.HostUID() if err != nil { return err } rootGid, err := config.HostGID() if err != nil { return err } for _, node := range config.Devices { node.Uid = uint32(rootUid) node.Gid = uint32(rootGid) } return nil } // parseMountOptions parses the string and returns the flags and any mount data that // it contains. func parseMountOptions(options string) (int, string) { var ( flag int data []string ) flags := map[string]struct { clear bool flag int }{ "defaults": {false, 0}, "ro": {false, syscall.MS_RDONLY}, "rw": {true, syscall.MS_RDONLY}, "suid": {true, syscall.MS_NOSUID}, "nosuid": {false, syscall.MS_NOSUID}, "dev": {true, syscall.MS_NODEV}, "nodev": {false, syscall.MS_NODEV}, "exec": {true, syscall.MS_NOEXEC}, "noexec": {false, syscall.MS_NOEXEC}, "sync": {false, syscall.MS_SYNCHRONOUS}, "async": {true, syscall.MS_SYNCHRONOUS}, "dirsync": {false, syscall.MS_DIRSYNC}, "remount": {false, syscall.MS_REMOUNT}, "mand": {false, syscall.MS_MANDLOCK}, "nomand": {true, syscall.MS_MANDLOCK}, "atime": {true, syscall.MS_NOATIME}, "noatime": {false, syscall.MS_NOATIME}, "diratime": {true, syscall.MS_NODIRATIME}, "nodiratime": {false, syscall.MS_NODIRATIME}, "bind": {false, syscall.MS_BIND}, "rbind": {false, syscall.MS_BIND | syscall.MS_REC}, "unbindable": {false, syscall.MS_UNBINDABLE}, "runbindable": {false, syscall.MS_UNBINDABLE | syscall.MS_REC}, "private": {false, syscall.MS_PRIVATE}, "rprivate": {false, syscall.MS_PRIVATE | syscall.MS_REC}, "shared": {false, syscall.MS_SHARED}, "rshared": {false, syscall.MS_SHARED | syscall.MS_REC}, "slave": {false, syscall.MS_SLAVE}, "rslave": {false, syscall.MS_SLAVE | syscall.MS_REC}, "relatime": {false, syscall.MS_RELATIME}, "norelatime": {true, syscall.MS_RELATIME}, "strictatime": {false, syscall.MS_STRICTATIME}, "nostrictatime": {true, syscall.MS_STRICTATIME}, } for _, o := range strings.Split(options, ",") { // If the option does not exist in the flags table or the flag // is not supported on the platform, // then it is a data value for a specific fs type if f, exists := flags[o]; exists && f.flag != 0 { if f.clear { flag &= ^f.flag } else { flag |= f.flag } } else { data = append(data, o) } } return flag, strings.Join(data, ",") }