2015-07-03 00:59:30 +08:00
|
|
|
// +build linux
|
|
|
|
|
2015-06-22 10:31:12 +08:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"encoding/json"
|
|
|
|
"fmt"
|
2015-09-02 00:32:29 +08:00
|
|
|
"io/ioutil"
|
2015-07-03 00:59:30 +08:00
|
|
|
"os"
|
|
|
|
"path/filepath"
|
2015-06-22 10:31:12 +08:00
|
|
|
"runtime"
|
2015-07-03 00:59:30 +08:00
|
|
|
"strings"
|
|
|
|
"syscall"
|
2015-06-22 10:31:12 +08:00
|
|
|
|
2015-06-30 07:49:13 +08:00
|
|
|
"github.com/Sirupsen/logrus"
|
2015-06-22 10:31:12 +08:00
|
|
|
"github.com/codegangsta/cli"
|
2015-07-03 00:59:30 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
2015-08-25 02:30:45 +08:00
|
|
|
"github.com/opencontainers/runc/libcontainer/seccomp"
|
2015-07-03 00:55:24 +08:00
|
|
|
"github.com/opencontainers/specs"
|
2015-06-22 10:31:12 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
var specCommand = cli.Command{
|
|
|
|
Name: "spec",
|
|
|
|
Usage: "create a new specification file",
|
2015-09-02 00:32:29 +08:00
|
|
|
Flags: []cli.Flag{
|
|
|
|
cli.StringFlag{Name: "config-file, c", Value: "config.json", Usage: "path to spec file for writing"},
|
|
|
|
cli.StringFlag{Name: "runtime-file, r", Value: "runtime.json", Usage: "path to runtime file for writing"},
|
|
|
|
},
|
2015-06-22 10:31:12 +08:00
|
|
|
Action: func(context *cli.Context) {
|
2015-07-03 00:59:30 +08:00
|
|
|
spec := specs.LinuxSpec{
|
|
|
|
Spec: specs.Spec{
|
|
|
|
Version: specs.Version,
|
|
|
|
Platform: specs.Platform{
|
|
|
|
OS: runtime.GOOS,
|
|
|
|
Arch: runtime.GOARCH,
|
2015-06-30 02:21:05 +08:00
|
|
|
},
|
2015-07-03 00:59:30 +08:00
|
|
|
Root: specs.Root{
|
|
|
|
Path: "rootfs",
|
|
|
|
Readonly: true,
|
2015-06-22 10:31:12 +08:00
|
|
|
},
|
2015-07-03 00:59:30 +08:00
|
|
|
Process: specs.Process{
|
|
|
|
Terminal: true,
|
|
|
|
User: specs.User{},
|
|
|
|
Args: []string{
|
|
|
|
"sh",
|
|
|
|
},
|
|
|
|
Env: []string{
|
|
|
|
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
|
|
|
|
"TERM=xterm",
|
|
|
|
},
|
2015-06-22 10:31:12 +08:00
|
|
|
},
|
2015-07-03 00:59:30 +08:00
|
|
|
Hostname: "shell",
|
2015-09-02 00:32:29 +08:00
|
|
|
Mounts: []specs.MountPoint{
|
2015-07-03 00:59:30 +08:00
|
|
|
{
|
2015-09-02 00:32:29 +08:00
|
|
|
Name: "proc",
|
|
|
|
Path: "/proc",
|
2015-07-03 00:59:30 +08:00
|
|
|
},
|
|
|
|
{
|
2015-09-02 00:32:29 +08:00
|
|
|
Name: "dev",
|
|
|
|
Path: "/dev",
|
2015-07-03 00:59:30 +08:00
|
|
|
},
|
|
|
|
{
|
2015-09-02 00:32:29 +08:00
|
|
|
Name: "devpts",
|
|
|
|
Path: "/dev/pts",
|
2015-07-03 00:59:30 +08:00
|
|
|
},
|
|
|
|
{
|
2015-09-02 00:32:29 +08:00
|
|
|
Name: "shm",
|
|
|
|
Path: "/dev/shm",
|
2015-07-03 00:59:30 +08:00
|
|
|
},
|
|
|
|
{
|
2015-09-02 00:32:29 +08:00
|
|
|
Name: "mqueue",
|
|
|
|
Path: "/dev/mqueue",
|
2015-07-03 00:59:30 +08:00
|
|
|
},
|
|
|
|
{
|
2015-09-02 00:32:29 +08:00
|
|
|
Name: "sysfs",
|
|
|
|
Path: "/sys",
|
2015-07-03 00:59:30 +08:00
|
|
|
},
|
2015-07-15 09:31:39 +08:00
|
|
|
{
|
2015-09-02 00:32:29 +08:00
|
|
|
Name: "cgroup",
|
|
|
|
Path: "/sys/fs/cgroup",
|
2015-07-15 09:31:39 +08:00
|
|
|
},
|
2015-06-22 10:31:12 +08:00
|
|
|
},
|
2015-07-03 00:59:30 +08:00
|
|
|
},
|
|
|
|
Linux: specs.Linux{
|
2015-09-02 00:32:29 +08:00
|
|
|
Capabilities: []string{
|
2015-09-10 07:25:43 +08:00
|
|
|
"CAP_AUDIT_WRITE",
|
|
|
|
"CAP_KILL",
|
|
|
|
"CAP_NET_BIND_SERVICE",
|
2015-09-02 00:32:29 +08:00
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
rspec := specs.LinuxRuntimeSpec{
|
|
|
|
RuntimeSpec: specs.RuntimeSpec{
|
|
|
|
Mounts: map[string]specs.Mount{
|
|
|
|
"proc": {
|
|
|
|
Type: "proc",
|
|
|
|
Source: "proc",
|
|
|
|
Options: nil,
|
|
|
|
},
|
|
|
|
"dev": {
|
|
|
|
Type: "tmpfs",
|
|
|
|
Source: "tmpfs",
|
|
|
|
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
|
|
|
|
},
|
|
|
|
"devpts": {
|
|
|
|
Type: "devpts",
|
|
|
|
Source: "devpts",
|
|
|
|
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
|
|
|
|
},
|
|
|
|
"shm": {
|
|
|
|
Type: "tmpfs",
|
|
|
|
Source: "shm",
|
|
|
|
Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
|
|
|
|
},
|
|
|
|
"mqueue": {
|
|
|
|
Type: "mqueue",
|
|
|
|
Source: "mqueue",
|
|
|
|
Options: []string{"nosuid", "noexec", "nodev"},
|
|
|
|
},
|
|
|
|
"sysfs": {
|
|
|
|
Type: "sysfs",
|
|
|
|
Source: "sysfs",
|
|
|
|
Options: []string{"nosuid", "noexec", "nodev"},
|
|
|
|
},
|
|
|
|
"cgroup": {
|
|
|
|
Type: "cgroup",
|
|
|
|
Source: "cgroup",
|
|
|
|
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
Linux: specs.LinuxRuntime{
|
2015-07-03 00:59:30 +08:00
|
|
|
Namespaces: []specs.Namespace{
|
|
|
|
{
|
2015-08-05 05:12:18 +08:00
|
|
|
Type: "pid",
|
2015-07-03 00:59:30 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: "network",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: "ipc",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: "uts",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: "mount",
|
|
|
|
},
|
2015-06-22 10:31:12 +08:00
|
|
|
},
|
2015-08-23 19:17:31 +08:00
|
|
|
Rlimits: []specs.Rlimit{
|
|
|
|
{
|
2015-09-02 00:32:29 +08:00
|
|
|
Type: "RLIMIT_NOFILE",
|
2015-08-23 19:17:31 +08:00
|
|
|
Hard: uint64(1024),
|
|
|
|
Soft: uint64(1024),
|
|
|
|
},
|
|
|
|
},
|
2015-08-12 05:24:00 +08:00
|
|
|
Devices: []specs.Device{
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/null",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 3,
|
|
|
|
Permissions: "rwm",
|
|
|
|
FileMode: 0666,
|
|
|
|
UID: 0,
|
|
|
|
GID: 0,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/random",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 8,
|
|
|
|
Permissions: "rwm",
|
|
|
|
FileMode: 0666,
|
|
|
|
UID: 0,
|
|
|
|
GID: 0,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/full",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 7,
|
|
|
|
Permissions: "rwm",
|
|
|
|
FileMode: 0666,
|
|
|
|
UID: 0,
|
|
|
|
GID: 0,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/tty",
|
|
|
|
Major: 5,
|
|
|
|
Minor: 0,
|
|
|
|
Permissions: "rwm",
|
|
|
|
FileMode: 0666,
|
|
|
|
UID: 0,
|
|
|
|
GID: 0,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/zero",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 5,
|
|
|
|
Permissions: "rwm",
|
|
|
|
FileMode: 0666,
|
|
|
|
UID: 0,
|
|
|
|
GID: 0,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Type: 'c',
|
|
|
|
Path: "/dev/urandom",
|
|
|
|
Major: 1,
|
|
|
|
Minor: 9,
|
|
|
|
Permissions: "rwm",
|
|
|
|
FileMode: 0666,
|
|
|
|
UID: 0,
|
|
|
|
GID: 0,
|
|
|
|
},
|
2015-06-22 10:31:12 +08:00
|
|
|
},
|
2015-09-02 00:32:29 +08:00
|
|
|
Resources: &specs.Resources{
|
2015-07-14 10:28:21 +08:00
|
|
|
Memory: specs.Memory{
|
|
|
|
Swappiness: -1,
|
|
|
|
},
|
|
|
|
},
|
2015-08-25 02:30:45 +08:00
|
|
|
Seccomp: specs.Seccomp{
|
|
|
|
DefaultAction: "SCMP_ACT_ALLOW",
|
|
|
|
Syscalls: []*specs.Syscall{},
|
|
|
|
},
|
2015-06-22 10:31:12 +08:00
|
|
|
},
|
|
|
|
}
|
2015-09-02 00:32:29 +08:00
|
|
|
checkNoFile := func(name string) error {
|
|
|
|
_, err := os.Stat(name)
|
|
|
|
if err == nil {
|
|
|
|
return fmt.Errorf("File %s exists. Remove it first", name)
|
|
|
|
}
|
|
|
|
if !os.IsNotExist(err) {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
cName := context.String("config-file")
|
|
|
|
rName := context.String("runtime-file")
|
|
|
|
if err := checkNoFile(cName); err != nil {
|
|
|
|
logrus.Fatal(err)
|
|
|
|
}
|
|
|
|
if err := checkNoFile(rName); err != nil {
|
|
|
|
logrus.Fatal(err)
|
|
|
|
}
|
2015-06-22 10:31:12 +08:00
|
|
|
data, err := json.MarshalIndent(&spec, "", "\t")
|
|
|
|
if err != nil {
|
2015-06-30 07:49:13 +08:00
|
|
|
logrus.Fatal(err)
|
2015-06-22 10:31:12 +08:00
|
|
|
}
|
2015-09-02 00:32:29 +08:00
|
|
|
if err := ioutil.WriteFile(cName, data, 0666); err != nil {
|
|
|
|
logrus.Fatal(err)
|
|
|
|
}
|
|
|
|
rdata, err := json.MarshalIndent(&rspec, "", "\t")
|
|
|
|
if err != nil {
|
|
|
|
logrus.Fatal(err)
|
|
|
|
}
|
|
|
|
if err := ioutil.WriteFile(rName, rdata, 0666); err != nil {
|
|
|
|
logrus.Fatal(err)
|
|
|
|
}
|
2015-06-22 10:31:12 +08:00
|
|
|
},
|
|
|
|
}
|
2015-07-03 00:59:30 +08:00
|
|
|
|
2015-09-02 00:32:29 +08:00
|
|
|
var namespaceMapping = map[specs.NamespaceType]configs.NamespaceType{
|
|
|
|
specs.PIDNamespace: configs.NEWPID,
|
|
|
|
specs.NetworkNamespace: configs.NEWNET,
|
|
|
|
specs.MountNamespace: configs.NEWNS,
|
|
|
|
specs.UserNamespace: configs.NEWUSER,
|
|
|
|
specs.IPCNamespace: configs.NEWIPC,
|
|
|
|
specs.UTSNamespace: configs.NEWUTS,
|
2015-07-03 00:59:30 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// loadSpec loads the specification from the provided path.
|
|
|
|
// If the path is empty then the default path will be "config.json"
|
2015-09-02 00:32:29 +08:00
|
|
|
func loadSpec(cPath, rPath string) (spec *specs.LinuxSpec, rspec *specs.LinuxRuntimeSpec, err error) {
|
|
|
|
cf, err := os.Open(cPath)
|
|
|
|
if err != nil {
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
return nil, nil, fmt.Errorf("JSON specification file at %s not found", cPath)
|
|
|
|
}
|
|
|
|
return
|
2015-07-03 00:59:30 +08:00
|
|
|
}
|
2015-09-16 10:54:53 +08:00
|
|
|
defer cf.Close()
|
|
|
|
|
2015-09-02 00:32:29 +08:00
|
|
|
rf, err := os.Open(rPath)
|
2015-07-03 00:59:30 +08:00
|
|
|
if err != nil {
|
|
|
|
if os.IsNotExist(err) {
|
2015-09-02 00:32:29 +08:00
|
|
|
return nil, nil, fmt.Errorf("JSON runtime config file at %s not found", rPath)
|
2015-07-03 00:59:30 +08:00
|
|
|
}
|
2015-09-02 00:32:29 +08:00
|
|
|
return
|
2015-07-03 00:59:30 +08:00
|
|
|
}
|
2015-09-02 00:32:29 +08:00
|
|
|
defer rf.Close()
|
2015-09-16 10:54:53 +08:00
|
|
|
|
2015-09-02 00:32:29 +08:00
|
|
|
if err = json.NewDecoder(cf).Decode(&spec); err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if err = json.NewDecoder(rf).Decode(&rspec); err != nil {
|
|
|
|
return
|
2015-07-03 00:59:30 +08:00
|
|
|
}
|
2015-09-02 00:32:29 +08:00
|
|
|
return spec, rspec, checkSpecVersion(spec)
|
2015-07-03 00:59:30 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// checkSpecVersion makes sure that the spec version matches runc's while we are in the initial
|
|
|
|
// development period. It is better to hard fail than have missing fields or options in the spec.
|
|
|
|
func checkSpecVersion(s *specs.LinuxSpec) error {
|
|
|
|
if s.Version != specs.Version {
|
|
|
|
return fmt.Errorf("spec version is not compatible with implemented version %q: spec %q", specs.Version, s.Version)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-09-02 00:32:29 +08:00
|
|
|
func createLibcontainerConfig(cgroupName string, spec *specs.LinuxSpec, rspec *specs.LinuxRuntimeSpec) (*configs.Config, error) {
|
2015-07-03 00:59:30 +08:00
|
|
|
cwd, err := os.Getwd()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
rootfsPath := spec.Root.Path
|
|
|
|
if !filepath.IsAbs(rootfsPath) {
|
|
|
|
rootfsPath = filepath.Join(cwd, rootfsPath)
|
|
|
|
}
|
|
|
|
config := &configs.Config{
|
|
|
|
Rootfs: rootfsPath,
|
|
|
|
Capabilities: spec.Linux.Capabilities,
|
|
|
|
Readonlyfs: spec.Root.Readonly,
|
|
|
|
Hostname: spec.Hostname,
|
|
|
|
Privatefs: true,
|
|
|
|
}
|
2015-09-02 00:32:29 +08:00
|
|
|
for _, ns := range rspec.Linux.Namespaces {
|
2015-07-03 00:59:30 +08:00
|
|
|
t, exists := namespaceMapping[ns.Type]
|
|
|
|
if !exists {
|
|
|
|
return nil, fmt.Errorf("namespace %q does not exist", ns)
|
|
|
|
}
|
|
|
|
config.Namespaces.Add(t, ns.Path)
|
|
|
|
}
|
2015-07-28 07:36:28 +08:00
|
|
|
if config.Namespaces.Contains(configs.NEWNET) {
|
|
|
|
config.Networks = []*configs.Network{
|
|
|
|
{
|
|
|
|
Type: "loopback",
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
2015-09-02 00:32:29 +08:00
|
|
|
for _, mp := range spec.Mounts {
|
|
|
|
m, ok := rspec.Mounts[mp.Name]
|
|
|
|
if !ok {
|
|
|
|
return nil, fmt.Errorf("Mount with Name %q not found in runtime config", mp.Name)
|
|
|
|
}
|
|
|
|
config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, mp.Path, m))
|
2015-07-03 00:59:30 +08:00
|
|
|
}
|
2015-09-02 00:32:29 +08:00
|
|
|
if err := createDevices(rspec, config); err != nil {
|
2015-07-03 00:59:30 +08:00
|
|
|
return nil, err
|
|
|
|
}
|
2015-09-02 00:32:29 +08:00
|
|
|
if err := setupUserNamespace(rspec, config); err != nil {
|
2015-07-03 00:59:30 +08:00
|
|
|
return nil, err
|
|
|
|
}
|
2015-09-02 00:32:29 +08:00
|
|
|
for _, rlimit := range rspec.Linux.Rlimits {
|
|
|
|
rl, err := createLibContainerRlimit(rlimit)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
config.Rlimits = append(config.Rlimits, rl)
|
2015-08-23 19:17:31 +08:00
|
|
|
}
|
2015-09-02 00:32:29 +08:00
|
|
|
c, err := createCgroupConfig(cgroupName, rspec, config.Devices)
|
2015-07-03 00:59:30 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
config.Cgroups = c
|
|
|
|
if config.Readonlyfs {
|
|
|
|
setReadonly(config)
|
|
|
|
config.MaskPaths = []string{
|
|
|
|
"/proc/kcore",
|
|
|
|
}
|
|
|
|
config.ReadonlyPaths = []string{
|
|
|
|
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
|
|
|
|
}
|
|
|
|
}
|
2015-09-02 00:32:29 +08:00
|
|
|
seccomp, err := setupSeccomp(&rspec.Linux.Seccomp)
|
2015-08-25 02:30:45 +08:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
config.Seccomp = seccomp
|
2015-09-02 00:32:29 +08:00
|
|
|
config.Sysctl = rspec.Linux.Sysctl
|
|
|
|
config.ProcessLabel = rspec.Linux.SelinuxProcessLabel
|
|
|
|
config.AppArmorProfile = rspec.Linux.ApparmorProfile
|
2015-07-03 00:59:30 +08:00
|
|
|
return config, nil
|
|
|
|
}
|
|
|
|
|
2015-09-02 00:32:29 +08:00
|
|
|
func createLibcontainerMount(cwd, dest string, m specs.Mount) *configs.Mount {
|
libcontainer: Allow passing mount propagation flags
Right now if one passes a mount propagation flag in spec file, it
does not take effect. For example, try following in spec json file.
{
"type": "bind",
"source": "/root/mnt-source",
"destination": "/root/mnt-dest",
"options": "rbind,shared"
}
One would expect that /root/mnt-dest will be shared inside the container
but that's not the case.
#findmnt -o TARGET,PROPAGATION
`-/root/mnt-dest private
Reason being that propagation flags can't be passed in along with other
regular flags. They need to be passed in a separate call to mount syscall.
That too, one propagation flag at a time. (from mount man page).
Hence, store propagation flags separately in a slice and apply these
in that order after the mount call wherever appropriate. This allows
user to control the propagation property of mount point inside
the container.
Storing them separately also solves another problem where recursive flag
(syscall.MS_REC) can get mixed up. For example, options "rbind,private"
and "bind,rprivate" will be same and there will be no way to differentiate
between these if all the flags are stored in a single integer.
This patch would allow one to pass propagation flags "[r]shared,[r]slave,
[r]private,[r]unbindable" in spec file as per mount property.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
2015-09-17 03:53:23 +08:00
|
|
|
flags, pgflags, data := parseMountOptions(m.Options)
|
2015-07-03 00:59:30 +08:00
|
|
|
source := m.Source
|
|
|
|
if m.Type == "bind" {
|
|
|
|
if !filepath.IsAbs(source) {
|
|
|
|
source = filepath.Join(cwd, m.Source)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return &configs.Mount{
|
libcontainer: Allow passing mount propagation flags
Right now if one passes a mount propagation flag in spec file, it
does not take effect. For example, try following in spec json file.
{
"type": "bind",
"source": "/root/mnt-source",
"destination": "/root/mnt-dest",
"options": "rbind,shared"
}
One would expect that /root/mnt-dest will be shared inside the container
but that's not the case.
#findmnt -o TARGET,PROPAGATION
`-/root/mnt-dest private
Reason being that propagation flags can't be passed in along with other
regular flags. They need to be passed in a separate call to mount syscall.
That too, one propagation flag at a time. (from mount man page).
Hence, store propagation flags separately in a slice and apply these
in that order after the mount call wherever appropriate. This allows
user to control the propagation property of mount point inside
the container.
Storing them separately also solves another problem where recursive flag
(syscall.MS_REC) can get mixed up. For example, options "rbind,private"
and "bind,rprivate" will be same and there will be no way to differentiate
between these if all the flags are stored in a single integer.
This patch would allow one to pass propagation flags "[r]shared,[r]slave,
[r]private,[r]unbindable" in spec file as per mount property.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
2015-09-17 03:53:23 +08:00
|
|
|
Device: m.Type,
|
|
|
|
Source: source,
|
|
|
|
Destination: dest,
|
|
|
|
Data: data,
|
|
|
|
Flags: flags,
|
|
|
|
PropagationFlags: pgflags,
|
2015-07-03 00:59:30 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-02 00:32:29 +08:00
|
|
|
func createCgroupConfig(name string, spec *specs.LinuxRuntimeSpec, devices []*configs.Device) (*configs.Cgroup, error) {
|
2015-07-03 00:59:30 +08:00
|
|
|
myCgroupPath, err := cgroups.GetThisCgroupDir("devices")
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
c := &configs.Cgroup{
|
2015-08-11 07:22:49 +08:00
|
|
|
Name: name,
|
2015-07-03 00:59:30 +08:00
|
|
|
Parent: myCgroupPath,
|
|
|
|
AllowedDevices: append(devices, allowedDevices...),
|
|
|
|
}
|
|
|
|
r := spec.Linux.Resources
|
2015-07-07 09:20:51 +08:00
|
|
|
c.Memory = r.Memory.Limit
|
2015-07-03 00:59:30 +08:00
|
|
|
c.MemoryReservation = r.Memory.Reservation
|
|
|
|
c.MemorySwap = r.Memory.Swap
|
|
|
|
c.KernelMemory = r.Memory.Kernel
|
2015-07-14 10:28:21 +08:00
|
|
|
c.MemorySwappiness = r.Memory.Swappiness
|
2015-07-03 00:59:30 +08:00
|
|
|
c.CpuShares = r.CPU.Shares
|
|
|
|
c.CpuQuota = r.CPU.Quota
|
|
|
|
c.CpuPeriod = r.CPU.Period
|
|
|
|
c.CpuRtRuntime = r.CPU.RealtimeRuntime
|
|
|
|
c.CpuRtPeriod = r.CPU.RealtimePeriod
|
|
|
|
c.CpusetCpus = r.CPU.Cpus
|
|
|
|
c.CpusetMems = r.CPU.Mems
|
|
|
|
c.BlkioThrottleReadBpsDevice = r.BlockIO.ThrottleReadBpsDevice
|
|
|
|
c.BlkioThrottleWriteBpsDevice = r.BlockIO.ThrottleWriteBpsDevice
|
|
|
|
c.BlkioThrottleReadIOpsDevice = r.BlockIO.ThrottleReadIOpsDevice
|
|
|
|
c.BlkioThrottleWriteIOpsDevice = r.BlockIO.ThrottleWriteIOpsDevice
|
|
|
|
c.BlkioWeight = r.BlockIO.Weight
|
|
|
|
c.BlkioWeightDevice = r.BlockIO.WeightDevice
|
|
|
|
for _, l := range r.HugepageLimits {
|
|
|
|
c.HugetlbLimit = append(c.HugetlbLimit, &configs.HugepageLimit{
|
|
|
|
Pagesize: l.Pagesize,
|
|
|
|
Limit: l.Limit,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
c.OomKillDisable = r.DisableOOMKiller
|
|
|
|
c.NetClsClassid = r.Network.ClassID
|
|
|
|
for _, m := range r.Network.Priorities {
|
|
|
|
c.NetPrioIfpriomap = append(c.NetPrioIfpriomap, &configs.IfPrioMap{
|
|
|
|
Interface: m.Name,
|
|
|
|
Priority: m.Priority,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
return c, nil
|
|
|
|
}
|
|
|
|
|
2015-09-02 00:32:29 +08:00
|
|
|
func createDevices(spec *specs.LinuxRuntimeSpec, config *configs.Config) error {
|
2015-08-12 05:24:00 +08:00
|
|
|
for _, d := range spec.Linux.Devices {
|
|
|
|
device := &configs.Device{
|
|
|
|
Type: d.Type,
|
|
|
|
Path: d.Path,
|
|
|
|
Major: d.Major,
|
|
|
|
Minor: d.Minor,
|
|
|
|
Permissions: d.Permissions,
|
|
|
|
FileMode: d.FileMode,
|
|
|
|
Uid: d.UID,
|
|
|
|
Gid: d.GID,
|
2015-07-03 00:59:30 +08:00
|
|
|
}
|
2015-08-12 05:24:00 +08:00
|
|
|
config.Devices = append(config.Devices, device)
|
2015-07-03 00:59:30 +08:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func setReadonly(config *configs.Config) {
|
|
|
|
for _, m := range config.Mounts {
|
|
|
|
if m.Device == "sysfs" {
|
|
|
|
m.Flags |= syscall.MS_RDONLY
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-02 00:32:29 +08:00
|
|
|
func setupUserNamespace(spec *specs.LinuxRuntimeSpec, config *configs.Config) error {
|
2015-08-04 03:00:36 +08:00
|
|
|
if len(spec.Linux.UIDMappings) == 0 {
|
2015-07-03 00:59:30 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
config.Namespaces.Add(configs.NEWUSER, "")
|
|
|
|
create := func(m specs.IDMapping) configs.IDMap {
|
|
|
|
return configs.IDMap{
|
2015-07-08 15:41:43 +08:00
|
|
|
HostID: int(m.HostID),
|
|
|
|
ContainerID: int(m.ContainerID),
|
|
|
|
Size: int(m.Size),
|
2015-07-03 00:59:30 +08:00
|
|
|
}
|
|
|
|
}
|
2015-08-04 03:00:36 +08:00
|
|
|
for _, m := range spec.Linux.UIDMappings {
|
2015-07-03 00:59:30 +08:00
|
|
|
config.UidMappings = append(config.UidMappings, create(m))
|
|
|
|
}
|
2015-08-04 03:00:36 +08:00
|
|
|
for _, m := range spec.Linux.GIDMappings {
|
2015-07-03 00:59:30 +08:00
|
|
|
config.GidMappings = append(config.GidMappings, create(m))
|
|
|
|
}
|
2015-08-05 05:44:45 +08:00
|
|
|
rootUID, err := config.HostUID()
|
2015-07-03 00:59:30 +08:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2015-08-05 05:44:45 +08:00
|
|
|
rootGID, err := config.HostGID()
|
2015-07-03 00:59:30 +08:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
for _, node := range config.Devices {
|
2015-08-05 05:44:45 +08:00
|
|
|
node.Uid = uint32(rootUID)
|
|
|
|
node.Gid = uint32(rootGID)
|
2015-07-03 00:59:30 +08:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-09-02 00:32:29 +08:00
|
|
|
func createLibContainerRlimit(rlimit specs.Rlimit) (configs.Rlimit, error) {
|
|
|
|
rl, err := strToRlimit(rlimit.Type)
|
|
|
|
if err != nil {
|
|
|
|
return configs.Rlimit{}, err
|
|
|
|
}
|
2015-08-23 19:17:31 +08:00
|
|
|
return configs.Rlimit{
|
2015-09-02 00:32:29 +08:00
|
|
|
Type: rl,
|
2015-08-23 19:17:31 +08:00
|
|
|
Hard: uint64(rlimit.Hard),
|
|
|
|
Soft: uint64(rlimit.Soft),
|
2015-09-02 00:32:29 +08:00
|
|
|
}, nil
|
2015-08-23 19:17:31 +08:00
|
|
|
}
|
|
|
|
|
libcontainer: Allow passing mount propagation flags
Right now if one passes a mount propagation flag in spec file, it
does not take effect. For example, try following in spec json file.
{
"type": "bind",
"source": "/root/mnt-source",
"destination": "/root/mnt-dest",
"options": "rbind,shared"
}
One would expect that /root/mnt-dest will be shared inside the container
but that's not the case.
#findmnt -o TARGET,PROPAGATION
`-/root/mnt-dest private
Reason being that propagation flags can't be passed in along with other
regular flags. They need to be passed in a separate call to mount syscall.
That too, one propagation flag at a time. (from mount man page).
Hence, store propagation flags separately in a slice and apply these
in that order after the mount call wherever appropriate. This allows
user to control the propagation property of mount point inside
the container.
Storing them separately also solves another problem where recursive flag
(syscall.MS_REC) can get mixed up. For example, options "rbind,private"
and "bind,rprivate" will be same and there will be no way to differentiate
between these if all the flags are stored in a single integer.
This patch would allow one to pass propagation flags "[r]shared,[r]slave,
[r]private,[r]unbindable" in spec file as per mount property.
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
2015-09-17 03:53:23 +08:00
|
|
|
// parseMountOptions splits a list of mount options into the pieces needed to
// perform a mount and returns, in order:
//
//   - the regular mount flags, OR-ed into a single int. Options are applied
//     in the order given, so a later option can clear a flag set by an
//     earlier one (for example "ro" followed by "rw");
//   - the propagation flags ("[r]private", "[r]shared", "[r]slave",
//     "[r]unbindable"), one slice element per option in the order given.
//     They are kept apart from the regular flags so that the recursive bit
//     of "rbind" and of an "r"-prefixed propagation option cannot be mixed up;
//   - every remaining option joined with commas, to be used as
//     filesystem-specific mount data.
//
// "defaults" is recognised and contributes nothing: it sets no flag and is
// not forwarded as mount data.
func parseMountOptions(options []string) (int, []int, string) {
	var (
		flag   int
		pgflag []int
		data   []string
	)
	// clear == true means the option removes the flag instead of setting it.
	flags := map[string]struct {
		clear bool
		flag  int
	}{
		"async":         {true, syscall.MS_SYNCHRONOUS},
		"atime":         {true, syscall.MS_NOATIME},
		"bind":          {false, syscall.MS_BIND},
		"defaults":      {false, 0},
		"dev":           {true, syscall.MS_NODEV},
		"diratime":      {true, syscall.MS_NODIRATIME},
		"dirsync":       {false, syscall.MS_DIRSYNC},
		"exec":          {true, syscall.MS_NOEXEC},
		"mand":          {false, syscall.MS_MANDLOCK},
		"noatime":       {false, syscall.MS_NOATIME},
		"nodev":         {false, syscall.MS_NODEV},
		"nodiratime":    {false, syscall.MS_NODIRATIME},
		"noexec":        {false, syscall.MS_NOEXEC},
		"nomand":        {true, syscall.MS_MANDLOCK},
		"norelatime":    {true, syscall.MS_RELATIME},
		"nostrictatime": {true, syscall.MS_STRICTATIME},
		"nosuid":        {false, syscall.MS_NOSUID},
		"rbind":         {false, syscall.MS_BIND | syscall.MS_REC},
		"relatime":      {false, syscall.MS_RELATIME},
		"remount":       {false, syscall.MS_REMOUNT},
		"ro":            {false, syscall.MS_RDONLY},
		"rw":            {true, syscall.MS_RDONLY},
		"strictatime":   {false, syscall.MS_STRICTATIME},
		"suid":          {true, syscall.MS_NOSUID},
		"sync":          {false, syscall.MS_SYNCHRONOUS},
	}
	propagationFlags := map[string]int{
		"private":     syscall.MS_PRIVATE,
		"shared":      syscall.MS_SHARED,
		"slave":       syscall.MS_SLAVE,
		"unbindable":  syscall.MS_UNBINDABLE,
		"rprivate":    syscall.MS_PRIVATE | syscall.MS_REC,
		"rshared":     syscall.MS_SHARED | syscall.MS_REC,
		"rslave":      syscall.MS_SLAVE | syscall.MS_REC,
		"runbindable": syscall.MS_UNBINDABLE | syscall.MS_REC,
	}

	for _, o := range options {
		if f, exists := flags[o]; exists {
			// A known option is never mount data, even when its flag value
			// is zero ("defaults"): OR-ing or clearing zero is a no-op.
			if f.clear {
				flag &^= f.flag
			} else {
				flag |= f.flag
			}
			continue
		}
		if pf, exists := propagationFlags[o]; exists {
			pgflag = append(pgflag, pf)
			continue
		}
		// Anything not in either table is data for the specific fs type.
		data = append(data, o)
	}
	return flag, pgflag, strings.Join(data, ",")
}
|
2015-08-25 02:30:45 +08:00
|
|
|
|
|
|
|
func setupSeccomp(config *specs.Seccomp) (*configs.Seccomp, error) {
|
|
|
|
if config == nil {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// No default action specified, no syscalls listed, assume seccomp disabled
|
|
|
|
if config.DefaultAction == "" && len(config.Syscalls) == 0 {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
newConfig := new(configs.Seccomp)
|
|
|
|
newConfig.Syscalls = []*configs.Syscall{}
|
|
|
|
|
|
|
|
// Convert default action from string representation
|
|
|
|
newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
newConfig.DefaultAction = newDefaultAction
|
|
|
|
|
|
|
|
// Loop through all syscall blocks and convert them to libcontainer format
|
|
|
|
for _, call := range config.Syscalls {
|
|
|
|
newAction, err := seccomp.ConvertStringToAction(string(call.Action))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
newCall := configs.Syscall{
|
|
|
|
Name: call.Name,
|
|
|
|
Action: newAction,
|
|
|
|
Args: []*configs.Arg{},
|
|
|
|
}
|
|
|
|
|
|
|
|
// Loop through all the arguments of the syscall and convert them
|
|
|
|
for _, arg := range call.Args {
|
|
|
|
newOp, err := seccomp.ConvertStringToOperator(string(arg.Op))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
newArg := configs.Arg{
|
|
|
|
Index: arg.Index,
|
|
|
|
Value: arg.Value,
|
|
|
|
ValueTwo: arg.ValueTwo,
|
|
|
|
Op: newOp,
|
|
|
|
}
|
|
|
|
|
|
|
|
newCall.Args = append(newCall.Args, &newArg)
|
|
|
|
}
|
|
|
|
|
|
|
|
newConfig.Syscalls = append(newConfig.Syscalls, &newCall)
|
|
|
|
}
|
|
|
|
|
|
|
|
return newConfig, nil
|
|
|
|
}
|