917 lines
25 KiB
Go
917 lines
25 KiB
Go
// +build linux
|
|
|
|
// Package specconv implements conversion of specifications to libcontainer
|
|
// configurations
|
|
package specconv
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
|
|
dbus "github.com/godbus/dbus/v5"
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
"github.com/opencontainers/runc/libcontainer/seccomp"
|
|
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// wildcard is the constant -1.
// NOTE(review): presumably the "match any" device major/minor number, but no
// code in the visible file references it (device rules use configs.Wildcard
// or a literal -1 instead) — confirm it is still needed.
const wildcard = -1
|
|
|
|
var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{
|
|
specs.PIDNamespace: configs.NEWPID,
|
|
specs.NetworkNamespace: configs.NEWNET,
|
|
specs.MountNamespace: configs.NEWNS,
|
|
specs.UserNamespace: configs.NEWUSER,
|
|
specs.IPCNamespace: configs.NEWIPC,
|
|
specs.UTSNamespace: configs.NEWUTS,
|
|
specs.CgroupNamespace: configs.NEWCGROUP,
|
|
}
|
|
|
|
var mountPropagationMapping = map[string]int{
|
|
"rprivate": unix.MS_PRIVATE | unix.MS_REC,
|
|
"private": unix.MS_PRIVATE,
|
|
"rslave": unix.MS_SLAVE | unix.MS_REC,
|
|
"slave": unix.MS_SLAVE,
|
|
"rshared": unix.MS_SHARED | unix.MS_REC,
|
|
"shared": unix.MS_SHARED,
|
|
"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
|
|
"unbindable": unix.MS_UNBINDABLE,
|
|
"": 0,
|
|
}
|
|
|
|
// AllowedDevices is the set of devices which are automatically included for
|
|
// all containers.
|
|
//
|
|
// XXX (cyphar)
|
|
// This behaviour is at the very least "questionable" (if not outright
|
|
// wrong) according to the runtime-spec.
|
|
//
|
|
// Yes, we have to include certain devices other than the ones the user
|
|
// specifies, but several devices listed here are not part of the spec
|
|
// (including "mknod for any device"?!). In addition, these rules are
|
|
// appended to the user-provided set which means that users *cannot disable
|
|
// this behaviour*.
|
|
//
|
|
// ... unfortunately I'm too scared to change this now because who knows how
|
|
// many people depend on this (incorrect and arguably insecure) behaviour.
|
|
var AllowedDevices = []*configs.Device{
|
|
// allow mknod for any device
|
|
{
|
|
DeviceRule: configs.DeviceRule{
|
|
Type: configs.CharDevice,
|
|
Major: configs.Wildcard,
|
|
Minor: configs.Wildcard,
|
|
Permissions: "m",
|
|
Allow: true,
|
|
},
|
|
},
|
|
{
|
|
DeviceRule: configs.DeviceRule{
|
|
Type: configs.BlockDevice,
|
|
Major: configs.Wildcard,
|
|
Minor: configs.Wildcard,
|
|
Permissions: "m",
|
|
Allow: true,
|
|
},
|
|
},
|
|
{
|
|
Path: "/dev/null",
|
|
FileMode: 0666,
|
|
Uid: 0,
|
|
Gid: 0,
|
|
DeviceRule: configs.DeviceRule{
|
|
Type: configs.CharDevice,
|
|
Major: 1,
|
|
Minor: 3,
|
|
Permissions: "rwm",
|
|
Allow: true,
|
|
},
|
|
},
|
|
{
|
|
Path: "/dev/random",
|
|
FileMode: 0666,
|
|
Uid: 0,
|
|
Gid: 0,
|
|
DeviceRule: configs.DeviceRule{
|
|
Type: configs.CharDevice,
|
|
Major: 1,
|
|
Minor: 8,
|
|
Permissions: "rwm",
|
|
Allow: true,
|
|
},
|
|
},
|
|
{
|
|
Path: "/dev/full",
|
|
FileMode: 0666,
|
|
Uid: 0,
|
|
Gid: 0,
|
|
DeviceRule: configs.DeviceRule{
|
|
Type: configs.CharDevice,
|
|
Major: 1,
|
|
Minor: 7,
|
|
Permissions: "rwm",
|
|
Allow: true,
|
|
},
|
|
},
|
|
{
|
|
Path: "/dev/tty",
|
|
FileMode: 0666,
|
|
Uid: 0,
|
|
Gid: 0,
|
|
DeviceRule: configs.DeviceRule{
|
|
Type: configs.CharDevice,
|
|
Major: 5,
|
|
Minor: 0,
|
|
Permissions: "rwm",
|
|
Allow: true,
|
|
},
|
|
},
|
|
{
|
|
Path: "/dev/zero",
|
|
FileMode: 0666,
|
|
Uid: 0,
|
|
Gid: 0,
|
|
DeviceRule: configs.DeviceRule{
|
|
Type: configs.CharDevice,
|
|
Major: 1,
|
|
Minor: 5,
|
|
Permissions: "rwm",
|
|
Allow: true,
|
|
},
|
|
},
|
|
{
|
|
Path: "/dev/urandom",
|
|
FileMode: 0666,
|
|
Uid: 0,
|
|
Gid: 0,
|
|
DeviceRule: configs.DeviceRule{
|
|
Type: configs.CharDevice,
|
|
Major: 1,
|
|
Minor: 9,
|
|
Permissions: "rwm",
|
|
Allow: true,
|
|
},
|
|
},
|
|
// /dev/pts/ - pts namespaces are "coming soon"
|
|
{
|
|
DeviceRule: configs.DeviceRule{
|
|
Type: configs.CharDevice,
|
|
Major: 136,
|
|
Minor: configs.Wildcard,
|
|
Permissions: "rwm",
|
|
Allow: true,
|
|
},
|
|
},
|
|
{
|
|
DeviceRule: configs.DeviceRule{
|
|
Type: configs.CharDevice,
|
|
Major: 5,
|
|
Minor: 2,
|
|
Permissions: "rwm",
|
|
Allow: true,
|
|
},
|
|
},
|
|
// tuntap
|
|
{
|
|
DeviceRule: configs.DeviceRule{
|
|
Type: configs.CharDevice,
|
|
Major: 10,
|
|
Minor: 200,
|
|
Permissions: "rwm",
|
|
Allow: true,
|
|
},
|
|
},
|
|
}
|
|
|
|
type CreateOpts struct {
|
|
CgroupName string
|
|
UseSystemdCgroup bool
|
|
NoPivotRoot bool
|
|
NoNewKeyring bool
|
|
Spec *specs.Spec
|
|
RootlessEUID bool
|
|
RootlessCgroups bool
|
|
}
|
|
|
|
// CreateLibcontainerConfig creates a new libcontainer configuration from a
|
|
// given specification and a cgroup name
|
|
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
|
|
// runc's cwd will always be the bundle path
|
|
rcwd, err := os.Getwd()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
cwd, err := filepath.Abs(rcwd)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
spec := opts.Spec
|
|
if spec.Root == nil {
|
|
return nil, fmt.Errorf("Root must be specified")
|
|
}
|
|
rootfsPath := spec.Root.Path
|
|
if !filepath.IsAbs(rootfsPath) {
|
|
rootfsPath = filepath.Join(cwd, rootfsPath)
|
|
}
|
|
labels := []string{}
|
|
for k, v := range spec.Annotations {
|
|
labels = append(labels, fmt.Sprintf("%s=%s", k, v))
|
|
}
|
|
config := &configs.Config{
|
|
Rootfs: rootfsPath,
|
|
NoPivotRoot: opts.NoPivotRoot,
|
|
Readonlyfs: spec.Root.Readonly,
|
|
Hostname: spec.Hostname,
|
|
Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)),
|
|
NoNewKeyring: opts.NoNewKeyring,
|
|
RootlessEUID: opts.RootlessEUID,
|
|
RootlessCgroups: opts.RootlessCgroups,
|
|
}
|
|
|
|
exists := false
|
|
for _, m := range spec.Mounts {
|
|
config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
|
|
}
|
|
if err := createDevices(spec, config); err != nil {
|
|
return nil, err
|
|
}
|
|
c, err := CreateCgroupConfig(opts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
config.Cgroups = c
|
|
// set linux-specific config
|
|
if spec.Linux != nil {
|
|
if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists {
|
|
return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation)
|
|
}
|
|
if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) {
|
|
return nil, fmt.Errorf("rootfsPropagation of [r]private is not safe without pivot_root")
|
|
}
|
|
|
|
for _, ns := range spec.Linux.Namespaces {
|
|
t, exists := namespaceMapping[ns.Type]
|
|
if !exists {
|
|
return nil, fmt.Errorf("namespace %q does not exist", ns)
|
|
}
|
|
if config.Namespaces.Contains(t) {
|
|
return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
|
|
}
|
|
config.Namespaces.Add(t, ns.Path)
|
|
}
|
|
if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" {
|
|
config.Networks = []*configs.Network{
|
|
{
|
|
Type: "loopback",
|
|
},
|
|
}
|
|
}
|
|
if config.Namespaces.Contains(configs.NEWUSER) {
|
|
if err := setupUserNamespace(spec, config); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
config.MaskPaths = spec.Linux.MaskedPaths
|
|
config.ReadonlyPaths = spec.Linux.ReadonlyPaths
|
|
config.MountLabel = spec.Linux.MountLabel
|
|
config.Sysctl = spec.Linux.Sysctl
|
|
if spec.Linux.Seccomp != nil {
|
|
seccomp, err := SetupSeccomp(spec.Linux.Seccomp)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
config.Seccomp = seccomp
|
|
}
|
|
if spec.Linux.IntelRdt != nil {
|
|
config.IntelRdt = &configs.IntelRdt{}
|
|
if spec.Linux.IntelRdt.L3CacheSchema != "" {
|
|
config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema
|
|
}
|
|
if spec.Linux.IntelRdt.MemBwSchema != "" {
|
|
config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema
|
|
}
|
|
}
|
|
}
|
|
if spec.Process != nil {
|
|
config.OomScoreAdj = spec.Process.OOMScoreAdj
|
|
config.NoNewPrivileges = spec.Process.NoNewPrivileges
|
|
if spec.Process.SelinuxLabel != "" {
|
|
config.ProcessLabel = spec.Process.SelinuxLabel
|
|
}
|
|
if spec.Process.Capabilities != nil {
|
|
config.Capabilities = &configs.Capabilities{
|
|
Bounding: spec.Process.Capabilities.Bounding,
|
|
Effective: spec.Process.Capabilities.Effective,
|
|
Permitted: spec.Process.Capabilities.Permitted,
|
|
Inheritable: spec.Process.Capabilities.Inheritable,
|
|
Ambient: spec.Process.Capabilities.Ambient,
|
|
}
|
|
}
|
|
}
|
|
createHooks(spec, config)
|
|
config.Version = specs.Version
|
|
return config, nil
|
|
}
|
|
|
|
func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
|
|
flags, pgflags, data, ext := parseMountOptions(m.Options)
|
|
source := m.Source
|
|
device := m.Type
|
|
if flags&unix.MS_BIND != 0 {
|
|
// Any "type" the user specified is meaningless (and ignored) for
|
|
// bind-mounts -- so we set it to "bind" because rootfs_linux.go
|
|
// (incorrectly) relies on this for some checks.
|
|
device = "bind"
|
|
if !filepath.IsAbs(source) {
|
|
source = filepath.Join(cwd, m.Source)
|
|
}
|
|
}
|
|
return &configs.Mount{
|
|
Device: device,
|
|
Source: source,
|
|
Destination: m.Destination,
|
|
Data: data,
|
|
Flags: flags,
|
|
PropagationFlags: pgflags,
|
|
Extensions: ext,
|
|
}
|
|
}
|
|
|
|
var (
	// isValidName reports whether a systemd property name consists of latin
	// letters only, with at least 3 of them.
	isValidName = regexp.MustCompile(`^[a-zA-Z]{3,}$`).MatchString

	// isSecSuffix reports whether a name ends in "Sec" directly preceded by
	// a lowercase latin letter.
	isSecSuffix = regexp.MustCompile(`[a-z]Sec$`).MatchString
)
|
|
|
|
// Some systemd properties are documented as having "Sec" suffix
|
|
// (e.g. TimeoutStopSec) but are expected to have "USec" suffix
|
|
// here, so let's provide conversion to improve compatibility.
|
|
func convertSecToUSec(value dbus.Variant) (dbus.Variant, error) {
|
|
var sec uint64
|
|
const M = 1000000
|
|
vi := value.Value()
|
|
switch value.Signature().String() {
|
|
case "y":
|
|
sec = uint64(vi.(byte)) * M
|
|
case "n":
|
|
sec = uint64(vi.(int16)) * M
|
|
case "q":
|
|
sec = uint64(vi.(uint16)) * M
|
|
case "i":
|
|
sec = uint64(vi.(int32)) * M
|
|
case "u":
|
|
sec = uint64(vi.(uint32)) * M
|
|
case "x":
|
|
sec = uint64(vi.(int64)) * M
|
|
case "t":
|
|
sec = vi.(uint64) * M
|
|
case "d":
|
|
sec = uint64(vi.(float64) * M)
|
|
default:
|
|
return value, errors.New("not a number")
|
|
}
|
|
return dbus.MakeVariant(sec), nil
|
|
}
|
|
|
|
func initSystemdProps(spec *specs.Spec) ([]systemdDbus.Property, error) {
|
|
const keyPrefix = "org.systemd.property."
|
|
var sp []systemdDbus.Property
|
|
|
|
for k, v := range spec.Annotations {
|
|
name := strings.TrimPrefix(k, keyPrefix)
|
|
if len(name) == len(k) { // prefix not there
|
|
continue
|
|
}
|
|
if !isValidName(name) {
|
|
return nil, fmt.Errorf("Annotation %s name incorrect: %s", k, name)
|
|
}
|
|
value, err := dbus.ParseVariant(v, dbus.Signature{})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Annotation %s=%s value parse error: %v", k, v, err)
|
|
}
|
|
if isSecSuffix(name) {
|
|
name = strings.TrimSuffix(name, "Sec") + "USec"
|
|
value, err = convertSecToUSec(value)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Annotation %s=%s value parse error: %v", k, v, err)
|
|
}
|
|
}
|
|
sp = append(sp, systemdDbus.Property{Name: name, Value: value})
|
|
}
|
|
|
|
return sp, nil
|
|
}
|
|
|
|
func CreateCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
|
|
var (
|
|
myCgroupPath string
|
|
|
|
spec = opts.Spec
|
|
useSystemdCgroup = opts.UseSystemdCgroup
|
|
name = opts.CgroupName
|
|
)
|
|
|
|
c := &configs.Cgroup{
|
|
Resources: &configs.Resources{},
|
|
}
|
|
|
|
if useSystemdCgroup {
|
|
sp, err := initSystemdProps(spec)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
c.SystemdProps = sp
|
|
}
|
|
|
|
if spec.Linux != nil && spec.Linux.CgroupsPath != "" {
|
|
myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath)
|
|
if useSystemdCgroup {
|
|
myCgroupPath = spec.Linux.CgroupsPath
|
|
}
|
|
}
|
|
|
|
if useSystemdCgroup {
|
|
if myCgroupPath == "" {
|
|
c.Parent = "system.slice"
|
|
c.ScopePrefix = "runc"
|
|
c.Name = name
|
|
} else {
|
|
// Parse the path from expected "slice:prefix:name"
|
|
// for e.g. "system.slice:docker:1234"
|
|
parts := strings.Split(myCgroupPath, ":")
|
|
if len(parts) != 3 {
|
|
return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", myCgroupPath)
|
|
}
|
|
c.Parent = parts[0]
|
|
c.ScopePrefix = parts[1]
|
|
c.Name = parts[2]
|
|
}
|
|
} else {
|
|
if myCgroupPath == "" {
|
|
c.Name = name
|
|
}
|
|
c.Path = myCgroupPath
|
|
}
|
|
|
|
// In rootless containers, any attempt to make cgroup changes is likely to fail.
|
|
// libcontainer will validate this but ignores the error.
|
|
if spec.Linux != nil {
|
|
r := spec.Linux.Resources
|
|
if r != nil {
|
|
for i, d := range spec.Linux.Resources.Devices {
|
|
var (
|
|
t = "a"
|
|
major = int64(-1)
|
|
minor = int64(-1)
|
|
)
|
|
if d.Type != "" {
|
|
t = d.Type
|
|
}
|
|
if d.Major != nil {
|
|
major = *d.Major
|
|
}
|
|
if d.Minor != nil {
|
|
minor = *d.Minor
|
|
}
|
|
if d.Access == "" {
|
|
return nil, fmt.Errorf("device access at %d field cannot be empty", i)
|
|
}
|
|
dt, err := stringToCgroupDeviceRune(t)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
c.Resources.Devices = append(c.Resources.Devices, &configs.DeviceRule{
|
|
Type: dt,
|
|
Major: major,
|
|
Minor: minor,
|
|
Permissions: configs.DevicePermissions(d.Access),
|
|
Allow: d.Allow,
|
|
})
|
|
}
|
|
if r.Memory != nil {
|
|
if r.Memory.Limit != nil {
|
|
c.Resources.Memory = *r.Memory.Limit
|
|
}
|
|
if r.Memory.Reservation != nil {
|
|
c.Resources.MemoryReservation = *r.Memory.Reservation
|
|
}
|
|
if r.Memory.Swap != nil {
|
|
c.Resources.MemorySwap = *r.Memory.Swap
|
|
}
|
|
if r.Memory.Kernel != nil {
|
|
c.Resources.KernelMemory = *r.Memory.Kernel
|
|
}
|
|
if r.Memory.KernelTCP != nil {
|
|
c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
|
|
}
|
|
if r.Memory.Swappiness != nil {
|
|
c.Resources.MemorySwappiness = r.Memory.Swappiness
|
|
}
|
|
if r.Memory.DisableOOMKiller != nil {
|
|
c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller
|
|
}
|
|
}
|
|
if r.CPU != nil {
|
|
if r.CPU.Shares != nil {
|
|
c.Resources.CpuShares = *r.CPU.Shares
|
|
|
|
//CpuWeight is used for cgroupv2 and should be converted
|
|
c.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(c.Resources.CpuShares)
|
|
}
|
|
if r.CPU.Quota != nil {
|
|
c.Resources.CpuQuota = *r.CPU.Quota
|
|
}
|
|
if r.CPU.Period != nil {
|
|
c.Resources.CpuPeriod = *r.CPU.Period
|
|
}
|
|
//CpuMax is used for cgroupv2 and should be converted
|
|
c.Resources.CpuMax = cgroups.ConvertCPUQuotaCPUPeriodToCgroupV2Value(c.Resources.CpuQuota, c.Resources.CpuPeriod)
|
|
|
|
if r.CPU.RealtimeRuntime != nil {
|
|
c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime
|
|
}
|
|
if r.CPU.RealtimePeriod != nil {
|
|
c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod
|
|
}
|
|
if r.CPU.Cpus != "" {
|
|
c.Resources.CpusetCpus = r.CPU.Cpus
|
|
}
|
|
if r.CPU.Mems != "" {
|
|
c.Resources.CpusetMems = r.CPU.Mems
|
|
}
|
|
}
|
|
if r.Pids != nil {
|
|
c.Resources.PidsLimit = r.Pids.Limit
|
|
}
|
|
if r.BlockIO != nil {
|
|
if r.BlockIO.Weight != nil {
|
|
c.Resources.BlkioWeight = *r.BlockIO.Weight
|
|
}
|
|
if r.BlockIO.LeafWeight != nil {
|
|
c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight
|
|
}
|
|
if r.BlockIO.WeightDevice != nil {
|
|
for _, wd := range r.BlockIO.WeightDevice {
|
|
var weight, leafWeight uint16
|
|
if wd.Weight != nil {
|
|
weight = *wd.Weight
|
|
}
|
|
if wd.LeafWeight != nil {
|
|
leafWeight = *wd.LeafWeight
|
|
}
|
|
weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight)
|
|
c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice)
|
|
}
|
|
}
|
|
if r.BlockIO.ThrottleReadBpsDevice != nil {
|
|
for _, td := range r.BlockIO.ThrottleReadBpsDevice {
|
|
rate := td.Rate
|
|
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
|
|
c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice)
|
|
}
|
|
}
|
|
if r.BlockIO.ThrottleWriteBpsDevice != nil {
|
|
for _, td := range r.BlockIO.ThrottleWriteBpsDevice {
|
|
rate := td.Rate
|
|
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
|
|
c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice)
|
|
}
|
|
}
|
|
if r.BlockIO.ThrottleReadIOPSDevice != nil {
|
|
for _, td := range r.BlockIO.ThrottleReadIOPSDevice {
|
|
rate := td.Rate
|
|
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
|
|
c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice)
|
|
}
|
|
}
|
|
if r.BlockIO.ThrottleWriteIOPSDevice != nil {
|
|
for _, td := range r.BlockIO.ThrottleWriteIOPSDevice {
|
|
rate := td.Rate
|
|
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
|
|
c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice)
|
|
}
|
|
}
|
|
}
|
|
for _, l := range r.HugepageLimits {
|
|
c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{
|
|
Pagesize: l.Pagesize,
|
|
Limit: l.Limit,
|
|
})
|
|
}
|
|
if r.Network != nil {
|
|
if r.Network.ClassID != nil {
|
|
c.Resources.NetClsClassid = *r.Network.ClassID
|
|
}
|
|
for _, m := range r.Network.Priorities {
|
|
c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
|
|
Interface: m.Name,
|
|
Priority: int64(m.Priority),
|
|
})
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// Append the default allowed devices to the end of the list.
|
|
// XXX: Really this should be prefixed...
|
|
for _, device := range AllowedDevices {
|
|
c.Resources.Devices = append(c.Resources.Devices, &device.DeviceRule)
|
|
}
|
|
return c, nil
|
|
}
|
|
|
|
func stringToCgroupDeviceRune(s string) (configs.DeviceType, error) {
|
|
switch s {
|
|
case "a":
|
|
return configs.WildcardDevice, nil
|
|
case "b":
|
|
return configs.BlockDevice, nil
|
|
case "c":
|
|
return configs.CharDevice, nil
|
|
default:
|
|
return 0, fmt.Errorf("invalid cgroup device type %q", s)
|
|
}
|
|
}
|
|
|
|
func stringToDeviceRune(s string) (configs.DeviceType, error) {
|
|
switch s {
|
|
case "p":
|
|
return configs.FifoDevice, nil
|
|
case "u", "c":
|
|
return configs.CharDevice, nil
|
|
case "b":
|
|
return configs.BlockDevice, nil
|
|
default:
|
|
return 0, fmt.Errorf("invalid device type %q", s)
|
|
}
|
|
}
|
|
|
|
func createDevices(spec *specs.Spec, config *configs.Config) error {
|
|
// Add default set of devices.
|
|
for _, device := range AllowedDevices {
|
|
if device.Path != "" {
|
|
config.Devices = append(config.Devices, device)
|
|
}
|
|
}
|
|
// Merge in additional devices from the spec.
|
|
if spec.Linux != nil {
|
|
for _, d := range spec.Linux.Devices {
|
|
var uid, gid uint32
|
|
var filemode os.FileMode = 0666
|
|
|
|
if d.UID != nil {
|
|
uid = *d.UID
|
|
}
|
|
if d.GID != nil {
|
|
gid = *d.GID
|
|
}
|
|
dt, err := stringToDeviceRune(d.Type)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if d.FileMode != nil {
|
|
filemode = *d.FileMode
|
|
}
|
|
device := &configs.Device{
|
|
DeviceRule: configs.DeviceRule{
|
|
Type: dt,
|
|
Major: d.Major,
|
|
Minor: d.Minor,
|
|
},
|
|
Path: d.Path,
|
|
FileMode: filemode,
|
|
Uid: uid,
|
|
Gid: gid,
|
|
}
|
|
config.Devices = append(config.Devices, device)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
|
|
create := func(m specs.LinuxIDMapping) configs.IDMap {
|
|
return configs.IDMap{
|
|
HostID: int(m.HostID),
|
|
ContainerID: int(m.ContainerID),
|
|
Size: int(m.Size),
|
|
}
|
|
}
|
|
if spec.Linux != nil {
|
|
for _, m := range spec.Linux.UIDMappings {
|
|
config.UidMappings = append(config.UidMappings, create(m))
|
|
}
|
|
for _, m := range spec.Linux.GIDMappings {
|
|
config.GidMappings = append(config.GidMappings, create(m))
|
|
}
|
|
}
|
|
rootUID, err := config.HostRootUID()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
rootGID, err := config.HostRootGID()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, node := range config.Devices {
|
|
node.Uid = uint32(rootUID)
|
|
node.Gid = uint32(rootGID)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// parseMountOptions parses the string and returns the flags, propagation
|
|
// flags and any mount data that it contains.
|
|
func parseMountOptions(options []string) (int, []int, string, int) {
|
|
var (
|
|
flag int
|
|
pgflag []int
|
|
data []string
|
|
extFlags int
|
|
)
|
|
flags := map[string]struct {
|
|
clear bool
|
|
flag int
|
|
}{
|
|
"acl": {false, unix.MS_POSIXACL},
|
|
"async": {true, unix.MS_SYNCHRONOUS},
|
|
"atime": {true, unix.MS_NOATIME},
|
|
"bind": {false, unix.MS_BIND},
|
|
"defaults": {false, 0},
|
|
"dev": {true, unix.MS_NODEV},
|
|
"diratime": {true, unix.MS_NODIRATIME},
|
|
"dirsync": {false, unix.MS_DIRSYNC},
|
|
"exec": {true, unix.MS_NOEXEC},
|
|
"iversion": {false, unix.MS_I_VERSION},
|
|
"lazytime": {false, unix.MS_LAZYTIME},
|
|
"loud": {true, unix.MS_SILENT},
|
|
"mand": {false, unix.MS_MANDLOCK},
|
|
"noacl": {true, unix.MS_POSIXACL},
|
|
"noatime": {false, unix.MS_NOATIME},
|
|
"nodev": {false, unix.MS_NODEV},
|
|
"nodiratime": {false, unix.MS_NODIRATIME},
|
|
"noexec": {false, unix.MS_NOEXEC},
|
|
"noiversion": {true, unix.MS_I_VERSION},
|
|
"nolazytime": {true, unix.MS_LAZYTIME},
|
|
"nomand": {true, unix.MS_MANDLOCK},
|
|
"norelatime": {true, unix.MS_RELATIME},
|
|
"nostrictatime": {true, unix.MS_STRICTATIME},
|
|
"nosuid": {false, unix.MS_NOSUID},
|
|
"rbind": {false, unix.MS_BIND | unix.MS_REC},
|
|
"relatime": {false, unix.MS_RELATIME},
|
|
"remount": {false, unix.MS_REMOUNT},
|
|
"ro": {false, unix.MS_RDONLY},
|
|
"rw": {true, unix.MS_RDONLY},
|
|
"silent": {false, unix.MS_SILENT},
|
|
"strictatime": {false, unix.MS_STRICTATIME},
|
|
"suid": {true, unix.MS_NOSUID},
|
|
"sync": {false, unix.MS_SYNCHRONOUS},
|
|
}
|
|
propagationFlags := map[string]int{
|
|
"private": unix.MS_PRIVATE,
|
|
"shared": unix.MS_SHARED,
|
|
"slave": unix.MS_SLAVE,
|
|
"unbindable": unix.MS_UNBINDABLE,
|
|
"rprivate": unix.MS_PRIVATE | unix.MS_REC,
|
|
"rshared": unix.MS_SHARED | unix.MS_REC,
|
|
"rslave": unix.MS_SLAVE | unix.MS_REC,
|
|
"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
|
|
}
|
|
extensionFlags := map[string]struct {
|
|
clear bool
|
|
flag int
|
|
}{
|
|
"tmpcopyup": {false, configs.EXT_COPYUP},
|
|
}
|
|
for _, o := range options {
|
|
// If the option does not exist in the flags table or the flag
|
|
// is not supported on the platform,
|
|
// then it is a data value for a specific fs type
|
|
if f, exists := flags[o]; exists && f.flag != 0 {
|
|
if f.clear {
|
|
flag &= ^f.flag
|
|
} else {
|
|
flag |= f.flag
|
|
}
|
|
} else if f, exists := propagationFlags[o]; exists && f != 0 {
|
|
pgflag = append(pgflag, f)
|
|
} else if f, exists := extensionFlags[o]; exists && f.flag != 0 {
|
|
if f.clear {
|
|
extFlags &= ^f.flag
|
|
} else {
|
|
extFlags |= f.flag
|
|
}
|
|
} else {
|
|
data = append(data, o)
|
|
}
|
|
}
|
|
return flag, pgflag, strings.Join(data, ","), extFlags
|
|
}
|
|
|
|
func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) {
|
|
if config == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
// No default action specified, no syscalls listed, assume seccomp disabled
|
|
if config.DefaultAction == "" && len(config.Syscalls) == 0 {
|
|
return nil, nil
|
|
}
|
|
|
|
newConfig := new(configs.Seccomp)
|
|
newConfig.Syscalls = []*configs.Syscall{}
|
|
|
|
if len(config.Architectures) > 0 {
|
|
newConfig.Architectures = []string{}
|
|
for _, arch := range config.Architectures {
|
|
newArch, err := seccomp.ConvertStringToArch(string(arch))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
newConfig.Architectures = append(newConfig.Architectures, newArch)
|
|
}
|
|
}
|
|
|
|
// Convert default action from string representation
|
|
newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
newConfig.DefaultAction = newDefaultAction
|
|
|
|
// Loop through all syscall blocks and convert them to libcontainer format
|
|
for _, call := range config.Syscalls {
|
|
newAction, err := seccomp.ConvertStringToAction(string(call.Action))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
for _, name := range call.Names {
|
|
newCall := configs.Syscall{
|
|
Name: name,
|
|
Action: newAction,
|
|
ErrnoRet: call.ErrnoRet,
|
|
Args: []*configs.Arg{},
|
|
}
|
|
// Loop through all the arguments of the syscall and convert them
|
|
for _, arg := range call.Args {
|
|
newOp, err := seccomp.ConvertStringToOperator(string(arg.Op))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
newArg := configs.Arg{
|
|
Index: arg.Index,
|
|
Value: arg.Value,
|
|
ValueTwo: arg.ValueTwo,
|
|
Op: newOp,
|
|
}
|
|
|
|
newCall.Args = append(newCall.Args, &newArg)
|
|
}
|
|
newConfig.Syscalls = append(newConfig.Syscalls, &newCall)
|
|
}
|
|
}
|
|
|
|
return newConfig, nil
|
|
}
|
|
|
|
func createHooks(rspec *specs.Spec, config *configs.Config) {
|
|
config.Hooks = &configs.Hooks{}
|
|
if rspec.Hooks != nil {
|
|
|
|
for _, h := range rspec.Hooks.Prestart {
|
|
cmd := createCommandHook(h)
|
|
config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd))
|
|
}
|
|
for _, h := range rspec.Hooks.Poststart {
|
|
cmd := createCommandHook(h)
|
|
config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd))
|
|
}
|
|
for _, h := range rspec.Hooks.Poststop {
|
|
cmd := createCommandHook(h)
|
|
config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd))
|
|
}
|
|
}
|
|
}
|
|
|
|
func createCommandHook(h specs.Hook) configs.Command {
|
|
cmd := configs.Command{
|
|
Path: h.Path,
|
|
Args: h.Args,
|
|
Env: h.Env,
|
|
}
|
|
if h.Timeout != nil {
|
|
d := time.Duration(*h.Timeout) * time.Second
|
|
cmd.Timeout = &d
|
|
}
|
|
return cmd
|
|
}
|