560 lines
17 KiB
Go
560 lines
17 KiB
Go
// +build linux
|
|
|
|
package libcontainer
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
"syscall"
|
|
|
|
"github.com/golang/glog"
|
|
|
|
"github.com/docker/libcontainer/apparmor"
|
|
cgroups "github.com/docker/libcontainer/cgroups/manager"
|
|
"github.com/docker/libcontainer/configs"
|
|
"github.com/docker/libcontainer/console"
|
|
"github.com/docker/libcontainer/label"
|
|
"github.com/docker/libcontainer/mount"
|
|
"github.com/docker/libcontainer/netlink"
|
|
"github.com/docker/libcontainer/network"
|
|
"github.com/docker/libcontainer/security/capabilities"
|
|
"github.com/docker/libcontainer/security/restrict"
|
|
"github.com/docker/libcontainer/system"
|
|
"github.com/docker/libcontainer/user"
|
|
"github.com/docker/libcontainer/utils"
|
|
)
|
|
|
|
const (
|
|
configFilename = "config.json"
|
|
stateFilename = "state.json"
|
|
)
|
|
|
|
var (
|
|
idRegex = regexp.MustCompile(`^[\w_]+$`)
|
|
maxIdLen = 1024
|
|
)
|
|
|
|
// Process is used for transferring parameters from Exec() to Init()
|
|
type processArgs struct {
|
|
Args []string `json:"args,omitempty"`
|
|
Config *configs.Config `json:"config,omitempty"`
|
|
NetworkState *configs.NetworkState `json:"network_state,omitempty"`
|
|
}
|
|
|
|
// New returns a linux based container factory based in the root directory.
|
|
func New(root string, initArgs []string) (Factory, error) {
|
|
if root != "" {
|
|
if err := os.MkdirAll(root, 0700); err != nil {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
}
|
|
return &linuxFactory{
|
|
root: root,
|
|
initArgs: initArgs,
|
|
}, nil
|
|
}
|
|
|
|
// linuxFactory implements the default factory interface for linux based systems.
|
|
type linuxFactory struct {
|
|
// root is the root directory
|
|
root string
|
|
initArgs []string
|
|
}
|
|
|
|
func (l *linuxFactory) Create(id string, config *configs.Config) (Container, error) {
|
|
if l.root == "" {
|
|
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
|
|
}
|
|
if err := l.validateID(id); err != nil {
|
|
return nil, err
|
|
}
|
|
containerRoot := filepath.Join(l.root, id)
|
|
if _, err := os.Stat(containerRoot); err == nil {
|
|
return nil, newGenericError(fmt.Errorf("Container with id exists: %v", id), IdInUse)
|
|
} else if !os.IsNotExist(err) {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
data, err := json.MarshalIndent(config, "", "\t")
|
|
if err != nil {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
if err := os.MkdirAll(containerRoot, 0700); err != nil {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
f, err := os.Create(filepath.Join(containerRoot, configFilename))
|
|
if err != nil {
|
|
os.RemoveAll(containerRoot)
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
defer f.Close()
|
|
if _, err := f.Write(data); err != nil {
|
|
os.RemoveAll(containerRoot)
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
cgroupManager := cgroups.NewCgroupManager(config.Cgroups)
|
|
return &linuxContainer{
|
|
id: id,
|
|
root: containerRoot,
|
|
config: config,
|
|
initArgs: l.initArgs,
|
|
state: &configs.State{},
|
|
cgroupManager: cgroupManager,
|
|
}, nil
|
|
}
|
|
|
|
func (l *linuxFactory) Load(id string) (Container, error) {
|
|
if l.root == "" {
|
|
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
|
|
}
|
|
containerRoot := filepath.Join(l.root, id)
|
|
glog.Infof("loading container config from %s", containerRoot)
|
|
config, err := l.loadContainerConfig(containerRoot)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
glog.Infof("loading container state from %s", containerRoot)
|
|
state, err := l.loadContainerState(containerRoot)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
cgroupManager := cgroups.LoadCgroupManager(config.Cgroups, state.CgroupPaths)
|
|
glog.Infof("using %s as cgroup manager", cgroupManager)
|
|
return &linuxContainer{
|
|
id: id,
|
|
root: containerRoot,
|
|
config: config,
|
|
state: state,
|
|
cgroupManager: cgroupManager,
|
|
initArgs: l.initArgs,
|
|
}, nil
|
|
}
|
|
|
|
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
|
|
// This is a low level implementation detail of the reexec and should not be consumed externally
|
|
func (l *linuxFactory) StartInitialization(pipefd uintptr) (err error) {
|
|
pipe := os.NewFile(uintptr(pipefd), "pipe")
|
|
setupUserns := os.Getenv("_LIBCONTAINER_USERNS") != ""
|
|
pid := os.Getenv("_LIBCONTAINER_INITPID")
|
|
if pid != "" && !setupUserns {
|
|
return InitIn(pipe)
|
|
}
|
|
defer func() {
|
|
// if we have an error during the initialization of the container's init then send it back to the
|
|
// parent process in the form of an initError.
|
|
if err != nil {
|
|
// ensure that any data sent from the parent is consumed so it doesn't
|
|
// receive ECONNRESET when the child writes to the pipe.
|
|
ioutil.ReadAll(pipe)
|
|
if err := json.NewEncoder(pipe).Encode(initError{
|
|
Message: err.Error(),
|
|
}); err != nil {
|
|
panic(err)
|
|
}
|
|
}
|
|
// ensure that this pipe is always closed
|
|
pipe.Close()
|
|
}()
|
|
uncleanRootfs, err := os.Getwd()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var process *processArgs
|
|
// We always read this as it is a way to sync with the parent as well
|
|
if err := json.NewDecoder(pipe).Decode(&process); err != nil {
|
|
return err
|
|
}
|
|
if setupUserns {
|
|
err = SetupContainer(process)
|
|
if err == nil {
|
|
os.Exit(0)
|
|
} else {
|
|
os.Exit(1)
|
|
}
|
|
}
|
|
if process.Config.Namespaces.Contains(configs.NEWUSER) {
|
|
return l.initUserNs(uncleanRootfs, process)
|
|
}
|
|
return l.initDefault(uncleanRootfs, process)
|
|
}
|
|
|
|
func (l *linuxFactory) loadContainerConfig(root string) (*configs.Config, error) {
|
|
f, err := os.Open(filepath.Join(root, configFilename))
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil, newGenericError(err, ContainerNotExists)
|
|
}
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
defer f.Close()
|
|
var config *configs.Config
|
|
if err := json.NewDecoder(f).Decode(&config); err != nil {
|
|
return nil, newGenericError(err, ConfigInvalid)
|
|
}
|
|
return config, nil
|
|
}
|
|
|
|
func (l *linuxFactory) loadContainerState(root string) (*configs.State, error) {
|
|
f, err := os.Open(filepath.Join(root, stateFilename))
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil, newGenericError(err, ContainerNotExists)
|
|
}
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
defer f.Close()
|
|
var state *configs.State
|
|
if err := json.NewDecoder(f).Decode(&state); err != nil {
|
|
return nil, newGenericError(err, SystemError)
|
|
}
|
|
return state, nil
|
|
}
|
|
|
|
func (l *linuxFactory) validateID(id string) error {
|
|
if !idRegex.MatchString(id) {
|
|
return newGenericError(fmt.Errorf("Invalid id format: %v", id), InvalidIdFormat)
|
|
}
|
|
if len(id) > maxIdLen {
|
|
return newGenericError(fmt.Errorf("Invalid id format: %v", id), InvalidIdFormat)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (l *linuxFactory) initDefault(uncleanRootfs string, process *processArgs) (err error) {
|
|
config := process.Config
|
|
networkState := process.NetworkState
|
|
|
|
// TODO: move to validation
|
|
/*
|
|
rootfs, err := utils.ResolveRootfs(uncleanRootfs)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
*/
|
|
|
|
// clear the current processes env and replace it with the environment
|
|
// defined on the container
|
|
if err := loadContainerEnvironment(config); err != nil {
|
|
return err
|
|
}
|
|
// join any namespaces via a path to the namespace fd if provided
|
|
if err := joinExistingNamespaces(config.Namespaces); err != nil {
|
|
return err
|
|
}
|
|
if config.Console != "" {
|
|
if err := console.OpenAndDup(config.Console); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if _, err := syscall.Setsid(); err != nil {
|
|
return fmt.Errorf("setsid %s", err)
|
|
}
|
|
if config.Console != "" {
|
|
if err := system.Setctty(); err != nil {
|
|
return fmt.Errorf("setctty %s", err)
|
|
}
|
|
}
|
|
|
|
cloneFlags := config.Namespaces.CloneFlags()
|
|
if (cloneFlags & syscall.CLONE_NEWNET) == 0 {
|
|
if len(config.Networks) != 0 || len(config.Routes) != 0 {
|
|
return fmt.Errorf("unable to apply network parameters without network namespace")
|
|
}
|
|
} else {
|
|
if err := setupNetwork(config, networkState); err != nil {
|
|
return fmt.Errorf("setup networking %s", err)
|
|
}
|
|
if err := setupRoute(config); err != nil {
|
|
return fmt.Errorf("setup route %s", err)
|
|
}
|
|
}
|
|
if err := setupRlimits(config); err != nil {
|
|
return fmt.Errorf("setup rlimits %s", err)
|
|
}
|
|
label.Init()
|
|
// InitializeMountNamespace() can be executed only for a new mount namespace
|
|
if (cloneFlags & syscall.CLONE_NEWNS) != 0 {
|
|
if err := mount.InitializeMountNamespace(config); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if config.Hostname != "" {
|
|
// TODO: (crosbymichael) move this to pre spawn validation
|
|
if (cloneFlags & syscall.CLONE_NEWUTS) == 0 {
|
|
return fmt.Errorf("unable to set the hostname without UTS namespace")
|
|
}
|
|
if err := syscall.Sethostname([]byte(config.Hostname)); err != nil {
|
|
return fmt.Errorf("unable to sethostname %q: %s", config.Hostname, err)
|
|
}
|
|
}
|
|
if err := apparmor.ApplyProfile(config.AppArmorProfile); err != nil {
|
|
return fmt.Errorf("set apparmor profile %s: %s", config.AppArmorProfile, err)
|
|
}
|
|
if err := label.SetProcessLabel(config.ProcessLabel); err != nil {
|
|
return fmt.Errorf("set process label %s", err)
|
|
}
|
|
// TODO: (crosbymichael) make this configurable at the Config level
|
|
if config.RestrictSys {
|
|
if (cloneFlags & syscall.CLONE_NEWNS) == 0 {
|
|
return fmt.Errorf("unable to restrict access to kernel files without mount namespace")
|
|
}
|
|
if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
pdeathSignal, err := system.GetParentDeathSignal()
|
|
if err != nil {
|
|
return fmt.Errorf("get parent death signal %s", err)
|
|
}
|
|
if err := finalizeNamespace(config); err != nil {
|
|
return fmt.Errorf("finalize namespace %s", err)
|
|
}
|
|
// finalizeNamespace can change user/group which clears the parent death
|
|
// signal, so we restore it here.
|
|
if err := restoreParentDeathSignal(pdeathSignal); err != nil {
|
|
return fmt.Errorf("restore parent death signal %s", err)
|
|
}
|
|
return system.Execv(process.Args[0], process.Args[0:], config.Env)
|
|
}
|
|
|
|
func (l *linuxFactory) initUserNs(uncleanRootfs string, process *processArgs) (err error) {
|
|
config := process.Config
|
|
// clear the current processes env and replace it with the environment
|
|
// defined on the config
|
|
if err := loadContainerEnvironment(config); err != nil {
|
|
return err
|
|
}
|
|
// join any namespaces via a path to the namespace fd if provided
|
|
if err := joinExistingNamespaces(config.Namespaces); err != nil {
|
|
return err
|
|
}
|
|
if config.Console != "" {
|
|
if err := console.OpenAndDup("/dev/console"); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if _, err := syscall.Setsid(); err != nil {
|
|
return fmt.Errorf("setsid %s", err)
|
|
}
|
|
if config.Console != "" {
|
|
if err := system.Setctty(); err != nil {
|
|
return fmt.Errorf("setctty %s", err)
|
|
}
|
|
}
|
|
if config.WorkingDir == "" {
|
|
config.WorkingDir = "/"
|
|
}
|
|
|
|
if err := setupRlimits(config); err != nil {
|
|
return fmt.Errorf("setup rlimits %s", err)
|
|
}
|
|
cloneFlags := config.Namespaces.CloneFlags()
|
|
if config.Hostname != "" {
|
|
// TODO: move validation
|
|
if (cloneFlags & syscall.CLONE_NEWUTS) == 0 {
|
|
return fmt.Errorf("unable to set the hostname without UTS namespace")
|
|
}
|
|
if err := syscall.Sethostname([]byte(config.Hostname)); err != nil {
|
|
return fmt.Errorf("unable to sethostname %q: %s", config.Hostname, err)
|
|
}
|
|
}
|
|
if err := apparmor.ApplyProfile(config.AppArmorProfile); err != nil {
|
|
return fmt.Errorf("set apparmor profile %s: %s", config.AppArmorProfile, err)
|
|
}
|
|
if err := label.SetProcessLabel(config.ProcessLabel); err != nil {
|
|
return fmt.Errorf("set process label %s", err)
|
|
}
|
|
if config.RestrictSys {
|
|
if (cloneFlags & syscall.CLONE_NEWNS) == 0 {
|
|
return fmt.Errorf("unable to restrict access to kernel files without mount namespace")
|
|
}
|
|
if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
pdeathSignal, err := system.GetParentDeathSignal()
|
|
if err != nil {
|
|
return fmt.Errorf("get parent death signal %s", err)
|
|
}
|
|
if err := finalizeNamespace(config); err != nil {
|
|
return fmt.Errorf("finalize namespace %s", err)
|
|
}
|
|
// finalizeNamespace can change user/group which clears the parent death
|
|
// signal, so we restore it here.
|
|
if err := restoreParentDeathSignal(pdeathSignal); err != nil {
|
|
return fmt.Errorf("restore parent death signal %s", err)
|
|
}
|
|
return system.Execv(process.Args[0], process.Args[0:], config.Env)
|
|
}
|
|
|
|
// restoreParentDeathSignal sets the parent death signal to old.
|
|
func restoreParentDeathSignal(old int) error {
|
|
if old == 0 {
|
|
return nil
|
|
}
|
|
current, err := system.GetParentDeathSignal()
|
|
if err != nil {
|
|
return fmt.Errorf("get parent death signal %s", err)
|
|
}
|
|
if old == current {
|
|
return nil
|
|
}
|
|
if err := system.ParentDeathSignal(uintptr(old)); err != nil {
|
|
return fmt.Errorf("set parent death signal %s", err)
|
|
}
|
|
// Signal self if parent is already dead. Does nothing if running in a new
|
|
// PID namespace, as Getppid will always return 0.
|
|
if syscall.Getppid() == 1 {
|
|
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// setupUser changes the groups, gid, and uid for the user inside the container
|
|
func setupUser(config *configs.Config) error {
|
|
// Set up defaults.
|
|
defaultExecUser := user.ExecUser{
|
|
Uid: syscall.Getuid(),
|
|
Gid: syscall.Getgid(),
|
|
Home: "/",
|
|
}
|
|
passwdPath, err := user.GetPasswdPath()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
groupPath, err := user.GetGroupPath()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
|
|
if err != nil {
|
|
return fmt.Errorf("get supplementary groups %s", err)
|
|
}
|
|
suppGroups := append(execUser.Sgids, config.AdditionalGroups...)
|
|
if err := syscall.Setgroups(suppGroups); err != nil {
|
|
return fmt.Errorf("setgroups %s", err)
|
|
}
|
|
if err := system.Setgid(execUser.Gid); err != nil {
|
|
return fmt.Errorf("setgid %s", err)
|
|
}
|
|
if err := system.Setuid(execUser.Uid); err != nil {
|
|
return fmt.Errorf("setuid %s", err)
|
|
}
|
|
// if we didn't get HOME already, set it based on the user's HOME
|
|
if envHome := os.Getenv("HOME"); envHome == "" {
|
|
if err := os.Setenv("HOME", execUser.Home); err != nil {
|
|
return fmt.Errorf("set HOME %s", err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// setupVethNetwork uses the Network config if it is not nil to initialize
|
|
// the new veth interface inside the container for use by changing the name to eth0
|
|
// setting the MTU and IP address along with the default gateway
|
|
func setupNetwork(config *configs.Config, networkState *configs.NetworkState) error {
|
|
for _, config := range config.Networks {
|
|
strategy, err := network.GetStrategy(config.Type)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err1 := strategy.Initialize(config, networkState)
|
|
if err1 != nil {
|
|
return err1
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func setupRoute(config *configs.Config) error {
|
|
for _, config := range config.Routes {
|
|
if err := netlink.AddRoute(config.Destination, config.Source, config.Gateway, config.InterfaceName); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func setupRlimits(config *configs.Config) error {
|
|
for _, rlimit := range config.Rlimits {
|
|
l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
|
|
if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
|
|
return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// finalizeNamespace drops the caps, sets the correct user
|
|
// and working dir, and closes any leaky file descriptors
|
|
// before execing the command inside the namespace
|
|
func finalizeNamespace(config *configs.Config) error {
|
|
// Ensure that all non-standard fds we may have accidentally
|
|
// inherited are marked close-on-exec so they stay out of the
|
|
// container
|
|
if err := utils.CloseExecFrom(3); err != nil {
|
|
return fmt.Errorf("close open file descriptors %s", err)
|
|
}
|
|
// drop capabilities in bounding set before changing user
|
|
if err := capabilities.DropBoundingSet(config.Capabilities); err != nil {
|
|
return fmt.Errorf("drop bounding set %s", err)
|
|
}
|
|
// preserve existing capabilities while we change users
|
|
if err := system.SetKeepCaps(); err != nil {
|
|
return fmt.Errorf("set keep caps %s", err)
|
|
}
|
|
if err := setupUser(config); err != nil {
|
|
return fmt.Errorf("setup user %s", err)
|
|
}
|
|
if err := system.ClearKeepCaps(); err != nil {
|
|
return fmt.Errorf("clear keep caps %s", err)
|
|
}
|
|
// drop all other capabilities
|
|
if err := capabilities.DropCapabilities(config.Capabilities); err != nil {
|
|
return fmt.Errorf("drop capabilities %s", err)
|
|
}
|
|
if config.WorkingDir != "" {
|
|
if err := syscall.Chdir(config.WorkingDir); err != nil {
|
|
return fmt.Errorf("chdir to %s %s", config.WorkingDir, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func loadContainerEnvironment(config *configs.Config) error {
|
|
os.Clearenv()
|
|
for _, pair := range config.Env {
|
|
p := strings.SplitN(pair, "=", 2)
|
|
if len(p) < 2 {
|
|
return fmt.Errorf("invalid environment '%v'", pair)
|
|
}
|
|
if err := os.Setenv(p[0], p[1]); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// joinExistingNamespaces gets all the namespace paths specified for the container and
|
|
// does a setns on the namespace fd so that the current process joins the namespace.
|
|
func joinExistingNamespaces(namespaces []configs.Namespace) error {
|
|
for _, ns := range namespaces {
|
|
if ns.Path != "" {
|
|
f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = system.Setns(f.Fd(), uintptr(ns.Syscall()))
|
|
f.Close()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|