runc/linux_factory.go

673 lines
20 KiB
Go

// +build linux
package libcontainer
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"regexp"
"strings"
"syscall"
"github.com/golang/glog"
"github.com/docker/libcontainer/apparmor"
cgroups "github.com/docker/libcontainer/cgroups/manager"
"github.com/docker/libcontainer/configs"
"github.com/docker/libcontainer/console"
"github.com/docker/libcontainer/label"
"github.com/docker/libcontainer/mount"
"github.com/docker/libcontainer/netlink"
"github.com/docker/libcontainer/network"
"github.com/docker/libcontainer/security/capabilities"
"github.com/docker/libcontainer/security/restrict"
"github.com/docker/libcontainer/system"
"github.com/docker/libcontainer/user"
"github.com/docker/libcontainer/utils"
)
const (
configFilename = "config.json"
stateFilename = "state.json"
)
var (
idRegex = regexp.MustCompile(`^[\w_]+$`)
maxIdLen = 1024
)
// Process is used for transferring parameters from Exec() to Init()
type processArgs struct {
Args []string `json:"args,omitempty"`
Config *configs.Config `json:"config,omitempty"`
NetworkState *configs.NetworkState `json:"network_state,omitempty"`
}
// New returns a linux based container factory based in the root directory.
func New(root string, initArgs []string) (Factory, error) {
if root != "" {
if err := os.MkdirAll(root, 0700); err != nil {
return nil, newGenericError(err, SystemError)
}
}
return &linuxFactory{
root: root,
initArgs: initArgs,
}, nil
}
// linuxFactory implements the default factory interface for linux based systems.
type linuxFactory struct {
// root is the root directory
root string
initArgs []string
}
func (l *linuxFactory) Create(id string, config *configs.Config) (Container, error) {
if l.root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
}
if err := l.validateID(id); err != nil {
return nil, err
}
containerRoot := filepath.Join(l.root, id)
if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("Container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError)
}
data, err := json.MarshalIndent(config, "", "\t")
if err != nil {
return nil, newGenericError(err, SystemError)
}
if err := os.MkdirAll(containerRoot, 0700); err != nil {
return nil, newGenericError(err, SystemError)
}
f, err := os.Create(filepath.Join(containerRoot, configFilename))
if err != nil {
os.RemoveAll(containerRoot)
return nil, newGenericError(err, SystemError)
}
defer f.Close()
if _, err := f.Write(data); err != nil {
os.RemoveAll(containerRoot)
return nil, newGenericError(err, SystemError)
}
cgroupManager := cgroups.NewCgroupManager(config.Cgroups)
return &linuxContainer{
id: id,
root: containerRoot,
config: config,
initArgs: l.initArgs,
state: &configs.State{},
cgroupManager: cgroupManager,
}, nil
}
func (l *linuxFactory) Load(id string) (Container, error) {
if l.root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
}
containerRoot := filepath.Join(l.root, id)
glog.Infof("loading container config from %s", containerRoot)
config, err := l.loadContainerConfig(containerRoot)
if err != nil {
return nil, err
}
glog.Infof("loading container state from %s", containerRoot)
state, err := l.loadContainerState(containerRoot)
if err != nil {
return nil, err
}
cgroupManager := cgroups.LoadCgroupManager(config.Cgroups, state.CgroupPaths)
glog.Infof("using %s as cgroup manager", cgroupManager)
return &linuxContainer{
id: id,
root: containerRoot,
config: config,
state: state,
cgroupManager: cgroupManager,
initArgs: l.initArgs,
}, nil
}
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *linuxFactory) StartInitialization(pipefd uintptr) (err error) {
pipe := os.NewFile(uintptr(pipefd), "pipe")
setupUserns := os.Getenv("_LIBCONTAINER_USERNS") != ""
pid := os.Getenv("_LIBCONTAINER_INITPID")
if pid != "" && !setupUserns {
return initIn(pipe)
}
defer func() {
// if we have an error during the initialization of the container's init then send it back to the
// parent process in the form of an initError.
if err != nil {
// ensure that any data sent from the parent is consumed so it doesn't
// receive ECONNRESET when the child writes to the pipe.
ioutil.ReadAll(pipe)
if err := json.NewEncoder(pipe).Encode(initError{
Message: err.Error(),
}); err != nil {
panic(err)
}
}
// ensure that this pipe is always closed
pipe.Close()
}()
uncleanRootfs, err := os.Getwd()
if err != nil {
return err
}
var process *processArgs
// We always read this as it is a way to sync with the parent as well
if err := json.NewDecoder(pipe).Decode(&process); err != nil {
return err
}
if setupUserns {
err = setupContainer(process)
if err == nil {
os.Exit(0)
} else {
os.Exit(1)
}
}
if process.Config.Namespaces.Contains(configs.NEWUSER) {
return l.initUserNs(uncleanRootfs, process)
}
return l.initDefault(uncleanRootfs, process)
}
func (l *linuxFactory) loadContainerConfig(root string) (*configs.Config, error) {
f, err := os.Open(filepath.Join(root, configFilename))
if err != nil {
if os.IsNotExist(err) {
return nil, newGenericError(err, ContainerNotExists)
}
return nil, newGenericError(err, SystemError)
}
defer f.Close()
var config *configs.Config
if err := json.NewDecoder(f).Decode(&config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
}
return config, nil
}
func (l *linuxFactory) loadContainerState(root string) (*configs.State, error) {
f, err := os.Open(filepath.Join(root, stateFilename))
if err != nil {
if os.IsNotExist(err) {
return nil, newGenericError(err, ContainerNotExists)
}
return nil, newGenericError(err, SystemError)
}
defer f.Close()
var state *configs.State
if err := json.NewDecoder(f).Decode(&state); err != nil {
return nil, newGenericError(err, SystemError)
}
return state, nil
}
func (l *linuxFactory) validateID(id string) error {
if !idRegex.MatchString(id) {
return newGenericError(fmt.Errorf("Invalid id format: %v", id), InvalidIdFormat)
}
if len(id) > maxIdLen {
return newGenericError(fmt.Errorf("Invalid id format: %v", id), InvalidIdFormat)
}
return nil
}
func (l *linuxFactory) initDefault(uncleanRootfs string, process *processArgs) (err error) {
config := process.Config
networkState := process.NetworkState
// TODO: move to validation
/*
rootfs, err := utils.ResolveRootfs(uncleanRootfs)
if err != nil {
return err
}
*/
// clear the current processes env and replace it with the environment
// defined on the container
if err := loadContainerEnvironment(config); err != nil {
return err
}
// join any namespaces via a path to the namespace fd if provided
if err := joinExistingNamespaces(config.Namespaces); err != nil {
return err
}
if config.Console != "" {
if err := console.OpenAndDup(config.Console); err != nil {
return err
}
}
if _, err := syscall.Setsid(); err != nil {
return fmt.Errorf("setsid %s", err)
}
if config.Console != "" {
if err := system.Setctty(); err != nil {
return fmt.Errorf("setctty %s", err)
}
}
cloneFlags := config.Namespaces.CloneFlags()
if (cloneFlags & syscall.CLONE_NEWNET) == 0 {
if len(config.Networks) != 0 || len(config.Routes) != 0 {
return fmt.Errorf("unable to apply network parameters without network namespace")
}
} else {
if err := setupNetwork(config, networkState); err != nil {
return fmt.Errorf("setup networking %s", err)
}
if err := setupRoute(config); err != nil {
return fmt.Errorf("setup route %s", err)
}
}
if err := setupRlimits(config); err != nil {
return fmt.Errorf("setup rlimits %s", err)
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
if (cloneFlags & syscall.CLONE_NEWNS) != 0 {
if err := mount.InitializeMountNamespace(config); err != nil {
return err
}
}
if config.Hostname != "" {
// TODO: (crosbymichael) move this to pre spawn validation
if (cloneFlags & syscall.CLONE_NEWUTS) == 0 {
return fmt.Errorf("unable to set the hostname without UTS namespace")
}
if err := syscall.Sethostname([]byte(config.Hostname)); err != nil {
return fmt.Errorf("unable to sethostname %q: %s", config.Hostname, err)
}
}
if err := apparmor.ApplyProfile(config.AppArmorProfile); err != nil {
return fmt.Errorf("set apparmor profile %s: %s", config.AppArmorProfile, err)
}
if err := label.SetProcessLabel(config.ProcessLabel); err != nil {
return fmt.Errorf("set process label %s", err)
}
// TODO: (crosbymichael) make this configurable at the Config level
if config.RestrictSys {
if (cloneFlags & syscall.CLONE_NEWNS) == 0 {
return fmt.Errorf("unable to restrict access to kernel files without mount namespace")
}
if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil {
return err
}
}
pdeathSignal, err := system.GetParentDeathSignal()
if err != nil {
return fmt.Errorf("get parent death signal %s", err)
}
if err := finalizeNamespace(config); err != nil {
return fmt.Errorf("finalize namespace %s", err)
}
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := restoreParentDeathSignal(pdeathSignal); err != nil {
return fmt.Errorf("restore parent death signal %s", err)
}
return system.Execv(process.Args[0], process.Args[0:], config.Env)
}
func (l *linuxFactory) initUserNs(uncleanRootfs string, process *processArgs) (err error) {
config := process.Config
// clear the current processes env and replace it with the environment
// defined on the config
if err := loadContainerEnvironment(config); err != nil {
return err
}
// join any namespaces via a path to the namespace fd if provided
if err := joinExistingNamespaces(config.Namespaces); err != nil {
return err
}
if config.Console != "" {
if err := console.OpenAndDup("/dev/console"); err != nil {
return err
}
}
if _, err := syscall.Setsid(); err != nil {
return fmt.Errorf("setsid %s", err)
}
if config.Console != "" {
if err := system.Setctty(); err != nil {
return fmt.Errorf("setctty %s", err)
}
}
if config.WorkingDir == "" {
config.WorkingDir = "/"
}
if err := setupRlimits(config); err != nil {
return fmt.Errorf("setup rlimits %s", err)
}
cloneFlags := config.Namespaces.CloneFlags()
if config.Hostname != "" {
// TODO: move validation
if (cloneFlags & syscall.CLONE_NEWUTS) == 0 {
return fmt.Errorf("unable to set the hostname without UTS namespace")
}
if err := syscall.Sethostname([]byte(config.Hostname)); err != nil {
return fmt.Errorf("unable to sethostname %q: %s", config.Hostname, err)
}
}
if err := apparmor.ApplyProfile(config.AppArmorProfile); err != nil {
return fmt.Errorf("set apparmor profile %s: %s", config.AppArmorProfile, err)
}
if err := label.SetProcessLabel(config.ProcessLabel); err != nil {
return fmt.Errorf("set process label %s", err)
}
if config.RestrictSys {
if (cloneFlags & syscall.CLONE_NEWNS) == 0 {
return fmt.Errorf("unable to restrict access to kernel files without mount namespace")
}
if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil {
return err
}
}
pdeathSignal, err := system.GetParentDeathSignal()
if err != nil {
return fmt.Errorf("get parent death signal %s", err)
}
if err := finalizeNamespace(config); err != nil {
return fmt.Errorf("finalize namespace %s", err)
}
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := restoreParentDeathSignal(pdeathSignal); err != nil {
return fmt.Errorf("restore parent death signal %s", err)
}
return system.Execv(process.Args[0], process.Args[0:], config.Env)
}
// restoreParentDeathSignal sets the parent death signal to old.
func restoreParentDeathSignal(old int) error {
if old == 0 {
return nil
}
current, err := system.GetParentDeathSignal()
if err != nil {
return fmt.Errorf("get parent death signal %s", err)
}
if old == current {
return nil
}
if err := system.ParentDeathSignal(uintptr(old)); err != nil {
return fmt.Errorf("set parent death signal %s", err)
}
// Signal self if parent is already dead. Does nothing if running in a new
// PID namespace, as Getppid will always return 0.
if syscall.Getppid() == 1 {
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
}
return nil
}
// setupUser changes the groups, gid, and uid for the user inside the container
func setupUser(config *configs.Config) error {
// Set up defaults.
defaultExecUser := user.ExecUser{
Uid: syscall.Getuid(),
Gid: syscall.Getgid(),
Home: "/",
}
passwdPath, err := user.GetPasswdPath()
if err != nil {
return err
}
groupPath, err := user.GetGroupPath()
if err != nil {
return err
}
execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
if err != nil {
return fmt.Errorf("get supplementary groups %s", err)
}
suppGroups := append(execUser.Sgids, config.AdditionalGroups...)
if err := syscall.Setgroups(suppGroups); err != nil {
return fmt.Errorf("setgroups %s", err)
}
if err := system.Setgid(execUser.Gid); err != nil {
return fmt.Errorf("setgid %s", err)
}
if err := system.Setuid(execUser.Uid); err != nil {
return fmt.Errorf("setuid %s", err)
}
// if we didn't get HOME already, set it based on the user's HOME
if envHome := os.Getenv("HOME"); envHome == "" {
if err := os.Setenv("HOME", execUser.Home); err != nil {
return fmt.Errorf("set HOME %s", err)
}
}
return nil
}
// setupVethNetwork uses the Network config if it is not nil to initialize
// the new veth interface inside the container for use by changing the name to eth0
// setting the MTU and IP address along with the default gateway
func setupNetwork(config *configs.Config, networkState *configs.NetworkState) error {
for _, config := range config.Networks {
strategy, err := network.GetStrategy(config.Type)
if err != nil {
return err
}
err1 := strategy.Initialize(config, networkState)
if err1 != nil {
return err1
}
}
return nil
}
func setupRoute(config *configs.Config) error {
for _, config := range config.Routes {
if err := netlink.AddRoute(config.Destination, config.Source, config.Gateway, config.InterfaceName); err != nil {
return err
}
}
return nil
}
func setupRlimits(config *configs.Config) error {
for _, rlimit := range config.Rlimits {
l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
}
}
return nil
}
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaky file descriptors
// before execing the command inside the namespace
func finalizeNamespace(config *configs.Config) error {
// Ensure that all non-standard fds we may have accidentally
// inherited are marked close-on-exec so they stay out of the
// container
if err := utils.CloseExecFrom(3); err != nil {
return fmt.Errorf("close open file descriptors %s", err)
}
// drop capabilities in bounding set before changing user
if err := capabilities.DropBoundingSet(config.Capabilities); err != nil {
return fmt.Errorf("drop bounding set %s", err)
}
// preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil {
return fmt.Errorf("set keep caps %s", err)
}
if err := setupUser(config); err != nil {
return fmt.Errorf("setup user %s", err)
}
if err := system.ClearKeepCaps(); err != nil {
return fmt.Errorf("clear keep caps %s", err)
}
// drop all other capabilities
if err := capabilities.DropCapabilities(config.Capabilities); err != nil {
return fmt.Errorf("drop capabilities %s", err)
}
if config.WorkingDir != "" {
if err := syscall.Chdir(config.WorkingDir); err != nil {
return fmt.Errorf("chdir to %s %s", config.WorkingDir, err)
}
}
return nil
}
func loadContainerEnvironment(config *configs.Config) error {
os.Clearenv()
for _, pair := range config.Env {
p := strings.SplitN(pair, "=", 2)
if len(p) < 2 {
return fmt.Errorf("invalid environment '%v'", pair)
}
if err := os.Setenv(p[0], p[1]); err != nil {
return err
}
}
return nil
}
// joinExistingNamespaces gets all the namespace paths specified for the container and
// does a setns on the namespace fd so that the current process joins the namespace.
func joinExistingNamespaces(namespaces []configs.Namespace) error {
for _, ns := range namespaces {
if ns.Path != "" {
f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0)
if err != nil {
return err
}
err = system.Setns(f.Fd(), uintptr(ns.Syscall()))
f.Close()
if err != nil {
return err
}
}
}
return nil
}
// setupContainer is run to setup mounts and networking related operations
// for a user namespace enabled process as a user namespace root doesn't
// have permissions to perform these operations.
// The setup process joins all the namespaces of user namespace enabled init
// except the user namespace, so it run as root in the root user namespace
// to perform these operations.
func setupContainer(process *processArgs) error {
container := process.Config
networkState := process.NetworkState
// TODO : move to validation
/*
rootfs, err := utils.ResolveRootfs(container.Rootfs)
if err != nil {
return err
}
*/
// clear the current processes env and replace it with the environment
// defined on the container
if err := loadContainerEnvironment(container); err != nil {
return err
}
cloneFlags := container.Namespaces.CloneFlags()
if (cloneFlags & syscall.CLONE_NEWNET) == 0 {
if len(container.Networks) != 0 || len(container.Routes) != 0 {
return fmt.Errorf("unable to apply network parameters without network namespace")
}
} else {
if err := setupNetwork(container, networkState); err != nil {
return fmt.Errorf("setup networking %s", err)
}
if err := setupRoute(container); err != nil {
return fmt.Errorf("setup route %s", err)
}
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
if (cloneFlags & syscall.CLONE_NEWNS) != 0 {
if err := mount.InitializeMountNamespace(container); err != nil {
return fmt.Errorf("setup mount namespace %s", err)
}
}
return nil
}
// Finalize entering into a container and execute a specified command
func initIn(pipe *os.File) (err error) {
defer func() {
// if we have an error during the initialization of the container's init then send it back to the
// parent process in the form of an initError.
if err != nil {
// ensure that any data sent from the parent is consumed so it doesn't
// receive ECONNRESET when the child writes to the pipe.
ioutil.ReadAll(pipe)
if err := json.NewEncoder(pipe).Encode(initError{
Message: err.Error(),
}); err != nil {
panic(err)
}
}
// ensure that this pipe is always closed
pipe.Close()
}()
decoder := json.NewDecoder(pipe)
var config *configs.Config
if err := decoder.Decode(&config); err != nil {
return err
}
var process *processArgs
if err := decoder.Decode(&process); err != nil {
return err
}
if err := finalizeSetns(config); err != nil {
return err
}
if err := system.Execv(process.Args[0], process.Args[0:], config.Env); err != nil {
return err
}
panic("unreachable")
}
// finalize expects that the setns calls have been setup and that is has joined an
// existing namespace
func finalizeSetns(container *configs.Config) error {
// clear the current processes env and replace it with the environment defined on the container
if err := loadContainerEnvironment(container); err != nil {
return err
}
if err := setupRlimits(container); err != nil {
return fmt.Errorf("setup rlimits %s", err)
}
if err := finalizeNamespace(container); err != nil {
return err
}
if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil {
return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err)
}
if container.ProcessLabel != "" {
if err := label.SetProcessLabel(container.ProcessLabel); err != nil {
return err
}
}
return nil
}