From bbeae7445a904592c19e571f9de05bb4500c3cc5 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Sat, 31 Jan 2015 21:21:06 -0800 Subject: [PATCH] Remove namespaces package Signed-off-by: Michael Crosby --- configs/namespaces.go | 30 ++ container.go | 2 +- linux_container.go | 464 +++++++++++++++-- linux_factory.go | 406 ++++++++++++++- namespaces/exec.go | 262 ---------- namespaces/execin.go | 218 -------- namespaces/init.go | 465 ------------------ namespaces/utils.go | 48 -- {namespaces/nsenter => nsenter}/README.md | 0 {namespaces/nsenter => nsenter}/nsenter.go | 0 .../nsenter => nsenter}/nsenter_test.go | 0 .../nsenter_unsupported.go | 0 {namespaces/nsenter => nsenter}/nsexec.c | 0 nsinit/exec.go | 140 ++++-- nsinit/init.go | 2 +- 15 files changed, 952 insertions(+), 1085 deletions(-) delete mode 100644 namespaces/exec.go delete mode 100644 namespaces/execin.go delete mode 100644 namespaces/init.go delete mode 100644 namespaces/utils.go rename {namespaces/nsenter => nsenter}/README.md (100%) rename {namespaces/nsenter => nsenter}/nsenter.go (100%) rename {namespaces/nsenter => nsenter}/nsenter_test.go (100%) rename {namespaces/nsenter => nsenter}/nsenter_unsupported.go (100%) rename {namespaces/nsenter => nsenter}/nsexec.c (100%) diff --git a/configs/namespaces.go b/configs/namespaces.go index 5e891eab..a227f1ba 100644 --- a/configs/namespaces.go +++ b/configs/namespaces.go @@ -1,5 +1,9 @@ package configs +import ( + "syscall" +) + type NamespaceType string const ( @@ -18,6 +22,10 @@ type Namespace struct { Path string `json:"path,omitempty"` } +func (n *Namespace) Syscall() int { + return namespaceInfo[n.Type] +} + type Namespaces []Namespace func (n *Namespaces) Remove(t NamespaceType) bool { @@ -50,3 +58,25 @@ func (n *Namespaces) index(t NamespaceType) int { func (n *Namespaces) Contains(t NamespaceType) bool { return n.index(t) != -1 } + +var namespaceInfo = map[NamespaceType]int{ + NEWNET: syscall.CLONE_NEWNET, + NEWNS: syscall.CLONE_NEWNS, + NEWUSER: syscall.CLONE_NEWUSER, + NEWIPC: syscall.CLONE_NEWIPC, + NEWUTS: syscall.CLONE_NEWUTS, + NEWPID: syscall.CLONE_NEWPID, +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This functions returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + var flag int + for _, v := range *n { + if v.Path != "" { + continue + } + flag |= namespaceInfo[v.Type] + } + return uintptr(flag) +} diff --git a/container.go b/container.go index 72d284f8..bb6bce89 100644 --- a/container.go +++ b/container.go @@ -33,7 +33,7 @@ type Container interface { Status() (configs.Status, error) // Returns the current config of the container. - Config() *configs.Config + Config() configs.Config // Returns the PIDs inside this container. The PIDs are in the namespace of the calling process. // diff --git a/linux_container.go b/linux_container.go index 1d7ab309..f684188c 100644 --- a/linux_container.go +++ b/linux_container.go @@ -5,18 +5,35 @@ package libcontainer import ( "encoding/json" "fmt" + "io" + "io/ioutil" "os" "os/exec" "path/filepath" "syscall" + "github.com/docker/libcontainer/apparmor" "github.com/docker/libcontainer/cgroups" "github.com/docker/libcontainer/configs" - "github.com/docker/libcontainer/namespaces" + "github.com/docker/libcontainer/label" + "github.com/docker/libcontainer/mount" "github.com/docker/libcontainer/network" + "github.com/docker/libcontainer/system" "github.com/golang/glog" ) +const ( + EXIT_SIGNAL_OFFSET = 128 +) + +type initError struct { + Message string `json:"message,omitempty"` +} + +func (i initError) Error() string { + return i.Message +} + type linuxContainer struct { id string root string @@ -26,12 +43,14 @@ type linuxContainer struct { initArgs []string } +// ID returns the container's unique ID func (c *linuxContainer) ID() string { return c.id } -func (c *linuxContainer) Config() *configs.Config { - return c.config +// Config returns the container's configuration +func (c *linuxContainer) Config() configs.Config { + return *c.config } func (c *linuxContainer) Status() (configs.Status, error) { @@ -96,48 +115,158 @@ func (c *linuxContainer) Start(process *Process) (int, error) { if status != configs.Destroyed { glog.Info("start new container process") // TODO: (crosbymichael) check out console use for execin - return namespaces.ExecIn(process.Args, c.config.Env, "", cmd, c.config, c.state) + //return namespaces.ExecIn(process.Args, c.config.Env, "", cmd, c.config, c.state) + return c.startNewProcess(cmd, process.Args) } - if err := c.startInitProcess(cmd, process); err != nil { + if err := c.startInitProcess(cmd, process.Args); err != nil { return -1, err } return c.state.InitPid, nil } -func (c *linuxContainer) updateStateFile() error { - fnew := filepath.Join(c.root, fmt.Sprintf("%s.new", stateFilename)) - f, err := os.Create(fnew) +func (c *linuxContainer) startNewProcess(cmd *exec.Cmd, args []string) (int, error) { + var err error + parent, child, err := newInitPipe() if err != nil { - return newGenericError(err, SystemError) + return -1, err } - - err = json.NewEncoder(f).Encode(c.state) + defer parent.Close() + cmd.ExtraFiles = []*os.File{child} + cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPID=%d", c.state.InitPid)) + if err := cmd.Start(); err != nil { + child.Close() + return -1, err + } + child.Close() + s, err := cmd.Process.Wait() if err != nil { - f.Close() - os.Remove(fnew) - return newGenericError(err, SystemError) + return -1, err } - f.Close() - - fname := filepath.Join(c.root, stateFilename) - if err := os.Rename(fnew, fname); err != nil { - return newGenericError(err, SystemError) + if !s.Success() { + return -1, &exec.ExitError{s} } - - return nil + decoder := json.NewDecoder(parent) + var pid *pid + if err := decoder.Decode(&pid); err != nil { + return -1, err + } + p, err := os.FindProcess(pid.Pid) + if err != nil { + return -1, err + } + terminate := func(terr error) (int, error) { + // TODO: log the errors for kill and wait + p.Kill() + p.Wait() + return -1, terr + } + // Enter cgroups. + if err := enterCgroups(c.state, pid.Pid); err != nil { + return terminate(err) + } + encoder := json.NewEncoder(parent) + if err := encoder.Encode(c.config); err != nil { + return terminate(err) + } + process := processArgs{ + Config: c.config, + Args: args, + } + if err := encoder.Encode(process); err != nil { + return terminate(err) + } + return pid.Pid, nil } -func (c *linuxContainer) startInitProcess(cmd *exec.Cmd, config *Process) error { - err := namespaces.Exec(config.Args, c.config.Env, c.config.Console, cmd, c.config, c.cgroupManager, c.state) +func (c *linuxContainer) startInitProcess(cmd *exec.Cmd, args []string) error { + // create a pipe so that we can syncronize with the namespaced process and + // pass the state and configuration to the child process + parent, child, err := newInitPipe() if err != nil { return err } + defer parent.Close() + cmd.ExtraFiles = []*os.File{child} + cmd.SysProcAttr.Cloneflags = c.config.Namespaces.CloneFlags() + if c.config.Namespaces.Contains(configs.NEWUSER) { + addUidGidMappings(cmd.SysProcAttr, c.config) + // Default to root user when user namespaces are enabled. + if cmd.SysProcAttr.Credential == nil { + cmd.SysProcAttr.Credential = &syscall.Credential{} + } + } + glog.Info("starting container init process") + err = cmd.Start() + child.Close() + if err != nil { + return newGenericError(err, SystemError) + } + wait := func() (*os.ProcessState, error) { + ps, err := cmd.Process.Wait() + // we should kill all processes in cgroup when init is died if we use + // host PID namespace + if !c.config.Namespaces.Contains(configs.NEWPID) { + c.killAllPids() + } + return ps, newGenericError(err, SystemError) + } + terminate := func(terr error) error { + // TODO: log the errors for kill and wait + cmd.Process.Kill() + wait() + return terr + } + started, err := system.GetProcessStartTime(cmd.Process.Pid) + if err != nil { + return terminate(err) + } + // Do this before syncing with child so that no children + // can escape the cgroup + if err := c.cgroupManager.Apply(cmd.Process.Pid); err != nil { + return terminate(err) + } + defer func() { + if err != nil { + c.cgroupManager.Destroy() + } + }() + var networkState configs.NetworkState + if err := c.initializeNetworking(cmd.Process.Pid, &networkState); err != nil { + return terminate(err) + } + process := processArgs{ + Args: args, + Config: c.config, + NetworkState: &networkState, + } + // Start the setup process to setup the init process + if c.config.Namespaces.Contains(configs.NEWUSER) { + if err = executeSetupCmd(cmd.Args, cmd.Process.Pid, c.config, &process, &networkState); err != nil { + return terminate(err) + } + } + // send the state to the container's init process then shutdown writes for the parent + if err := json.NewEncoder(parent).Encode(process); err != nil { + return terminate(err) + } + // shutdown writes for the parent side of the pipe + if err := syscall.Shutdown(int(parent.Fd()), syscall.SHUT_WR); err != nil { + return terminate(err) + } + // wait for the child process to fully complete and receive an error message + // if one was encoutered + var ierr *initError + if err := json.NewDecoder(parent).Decode(&ierr); err != nil && err != io.EOF { + return terminate(err) + } + if ierr != nil { + return terminate(ierr) + } - err = c.updateStateFile() - if err != nil { - // FIXME c.Kill() - return err - } + c.state.InitPid = cmd.Process.Pid + c.state.InitStartTime = started + c.state.NetworkState = networkState + c.state.CgroupPaths = c.cgroupManager.GetPaths() return nil } @@ -179,3 +308,282 @@ func (c *linuxContainer) Wait() (syscall.WaitStatus, error) { func (c *linuxContainer) OOM() (<-chan struct{}, error) { return NotifyOnOOM(c.state) } + +func (c *linuxContainer) updateStateFile() error { + fnew := filepath.Join(c.root, fmt.Sprintf("%s.new", stateFilename)) + f, err := os.Create(fnew) + if err != nil { + return newGenericError(err, SystemError) + } + defer f.Close() + + if err := json.NewEncoder(f).Encode(c.state); err != nil { + f.Close() + os.Remove(fnew) + return newGenericError(err, SystemError) + } + fname := filepath.Join(c.root, stateFilename) + if err := os.Rename(fnew, fname); err != nil { + return newGenericError(err, SystemError) + } + return nil +} + +// New returns a newly initialized Pipe for communication between processes +func newInitPipe() (parent *os.File, child *os.File, err error) { + fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil +} + +// Converts IDMap to SysProcIDMap array and adds it to SysProcAttr. +func addUidGidMappings(sys *syscall.SysProcAttr, container *configs.Config) { + if container.UidMappings != nil { + sys.UidMappings = make([]syscall.SysProcIDMap, len(container.UidMappings)) + for i, um := range container.UidMappings { + sys.UidMappings[i].ContainerID = um.ContainerID + sys.UidMappings[i].HostID = um.HostID + sys.UidMappings[i].Size = um.Size + } + } + + if container.GidMappings != nil { + sys.GidMappings = make([]syscall.SysProcIDMap, len(container.GidMappings)) + for i, gm := range container.GidMappings { + sys.GidMappings[i].ContainerID = gm.ContainerID + sys.GidMappings[i].HostID = gm.HostID + sys.GidMappings[i].Size = gm.Size + } + } +} + +// killAllPids iterates over all of the container's processes +// sending a SIGKILL to each process. +func (c *linuxContainer) killAllPids() error { + glog.Info("killing all processes in container") + var procs []*os.Process + c.cgroupManager.Freeze(configs.Frozen) + pids, err := c.cgroupManager.GetPids() + if err != nil { + return err + } + for _, pid := range pids { + // TODO: log err without aborting if we are unable to find + // a single PID + if p, err := os.FindProcess(pid); err == nil { + procs = append(procs, p) + p.Kill() + } + } + c.cgroupManager.Freeze(configs.Thawed) + for _, p := range procs { + p.Wait() + } + return err +} + +// initializeNetworking creates the container's network stack outside of the namespace and moves +// interfaces into the container's net namespaces if necessary +func (c *linuxContainer) initializeNetworking(nspid int, networkState *configs.NetworkState) error { + glog.Info("initailzing container's network stack") + for _, config := range c.config.Networks { + strategy, err := network.GetStrategy(config.Type) + if err != nil { + return err + } + if err := strategy.Create(config, nspid, networkState); err != nil { + return err + } + } + return nil +} + +func executeSetupCmd(args []string, ppid int, container *configs.Config, process *processArgs, networkState *configs.NetworkState) error { + command := exec.Command(args[0], args[1:]...) + parent, child, err := newInitPipe() + if err != nil { + return err + } + defer parent.Close() + command.ExtraFiles = []*os.File{child} + command.Dir = container.RootFs + command.Env = append(command.Env, + fmt.Sprintf("_LIBCONTAINER_INITPID=%d", ppid), + fmt.Sprintf("_LIBCONTAINER_USERNS=1")) + err = command.Start() + child.Close() + if err != nil { + return err + } + s, err := command.Process.Wait() + if err != nil { + return err + } + if !s.Success() { + return &exec.ExitError{s} + } + decoder := json.NewDecoder(parent) + var pid *pid + if err := decoder.Decode(&pid); err != nil { + return err + } + p, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + terminate := func(terr error) error { + // TODO: log the errors for kill and wait + p.Kill() + p.Wait() + return terr + } + // send the state to the container's init process then shutdown writes for the parent + if err := json.NewEncoder(parent).Encode(process); err != nil { + return terminate(err) + } + // shutdown writes for the parent side of the pipe + if err := syscall.Shutdown(int(parent.Fd()), syscall.SHUT_WR); err != nil { + return terminate(err) + } + // wait for the child process to fully complete and receive an error message + // if one was encoutered + var ierr *initError + if err := decoder.Decode(&ierr); err != nil && err != io.EOF { + return terminate(err) + } + if ierr != nil { + return ierr + } + s, err = p.Wait() + if err != nil { + return err + } + if !s.Success() { + return &exec.ExitError{s} + } + return nil +} + +type pid struct { + Pid int `json:"Pid"` +} + +// Finalize entering into a container and execute a specified command +func InitIn(pipe *os.File) (err error) { + defer func() { + // if we have an error during the initialization of the container's init then send it back to the + // parent process in the form of an initError. + if err != nil { + // ensure that any data sent from the parent is consumed so it doesn't + // receive ECONNRESET when the child writes to the pipe. + ioutil.ReadAll(pipe) + if err := json.NewEncoder(pipe).Encode(initError{ + Message: err.Error(), + }); err != nil { + panic(err) + } + } + // ensure that this pipe is always closed + pipe.Close() + }() + decoder := json.NewDecoder(pipe) + var config *configs.Config + if err := decoder.Decode(&config); err != nil { + return err + } + var process *processArgs + if err := decoder.Decode(&process); err != nil { + return err + } + if err := finalizeSetns(config); err != nil { + return err + } + if err := system.Execv(process.Args[0], process.Args[0:], config.Env); err != nil { + return err + } + panic("unreachable") +} + +// finalize expects that the setns calls have been setup and that is has joined an +// existing namespace +func finalizeSetns(container *configs.Config) error { + // clear the current processes env and replace it with the environment defined on the container + if err := loadContainerEnvironment(container); err != nil { + return err + } + + if err := setupRlimits(container); err != nil { + return fmt.Errorf("setup rlimits %s", err) + } + + if err := finalizeNamespace(container); err != nil { + return err + } + + if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil { + return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err) + } + + if container.ProcessLabel != "" { + if err := label.SetProcessLabel(container.ProcessLabel); err != nil { + return err + } + } + + return nil +} + +// SetupContainer is run to setup mounts and networking related operations +// for a user namespace enabled process as a user namespace root doesn't +// have permissions to perform these operations. +// The setup process joins all the namespaces of user namespace enabled init +// except the user namespace, so it run as root in the root user namespace +// to perform these operations. +func SetupContainer(process *processArgs) error { + container := process.Config + networkState := process.NetworkState + + // TODO : move to validation + /* + rootfs, err := utils.ResolveRootfs(container.RootFs) + if err != nil { + return err + } + */ + + // clear the current processes env and replace it with the environment + // defined on the container + if err := loadContainerEnvironment(container); err != nil { + return err + } + + cloneFlags := container.Namespaces.CloneFlags() + if (cloneFlags & syscall.CLONE_NEWNET) == 0 { + if len(container.Networks) != 0 || len(container.Routes) != 0 { + return fmt.Errorf("unable to apply network parameters without network namespace") + } + } else { + if err := setupNetwork(container, networkState); err != nil { + return fmt.Errorf("setup networking %s", err) + } + if err := setupRoute(container); err != nil { + return fmt.Errorf("setup route %s", err) + } + } + + label.Init() + + // InitializeMountNamespace() can be executed only for a new mount namespace + if (cloneFlags & syscall.CLONE_NEWNS) != 0 { + if err := mount.InitializeMountNamespace(container); err != nil { + return fmt.Errorf("setup mount namespace %s", err) + } + } + return nil +} + +func enterCgroups(state *configs.State, pid int) error { + return cgroups.EnterPid(state.CgroupPaths, pid) +} diff --git a/linux_factory.go b/linux_factory.go index 3583bf72..d7bf8a50 100644 --- a/linux_factory.go +++ b/linux_factory.go @@ -5,15 +5,28 @@ package libcontainer import ( "encoding/json" "fmt" + "io/ioutil" "os" "path/filepath" "regexp" + "strings" + "syscall" "github.com/golang/glog" + "github.com/docker/libcontainer/apparmor" cgroups "github.com/docker/libcontainer/cgroups/manager" "github.com/docker/libcontainer/configs" - "github.com/docker/libcontainer/namespaces" + "github.com/docker/libcontainer/console" + "github.com/docker/libcontainer/label" + "github.com/docker/libcontainer/mount" + "github.com/docker/libcontainer/netlink" + "github.com/docker/libcontainer/network" + "github.com/docker/libcontainer/security/capabilities" + "github.com/docker/libcontainer/security/restrict" + "github.com/docker/libcontainer/system" + "github.com/docker/libcontainer/user" + "github.com/docker/libcontainer/utils" ) const ( @@ -26,6 +39,13 @@ var ( maxIdLen = 1024 ) +// Process is used for transferring parameters from Exec() to Init() +type processArgs struct { + Args []string `json:"args,omitempty"` + Config *configs.Config `json:"config,omitempty"` + NetworkState *configs.NetworkState `json:"network_state,omitempty"` +} + // New returns a linux based container factory based in the root directory. func New(root string, initArgs []string) (Factory, error) { if root != "" { @@ -116,16 +136,50 @@ func (l *linuxFactory) Load(id string) (Container, error) { // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state // This is a low level implementation detail of the reexec and should not be consumed externally -func (f *linuxFactory) StartInitialization(pipefd uintptr) (err error) { +func (l *linuxFactory) StartInitialization(pipefd uintptr) (err error) { pipe := os.NewFile(uintptr(pipefd), "pipe") - - setupUserns := os.Getenv("_LIBCONTAINER_USERNS") + setupUserns := os.Getenv("_LIBCONTAINER_USERNS") != "" pid := os.Getenv("_LIBCONTAINER_INITPID") - if pid != "" && setupUserns == "" { - return namespaces.InitIn(pipe) + if pid != "" && !setupUserns { + return InitIn(pipe) } - - return namespaces.Init(pipe, setupUserns != "") + defer func() { + // if we have an error during the initialization of the container's init then send it back to the + // parent process in the form of an initError. + if err != nil { + // ensure that any data sent from the parent is consumed so it doesn't + // receive ECONNRESET when the child writes to the pipe. + ioutil.ReadAll(pipe) + if err := json.NewEncoder(pipe).Encode(initError{ + Message: err.Error(), + }); err != nil { + panic(err) + } + } + // ensure that this pipe is always closed + pipe.Close() + }() + uncleanRootfs, err := os.Getwd() + if err != nil { + return err + } + var process *processArgs + // We always read this as it is a way to sync with the parent as well + if err := json.NewDecoder(pipe).Decode(&process); err != nil { + return err + } + if setupUserns { + err = SetupContainer(process) + if err == nil { + os.Exit(0) + } else { + os.Exit(1) + } + } + if process.Config.Namespaces.Contains(configs.NEWUSER) { + return l.initUserNs(uncleanRootfs, process) + } + return l.initDefault(uncleanRootfs, process) } func (l *linuxFactory) loadContainerConfig(root string) (*configs.Config, error) { @@ -137,7 +191,6 @@ func (l *linuxFactory) loadContainerConfig(root string) (*configs.Config, error) return nil, newGenericError(err, SystemError) } defer f.Close() - var config *configs.Config if err := json.NewDecoder(f).Decode(&config); err != nil { return nil, newGenericError(err, ConfigInvalid) @@ -154,7 +207,6 @@ func (l *linuxFactory) loadContainerState(root string) (*configs.State, error) { return nil, newGenericError(err, SystemError) } defer f.Close() - var state *configs.State if err := json.NewDecoder(f).Decode(&state); err != nil { return nil, newGenericError(err, SystemError) @@ -171,3 +223,337 @@ func (l *linuxFactory) validateID(id string) error { } return nil } + +func (l *linuxFactory) initDefault(uncleanRootfs string, process *processArgs) (err error) { + config := process.Config + networkState := process.NetworkState + + // TODO: move to validation + /* + rootfs, err := utils.ResolveRootfs(uncleanRootfs) + if err != nil { + return err + } + */ + + // clear the current processes env and replace it with the environment + // defined on the container + if err := loadContainerEnvironment(config); err != nil { + return err + } + // join any namespaces via a path to the namespace fd if provided + if err := joinExistingNamespaces(config.Namespaces); err != nil { + return err + } + if config.Console != "" { + if err := console.OpenAndDup(config.Console); err != nil { + return err + } + } + if _, err := syscall.Setsid(); err != nil { + return fmt.Errorf("setsid %s", err) + } + if config.Console != "" { + if err := system.Setctty(); err != nil { + return fmt.Errorf("setctty %s", err) + } + } + + cloneFlags := config.Namespaces.CloneFlags() + if (cloneFlags & syscall.CLONE_NEWNET) == 0 { + if len(config.Networks) != 0 || len(config.Routes) != 0 { + return fmt.Errorf("unable to apply network parameters without network namespace") + } + } else { + if err := setupNetwork(config, networkState); err != nil { + return fmt.Errorf("setup networking %s", err) + } + if err := setupRoute(config); err != nil { + return fmt.Errorf("setup route %s", err) + } + } + if err := setupRlimits(config); err != nil { + return fmt.Errorf("setup rlimits %s", err) + } + label.Init() + // InitializeMountNamespace() can be executed only for a new mount namespace + if (cloneFlags & syscall.CLONE_NEWNS) != 0 { + if err := mount.InitializeMountNamespace(config); err != nil { + return err + } + } + if config.Hostname != "" { + // TODO: (crosbymichael) move this to pre spawn validation + if (cloneFlags & syscall.CLONE_NEWUTS) == 0 { + return fmt.Errorf("unable to set the hostname without UTS namespace") + } + if err := syscall.Sethostname([]byte(config.Hostname)); err != nil { + return fmt.Errorf("unable to sethostname %q: %s", config.Hostname, err) + } + } + if err := apparmor.ApplyProfile(config.AppArmorProfile); err != nil { + return fmt.Errorf("set apparmor profile %s: %s", config.AppArmorProfile, err) + } + if err := label.SetProcessLabel(config.ProcessLabel); err != nil { + return fmt.Errorf("set process label %s", err) + } + // TODO: (crosbymichael) make this configurable at the Config level + if config.RestrictSys { + if (cloneFlags & syscall.CLONE_NEWNS) == 0 { + return fmt.Errorf("unable to restrict access to kernel files without mount namespace") + } + if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil { + return err + } + } + pdeathSignal, err := system.GetParentDeathSignal() + if err != nil { + return fmt.Errorf("get parent death signal %s", err) + } + if err := finalizeNamespace(config); err != nil { + return fmt.Errorf("finalize namespace %s", err) + } + // finalizeNamespace can change user/group which clears the parent death + // signal, so we restore it here. + if err := restoreParentDeathSignal(pdeathSignal); err != nil { + return fmt.Errorf("restore parent death signal %s", err) + } + return system.Execv(process.Args[0], process.Args[0:], config.Env) +} + +func (l *linuxFactory) initUserNs(uncleanRootfs string, process *processArgs) (err error) { + config := process.Config + // clear the current processes env and replace it with the environment + // defined on the config + if err := loadContainerEnvironment(config); err != nil { + return err + } + // join any namespaces via a path to the namespace fd if provided + if err := joinExistingNamespaces(config.Namespaces); err != nil { + return err + } + if config.Console != "" { + if err := console.OpenAndDup("/dev/console"); err != nil { + return err + } + } + if _, err := syscall.Setsid(); err != nil { + return fmt.Errorf("setsid %s", err) + } + if config.Console != "" { + if err := system.Setctty(); err != nil { + return fmt.Errorf("setctty %s", err) + } + } + if config.WorkingDir == "" { + config.WorkingDir = "/" + } + + if err := setupRlimits(config); err != nil { + return fmt.Errorf("setup rlimits %s", err) + } + cloneFlags := config.Namespaces.CloneFlags() + if config.Hostname != "" { + // TODO: move validation + if (cloneFlags & syscall.CLONE_NEWUTS) == 0 { + return fmt.Errorf("unable to set the hostname without UTS namespace") + } + if err := syscall.Sethostname([]byte(config.Hostname)); err != nil { + return fmt.Errorf("unable to sethostname %q: %s", config.Hostname, err) + } + } + if err := apparmor.ApplyProfile(config.AppArmorProfile); err != nil { + return fmt.Errorf("set apparmor profile %s: %s", config.AppArmorProfile, err) + } + if err := label.SetProcessLabel(config.ProcessLabel); err != nil { + return fmt.Errorf("set process label %s", err) + } + if config.RestrictSys { + if (cloneFlags & syscall.CLONE_NEWNS) == 0 { + return fmt.Errorf("unable to restrict access to kernel files without mount namespace") + } + if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil { + return err + } + } + pdeathSignal, err := system.GetParentDeathSignal() + if err != nil { + return fmt.Errorf("get parent death signal %s", err) + } + if err := finalizeNamespace(config); err != nil { + return fmt.Errorf("finalize namespace %s", err) + } + // finalizeNamespace can change user/group which clears the parent death + // signal, so we restore it here. + if err := restoreParentDeathSignal(pdeathSignal); err != nil { + return fmt.Errorf("restore parent death signal %s", err) + } + return system.Execv(process.Args[0], process.Args[0:], config.Env) +} + +// restoreParentDeathSignal sets the parent death signal to old. +func restoreParentDeathSignal(old int) error { + if old == 0 { + return nil + } + current, err := system.GetParentDeathSignal() + if err != nil { + return fmt.Errorf("get parent death signal %s", err) + } + if old == current { + return nil + } + if err := system.ParentDeathSignal(uintptr(old)); err != nil { + return fmt.Errorf("set parent death signal %s", err) + } + // Signal self if parent is already dead. Does nothing if running in a new + // PID namespace, as Getppid will always return 0. + if syscall.Getppid() == 1 { + return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) + } + return nil +} + +// setupUser changes the groups, gid, and uid for the user inside the container +func setupUser(config *configs.Config) error { + // Set up defaults. + defaultExecUser := user.ExecUser{ + Uid: syscall.Getuid(), + Gid: syscall.Getgid(), + Home: "/", + } + passwdPath, err := user.GetPasswdPath() + if err != nil { + return err + } + groupPath, err := user.GetGroupPath() + if err != nil { + return err + } + execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) + if err != nil { + return fmt.Errorf("get supplementary groups %s", err) + } + suppGroups := append(execUser.Sgids, config.AdditionalGroups...) + if err := syscall.Setgroups(suppGroups); err != nil { + return fmt.Errorf("setgroups %s", err) + } + if err := system.Setgid(execUser.Gid); err != nil { + return fmt.Errorf("setgid %s", err) + } + if err := system.Setuid(execUser.Uid); err != nil { + return fmt.Errorf("setuid %s", err) + } + // if we didn't get HOME already, set it based on the user's HOME + if envHome := os.Getenv("HOME"); envHome == "" { + if err := os.Setenv("HOME", execUser.Home); err != nil { + return fmt.Errorf("set HOME %s", err) + } + } + return nil +} + +// setupVethNetwork uses the Network config if it is not nil to initialize +// the new veth interface inside the container for use by changing the name to eth0 +// setting the MTU and IP address along with the default gateway +func setupNetwork(config *configs.Config, networkState *configs.NetworkState) error { + for _, config := range config.Networks { + strategy, err := network.GetStrategy(config.Type) + if err != nil { + return err + } + err1 := strategy.Initialize(config, networkState) + if err1 != nil { + return err1 + } + } + return nil +} + +func setupRoute(config *configs.Config) error { + for _, config := range config.Routes { + if err := netlink.AddRoute(config.Destination, config.Source, config.Gateway, config.InterfaceName); err != nil { + return err + } + } + return nil +} + +func setupRlimits(config *configs.Config) error { + for _, rlimit := range config.Rlimits { + l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft} + if err := syscall.Setrlimit(rlimit.Type, l); err != nil { + return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err) + } + } + return nil +} + +// finalizeNamespace drops the caps, sets the correct user +// and working dir, and closes any leaky file descriptors +// before execing the command inside the namespace +func finalizeNamespace(config *configs.Config) error { + // Ensure that all non-standard fds we may have accidentally + // inherited are marked close-on-exec so they stay out of the + // container + if err := utils.CloseExecFrom(3); err != nil { + return fmt.Errorf("close open file descriptors %s", err) + } + // drop capabilities in bounding set before changing user + if err := capabilities.DropBoundingSet(config.Capabilities); err != nil { + return fmt.Errorf("drop bounding set %s", err) + } + // preserve existing capabilities while we change users + if err := system.SetKeepCaps(); err != nil { + return fmt.Errorf("set keep caps %s", err) + } + if err := setupUser(config); err != nil { + return fmt.Errorf("setup user %s", err) + } + if err := system.ClearKeepCaps(); err != nil { + return fmt.Errorf("clear keep caps %s", err) + } + // drop all other capabilities + if err := capabilities.DropCapabilities(config.Capabilities); err != nil { + return fmt.Errorf("drop capabilities %s", err) + } + if config.WorkingDir != "" { + if err := syscall.Chdir(config.WorkingDir); err != nil { + return fmt.Errorf("chdir to %s %s", config.WorkingDir, err) + } + } + return nil +} + +func loadContainerEnvironment(config *configs.Config) error { + os.Clearenv() + for _, pair := range config.Env { + p := strings.SplitN(pair, "=", 2) + if len(p) < 2 { + return fmt.Errorf("invalid environment '%v'", pair) + } + if err := os.Setenv(p[0], p[1]); err != nil { + return err + } + } + return nil +} + +// joinExistingNamespaces gets all the namespace paths specified for the container and +// does a setns on the namespace fd so that the current process joins the namespace. +func joinExistingNamespaces(namespaces []configs.Namespace) error { + for _, ns := range namespaces { + if ns.Path != "" { + f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0) + if err != nil { + return err + } + err = system.Setns(f.Fd(), uintptr(ns.Syscall())) + f.Close() + if err != nil { + return err + } + } + } + return nil +} diff --git a/namespaces/exec.go b/namespaces/exec.go deleted file mode 100644 index ec462cbc..00000000 --- a/namespaces/exec.go +++ /dev/null @@ -1,262 +0,0 @@ -// +build linux - -package namespaces - -import ( - "encoding/json" - "fmt" - "io" - "os" - "os/exec" - "syscall" - - "github.com/docker/libcontainer/cgroups" - "github.com/docker/libcontainer/configs" - "github.com/docker/libcontainer/network" - "github.com/docker/libcontainer/system" -) - -const ( - EXIT_SIGNAL_OFFSET = 128 -) - -func executeSetupCmd(args []string, ppid int, container *configs.Config, process *processArgs, networkState *configs.NetworkState) error { - command := exec.Command(args[0], args[1:]...) - parent, child, err := newInitPipe() - if err != nil { - return err - } - defer parent.Close() - command.ExtraFiles = []*os.File{child} - command.Dir = container.RootFs - command.Env = append(command.Env, - fmt.Sprintf("_LIBCONTAINER_INITPID=%d", ppid), - fmt.Sprintf("_LIBCONTAINER_USERNS=1")) - err = command.Start() - child.Close() - if err != nil { - return err - } - s, err := command.Process.Wait() - if err != nil { - return err - } - if !s.Success() { - return &exec.ExitError{s} - } - decoder := json.NewDecoder(parent) - var pid *pid - if err := decoder.Decode(&pid); err != nil { - return err - } - p, err := os.FindProcess(pid.Pid) - if err != nil { - return err - } - terminate := func(terr error) error { - // TODO: log the errors for kill and wait - p.Kill() - p.Wait() - return terr - } - // send the state to the container's init process then shutdown writes for the parent - if err := json.NewEncoder(parent).Encode(process); err != nil { - return terminate(err) - } - // shutdown writes for the parent side of the pipe - if err := syscall.Shutdown(int(parent.Fd()), syscall.SHUT_WR); err != nil { - return terminate(err) - } - // wait for the child process to fully complete and receive an error message - // if one was encoutered - var ierr *initError - if err := decoder.Decode(&ierr); err != nil && err != io.EOF { - return terminate(err) - } - if ierr != nil { - return ierr - } - s, err = p.Wait() - if err != nil { - return err - } - if !s.Success() { - return &exec.ExitError{s} - } - return nil -} - -// TODO(vishh): This is part of the libcontainer API and it does much more than just namespaces related work. -// Move this to libcontainer package. -// Exec performs setup outside of a namespace so that a container can be -// executed. Exec is a high level function for working with container namespaces. -func Exec(args []string, env []string, console string, command *exec.Cmd, container *configs.Config, cgroupManager cgroups.Manager, state *configs.State) (err error) { - // create a pipe so that we can syncronize with the namespaced process and - // pass the state and configuration to the child process - parent, child, err := newInitPipe() - if err != nil { - return err - } - defer parent.Close() - command.ExtraFiles = []*os.File{child} - - command.Dir = container.RootFs - command.SysProcAttr.Cloneflags = uintptr(GetNamespaceFlags(container.Namespaces)) - - if container.Namespaces.Contains(configs.NEWUSER) { - AddUidGidMappings(command.SysProcAttr, container) - - // Default to root user when user namespaces are enabled. - if command.SysProcAttr.Credential == nil { - command.SysProcAttr.Credential = &syscall.Credential{} - } - } - - if err := command.Start(); err != nil { - child.Close() - return err - } - child.Close() - - wait := func() (*os.ProcessState, error) { - ps, err := command.Process.Wait() - // we should kill all processes in cgroup when init is died if we use - // host PID namespace - if !container.Namespaces.Contains(configs.NEWPID) { - killAllPids(cgroupManager) - } - return ps, err - } - - terminate := func(terr error) error { - // TODO: log the errors for kill and wait - command.Process.Kill() - wait() - return terr - } - - started, err := system.GetProcessStartTime(command.Process.Pid) - if err != nil { - return terminate(err) - } - - // Do this before syncing with child so that no children - // can escape the cgroup - err = cgroupManager.Apply(command.Process.Pid) - if err != nil { - return terminate(err) - } - defer func() { - if err != nil { - cgroupManager.Destroy() - } - }() - - var networkState configs.NetworkState - if err := InitializeNetworking(container, command.Process.Pid, &networkState); err != nil { - return terminate(err) - } - - process := processArgs{ - Env: append(env[0:], container.Env...), - Args: args, - ConsolePath: console, - Config: container, - NetworkState: &networkState, - } - - // Start the setup process to setup the init process - if container.Namespaces.Contains(configs.NEWUSER) { - if err = executeSetupCmd(command.Args, command.Process.Pid, container, &process, &networkState); err != nil { - return terminate(err) - } - } - - // send the state to the container's init process then shutdown writes for the parent - if err := json.NewEncoder(parent).Encode(process); err != nil { - return terminate(err) - } - // shutdown writes for the parent side of the pipe - if err := syscall.Shutdown(int(parent.Fd()), syscall.SHUT_WR); err != nil { - return terminate(err) - } - - // wait for the child process to fully complete and receive an error message - // if one was encoutered - var ierr *initError - if err := json.NewDecoder(parent).Decode(&ierr); err != nil && err != io.EOF { - return terminate(err) - } - if ierr != nil { - return terminate(ierr) - } - - state.InitPid = command.Process.Pid - state.InitStartTime = started - state.NetworkState = networkState - state.CgroupPaths = cgroupManager.GetPaths() - - return nil -} - -// killAllPids iterates over all of the container's processes -// sending a SIGKILL to each process. -func killAllPids(m cgroups.Manager) error { - var ( - procs []*os.Process - ) - m.Freeze(configs.Frozen) - pids, err := m.GetPids() - if err != nil { - return err - } - for _, pid := range pids { - // TODO: log err without aborting if we are unable to find - // a single PID - if p, err := os.FindProcess(pid); err == nil { - procs = append(procs, p) - p.Kill() - } - } - m.Freeze(configs.Thawed) - for _, p := range procs { - p.Wait() - } - return err -} - -// Converts IDMap to SysProcIDMap array and adds it to SysProcAttr. -func AddUidGidMappings(sys *syscall.SysProcAttr, container *configs.Config) { - if container.UidMappings != nil { - sys.UidMappings = make([]syscall.SysProcIDMap, len(container.UidMappings)) - for i, um := range container.UidMappings { - sys.UidMappings[i].ContainerID = um.ContainerID - sys.UidMappings[i].HostID = um.HostID - sys.UidMappings[i].Size = um.Size - } - } - - if container.GidMappings != nil { - sys.GidMappings = make([]syscall.SysProcIDMap, len(container.GidMappings)) - for i, gm := range container.GidMappings { - sys.GidMappings[i].ContainerID = gm.ContainerID - sys.GidMappings[i].HostID = gm.HostID - sys.GidMappings[i].Size = gm.Size - } - } -} - -// InitializeNetworking creates the container's network stack outside of the namespace and moves -// interfaces into the container's net namespaces if necessary -func InitializeNetworking(container *configs.Config, nspid int, networkState *configs.NetworkState) error { - for _, config := range container.Networks { - strategy, err := network.GetStrategy(config.Type) - if err != nil { - return err - } - if err := strategy.Create(config, nspid, networkState); err != nil { - return err - } - } - return nil -} diff --git a/namespaces/execin.go b/namespaces/execin.go deleted file mode 100644 index 75e70a06..00000000 --- a/namespaces/execin.go +++ /dev/null @@ -1,218 +0,0 @@ -// +build linux - -package namespaces - -import ( - "encoding/json" - "fmt" - "io/ioutil" - "os" - "os/exec" - "syscall" - - "github.com/docker/libcontainer/apparmor" - "github.com/docker/libcontainer/cgroups" - "github.com/docker/libcontainer/configs" - "github.com/docker/libcontainer/label" - "github.com/docker/libcontainer/mount" - "github.com/docker/libcontainer/system" -) - -type pid struct { - Pid int `json:"Pid"` -} - -// ExecIn reexec's cmd with _LIBCONTAINER_INITPID=PID so that it is able to run the -// setns code in a single threaded environment joining the existing containers' namespaces. -func ExecIn(args []string, env []string, console string, cmd *exec.Cmd, container *configs.Config, state *configs.State) (int, error) { - var err error - - parent, child, err := newInitPipe() - if err != nil { - return -1, err - } - defer parent.Close() - - cmd.ExtraFiles = []*os.File{child} - cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPID=%d", state.InitPid)) - - if err := cmd.Start(); err != nil { - child.Close() - return -1, err - } - child.Close() - - s, err := cmd.Process.Wait() - if err != nil { - return -1, err - } - if !s.Success() { - return -1, &exec.ExitError{s} - } - - decoder := json.NewDecoder(parent) - var pid *pid - - if err := decoder.Decode(&pid); err != nil { - return -1, err - } - - p, err := os.FindProcess(pid.Pid) - if err != nil { - return -1, err - } - - terminate := func(terr error) (int, error) { - // TODO: log the errors for kill and wait - p.Kill() - p.Wait() - return -1, terr - } - - // Enter cgroups. - if err := EnterCgroups(state, pid.Pid); err != nil { - return terminate(err) - } - - encoder := json.NewEncoder(parent) - - if err := encoder.Encode(container); err != nil { - return terminate(err) - } - - process := processArgs{ - Env: append(env[0:], container.Env...), - Args: args, - ConsolePath: console, - } - if err := encoder.Encode(process); err != nil { - return terminate(err) - } - - return pid.Pid, nil -} - -// Finalize entering into a container and execute a specified command -func InitIn(pipe *os.File) (err error) { - defer func() { - // if we have an error during the initialization of the container's init then send it back to the - // parent process in the form of an initError. - if err != nil { - // ensure that any data sent from the parent is consumed so it doesn't - // receive ECONNRESET when the child writes to the pipe. - ioutil.ReadAll(pipe) - if err := json.NewEncoder(pipe).Encode(initError{ - Message: err.Error(), - }); err != nil { - panic(err) - } - } - // ensure that this pipe is always closed - pipe.Close() - }() - - decoder := json.NewDecoder(pipe) - - var container *configs.Config - if err := decoder.Decode(&container); err != nil { - return err - } - - var process *processArgs - if err := decoder.Decode(&process); err != nil { - return err - } - - if err := FinalizeSetns(container); err != nil { - return err - } - - if err := system.Execv(process.Args[0], process.Args[0:], process.Env); err != nil { - return err - } - - panic("unreachable") -} - -// Finalize expects that the setns calls have been setup and that is has joined an -// existing namespace -func FinalizeSetns(container *configs.Config) error { - // clear the current processes env and replace it with the environment defined on the container - if err := LoadContainerEnvironment(container); err != nil { - return err - } - - if err := setupRlimits(container); err != nil { - return fmt.Errorf("setup rlimits %s", err) - } - - if err := FinalizeNamespace(container); err != nil { - return err - } - - if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil { - return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err) - } - - if container.ProcessLabel != "" { - if err := label.SetProcessLabel(container.ProcessLabel); err != nil { - return err - } - } - - return nil -} - -// SetupContainer is run to setup mounts and networking related operations -// for a user namespace enabled process as a user namespace root doesn't -// have permissions to perform these operations. -// The setup process joins all the namespaces of user namespace enabled init -// except the user namespace, so it run as root in the root user namespace -// to perform these operations. -func SetupContainer(process *processArgs) error { - container := process.Config - networkState := process.NetworkState - - // TODO : move to validation - /* - rootfs, err := utils.ResolveRootfs(container.RootFs) - if err != nil { - return err - } - */ - - // clear the current processes env and replace it with the environment - // defined on the container - if err := LoadContainerEnvironment(container); err != nil { - return err - } - - cloneFlags := GetNamespaceFlags(container.Namespaces) - - if (cloneFlags & syscall.CLONE_NEWNET) == 0 { - if len(container.Networks) != 0 || len(container.Routes) != 0 { - return fmt.Errorf("unable to apply network parameters without network namespace") - } - } else { - if err := setupNetwork(container, networkState); err != nil { - return fmt.Errorf("setup networking %s", err) - } - if err := setupRoute(container); err != nil { - return fmt.Errorf("setup route %s", err) - } - } - - label.Init() - - // InitializeMountNamespace() can be executed only for a new mount namespace - if (cloneFlags & syscall.CLONE_NEWNS) != 0 { - if err := mount.InitializeMountNamespace(container); err != nil { - return fmt.Errorf("setup mount namespace %s", err) - } - } - return nil -} - -func EnterCgroups(state *configs.State, pid int) error { - return cgroups.EnterPid(state.CgroupPaths, pid) -} diff --git a/namespaces/init.go b/namespaces/init.go deleted file mode 100644 index d651352a..00000000 --- a/namespaces/init.go +++ /dev/null @@ -1,465 +0,0 @@ -// +build linux - -package namespaces - -import ( - "encoding/json" - "fmt" - "io/ioutil" - "os" - "strings" - "syscall" - - "github.com/docker/libcontainer/apparmor" - "github.com/docker/libcontainer/configs" - "github.com/docker/libcontainer/console" - "github.com/docker/libcontainer/label" - "github.com/docker/libcontainer/mount" - "github.com/docker/libcontainer/netlink" - "github.com/docker/libcontainer/network" - "github.com/docker/libcontainer/security/capabilities" - "github.com/docker/libcontainer/security/restrict" - "github.com/docker/libcontainer/system" - "github.com/docker/libcontainer/user" - "github.com/docker/libcontainer/utils" -) - -// Process is used for transferring parameters from Exec() to Init() -type processArgs struct { - Args []string `json:"args,omitempty"` - Env []string `json:"environment,omitempty"` - ConsolePath string `json:"console_path,omitempty"` - Config *configs.Config `json:"config,omitempty"` - NetworkState *configs.NetworkState `json:"network_state,omitempty"` -} - -// TODO(vishh): This is part of the libcontainer API and it does much more than just namespaces related work. -// Move this to libcontainer package. -// Init is the init process that first runs inside a new namespace to setup mounts, users, networking, -// and other options required for the new container. -// The caller of Init function has to ensure that the go runtime is locked to an OS thread -// (using runtime.LockOSThread) else system calls like setns called within Init may not work as intended. -func Init(pipe *os.File, setupUserns bool) (err error) { - defer func() { - // if we have an error during the initialization of the container's init then send it back to the - // parent process in the form of an initError. - if err != nil { - // ensure that any data sent from the parent is consumed so it doesn't - // receive ECONNRESET when the child writes to the pipe. - ioutil.ReadAll(pipe) - if err := json.NewEncoder(pipe).Encode(initError{ - Message: err.Error(), - }); err != nil { - panic(err) - } - } - // ensure that this pipe is always closed - pipe.Close() - }() - - uncleanRootfs, err := os.Getwd() - if err != nil { - return err - } - - var process *processArgs - // We always read this as it is a way to sync with the parent as well - if err := json.NewDecoder(pipe).Decode(&process); err != nil { - return err - } - - if setupUserns { - err = SetupContainer(process) - if err == nil { - os.Exit(0) - } else { - os.Exit(1) - } - } - - if process.Config.Namespaces.Contains(configs.NEWUSER) { - return initUserNs(uncleanRootfs, process) - } else { - return initDefault(uncleanRootfs, process) - } -} - -func initDefault(uncleanRootfs string, process *processArgs) (err error) { - container := process.Config - networkState := process.NetworkState - - // TODO: move to validation - /* - rootfs, err := utils.ResolveRootfs(uncleanRootfs) - if err != nil { - return err - } - */ - - // clear the current processes env and replace it with the environment - // defined on the container - if err := LoadContainerEnvironment(container); err != nil { - return err - } - - // join any namespaces via a path to the namespace fd if provided - if err := joinExistingNamespaces(container.Namespaces); err != nil { - return err - } - if process.ConsolePath != "" { - if err := console.OpenAndDup(process.ConsolePath); err != nil { - return err - } - } - if _, err := syscall.Setsid(); err != nil { - return fmt.Errorf("setsid %s", err) - } - if process.ConsolePath != "" { - if err := system.Setctty(); err != nil { - return fmt.Errorf("setctty %s", err) - } - } - - cloneFlags := GetNamespaceFlags(container.Namespaces) - - if (cloneFlags & syscall.CLONE_NEWNET) == 0 { - if len(container.Networks) != 0 || len(container.Routes) != 0 { - return fmt.Errorf("unable to apply network parameters without network namespace") - } - } else { - if err := setupNetwork(container, networkState); err != nil { - return fmt.Errorf("setup networking %s", err) - } - if err := setupRoute(container); err != nil { - return fmt.Errorf("setup route %s", err) - } - } - - if err := setupRlimits(container); err != nil { - return fmt.Errorf("setup rlimits %s", err) - } - - label.Init() - - // InitializeMountNamespace() can be executed only for a new mount namespace - if (cloneFlags & syscall.CLONE_NEWNS) != 0 { - if err := mount.InitializeMountNamespace(container); err != nil { - return err - } - } - - if container.Hostname != "" { - // TODO: (crosbymichael) move this to pre spawn validation - if (cloneFlags & syscall.CLONE_NEWUTS) == 0 { - return fmt.Errorf("unable to set the hostname without UTS namespace") - } - if err := syscall.Sethostname([]byte(container.Hostname)); err != nil { - return fmt.Errorf("unable to sethostname %q: %s", container.Hostname, err) - } - } - - if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil { - return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err) - } - - if err := label.SetProcessLabel(container.ProcessLabel); err != nil { - return fmt.Errorf("set process label %s", err) - } - - // TODO: (crosbymichael) make this configurable at the Config level - if container.RestrictSys { - if (cloneFlags & syscall.CLONE_NEWNS) == 0 { - return fmt.Errorf("unable to restrict access to kernel files without mount namespace") - } - if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil { - return err - } - } - - pdeathSignal, err := system.GetParentDeathSignal() - if err != nil { - return fmt.Errorf("get parent death signal %s", err) - } - - if err := FinalizeNamespace(container); err != nil { - return fmt.Errorf("finalize namespace %s", err) - } - - // FinalizeNamespace can change user/group which clears the parent death - // signal, so we restore it here. - if err := RestoreParentDeathSignal(pdeathSignal); err != nil { - return fmt.Errorf("restore parent death signal %s", err) - } - - return system.Execv(process.Args[0], process.Args[0:], process.Env) -} - -func initUserNs(uncleanRootfs string, process *processArgs) (err error) { - container := process.Config - - // clear the current processes env and replace it with the environment - // defined on the container - if err := LoadContainerEnvironment(container); err != nil { - return err - } - - // join any namespaces via a path to the namespace fd if provided - if err := joinExistingNamespaces(container.Namespaces); err != nil { - return err - } - if process.ConsolePath != "" { - if err := console.OpenAndDup("/dev/console"); err != nil { - return err - } - } - if _, err := syscall.Setsid(); err != nil { - return fmt.Errorf("setsid %s", err) - } - if process.ConsolePath != "" { - if err := system.Setctty(); err != nil { - return fmt.Errorf("setctty %s", err) - } - } - - if container.WorkingDir == "" { - container.WorkingDir = "/" - } - - if err := setupRlimits(container); err != nil { - return fmt.Errorf("setup rlimits %s", err) - } - - cloneFlags := GetNamespaceFlags(container.Namespaces) - - if container.Hostname != "" { - if (cloneFlags & syscall.CLONE_NEWUTS) == 0 { - return fmt.Errorf("unable to set the hostname without UTS namespace") - } - if err := syscall.Sethostname([]byte(container.Hostname)); err != nil { - return fmt.Errorf("unable to sethostname %q: %s", container.Hostname, err) - } - } - - if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil { - return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err) - } - - if err := label.SetProcessLabel(container.ProcessLabel); err != nil { - return fmt.Errorf("set process label %s", err) - } - - if container.RestrictSys { - if (cloneFlags & syscall.CLONE_NEWNS) == 0 { - return fmt.Errorf("unable to restrict access to kernel files without mount namespace") - } - if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil { - return err - } - } - - pdeathSignal, err := system.GetParentDeathSignal() - if err != nil { - return fmt.Errorf("get parent death signal %s", err) - } - - if err := FinalizeNamespace(container); err != nil { - return fmt.Errorf("finalize namespace %s", err) - } - - // FinalizeNamespace can change user/group which clears the parent death - // signal, so we restore it here. - if err := RestoreParentDeathSignal(pdeathSignal); err != nil { - return fmt.Errorf("restore parent death signal %s", err) - } - - return system.Execv(process.Args[0], process.Args[0:], process.Env) -} - -// RestoreParentDeathSignal sets the parent death signal to old. -func RestoreParentDeathSignal(old int) error { - if old == 0 { - return nil - } - - current, err := system.GetParentDeathSignal() - if err != nil { - return fmt.Errorf("get parent death signal %s", err) - } - - if old == current { - return nil - } - - if err := system.ParentDeathSignal(uintptr(old)); err != nil { - return fmt.Errorf("set parent death signal %s", err) - } - - // Signal self if parent is already dead. Does nothing if running in a new - // PID namespace, as Getppid will always return 0. - if syscall.Getppid() == 1 { - return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) - } - - return nil -} - -// SetupUser changes the groups, gid, and uid for the user inside the container -func SetupUser(container *configs.Config) error { - // Set up defaults. - defaultExecUser := user.ExecUser{ - Uid: syscall.Getuid(), - Gid: syscall.Getgid(), - Home: "/", - } - - passwdPath, err := user.GetPasswdPath() - if err != nil { - return err - } - - groupPath, err := user.GetGroupPath() - if err != nil { - return err - } - - execUser, err := user.GetExecUserPath(container.User, &defaultExecUser, passwdPath, groupPath) - if err != nil { - return fmt.Errorf("get supplementary groups %s", err) - } - - suppGroups := append(execUser.Sgids, container.AdditionalGroups...) - - if err := syscall.Setgroups(suppGroups); err != nil { - return fmt.Errorf("setgroups %s", err) - } - - if err := system.Setgid(execUser.Gid); err != nil { - return fmt.Errorf("setgid %s", err) - } - - if err := system.Setuid(execUser.Uid); err != nil { - return fmt.Errorf("setuid %s", err) - } - - // if we didn't get HOME already, set it based on the user's HOME - if envHome := os.Getenv("HOME"); envHome == "" { - if err := os.Setenv("HOME", execUser.Home); err != nil { - return fmt.Errorf("set HOME %s", err) - } - } - - return nil -} - -// setupVethNetwork uses the Network config if it is not nil to initialize -// the new veth interface inside the container for use by changing the name to eth0 -// setting the MTU and IP address along with the default gateway -func setupNetwork(container *configs.Config, networkState *configs.NetworkState) error { - for _, config := range container.Networks { - strategy, err := network.GetStrategy(config.Type) - if err != nil { - return err - } - - err1 := strategy.Initialize(config, networkState) - if err1 != nil { - return err1 - } - } - return nil -} - -func setupRoute(container *configs.Config) error { - for _, config := range container.Routes { - if err := netlink.AddRoute(config.Destination, config.Source, config.Gateway, config.InterfaceName); err != nil { - return err - } - } - return nil -} - -func setupRlimits(container *configs.Config) error { - for _, rlimit := range container.Rlimits { - l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft} - if err := syscall.Setrlimit(rlimit.Type, l); err != nil { - return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err) - } - } - return nil -} - -// FinalizeNamespace drops the caps, sets the correct user -// and working dir, and closes any leaky file descriptors -// before execing the command inside the namespace -func FinalizeNamespace(container *configs.Config) error { - // Ensure that all non-standard fds we may have accidentally - // inherited are marked close-on-exec so they stay out of the - // container - if err := utils.CloseExecFrom(3); err != nil { - return fmt.Errorf("close open file descriptors %s", err) - } - - // drop capabilities in bounding set before changing user - if err := capabilities.DropBoundingSet(container.Capabilities); err != nil { - return fmt.Errorf("drop bounding set %s", err) - } - - // preserve existing capabilities while we change users - if err := system.SetKeepCaps(); err != nil { - return fmt.Errorf("set keep caps %s", err) - } - - if err := SetupUser(container); err != nil { - return fmt.Errorf("setup user %s", err) - } - - if err := system.ClearKeepCaps(); err != nil { - return fmt.Errorf("clear keep caps %s", err) - } - - // drop all other capabilities - if err := capabilities.DropCapabilities(container.Capabilities); err != nil { - return fmt.Errorf("drop capabilities %s", err) - } - - if container.WorkingDir != "" { - if err := syscall.Chdir(container.WorkingDir); err != nil { - return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) - } - } - - return nil -} - -func LoadContainerEnvironment(container *configs.Config) error { - os.Clearenv() - for _, pair := range container.Env { - p := strings.SplitN(pair, "=", 2) - if len(p) < 2 { - return fmt.Errorf("invalid environment '%v'", pair) - } - if err := os.Setenv(p[0], p[1]); err != nil { - return err - } - } - return nil -} - -// joinExistingNamespaces gets all the namespace paths specified for the container and -// does a setns on the namespace fd so that the current process joins the namespace. -func joinExistingNamespaces(namespaces []configs.Namespace) error { - for _, ns := range namespaces { - if ns.Path != "" { - f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0) - if err != nil { - return err - } - err = system.Setns(f.Fd(), uintptr(namespaceInfo[ns.Type])) - f.Close() - if err != nil { - return err - } - } - } - return nil -} diff --git a/namespaces/utils.go b/namespaces/utils.go deleted file mode 100644 index 978a02d8..00000000 --- a/namespaces/utils.go +++ /dev/null @@ -1,48 +0,0 @@ -// +build linux - -package namespaces - -import ( - "os" - "syscall" - - "github.com/docker/libcontainer/configs" -) - -type initError struct { - Message string `json:"message,omitempty"` -} - -func (i initError) Error() string { - return i.Message -} - -var namespaceInfo = map[configs.NamespaceType]int{ - configs.NEWNET: syscall.CLONE_NEWNET, - configs.NEWNS: syscall.CLONE_NEWNS, - configs.NEWUSER: syscall.CLONE_NEWUSER, - configs.NEWIPC: syscall.CLONE_NEWIPC, - configs.NEWUTS: syscall.CLONE_NEWUTS, - configs.NEWPID: syscall.CLONE_NEWPID, -} - -// New returns a newly initialized Pipe for communication between processes -func newInitPipe() (parent *os.File, child *os.File, err error) { - fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) - if err != nil { - return nil, nil, err - } - return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil -} - -// GetNamespaceFlags parses the container's Namespaces options to set the correct -// flags on clone, unshare. This functions returns flags only for new namespaces. -func GetNamespaceFlags(namespaces configs.Namespaces) (flag int) { - for _, v := range namespaces { - if v.Path != "" { - continue - } - flag |= namespaceInfo[v.Type] - } - return flag -} diff --git a/namespaces/nsenter/README.md b/nsenter/README.md similarity index 100% rename from namespaces/nsenter/README.md rename to nsenter/README.md diff --git a/namespaces/nsenter/nsenter.go b/nsenter/nsenter.go similarity index 100% rename from namespaces/nsenter/nsenter.go rename to nsenter/nsenter.go diff --git a/namespaces/nsenter/nsenter_test.go b/nsenter/nsenter_test.go similarity index 100% rename from namespaces/nsenter/nsenter_test.go rename to nsenter/nsenter_test.go diff --git a/namespaces/nsenter/nsenter_unsupported.go b/nsenter/nsenter_unsupported.go similarity index 100% rename from namespaces/nsenter/nsenter_unsupported.go rename to nsenter/nsenter_unsupported.go diff --git a/namespaces/nsenter/nsexec.c b/nsenter/nsexec.c similarity index 100% rename from namespaces/nsenter/nsexec.c rename to nsenter/nsexec.c diff --git a/nsinit/exec.go b/nsinit/exec.go index 3a91fa4e..f710a9b7 100644 --- a/nsinit/exec.go +++ b/nsinit/exec.go @@ -3,14 +3,55 @@ package main import ( "io" "os" + "os/signal" "syscall" "github.com/codegangsta/cli" "github.com/docker/docker/pkg/term" "github.com/docker/libcontainer" + "github.com/docker/libcontainer/configs" consolepkg "github.com/docker/libcontainer/console" ) +type tty struct { + master *os.File + console string + state *term.State +} + +func (t *tty) Close() error { + if t.master != nil { + t.master.Close() + } + if t.state != nil { + term.RestoreTerminal(os.Stdin.Fd(), t.state) + } + return nil +} + +func (t *tty) set(config *configs.Config) { + config.Console = t.console +} + +func (t *tty) attach(process *libcontainer.Process) { + if t.master != nil { + process.Stderr = nil + process.Stdout = nil + process.Stdin = nil + } +} + +func (t *tty) resize() error { + if t.master == nil { + return nil + } + ws, err := term.GetWinsize(os.Stdin.Fd()) + if err != nil { + return err + } + return term.SetWinsize(t.master.Fd(), ws) +} + var execCommand = cli.Command{ Name: "exec", Usage: "execute a new command inside a container", @@ -23,24 +64,14 @@ var execCommand = cli.Command{ } func execAction(context *cli.Context) { - var ( - master *os.File - console string - err error - - sigc = make(chan os.Signal, 10) - - stdin = os.Stdin - stdout = os.Stdout - stderr = os.Stderr - - exitCode int - ) - factory, err := loadFactory(context) if err != nil { fatal(err) } + tty, err := newTty(context) + if err != nil { + fatal(err) + } container, err := factory.Load(context.String("id")) if err != nil { if lerr, ok := err.(libcontainer.Error); !ok || lerr.Code() != libcontainer.ContainerNotExists { @@ -50,46 +81,22 @@ func execAction(context *cli.Context) { if err != nil { fatal(err) } - if context.Bool("tty") { - stdin = nil - stdout = nil - stderr = nil - if master, console, err = consolepkg.CreateMasterAndConsole(); err != nil { - fatal(err) - } - go io.Copy(master, os.Stdin) - go io.Copy(os.Stdout, master) - state, err := term.SetRawTerminal(os.Stdin.Fd()) - if err != nil { - fatal(err) - } - defer term.RestoreTerminal(os.Stdin.Fd(), state) - config.Console = console - } + tty.set(config) if container, err = factory.Create(context.String("id"), config); err != nil { fatal(err) } } + go handleSignals(container, tty) process := &libcontainer.Process{ Args: context.Args(), - Stdin: stdin, - Stdout: stdout, - Stderr: stderr, + Stdin: os.Stdin, + Stdout: os.Stdout, + Stderr: os.Stderr, } + tty.attach(process) if _, err := container.Start(process); err != nil { fatal(err) } - go func() { - resizeTty(master) - for sig := range sigc { - switch sig { - case syscall.SIGWINCH: - resizeTty(master) - default: - container.Signal(sig) - } - } - }() status, err := container.Wait() if err != nil { fatal(err) @@ -97,6 +104,11 @@ func execAction(context *cli.Context) { if err := container.Destroy(); err != nil { fatal(err) } + exit(status) +} + +func exit(status syscall.WaitStatus) { + var exitCode int if status.Exited() { exitCode = status.ExitStatus() } else if status.Signaled() { @@ -107,13 +119,37 @@ func execAction(context *cli.Context) { os.Exit(exitCode) } -func resizeTty(master *os.File) { - if master == nil { - return +func handleSignals(container libcontainer.Container, tty *tty) { + sigc := make(chan os.Signal, 10) + signal.Notify(sigc) + tty.resize() + for sig := range sigc { + switch sig { + case syscall.SIGWINCH: + tty.resize() + default: + container.Signal(sig) + } } - ws, err := term.GetWinsize(os.Stdin.Fd()) - if err != nil { - return - } - term.SetWinsize(master.Fd(), ws) +} + +func newTty(context *cli.Context) (*tty, error) { + if context.Bool("tty") { + master, console, err := consolepkg.CreateMasterAndConsole() + if err != nil { + return nil, err + } + go io.Copy(master, os.Stdin) + go io.Copy(os.Stdout, master) + state, err := term.SetRawTerminal(os.Stdin.Fd()) + if err != nil { + return nil, err + } + return &tty{ + master: master, + console: console, + state: state, + }, nil + } + return &tty{}, nil } diff --git a/nsinit/init.go b/nsinit/init.go index 2b8784cc..d45d1287 100644 --- a/nsinit/init.go +++ b/nsinit/init.go @@ -5,7 +5,7 @@ import ( "github.com/codegangsta/cli" "github.com/docker/libcontainer" - _ "github.com/docker/libcontainer/namespaces/nsenter" + _ "github.com/docker/libcontainer/nsenter" ) var initCommand = cli.Command{