Remove namespaces package

Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
This commit is contained in:
Michael Crosby 2015-01-31 21:21:06 -08:00
parent 8191d4d60f
commit bbeae7445a
15 changed files with 952 additions and 1085 deletions

View File

@ -1,5 +1,9 @@
package configs
import (
"syscall"
)
type NamespaceType string
const (
@ -18,6 +22,10 @@ type Namespace struct {
Path string `json:"path,omitempty"`
}
func (n *Namespace) Syscall() int {
return namespaceInfo[n.Type]
}
type Namespaces []Namespace
func (n *Namespaces) Remove(t NamespaceType) bool {
@ -50,3 +58,25 @@ func (n *Namespaces) index(t NamespaceType) int {
func (n *Namespaces) Contains(t NamespaceType) bool {
return n.index(t) != -1
}
var namespaceInfo = map[NamespaceType]int{
NEWNET: syscall.CLONE_NEWNET,
NEWNS: syscall.CLONE_NEWNS,
NEWUSER: syscall.CLONE_NEWUSER,
NEWIPC: syscall.CLONE_NEWIPC,
NEWUTS: syscall.CLONE_NEWUTS,
NEWPID: syscall.CLONE_NEWPID,
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This functions returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
var flag int
for _, v := range *n {
if v.Path != "" {
continue
}
flag |= namespaceInfo[v.Type]
}
return uintptr(flag)
}

View File

@ -33,7 +33,7 @@ type Container interface {
Status() (configs.Status, error)
// Returns the current config of the container.
Config() *configs.Config
Config() configs.Config
// Returns the PIDs inside this container. The PIDs are in the namespace of the calling process.
//

View File

@ -5,18 +5,35 @@ package libcontainer
import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"syscall"
"github.com/docker/libcontainer/apparmor"
"github.com/docker/libcontainer/cgroups"
"github.com/docker/libcontainer/configs"
"github.com/docker/libcontainer/namespaces"
"github.com/docker/libcontainer/label"
"github.com/docker/libcontainer/mount"
"github.com/docker/libcontainer/network"
"github.com/docker/libcontainer/system"
"github.com/golang/glog"
)
const (
EXIT_SIGNAL_OFFSET = 128
)
type initError struct {
Message string `json:"message,omitempty"`
}
func (i initError) Error() string {
return i.Message
}
type linuxContainer struct {
id string
root string
@ -26,12 +43,14 @@ type linuxContainer struct {
initArgs []string
}
// ID returns the container's unique ID
func (c *linuxContainer) ID() string {
return c.id
}
func (c *linuxContainer) Config() *configs.Config {
return c.config
// Config returns the container's configuration
func (c *linuxContainer) Config() configs.Config {
return *c.config
}
func (c *linuxContainer) Status() (configs.Status, error) {
@ -96,48 +115,158 @@ func (c *linuxContainer) Start(process *Process) (int, error) {
if status != configs.Destroyed {
glog.Info("start new container process")
// TODO: (crosbymichael) check out console use for execin
return namespaces.ExecIn(process.Args, c.config.Env, "", cmd, c.config, c.state)
//return namespaces.ExecIn(process.Args, c.config.Env, "", cmd, c.config, c.state)
return c.startNewProcess(cmd, process.Args)
}
if err := c.startInitProcess(cmd, process); err != nil {
if err := c.startInitProcess(cmd, process.Args); err != nil {
return -1, err
}
return c.state.InitPid, nil
}
func (c *linuxContainer) updateStateFile() error {
fnew := filepath.Join(c.root, fmt.Sprintf("%s.new", stateFilename))
f, err := os.Create(fnew)
func (c *linuxContainer) startNewProcess(cmd *exec.Cmd, args []string) (int, error) {
var err error
parent, child, err := newInitPipe()
if err != nil {
return newGenericError(err, SystemError)
return -1, err
}
err = json.NewEncoder(f).Encode(c.state)
defer parent.Close()
cmd.ExtraFiles = []*os.File{child}
cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPID=%d", c.state.InitPid))
if err := cmd.Start(); err != nil {
child.Close()
return -1, err
}
child.Close()
s, err := cmd.Process.Wait()
if err != nil {
f.Close()
os.Remove(fnew)
return newGenericError(err, SystemError)
return -1, err
}
f.Close()
fname := filepath.Join(c.root, stateFilename)
if err := os.Rename(fnew, fname); err != nil {
return newGenericError(err, SystemError)
if !s.Success() {
return -1, &exec.ExitError{s}
}
return nil
decoder := json.NewDecoder(parent)
var pid *pid
if err := decoder.Decode(&pid); err != nil {
return -1, err
}
p, err := os.FindProcess(pid.Pid)
if err != nil {
return -1, err
}
terminate := func(terr error) (int, error) {
// TODO: log the errors for kill and wait
p.Kill()
p.Wait()
return -1, terr
}
// Enter cgroups.
if err := enterCgroups(c.state, pid.Pid); err != nil {
return terminate(err)
}
encoder := json.NewEncoder(parent)
if err := encoder.Encode(c.config); err != nil {
return terminate(err)
}
process := processArgs{
Config: c.config,
Args: args,
}
if err := encoder.Encode(process); err != nil {
return terminate(err)
}
return pid.Pid, nil
}
func (c *linuxContainer) startInitProcess(cmd *exec.Cmd, config *Process) error {
err := namespaces.Exec(config.Args, c.config.Env, c.config.Console, cmd, c.config, c.cgroupManager, c.state)
func (c *linuxContainer) startInitProcess(cmd *exec.Cmd, args []string) error {
// create a pipe so that we can syncronize with the namespaced process and
// pass the state and configuration to the child process
parent, child, err := newInitPipe()
if err != nil {
return err
}
defer parent.Close()
cmd.ExtraFiles = []*os.File{child}
cmd.SysProcAttr.Cloneflags = c.config.Namespaces.CloneFlags()
if c.config.Namespaces.Contains(configs.NEWUSER) {
addUidGidMappings(cmd.SysProcAttr, c.config)
// Default to root user when user namespaces are enabled.
if cmd.SysProcAttr.Credential == nil {
cmd.SysProcAttr.Credential = &syscall.Credential{}
}
}
glog.Info("starting container init process")
err = cmd.Start()
child.Close()
if err != nil {
return newGenericError(err, SystemError)
}
wait := func() (*os.ProcessState, error) {
ps, err := cmd.Process.Wait()
// we should kill all processes in cgroup when init is died if we use
// host PID namespace
if !c.config.Namespaces.Contains(configs.NEWPID) {
c.killAllPids()
}
return ps, newGenericError(err, SystemError)
}
terminate := func(terr error) error {
// TODO: log the errors for kill and wait
cmd.Process.Kill()
wait()
return terr
}
started, err := system.GetProcessStartTime(cmd.Process.Pid)
if err != nil {
return terminate(err)
}
// Do this before syncing with child so that no children
// can escape the cgroup
if err := c.cgroupManager.Apply(cmd.Process.Pid); err != nil {
return terminate(err)
}
defer func() {
if err != nil {
c.cgroupManager.Destroy()
}
}()
var networkState configs.NetworkState
if err := c.initializeNetworking(cmd.Process.Pid, &networkState); err != nil {
return terminate(err)
}
process := processArgs{
Args: args,
Config: c.config,
NetworkState: &networkState,
}
// Start the setup process to setup the init process
if c.config.Namespaces.Contains(configs.NEWUSER) {
if err = executeSetupCmd(cmd.Args, cmd.Process.Pid, c.config, &process, &networkState); err != nil {
return terminate(err)
}
}
// send the state to the container's init process then shutdown writes for the parent
if err := json.NewEncoder(parent).Encode(process); err != nil {
return terminate(err)
}
// shutdown writes for the parent side of the pipe
if err := syscall.Shutdown(int(parent.Fd()), syscall.SHUT_WR); err != nil {
return terminate(err)
}
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *initError
if err := json.NewDecoder(parent).Decode(&ierr); err != nil && err != io.EOF {
return terminate(err)
}
if ierr != nil {
return terminate(ierr)
}
err = c.updateStateFile()
if err != nil {
// FIXME c.Kill()
return err
}
c.state.InitPid = cmd.Process.Pid
c.state.InitStartTime = started
c.state.NetworkState = networkState
c.state.CgroupPaths = c.cgroupManager.GetPaths()
return nil
}
@ -179,3 +308,282 @@ func (c *linuxContainer) Wait() (syscall.WaitStatus, error) {
func (c *linuxContainer) OOM() (<-chan struct{}, error) {
return NotifyOnOOM(c.state)
}
func (c *linuxContainer) updateStateFile() error {
fnew := filepath.Join(c.root, fmt.Sprintf("%s.new", stateFilename))
f, err := os.Create(fnew)
if err != nil {
return newGenericError(err, SystemError)
}
defer f.Close()
if err := json.NewEncoder(f).Encode(c.state); err != nil {
f.Close()
os.Remove(fnew)
return newGenericError(err, SystemError)
}
fname := filepath.Join(c.root, stateFilename)
if err := os.Rename(fnew, fname); err != nil {
return newGenericError(err, SystemError)
}
return nil
}
// New returns a newly initialized Pipe for communication between processes
func newInitPipe() (parent *os.File, child *os.File, err error) {
fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil
}
// Converts IDMap to SysProcIDMap array and adds it to SysProcAttr.
func addUidGidMappings(sys *syscall.SysProcAttr, container *configs.Config) {
if container.UidMappings != nil {
sys.UidMappings = make([]syscall.SysProcIDMap, len(container.UidMappings))
for i, um := range container.UidMappings {
sys.UidMappings[i].ContainerID = um.ContainerID
sys.UidMappings[i].HostID = um.HostID
sys.UidMappings[i].Size = um.Size
}
}
if container.GidMappings != nil {
sys.GidMappings = make([]syscall.SysProcIDMap, len(container.GidMappings))
for i, gm := range container.GidMappings {
sys.GidMappings[i].ContainerID = gm.ContainerID
sys.GidMappings[i].HostID = gm.HostID
sys.GidMappings[i].Size = gm.Size
}
}
}
// killAllPids iterates over all of the container's processes
// sending a SIGKILL to each process.
func (c *linuxContainer) killAllPids() error {
glog.Info("killing all processes in container")
var procs []*os.Process
c.cgroupManager.Freeze(configs.Frozen)
pids, err := c.cgroupManager.GetPids()
if err != nil {
return err
}
for _, pid := range pids {
// TODO: log err without aborting if we are unable to find
// a single PID
if p, err := os.FindProcess(pid); err == nil {
procs = append(procs, p)
p.Kill()
}
}
c.cgroupManager.Freeze(configs.Thawed)
for _, p := range procs {
p.Wait()
}
return err
}
// initializeNetworking creates the container's network stack outside of the namespace and moves
// interfaces into the container's net namespaces if necessary
func (c *linuxContainer) initializeNetworking(nspid int, networkState *configs.NetworkState) error {
glog.Info("initailzing container's network stack")
for _, config := range c.config.Networks {
strategy, err := network.GetStrategy(config.Type)
if err != nil {
return err
}
if err := strategy.Create(config, nspid, networkState); err != nil {
return err
}
}
return nil
}
func executeSetupCmd(args []string, ppid int, container *configs.Config, process *processArgs, networkState *configs.NetworkState) error {
command := exec.Command(args[0], args[1:]...)
parent, child, err := newInitPipe()
if err != nil {
return err
}
defer parent.Close()
command.ExtraFiles = []*os.File{child}
command.Dir = container.RootFs
command.Env = append(command.Env,
fmt.Sprintf("_LIBCONTAINER_INITPID=%d", ppid),
fmt.Sprintf("_LIBCONTAINER_USERNS=1"))
err = command.Start()
child.Close()
if err != nil {
return err
}
s, err := command.Process.Wait()
if err != nil {
return err
}
if !s.Success() {
return &exec.ExitError{s}
}
decoder := json.NewDecoder(parent)
var pid *pid
if err := decoder.Decode(&pid); err != nil {
return err
}
p, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
terminate := func(terr error) error {
// TODO: log the errors for kill and wait
p.Kill()
p.Wait()
return terr
}
// send the state to the container's init process then shutdown writes for the parent
if err := json.NewEncoder(parent).Encode(process); err != nil {
return terminate(err)
}
// shutdown writes for the parent side of the pipe
if err := syscall.Shutdown(int(parent.Fd()), syscall.SHUT_WR); err != nil {
return terminate(err)
}
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *initError
if err := decoder.Decode(&ierr); err != nil && err != io.EOF {
return terminate(err)
}
if ierr != nil {
return ierr
}
s, err = p.Wait()
if err != nil {
return err
}
if !s.Success() {
return &exec.ExitError{s}
}
return nil
}
type pid struct {
Pid int `json:"Pid"`
}
// Finalize entering into a container and execute a specified command
func InitIn(pipe *os.File) (err error) {
defer func() {
// if we have an error during the initialization of the container's init then send it back to the
// parent process in the form of an initError.
if err != nil {
// ensure that any data sent from the parent is consumed so it doesn't
// receive ECONNRESET when the child writes to the pipe.
ioutil.ReadAll(pipe)
if err := json.NewEncoder(pipe).Encode(initError{
Message: err.Error(),
}); err != nil {
panic(err)
}
}
// ensure that this pipe is always closed
pipe.Close()
}()
decoder := json.NewDecoder(pipe)
var config *configs.Config
if err := decoder.Decode(&config); err != nil {
return err
}
var process *processArgs
if err := decoder.Decode(&process); err != nil {
return err
}
if err := finalizeSetns(config); err != nil {
return err
}
if err := system.Execv(process.Args[0], process.Args[0:], config.Env); err != nil {
return err
}
panic("unreachable")
}
// finalize expects that the setns calls have been setup and that is has joined an
// existing namespace
func finalizeSetns(container *configs.Config) error {
// clear the current processes env and replace it with the environment defined on the container
if err := loadContainerEnvironment(container); err != nil {
return err
}
if err := setupRlimits(container); err != nil {
return fmt.Errorf("setup rlimits %s", err)
}
if err := finalizeNamespace(container); err != nil {
return err
}
if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil {
return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err)
}
if container.ProcessLabel != "" {
if err := label.SetProcessLabel(container.ProcessLabel); err != nil {
return err
}
}
return nil
}
// SetupContainer is run to setup mounts and networking related operations
// for a user namespace enabled process as a user namespace root doesn't
// have permissions to perform these operations.
// The setup process joins all the namespaces of user namespace enabled init
// except the user namespace, so it run as root in the root user namespace
// to perform these operations.
func SetupContainer(process *processArgs) error {
container := process.Config
networkState := process.NetworkState
// TODO : move to validation
/*
rootfs, err := utils.ResolveRootfs(container.RootFs)
if err != nil {
return err
}
*/
// clear the current processes env and replace it with the environment
// defined on the container
if err := loadContainerEnvironment(container); err != nil {
return err
}
cloneFlags := container.Namespaces.CloneFlags()
if (cloneFlags & syscall.CLONE_NEWNET) == 0 {
if len(container.Networks) != 0 || len(container.Routes) != 0 {
return fmt.Errorf("unable to apply network parameters without network namespace")
}
} else {
if err := setupNetwork(container, networkState); err != nil {
return fmt.Errorf("setup networking %s", err)
}
if err := setupRoute(container); err != nil {
return fmt.Errorf("setup route %s", err)
}
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
if (cloneFlags & syscall.CLONE_NEWNS) != 0 {
if err := mount.InitializeMountNamespace(container); err != nil {
return fmt.Errorf("setup mount namespace %s", err)
}
}
return nil
}
func enterCgroups(state *configs.State, pid int) error {
return cgroups.EnterPid(state.CgroupPaths, pid)
}

View File

@ -5,15 +5,28 @@ package libcontainer
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"regexp"
"strings"
"syscall"
"github.com/golang/glog"
"github.com/docker/libcontainer/apparmor"
cgroups "github.com/docker/libcontainer/cgroups/manager"
"github.com/docker/libcontainer/configs"
"github.com/docker/libcontainer/namespaces"
"github.com/docker/libcontainer/console"
"github.com/docker/libcontainer/label"
"github.com/docker/libcontainer/mount"
"github.com/docker/libcontainer/netlink"
"github.com/docker/libcontainer/network"
"github.com/docker/libcontainer/security/capabilities"
"github.com/docker/libcontainer/security/restrict"
"github.com/docker/libcontainer/system"
"github.com/docker/libcontainer/user"
"github.com/docker/libcontainer/utils"
)
const (
@ -26,6 +39,13 @@ var (
maxIdLen = 1024
)
// Process is used for transferring parameters from Exec() to Init()
type processArgs struct {
Args []string `json:"args,omitempty"`
Config *configs.Config `json:"config,omitempty"`
NetworkState *configs.NetworkState `json:"network_state,omitempty"`
}
// New returns a linux based container factory based in the root directory.
func New(root string, initArgs []string) (Factory, error) {
if root != "" {
@ -116,16 +136,50 @@ func (l *linuxFactory) Load(id string) (Container, error) {
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (f *linuxFactory) StartInitialization(pipefd uintptr) (err error) {
func (l *linuxFactory) StartInitialization(pipefd uintptr) (err error) {
pipe := os.NewFile(uintptr(pipefd), "pipe")
setupUserns := os.Getenv("_LIBCONTAINER_USERNS")
setupUserns := os.Getenv("_LIBCONTAINER_USERNS") != ""
pid := os.Getenv("_LIBCONTAINER_INITPID")
if pid != "" && setupUserns == "" {
return namespaces.InitIn(pipe)
if pid != "" && !setupUserns {
return InitIn(pipe)
}
return namespaces.Init(pipe, setupUserns != "")
defer func() {
// if we have an error during the initialization of the container's init then send it back to the
// parent process in the form of an initError.
if err != nil {
// ensure that any data sent from the parent is consumed so it doesn't
// receive ECONNRESET when the child writes to the pipe.
ioutil.ReadAll(pipe)
if err := json.NewEncoder(pipe).Encode(initError{
Message: err.Error(),
}); err != nil {
panic(err)
}
}
// ensure that this pipe is always closed
pipe.Close()
}()
uncleanRootfs, err := os.Getwd()
if err != nil {
return err
}
var process *processArgs
// We always read this as it is a way to sync with the parent as well
if err := json.NewDecoder(pipe).Decode(&process); err != nil {
return err
}
if setupUserns {
err = SetupContainer(process)
if err == nil {
os.Exit(0)
} else {
os.Exit(1)
}
}
if process.Config.Namespaces.Contains(configs.NEWUSER) {
return l.initUserNs(uncleanRootfs, process)
}
return l.initDefault(uncleanRootfs, process)
}
func (l *linuxFactory) loadContainerConfig(root string) (*configs.Config, error) {
@ -137,7 +191,6 @@ func (l *linuxFactory) loadContainerConfig(root string) (*configs.Config, error)
return nil, newGenericError(err, SystemError)
}
defer f.Close()
var config *configs.Config
if err := json.NewDecoder(f).Decode(&config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
@ -154,7 +207,6 @@ func (l *linuxFactory) loadContainerState(root string) (*configs.State, error) {
return nil, newGenericError(err, SystemError)
}
defer f.Close()
var state *configs.State
if err := json.NewDecoder(f).Decode(&state); err != nil {
return nil, newGenericError(err, SystemError)
@ -171,3 +223,337 @@ func (l *linuxFactory) validateID(id string) error {
}
return nil
}
func (l *linuxFactory) initDefault(uncleanRootfs string, process *processArgs) (err error) {
config := process.Config
networkState := process.NetworkState
// TODO: move to validation
/*
rootfs, err := utils.ResolveRootfs(uncleanRootfs)
if err != nil {
return err
}
*/
// clear the current processes env and replace it with the environment
// defined on the container
if err := loadContainerEnvironment(config); err != nil {
return err
}
// join any namespaces via a path to the namespace fd if provided
if err := joinExistingNamespaces(config.Namespaces); err != nil {
return err
}
if config.Console != "" {
if err := console.OpenAndDup(config.Console); err != nil {
return err
}
}
if _, err := syscall.Setsid(); err != nil {
return fmt.Errorf("setsid %s", err)
}
if config.Console != "" {
if err := system.Setctty(); err != nil {
return fmt.Errorf("setctty %s", err)
}
}
cloneFlags := config.Namespaces.CloneFlags()
if (cloneFlags & syscall.CLONE_NEWNET) == 0 {
if len(config.Networks) != 0 || len(config.Routes) != 0 {
return fmt.Errorf("unable to apply network parameters without network namespace")
}
} else {
if err := setupNetwork(config, networkState); err != nil {
return fmt.Errorf("setup networking %s", err)
}
if err := setupRoute(config); err != nil {
return fmt.Errorf("setup route %s", err)
}
}
if err := setupRlimits(config); err != nil {
return fmt.Errorf("setup rlimits %s", err)
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
if (cloneFlags & syscall.CLONE_NEWNS) != 0 {
if err := mount.InitializeMountNamespace(config); err != nil {
return err
}
}
if config.Hostname != "" {
// TODO: (crosbymichael) move this to pre spawn validation
if (cloneFlags & syscall.CLONE_NEWUTS) == 0 {
return fmt.Errorf("unable to set the hostname without UTS namespace")
}
if err := syscall.Sethostname([]byte(config.Hostname)); err != nil {
return fmt.Errorf("unable to sethostname %q: %s", config.Hostname, err)
}
}
if err := apparmor.ApplyProfile(config.AppArmorProfile); err != nil {
return fmt.Errorf("set apparmor profile %s: %s", config.AppArmorProfile, err)
}
if err := label.SetProcessLabel(config.ProcessLabel); err != nil {
return fmt.Errorf("set process label %s", err)
}
// TODO: (crosbymichael) make this configurable at the Config level
if config.RestrictSys {
if (cloneFlags & syscall.CLONE_NEWNS) == 0 {
return fmt.Errorf("unable to restrict access to kernel files without mount namespace")
}
if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil {
return err
}
}
pdeathSignal, err := system.GetParentDeathSignal()
if err != nil {
return fmt.Errorf("get parent death signal %s", err)
}
if err := finalizeNamespace(config); err != nil {
return fmt.Errorf("finalize namespace %s", err)
}
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := restoreParentDeathSignal(pdeathSignal); err != nil {
return fmt.Errorf("restore parent death signal %s", err)
}
return system.Execv(process.Args[0], process.Args[0:], config.Env)
}
func (l *linuxFactory) initUserNs(uncleanRootfs string, process *processArgs) (err error) {
config := process.Config
// clear the current processes env and replace it with the environment
// defined on the config
if err := loadContainerEnvironment(config); err != nil {
return err
}
// join any namespaces via a path to the namespace fd if provided
if err := joinExistingNamespaces(config.Namespaces); err != nil {
return err
}
if config.Console != "" {
if err := console.OpenAndDup("/dev/console"); err != nil {
return err
}
}
if _, err := syscall.Setsid(); err != nil {
return fmt.Errorf("setsid %s", err)
}
if config.Console != "" {
if err := system.Setctty(); err != nil {
return fmt.Errorf("setctty %s", err)
}
}
if config.WorkingDir == "" {
config.WorkingDir = "/"
}
if err := setupRlimits(config); err != nil {
return fmt.Errorf("setup rlimits %s", err)
}
cloneFlags := config.Namespaces.CloneFlags()
if config.Hostname != "" {
// TODO: move validation
if (cloneFlags & syscall.CLONE_NEWUTS) == 0 {
return fmt.Errorf("unable to set the hostname without UTS namespace")
}
if err := syscall.Sethostname([]byte(config.Hostname)); err != nil {
return fmt.Errorf("unable to sethostname %q: %s", config.Hostname, err)
}
}
if err := apparmor.ApplyProfile(config.AppArmorProfile); err != nil {
return fmt.Errorf("set apparmor profile %s: %s", config.AppArmorProfile, err)
}
if err := label.SetProcessLabel(config.ProcessLabel); err != nil {
return fmt.Errorf("set process label %s", err)
}
if config.RestrictSys {
if (cloneFlags & syscall.CLONE_NEWNS) == 0 {
return fmt.Errorf("unable to restrict access to kernel files without mount namespace")
}
if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil {
return err
}
}
pdeathSignal, err := system.GetParentDeathSignal()
if err != nil {
return fmt.Errorf("get parent death signal %s", err)
}
if err := finalizeNamespace(config); err != nil {
return fmt.Errorf("finalize namespace %s", err)
}
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := restoreParentDeathSignal(pdeathSignal); err != nil {
return fmt.Errorf("restore parent death signal %s", err)
}
return system.Execv(process.Args[0], process.Args[0:], config.Env)
}
// restoreParentDeathSignal sets the parent death signal to old.
func restoreParentDeathSignal(old int) error {
if old == 0 {
return nil
}
current, err := system.GetParentDeathSignal()
if err != nil {
return fmt.Errorf("get parent death signal %s", err)
}
if old == current {
return nil
}
if err := system.ParentDeathSignal(uintptr(old)); err != nil {
return fmt.Errorf("set parent death signal %s", err)
}
// Signal self if parent is already dead. Does nothing if running in a new
// PID namespace, as Getppid will always return 0.
if syscall.Getppid() == 1 {
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
}
return nil
}
// setupUser changes the groups, gid, and uid for the user inside the container
func setupUser(config *configs.Config) error {
// Set up defaults.
defaultExecUser := user.ExecUser{
Uid: syscall.Getuid(),
Gid: syscall.Getgid(),
Home: "/",
}
passwdPath, err := user.GetPasswdPath()
if err != nil {
return err
}
groupPath, err := user.GetGroupPath()
if err != nil {
return err
}
execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
if err != nil {
return fmt.Errorf("get supplementary groups %s", err)
}
suppGroups := append(execUser.Sgids, config.AdditionalGroups...)
if err := syscall.Setgroups(suppGroups); err != nil {
return fmt.Errorf("setgroups %s", err)
}
if err := system.Setgid(execUser.Gid); err != nil {
return fmt.Errorf("setgid %s", err)
}
if err := system.Setuid(execUser.Uid); err != nil {
return fmt.Errorf("setuid %s", err)
}
// if we didn't get HOME already, set it based on the user's HOME
if envHome := os.Getenv("HOME"); envHome == "" {
if err := os.Setenv("HOME", execUser.Home); err != nil {
return fmt.Errorf("set HOME %s", err)
}
}
return nil
}
// setupVethNetwork uses the Network config if it is not nil to initialize
// the new veth interface inside the container for use by changing the name to eth0
// setting the MTU and IP address along with the default gateway
func setupNetwork(config *configs.Config, networkState *configs.NetworkState) error {
for _, config := range config.Networks {
strategy, err := network.GetStrategy(config.Type)
if err != nil {
return err
}
err1 := strategy.Initialize(config, networkState)
if err1 != nil {
return err1
}
}
return nil
}
func setupRoute(config *configs.Config) error {
for _, config := range config.Routes {
if err := netlink.AddRoute(config.Destination, config.Source, config.Gateway, config.InterfaceName); err != nil {
return err
}
}
return nil
}
func setupRlimits(config *configs.Config) error {
for _, rlimit := range config.Rlimits {
l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
}
}
return nil
}
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaky file descriptors
// before execing the command inside the namespace
func finalizeNamespace(config *configs.Config) error {
// Ensure that all non-standard fds we may have accidentally
// inherited are marked close-on-exec so they stay out of the
// container
if err := utils.CloseExecFrom(3); err != nil {
return fmt.Errorf("close open file descriptors %s", err)
}
// drop capabilities in bounding set before changing user
if err := capabilities.DropBoundingSet(config.Capabilities); err != nil {
return fmt.Errorf("drop bounding set %s", err)
}
// preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil {
return fmt.Errorf("set keep caps %s", err)
}
if err := setupUser(config); err != nil {
return fmt.Errorf("setup user %s", err)
}
if err := system.ClearKeepCaps(); err != nil {
return fmt.Errorf("clear keep caps %s", err)
}
// drop all other capabilities
if err := capabilities.DropCapabilities(config.Capabilities); err != nil {
return fmt.Errorf("drop capabilities %s", err)
}
if config.WorkingDir != "" {
if err := syscall.Chdir(config.WorkingDir); err != nil {
return fmt.Errorf("chdir to %s %s", config.WorkingDir, err)
}
}
return nil
}
func loadContainerEnvironment(config *configs.Config) error {
os.Clearenv()
for _, pair := range config.Env {
p := strings.SplitN(pair, "=", 2)
if len(p) < 2 {
return fmt.Errorf("invalid environment '%v'", pair)
}
if err := os.Setenv(p[0], p[1]); err != nil {
return err
}
}
return nil
}
// joinExistingNamespaces gets all the namespace paths specified for the container and
// does a setns on the namespace fd so that the current process joins the namespace.
func joinExistingNamespaces(namespaces []configs.Namespace) error {
for _, ns := range namespaces {
if ns.Path != "" {
f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0)
if err != nil {
return err
}
err = system.Setns(f.Fd(), uintptr(ns.Syscall()))
f.Close()
if err != nil {
return err
}
}
}
return nil
}

View File

@ -1,262 +0,0 @@
// +build linux
package namespaces
import (
"encoding/json"
"fmt"
"io"
"os"
"os/exec"
"syscall"
"github.com/docker/libcontainer/cgroups"
"github.com/docker/libcontainer/configs"
"github.com/docker/libcontainer/network"
"github.com/docker/libcontainer/system"
)
const (
EXIT_SIGNAL_OFFSET = 128
)
func executeSetupCmd(args []string, ppid int, container *configs.Config, process *processArgs, networkState *configs.NetworkState) error {
command := exec.Command(args[0], args[1:]...)
parent, child, err := newInitPipe()
if err != nil {
return err
}
defer parent.Close()
command.ExtraFiles = []*os.File{child}
command.Dir = container.RootFs
command.Env = append(command.Env,
fmt.Sprintf("_LIBCONTAINER_INITPID=%d", ppid),
fmt.Sprintf("_LIBCONTAINER_USERNS=1"))
err = command.Start()
child.Close()
if err != nil {
return err
}
s, err := command.Process.Wait()
if err != nil {
return err
}
if !s.Success() {
return &exec.ExitError{s}
}
decoder := json.NewDecoder(parent)
var pid *pid
if err := decoder.Decode(&pid); err != nil {
return err
}
p, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
terminate := func(terr error) error {
// TODO: log the errors for kill and wait
p.Kill()
p.Wait()
return terr
}
// send the state to the container's init process then shutdown writes for the parent
if err := json.NewEncoder(parent).Encode(process); err != nil {
return terminate(err)
}
// shutdown writes for the parent side of the pipe
if err := syscall.Shutdown(int(parent.Fd()), syscall.SHUT_WR); err != nil {
return terminate(err)
}
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *initError
if err := decoder.Decode(&ierr); err != nil && err != io.EOF {
return terminate(err)
}
if ierr != nil {
return ierr
}
s, err = p.Wait()
if err != nil {
return err
}
if !s.Success() {
return &exec.ExitError{s}
}
return nil
}
// TODO(vishh): This is part of the libcontainer API and it does much more than just namespaces related work.
// Move this to libcontainer package.
// Exec performs setup outside of a namespace so that a container can be
// executed. Exec is a high level function for working with container namespaces.
func Exec(args []string, env []string, console string, command *exec.Cmd, container *configs.Config, cgroupManager cgroups.Manager, state *configs.State) (err error) {
// create a pipe so that we can syncronize with the namespaced process and
// pass the state and configuration to the child process
parent, child, err := newInitPipe()
if err != nil {
return err
}
defer parent.Close()
command.ExtraFiles = []*os.File{child}
command.Dir = container.RootFs
command.SysProcAttr.Cloneflags = uintptr(GetNamespaceFlags(container.Namespaces))
if container.Namespaces.Contains(configs.NEWUSER) {
AddUidGidMappings(command.SysProcAttr, container)
// Default to root user when user namespaces are enabled.
if command.SysProcAttr.Credential == nil {
command.SysProcAttr.Credential = &syscall.Credential{}
}
}
if err := command.Start(); err != nil {
child.Close()
return err
}
child.Close()
wait := func() (*os.ProcessState, error) {
ps, err := command.Process.Wait()
// we should kill all processes in cgroup when init is died if we use
// host PID namespace
if !container.Namespaces.Contains(configs.NEWPID) {
killAllPids(cgroupManager)
}
return ps, err
}
terminate := func(terr error) error {
// TODO: log the errors for kill and wait
command.Process.Kill()
wait()
return terr
}
started, err := system.GetProcessStartTime(command.Process.Pid)
if err != nil {
return terminate(err)
}
// Do this before syncing with child so that no children
// can escape the cgroup
err = cgroupManager.Apply(command.Process.Pid)
if err != nil {
return terminate(err)
}
defer func() {
if err != nil {
cgroupManager.Destroy()
}
}()
var networkState configs.NetworkState
if err := InitializeNetworking(container, command.Process.Pid, &networkState); err != nil {
return terminate(err)
}
process := processArgs{
Env: append(env[0:], container.Env...),
Args: args,
ConsolePath: console,
Config: container,
NetworkState: &networkState,
}
// Start the setup process to setup the init process
if container.Namespaces.Contains(configs.NEWUSER) {
if err = executeSetupCmd(command.Args, command.Process.Pid, container, &process, &networkState); err != nil {
return terminate(err)
}
}
// send the state to the container's init process then shutdown writes for the parent
if err := json.NewEncoder(parent).Encode(process); err != nil {
return terminate(err)
}
// shutdown writes for the parent side of the pipe
if err := syscall.Shutdown(int(parent.Fd()), syscall.SHUT_WR); err != nil {
return terminate(err)
}
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *initError
if err := json.NewDecoder(parent).Decode(&ierr); err != nil && err != io.EOF {
return terminate(err)
}
if ierr != nil {
return terminate(ierr)
}
state.InitPid = command.Process.Pid
state.InitStartTime = started
state.NetworkState = networkState
state.CgroupPaths = cgroupManager.GetPaths()
return nil
}
// killAllPids iterates over all of the container's processes
// sending a SIGKILL to each process.
func killAllPids(m cgroups.Manager) error {
var (
procs []*os.Process
)
m.Freeze(configs.Frozen)
pids, err := m.GetPids()
if err != nil {
return err
}
for _, pid := range pids {
// TODO: log err without aborting if we are unable to find
// a single PID
if p, err := os.FindProcess(pid); err == nil {
procs = append(procs, p)
p.Kill()
}
}
m.Freeze(configs.Thawed)
for _, p := range procs {
p.Wait()
}
return err
}
// Converts IDMap to SysProcIDMap array and adds it to SysProcAttr.
func AddUidGidMappings(sys *syscall.SysProcAttr, container *configs.Config) {
if container.UidMappings != nil {
sys.UidMappings = make([]syscall.SysProcIDMap, len(container.UidMappings))
for i, um := range container.UidMappings {
sys.UidMappings[i].ContainerID = um.ContainerID
sys.UidMappings[i].HostID = um.HostID
sys.UidMappings[i].Size = um.Size
}
}
if container.GidMappings != nil {
sys.GidMappings = make([]syscall.SysProcIDMap, len(container.GidMappings))
for i, gm := range container.GidMappings {
sys.GidMappings[i].ContainerID = gm.ContainerID
sys.GidMappings[i].HostID = gm.HostID
sys.GidMappings[i].Size = gm.Size
}
}
}
// InitializeNetworking creates the container's network stack outside of the namespace and moves
// interfaces into the container's net namespaces if necessary
func InitializeNetworking(container *configs.Config, nspid int, networkState *configs.NetworkState) error {
for _, config := range container.Networks {
strategy, err := network.GetStrategy(config.Type)
if err != nil {
return err
}
if err := strategy.Create(config, nspid, networkState); err != nil {
return err
}
}
return nil
}

View File

@ -1,218 +0,0 @@
// +build linux
package namespaces
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"os/exec"
"syscall"
"github.com/docker/libcontainer/apparmor"
"github.com/docker/libcontainer/cgroups"
"github.com/docker/libcontainer/configs"
"github.com/docker/libcontainer/label"
"github.com/docker/libcontainer/mount"
"github.com/docker/libcontainer/system"
)
type pid struct {
Pid int `json:"Pid"`
}
// ExecIn reexec's cmd with _LIBCONTAINER_INITPID=PID so that it is able to run the
// setns code in a single threaded environment joining the existing containers' namespaces.
func ExecIn(args []string, env []string, console string, cmd *exec.Cmd, container *configs.Config, state *configs.State) (int, error) {
var err error
parent, child, err := newInitPipe()
if err != nil {
return -1, err
}
defer parent.Close()
cmd.ExtraFiles = []*os.File{child}
cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPID=%d", state.InitPid))
if err := cmd.Start(); err != nil {
child.Close()
return -1, err
}
child.Close()
s, err := cmd.Process.Wait()
if err != nil {
return -1, err
}
if !s.Success() {
return -1, &exec.ExitError{s}
}
decoder := json.NewDecoder(parent)
var pid *pid
if err := decoder.Decode(&pid); err != nil {
return -1, err
}
p, err := os.FindProcess(pid.Pid)
if err != nil {
return -1, err
}
terminate := func(terr error) (int, error) {
// TODO: log the errors for kill and wait
p.Kill()
p.Wait()
return -1, terr
}
// Enter cgroups.
if err := EnterCgroups(state, pid.Pid); err != nil {
return terminate(err)
}
encoder := json.NewEncoder(parent)
if err := encoder.Encode(container); err != nil {
return terminate(err)
}
process := processArgs{
Env: append(env[0:], container.Env...),
Args: args,
ConsolePath: console,
}
if err := encoder.Encode(process); err != nil {
return terminate(err)
}
return pid.Pid, nil
}
// Finalize entering into a container and execute a specified command
func InitIn(pipe *os.File) (err error) {
defer func() {
// if we have an error during the initialization of the container's init then send it back to the
// parent process in the form of an initError.
if err != nil {
// ensure that any data sent from the parent is consumed so it doesn't
// receive ECONNRESET when the child writes to the pipe.
ioutil.ReadAll(pipe)
if err := json.NewEncoder(pipe).Encode(initError{
Message: err.Error(),
}); err != nil {
panic(err)
}
}
// ensure that this pipe is always closed
pipe.Close()
}()
decoder := json.NewDecoder(pipe)
var container *configs.Config
if err := decoder.Decode(&container); err != nil {
return err
}
var process *processArgs
if err := decoder.Decode(&process); err != nil {
return err
}
if err := FinalizeSetns(container); err != nil {
return err
}
if err := system.Execv(process.Args[0], process.Args[0:], process.Env); err != nil {
return err
}
panic("unreachable")
}
// Finalize expects that the setns calls have been setup and that is has joined an
// existing namespace
func FinalizeSetns(container *configs.Config) error {
// clear the current processes env and replace it with the environment defined on the container
if err := LoadContainerEnvironment(container); err != nil {
return err
}
if err := setupRlimits(container); err != nil {
return fmt.Errorf("setup rlimits %s", err)
}
if err := FinalizeNamespace(container); err != nil {
return err
}
if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil {
return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err)
}
if container.ProcessLabel != "" {
if err := label.SetProcessLabel(container.ProcessLabel); err != nil {
return err
}
}
return nil
}
// SetupContainer is run to setup mounts and networking related operations
// for a user namespace enabled process as a user namespace root doesn't
// have permissions to perform these operations.
// The setup process joins all the namespaces of user namespace enabled init
// except the user namespace, so it run as root in the root user namespace
// to perform these operations.
func SetupContainer(process *processArgs) error {
container := process.Config
networkState := process.NetworkState
// TODO : move to validation
/*
rootfs, err := utils.ResolveRootfs(container.RootFs)
if err != nil {
return err
}
*/
// clear the current processes env and replace it with the environment
// defined on the container
if err := LoadContainerEnvironment(container); err != nil {
return err
}
cloneFlags := GetNamespaceFlags(container.Namespaces)
if (cloneFlags & syscall.CLONE_NEWNET) == 0 {
if len(container.Networks) != 0 || len(container.Routes) != 0 {
return fmt.Errorf("unable to apply network parameters without network namespace")
}
} else {
if err := setupNetwork(container, networkState); err != nil {
return fmt.Errorf("setup networking %s", err)
}
if err := setupRoute(container); err != nil {
return fmt.Errorf("setup route %s", err)
}
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
if (cloneFlags & syscall.CLONE_NEWNS) != 0 {
if err := mount.InitializeMountNamespace(container); err != nil {
return fmt.Errorf("setup mount namespace %s", err)
}
}
return nil
}
func EnterCgroups(state *configs.State, pid int) error {
return cgroups.EnterPid(state.CgroupPaths, pid)
}

View File

@ -1,465 +0,0 @@
// +build linux
package namespaces
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"strings"
"syscall"
"github.com/docker/libcontainer/apparmor"
"github.com/docker/libcontainer/configs"
"github.com/docker/libcontainer/console"
"github.com/docker/libcontainer/label"
"github.com/docker/libcontainer/mount"
"github.com/docker/libcontainer/netlink"
"github.com/docker/libcontainer/network"
"github.com/docker/libcontainer/security/capabilities"
"github.com/docker/libcontainer/security/restrict"
"github.com/docker/libcontainer/system"
"github.com/docker/libcontainer/user"
"github.com/docker/libcontainer/utils"
)
// Process is used for transferring parameters from Exec() to Init()
type processArgs struct {
Args []string `json:"args,omitempty"`
Env []string `json:"environment,omitempty"`
ConsolePath string `json:"console_path,omitempty"`
Config *configs.Config `json:"config,omitempty"`
NetworkState *configs.NetworkState `json:"network_state,omitempty"`
}
// TODO(vishh): This is part of the libcontainer API and it does much more than just namespaces related work.
// Move this to libcontainer package.
// Init is the init process that first runs inside a new namespace to setup mounts, users, networking,
// and other options required for the new container.
// The caller of Init function has to ensure that the go runtime is locked to an OS thread
// (using runtime.LockOSThread) else system calls like setns called within Init may not work as intended.
func Init(pipe *os.File, setupUserns bool) (err error) {
defer func() {
// if we have an error during the initialization of the container's init then send it back to the
// parent process in the form of an initError.
if err != nil {
// ensure that any data sent from the parent is consumed so it doesn't
// receive ECONNRESET when the child writes to the pipe.
ioutil.ReadAll(pipe)
if err := json.NewEncoder(pipe).Encode(initError{
Message: err.Error(),
}); err != nil {
panic(err)
}
}
// ensure that this pipe is always closed
pipe.Close()
}()
uncleanRootfs, err := os.Getwd()
if err != nil {
return err
}
var process *processArgs
// We always read this as it is a way to sync with the parent as well
if err := json.NewDecoder(pipe).Decode(&process); err != nil {
return err
}
if setupUserns {
err = SetupContainer(process)
if err == nil {
os.Exit(0)
} else {
os.Exit(1)
}
}
if process.Config.Namespaces.Contains(configs.NEWUSER) {
return initUserNs(uncleanRootfs, process)
} else {
return initDefault(uncleanRootfs, process)
}
}
func initDefault(uncleanRootfs string, process *processArgs) (err error) {
container := process.Config
networkState := process.NetworkState
// TODO: move to validation
/*
rootfs, err := utils.ResolveRootfs(uncleanRootfs)
if err != nil {
return err
}
*/
// clear the current processes env and replace it with the environment
// defined on the container
if err := LoadContainerEnvironment(container); err != nil {
return err
}
// join any namespaces via a path to the namespace fd if provided
if err := joinExistingNamespaces(container.Namespaces); err != nil {
return err
}
if process.ConsolePath != "" {
if err := console.OpenAndDup(process.ConsolePath); err != nil {
return err
}
}
if _, err := syscall.Setsid(); err != nil {
return fmt.Errorf("setsid %s", err)
}
if process.ConsolePath != "" {
if err := system.Setctty(); err != nil {
return fmt.Errorf("setctty %s", err)
}
}
cloneFlags := GetNamespaceFlags(container.Namespaces)
if (cloneFlags & syscall.CLONE_NEWNET) == 0 {
if len(container.Networks) != 0 || len(container.Routes) != 0 {
return fmt.Errorf("unable to apply network parameters without network namespace")
}
} else {
if err := setupNetwork(container, networkState); err != nil {
return fmt.Errorf("setup networking %s", err)
}
if err := setupRoute(container); err != nil {
return fmt.Errorf("setup route %s", err)
}
}
if err := setupRlimits(container); err != nil {
return fmt.Errorf("setup rlimits %s", err)
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
if (cloneFlags & syscall.CLONE_NEWNS) != 0 {
if err := mount.InitializeMountNamespace(container); err != nil {
return err
}
}
if container.Hostname != "" {
// TODO: (crosbymichael) move this to pre spawn validation
if (cloneFlags & syscall.CLONE_NEWUTS) == 0 {
return fmt.Errorf("unable to set the hostname without UTS namespace")
}
if err := syscall.Sethostname([]byte(container.Hostname)); err != nil {
return fmt.Errorf("unable to sethostname %q: %s", container.Hostname, err)
}
}
if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil {
return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err)
}
if err := label.SetProcessLabel(container.ProcessLabel); err != nil {
return fmt.Errorf("set process label %s", err)
}
// TODO: (crosbymichael) make this configurable at the Config level
if container.RestrictSys {
if (cloneFlags & syscall.CLONE_NEWNS) == 0 {
return fmt.Errorf("unable to restrict access to kernel files without mount namespace")
}
if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil {
return err
}
}
pdeathSignal, err := system.GetParentDeathSignal()
if err != nil {
return fmt.Errorf("get parent death signal %s", err)
}
if err := FinalizeNamespace(container); err != nil {
return fmt.Errorf("finalize namespace %s", err)
}
// FinalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := RestoreParentDeathSignal(pdeathSignal); err != nil {
return fmt.Errorf("restore parent death signal %s", err)
}
return system.Execv(process.Args[0], process.Args[0:], process.Env)
}
func initUserNs(uncleanRootfs string, process *processArgs) (err error) {
container := process.Config
// clear the current processes env and replace it with the environment
// defined on the container
if err := LoadContainerEnvironment(container); err != nil {
return err
}
// join any namespaces via a path to the namespace fd if provided
if err := joinExistingNamespaces(container.Namespaces); err != nil {
return err
}
if process.ConsolePath != "" {
if err := console.OpenAndDup("/dev/console"); err != nil {
return err
}
}
if _, err := syscall.Setsid(); err != nil {
return fmt.Errorf("setsid %s", err)
}
if process.ConsolePath != "" {
if err := system.Setctty(); err != nil {
return fmt.Errorf("setctty %s", err)
}
}
if container.WorkingDir == "" {
container.WorkingDir = "/"
}
if err := setupRlimits(container); err != nil {
return fmt.Errorf("setup rlimits %s", err)
}
cloneFlags := GetNamespaceFlags(container.Namespaces)
if container.Hostname != "" {
if (cloneFlags & syscall.CLONE_NEWUTS) == 0 {
return fmt.Errorf("unable to set the hostname without UTS namespace")
}
if err := syscall.Sethostname([]byte(container.Hostname)); err != nil {
return fmt.Errorf("unable to sethostname %q: %s", container.Hostname, err)
}
}
if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil {
return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err)
}
if err := label.SetProcessLabel(container.ProcessLabel); err != nil {
return fmt.Errorf("set process label %s", err)
}
if container.RestrictSys {
if (cloneFlags & syscall.CLONE_NEWNS) == 0 {
return fmt.Errorf("unable to restrict access to kernel files without mount namespace")
}
if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil {
return err
}
}
pdeathSignal, err := system.GetParentDeathSignal()
if err != nil {
return fmt.Errorf("get parent death signal %s", err)
}
if err := FinalizeNamespace(container); err != nil {
return fmt.Errorf("finalize namespace %s", err)
}
// FinalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := RestoreParentDeathSignal(pdeathSignal); err != nil {
return fmt.Errorf("restore parent death signal %s", err)
}
return system.Execv(process.Args[0], process.Args[0:], process.Env)
}
// RestoreParentDeathSignal sets the parent death signal to old.
func RestoreParentDeathSignal(old int) error {
if old == 0 {
return nil
}
current, err := system.GetParentDeathSignal()
if err != nil {
return fmt.Errorf("get parent death signal %s", err)
}
if old == current {
return nil
}
if err := system.ParentDeathSignal(uintptr(old)); err != nil {
return fmt.Errorf("set parent death signal %s", err)
}
// Signal self if parent is already dead. Does nothing if running in a new
// PID namespace, as Getppid will always return 0.
if syscall.Getppid() == 1 {
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
}
return nil
}
// SetupUser changes the groups, gid, and uid for the user inside the container
func SetupUser(container *configs.Config) error {
// Set up defaults.
defaultExecUser := user.ExecUser{
Uid: syscall.Getuid(),
Gid: syscall.Getgid(),
Home: "/",
}
passwdPath, err := user.GetPasswdPath()
if err != nil {
return err
}
groupPath, err := user.GetGroupPath()
if err != nil {
return err
}
execUser, err := user.GetExecUserPath(container.User, &defaultExecUser, passwdPath, groupPath)
if err != nil {
return fmt.Errorf("get supplementary groups %s", err)
}
suppGroups := append(execUser.Sgids, container.AdditionalGroups...)
if err := syscall.Setgroups(suppGroups); err != nil {
return fmt.Errorf("setgroups %s", err)
}
if err := system.Setgid(execUser.Gid); err != nil {
return fmt.Errorf("setgid %s", err)
}
if err := system.Setuid(execUser.Uid); err != nil {
return fmt.Errorf("setuid %s", err)
}
// if we didn't get HOME already, set it based on the user's HOME
if envHome := os.Getenv("HOME"); envHome == "" {
if err := os.Setenv("HOME", execUser.Home); err != nil {
return fmt.Errorf("set HOME %s", err)
}
}
return nil
}
// setupVethNetwork uses the Network config if it is not nil to initialize
// the new veth interface inside the container for use by changing the name to eth0
// setting the MTU and IP address along with the default gateway
func setupNetwork(container *configs.Config, networkState *configs.NetworkState) error {
for _, config := range container.Networks {
strategy, err := network.GetStrategy(config.Type)
if err != nil {
return err
}
err1 := strategy.Initialize(config, networkState)
if err1 != nil {
return err1
}
}
return nil
}
func setupRoute(container *configs.Config) error {
for _, config := range container.Routes {
if err := netlink.AddRoute(config.Destination, config.Source, config.Gateway, config.InterfaceName); err != nil {
return err
}
}
return nil
}
func setupRlimits(container *configs.Config) error {
for _, rlimit := range container.Rlimits {
l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
}
}
return nil
}
// FinalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaky file descriptors
// before execing the command inside the namespace
func FinalizeNamespace(container *configs.Config) error {
// Ensure that all non-standard fds we may have accidentally
// inherited are marked close-on-exec so they stay out of the
// container
if err := utils.CloseExecFrom(3); err != nil {
return fmt.Errorf("close open file descriptors %s", err)
}
// drop capabilities in bounding set before changing user
if err := capabilities.DropBoundingSet(container.Capabilities); err != nil {
return fmt.Errorf("drop bounding set %s", err)
}
// preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil {
return fmt.Errorf("set keep caps %s", err)
}
if err := SetupUser(container); err != nil {
return fmt.Errorf("setup user %s", err)
}
if err := system.ClearKeepCaps(); err != nil {
return fmt.Errorf("clear keep caps %s", err)
}
// drop all other capabilities
if err := capabilities.DropCapabilities(container.Capabilities); err != nil {
return fmt.Errorf("drop capabilities %s", err)
}
if container.WorkingDir != "" {
if err := syscall.Chdir(container.WorkingDir); err != nil {
return fmt.Errorf("chdir to %s %s", container.WorkingDir, err)
}
}
return nil
}
func LoadContainerEnvironment(container *configs.Config) error {
os.Clearenv()
for _, pair := range container.Env {
p := strings.SplitN(pair, "=", 2)
if len(p) < 2 {
return fmt.Errorf("invalid environment '%v'", pair)
}
if err := os.Setenv(p[0], p[1]); err != nil {
return err
}
}
return nil
}
// joinExistingNamespaces gets all the namespace paths specified for the container and
// does a setns on the namespace fd so that the current process joins the namespace.
func joinExistingNamespaces(namespaces []configs.Namespace) error {
for _, ns := range namespaces {
if ns.Path != "" {
f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0)
if err != nil {
return err
}
err = system.Setns(f.Fd(), uintptr(namespaceInfo[ns.Type]))
f.Close()
if err != nil {
return err
}
}
}
return nil
}

View File

@ -1,48 +0,0 @@
// +build linux
package namespaces
import (
"os"
"syscall"
"github.com/docker/libcontainer/configs"
)
type initError struct {
Message string `json:"message,omitempty"`
}
func (i initError) Error() string {
return i.Message
}
var namespaceInfo = map[configs.NamespaceType]int{
configs.NEWNET: syscall.CLONE_NEWNET,
configs.NEWNS: syscall.CLONE_NEWNS,
configs.NEWUSER: syscall.CLONE_NEWUSER,
configs.NEWIPC: syscall.CLONE_NEWIPC,
configs.NEWUTS: syscall.CLONE_NEWUTS,
configs.NEWPID: syscall.CLONE_NEWPID,
}
// New returns a newly initialized Pipe for communication between processes
func newInitPipe() (parent *os.File, child *os.File, err error) {
fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil
}
// GetNamespaceFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This functions returns flags only for new namespaces.
func GetNamespaceFlags(namespaces configs.Namespaces) (flag int) {
for _, v := range namespaces {
if v.Path != "" {
continue
}
flag |= namespaceInfo[v.Type]
}
return flag
}

View File

@ -3,14 +3,55 @@ package main
import (
"io"
"os"
"os/signal"
"syscall"
"github.com/codegangsta/cli"
"github.com/docker/docker/pkg/term"
"github.com/docker/libcontainer"
"github.com/docker/libcontainer/configs"
consolepkg "github.com/docker/libcontainer/console"
)
type tty struct {
master *os.File
console string
state *term.State
}
func (t *tty) Close() error {
if t.master != nil {
t.master.Close()
}
if t.state != nil {
term.RestoreTerminal(os.Stdin.Fd(), t.state)
}
return nil
}
func (t *tty) set(config *configs.Config) {
config.Console = t.console
}
func (t *tty) attach(process *libcontainer.Process) {
if t.master != nil {
process.Stderr = nil
process.Stdout = nil
process.Stdin = nil
}
}
func (t *tty) resize() error {
if t.master == nil {
return nil
}
ws, err := term.GetWinsize(os.Stdin.Fd())
if err != nil {
return err
}
return term.SetWinsize(t.master.Fd(), ws)
}
var execCommand = cli.Command{
Name: "exec",
Usage: "execute a new command inside a container",
@ -23,24 +64,14 @@ var execCommand = cli.Command{
}
func execAction(context *cli.Context) {
var (
master *os.File
console string
err error
sigc = make(chan os.Signal, 10)
stdin = os.Stdin
stdout = os.Stdout
stderr = os.Stderr
exitCode int
)
factory, err := loadFactory(context)
if err != nil {
fatal(err)
}
tty, err := newTty(context)
if err != nil {
fatal(err)
}
container, err := factory.Load(context.String("id"))
if err != nil {
if lerr, ok := err.(libcontainer.Error); !ok || lerr.Code() != libcontainer.ContainerNotExists {
@ -50,46 +81,22 @@ func execAction(context *cli.Context) {
if err != nil {
fatal(err)
}
if context.Bool("tty") {
stdin = nil
stdout = nil
stderr = nil
if master, console, err = consolepkg.CreateMasterAndConsole(); err != nil {
fatal(err)
}
go io.Copy(master, os.Stdin)
go io.Copy(os.Stdout, master)
state, err := term.SetRawTerminal(os.Stdin.Fd())
if err != nil {
fatal(err)
}
defer term.RestoreTerminal(os.Stdin.Fd(), state)
config.Console = console
}
tty.set(config)
if container, err = factory.Create(context.String("id"), config); err != nil {
fatal(err)
}
}
go handleSignals(container, tty)
process := &libcontainer.Process{
Args: context.Args(),
Stdin: stdin,
Stdout: stdout,
Stderr: stderr,
Stdin: os.Stdin,
Stdout: os.Stdout,
Stderr: os.Stderr,
}
tty.attach(process)
if _, err := container.Start(process); err != nil {
fatal(err)
}
go func() {
resizeTty(master)
for sig := range sigc {
switch sig {
case syscall.SIGWINCH:
resizeTty(master)
default:
container.Signal(sig)
}
}
}()
status, err := container.Wait()
if err != nil {
fatal(err)
@ -97,6 +104,11 @@ func execAction(context *cli.Context) {
if err := container.Destroy(); err != nil {
fatal(err)
}
exit(status)
}
func exit(status syscall.WaitStatus) {
var exitCode int
if status.Exited() {
exitCode = status.ExitStatus()
} else if status.Signaled() {
@ -107,13 +119,37 @@ func execAction(context *cli.Context) {
os.Exit(exitCode)
}
func resizeTty(master *os.File) {
if master == nil {
return
func handleSignals(container libcontainer.Container, tty *tty) {
sigc := make(chan os.Signal, 10)
signal.Notify(sigc)
tty.resize()
for sig := range sigc {
switch sig {
case syscall.SIGWINCH:
tty.resize()
default:
container.Signal(sig)
}
}
ws, err := term.GetWinsize(os.Stdin.Fd())
if err != nil {
return
}
term.SetWinsize(master.Fd(), ws)
}
func newTty(context *cli.Context) (*tty, error) {
if context.Bool("tty") {
master, console, err := consolepkg.CreateMasterAndConsole()
if err != nil {
return nil, err
}
go io.Copy(master, os.Stdin)
go io.Copy(os.Stdout, master)
state, err := term.SetRawTerminal(os.Stdin.Fd())
if err != nil {
return nil, err
}
return &tty{
master: master,
console: console,
state: state,
}, nil
}
return &tty{}, nil
}

View File

@ -5,7 +5,7 @@ import (
"github.com/codegangsta/cli"
"github.com/docker/libcontainer"
_ "github.com/docker/libcontainer/namespaces/nsenter"
_ "github.com/docker/libcontainer/nsenter"
)
var initCommand = cli.Command{