From 8850636eb3df747fc174c04fdba2a8bb447cf9be Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Fri, 6 Feb 2015 12:48:57 -0800 Subject: [PATCH] Refactor init actions into separate types Signed-off-by: Michael Crosby --- configs/validate/config.go | 75 +++++ error.go | 8 + linux_container.go | 93 ++++--- linux_factory.go | 517 ++--------------------------------- linux_init.go | 216 +++++++++++++++ linux_setns_init.go | 35 +++ linux_standard_init.go | 90 ++++++ linux_userns_init.go | 80 ++++++ linux_userns_sidecar_init.go | 37 +++ nsenter/nsexec.c | 11 +- stacktrace/capture.go | 2 - system/linux.go | 29 +- 12 files changed, 638 insertions(+), 555 deletions(-) create mode 100644 configs/validate/config.go create mode 100644 linux_init.go create mode 100644 linux_setns_init.go create mode 100644 linux_standard_init.go create mode 100644 linux_userns_init.go create mode 100644 linux_userns_sidecar_init.go diff --git a/configs/validate/config.go b/configs/validate/config.go new file mode 100644 index 00000000..6148e1eb --- /dev/null +++ b/configs/validate/config.go @@ -0,0 +1,75 @@ +package validate + +import ( + "fmt" + "path/filepath" + + "github.com/docker/libcontainer/configs" +) + +type Validator interface { + Validate(*configs.Config) error +} + +func New() Validator { + return &ConfigValidator{} +} + +type ConfigValidator struct { +} + +func (v *ConfigValidator) Validate(config *configs.Config) error { + if err := v.rootfs(config); err != nil { + return err + } + if err := v.network(config); err != nil { + return err + } + if err := v.hostname(config); err != nil { + return err + } + if err := v.security(config); err != nil { + return err + } + return nil +} + +// rootfs validates that the rootfs is an absolute path and is not a symlink +// to the container's root filesystem. 
+func (v *ConfigValidator) rootfs(config *configs.Config) error { + cleaned, err := filepath.Abs(config.Rootfs) + if err != nil { + return err + } + if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil { + return err + } + if config.Rootfs != cleaned { + return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs) + } + return nil +} + +func (v *ConfigValidator) network(config *configs.Config) error { + if !config.Namespaces.Contains(configs.NEWNET) { + if len(config.Networks) > 0 || len(config.Routes) > 0 { + return fmt.Errorf("unable to apply network settings without a private NET namespace") + } + } + return nil +} + +func (v *ConfigValidator) hostname(config *configs.Config) error { + if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) { + return fmt.Errorf("unable to set hostname without a private UTS namespace") + } + return nil +} + +func (v *ConfigValidator) security(config *configs.Config) error { + // restrict sys without mount namespace + if config.RestrictSys && !config.Namespaces.Contains(configs.NEWNS) { + return fmt.Errorf("unable to restrict sys entries without a private MNT namespace") + } + return nil +} diff --git a/error.go b/error.go index 062943a1..31ebb320 100644 --- a/error.go +++ b/error.go @@ -54,3 +54,11 @@ type Error interface { // Returns the error code for this error. 
Code() ErrorCode } + +type initError struct { + Message string `json:"message,omitempty"` +} + +func (i initError) Error() string { + return i.Message +} diff --git a/linux_container.go b/linux_container.go index bf6b6a09..5b8e18a6 100644 --- a/linux_container.go +++ b/linux_container.go @@ -22,12 +22,8 @@ const ( EXIT_SIGNAL_OFFSET = 128 ) -type initError struct { - Message string `json:"message,omitempty"` -} - -func (i initError) Error() string { - return i.Message +type pid struct { + Pid int `json:"Pid"` } type linuxContainer struct { @@ -97,6 +93,21 @@ func (c *linuxContainer) Start(process *Process) (int, error) { if err != nil { return -1, err } + cmd := c.commandTemplate(process) + if status != configs.Destroyed { + // TODO: (crosbymichael) check out console use for execin + return c.startNewProcess(cmd, process.Args) + //return namespaces.ExecIn(process.Args, c.config.Env, "", cmd, c.config, c.state) + } + if err := c.startInitialProcess(cmd, process.Args); err != nil { + return -1, err + } + return c.state.InitPid, nil +} + +// commandTemplate creates a template *exec.Cmd. It uses the init arguments provided +// to the factory and attaches IO to the process. +func (c *linuxContainer) commandTemplate(process *Process) *exec.Cmd { cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...) 
cmd.Stdin = process.Stdin cmd.Stdout = process.Stdout @@ -108,32 +119,26 @@ func (c *linuxContainer) Start(process *Process) (int, error) { } // TODO: add pdeath to config for a container cmd.SysProcAttr.Pdeathsig = syscall.SIGKILL - if status != configs.Destroyed { - glog.Info("start new container process") - // TODO: (crosbymichael) check out console use for execin - //return namespaces.ExecIn(process.Args, c.config.Env, "", cmd, c.config, c.state) - return c.startNewProcess(cmd, process.Args) - } - if err := c.startInitProcess(cmd, process.Args); err != nil { - return -1, err - } - return c.state.InitPid, nil + return cmd } +// startNewProcess adds another process to an already running container func (c *linuxContainer) startNewProcess(cmd *exec.Cmd, args []string) (int, error) { - var err error + glog.Info("start new container process") parent, child, err := newInitPipe() if err != nil { return -1, err } defer parent.Close() cmd.ExtraFiles = []*os.File{child} - cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPID=%d", c.state.InitPid)) - if err := cmd.Start(); err != nil { - child.Close() + cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPID=%d", c.state.InitPid), "_LIBCONTAINER_INITTYPE=setns") + + // start the command + err = cmd.Start() + child.Close() + if err != nil { return -1, err } - child.Close() s, err := cmd.Process.Wait() if err != nil { return -1, err @@ -152,29 +157,28 @@ func (c *linuxContainer) startNewProcess(cmd *exec.Cmd, args []string) (int, err } terminate := func(terr error) (int, error) { // TODO: log the errors for kill and wait - p.Kill() - p.Wait() + if err := p.Kill(); err != nil { + glog.Warning(err) + } + if _, err := p.Wait(); err != nil { + glog.Warning(err) + } return -1, terr } - // Enter cgroups. 
if err := c.enterCgroups(pid.Pid); err != nil { return terminate(err) } - encoder := json.NewEncoder(parent) - if err := encoder.Encode(c.config); err != nil { - return terminate(err) - } - process := processArgs{ + if err := json.NewEncoder(parent).Encode(&initConfig{ Config: c.config, Args: args, - } - if err := encoder.Encode(process); err != nil { + }); err != nil { return terminate(err) } return pid.Pid, nil } -func (c *linuxContainer) startInitProcess(cmd *exec.Cmd, args []string) error { +func (c *linuxContainer) startInitialProcess(cmd *exec.Cmd, args []string) error { + glog.Info("starting container initial process") // create a pipe so that we can syncronize with the namespaced process and // pass the state and configuration to the child process parent, child, err := newInitPipe() @@ -184,6 +188,9 @@ func (c *linuxContainer) startInitProcess(cmd *exec.Cmd, args []string) error { defer parent.Close() cmd.ExtraFiles = []*os.File{child} cmd.SysProcAttr.Cloneflags = c.config.Namespaces.CloneFlags() + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE=standard") + // if the container is configured to use user namespaces we have to setup the + // uid:gid mapping on the command. if c.config.Namespaces.Contains(configs.NEWUSER) { addUidGidMappings(cmd.SysProcAttr, c.config) // Default to root user when user namespaces are enabled. 
@@ -191,7 +198,6 @@ func (c *linuxContainer) startInitProcess(cmd *exec.Cmd, args []string) error { cmd.SysProcAttr.Credential = &syscall.Credential{} } } - glog.Info("starting container init process") err = cmd.Start() child.Close() if err != nil { @@ -199,12 +205,15 @@ func (c *linuxContainer) startInitProcess(cmd *exec.Cmd, args []string) error { } wait := func() (*os.ProcessState, error) { ps, err := cmd.Process.Wait() + if err != nil { + return nil, newGenericError(err, SystemError) + } // we should kill all processes in cgroup when init is died if we use // host PID namespace if !c.config.Namespaces.Contains(configs.NEWPID) { c.killAllPids() } - return ps, newGenericError(err, SystemError) + return ps, nil } terminate := func(terr error) error { // TODO: log the errors for kill and wait @@ -230,19 +239,19 @@ func (c *linuxContainer) startInitProcess(cmd *exec.Cmd, args []string) error { if err := c.initializeNetworking(cmd.Process.Pid, &networkState); err != nil { return terminate(err) } - process := processArgs{ + iconfig := &initConfig{ Args: args, Config: c.config, NetworkState: &networkState, } // Start the setup process to setup the init process if c.config.Namespaces.Contains(configs.NEWUSER) { - if err = executeSetupCmd(cmd.Args, cmd.Process.Pid, c.config, &process, &networkState); err != nil { + if err = executeSetupCmd(cmd.Args, cmd.Process.Pid, c.config, iconfig, &networkState); err != nil { return terminate(err) } } // send the state to the container's init process then shutdown writes for the parent - if err := json.NewEncoder(parent).Encode(process); err != nil { + if err := json.NewEncoder(parent).Encode(iconfig); err != nil { return terminate(err) } // shutdown writes for the parent side of the pipe @@ -258,12 +267,10 @@ func (c *linuxContainer) startInitProcess(cmd *exec.Cmd, args []string) error { if ierr != nil { return terminate(ierr) } - c.state.InitPid = cmd.Process.Pid c.state.InitStartTime = started c.state.NetworkState = networkState 
c.state.CgroupPaths = c.cgroupManager.GetPaths() - return nil } @@ -386,7 +393,7 @@ func (c *linuxContainer) initializeNetworking(nspid int, networkState *configs.N return nil } -func executeSetupCmd(args []string, ppid int, container *configs.Config, process *processArgs, networkState *configs.NetworkState) error { +func executeSetupCmd(args []string, ppid int, container *configs.Config, process *initConfig, networkState *configs.NetworkState) error { command := exec.Command(args[0], args[1:]...) parent, child, err := newInitPipe() if err != nil { @@ -397,7 +404,7 @@ func executeSetupCmd(args []string, ppid int, container *configs.Config, process command.Dir = container.Rootfs command.Env = append(command.Env, fmt.Sprintf("_LIBCONTAINER_INITPID=%d", ppid), - fmt.Sprintf("_LIBCONTAINER_USERNS=1")) + fmt.Sprintf("_LIBCONTAINER_INITTYPE=userns_sidecar")) err = command.Start() child.Close() if err != nil { @@ -452,10 +459,6 @@ func executeSetupCmd(args []string, ppid int, container *configs.Config, process return nil } -type pid struct { - Pid int `json:"Pid"` -} - func (c *linuxContainer) enterCgroups(pid int) error { return cgroups.EnterPid(c.state.CgroupPaths, pid) } diff --git a/linux_factory.go b/linux_factory.go index 02456c12..3bb1659a 100644 --- a/linux_factory.go +++ b/linux_factory.go @@ -9,24 +9,12 @@ import ( "os" "path/filepath" "regexp" - "strings" - "syscall" "github.com/golang/glog" - "github.com/docker/libcontainer/apparmor" cgroups "github.com/docker/libcontainer/cgroups/manager" "github.com/docker/libcontainer/configs" - "github.com/docker/libcontainer/console" - "github.com/docker/libcontainer/label" - "github.com/docker/libcontainer/mount" - "github.com/docker/libcontainer/netlink" - "github.com/docker/libcontainer/network" - "github.com/docker/libcontainer/security/capabilities" - "github.com/docker/libcontainer/security/restrict" - "github.com/docker/libcontainer/system" - "github.com/docker/libcontainer/user" - 
"github.com/docker/libcontainer/utils" + "github.com/docker/libcontainer/configs/validate" ) const ( @@ -39,13 +27,6 @@ var ( maxIdLen = 1024 ) -// Process is used for transferring parameters from Exec() to Init() -type processArgs struct { - Args []string `json:"args,omitempty"` - Config *configs.Config `json:"config,omitempty"` - NetworkState *configs.NetworkState `json:"network_state,omitempty"` -} - // New returns a linux based container factory based in the root directory. func New(root string, initArgs []string) (Factory, error) { if root != "" { @@ -54,16 +35,18 @@ func New(root string, initArgs []string) (Factory, error) { } } return &linuxFactory{ - root: root, - initArgs: initArgs, + root: root, + initArgs: initArgs, + validator: validate.New(), }, nil } // linuxFactory implements the default factory interface for linux based systems. type linuxFactory struct { // root is the root directory - root string - initArgs []string + root string + initArgs []string + validator validate.Validator } func (l *linuxFactory) Create(id string, config *configs.Config) (Container, error) { @@ -73,6 +56,9 @@ func (l *linuxFactory) Create(id string, config *configs.Config) (Container, err if err := l.validateID(id); err != nil { return nil, err } + if err := l.validator.Validate(config); err != nil { + return nil, newGenericError(err, ConfigInvalid) + } containerRoot := filepath.Join(l.root, id) if _, err := os.Stat(containerRoot); err == nil { return nil, newGenericError(fmt.Errorf("Container with id exists: %v", id), IdInUse) @@ -96,14 +82,13 @@ func (l *linuxFactory) Create(id string, config *configs.Config) (Container, err os.RemoveAll(containerRoot) return nil, newGenericError(err, SystemError) } - cgroupManager := cgroups.NewCgroupManager(config.Cgroups) return &linuxContainer{ id: id, root: containerRoot, config: config, initArgs: l.initArgs, state: &configs.State{}, - cgroupManager: cgroupManager, + cgroupManager: cgroups.NewCgroupManager(config.Cgroups), }, nil } 
@@ -137,12 +122,13 @@ func (l *linuxFactory) Load(id string) (Container, error) { // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state // This is a low level implementation detail of the reexec and should not be consumed externally func (l *linuxFactory) StartInitialization(pipefd uintptr) (err error) { - pipe := os.NewFile(uintptr(pipefd), "pipe") - setupUserns := os.Getenv("_LIBCONTAINER_USERNS") != "" - pid := os.Getenv("_LIBCONTAINER_INITPID") - if pid != "" && !setupUserns { - return initIn(pipe) - } + var ( + pipe = os.NewFile(uintptr(pipefd), "pipe") + it = initType(os.Getenv("_LIBCONTAINER_INITTYPE")) + ) + // clear the current process's environment to clean any libcontainer + // specific env vars. + os.Clearenv() defer func() { // if we have an error during the initialization of the container's init then send it back to the // parent process in the form of an initError. @@ -159,27 +145,11 @@ func (l *linuxFactory) StartInitialization(pipefd uintptr) (err error) { // ensure that this pipe is always closed pipe.Close() }() - uncleanRootfs, err := os.Getwd() + i, err := newContainerInit(it, pipe) if err != nil { return err } - var process *processArgs - // We always read this as it is a way to sync with the parent as well - if err := json.NewDecoder(pipe).Decode(&process); err != nil { - return err - } - if setupUserns { - err = setupContainer(process) - if err == nil { - os.Exit(0) - } else { - os.Exit(1) - } - } - if process.Config.Namespaces.Contains(configs.NEWUSER) { - return l.initUserNs(uncleanRootfs, process) - } - return l.initDefault(uncleanRootfs, process) + return i.Init() } func (l *linuxFactory) loadContainerConfig(root string) (*configs.Config, error) { @@ -223,450 +193,3 @@ func (l *linuxFactory) validateID(id string) error { } return nil } - -func (l *linuxFactory) initDefault(uncleanRootfs string, process *processArgs) (err error) { - config := process.Config - networkState := 
process.NetworkState - - // TODO: move to validation - /* - rootfs, err := utils.ResolveRootfs(uncleanRootfs) - if err != nil { - return err - } - */ - - // clear the current processes env and replace it with the environment - // defined on the container - if err := loadContainerEnvironment(config); err != nil { - return err - } - // join any namespaces via a path to the namespace fd if provided - if err := joinExistingNamespaces(config.Namespaces); err != nil { - return err - } - if config.Console != "" { - if err := console.OpenAndDup(config.Console); err != nil { - return err - } - } - if _, err := syscall.Setsid(); err != nil { - return fmt.Errorf("setsid %s", err) - } - if config.Console != "" { - if err := system.Setctty(); err != nil { - return fmt.Errorf("setctty %s", err) - } - } - - cloneFlags := config.Namespaces.CloneFlags() - if (cloneFlags & syscall.CLONE_NEWNET) == 0 { - if len(config.Networks) != 0 || len(config.Routes) != 0 { - return fmt.Errorf("unable to apply network parameters without network namespace") - } - } else { - if err := setupNetwork(config, networkState); err != nil { - return fmt.Errorf("setup networking %s", err) - } - if err := setupRoute(config); err != nil { - return fmt.Errorf("setup route %s", err) - } - } - if err := setupRlimits(config); err != nil { - return fmt.Errorf("setup rlimits %s", err) - } - label.Init() - // InitializeMountNamespace() can be executed only for a new mount namespace - if (cloneFlags & syscall.CLONE_NEWNS) != 0 { - if err := mount.InitializeMountNamespace(config); err != nil { - return err - } - } - if config.Hostname != "" { - // TODO: (crosbymichael) move this to pre spawn validation - if (cloneFlags & syscall.CLONE_NEWUTS) == 0 { - return fmt.Errorf("unable to set the hostname without UTS namespace") - } - if err := syscall.Sethostname([]byte(config.Hostname)); err != nil { - return fmt.Errorf("unable to sethostname %q: %s", config.Hostname, err) - } - } - if err := 
apparmor.ApplyProfile(config.AppArmorProfile); err != nil { - return fmt.Errorf("set apparmor profile %s: %s", config.AppArmorProfile, err) - } - if err := label.SetProcessLabel(config.ProcessLabel); err != nil { - return fmt.Errorf("set process label %s", err) - } - // TODO: (crosbymichael) make this configurable at the Config level - if config.RestrictSys { - if (cloneFlags & syscall.CLONE_NEWNS) == 0 { - return fmt.Errorf("unable to restrict access to kernel files without mount namespace") - } - if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil { - return err - } - } - pdeathSignal, err := system.GetParentDeathSignal() - if err != nil { - return fmt.Errorf("get parent death signal %s", err) - } - if err := finalizeNamespace(config); err != nil { - return fmt.Errorf("finalize namespace %s", err) - } - // finalizeNamespace can change user/group which clears the parent death - // signal, so we restore it here. - if err := restoreParentDeathSignal(pdeathSignal); err != nil { - return fmt.Errorf("restore parent death signal %s", err) - } - return system.Execv(process.Args[0], process.Args[0:], config.Env) -} - -func (l *linuxFactory) initUserNs(uncleanRootfs string, process *processArgs) (err error) { - config := process.Config - // clear the current processes env and replace it with the environment - // defined on the config - if err := loadContainerEnvironment(config); err != nil { - return err - } - // join any namespaces via a path to the namespace fd if provided - if err := joinExistingNamespaces(config.Namespaces); err != nil { - return err - } - if config.Console != "" { - if err := console.OpenAndDup("/dev/console"); err != nil { - return err - } - } - if _, err := syscall.Setsid(); err != nil { - return fmt.Errorf("setsid %s", err) - } - if config.Console != "" { - if err := system.Setctty(); err != nil { - return fmt.Errorf("setctty %s", err) - } - } - if config.WorkingDir == "" { - config.WorkingDir = "/" - } 
- if err := setupRlimits(config); err != nil { - return fmt.Errorf("setup rlimits %s", err) - } - cloneFlags := config.Namespaces.CloneFlags() - if config.Hostname != "" { - // TODO: move validation - if (cloneFlags & syscall.CLONE_NEWUTS) == 0 { - return fmt.Errorf("unable to set the hostname without UTS namespace") - } - if err := syscall.Sethostname([]byte(config.Hostname)); err != nil { - return fmt.Errorf("unable to sethostname %q: %s", config.Hostname, err) - } - } - if err := apparmor.ApplyProfile(config.AppArmorProfile); err != nil { - return fmt.Errorf("set apparmor profile %s: %s", config.AppArmorProfile, err) - } - if err := label.SetProcessLabel(config.ProcessLabel); err != nil { - return fmt.Errorf("set process label %s", err) - } - if config.RestrictSys { - if (cloneFlags & syscall.CLONE_NEWNS) == 0 { - return fmt.Errorf("unable to restrict access to kernel files without mount namespace") - } - if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil { - return err - } - } - pdeathSignal, err := system.GetParentDeathSignal() - if err != nil { - return fmt.Errorf("get parent death signal %s", err) - } - if err := finalizeNamespace(config); err != nil { - return fmt.Errorf("finalize namespace %s", err) - } - // finalizeNamespace can change user/group which clears the parent death - // signal, so we restore it here. - if err := restoreParentDeathSignal(pdeathSignal); err != nil { - return fmt.Errorf("restore parent death signal %s", err) - } - return system.Execv(process.Args[0], process.Args[0:], config.Env) -} - -// restoreParentDeathSignal sets the parent death signal to old. 
-func restoreParentDeathSignal(old int) error { - if old == 0 { - return nil - } - current, err := system.GetParentDeathSignal() - if err != nil { - return fmt.Errorf("get parent death signal %s", err) - } - if old == current { - return nil - } - if err := system.ParentDeathSignal(uintptr(old)); err != nil { - return fmt.Errorf("set parent death signal %s", err) - } - // Signal self if parent is already dead. Does nothing if running in a new - // PID namespace, as Getppid will always return 0. - if syscall.Getppid() == 1 { - return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) - } - return nil -} - -// setupUser changes the groups, gid, and uid for the user inside the container -func setupUser(config *configs.Config) error { - // Set up defaults. - defaultExecUser := user.ExecUser{ - Uid: syscall.Getuid(), - Gid: syscall.Getgid(), - Home: "/", - } - passwdPath, err := user.GetPasswdPath() - if err != nil { - return err - } - groupPath, err := user.GetGroupPath() - if err != nil { - return err - } - execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) - if err != nil { - return fmt.Errorf("get supplementary groups %s", err) - } - suppGroups := append(execUser.Sgids, config.AdditionalGroups...) 
- if err := syscall.Setgroups(suppGroups); err != nil { - return fmt.Errorf("setgroups %s", err) - } - if err := system.Setgid(execUser.Gid); err != nil { - return fmt.Errorf("setgid %s", err) - } - if err := system.Setuid(execUser.Uid); err != nil { - return fmt.Errorf("setuid %s", err) - } - // if we didn't get HOME already, set it based on the user's HOME - if envHome := os.Getenv("HOME"); envHome == "" { - if err := os.Setenv("HOME", execUser.Home); err != nil { - return fmt.Errorf("set HOME %s", err) - } - } - return nil -} - -// setupVethNetwork uses the Network config if it is not nil to initialize -// the new veth interface inside the container for use by changing the name to eth0 -// setting the MTU and IP address along with the default gateway -func setupNetwork(config *configs.Config, networkState *configs.NetworkState) error { - for _, config := range config.Networks { - strategy, err := network.GetStrategy(config.Type) - if err != nil { - return err - } - err1 := strategy.Initialize(config, networkState) - if err1 != nil { - return err1 - } - } - return nil -} - -func setupRoute(config *configs.Config) error { - for _, config := range config.Routes { - if err := netlink.AddRoute(config.Destination, config.Source, config.Gateway, config.InterfaceName); err != nil { - return err - } - } - return nil -} - -func setupRlimits(config *configs.Config) error { - for _, rlimit := range config.Rlimits { - l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft} - if err := syscall.Setrlimit(rlimit.Type, l); err != nil { - return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err) - } - } - return nil -} - -// finalizeNamespace drops the caps, sets the correct user -// and working dir, and closes any leaky file descriptors -// before execing the command inside the namespace -func finalizeNamespace(config *configs.Config) error { - // Ensure that all non-standard fds we may have accidentally - // inherited are marked close-on-exec so they stay out of 
the - // container - if err := utils.CloseExecFrom(3); err != nil { - return fmt.Errorf("close open file descriptors %s", err) - } - // drop capabilities in bounding set before changing user - if err := capabilities.DropBoundingSet(config.Capabilities); err != nil { - return fmt.Errorf("drop bounding set %s", err) - } - // preserve existing capabilities while we change users - if err := system.SetKeepCaps(); err != nil { - return fmt.Errorf("set keep caps %s", err) - } - if err := setupUser(config); err != nil { - return fmt.Errorf("setup user %s", err) - } - if err := system.ClearKeepCaps(); err != nil { - return fmt.Errorf("clear keep caps %s", err) - } - // drop all other capabilities - if err := capabilities.DropCapabilities(config.Capabilities); err != nil { - return fmt.Errorf("drop capabilities %s", err) - } - if config.WorkingDir != "" { - if err := syscall.Chdir(config.WorkingDir); err != nil { - return fmt.Errorf("chdir to %s %s", config.WorkingDir, err) - } - } - return nil -} - -func loadContainerEnvironment(config *configs.Config) error { - os.Clearenv() - for _, pair := range config.Env { - p := strings.SplitN(pair, "=", 2) - if len(p) < 2 { - return fmt.Errorf("invalid environment '%v'", pair) - } - if err := os.Setenv(p[0], p[1]); err != nil { - return err - } - } - return nil -} - -// joinExistingNamespaces gets all the namespace paths specified for the container and -// does a setns on the namespace fd so that the current process joins the namespace. 
-func joinExistingNamespaces(namespaces []configs.Namespace) error { - for _, ns := range namespaces { - if ns.Path != "" { - f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0) - if err != nil { - return err - } - err = system.Setns(f.Fd(), uintptr(ns.Syscall())) - f.Close() - if err != nil { - return err - } - } - } - return nil -} - -// setupContainer is run to setup mounts and networking related operations -// for a user namespace enabled process as a user namespace root doesn't -// have permissions to perform these operations. -// The setup process joins all the namespaces of user namespace enabled init -// except the user namespace, so it run as root in the root user namespace -// to perform these operations. -func setupContainer(process *processArgs) error { - container := process.Config - networkState := process.NetworkState - - // TODO : move to validation - /* - rootfs, err := utils.ResolveRootfs(container.Rootfs) - if err != nil { - return err - } - */ - - // clear the current processes env and replace it with the environment - // defined on the container - if err := loadContainerEnvironment(container); err != nil { - return err - } - - cloneFlags := container.Namespaces.CloneFlags() - if (cloneFlags & syscall.CLONE_NEWNET) == 0 { - if len(container.Networks) != 0 || len(container.Routes) != 0 { - return fmt.Errorf("unable to apply network parameters without network namespace") - } - } else { - if err := setupNetwork(container, networkState); err != nil { - return fmt.Errorf("setup networking %s", err) - } - if err := setupRoute(container); err != nil { - return fmt.Errorf("setup route %s", err) - } - } - - label.Init() - - // InitializeMountNamespace() can be executed only for a new mount namespace - if (cloneFlags & syscall.CLONE_NEWNS) != 0 { - if err := mount.InitializeMountNamespace(container); err != nil { - return fmt.Errorf("setup mount namespace %s", err) - } - } - return nil -} - -// Finalize entering into a container and execute a specified command 
-func initIn(pipe *os.File) (err error) { - defer func() { - // if we have an error during the initialization of the container's init then send it back to the - // parent process in the form of an initError. - if err != nil { - // ensure that any data sent from the parent is consumed so it doesn't - // receive ECONNRESET when the child writes to the pipe. - ioutil.ReadAll(pipe) - if err := json.NewEncoder(pipe).Encode(initError{ - Message: err.Error(), - }); err != nil { - panic(err) - } - } - // ensure that this pipe is always closed - pipe.Close() - }() - decoder := json.NewDecoder(pipe) - var config *configs.Config - if err := decoder.Decode(&config); err != nil { - return err - } - var process *processArgs - if err := decoder.Decode(&process); err != nil { - return err - } - if err := finalizeSetns(config); err != nil { - return err - } - if err := system.Execv(process.Args[0], process.Args[0:], config.Env); err != nil { - return err - } - panic("unreachable") -} - -// finalize expects that the setns calls have been setup and that is has joined an -// existing namespace -func finalizeSetns(container *configs.Config) error { - // clear the current processes env and replace it with the environment defined on the container - if err := loadContainerEnvironment(container); err != nil { - return err - } - - if err := setupRlimits(container); err != nil { - return fmt.Errorf("setup rlimits %s", err) - } - - if err := finalizeNamespace(container); err != nil { - return err - } - - if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil { - return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err) - } - - if container.ProcessLabel != "" { - if err := label.SetProcessLabel(container.ProcessLabel); err != nil { - return err - } - } - - return nil -} diff --git a/linux_init.go b/linux_init.go new file mode 100644 index 00000000..b35257d0 --- /dev/null +++ b/linux_init.go @@ -0,0 +1,216 @@ +// +build linux + +package libcontainer + 
+import ( + "encoding/json" + "fmt" + "os" + "strings" + "syscall" + + "github.com/docker/libcontainer/configs" + "github.com/docker/libcontainer/netlink" + "github.com/docker/libcontainer/network" + "github.com/docker/libcontainer/security/capabilities" + "github.com/docker/libcontainer/system" + "github.com/docker/libcontainer/user" + "github.com/docker/libcontainer/utils" +) + +type initType string + +const ( + initSetns initType = "setns" + initStandard initType = "standard" + initUserns initType = "userns" + initUsernsSideCar initType = "userns_sidecar" +) + +// initConfig is used for transferring parameters from Exec() to Init() +type initConfig struct { + Args []string `json:"args,omitempty"` + Config *configs.Config `json:"config,omitempty"` + NetworkState *configs.NetworkState `json:"network_state,omitempty"` +} + +type initer interface { + Init() error +} + +func newContainerInit(t initType, pipe *os.File) (initer, error) { + var config *initConfig + if err := json.NewDecoder(pipe).Decode(&config); err != nil { + return nil, err + } + if err := populateProcessEnvironment(config.Config.Env); err != nil { + return nil, err + } + switch t { + case initSetns: + return &linuxSetnsInit{ + args: config.Args, + config: config.Config, + }, nil + case initUserns: + return &linuxUsernsInit{ + args: config.Args, + config: config.Config, + }, nil + case initUsernsSideCar: + return &linuxUsernsSideCar{ + config: config.Config, + network: config.NetworkState, + }, nil + case initStandard: + return &linuxStandardInit{ + config: config, + }, nil + } + return nil, fmt.Errorf("unknown init type %q", t) +} + +// populateProcessEnvironment loads the provided environment variables into the +// current process's environment. 
+func populateProcessEnvironment(env []string) error { + for _, pair := range env { + p := strings.SplitN(pair, "=", 2) + if len(p) < 2 { + return fmt.Errorf("invalid environment '%v'", pair) + } + if err := os.Setenv(p[0], p[1]); err != nil { + return err + } + } + return nil +} + +// finalizeNamespace drops the caps, sets the correct user +// and working dir, and closes any leaky file descriptors +// before execing the command inside the namespace +func finalizeNamespace(config *configs.Config) error { + // Ensure that all non-standard fds we may have accidentally + // inherited are marked close-on-exec so they stay out of the + // container + if err := utils.CloseExecFrom(3); err != nil { + return err + } + // drop capabilities in bounding set before changing user + if err := capabilities.DropBoundingSet(config.Capabilities); err != nil { + return err + } + // preserve existing capabilities while we change users + if err := system.SetKeepCaps(); err != nil { + return err + } + if err := setupUser(config); err != nil { + return err + } + if err := system.ClearKeepCaps(); err != nil { + return err + } + // drop all other capabilities + if err := capabilities.DropCapabilities(config.Capabilities); err != nil { + return err + } + if config.WorkingDir != "" { + if err := syscall.Chdir(config.WorkingDir); err != nil { + return fmt.Errorf("chdir to %s %s", config.WorkingDir, err) + } + } + return nil +} + +// joinExistingNamespaces gets all the namespace paths specified for the container and +// does a setns on the namespace fd so that the current process joins the namespace. 
+// Namespaces whose Path is empty are skipped; the first open or setns failure
+// is returned.
+func joinExistingNamespaces(namespaces []configs.Namespace) error {
+	for _, ns := range namespaces {
+		if ns.Path == "" {
+			continue
+		}
+		f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0)
+		if err != nil {
+			return err
+		}
+		err = system.Setns(f.Fd(), uintptr(ns.Syscall()))
+		// Close the namespace fd before looking at the setns result so the
+		// descriptor is released on both the success and the failure path.
+		f.Close()
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// setupUser changes the groups, gid, and uid for the user inside the container.
+//
+// The user is resolved from config.User with user.GetExecUserPath, falling
+// back to the current uid/gid and a home directory of "/". Supplementary
+// groups are the resolved user's groups plus config.AdditionalGroups. HOME is
+// set from the resolved user only when it is not already present in the
+// environment.
+func setupUser(config *configs.Config) error {
+	// Set up defaults.
+	defaultExecUser := user.ExecUser{
+		Uid:  syscall.Getuid(),
+		Gid:  syscall.Getgid(),
+		Home: "/",
+	}
+	passwdPath, err := user.GetPasswdPath()
+	if err != nil {
+		return err
+	}
+	groupPath, err := user.GetGroupPath()
+	if err != nil {
+		return err
+	}
+	execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
+	if err != nil {
+		return fmt.Errorf("get supplementary groups %s", err)
+	}
+	suppGroups := append(execUser.Sgids, config.AdditionalGroups...)
+	// Order matters: groups first, then gid, and uid last.
+	if err := syscall.Setgroups(suppGroups); err != nil {
+		return fmt.Errorf("setgroups %s", err)
+	}
+	if err := system.Setgid(execUser.Gid); err != nil {
+		return fmt.Errorf("setgid %s", err)
+	}
+	if err := system.Setuid(execUser.Uid); err != nil {
+		return fmt.Errorf("setuid %s", err)
+	}
+	// if we didn't get HOME already, set it based on the user's HOME
+	if envHome := os.Getenv("HOME"); envHome == "" {
+		if err := os.Setenv("HOME", execUser.Home); err != nil {
+			return fmt.Errorf("set HOME %s", err)
+		}
+	}
+	return nil
+}
+
+// setupNetwork initializes every network listed in config.Networks: it asks
+// network.GetStrategy for the strategy matching the network's Type and calls
+// that strategy's Initialize with the network and the shared networkState.
+// The first error stops the loop and is returned.
+//
+// NOTE(review): the previous comment described the veth case (renaming the
+// interface to eth0, setting the MTU, IP address and default gateway); that
+// work presumably lives in the veth strategy - confirm there.
+func setupNetwork(config *configs.Config, networkState *configs.NetworkState) error {
+	for _, nw := range config.Networks {
+		strategy, err := network.GetStrategy(nw.Type)
+		if err != nil {
+			return err
+		}
+		if err := strategy.Initialize(nw, networkState); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// setupRoute adds every route in config.Routes through netlink.AddRoute and
+// returns the first error.
+func setupRoute(config *configs.Config) error {
+	for _, route := range config.Routes {
+		if err := netlink.AddRoute(route.Destination, route.Source, route.Gateway, route.InterfaceName); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// setupRlimits applies every entry of config.Rlimits with setrlimit, using the
+// entry's Soft value as the current limit and its Hard value as the maximum.
+func setupRlimits(config *configs.Config) error {
+	for _, rlimit := range config.Rlimits {
+		l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
+		if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
+			return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
+		}
+	}
+	return nil
+}
diff --git a/linux_setns_init.go b/linux_setns_init.go
new file mode 100644
index 00000000..356e41b0
--- /dev/null
+++ b/linux_setns_init.go
@@ -0,0 +1,35 @@
+// +build linux
+
+package libcontainer
+
+import (
+	"github.com/docker/libcontainer/apparmor"
+	"github.com/docker/libcontainer/configs"
+	"github.com/docker/libcontainer/label"
+	"github.com/docker/libcontainer/system"
+)
+
+// linuxSetnsInit performs the container's initialization for running a new process
+// inside an existing container.
+type linuxSetnsInit struct {
+	args   []string
+	config *configs.Config
+}
+
+// Init applies the configured rlimits, finalizes the namespace, applies the
+// AppArmor profile, sets the process label when one is configured and then
+// execs args[0]. system.Execv ends in syscall.Exec, so Init only returns
+// when one of the steps fails.
+func (l *linuxSetnsInit) Init() error {
+	if err := setupRlimits(l.config); err != nil {
+		return err
+	}
+	if err := finalizeNamespace(l.config); err != nil {
+		return err
+	}
+	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
+		return err
+	}
+	if l.config.ProcessLabel != "" {
+		if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
+			return err
+		}
+	}
+	// NOTE(review): args[0] is indexed without a length check; an empty args
+	// slice would panic here - confirm callers always supply a command.
+	return system.Execv(l.args[0], l.args[0:], l.config.Env)
+}
diff --git a/linux_standard_init.go b/linux_standard_init.go
new file mode 100644
index 00000000..6c0a4661
--- /dev/null
+++ b/linux_standard_init.go
@@ -0,0 +1,90 @@
+// +build linux
+
+package libcontainer
+
+import (
+	"syscall"
+
+	"github.com/docker/libcontainer/apparmor"
+	"github.com/docker/libcontainer/configs"
+	consolepkg "github.com/docker/libcontainer/console"
+	"github.com/docker/libcontainer/label"
+	"github.com/docker/libcontainer/mount"
+	"github.com/docker/libcontainer/security/restrict"
+	"github.com/docker/libcontainer/system"
+)
+
+// linuxStandardInit is the initer built for the initStandard type. It keeps
+// the whole decoded initConfig: args, container config and network state.
+type linuxStandardInit struct {
+	config *initConfig
+}
+
+// Init runs the full setup sequence in this order: join namespaces given by
+// path, attach the console and start a new session, set up networks, routes
+// and rlimits, initialize the mount namespace (only when NEWNS is requested),
+// set the hostname, apply AppArmor and the process label, optionally
+// restrict proc entries, finalize the namespace, restore the parent death
+// signal and finally exec Args[0]. The first failing step aborts Init.
+func (l *linuxStandardInit) Init() error {
+	// join any namespaces via a path to the namespace fd if provided
+	if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil {
+		return err
+	}
+	console := l.config.Config.Console
+	if console != "" {
+		if err := consolepkg.OpenAndDup(console); err != nil {
+			return err
+		}
+	}
+	if _, err := syscall.Setsid(); err != nil {
+		return err
+	}
+	// the controlling terminal is only set when a console was configured
+	if console != "" {
+		if err := system.Setctty(); err != nil {
+			return err
+		}
+	}
+	if err := setupNetwork(l.config.Config, l.config.NetworkState); err != nil {
+		return err
+	}
+	if err := setupRoute(l.config.Config); err != nil {
+		return err
+	}
+	if err := setupRlimits(l.config.Config); err != nil {
+		return err
+	}
+	label.Init()
+	// InitializeMountNamespace() can be executed only for a new mount namespace
+	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
+		if err := mount.InitializeMountNamespace(l.config.Config); err != nil {
+			return err
+		}
+	}
+	if hostname := l.config.Config.Hostname; hostname != "" {
+		if err := syscall.Sethostname([]byte(hostname)); err != nil {
+			return err
+		}
+	}
+	if err := apparmor.ApplyProfile(l.config.Config.AppArmorProfile); err != nil {
+		return err
+	}
+	if err := label.SetProcessLabel(l.config.Config.ProcessLabel); err != nil {
+		return err
+	}
+	if l.config.Config.RestrictSys {
+		if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil {
+			return err
+		}
+	}
+	// remember the parent death signal before finalizeNamespace runs
+	pdeath, err := system.GetParentDeathSignal()
+	if err != nil {
+		return err
+	}
+	if err := finalizeNamespace(l.config.Config); err != nil {
+		return err
+	}
+	// finalizeNamespace can change user/group which clears the parent death
+	// signal, so we restore it here.
+	if err := pdeath.Restore(); err != nil {
+		return err
+	}
+	// Signal self if parent is already dead. Does nothing if running in a new
+	// PID namespace, as Getppid will always return 0.
+	if syscall.Getppid() == 1 {
+		return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
+	}
+	// NOTE(review): Args[0] is indexed without a length check - confirm the
+	// caller always supplies a command.
+	return system.Execv(l.config.Args[0], l.config.Args[0:], l.config.Config.Env)
+}
diff --git a/linux_userns_init.go b/linux_userns_init.go
new file mode 100644
index 00000000..cee2ebb5
--- /dev/null
+++ b/linux_userns_init.go
@@ -0,0 +1,80 @@
+// +build linux
+
+package libcontainer
+
+import (
+	"syscall"
+
+	"github.com/docker/libcontainer/apparmor"
+	"github.com/docker/libcontainer/configs"
+	consolepkg "github.com/docker/libcontainer/console"
+	"github.com/docker/libcontainer/label"
+	"github.com/docker/libcontainer/security/restrict"
+	"github.com/docker/libcontainer/system"
+)
+
+// linuxUsernsInit is the initer built for the initUserns type. Unlike
+// linuxStandardInit its Init does not set up networks, routes or the mount
+// namespace itself.
+type linuxUsernsInit struct {
+	args   []string
+	config *configs.Config
+}
+
+// Init joins namespaces given by path, attaches the console and starts a new
+// session, defaults the working directory to "/", applies rlimits, hostname,
+// AppArmor profile and process label, optionally restricts proc entries,
+// finalizes the namespace, restores the parent death signal and execs
+// args[0]. The first failing step aborts Init.
+func (l *linuxUsernsInit) Init() error {
+	// join any namespaces via a path to the namespace fd if provided
+	if err := joinExistingNamespaces(l.config.Namespaces); err != nil {
+		return err
+	}
+	console := l.config.Console
+	if console != "" {
+		// the fixed path /dev/console is opened here, not the configured
+		// console path
+		if err := consolepkg.OpenAndDup("/dev/console"); err != nil {
+			return err
+		}
+	}
+	if _, err := syscall.Setsid(); err != nil {
+		return err
+	}
+	if console != "" {
+		if err := system.Setctty(); err != nil {
+			return err
+		}
+	}
+	// finalizeNamespace only changes directory for a non-empty WorkingDir,
+	// so default it to the root
+	if l.config.WorkingDir == "" {
+		l.config.WorkingDir = "/"
+	}
+	if err := setupRlimits(l.config); err != nil {
+		return err
+	}
+	if hostname := l.config.Hostname; hostname != "" {
+		if err := syscall.Sethostname([]byte(hostname)); err != nil {
+			return err
+		}
+	}
+	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
+		return err
+	}
+	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
+		return err
+	}
+	if l.config.RestrictSys {
+		if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil {
+			return err
+		}
+	}
+	// remember the parent death signal before finalizeNamespace runs
+	pdeath, err := system.GetParentDeathSignal()
+	if err != nil {
+		return err
+	}
+	if err := finalizeNamespace(l.config); err != nil {
+		return err
+	}
+	// finalizeNamespace can change user/group which clears the parent death
+	// signal, so we restore it here.
+	if err := pdeath.Restore(); err != nil {
+		return err
+	}
+	// Signal self if parent is already dead. Does nothing if running in a new
+	// PID namespace, as Getppid will always return 0.
+	if syscall.Getppid() == 1 {
+		return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
+	}
+	// NOTE(review): args[0] is indexed without a length check - confirm the
+	// caller always supplies a command.
+	return system.Execv(l.args[0], l.args[0:], l.config.Env)
+}
diff --git a/linux_userns_sidecar_init.go b/linux_userns_sidecar_init.go
new file mode 100644
index 00000000..047a5b7f
--- /dev/null
+++ b/linux_userns_sidecar_init.go
@@ -0,0 +1,37 @@
+// +build linux
+
+package libcontainer
+
+import (
+	"github.com/docker/libcontainer/configs"
+	"github.com/docker/libcontainer/label"
+	"github.com/docker/libcontainer/mount"
+)
+
+// linuxUsernsSideCar is run to set up mounts and networking related operations
+// for a user namespace enabled process, as a user namespace root doesn't
+// have permissions to perform these operations.
+// The setup process joins all the namespaces of the user namespace enabled init
+// except the user namespace, so it runs as root in the root user namespace
+// to perform these operations.
+type linuxUsernsSideCar struct {
+	config  *configs.Config
+	network *configs.NetworkState
+}
+
+// Init sets up the configured networks and routes and then initializes the
+// mount namespace when the container requests a new one. It returns the first
+// error encountered and nil otherwise; unlike the other init types it does
+// not exec a command.
+func (l *linuxUsernsSideCar) Init() error {
+	if err := setupNetwork(l.config, l.network); err != nil {
+		return err
+	}
+	if err := setupRoute(l.config); err != nil {
+		return err
+	}
+	label.Init()
+	// InitializeMountNamespace() can be executed only for a new mount namespace,
+	// so gate it on NEWNS (the mount namespace), matching linuxStandardInit.
+	if l.config.Namespaces.Contains(configs.NEWNS) {
+		if err := mount.InitializeMountNamespace(l.config); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/nsenter/nsexec.c b/nsenter/nsexec.c
index 426dfc57..5b62729a 100644
--- a/nsenter/nsexec.c
+++ b/nsenter/nsexec.c
@@ -22,14 +22,14 @@ struct clone_arg {
 	 * Reserve some space for clone() to locate arguments
 	 * and retcode in this place
 	 */
-	char stack[4096] __attribute__((aligned (8)));
+	char stack[4096] __attribute__ ((aligned(8)));
 	char stack_ptr[0];
 	jmp_buf *env;
 };
 
 static int child_func(void *_arg)
 {
-	struct clone_arg *arg = (struct clone_arg *) _arg;
+	struct clone_arg *arg = (struct clone_arg *)_arg;
 	longjmp(*arg->env, 1);
 }
 
@@ -47,8 +47,8 @@ int setns(int fd, int nstype)
 #endif
 #endif
 
-static int clone_parent(jmp_buf *env) __attribute__ ((noinline));
-static int clone_parent(jmp_buf *env)
+static int clone_parent(jmp_buf * env) __attribute__ ((noinline));
+static int clone_parent(jmp_buf * env)
 {
 	struct clone_arg ca;
 	int child;
@@ -100,7 +100,8 @@ void nsexec()
 
 		fd = openat(tfd, namespaces[i], O_RDONLY);
 		if (fd == -1) {
-			pr_perror("Failed to open ns file %s for ns %s", buf, namespaces[i]);
+			pr_perror("Failed to open ns file %s for ns %s", buf,
+				  namespaces[i]);
 			exit(1);
 		}
 		// Set the namespace.
diff --git a/stacktrace/capture.go b/stacktrace/capture.go
index 9fc75f8a..15b3482c 100644
--- a/stacktrace/capture.go
+++ b/stacktrace/capture.go
@@ -10,7 +10,6 @@ func Capture(userSkip int) Stacktrace {
 		skip   = userSkip + 1 // add one for our own function
 		frames []Frame
 	)
-
 	for i := skip; ; i++ {
 		pc, file, line, ok := runtime.Caller(i)
 		if !ok {
@@ -18,7 +17,6 @@ func Capture(userSkip int) Stacktrace {
 		}
 		frames = append(frames, NewFrame(pc, file, line))
 	}
-
 	return Stacktrace{
 		Frames: frames,
 	}
diff --git a/system/linux.go b/system/linux.go
index c07ef153..2cc3ef80 100644
--- a/system/linux.go
+++ b/system/linux.go
@@ -8,6 +8,26 @@ import (
 	"unsafe"
 )
 
+// ParentDeathSignal is the signal number read or written through prctl's
+// PR_GET_PDEATHSIG / PR_SET_PDEATHSIG. The zero value means no signal is set.
+type ParentDeathSignal int
+
+// Restore re-applies p when the process's current parent death signal differs
+// from it. It does nothing when p is 0 or when the current value already
+// equals p, and returns any error from reading or setting the signal.
+func (p ParentDeathSignal) Restore() error {
+	if p == 0 {
+		return nil
+	}
+	current, err := GetParentDeathSignal()
+	if err != nil {
+		return err
+	}
+	if p == current {
+		return nil
+	}
+	return p.Set()
+}
+
+// Set makes p the current process's parent death signal.
+func (p ParentDeathSignal) Set() error {
+	return SetParentDeathSignal(uintptr(p))
+}
+
 func Execv(cmd string, args []string, env []string) error {
 	name, err := exec.LookPath(cmd)
 	if err != nil {
@@ -17,23 +37,20 @@ func Execv(cmd string, args []string, env []string) error {
 	return syscall.Exec(name, args, env)
 }
 
-func ParentDeathSignal(sig uintptr) error {
+// SetParentDeathSignal sets the parent death signal to sig with
+// prctl(PR_SET_PDEATHSIG) and returns the errno when the call fails.
+func SetParentDeathSignal(sig uintptr) error {
 	if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, sig, 0); err != 0 {
 		return err
 	}
 	return nil
 }
 
-func GetParentDeathSignal() (int, error) {
+// GetParentDeathSignal reads the current parent death signal with
+// prctl(PR_GET_PDEATHSIG). On failure it returns -1 and the errno.
+func GetParentDeathSignal() (ParentDeathSignal, error) {
 	var sig int
-
 	_, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0)
-
 	if err != 0 {
 		return -1, err
 	}
-
-	return sig, nil
+	return ParentDeathSignal(sig), nil
 }
 
 func SetKeepCaps() error {