2014-02-20 08:50:10 +08:00
|
|
|
// +build linux
|
|
|
|
|
2014-06-05 06:47:57 +08:00
|
|
|
package namespaces
|
2014-02-19 09:52:06 +08:00
|
|
|
|
|
|
|
import (
|
2014-11-04 10:18:55 +08:00
|
|
|
"encoding/json"
|
2014-02-19 09:52:06 +08:00
|
|
|
"fmt"
|
2014-11-04 10:18:55 +08:00
|
|
|
"io/ioutil"
|
2014-03-04 13:46:49 +08:00
|
|
|
"os"
|
2014-05-01 08:55:15 +08:00
|
|
|
"strings"
|
2014-03-04 13:46:49 +08:00
|
|
|
"syscall"
|
|
|
|
|
2014-06-10 23:14:16 +08:00
|
|
|
"github.com/docker/libcontainer/apparmor"
|
2014-12-17 17:12:23 +08:00
|
|
|
"github.com/docker/libcontainer/configs"
|
2014-06-10 23:14:16 +08:00
|
|
|
"github.com/docker/libcontainer/console"
|
|
|
|
"github.com/docker/libcontainer/label"
|
|
|
|
"github.com/docker/libcontainer/mount"
|
|
|
|
"github.com/docker/libcontainer/netlink"
|
|
|
|
"github.com/docker/libcontainer/network"
|
|
|
|
"github.com/docker/libcontainer/security/capabilities"
|
|
|
|
"github.com/docker/libcontainer/security/restrict"
|
2014-07-15 07:55:49 +08:00
|
|
|
"github.com/docker/libcontainer/system"
|
2014-07-17 16:06:55 +08:00
|
|
|
"github.com/docker/libcontainer/user"
|
2014-06-10 23:14:16 +08:00
|
|
|
"github.com/docker/libcontainer/utils"
|
2014-02-19 09:52:06 +08:00
|
|
|
)
|
|
|
|
|
2014-12-19 17:40:03 +08:00
|
|
|
// processArgs carries the per-process parameters that Init decodes from its
// pipe (the original comment describes them as being transferred from Exec()
// to Init()).
type processArgs struct {
	// Args is the command line to exec; Init uses Args[0] as the program.
	Args []string `json:"args,omitempty"`
	// Env is the environment handed to the exec'd program.
	Env []string `json:"environment,omitempty"`
	// ConsolePath, when non-empty, is the console Init opens and dups and
	// then makes the controlling terminal.
	ConsolePath string `json:"console_path,omitempty"`
}
|
|
|
|
|
2014-06-20 07:36:39 +08:00
|
|
|
// TODO(vishh): This is part of the libcontainer API and it does much more than just namespaces related work.
|
|
|
|
// Move this to libcontainer package.
|
2014-02-21 10:27:42 +08:00
|
|
|
// Init is the init process that first runs inside a new namespace to setup mounts, users, networking,
|
|
|
|
// and other options required for the new container.
|
2014-08-02 05:47:15 +08:00
|
|
|
// The caller of Init function has to ensure that the go runtime is locked to an OS thread
|
|
|
|
// (using runtime.LockOSThread) else system calls like setns called within Init may not work as intended.
|
2014-12-19 17:40:03 +08:00
|
|
|
func Init(pipe *os.File) (err error) {
|
2014-06-16 21:30:42 +08:00
|
|
|
defer func() {
|
2014-11-04 10:18:55 +08:00
|
|
|
// if we have an error during the initialization of the container's init then send it back to the
|
|
|
|
// parent process in the form of an initError.
|
2014-06-16 21:30:42 +08:00
|
|
|
if err != nil {
|
2014-11-04 10:18:55 +08:00
|
|
|
// ensure that any data sent from the parent is consumed so it doesn't
|
|
|
|
// receive ECONNRESET when the child writes to the pipe.
|
|
|
|
ioutil.ReadAll(pipe)
|
|
|
|
if err := json.NewEncoder(pipe).Encode(initError{
|
|
|
|
Message: err.Error(),
|
|
|
|
}); err != nil {
|
|
|
|
panic(err)
|
|
|
|
}
|
2014-06-16 21:30:42 +08:00
|
|
|
}
|
2014-11-04 10:18:55 +08:00
|
|
|
// ensure that this pipe is always closed
|
|
|
|
pipe.Close()
|
2014-06-16 21:30:42 +08:00
|
|
|
}()
|
|
|
|
|
2014-12-19 17:40:03 +08:00
|
|
|
decoder := json.NewDecoder(pipe)
|
|
|
|
|
|
|
|
var container *configs.Config
|
|
|
|
if err := decoder.Decode(&container); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
var process *processArgs
|
|
|
|
if err := decoder.Decode(&process); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
uncleanRootfs, err := os.Getwd()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2014-02-25 13:11:52 +08:00
|
|
|
rootfs, err := utils.ResolveRootfs(uncleanRootfs)
|
2014-02-20 06:33:25 +08:00
|
|
|
if err != nil {
|
2014-02-20 08:40:36 +08:00
|
|
|
return err
|
2014-02-20 06:33:25 +08:00
|
|
|
}
|
|
|
|
|
2014-05-01 08:55:15 +08:00
|
|
|
// clear the current processes env and replace it with the environment
|
|
|
|
// defined on the container
|
|
|
|
if err := LoadContainerEnvironment(container); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2014-02-21 06:12:08 +08:00
|
|
|
// We always read this as it is a way to sync with the parent as well
|
2014-08-07 09:00:52 +08:00
|
|
|
var networkState *network.NetworkState
|
2014-12-19 17:40:03 +08:00
|
|
|
if err := decoder.Decode(&networkState); err != nil {
|
2014-02-21 06:12:08 +08:00
|
|
|
return err
|
2014-02-20 07:33:44 +08:00
|
|
|
}
|
2014-11-25 06:39:32 +08:00
|
|
|
// join any namespaces via a path to the namespace fd if provided
|
|
|
|
if err := joinExistingNamespaces(container.Namespaces); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2014-12-19 17:40:03 +08:00
|
|
|
if process.ConsolePath != "" {
|
|
|
|
if err := console.OpenAndDup(process.ConsolePath); err != nil {
|
2014-04-11 23:15:28 +08:00
|
|
|
return err
|
2014-02-21 10:05:40 +08:00
|
|
|
}
|
2014-02-19 09:52:06 +08:00
|
|
|
}
|
2014-07-15 07:55:49 +08:00
|
|
|
if _, err := syscall.Setsid(); err != nil {
|
2014-02-20 08:40:36 +08:00
|
|
|
return fmt.Errorf("setsid %s", err)
|
2014-02-19 09:52:06 +08:00
|
|
|
}
|
2014-12-19 17:40:03 +08:00
|
|
|
if process.ConsolePath != "" {
|
2014-02-21 10:05:40 +08:00
|
|
|
if err := system.Setctty(); err != nil {
|
|
|
|
return fmt.Errorf("setctty %s", err)
|
|
|
|
}
|
2014-02-19 09:52:06 +08:00
|
|
|
}
|
2014-11-25 06:39:32 +08:00
|
|
|
|
2014-06-26 06:51:28 +08:00
|
|
|
if err := setupNetwork(container, networkState); err != nil {
|
2014-03-04 13:46:49 +08:00
|
|
|
return fmt.Errorf("setup networking %s", err)
|
|
|
|
}
|
2014-05-17 15:06:29 +08:00
|
|
|
if err := setupRoute(container); err != nil {
|
|
|
|
return fmt.Errorf("setup route %s", err)
|
|
|
|
}
|
2014-04-01 22:03:29 +08:00
|
|
|
|
2014-11-27 02:16:53 +08:00
|
|
|
if err := setupRlimits(container); err != nil {
|
|
|
|
return fmt.Errorf("setup rlimits %s", err)
|
|
|
|
}
|
|
|
|
|
2014-04-01 22:03:29 +08:00
|
|
|
label.Init()
|
2014-05-01 08:18:07 +08:00
|
|
|
|
2014-06-24 05:11:01 +08:00
|
|
|
if err := mount.InitializeMountNamespace(rootfs,
|
2014-12-19 17:40:03 +08:00
|
|
|
process.ConsolePath,
|
2014-07-19 05:13:38 +08:00
|
|
|
container.RestrictSys,
|
2014-06-24 06:28:43 +08:00
|
|
|
(*mount.MountConfig)(container.MountConfig)); err != nil {
|
2014-02-20 08:40:36 +08:00
|
|
|
return fmt.Errorf("setup mount namespace %s", err)
|
2014-02-19 09:52:06 +08:00
|
|
|
}
|
2014-06-25 08:31:03 +08:00
|
|
|
|
2014-05-06 02:12:25 +08:00
|
|
|
if container.Hostname != "" {
|
2014-07-15 07:55:49 +08:00
|
|
|
if err := syscall.Sethostname([]byte(container.Hostname)); err != nil {
|
2014-11-15 09:46:11 +08:00
|
|
|
return fmt.Errorf("unable to sethostname %q: %s", container.Hostname, err)
|
2014-05-06 02:12:25 +08:00
|
|
|
}
|
2014-02-19 09:52:06 +08:00
|
|
|
}
|
|
|
|
|
2014-06-25 08:31:03 +08:00
|
|
|
if err := apparmor.ApplyProfile(container.AppArmorProfile); err != nil {
|
|
|
|
return fmt.Errorf("set apparmor profile %s: %s", container.AppArmorProfile, err)
|
2014-05-02 10:09:12 +08:00
|
|
|
}
|
2014-06-25 08:31:03 +08:00
|
|
|
|
|
|
|
if err := label.SetProcessLabel(container.ProcessLabel); err != nil {
|
2014-05-02 10:09:12 +08:00
|
|
|
return fmt.Errorf("set process label %s", err)
|
|
|
|
}
|
2014-06-25 08:31:03 +08:00
|
|
|
|
|
|
|
// TODO: (crosbymichael) make this configurable at the Config level
|
|
|
|
if container.RestrictSys {
|
2014-07-19 05:13:38 +08:00
|
|
|
if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus"); err != nil {
|
2014-05-01 09:00:42 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2014-05-14 17:49:06 +08:00
|
|
|
|
|
|
|
pdeathSignal, err := system.GetParentDeathSignal()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("get parent death signal %s", err)
|
|
|
|
}
|
|
|
|
|
2014-05-01 09:00:42 +08:00
|
|
|
if err := FinalizeNamespace(container); err != nil {
|
|
|
|
return fmt.Errorf("finalize namespace %s", err)
|
|
|
|
}
|
2014-05-14 17:49:06 +08:00
|
|
|
|
|
|
|
// FinalizeNamespace can change user/group which clears the parent death
|
|
|
|
// signal, so we restore it here.
|
|
|
|
if err := RestoreParentDeathSignal(pdeathSignal); err != nil {
|
|
|
|
return fmt.Errorf("restore parent death signal %s", err)
|
|
|
|
}
|
|
|
|
|
2014-12-19 17:40:03 +08:00
|
|
|
return system.Execv(process.Args[0], process.Args[0:], process.Env)
|
2014-02-22 14:58:30 +08:00
|
|
|
}
|
|
|
|
|
2014-05-14 17:49:06 +08:00
|
|
|
// RestoreParentDeathSignal sets the parent death signal to old.
|
|
|
|
func RestoreParentDeathSignal(old int) error {
|
|
|
|
if old == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
current, err := system.GetParentDeathSignal()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("get parent death signal %s", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if old == current {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if err := system.ParentDeathSignal(uintptr(old)); err != nil {
|
|
|
|
return fmt.Errorf("set parent death signal %s", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Signal self if parent is already dead. Does nothing if running in a new
|
|
|
|
// PID namespace, as Getppid will always return 0.
|
|
|
|
if syscall.Getppid() == 1 {
|
2014-05-28 04:38:24 +08:00
|
|
|
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
|
2014-05-14 17:49:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2014-05-01 06:27:59 +08:00
|
|
|
// SetupUser changes the groups, gid, and uid for the user inside the container
|
|
|
|
func SetupUser(u string) error {
|
2014-08-15 02:19:17 +08:00
|
|
|
// Set up defaults.
|
|
|
|
defaultExecUser := user.ExecUser{
|
|
|
|
Uid: syscall.Getuid(),
|
|
|
|
Gid: syscall.Getgid(),
|
|
|
|
Home: "/",
|
|
|
|
}
|
|
|
|
|
2014-08-29 16:43:40 +08:00
|
|
|
passwdFile, err := user.GetPasswdFile()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
groupFile, err := user.GetGroupFile()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
execUser, err := user.GetExecUserFile(u, &defaultExecUser, passwdFile, groupFile)
|
2014-04-29 06:46:03 +08:00
|
|
|
if err != nil {
|
2014-05-01 06:27:59 +08:00
|
|
|
return fmt.Errorf("get supplementary groups %s", err)
|
2014-04-29 06:46:03 +08:00
|
|
|
}
|
2014-07-15 07:55:49 +08:00
|
|
|
|
2014-08-15 02:19:17 +08:00
|
|
|
if err := syscall.Setgroups(execUser.Sgids); err != nil {
|
2014-04-29 06:46:03 +08:00
|
|
|
return fmt.Errorf("setgroups %s", err)
|
|
|
|
}
|
2014-07-15 07:55:49 +08:00
|
|
|
|
2014-08-15 02:19:17 +08:00
|
|
|
if err := system.Setgid(execUser.Gid); err != nil {
|
2014-04-29 06:46:03 +08:00
|
|
|
return fmt.Errorf("setgid %s", err)
|
|
|
|
}
|
2014-07-15 07:55:49 +08:00
|
|
|
|
2014-08-15 02:19:17 +08:00
|
|
|
if err := system.Setuid(execUser.Uid); err != nil {
|
2014-04-29 06:46:03 +08:00
|
|
|
return fmt.Errorf("setuid %s", err)
|
2014-02-19 09:52:06 +08:00
|
|
|
}
|
2014-07-15 07:55:49 +08:00
|
|
|
|
2014-07-17 16:06:55 +08:00
|
|
|
// if we didn't get HOME already, set it based on the user's HOME
|
|
|
|
if envHome := os.Getenv("HOME"); envHome == "" {
|
2014-08-15 02:19:17 +08:00
|
|
|
if err := os.Setenv("HOME", execUser.Home); err != nil {
|
2014-07-17 16:06:55 +08:00
|
|
|
return fmt.Errorf("set HOME %s", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-02-19 09:52:06 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2014-02-20 14:43:40 +08:00
|
|
|
// setupVethNetwork uses the Network config if it is not nil to initialize
|
|
|
|
// the new veth interface inside the container for use by changing the name to eth0
|
|
|
|
// setting the MTU and IP address along with the default gateway
|
2014-12-17 17:12:23 +08:00
|
|
|
func setupNetwork(container *configs.Config, networkState *network.NetworkState) error {
|
2014-02-27 06:19:39 +08:00
|
|
|
for _, config := range container.Networks {
|
2014-02-22 14:20:15 +08:00
|
|
|
strategy, err := network.GetStrategy(config.Type)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
2014-02-20 06:55:34 +08:00
|
|
|
}
|
2014-03-16 08:01:31 +08:00
|
|
|
|
2014-06-26 06:51:28 +08:00
|
|
|
err1 := strategy.Initialize((*network.Network)(config), networkState)
|
2014-03-16 08:01:31 +08:00
|
|
|
if err1 != nil {
|
|
|
|
return err1
|
|
|
|
}
|
2014-02-20 02:44:29 +08:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
2014-03-04 04:15:47 +08:00
|
|
|
|
2014-12-17 17:12:23 +08:00
|
|
|
func setupRoute(container *configs.Config) error {
|
2014-05-17 15:06:29 +08:00
|
|
|
for _, config := range container.Routes {
|
|
|
|
if err := netlink.AddRoute(config.Destination, config.Source, config.Gateway, config.InterfaceName); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2014-12-17 17:12:23 +08:00
|
|
|
func setupRlimits(container *configs.Config) error {
|
2014-11-27 02:16:53 +08:00
|
|
|
for _, rlimit := range container.Rlimits {
|
|
|
|
l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
|
|
|
|
if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
|
|
|
|
return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2014-05-01 08:18:07 +08:00
|
|
|
// FinalizeNamespace drops the caps, sets the correct user
|
2014-04-29 13:22:54 +08:00
|
|
|
// and working dir, and closes any leaky file descriptors
|
|
|
|
// before execing the command inside the namespace
|
2014-12-17 17:12:23 +08:00
|
|
|
func FinalizeNamespace(container *configs.Config) error {
|
2014-06-13 21:56:58 +08:00
|
|
|
// Ensure that all non-standard fds we may have accidentally
|
|
|
|
// inherited are marked close-on-exec so they stay out of the
|
|
|
|
// container
|
|
|
|
if err := utils.CloseExecFrom(3); err != nil {
|
2014-04-29 13:22:54 +08:00
|
|
|
return fmt.Errorf("close open file descriptors %s", err)
|
|
|
|
}
|
2014-05-28 22:41:48 +08:00
|
|
|
|
|
|
|
// drop capabilities in bounding set before changing user
|
2014-06-24 02:30:25 +08:00
|
|
|
if err := capabilities.DropBoundingSet(container.Capabilities); err != nil {
|
2014-05-28 22:41:48 +08:00
|
|
|
return fmt.Errorf("drop bounding set %s", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// preserve existing capabilities while we change users
|
|
|
|
if err := system.SetKeepCaps(); err != nil {
|
|
|
|
return fmt.Errorf("set keep caps %s", err)
|
|
|
|
}
|
|
|
|
|
2014-05-01 06:27:59 +08:00
|
|
|
if err := SetupUser(container.User); err != nil {
|
2014-03-04 04:15:47 +08:00
|
|
|
return fmt.Errorf("setup user %s", err)
|
|
|
|
}
|
2014-05-28 22:41:48 +08:00
|
|
|
|
|
|
|
if err := system.ClearKeepCaps(); err != nil {
|
|
|
|
return fmt.Errorf("clear keep caps %s", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// drop all other capabilities
|
2014-06-24 02:30:25 +08:00
|
|
|
if err := capabilities.DropCapabilities(container.Capabilities); err != nil {
|
2014-05-28 22:41:48 +08:00
|
|
|
return fmt.Errorf("drop capabilities %s", err)
|
|
|
|
}
|
|
|
|
|
2014-03-04 04:15:47 +08:00
|
|
|
if container.WorkingDir != "" {
|
2014-07-15 07:55:49 +08:00
|
|
|
if err := syscall.Chdir(container.WorkingDir); err != nil {
|
2014-03-04 04:15:47 +08:00
|
|
|
return fmt.Errorf("chdir to %s %s", container.WorkingDir, err)
|
|
|
|
}
|
|
|
|
}
|
2014-06-13 21:56:58 +08:00
|
|
|
|
2014-03-04 04:15:47 +08:00
|
|
|
return nil
|
|
|
|
}
|
2014-05-01 08:55:15 +08:00
|
|
|
|
2014-12-17 17:12:23 +08:00
|
|
|
func LoadContainerEnvironment(container *configs.Config) error {
|
2014-05-01 08:55:15 +08:00
|
|
|
os.Clearenv()
|
|
|
|
for _, pair := range container.Env {
|
|
|
|
p := strings.SplitN(pair, "=", 2)
|
2014-05-16 12:36:15 +08:00
|
|
|
if len(p) < 2 {
|
|
|
|
return fmt.Errorf("invalid environment '%v'", pair)
|
|
|
|
}
|
2014-05-01 08:55:15 +08:00
|
|
|
if err := os.Setenv(p[0], p[1]); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
2014-11-25 06:39:32 +08:00
|
|
|
|
|
|
|
// joinExistingNamespaces gets all the namespace paths specified for the container and
|
|
|
|
// does a setns on the namespace fd so that the current process joins the namespace.
|
2014-12-17 17:12:23 +08:00
|
|
|
func joinExistingNamespaces(namespaces []configs.Namespace) error {
|
2014-11-25 06:39:32 +08:00
|
|
|
for _, ns := range namespaces {
|
|
|
|
if ns.Path != "" {
|
|
|
|
f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2014-12-04 07:47:26 +08:00
|
|
|
err = system.Setns(f.Fd(), uintptr(namespaceInfo[ns.Name]))
|
2014-11-25 06:39:32 +08:00
|
|
|
f.Close()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|