diff --git a/capabilities/capabilities.go b/capabilities/capabilities.go index c19b7195..65fd455c 100644 --- a/capabilities/capabilities.go +++ b/capabilities/capabilities.go @@ -41,6 +41,7 @@ func DropCapabilities(container *libcontainer.Container) error { return nil } +// getCapabilities returns the specific cap values for the libcontainer types func getCapabilities(container *libcontainer.Container) []capability.Cap { drop := []capability.Cap{} for _, c := range container.Capabilities { diff --git a/container.go b/container.go index 763526f6..a6a57dab 100644 --- a/container.go +++ b/container.go @@ -1,16 +1,22 @@ package libcontainer +// Container defines configuration options for how a +// container is setup inside a directory and how a process should be executed type Container struct { - Hostname string `json:"hostname,omitempty"` - ReadonlyFs bool `json:"readonly_fs,omitempty"` - User string `json:"user,omitempty"` - WorkingDir string `json:"working_dir,omitempty"` - Env []string `json:"environment,omitempty"` - Namespaces Namespaces `json:"namespaces,omitempty"` - Capabilities Capabilities `json:"capabilities,omitempty"` - Network *Network `json:"network,omitempty"` + Hostname string `json:"hostname,omitempty"` // hostname + ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly + User string `json:"user,omitempty"` // user to execute the process as + WorkingDir string `json:"working_dir,omitempty"` // current working directory + Env []string `json:"environment,omitempty"` // environment to set + Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply + Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop + Network *Network `json:"network,omitempty"` // nil for host's network stack } +// Network defines configuration for a container's networking stack +// +// The network configuration can be omited from a container causing the +// container to be setup with the host's networking stack type Network struct { IP string `json:"ip,omitempty"` Gateway string `json:"gateway,omitempty"` diff --git a/nsinit/exec.go b/nsinit/exec.go index 67f907af..202cfcab 100644 --- a/nsinit/exec.go +++ b/nsinit/exec.go @@ -38,7 +38,7 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) defer deletePidFile() if container.Network != nil { - vethPair, err := setupVeth(container.Network.Bridge, command.Process.Pid) + vethPair, err := initializeContainerVeth(container.Network.Bridge, command.Process.Pid) if err != nil { return -1, err } @@ -63,14 +63,21 @@ func execCommand(container *libcontainer.Container, args []string) (int, error) return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil } +// sendVethName writes the veth pair name to the child's stdin then closes the +// pipe so that the child stops waiting for more data func sendVethName(name string, pipe io.WriteCloser) { - // write the veth pair name to the child's stdin then close the - // pipe so that the child stops waiting fmt.Fprint(pipe, name) pipe.Close() } -func setupVeth(bridge string, nspid int) (string, error) { +// initializeContainerVeth will create a veth pair and setup the host's +// side of the pair by setting the specified bridge as the master and bringing +// up the interface. +// +// Then will with set the other side of the veth pair into the container's namespaced +// using the pid and returns the veth's interface name to provide to the container to +// finish setting up the interface inside the namespace +func initializeContainerVeth(bridge string, nspid int) (string, error) { name1, name2, err := createVethPair() if err != nil { return "", err @@ -98,6 +105,8 @@ func setupWindow(master *os.File) (*term.State, error) { return term.SetRawTerminal(os.Stdin.Fd()) } +// createMasterAndConsole will open /dev/ptmx on the host and retreive the +// pts name for use as the pty slave inside the container func createMasterAndConsole() (*os.File, string, error) { master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) if err != nil { @@ -113,6 +122,8 @@ func createMasterAndConsole() (*os.File, string, error) { return master, console, nil } +// createVethPair will automatically generage two random names for +// the veth pair and ensure that they have been created func createVethPair() (name1 string, name2 string, err error) { name1, err = utils.GenerateRandomName("dock", 4) if err != nil { @@ -128,6 +139,7 @@ func createVethPair() (name1 string, name2 string, err error) { return } +// writePidFile writes the namespaced processes pid to .nspid in the rootfs for the container func writePidFile(command *exec.Cmd) error { return ioutil.WriteFile(".nspid", []byte(fmt.Sprint(command.Process.Pid)), 0655) } @@ -136,6 +148,9 @@ func deletePidFile() error { return os.Remove(".nspid") } +// createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces +// defined on the container's configuration and use the current binary as the init with the +// args provided func createCommand(container *libcontainer.Container, console string, args []string) *exec.Cmd { command := exec.Command("nsinit", append([]string{"init", console}, args...)...) command.SysProcAttr = &syscall.SysProcAttr{ diff --git a/nsinit/execin.go b/nsinit/execin.go index 7f32620c..d6224f95 100644 --- a/nsinit/execin.go +++ b/nsinit/execin.go @@ -28,6 +28,7 @@ func execinCommand(container *libcontainer.Container, nspid int, args []string) return -1, err } + // foreach namespace fd, use setns to join an existing container's namespaces for _, fd := range fds { if fd > 0 { if err := system.Setns(fd, 0); err != nil { @@ -42,7 +43,6 @@ func execinCommand(container *libcontainer.Container, nspid int, args []string) // remount proc and sys to pick up the changes if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) && container.Namespaces.Contains(libcontainer.CLONE_NEWPID) { - pid, err := system.Fork() if err != nil { return -1, err @@ -58,12 +58,7 @@ func execinCommand(container *libcontainer.Container, nspid int, args []string) if err := remountSys(); err != nil { return -1, fmt.Errorf("remount sys %s", err) } - if err := capabilities.DropCapabilities(container); err != nil { - return -1, fmt.Errorf("drop capabilities %s", err) - } - if err := system.Exec(args[0], args[0:], container.Env); err != nil { - return -1, err - } + goto dropAndExec } proc, err := os.FindProcess(pid) if err != nil { @@ -75,6 +70,7 @@ func execinCommand(container *libcontainer.Container, nspid int, args []string) } os.Exit(state.Sys().(syscall.WaitStatus).ExitStatus()) } +dropAndExec: if err := capabilities.DropCapabilities(container); err != nil { return -1, fmt.Errorf("drop capabilities %s", err) } diff --git a/nsinit/init.go b/nsinit/init.go index 82706fda..c77fd904 100644 --- a/nsinit/init.go +++ b/nsinit/init.go @@ -37,9 +37,6 @@ func initCommand(container *libcontainer.Container, console string, args []strin if err != nil { return fmt.Errorf("open terminal %s", err) } - if slave.Fd() != 0 { - return fmt.Errorf("slave fd should be 0") - } if err := dupSlave(slave); err != nil { return fmt.Errorf("dup2 slave %s", err) } @@ -55,7 +52,7 @@ func initCommand(container *libcontainer.Container, console string, args []strin if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { return fmt.Errorf("setup mount namespace %s", err) } - if err := setupNetworking(container.Network, tempVethName); err != nil { + if err := setupVethNetwork(container.Network, tempVethName); err != nil { return fmt.Errorf("setup networking %s", err) } if err := system.Sethostname(container.Hostname); err != nil { @@ -78,6 +75,8 @@ func initCommand(container *libcontainer.Container, console string, args []strin panic("unreachable") } +// resolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs func resolveRootfs() (string, error) { cwd, err := os.Getwd() if err != nil { @@ -104,8 +103,9 @@ func setupUser(container *libcontainer.Container) error { return nil } +// dupSlave dup2 the pty slave's fd into stdout and stdin and ensures that +// the slave's fd is 0, or stdin func dupSlave(slave *os.File) error { - // we close Stdin,etc so our pty slave should have fd 0 if slave.Fd() != 0 { return fmt.Errorf("slave fd not 0 %d", slave.Fd()) } @@ -118,7 +118,8 @@ func dupSlave(slave *os.File) error { return nil } -// openTerminal is a clone of os.OpenFile without the O_CLOEXEC addition. +// openTerminal is a clone of os.OpenFile without the O_CLOEXEC +// used to open the pty slave inside the container namespace func openTerminal(name string, flag int) (*os.File, error) { r, e := syscall.Open(name, flag, 0) if e != nil { @@ -127,7 +128,10 @@ func openTerminal(name string, flag int) (*os.File, error) { return os.NewFile(uintptr(r), name), nil } -func setupNetworking(config *libcontainer.Network, tempVethName string) error { +// setupVethNetwork uses the Network config if it is not nil to initialize +// the new veth interface inside the container for use by changing the name to eth0 +// setting the MTU and IP address along with the default gateway +func setupVethNetwork(config *libcontainer.Network, tempVethName string) error { if config != nil { if err := network.InterfaceDown(tempVethName); err != nil { return fmt.Errorf("interface down %s %s", tempVethName, err) diff --git a/nsinit/main.go b/nsinit/main.go index 30c8b064..f45fe556 100644 --- a/nsinit/main.go +++ b/nsinit/main.go @@ -26,7 +26,7 @@ func main() { log.Fatal(ErrWrongArguments) } switch os.Args[1] { - case "exec": + case "exec": // this is executed outside of the namespace in the cwd var exitCode int nspid, err := readPid() if err != nil { @@ -43,7 +43,7 @@ func main() { log.Fatal(err) } os.Exit(exitCode) - case "init": + case "init": // this is executed inside of the namespace to setup the container if argc < 3 { log.Fatal(ErrWrongArguments) } diff --git a/nsinit/mount.go b/nsinit/mount.go index baa850f0..6eb2e090 100644 --- a/nsinit/mount.go +++ b/nsinit/mount.go @@ -10,10 +10,16 @@ import ( "syscall" ) -// default mount point options +// default mount point flags const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV +// setupNewMountNamespace is used to initialize a new mount namespace for an new +// container in the rootfs that is specified. +// +// There is no need to unmount the new mounts because as soon as the mount namespace +// is no longer in use, the mounts will be removed automatically func setupNewMountNamespace(rootfs, console string, readonly bool) error { + // mount as slave so that the new mounts do not propagate to the host if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { return fmt.Errorf("mounting / as slave %s", err) } @@ -55,6 +61,7 @@ func setupNewMountNamespace(rootfs, console string, readonly bool) error { return nil } +// copyDevNodes mknods the hosts devices so the new container has access to them func copyDevNodes(rootfs string) error { oldMask := system.Umask(0000) defer system.Umask(oldMask) @@ -82,6 +89,8 @@ func copyDevNodes(rootfs string) error { return nil } +// setupDev symlinks the current processes pipes into the +// appropriate destination on the containers rootfs func setupDev(rootfs string) error { for _, link := range []struct { from string @@ -104,6 +113,7 @@ func setupDev(rootfs string) error { return nil } +// setupConsole ensures that the container has a proper /dev/console setup func setupConsole(rootfs, console string) error { oldMask := system.Umask(0000) defer system.Umask(oldMask) @@ -161,6 +171,24 @@ func mountSystem(rootfs string) error { return nil } +// setupPtmx adds a symlink to pts/ptmx for /dev/ptmx and +// finishes setting up /dev/console +func setupPtmx(rootfs, console string) error { + ptmx := filepath.Join(rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + if err := setupConsole(rootfs, console); err != nil { + return err + } + return nil +} + +// remountProc is used to detach and remount the proc filesystem +// commonly needed with running a new process inside an existing container func remountProc() error { if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil { return err @@ -183,14 +211,3 @@ func remountSys() error { } return nil } - -func setupPtmx(rootfs, console string) error { - ptmx := filepath.Join(rootfs, "dev/ptmx") - if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { - return err - } - if err := os.Symlink("pts/ptmx", ptmx); err != nil { - return fmt.Errorf("symlink dev ptmx %s", err) - } - return setupConsole(rootfs, console) -} diff --git a/nsinit/ns_linux.go b/nsinit/ns_linux.go index a2809eb1..481bdf79 100644 --- a/nsinit/ns_linux.go +++ b/nsinit/ns_linux.go @@ -14,6 +14,9 @@ var namespaceMap = map[libcontainer.Namespace]int{ libcontainer.CLONE_NEWNET: syscall.CLONE_NEWNET, } +// namespaceFileMap is used to convert the libcontainer types +// into the names of the files located in /proc//ns/* for +// each namespace var namespaceFileMap = map[libcontainer.Namespace]string{ libcontainer.CLONE_NEWNS: "mnt", libcontainer.CLONE_NEWUTS: "uts", diff --git a/types.go b/types.go index fcd00fd4..bb54ff51 100644 --- a/types.go +++ b/types.go @@ -1,5 +1,8 @@ package libcontainer +// These constants are defined as string types so that +// it is clear when adding the configuration in config files +// instead of using ints or other types const ( CAP_SETPCAP Capability = "SETPCAP" CAP_SYS_MODULE Capability = "SYS_MODULE" @@ -25,9 +28,15 @@ const ( CLONE_NEWNET Namespace = "NEWNET" // network ) -type Namespace string -type Namespaces []Namespace +type ( + Namespace string + Namespaces []Namespace + Capability string + Capabilities []Capability +) +// Contains returns true if the specified Namespace is +// in the slice func (n Namespaces) Contains(ns Namespace) bool { for _, nns := range n { if nns == ns { @@ -37,9 +46,8 @@ func (n Namespaces) Contains(ns Namespace) bool { return false } -type Capability string -type Capabilities []Capability - +// Contains returns true if the specified Capability is +// in the slice func (c Capabilities) Contains(capp Capability) bool { for _, cc := range c { if cc == capp { diff --git a/utils/utils.go b/utils/utils.go index d3223c3e..5050997f 100644 --- a/utils/utils.go +++ b/utils/utils.go @@ -6,6 +6,8 @@ import ( "io" ) +// GenerateRandomName returns a new name joined with a prefix. This size +// specified is used to truncate the randomly generated value func GenerateRandomName(prefix string, size int) (string, error) { id := make([]byte, 32) if _, err := io.ReadFull(rand.Reader, id); err != nil {