runc/linux_container.go

590 lines
16 KiB
Go
Raw Normal View History

// +build linux
package libcontainer
import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"syscall"
"github.com/docker/libcontainer/apparmor"
"github.com/docker/libcontainer/cgroups"
"github.com/docker/libcontainer/configs"
"github.com/docker/libcontainer/label"
"github.com/docker/libcontainer/mount"
"github.com/docker/libcontainer/network"
"github.com/docker/libcontainer/system"
"github.com/golang/glog"
)
const (
// EXIT_SIGNAL_OFFSET is not referenced anywhere in the visible part of this
// file.
// NOTE(review): presumably the offset added to a signal number to build a
// shell-style exit code (128+N) — confirm against callers.
EXIT_SIGNAL_OFFSET = 128
)
// initError is the JSON-serializable error report exchanged over the init
// pipe: InitIn encodes one when initialization fails, and the parent side
// decodes it to learn why the child could not start.
type initError struct {
	// Message is the human-readable failure description; it is omitted from
	// the JSON output when empty.
	Message string `json:"message,omitempty"`
}

// Error implements the error interface by returning the stored message
// verbatim.
func (e initError) Error() string {
	return e.Message
}
// linuxContainer is the Linux implementation of a container; it carries the
// identity, configuration and runtime state that the methods in this file
// operate on.
type linuxContainer struct {
// id is the value returned by ID.
id string
// root is the directory that holds the container's state file; Destroy
// removes it recursively.
root string
// config is the container configuration; Config returns a copy of it.
config *configs.Config
// state is the runtime state (init pid, init start time, network state,
// cgroup paths) filled in by startInitProcess and written out by
// updateStateFile.
state *configs.State
// cgroupManager is used to apply, freeze, inspect and destroy the
// container's cgroups.
cgroupManager cgroups.Manager
// initArgs is the command line (program followed by its arguments) that
// Start executes to launch processes for this container.
initArgs []string
}
// ID reports the identifier this container was created with.
func (c *linuxContainer) ID() string {
	id := c.id
	return id
}
// Config returns a copy of the container's configuration struct. The copy is
// shallow: it is produced by dereferencing the stored pointer.
func (c *linuxContainer) Config() configs.Config {
	cfg := *c.config
	return cfg
}
// Status reports the container's current status.
//
// It returns Destroyed when no init pid is recorded or when the recorded pid
// no longer exists, Paused when the configured cgroup freezer state is
// Frozen, and Running otherwise. Any other error from probing the init
// process is returned together with a zero status.
func (c *linuxContainer) Status() (configs.Status, error) {
	initPid := c.state.InitPid
	if initPid <= 0 {
		return configs.Destroyed, nil
	}
	// Signal 0 delivers nothing; it only checks whether the pid can be
	// signalled, which tells us if the init process is still alive.
	switch err := syscall.Kill(initPid, 0); {
	case err == syscall.ESRCH:
		return configs.Destroyed, nil
	case err != nil:
		return 0, err
	}
	if cg := c.config.Cgroups; cg != nil && cg.Freezer == configs.Frozen {
		return configs.Paused, nil
	}
	return configs.Running, nil
}
// Processes returns the pids reported by the cgroup manager for this
// container. A failure from the manager is wrapped as a SystemError.
func (c *linuxContainer) Processes() ([]int, error) {
	glog.Info("fetch container processes")
	pids, err := c.cgroupManager.GetPids()
	if err == nil {
		return pids, nil
	}
	return nil, newGenericError(err, SystemError)
}
// Stats collects cgroup statistics followed by network statistics.
//
// The returned *Stats is never nil: when one of the collectors fails, the
// partially filled value is returned alongside the error wrapped as a
// SystemError.
func (c *linuxContainer) Stats() (*Stats, error) {
	glog.Info("fetch container stats")
	result := new(Stats)

	cgroupStats, err := c.cgroupManager.GetStats()
	result.CgroupStats = cgroupStats
	if err != nil {
		return result, newGenericError(err, SystemError)
	}

	netStats, err := network.GetStats(&c.state.NetworkState)
	result.NetworkStats = netStats
	if err != nil {
		return result, newGenericError(err, SystemError)
	}
	return result, nil
}
// Start runs process inside the container and returns the pid of the started
// process, or -1 together with an error.
//
// When the container status is Destroyed the command becomes the container's
// init process (startInitProcess); otherwise the process is started through
// startNewProcess against the already recorded init pid.
func (c *linuxContainer) Start(process *Process) (int, error) {
	status, err := c.Status()
	if err != nil {
		return -1, err
	}

	cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
	cmd.Stdin, cmd.Stdout, cmd.Stderr = process.Stdin, process.Stdout, process.Stderr
	cmd.Env = c.config.Env
	cmd.Dir = c.config.RootFs
	// exec.Command leaves SysProcAttr nil, so it is created here.
	// TODO: add pdeath to config for a container
	cmd.SysProcAttr = &syscall.SysProcAttr{Pdeathsig: syscall.SIGKILL}

	if status == configs.Destroyed {
		if err := c.startInitProcess(cmd, process.Args); err != nil {
			return -1, err
		}
		return c.state.InitPid, nil
	}

	glog.Info("start new container process")
	// TODO: (crosbymichael) check out console use for execin
	return c.startNewProcess(cmd, process.Args)
}
// startNewProcess starts an additional process for a container whose init
// process is already recorded in c.state.
//
// cmd is started with the child end of a fresh init pipe as its only extra
// file and with _LIBCONTAINER_INITPID set to the recorded init pid. Once cmd
// has exited successfully, the pid of the process it left behind is read from
// the pipe as JSON. That process is then placed in the container's cgroups
// and sent the container config followed by the process arguments.
//
// It returns the pid of the new process, or -1 and an error. After the pid is
// known, any failure kills and waits on that process before returning.
func (c *linuxContainer) startNewProcess(cmd *exec.Cmd, args []string) (int, error) {
	parent, child, err := newInitPipe()
	if err != nil {
		return -1, err
	}
	defer parent.Close()
	cmd.ExtraFiles = []*os.File{child}
	// The full slice expression caps the capacity so append always copies:
	// cmd.Env may share its backing array with the container config's Env,
	// which must not be written to.
	cmd.Env = append(cmd.Env[:len(cmd.Env):len(cmd.Env)],
		fmt.Sprintf("_LIBCONTAINER_INITPID=%d", c.state.InitPid))
	if err := cmd.Start(); err != nil {
		child.Close()
		return -1, err
	}
	child.Close()
	s, err := cmd.Process.Wait()
	if err != nil {
		return -1, err
	}
	if !s.Success() {
		return -1, &exec.ExitError{ProcessState: s}
	}
	var joined pid
	if err := json.NewDecoder(parent).Decode(&joined); err != nil {
		return -1, err
	}
	// Reject a missing or nonsensical pid instead of passing it on to the
	// cgroup code or dereferencing a nil pointer.
	if joined.Pid <= 0 {
		return -1, fmt.Errorf("invalid pid %d reported for new container process", joined.Pid)
	}
	p, err := os.FindProcess(joined.Pid)
	if err != nil {
		return -1, err
	}
	terminate := func(terr error) (int, error) {
		// TODO: log the errors for kill and wait
		p.Kill()
		p.Wait()
		return -1, terr
	}
	// Enter cgroups.
	if err := enterCgroups(c.state, joined.Pid); err != nil {
		return terminate(err)
	}
	// The receiving side (InitIn) decodes the config first and the process
	// arguments second, so the order of these two writes matters.
	encoder := json.NewEncoder(parent)
	if err := encoder.Encode(c.config); err != nil {
		return terminate(err)
	}
	process := processArgs{
		Config: c.config,
		Args:   args,
	}
	if err := encoder.Encode(process); err != nil {
		return terminate(err)
	}
	return joined.Pid, nil
}
// startInitProcess launches cmd as the container's init process and records
// the resulting state on c.state.
//
// Steps, in order:
//   - create the init pipe and start cmd with the configured clone flags
//     (plus uid/gid mappings when a user namespace is requested);
//   - apply the cgroups to the new pid before syncing with the child;
//   - create the network stack for the new pid;
//   - for user namespaces, run the separate setup command;
//   - send the process arguments over the pipe, shut down writes, and wait
//     for either EOF (success) or an initError from the child.
//
// On any failure after cmd has started, the child is killed and waited on.
// The result is named so that the deferred cgroup cleanup sees the error
// returned from every exit path; nothing in the body may shadow err.
func (c *linuxContainer) startInitProcess(cmd *exec.Cmd, args []string) (err error) {
	// create a pipe so that we can synchronize with the namespaced process and
	// pass the state and configuration to the child process
	parent, child, err := newInitPipe()
	if err != nil {
		return err
	}
	defer parent.Close()
	cmd.ExtraFiles = []*os.File{child}
	cmd.SysProcAttr.Cloneflags = c.config.Namespaces.CloneFlags()
	if c.config.Namespaces.Contains(configs.NEWUSER) {
		addUidGidMappings(cmd.SysProcAttr, c.config)
		// Default to root user when user namespaces are enabled.
		if cmd.SysProcAttr.Credential == nil {
			cmd.SysProcAttr.Credential = &syscall.Credential{}
		}
	}
	glog.Info("starting container init process")
	err = cmd.Start()
	child.Close()
	if err != nil {
		return newGenericError(err, SystemError)
	}
	wait := func() (*os.ProcessState, error) {
		ps, werr := cmd.Process.Wait()
		// we should kill all processes in cgroup when init is died if we use
		// host PID namespace
		if !c.config.Namespaces.Contains(configs.NEWPID) {
			c.killAllPids()
		}
		return ps, newGenericError(werr, SystemError)
	}
	terminate := func(terr error) error {
		// TODO: log the errors for kill and wait
		cmd.Process.Kill()
		wait()
		return terr
	}
	started, err := system.GetProcessStartTime(cmd.Process.Pid)
	if err != nil {
		return terminate(err)
	}
	// Do this before syncing with child so that no children
	// can escape the cgroup
	if err = c.cgroupManager.Apply(cmd.Process.Pid); err != nil {
		return terminate(err)
	}
	// From here on the cgroups exist; tear them down if startup fails.
	defer func() {
		if err != nil {
			c.cgroupManager.Destroy()
		}
	}()
	var networkState configs.NetworkState
	if err = c.initializeNetworking(cmd.Process.Pid, &networkState); err != nil {
		return terminate(err)
	}
	process := processArgs{
		Args:         args,
		Config:       c.config,
		NetworkState: &networkState,
	}
	// Start the setup process to setup the init process
	if c.config.Namespaces.Contains(configs.NEWUSER) {
		if err = executeSetupCmd(cmd.Args, cmd.Process.Pid, c.config, &process, &networkState); err != nil {
			return terminate(err)
		}
	}
	// send the state to the container's init process then shutdown writes for the parent
	if err = json.NewEncoder(parent).Encode(process); err != nil {
		return terminate(err)
	}
	// shutdown writes for the parent side of the pipe
	if err = syscall.Shutdown(int(parent.Fd()), syscall.SHUT_WR); err != nil {
		return terminate(err)
	}
	// wait for the child process to fully complete and receive an error message
	// if one was encountered; io.EOF means the child closed the pipe without
	// reporting a failure
	var ierr *initError
	if err = json.NewDecoder(parent).Decode(&ierr); err != nil && err != io.EOF {
		return terminate(err)
	}
	if ierr != nil {
		return terminate(ierr)
	}
	c.state.InitPid = cmd.Process.Pid
	c.state.InitStartTime = started
	c.state.NetworkState = networkState
	c.state.CgroupPaths = c.cgroupManager.GetPaths()
	return nil
}
// Destroy removes the container's root directory. It only proceeds when the
// container status is Destroyed; for any other status it returns a
// ContainerNotStopped error, and a failure to determine the status is
// returned as is.
func (c *linuxContainer) Destroy() error {
	switch status, err := c.Status(); {
	case err != nil:
		return err
	case status != configs.Destroyed:
		return newGenericError(nil, ContainerNotStopped)
	}
	return os.RemoveAll(c.root)
}
// Pause asks the cgroup manager to put the container's freezer into the
// Frozen state and returns the manager's result unchanged.
func (c *linuxContainer) Pause() error {
	return c.cgroupManager.Freeze(configs.Frozen)
}
// Resume asks the cgroup manager to put the container's freezer into the
// Thawed state and returns the manager's result unchanged.
func (c *linuxContainer) Resume() error {
	return c.cgroupManager.Freeze(configs.Thawed)
}
// Signal is a placeholder: it logs the requested signal and the recorded init
// pid, then panics because signal delivery is not implemented yet. It never
// returns normally.
func (c *linuxContainer) Signal(signal os.Signal) error {
	initPid := c.state.InitPid
	glog.Infof("sending signal %d to pid %d", signal, initPid)
	panic("not implemented")
}
// Wait blocks in wait4(2) on the recorded init pid and returns its wait
// status. A failure of the system call is wrapped as a SystemError and
// reported with a zero status.
func (c *linuxContainer) Wait() (syscall.WaitStatus, error) {
	var ws syscall.WaitStatus
	// TODO : close exec.Cmd pipes, fix in master
	if _, err := syscall.Wait4(c.state.InitPid, &ws, 0, nil); err != nil {
		return 0, newGenericError(err, SystemError)
	}
	return ws, nil
}
// OOM returns the notification channel produced by NotifyOnOOM for the
// container's current state, together with any error from setting it up.
func (c *linuxContainer) OOM() (<-chan struct{}, error) {
	return NotifyOnOOM(c.state)
}
// updateStateFile writes c.state as JSON into the container's root directory.
//
// The state is first written to "<stateFilename>.new" and then renamed over
// the final name, so a reader never sees a partially written file. The
// temporary file is closed (and the close error checked) before the rename,
// and it is removed again if encoding, closing or renaming fails. All
// failures are wrapped as SystemError.
func (c *linuxContainer) updateStateFile() error {
	tmpPath := filepath.Join(c.root, fmt.Sprintf("%s.new", stateFilename))
	f, err := os.Create(tmpPath)
	if err != nil {
		return newGenericError(err, SystemError)
	}
	if err := json.NewEncoder(f).Encode(c.state); err != nil {
		f.Close()
		os.Remove(tmpPath)
		return newGenericError(err, SystemError)
	}
	// Close explicitly: a failed close can mean the data never reached the
	// file, in which case the old state file must be left in place.
	if err := f.Close(); err != nil {
		os.Remove(tmpPath)
		return newGenericError(err, SystemError)
	}
	finalPath := filepath.Join(c.root, stateFilename)
	if err := os.Rename(tmpPath, finalPath); err != nil {
		os.Remove(tmpPath)
		return newGenericError(err, SystemError)
	}
	return nil
}
// newInitPipe creates a connected pair of close-on-exec AF_LOCAL stream
// sockets used for communication between processes and wraps both ends as
// *os.File. The file named "parent" wraps the second descriptor of the pair
// and the file named "child" wraps the first.
func newInitPipe() (parent *os.File, child *os.File, err error) {
	const sockType = syscall.SOCK_STREAM | syscall.SOCK_CLOEXEC
	pair, err := syscall.Socketpair(syscall.AF_LOCAL, sockType, 0)
	if err != nil {
		return nil, nil, err
	}
	parent = os.NewFile(uintptr(pair[1]), "parent")
	child = os.NewFile(uintptr(pair[0]), "child")
	return parent, child, nil
}
// addUidGidMappings copies the uid and gid mappings from the container
// configuration into sys as syscall.SysProcIDMap slices. A nil mapping slice
// in the configuration leaves the corresponding field of sys untouched.
func addUidGidMappings(sys *syscall.SysProcAttr, container *configs.Config) {
	if src := container.UidMappings; src != nil {
		uids := make([]syscall.SysProcIDMap, len(src))
		for i, m := range src {
			uids[i] = syscall.SysProcIDMap{
				ContainerID: m.ContainerID,
				HostID:      m.HostID,
				Size:        m.Size,
			}
		}
		sys.UidMappings = uids
	}
	if src := container.GidMappings; src != nil {
		gids := make([]syscall.SysProcIDMap, len(src))
		for i, m := range src {
			gids[i] = syscall.SysProcIDMap{
				ContainerID: m.ContainerID,
				HostID:      m.HostID,
				Size:        m.Size,
			}
		}
		sys.GidMappings = gids
	}
}
// killAllPids sends SIGKILL to every process the cgroup manager reports for
// this container and then waits on each of them.
//
// The cgroup is frozen while the pid list is read and the kills are issued,
// and thawed again afterwards — presumably so that no new processes appear
// while the list is being walked. The cgroup is also thawed when the pid list
// cannot be read, so a failure never leaves the container frozen. Freeze and
// thaw failures are logged but do not abort the operation.
//
// It returns the error from listing the pids, or nil.
func (c *linuxContainer) killAllPids() error {
	glog.Info("killing all processes in container")
	if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
		glog.Warningf("unable to freeze container before killing its processes: %v", err)
	}
	pids, err := c.cgroupManager.GetPids()
	if err != nil {
		// Do not leave the container frozen on the error path.
		if terr := c.cgroupManager.Freeze(configs.Thawed); terr != nil {
			glog.Warningf("unable to thaw container: %v", terr)
		}
		return err
	}
	procs := make([]*os.Process, 0, len(pids))
	for _, pid := range pids {
		p, err := os.FindProcess(pid)
		if err != nil {
			// A single pid that cannot be looked up must not stop the
			// remaining processes from being killed.
			glog.Warningf("unable to find process %d: %v", pid, err)
			continue
		}
		procs = append(procs, p)
		// Best effort: the process may already have exited.
		p.Kill()
	}
	if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
		glog.Warningf("unable to thaw container: %v", err)
	}
	for _, p := range procs {
		// Best effort: errors from Wait are deliberately ignored.
		p.Wait()
	}
	return nil
}
// initializeNetworking walks the configured networks in order, looks up the
// strategy registered for each network type and lets it create that network
// for the process nspid, recording results in networkState. The first error
// from either step stops the walk and is returned.
func (c *linuxContainer) initializeNetworking(nspid int, networkState *configs.NetworkState) error {
	glog.Info("initailzing container's network stack")
	for _, netConf := range c.config.Networks {
		strategy, lookupErr := network.GetStrategy(netConf.Type)
		if lookupErr != nil {
			return lookupErr
		}
		if createErr := strategy.Create(netConf, nspid, networkState); createErr != nil {
			return createErr
		}
	}
	return nil
}
// executeSetupCmd runs the setup helper for a user-namespaced init process.
//
// It re-executes args with _LIBCONTAINER_INITPID set to ppid and
// _LIBCONTAINER_USERNS=1, passing the child end of a fresh init pipe as the
// only extra file. After that command has exited successfully, the pid of the
// setup process it left behind is read from the pipe as JSON. The process
// arguments are then sent to that process, writes are shut down, and the
// function waits for either EOF or an initError, and finally for the setup
// process itself to exit successfully.
//
// Once the setup process is known, every failure path — including an
// initError reported by the setup process — kills and reaps it before
// returning. networkState is accepted for the caller's convenience but is not
// used directly here.
func executeSetupCmd(args []string, ppid int, container *configs.Config, process *processArgs, networkState *configs.NetworkState) error {
	command := exec.Command(args[0], args[1:]...)
	parent, child, err := newInitPipe()
	if err != nil {
		return err
	}
	defer parent.Close()
	command.ExtraFiles = []*os.File{child}
	command.Dir = container.RootFs
	command.Env = append(command.Env,
		fmt.Sprintf("_LIBCONTAINER_INITPID=%d", ppid),
		"_LIBCONTAINER_USERNS=1")
	err = command.Start()
	child.Close()
	if err != nil {
		return err
	}
	s, err := command.Process.Wait()
	if err != nil {
		return err
	}
	if !s.Success() {
		return &exec.ExitError{ProcessState: s}
	}
	decoder := json.NewDecoder(parent)
	var setup pid
	if err := decoder.Decode(&setup); err != nil {
		return err
	}
	// Reject a missing or nonsensical pid instead of dereferencing a nil
	// pointer or operating on pid 0.
	if setup.Pid <= 0 {
		return fmt.Errorf("invalid pid %d reported for setup process", setup.Pid)
	}
	p, err := os.FindProcess(setup.Pid)
	if err != nil {
		return err
	}
	terminate := func(terr error) error {
		// TODO: log the errors for kill and wait
		p.Kill()
		p.Wait()
		return terr
	}
	// send the state to the container's init process then shutdown writes for the parent
	if err := json.NewEncoder(parent).Encode(process); err != nil {
		return terminate(err)
	}
	// shutdown writes for the parent side of the pipe
	if err := syscall.Shutdown(int(parent.Fd()), syscall.SHUT_WR); err != nil {
		return terminate(err)
	}
	// wait for the child process to fully complete and receive an error message
	// if one was encountered
	var ierr *initError
	if err := decoder.Decode(&ierr); err != nil && err != io.EOF {
		return terminate(err)
	}
	if ierr != nil {
		// Reap the setup process so it is not left behind as a zombie.
		return terminate(ierr)
	}
	s, err = p.Wait()
	if err != nil {
		return err
	}
	if !s.Success() {
		return &exec.ExitError{ProcessState: s}
	}
	return nil
}
// pid is the JSON message read from the init pipe by startNewProcess and
// executeSetupCmd to learn the pid of the process the helper command left
// running.
type pid struct {
// Pid is the reported process id, stored under the JSON key "Pid".
Pid int `json:"Pid"`
}
// InitIn finalizes entering into a container and executes the specified
// command.
//
// It reads the container config and then the process arguments as two JSON
// values from pipe, applies the remaining per-process settings
// (finalizeSetns) and replaces the current process image with
// process.Args[0]. On success it therefore never returns.
//
// If anything fails, the error is sent back over pipe as an initError so the
// parent can report it. A missing config, missing process description or
// empty argument list is reported that way too, rather than crashing. The
// pipe is always closed before returning.
func InitIn(pipe *os.File) (err error) {
	defer func() {
		// if we have an error during the initialization of the container's init then send it back to the
		// parent process in the form of an initError.
		if err != nil {
			// ensure that any data sent from the parent is consumed so it doesn't
			// receive ECONNRESET when the child writes to the pipe.
			ioutil.ReadAll(pipe)
			report := initError{Message: err.Error()}
			if encErr := json.NewEncoder(pipe).Encode(report); encErr != nil {
				// There is no other channel left to report on.
				panic(encErr)
			}
		}
		// ensure that this pipe is always closed
		pipe.Close()
	}()
	decoder := json.NewDecoder(pipe)
	var config *configs.Config
	if err := decoder.Decode(&config); err != nil {
		return err
	}
	var process *processArgs
	if err := decoder.Decode(&process); err != nil {
		return err
	}
	// A JSON null decodes to a nil pointer without an error; reject it here
	// so the failure reaches the parent instead of crashing this process.
	if config == nil {
		return fmt.Errorf("no container config received on init pipe")
	}
	if process == nil || len(process.Args) == 0 {
		return fmt.Errorf("no process arguments received on init pipe")
	}
	if err := finalizeSetns(config); err != nil {
		return err
	}
	if err := system.Execv(process.Args[0], process.Args, config.Env); err != nil {
		return err
	}
	panic("unreachable")
}
// finalizeSetns applies the per-process container settings for a process that
// has already joined an existing container's namespaces via setns. In order,
// it loads the container environment, sets rlimits, finalizes the namespace,
// applies the AppArmor profile and, when one is configured, sets the process
// label. The first failing step aborts the sequence.
func finalizeSetns(container *configs.Config) error {
	// clear the current processes env and replace it with the environment defined on the container
	if envErr := loadContainerEnvironment(container); envErr != nil {
		return envErr
	}
	if rlimitErr := setupRlimits(container); rlimitErr != nil {
		return fmt.Errorf("setup rlimits %s", rlimitErr)
	}
	if nsErr := finalizeNamespace(container); nsErr != nil {
		return nsErr
	}
	profile := container.AppArmorProfile
	if aaErr := apparmor.ApplyProfile(profile); aaErr != nil {
		return fmt.Errorf("set apparmor profile %s: %s", profile, aaErr)
	}
	if processLabel := container.ProcessLabel; processLabel != "" {
		if labelErr := label.SetProcessLabel(processLabel); labelErr != nil {
			return labelErr
		}
	}
	return nil
}
// SetupContainer performs the mount and networking setup for a user namespace
// enabled process, because root inside a user namespace lacks the permissions
// for these operations. The setup process joins every namespace of the init
// process except the user namespace, so it runs as root in the root user
// namespace.
//
// Networking is configured only when the container has its own network
// namespace; configuring networks or routes without one is an error. The
// mount namespace is initialized only when the container has a new one.
func SetupContainer(process *processArgs) error {
	container, networkState := process.Config, process.NetworkState
	// TODO : move rootfs resolution (utils.ResolveRootfs on container.RootFs)
	// to validation

	// clear the current processes env and replace it with the environment
	// defined on the container
	if err := loadContainerEnvironment(container); err != nil {
		return err
	}

	cloneFlags := container.Namespaces.CloneFlags()
	ownNetNS := (cloneFlags & syscall.CLONE_NEWNET) != 0
	switch {
	case ownNetNS:
		if err := setupNetwork(container, networkState); err != nil {
			return fmt.Errorf("setup networking %s", err)
		}
		if err := setupRoute(container); err != nil {
			return fmt.Errorf("setup route %s", err)
		}
	case len(container.Networks) != 0 || len(container.Routes) != 0:
		return fmt.Errorf("unable to apply network parameters without network namespace")
	}

	label.Init()

	// InitializeMountNamespace() can be executed only for a new mount namespace
	if (cloneFlags & syscall.CLONE_NEWNS) == 0 {
		return nil
	}
	if err := mount.InitializeMountNamespace(container); err != nil {
		return fmt.Errorf("setup mount namespace %s", err)
	}
	return nil
}
// enterCgroups places pid into the cgroups recorded in state.CgroupPaths by
// delegating to cgroups.EnterPid.
func enterCgroups(state *configs.State, pid int) error {
	paths := state.CgroupPaths
	return cgroups.EnterPid(paths, pid)
}