From 3aacff695db60634bfb1a2b30c7f5f9e54cd616f Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Mon, 6 Jun 2016 13:15:18 -0700 Subject: [PATCH] Use fifo for create/start This removes the use of a signal handler and SIGCONT to signal the init process to exec the users process. Signed-off-by: Michael Crosby --- libcontainer/container.go | 10 +++- libcontainer/container_linux.go | 60 +++++++++++++++++------- libcontainer/factory_linux.go | 33 ++++++++----- libcontainer/init_linux.go | 12 +++-- libcontainer/integration/init_test.go | 4 +- libcontainer/integration/seccomp_test.go | 8 ++-- libcontainer/integration/utils_test.go | 7 ++- libcontainer/process_linux.go | 4 ++ libcontainer/setns_init_linux.go | 5 +- libcontainer/standard_init_linux.go | 40 +++++++++------- start.go | 2 +- utils_linux.go | 14 ++---- 12 files changed, 124 insertions(+), 75 deletions(-) diff --git a/libcontainer/container.go b/libcontainer/container.go index 1dc283d8..1a71179c 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -124,8 +124,8 @@ type BaseContainer interface { Start(process *Process) (err error) // Run immediatly starts the process inside the conatiner. Returns error if process - // fails to start. It does not block waiting for a SIGCONT after start returns but - // sends the signal when the process has completed. + // fails to start. It does not block waiting for the exec fifo after start returns but + // opens the fifo after start returns. // // errors: // ContainerDestroyed - Container no longer exists, @@ -148,4 +148,10 @@ type BaseContainer interface { // errors: // SystemError - System error. Signal(s os.Signal) error + + // Exec signals the container to exec the users process at the end of the init. + // + // errors: + // SystemError - System error. + Exec() error } diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 31a51c4f..0c518e35 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -29,10 +29,6 @@ import ( const stdioFdCount = 3 -// InitContinueSignal is used to signal the container init process to -// start the users specified process after the container create has finished. -const InitContinueSignal = syscall.SIGCONT - type linuxContainer struct { id string root string @@ -195,16 +191,39 @@ func (c *linuxContainer) Run(process *Process) error { if err != nil { return err } - isInit := status == Stopped - if err := c.start(process, isInit); err != nil { + if err := c.start(process, status == Stopped); err != nil { return err } - if isInit { - return process.ops.signal(InitContinueSignal) + if status == Stopped { + return c.exec() } return nil } +func (c *linuxContainer) Exec() error { + c.m.Lock() + defer c.m.Unlock() + return c.exec() +} + +func (c *linuxContainer) exec() error { + path := filepath.Join(c.root, execFifoFilename) + f, err := os.OpenFile(path, os.O_RDONLY, 0) + if err != nil { + return newSystemErrorWithCause(err, "open exec fifo for reading") + } + defer f.Close() + data, err := ioutil.ReadAll(f) + if err != nil { + return err + } + if len(data) > 0 { + os.Remove(path) + return nil + } + return fmt.Errorf("cannot start an already running container") +} + func (c *linuxContainer) start(process *Process, isInit bool) error { parent, err := c.newParentProcess(process, isInit) if err != nil { @@ -262,17 +281,21 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces if err != nil { return nil, newSystemErrorWithCause(err, "creating new init pipe") } - cmd, err := c.commandTemplate(p, childPipe) + rootDir, err := os.Open(c.root) + if err != nil { + return nil, err + } + cmd, err := c.commandTemplate(p, childPipe, rootDir) if err != nil { return nil, newSystemErrorWithCause(err, "creating new command template") } if !doInit { - return c.newSetnsProcess(p, cmd, parentPipe, childPipe) + return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir) } - return c.newInitProcess(p, cmd, parentPipe, childPipe) + return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) } -func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) { +func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) { cmd := &exec.Cmd{ Path: c.initPath, Args: c.initArgs, @@ -284,8 +307,10 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. if cmd.SysProcAttr == nil { cmd.SysProcAttr = &syscall.SysProcAttr{} } - cmd.ExtraFiles = append(p.ExtraFiles, childPipe) - cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) + cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir) + cmd.Env = append(cmd.Env, + fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2), + fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) // NOTE: when running a container with no PID namespace and the parent process spawning the container is // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason // even with the parent still running. @@ -295,7 +320,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. return cmd, nil } -func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { +func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) nsMaps := make(map[configs.NamespaceType]string) for _, ns := range c.config.Namespaces { @@ -318,10 +343,11 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c process: p, bootstrapData: data, sharePidns: sharePidns, + rootDir: rootDir, }, nil } -func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { +func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) state, err := c.currentState() if err != nil { @@ -342,6 +368,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, config: c.newInitConfig(p), process: p, bootstrapData: data, + rootDir: rootDir, }, nil } @@ -360,6 +387,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { AppArmorProfile: c.config.AppArmorProfile, ProcessLabel: c.config.ProcessLabel, Rlimits: c.config.Rlimits, + ExecFifoPath: filepath.Join(c.root, execFifoFilename), } if process.NoNewPrivileges != nil { cfg.NoNewPrivileges = *process.NoNewPrivileges diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index 1ac06fb0..c00fb459 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -7,7 +7,6 @@ import ( "fmt" "os" "os/exec" - "os/signal" "path/filepath" "regexp" "runtime/debug" @@ -24,7 +23,8 @@ import ( ) const ( - stateFilename = "state.json" + stateFilename = "state.json" + execFifoFilename = "exec.fifo" ) var ( @@ -168,6 +168,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := os.MkdirAll(containerRoot, 0700); err != nil { return nil, newGenericError(err, SystemError) } + if err := syscall.Mkfifo(filepath.Join(containerRoot, execFifoFilename), 0666); err != nil { + return nil, newGenericError(err, SystemError) + } c := &linuxContainer{ id: id, root: containerRoot, @@ -220,13 +223,18 @@ func (l *LinuxFactory) Type() string { // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state // This is a low level implementation detail of the reexec and should not be consumed externally func (l *LinuxFactory) StartInitialization() (err error) { - // start the signal handler as soon as we can - s := make(chan os.Signal, 1) - signal.Notify(s, InitContinueSignal) - fdStr := os.Getenv("_LIBCONTAINER_INITPIPE") - pipefd, err := strconv.Atoi(fdStr) - if err != nil { - return fmt.Errorf("error converting env var _LIBCONTAINER_INITPIPE(%q) to an int: %s", fdStr, err) + var pipefd, rootfd int + for k, v := range map[string]*int{ + "_LIBCONTAINER_INITPIPE": &pipefd, + "_LIBCONTAINER_STATEDIR": &rootfd, + } { + s := os.Getenv(k) + + i, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("unable to convert %s=%s to int", k, s) + } + *v = i } var ( pipe = os.NewFile(uintptr(pipefd), "pipe") @@ -235,6 +243,7 @@ func (l *LinuxFactory) StartInitialization() (err error) { // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() + var i initer defer func() { // We have an error during the initialization of the container's init, @@ -253,18 +262,16 @@ func (l *LinuxFactory) StartInitialization() (err error) { // ensure that this pipe is always closed pipe.Close() }() - defer func() { if e := recover(); e != nil { err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack())) } }() - - i, err = newContainerInit(it, pipe) + i, err = newContainerInit(it, pipe, rootfd) if err != nil { return err } - return i.Init(s) + return i.Init() } func (l *LinuxFactory) loadState(root string) (*State, error) { diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 8e345f0c..d6d9901c 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -58,13 +58,14 @@ type initConfig struct { PassedFilesCount int `json:"passed_files_count"` ContainerId string `json:"containerid"` Rlimits []configs.Rlimit `json:"rlimits"` + ExecFifoPath string `json:"start_pipe_path"` } type initer interface { - Init(s chan os.Signal) error + Init() error } -func newContainerInit(t initType, pipe *os.File) (initer, error) { +func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) { var config *initConfig if err := json.NewDecoder(pipe).Decode(&config); err != nil { return nil, err @@ -79,9 +80,10 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) { }, nil case initStandard: return &linuxStandardInit{ - pipe: pipe, - parentPid: syscall.Getppid(), - config: config, + pipe: pipe, + parentPid: syscall.Getppid(), + config: config, + stateDirFD: stateDirFD, }, nil } return nil, fmt.Errorf("unknown init type %q", t) diff --git a/libcontainer/integration/init_test.go b/libcontainer/integration/init_test.go index 73a9a8f9..87b4f2ac 100644 --- a/libcontainer/integration/init_test.go +++ b/libcontainer/integration/init_test.go @@ -42,13 +42,13 @@ func TestMain(m *testing.M) { logrus.SetOutput(os.Stderr) logrus.SetLevel(logrus.InfoLevel) - factory, err = libcontainer.New(".", libcontainer.Cgroupfs) + factory, err = libcontainer.New("/run/libctTests", libcontainer.Cgroupfs) if err != nil { logrus.Error(err) os.Exit(1) } if systemd.UseSystemd() { - systemdFactory, err = libcontainer.New(".", libcontainer.SystemdCgroups) + systemdFactory, err = libcontainer.New("/run/libctTests", libcontainer.SystemdCgroups) if err != nil { logrus.Error(err) os.Exit(1) diff --git a/libcontainer/integration/seccomp_test.go b/libcontainer/integration/seccomp_test.go index 95868d01..055f887f 100644 --- a/libcontainer/integration/seccomp_test.go +++ b/libcontainer/integration/seccomp_test.go @@ -101,8 +101,8 @@ func TestSeccompPermitWriteConditional(t *testing.T) { Args: []*configs.Arg{ { Index: 0, - Value: 1, - Op: configs.GreaterThan, + Value: 2, + Op: configs.EqualTo, }, }, }, @@ -162,8 +162,8 @@ func TestSeccompDenyWriteConditional(t *testing.T) { Args: []*configs.Arg{ { Index: 0, - Value: 1, - Op: configs.GreaterThan, + Value: 2, + Op: configs.EqualTo, }, }, }, diff --git a/libcontainer/integration/utils_test.go b/libcontainer/integration/utils_test.go index 04532f66..7cbbaf9d 100644 --- a/libcontainer/integration/utils_test.go +++ b/libcontainer/integration/utils_test.go @@ -2,6 +2,8 @@ package integration import ( "bytes" + "crypto/md5" + "encoding/hex" "fmt" "io/ioutil" "os" @@ -11,6 +13,7 @@ import ( "strings" "syscall" "testing" + "time" "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/configs" @@ -92,7 +95,9 @@ func copyBusybox(dest string) error { } func newContainer(config *configs.Config) (libcontainer.Container, error) { - return newContainerWithName("testCT", config) + h := md5.New() + h.Write([]byte(time.Now().String())) + return newContainerWithName(hex.EncodeToString(h.Sum(nil)), config) } func newContainerWithName(name string, config *configs.Config) (libcontainer.Container, error) { diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index fc0c4fd2..33db3923 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -51,6 +51,7 @@ type setnsProcess struct { fds []string process *Process bootstrapData io.Reader + rootDir *os.File } func (p *setnsProcess) startTime() (string, error) { @@ -69,6 +70,7 @@ func (p *setnsProcess) start() (err error) { defer p.parentPipe.Close() err = p.cmd.Start() p.childPipe.Close() + p.rootDir.Close() if err != nil { return newSystemErrorWithCause(err, "starting setns process") } @@ -186,6 +188,7 @@ type initProcess struct { process *Process bootstrapData io.Reader sharePidns bool + rootDir *os.File } func (p *initProcess) pid() int { @@ -230,6 +233,7 @@ func (p *initProcess) start() error { err := p.cmd.Start() p.process.ops = p p.childPipe.Close() + p.rootDir.Close() if err != nil { p.process.ops = nil return newSystemErrorWithCause(err, "starting init process command") diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 2475587c..4b78ae80 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -5,7 +5,6 @@ package libcontainer import ( "fmt" "os" - "os/signal" "github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/keys" @@ -24,7 +23,7 @@ func (l *linuxSetnsInit) getSessionRingName() string { return fmt.Sprintf("_ses.%s", l.config.ContainerId) } -func (l *linuxSetnsInit) Init(s chan os.Signal) error { +func (l *linuxSetnsInit) Init() error { if !l.config.Config.NoNewKeyring { // do not inherit the parent's session keyring if _, err := keyctl.JoinSessionKeyring(l.getSessionRingName()); err != nil { @@ -50,7 +49,5 @@ func (l *linuxSetnsInit) Init(s chan os.Signal) error { if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { return err } - signal.Stop(s) - close(s) return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) } diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index ac787707..14fbf636 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -7,7 +7,6 @@ import ( "io" "os" "os/exec" - "os/signal" "syscall" "github.com/opencontainers/runc/libcontainer/apparmor" @@ -19,9 +18,10 @@ import ( ) type linuxStandardInit struct { - pipe io.ReadWriteCloser - parentPid int - config *initConfig + pipe io.ReadWriteCloser + parentPid int + stateDirFD int + config *initConfig } func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { @@ -44,7 +44,7 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { // the kernel const PR_SET_NO_NEW_PRIVS = 0x26 -func (l *linuxStandardInit) Init(s chan os.Signal) error { +func (l *linuxStandardInit) Init() error { if !l.config.Config.NoNewKeyring { ringname, keepperms, newperms := l.getSessionRingParams() @@ -149,23 +149,27 @@ func (l *linuxStandardInit) Init(s chan os.Signal) error { if syscall.Getppid() != l.parentPid { return syscall.Kill(syscall.Getpid(), syscall.SIGKILL) } + // check for the arg before waiting to make sure it exists and it is returned + // as a create time error. + name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } + // close the pipe to signal that we have completed our init. + l.pipe.Close() + // wait for the fifo to be opened on the other side before + // exec'ing the users process. + fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0) + if err != nil { + return err + } + if _, err := syscall.Write(fd, []byte("0")); err != nil { + return err + } if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { return err } } - // check for the arg before waiting to make sure it exists and it is returned - // as a create time error - name, err := exec.LookPath(l.config.Args[0]) - if err != nil { - return err - } - // close the pipe to signal that we have completed our init - l.pipe.Close() - // wait for the signal to exec the users process - <-s - // clean up the signal handler - signal.Stop(s) - close(s) return syscall.Exec(name, l.config.Args[0:], os.Environ()) } diff --git a/start.go b/start.go index b89c509a..2dd3cddb 100644 --- a/start.go +++ b/start.go @@ -27,7 +27,7 @@ your host.`, } switch status { case libcontainer.Created: - return container.Signal(libcontainer.InitContinueSignal) + return container.Exec() case libcontainer.Stopped: return fmt.Errorf("cannot start a container that has run and stopped") case libcontainer.Running: diff --git a/utils_linux.go b/utils_linux.go index ecb5b56e..aa98ebbf 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -229,7 +229,11 @@ func (r *runner) run(config *specs.Process) (int, error) { return -1, err } handler := newSignalHandler(tty, r.enableSubreaper) - if err := r.container.Start(process); err != nil { + startFn := r.container.Start + if !r.create { + startFn = r.container.Run + } + if err := startFn(process); err != nil { r.destroy() tty.Close() return -1, err @@ -248,14 +252,6 @@ func (r *runner) run(config *specs.Process) (int, error) { return -1, err } } - if !r.create { - if err := process.Signal(libcontainer.InitContinueSignal); err != nil { - r.terminate(process) - r.destroy() - tty.Close() - return -1, err - } - } if r.detach || r.create { tty.Close() return 0, nil