diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index c2d359ed..ae5db9cd 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -5,7 +5,6 @@ package libcontainer import ( "encoding/json" "fmt" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -226,21 +225,29 @@ func (l *LinuxFactory) StartInitialization() (err error) { // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() + var i initer defer func() { // if we have an error during the initialization of the container's init then send it back to the // parent process in the form of an initError. if err != nil { - // ensure that any data sent from the parent is consumed so it doesn't - // receive ECONNRESET when the child writes to the pipe. - ioutil.ReadAll(pipe) + if _, ok := i.(*linuxStandardInit); ok { + // Synchronisation only necessary for standard init. + if err := json.NewEncoder(pipe).Encode(procError); err != nil { + panic(err) + } + } if err := json.NewEncoder(pipe).Encode(newSystemError(err)); err != nil { panic(err) } + } else { + if err := json.NewEncoder(pipe).Encode(procStart); err != nil { + panic(err) + } } // ensure that this pipe is always closed pipe.Close() }() - i, err := newContainerInit(it, pipe) + i, err = newContainerInit(it, pipe) if err != nil { return err } diff --git a/libcontainer/generic_error.go b/libcontainer/generic_error.go index 6fbc2d75..e4872e2d 100644 --- a/libcontainer/generic_error.go +++ b/libcontainer/generic_error.go @@ -9,6 +9,15 @@ import ( "github.com/opencontainers/runc/libcontainer/stacktrace" ) +type syncType uint8 + +const ( + procReady syncType = iota + procError + procStart + procRun +) + var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} Code: {{.ECode}} {{if .Message }} diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index ddb11865..504bf732 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -5,6 +5,7 @@ package libcontainer import ( "encoding/json" "fmt" + "io" "io/ioutil" "net" "os" @@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) { }, nil case initStandard: return &linuxStandardInit{ + pipe: pipe, parentPid: syscall.Getppid(), config: config, }, nil @@ -140,6 +142,28 @@ func finalizeNamespace(config *initConfig) error { return nil } +// syncParentReady sends to the given pipe a JSON payload which indicates that +// the init is ready to Exec the child process. It then waits for the parent to +// indicate that it is cleared to Exec. +func syncParentReady(pipe io.ReadWriter) error { + // Tell parent. + if err := json.NewEncoder(pipe).Encode(procReady); err != nil { + return err + } + + // Wait for parent to give the all-clear. + var procSync syncType + if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { + if err == io.EOF { + return fmt.Errorf("parent closed synchronisation channel") + } + if procSync != procRun { + return fmt.Errorf("invalid synchronisation flag from parent") + } + } + return nil +} + // joinExistingNamespaces gets all the namespace paths specified for the container and // does a setns on the namespace fd so that the current process joins the namespace. func joinExistingNamespaces(namespaces []configs.Namespace) error { diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 0fc277ea..6be1dd60 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -587,6 +587,11 @@ func testPids(t *testing.T, systemd bool) { if err == nil { t.Fatalf("expected fork() to fail with restrictive pids limit") } + + // Minimal restrictions are not really supported, due to quirks in using Go + // due to the fact that it spawns random processes. While we do our best with + // late setting cgroup values, it's just too unreliable with very small pids.max. + // As such, we don't test that case. YMMV. } func TestRunWithKernelMemory(t *testing.T) { diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 5b32eb10..1a6dc124 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -5,6 +5,7 @@ package libcontainer import ( "encoding/json" "errors" + "fmt" "io" "os" "os/exec" @@ -86,6 +87,7 @@ func (p *setnsProcess) start() (err error) { if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { return newSystemError(err) } + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemError(err) } @@ -95,6 +97,7 @@ func (p *setnsProcess) start() (err error) { if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { return newSystemError(err) } + // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() return newSystemError(ierr) @@ -198,15 +201,11 @@ func (p *initProcess) start() (err error) { return newSystemError(err) } p.setExternalDescriptors(fds) - // Do this before syncing with child so that no children // can escape the cgroup if err := p.manager.Apply(p.pid()); err != nil { return newSystemError(err) } - if err := p.manager.Set(p.config.Config); err != nil { - return newSystemError(err) - } defer func() { if err != nil { // TODO: should not be the responsibility to call here @@ -232,13 +231,58 @@ func (p *initProcess) start() (err error) { if err := p.sendConfig(); err != nil { return newSystemError(err) } - // wait for the child process to fully complete and receive an error message - // if one was encoutered - var ierr *genericError - if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { + + var ( + procSync syncType + sentRun bool + ierr *genericError + ) + +loop: + for { + if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil { + if err == io.EOF { + break loop + } + return newSystemError(err) + } + + switch procSync { + case procStart: + break loop + case procReady: + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemError(err) + } + // Sync with child. + if err := json.NewEncoder(p.parentPipe).Encode(procRun); err != nil { + return newSystemError(err) + } + sentRun = true + case procError: + // wait for the child process to fully complete and receive an error message + // if one was encoutered + if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { + return newSystemError(err) + } + if ierr != nil { + break loop + } + // Programmer error. + panic("No error following JSON procError payload.") + default: + return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child")) + } + } + if !sentRun { + return newSystemError(fmt.Errorf("could not synchronise with container process")) + } + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemError(err) } + // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { + p.wait() return newSystemError(ierr) } return nil @@ -276,8 +320,7 @@ func (p *initProcess) sendConfig() error { if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { return err } - // shutdown writes for the parent side of the pipe - return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR) + return nil } func (p *initProcess) createNetworkInterfaces() error { diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index ec100578..d3b50863 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -3,6 +3,7 @@ package libcontainer import ( + "io" "os" "syscall" @@ -14,6 +15,7 @@ import ( ) type linuxStandardInit struct { + pipe io.ReadWriter parentPid int config *initConfig } @@ -50,7 +52,6 @@ func (l *linuxStandardInit) Init() error { if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil { return err } - label.Init() // InitializeMountNamespace() can be executed only for a new mount namespace if l.config.Config.Namespaces.Contains(configs.NEWNS) { @@ -75,7 +76,6 @@ func (l *linuxStandardInit) Init() error { return err } } - for _, path := range l.config.Config.ReadonlyPaths { if err := remountReadonly(path); err != nil { return err @@ -90,6 +90,12 @@ func (l *linuxStandardInit) Init() error { if err != nil { return err } + // Tell our parent that we're ready to Execv. This must be done before the + // Seccomp rules have been applied, because we need to be able to read and + // write to a socket. + if err := syncParentReady(l.pipe); err != nil { + return err + } if l.config.Config.Seccomp != nil { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { return err