From 103853ead72ff1d71677464fce338a2644e8a6f6 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 17 Dec 2015 20:16:34 +1100 Subject: [PATCH] libcontainer: set cgroup config late Due to the fact that the init is implemented in Go (which seemingly randomly spawns new processes and loves eating memory), most cgroup configurations are required to have an arbitrary minimum dictated by the init. This confuses users and makes configuration more annoying than it should. An example of this is pids.max, where Go spawns multiple processes that then cause init to violate the pids cgroup constraint before the container can even start. Solve this problem by setting the cgroup configurations as late as possible, to avoid hitting as many of the resources hogged by the Go init as possible. This has to be done before seccomp rules are applied, as the parent and child must synchronise in order for the parent to correctly set the configurations (and writes might be blocked by seccomp). Signed-off-by: Aleksa Sarai --- libcontainer/factory_linux.go | 17 +++++--- libcontainer/generic_error.go | 9 ++++ libcontainer/init_linux.go | 24 ++++++++++ libcontainer/integration/exec_test.go | 5 +++ libcontainer/process_linux.go | 63 ++++++++++++++++++++++----- libcontainer/standard_init_linux.go | 10 ++++- 6 files changed, 111 insertions(+), 17 deletions(-) diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index c2d359ed..ae5db9cd 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -5,7 +5,6 @@ package libcontainer import ( "encoding/json" "fmt" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -226,21 +225,29 @@ func (l *LinuxFactory) StartInitialization() (err error) { // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() + var i initer defer func() { // if we have an error during the initialization of the container's init then send it back to the // parent process in the form of an initError. if err != nil { - // ensure that any data sent from the parent is consumed so it doesn't - // receive ECONNRESET when the child writes to the pipe. - ioutil.ReadAll(pipe) + if _, ok := i.(*linuxStandardInit); ok { + // Synchronisation only necessary for standard init. + if err := json.NewEncoder(pipe).Encode(procError); err != nil { + panic(err) + } + } if err := json.NewEncoder(pipe).Encode(newSystemError(err)); err != nil { panic(err) } + } else { + if err := json.NewEncoder(pipe).Encode(procStart); err != nil { + panic(err) + } } // ensure that this pipe is always closed pipe.Close() }() - i, err := newContainerInit(it, pipe) + i, err = newContainerInit(it, pipe) if err != nil { return err } diff --git a/libcontainer/generic_error.go b/libcontainer/generic_error.go index 6fbc2d75..e4872e2d 100644 --- a/libcontainer/generic_error.go +++ b/libcontainer/generic_error.go @@ -9,6 +9,15 @@ import ( "github.com/opencontainers/runc/libcontainer/stacktrace" ) +type syncType uint8 + +const ( + procReady syncType = iota + procError + procStart + procRun +) + var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} Code: {{.ECode}} {{if .Message }} diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index ddb11865..504bf732 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -5,6 +5,7 @@ package libcontainer import ( "encoding/json" "fmt" + "io" "io/ioutil" "net" "os" @@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) { }, nil case initStandard: return &linuxStandardInit{ + pipe: pipe, parentPid: syscall.Getppid(), config: config, }, nil @@ -140,6 +142,28 @@ func finalizeNamespace(config *initConfig) error { return nil } +// syncParentReady sends to the given pipe a JSON payload which indicates that +// the init is ready to Exec the child process. It then waits for the parent to +// indicate that it is cleared to Exec. +func syncParentReady(pipe io.ReadWriter) error { + // Tell parent. + if err := json.NewEncoder(pipe).Encode(procReady); err != nil { + return err + } + + // Wait for parent to give the all-clear. + var procSync syncType + if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { + if err == io.EOF { + return fmt.Errorf("parent closed synchronisation channel") + } + if procSync != procRun { + return fmt.Errorf("invalid synchronisation flag from parent") + } + } + return nil +} + // joinExistingNamespaces gets all the namespace paths specified for the container and // does a setns on the namespace fd so that the current process joins the namespace. func joinExistingNamespaces(namespaces []configs.Namespace) error { diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 0fc277ea..6be1dd60 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -587,6 +587,11 @@ func testPids(t *testing.T, systemd bool) { if err == nil { t.Fatalf("expected fork() to fail with restrictive pids limit") } + + // Minimal restrictions are not really supported, due to quirks in using Go + // due to the fact that it spawns random processes. While we do our best with + // late setting cgroup values, it's just too unreliable with very small pids.max. + // As such, we don't test that case. YMMV. } func TestRunWithKernelMemory(t *testing.T) { diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 5b32eb10..1a6dc124 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -5,6 +5,7 @@ package libcontainer import ( "encoding/json" "errors" + "fmt" "io" "os" "os/exec" @@ -86,6 +87,7 @@ func (p *setnsProcess) start() (err error) { if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { return newSystemError(err) } + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemError(err) } @@ -95,6 +97,7 @@ func (p *setnsProcess) start() (err error) { if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { return newSystemError(err) } + // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() return newSystemError(ierr) @@ -198,15 +201,11 @@ func (p *initProcess) start() (err error) { return newSystemError(err) } p.setExternalDescriptors(fds) - // Do this before syncing with child so that no children // can escape the cgroup if err := p.manager.Apply(p.pid()); err != nil { return newSystemError(err) } - if err := p.manager.Set(p.config.Config); err != nil { - return newSystemError(err) - } defer func() { if err != nil { // TODO: should not be the responsibility to call here @@ -232,13 +231,58 @@ func (p *initProcess) start() (err error) { if err := p.sendConfig(); err != nil { return newSystemError(err) } - // wait for the child process to fully complete and receive an error message - // if one was encoutered - var ierr *genericError - if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { + + var ( + procSync syncType + sentRun bool + ierr *genericError + ) + +loop: + for { + if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil { + if err == io.EOF { + break loop + } + return newSystemError(err) + } + + switch procSync { + case procStart: + break loop + case procReady: + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemError(err) + } + // Sync with child. + if err := json.NewEncoder(p.parentPipe).Encode(procRun); err != nil { + return newSystemError(err) + } + sentRun = true + case procError: + // wait for the child process to fully complete and receive an error message + // if one was encoutered + if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { + return newSystemError(err) + } + if ierr != nil { + break loop + } + // Programmer error. + panic("No error following JSON procError payload.") + default: + return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child")) + } + } + if !sentRun { + return newSystemError(fmt.Errorf("could not synchronise with container process")) + } + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemError(err) } + // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { + p.wait() return newSystemError(ierr) } return nil @@ -276,8 +320,7 @@ func (p *initProcess) sendConfig() error { if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { return err } - // shutdown writes for the parent side of the pipe - return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR) + return nil } func (p *initProcess) createNetworkInterfaces() error { diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index ec100578..d3b50863 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -3,6 +3,7 @@ package libcontainer import ( + "io" "os" "syscall" @@ -14,6 +15,7 @@ import ( ) type linuxStandardInit struct { + pipe io.ReadWriter parentPid int config *initConfig } @@ -50,7 +52,6 @@ func (l *linuxStandardInit) Init() error { if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil { return err } - label.Init() // InitializeMountNamespace() can be executed only for a new mount namespace if l.config.Config.Namespaces.Contains(configs.NEWNS) { @@ -75,7 +76,6 @@ func (l *linuxStandardInit) Init() error { return err } } - for _, path := range l.config.Config.ReadonlyPaths { if err := remountReadonly(path); err != nil { return err @@ -90,6 +90,12 @@ func (l *linuxStandardInit) Init() error { if err != nil { return err } + // Tell our parent that we're ready to Execv. This must be done before the + // Seccomp rules have been applied, because we need to be able to read and + // write to a socket. + if err := syncParentReady(l.pipe); err != nil { + return err + } if l.config.Config.Seccomp != nil { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { return err