libcontainer: set cgroup config late

Due to the fact that the init is implemented in Go (which seemingly
randomly spawns new processes and loves eating memory), most cgroup
configurations are required to have an arbitrary minimum dictated by the
init. This confuses users and makes configuration more annoying than it
should. An example of this is pids.max, where Go spawns multiple
processes that then cause init to violate the pids cgroup constraint
before the container can even start.

Solve this problem by setting the cgroup configurations as late as
possible, to avoid hitting as many of the resources hogged by the Go
init as possible. This has to be done before seccomp rules are applied,
as the parent and child must synchronise in order for the parent to
correctly set the configurations (and writes might be blocked by seccomp).

Signed-off-by: Aleksa Sarai <asarai@suse.com>
This commit is contained in:
Aleksa Sarai 2015-12-17 20:16:34 +11:00
parent a95483402e
commit 103853ead7
6 changed files with 111 additions and 17 deletions

View File

@ -5,7 +5,6 @@ package libcontainer
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
@ -226,21 +225,29 @@ func (l *LinuxFactory) StartInitialization() (err error) {
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
var i initer
defer func() {
// if we have an error during the initialization of the container's init then send it back to the
// parent process in the form of an initError.
if err != nil {
// ensure that any data sent from the parent is consumed so it doesn't
// receive ECONNRESET when the child writes to the pipe.
ioutil.ReadAll(pipe)
if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if err := json.NewEncoder(pipe).Encode(procError); err != nil {
panic(err)
}
}
if err := json.NewEncoder(pipe).Encode(newSystemError(err)); err != nil {
panic(err)
}
} else {
if err := json.NewEncoder(pipe).Encode(procStart); err != nil {
panic(err)
}
}
// ensure that this pipe is always closed
pipe.Close()
}()
i, err := newContainerInit(it, pipe)
i, err = newContainerInit(it, pipe)
if err != nil {
return err
}

View File

@ -9,6 +9,15 @@ import (
"github.com/opencontainers/runc/libcontainer/stacktrace"
)
type syncType uint8
const (
procReady syncType = iota
procError
procStart
procRun
)
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}}
{{if .Message }}

View File

@ -5,6 +5,7 @@ package libcontainer
import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net"
"os"
@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
}, nil
case initStandard:
return &linuxStandardInit{
pipe: pipe,
parentPid: syscall.Getppid(),
config: config,
}, nil
@ -140,6 +142,28 @@ func finalizeNamespace(config *initConfig) error {
return nil
}
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error {
// Tell parent.
if err := json.NewEncoder(pipe).Encode(procReady); err != nil {
return err
}
// Wait for parent to give the all-clear.
var procSync syncType
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync != procRun {
return fmt.Errorf("invalid synchronisation flag from parent")
}
}
return nil
}
// joinExistingNamespaces gets all the namespace paths specified for the container and
// does a setns on the namespace fd so that the current process joins the namespace.
func joinExistingNamespaces(namespaces []configs.Namespace) error {

View File

@ -587,6 +587,11 @@ func testPids(t *testing.T, systemd bool) {
if err == nil {
t.Fatalf("expected fork() to fail with restrictive pids limit")
}
// Minimal restrictions are not really supported, due to quirks in using Go
// due to the fact that it spawns random processes. While we do our best with
// late setting cgroup values, it's just too unreliable with very small pids.max.
// As such, we don't test that case. YMMV.
}
func TestRunWithKernelMemory(t *testing.T) {

View File

@ -5,6 +5,7 @@ package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"io"
"os"
"os/exec"
@ -86,6 +87,7 @@ func (p *setnsProcess) start() (err error) {
if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil {
return newSystemError(err)
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
}
@ -95,6 +97,7 @@ func (p *setnsProcess) start() (err error) {
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
@ -198,15 +201,11 @@ func (p *initProcess) start() (err error) {
return newSystemError(err)
}
p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children
// can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemError(err)
}
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err)
}
defer func() {
if err != nil {
// TODO: should not be the responsibility to call here
@ -232,13 +231,58 @@ func (p *initProcess) start() (err error) {
if err := p.sendConfig(); err != nil {
return newSystemError(err)
}
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *genericError
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
var (
procSync syncType
sentRun bool
ierr *genericError
)
loop:
for {
if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
if err == io.EOF {
break loop
}
return newSystemError(err)
}
switch procSync {
case procStart:
break loop
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err)
}
// Sync with child.
if err := json.NewEncoder(p.parentPipe).Encode(procRun); err != nil {
return newSystemError(err)
}
sentRun = true
case procError:
// wait for the child process to fully complete and receive an error message
// if one was encoutered
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
}
if ierr != nil {
break loop
}
// Programmer error.
panic("No error following JSON procError payload.")
default:
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
}
}
if !sentRun {
return newSystemError(fmt.Errorf("could not synchronise with container process"))
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
}
return nil
@ -276,8 +320,7 @@ func (p *initProcess) sendConfig() error {
if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil {
return err
}
// shutdown writes for the parent side of the pipe
return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR)
return nil
}
func (p *initProcess) createNetworkInterfaces() error {

View File

@ -3,6 +3,7 @@
package libcontainer
import (
"io"
"os"
"syscall"
@ -14,6 +15,7 @@ import (
)
type linuxStandardInit struct {
pipe io.ReadWriter
parentPid int
config *initConfig
}
@ -50,7 +52,6 @@ func (l *linuxStandardInit) Init() error {
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
return err
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
@ -75,7 +76,6 @@ func (l *linuxStandardInit) Init() error {
return err
}
}
for _, path := range l.config.Config.ReadonlyPaths {
if err := remountReadonly(path); err != nil {
return err
@ -90,6 +90,12 @@ func (l *linuxStandardInit) Init() error {
if err != nil {
return err
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
}
if l.config.Config.Seccomp != nil {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err