libcontainer: set cgroup config late
Due to the fact that the init is implemented in Go (which seemingly randomly spawns new processes and loves eating memory), most cgroup configurations are required to have an arbitrary minimum dictated by the init. This confuses users and makes configuration more annoying than it should. An example of this is pids.max, where Go spawns multiple processes that then cause init to violate the pids cgroup constraint before the container can even start. Solve this problem by setting the cgroup configurations as late as possible, to avoid hitting as many of the resources hogged by the Go init as possible. This has to be done before seccomp rules are applied, as the parent and child must synchronise in order for the parent to correctly set the configurations (and writes might be blocked by seccomp). Signed-off-by: Aleksa Sarai <asarai@suse.com>
This commit is contained in:
parent
a95483402e
commit
103853ead7
|
@ -5,7 +5,6 @@ package libcontainer
|
|||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
|
@ -226,21 +225,29 @@ func (l *LinuxFactory) StartInitialization() (err error) {
|
|||
// clear the current process's environment to clean any libcontainer
|
||||
// specific env vars.
|
||||
os.Clearenv()
|
||||
var i initer
|
||||
defer func() {
|
||||
// if we have an error during the initialization of the container's init then send it back to the
|
||||
// parent process in the form of an initError.
|
||||
if err != nil {
|
||||
// ensure that any data sent from the parent is consumed so it doesn't
|
||||
// receive ECONNRESET when the child writes to the pipe.
|
||||
ioutil.ReadAll(pipe)
|
||||
if _, ok := i.(*linuxStandardInit); ok {
|
||||
// Synchronisation only necessary for standard init.
|
||||
if err := json.NewEncoder(pipe).Encode(procError); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
if err := json.NewEncoder(pipe).Encode(newSystemError(err)); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
} else {
|
||||
if err := json.NewEncoder(pipe).Encode(procStart); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
// ensure that this pipe is always closed
|
||||
pipe.Close()
|
||||
}()
|
||||
i, err := newContainerInit(it, pipe)
|
||||
i, err = newContainerInit(it, pipe)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -9,6 +9,15 @@ import (
|
|||
"github.com/opencontainers/runc/libcontainer/stacktrace"
|
||||
)
|
||||
|
||||
type syncType uint8
|
||||
|
||||
const (
|
||||
procReady syncType = iota
|
||||
procError
|
||||
procStart
|
||||
procRun
|
||||
)
|
||||
|
||||
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
|
||||
Code: {{.ECode}}
|
||||
{{if .Message }}
|
||||
|
|
|
@ -5,6 +5,7 @@ package libcontainer
|
|||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"net"
|
||||
"os"
|
||||
|
@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
|
|||
}, nil
|
||||
case initStandard:
|
||||
return &linuxStandardInit{
|
||||
pipe: pipe,
|
||||
parentPid: syscall.Getppid(),
|
||||
config: config,
|
||||
}, nil
|
||||
|
@ -140,6 +142,28 @@ func finalizeNamespace(config *initConfig) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// syncParentReady sends to the given pipe a JSON payload which indicates that
|
||||
// the init is ready to Exec the child process. It then waits for the parent to
|
||||
// indicate that it is cleared to Exec.
|
||||
func syncParentReady(pipe io.ReadWriter) error {
|
||||
// Tell parent.
|
||||
if err := json.NewEncoder(pipe).Encode(procReady); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Wait for parent to give the all-clear.
|
||||
var procSync syncType
|
||||
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
|
||||
if err == io.EOF {
|
||||
return fmt.Errorf("parent closed synchronisation channel")
|
||||
}
|
||||
if procSync != procRun {
|
||||
return fmt.Errorf("invalid synchronisation flag from parent")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// joinExistingNamespaces gets all the namespace paths specified for the container and
|
||||
// does a setns on the namespace fd so that the current process joins the namespace.
|
||||
func joinExistingNamespaces(namespaces []configs.Namespace) error {
|
||||
|
|
|
@ -587,6 +587,11 @@ func testPids(t *testing.T, systemd bool) {
|
|||
if err == nil {
|
||||
t.Fatalf("expected fork() to fail with restrictive pids limit")
|
||||
}
|
||||
|
||||
// Minimal restrictions are not really supported, due to quirks in using Go
|
||||
// due to the fact that it spawns random processes. While we do our best with
|
||||
// late setting cgroup values, it's just too unreliable with very small pids.max.
|
||||
// As such, we don't test that case. YMMV.
|
||||
}
|
||||
|
||||
func TestRunWithKernelMemory(t *testing.T) {
|
||||
|
|
|
@ -5,6 +5,7 @@ package libcontainer
|
|||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/exec"
|
||||
|
@ -86,6 +87,7 @@ func (p *setnsProcess) start() (err error) {
|
|||
if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
|
||||
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
|
@ -95,6 +97,7 @@ func (p *setnsProcess) start() (err error) {
|
|||
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
|
||||
return newSystemError(err)
|
||||
}
|
||||
// Must be done after Shutdown so the child will exit and we can wait for it.
|
||||
if ierr != nil {
|
||||
p.wait()
|
||||
return newSystemError(ierr)
|
||||
|
@ -198,15 +201,11 @@ func (p *initProcess) start() (err error) {
|
|||
return newSystemError(err)
|
||||
}
|
||||
p.setExternalDescriptors(fds)
|
||||
|
||||
// Do this before syncing with child so that no children
|
||||
// can escape the cgroup
|
||||
if err := p.manager.Apply(p.pid()); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
if err := p.manager.Set(p.config.Config); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
// TODO: should not be the responsibility to call here
|
||||
|
@ -232,13 +231,58 @@ func (p *initProcess) start() (err error) {
|
|||
if err := p.sendConfig(); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
// wait for the child process to fully complete and receive an error message
|
||||
// if one was encoutered
|
||||
var ierr *genericError
|
||||
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
|
||||
|
||||
var (
|
||||
procSync syncType
|
||||
sentRun bool
|
||||
ierr *genericError
|
||||
)
|
||||
|
||||
loop:
|
||||
for {
|
||||
if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
|
||||
if err == io.EOF {
|
||||
break loop
|
||||
}
|
||||
return newSystemError(err)
|
||||
}
|
||||
|
||||
switch procSync {
|
||||
case procStart:
|
||||
break loop
|
||||
case procReady:
|
||||
if err := p.manager.Set(p.config.Config); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
// Sync with child.
|
||||
if err := json.NewEncoder(p.parentPipe).Encode(procRun); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
sentRun = true
|
||||
case procError:
|
||||
// wait for the child process to fully complete and receive an error message
|
||||
// if one was encoutered
|
||||
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
|
||||
return newSystemError(err)
|
||||
}
|
||||
if ierr != nil {
|
||||
break loop
|
||||
}
|
||||
// Programmer error.
|
||||
panic("No error following JSON procError payload.")
|
||||
default:
|
||||
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
|
||||
}
|
||||
}
|
||||
if !sentRun {
|
||||
return newSystemError(fmt.Errorf("could not synchronise with container process"))
|
||||
}
|
||||
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
// Must be done after Shutdown so the child will exit and we can wait for it.
|
||||
if ierr != nil {
|
||||
p.wait()
|
||||
return newSystemError(ierr)
|
||||
}
|
||||
return nil
|
||||
|
@ -276,8 +320,7 @@ func (p *initProcess) sendConfig() error {
|
|||
if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil {
|
||||
return err
|
||||
}
|
||||
// shutdown writes for the parent side of the pipe
|
||||
return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *initProcess) createNetworkInterfaces() error {
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
package libcontainer
|
||||
|
||||
import (
|
||||
"io"
|
||||
"os"
|
||||
"syscall"
|
||||
|
||||
|
@ -14,6 +15,7 @@ import (
|
|||
)
|
||||
|
||||
type linuxStandardInit struct {
|
||||
pipe io.ReadWriter
|
||||
parentPid int
|
||||
config *initConfig
|
||||
}
|
||||
|
@ -50,7 +52,6 @@ func (l *linuxStandardInit) Init() error {
|
|||
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
label.Init()
|
||||
// InitializeMountNamespace() can be executed only for a new mount namespace
|
||||
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
|
||||
|
@ -75,7 +76,6 @@ func (l *linuxStandardInit) Init() error {
|
|||
return err
|
||||
}
|
||||
}
|
||||
|
||||
for _, path := range l.config.Config.ReadonlyPaths {
|
||||
if err := remountReadonly(path); err != nil {
|
||||
return err
|
||||
|
@ -90,6 +90,12 @@ func (l *linuxStandardInit) Init() error {
|
|||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// Tell our parent that we're ready to Execv. This must be done before the
|
||||
// Seccomp rules have been applied, because we need to be able to read and
|
||||
// write to a socket.
|
||||
if err := syncParentReady(l.pipe); err != nil {
|
||||
return err
|
||||
}
|
||||
if l.config.Config.Seccomp != nil {
|
||||
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
||||
return err
|
||||
|
|
Loading…
Reference in New Issue