criu: restore into existing namespace when specified

Using CRIU to checkpoint and restore a container into an existing
network namespace is not possible.

If the network namespace is defined like

	{
		"type": "network",
		"path": "/run/netns/test"
	}

there is the expectation that the restored container is again running in
the network namespace specified with 'path'.

This adds the new CRIU 'external namespace' feature to runc, where
during checkpointing that specific namespace is referenced and during
restore CRIU tries to restore the container in exactly that
namespace.

This breaks/fixes current runc behavior. If, without this patch, runc
restores a container with such a network namespace definition, it is
ignored and CRIU recreates a network namespace without a name.

With this patch runc uses the network namespace path (if available) to
checkpoint and restore the container in just that network namespace.

Restore will now fail if a container was checkpointed with a network
namespace path set and if that network namespace path does not exist
during restore.

runc still falls back to the old behavior if CRIU older than 3.11 is
installed.

Fixes #1786

Related to https://github.com/projectatomic/libpod/pull/469

Thanks to Andrei Vagin for all the help in getting the interface between
CRIU and runc right!

Signed-off-by: Adrian Reber <areber@redhat.com>
This commit is contained in:
Adrian Reber 2018-07-02 20:22:38 +00:00 committed by Adrian Reber
parent 308daade45
commit fa43a72aba
Failed to extract signature
1 changed files with 69 additions and 5 deletions

View File

@ -657,7 +657,7 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.
Features: criuFeat, Features: criuFeat,
} }
err := c.criuSwrk(nil, req, criuOpts, false) err := c.criuSwrk(nil, req, criuOpts, false, nil)
if err != nil { if err != nil {
logrus.Debugf("%s", err) logrus.Debugf("%s", err)
return fmt.Errorf("CRIU feature check failed") return fmt.Errorf("CRIU feature check failed")
@ -770,7 +770,7 @@ func (c *linuxContainer) checkCriuVersion(minVersion int) error {
Type: &t, Type: &t,
} }
err := c.criuSwrk(nil, req, nil, false) err := c.criuSwrk(nil, req, nil, false, nil)
if err != nil { if err != nil {
return fmt.Errorf("CRIU version check failed: %s", err) return fmt.Errorf("CRIU version check failed: %s", err)
} }
@ -928,6 +928,33 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
LazyPages: proto.Bool(criuOpts.LazyPages), LazyPages: proto.Bool(criuOpts.LazyPages),
} }
// If the container is running in a network namespace and has
// a path to the network namespace configured, we will dump
// that network namespace as an external namespace and we
// will expect that the namespace exists during restore.
// This basically means that CRIU will ignore the namespace
// and expect to be setup correctly.
nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
if nsPath != "" {
// For this to work we need at least criu 3.11.0 => 31100.
// As there was already a successful version check we will
// not error out if it fails. runc will just behave as it used
// to do and ignore external network namespaces.
err := c.checkCriuVersion(31100)
if err == nil {
// CRIU expects the information about an external namespace
// like this: --external net[<inode>]:<key>
// This <key> is always 'extRootNetNS'.
var netns syscall.Stat_t
err = syscall.Stat(nsPath, &netns)
if err != nil {
return err
}
criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino)
rpcOpts.External = append(rpcOpts.External, criuExternal)
}
}
fcg := c.cgroupManager.GetPaths()["freezer"] fcg := c.cgroupManager.GetPaths()["freezer"]
if fcg != "" { if fcg != "" {
rpcOpts.FreezeCgroup = proto.String(fcg) rpcOpts.FreezeCgroup = proto.String(fcg)
@ -1032,7 +1059,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
} }
} }
err = c.criuSwrk(nil, req, criuOpts, false) err = c.criuSwrk(nil, req, criuOpts, false, nil)
if err != nil { if err != nil {
return err return err
} }
@ -1076,6 +1103,8 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
var extraFiles []*os.File
// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
// support for unprivileged restore at the moment. // support for unprivileged restore at the moment.
if c.config.Rootless { if c.config.Rootless {
@ -1150,6 +1179,38 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
}, },
} }
// Same as during checkpointing. If the container has a specific network namespace
// assigned to it, this now expects that the checkpoint will be restored in a
// already created network namespace.
nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
if nsPath != "" {
// For this to work we need at least criu 3.11.0 => 31100.
// As there was already a successful version check we will
// not error out if it fails. runc will just behave as it used
// to do and ignore external network namespaces.
err := c.checkCriuVersion(31100)
if err == nil {
// CRIU wants the information about an existing network namespace
// like this: --inherit-fd fd[<fd>]:<key>
// The <key> needs to be the same as during checkpointing.
// We are always using 'extRootNetNS' as the key in this.
netns, err := os.Open(nsPath)
defer netns.Close()
if err != nil {
logrus.Error("If a specific network namespace is defined it must exist: %s", err)
return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
}
inheritFd := new(criurpc.InheritFd)
inheritFd.Key = proto.String("extRootNetNS")
// The offset of four is necessary because 0, 1, 2 and 3 is already
// used by stdin, stdout, stderr, 'criu swrk' socket.
inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles)))
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
// All open FDs need to be transferred to CRIU via extraFiles
extraFiles = append(extraFiles, netns)
}
}
for _, m := range c.config.Mounts { for _, m := range c.config.Mounts {
switch m.Device { switch m.Device {
case "bind": case "bind":
@ -1208,7 +1269,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
} }
} }
return c.criuSwrk(process, req, criuOpts, true) return c.criuSwrk(process, req, criuOpts, true, extraFiles)
} }
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
@ -1238,7 +1299,7 @@ func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
return nil return nil
} }
func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error { func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
if err != nil { if err != nil {
return err return err
@ -1279,6 +1340,9 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
cmd.Stderr = process.Stderr cmd.Stderr = process.Stderr
} }
cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
if extraFiles != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
}
if err := cmd.Start(); err != nil { if err := cmd.Start(); err != nil {
return err return err