From 610c5ad75cb69f49094719ffda5d34ba65d1a092 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 21 Jul 2020 11:33:51 +0000 Subject: [PATCH 1/2] Factor out checkpointing with external namespace code To checkpoint and restore a container with an external network namespace (like with Podman and CNI), runc tells CRIU to ignore the network namespace during checkpoint and restore. This commit moves that code to their own functions to be able to reuse the same code path for external PID namespaces which are necessary for checkpointing and restoring containers out of a pod in cri-o. Signed-off-by: Adrian Reber --- libcontainer/container_linux.go | 125 ++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 47 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index c5188b1d..79f6d33c 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -836,6 +836,72 @@ func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) } } +func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool { + switch t { + case configs.NEWNET: + // CRIU supports different external namespace with different released CRIU versions. + // For network namespaces to work we need at least criu 3.11.0 => 31100. + return c.checkCriuVersion(31100) == nil + } + return false +} + +func (c *linuxContainer) criuNsToKey(t configs.NamespaceType) string { + return "extRoot" + strings.Title(configs.NsName(t)) + "NS" +} + +func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error { + if !c.criuSupportsExtNS(t) { + return nil + } + + nsPath := c.config.Namespaces.PathOf(t) + if nsPath == "" { + return nil + } + // CRIU expects the information about an external namespace + // like this: --external <TYPE>[<inode>]:<key> + // This <key> is always 'extRoot<TYPE>NS'. 
+ var ns unix.Stat_t + if err := unix.Stat(nsPath, &ns); err != nil { + return err + } + criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, c.criuNsToKey(t)) + rpcOpts.External = append(rpcOpts.External, criuExternal) + + return nil +} + +func (c *linuxContainer) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error { + if !c.criuSupportsExtNS(t) { + return nil + } + + nsPath := c.config.Namespaces.PathOf(t) + if nsPath == "" { + return nil + } + // CRIU wants the information about an existing namespace + // like this: --inherit-fd fd[<fd>]:<key> + // The <key> needs to be the same as during checkpointing. + // We are always using 'extRoot<TYPE>NS' as the key in this. + nsFd, err := os.Open(nsPath) + if err != nil { + logrus.Errorf("If a specific network namespace is defined it must exist: %s", err) + return fmt.Errorf("Requested network namespace %v does not exist", nsPath) + } + inheritFd := new(criurpc.InheritFd) + inheritFd.Key = proto.String(c.criuNsToKey(t)) + // The offset of four is necessary because 0, 1, 2 and 3 is already + // used by stdin, stdout, stderr, 'criu swrk' socket. + inheritFd.Fd = proto.Int32(int32(4 + len(*extraFiles))) + rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd) + // All open FDs need to be transferred to CRIU via extraFiles + *extraFiles = append(*extraFiles, nsFd) + + return nil +} + func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() @@ -909,25 +975,8 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { // will expect that the namespace exists during restore. // This basically means that CRIU will ignore the namespace // and expect to be setup correctly. - nsPath := c.config.Namespaces.PathOf(configs.NEWNET) - if nsPath != "" { - // For this to work we need at least criu 3.11.0 => 31100. - // As there was already a successful version check we will - // not error out if it fails. 
runc will just behave as it used - // to do and ignore external network namespaces. - err := c.checkCriuVersion(31100) - if err == nil { - // CRIU expects the information about an external namespace - // like this: --external net[<inode>]:<key> - // This <key> is always 'extRootNetNS'. - var netns unix.Stat_t - err = unix.Stat(nsPath, &netns) - if err != nil { - return err - } - criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino) - rpcOpts.External = append(rpcOpts.External, criuExternal) - } + if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil { + return err } // CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup @@ -1251,33 +1300,8 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { // Same as during checkpointing. If the container has a specific network namespace // assigned to it, this now expects that the checkpoint will be restored in a // already created network namespace. - nsPath := c.config.Namespaces.PathOf(configs.NEWNET) - if nsPath != "" { - // For this to work we need at least criu 3.11.0 => 31100. - // As there was already a successful version check we will - // not error out if it fails. runc will just behave as it used - // to do and ignore external network namespaces. - err := c.checkCriuVersion(31100) - if err == nil { - // CRIU wants the information about an existing network namespace - // like this: --inherit-fd fd[<fd>]:<key> - // The <key> needs to be the same as during checkpointing. - // We are always using 'extRootNetNS' as the key in this. - netns, err := os.Open(nsPath) - if err != nil { - logrus.Errorf("If a specific network namespace is defined it must exist: %s", err) - return fmt.Errorf("Requested network namespace %v does not exist", nsPath) - } - defer netns.Close() - inheritFd := new(criurpc.InheritFd) - inheritFd.Key = proto.String("extRootNetNS") - // The offset of four is necessary because 0, 1, 2 and 3 is already - // used by stdin, stdout, stderr, 'criu swrk' socket. 
- inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles))) - req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) - // All open FDs need to be transferred to CRIU via extraFiles - extraFiles = append(extraFiles, netns) - } + if err := c.handleRestoringExternalNamespaces(req.Opts, &extraFiles, configs.NEWNET); err != nil { + return err } // This will modify the rootfs of the container in the same way runc @@ -1345,7 +1369,14 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) } } - return c.criuSwrk(process, req, criuOpts, extraFiles) + err = c.criuSwrk(process, req, criuOpts, extraFiles) + + // Now that CRIU is done let's close all opened FDs CRIU needed. + for _, fd := range extraFiles { + fd.Close() + } + + return err } func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { From 09e103b01e34a7dda321781ec7f05939c9fe4adb Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 21 Jul 2020 15:33:24 +0000 Subject: [PATCH 2/2] Tell CRIU to use an external pid namespace if necessary Trying to checkpoint a container out of a pod in cri-o fails with: Error (criu/namespaces.c:1081): Can't dump a pid namespace without the process init Starting with the upcoming CRIU release 3.15, CRIU can be told to ignore the PID namespace during checkpointing and to restore processes into an existing PID namespace. With the changes from this commit and CRIU 3.15 it is possible to checkpoint a container out of a pod in cri-o. 
Signed-off-by: Adrian Reber --- libcontainer/container_linux.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 79f6d33c..ce62ea48 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -837,13 +837,19 @@ func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) } func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool { + var minVersion int switch t { case configs.NEWNET: // CRIU supports different external namespace with different released CRIU versions. // For network namespaces to work we need at least criu 3.11.0 => 31100. - return c.checkCriuVersion(31100) == nil + minVersion = 31100 + case configs.NEWPID: + // For PID namespaces criu 31500 is needed. + minVersion = 31500 + default: + return false } - return false + return c.checkCriuVersion(minVersion) == nil } func (c *linuxContainer) criuNsToKey(t configs.NamespaceType) string { @@ -979,6 +985,11 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { return err } + // Same for possible external PID namespaces + if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil { + return err + } + // CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup // is not set, CRIU uses ptrace() to pause the processes. // Note cgroup v2 freezer is only supported since CRIU release 3.14. @@ -1304,6 +1315,11 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { return err } + // Same for PID namespaces. + if err := c.handleRestoringExternalNamespaces(req.Opts, &extraFiles, configs.NEWPID); err != nil { + return err + } + // This will modify the rootfs of the container in the same way runc // modifies the container during initial creation. if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil {