From a3a632ad28de68eb375168f39639dc160fac067d Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 28 Jul 2017 08:44:45 +0000 Subject: [PATCH 1/3] checkpoint: add support to query for lazy page support Before adding the actual lazy migration support, this adds the feature check for lazy-pages. Right now lazy migration, which is based on userfaultfd, is only available in the criu-dev branch and not yet in a release. As the check does not depend on a certain version but on a CRIU feature which can be queried it can be part of runC without a new version check depending on a feature from criu-dev. Signed-off-by: Adrian Reber --- libcontainer/container_linux.go | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 9e1b74d7..b034fa0c 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -600,9 +600,24 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc. logrus.Debugf("Feature check says: %s", criuFeatures) missingFeatures := false - if *criuFeat.MemTrack && !*criuFeatures.MemTrack { - missingFeatures = true - logrus.Debugf("CRIU does not support MemTrack") + // The outer if checks if the fields actually exist + if (criuFeat.MemTrack != nil) && + (criuFeatures.MemTrack != nil) { + // The inner if checks if they are set to true + if *criuFeat.MemTrack && !*criuFeatures.MemTrack { + missingFeatures = true + logrus.Debugf("CRIU does not support MemTrack") + } + } + + // This needs to be repeated for every new feature check. + // Is there a way to put this in a function. Reflection? 
+ if (criuFeat.LazyPages != nil) && + (criuFeatures.LazyPages != nil) { + if *criuFeat.LazyPages && !*criuFeatures.LazyPages { + missingFeatures = true + logrus.Debugf("CRIU does not support LazyPages") + } } if missingFeatures { From 60ae7091de607c3ac87c886e90418a527cac5f22 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 24 Jul 2017 15:43:14 +0000 Subject: [PATCH 2/3] checkpoint: support lazy migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the help of userfaultfd CRIU supports lazy migration. Lazy migration means that memory pages are only transferred from the migration source to the migration destination on page fault. This makes it possible to reduce the downtime during process or container migration to a minimum as the memory does not need to be transferred during migration. Lazy migration currently depends on userfaultfd being available on the current Linux kernel and on the CRIU version in use supporting lazy migration. Both dependencies can be checked by querying CRIU via RPC if the lazy migration feature is available. Using feature checking instead of version comparison enables runC to use CRIU features from the criu-dev branch. This way the user can decide if lazy migration should be available by choosing the right kernel and CRIU branch. To use lazy migration the CRIU process during dump needs to dump everything besides the memory pages and then it opens a network port waiting for remote page fault requests: # runc checkpoint httpd --lazy-pages --page-server 0.0.0.0:27 \ --status-fd /tmp/postcopy-pipe In this example CRIU will hang/wait once it has opened the network port and wait for a network connection. As runC waits for CRIU to finish it will also hang until the lazy migration has finished. 
To know when the restore on the destination side can start, the '--status-fd' parameter is used: # runc checkpoint --help | grep status --status-fd value criu writes \0 to this FD once lazy-pages is ready The parameter '--status-fd' is directly from CRIU and this way the process outside of runC which controls the migration knows exactly when to transfer the checkpoint (without memory pages) to the destination and that the restore can be started. On the destination side it is necessary to start CRIU in 'lazy-pages' mode like this: # criu lazy-pages --page-server --address 192.168.122.3 --port 27 \ -D checkpoint and tell runC to do a lazy restore: # runc restore -d --image-path checkpoint --work-path checkpoint \ --lazy-pages httpd If both processes on the restore side have the same working directory 'criu lazy-pages' creates a unix domain socket where it waits for requests from the actual restore. runC starts CRIU restore in lazy restore mode and tells 'criu lazy-pages' that it wants to restore memory pages on demand. CRIU continues to restore the process and once the process is running and accesses the first non-existing memory page the 'criu lazy-pages' server will request the page from the source system. Thus all pages from the source system will be transferred to the destination system. Once all pages have been transferred runC on the source system will end and the container will have finished migration. This can also be combined with CRIU's pre-copy support. The combination of pre-copy and post-copy (lazy migration) provides the possibility to migrate containers with minimal downtimes. 
Some additional background about post-copy migration can be found in these articles: https://lisas.de/~adrian/?p=1253 https://lisas.de/~adrian/?p=1183 Signed-off-by: Adrian Reber --- checkpoint.go | 2 ++ libcontainer/container_linux.go | 39 +++++++++++++++++++++++++++++++++ libcontainer/criu_opts_linux.go | 2 ++ restore.go | 6 +++++ 4 files changed, 49 insertions(+) diff --git a/checkpoint.go b/checkpoint.go index d62816b7..41c2ac56 100644 --- a/checkpoint.go +++ b/checkpoint.go @@ -30,6 +30,8 @@ checkpointed.`, cli.BoolFlag{Name: "tcp-established", Usage: "allow open tcp connections"}, cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external unix sockets"}, cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"}, + cli.BoolFlag{Name: "lazy-pages", Usage: "use userfaultfd to lazily restore memory pages"}, + cli.StringFlag{Name: "status-fd", Value: "", Usage: "criu writes \\0 to this FD once lazy-pages is ready"}, cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"}, cli.BoolFlag{Name: "file-locks", Usage: "handle file locks, for safety"}, cli.BoolFlag{Name: "pre-dump", Usage: "dump container's memory information only, leave the container running after this"}, diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index b034fa0c..d76a3efb 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -773,6 +773,25 @@ func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error { } req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) } + return nil +} + +func waitForCriuLazyServer(r *os.File, status string) error { + + data := make([]byte, 1) + _, err := r.Read(data) + if err != nil { + return err + } + fd, err := os.OpenFile(status, os.O_TRUNC|os.O_WRONLY, os.ModeAppend) + if err != nil { + return err + } + _, err = fd.Write(data) + if err != nil { + return err + } + fd.Close() return nil } @@ -840,6 +859,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { 
EmptyNs: proto.Uint32(criuOpts.EmptyNs), OrphanPtsMaster: proto.Bool(true), AutoDedup: proto.Bool(criuOpts.AutoDedup), + LazyPages: proto.Bool(criuOpts.LazyPages), } fcg := c.cgroupManager.GetPaths()["freezer"] @@ -890,6 +910,24 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { Opts: &rpcOpts, } + if criuOpts.LazyPages { + // lazy migration requested; check if criu supports it + feat := criurpc.CriuFeatures{ + LazyPages: proto.Bool(true), + } + + if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { + return err + } + + statusRead, statusWrite, err := os.Pipe() + if err != nil { + return err + } + rpcOpts.StatusFd = proto.Int32(int32(statusWrite.Fd())) + go waitForCriuLazyServer(statusRead, criuOpts.StatusFd) + } + //no need to dump these information in pre-dump if !criuOpts.PreDump { for _, m := range c.config.Mounts { @@ -1042,6 +1080,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { EmptyNs: proto.Uint32(criuOpts.EmptyNs), OrphanPtsMaster: proto.Bool(true), AutoDedup: proto.Bool(criuOpts.AutoDedup), + LazyPages: proto.Bool(criuOpts.LazyPages), }, } diff --git a/libcontainer/criu_opts_linux.go b/libcontainer/criu_opts_linux.go index 8f142c9f..adcb7cb6 100644 --- a/libcontainer/criu_opts_linux.go +++ b/libcontainer/criu_opts_linux.go @@ -35,4 +35,6 @@ type CriuOpts struct { ManageCgroupsMode cgMode // dump or restore cgroup mode EmptyNs uint32 // don't c/r properties for namespace from this mask AutoDedup bool // auto deduplication for incremental dumps + LazyPages bool // restore memory pages lazily using userfaultfd + StatusFd string // fd for feedback when lazy server is ready } diff --git a/restore.go b/restore.go index 7342f9df..362be62d 100644 --- a/restore.go +++ b/restore.go @@ -86,6 +86,10 @@ using the runc checkpoint command.`, Name: "auto-dedup", Usage: "enable auto deduplication of memory images", }, + cli.BoolFlag{ + Name: "lazy-pages", + Usage: "use userfaultfd to lazily restore 
memory pages", + }, }, Action: func(context *cli.Context) error { if err := checkArgs(context, 1, exactArgs); err != nil { @@ -128,5 +132,7 @@ func criuOptions(context *cli.Context) *libcontainer.CriuOpts { FileLocks: context.Bool("file-locks"), PreDump: context.Bool("pre-dump"), AutoDedup: context.Bool("auto-dedup"), + LazyPages: context.Bool("lazy-pages"), + StatusFd: context.String("status-fd"), } } From ec260653b7d4ed47392a87001e11ac6178c70268 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 10 Aug 2017 19:24:31 +0000 Subject: [PATCH 3/3] lazy-migration: add test case The lazy-pages test case is not as straightforward as the other test cases. This is related to the fact that restoring requires a different name if restored on the same host. During 'runc checkpoint' the container is not destroyed before all memory pages have been transferred to the destination and thus the same container name cannot be used. As real world usage will rather migrate a container from one system to another than lazy migrate a container on the same host this is only problematic for this test case. Another reason is that it requires starting 'runc checkpoint' and 'criu lazy-pages' in the background as those processes need to be running to start the final restore 'runc restore'. CRIU upstream is currently discussing automatically starting 'criu lazy-pages' which would simplify the lazy-pages test case a bit. The handling and checking of the background processes make the test case not the most elegant as at one point a 'sleep 2' is required to make sure that 'runc checkpoint' had time to do its thing before looking at log files. Before running the actual test criu is called in feature checking mode to make sure lazy migration is enabled in the criu used by the test case. If not, the test is skipped. 
Signed-off-by: Adrian Reber --- tests/integration/checkpoint.bats | 108 ++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/tests/integration/checkpoint.bats b/tests/integration/checkpoint.bats index a99d6d72..27d6edc9 100644 --- a/tests/integration/checkpoint.bats +++ b/tests/integration/checkpoint.bats @@ -122,3 +122,111 @@ function teardown() { [ "$status" -eq 0 ] [[ "${output}" == *"ponG Ping"* ]] } + +@test "checkpoint --lazy-pages and restore" { + # XXX: currently criu require root containers. + requires criu root + + # check if lazy-pages is supported + run ${CRIU} check --feature lazy_pages + if [ "$status" -eq 1 ]; then + # this criu does not support lazy migration; skip the test + skip "this criu does not support lazy migration" + fi + + sed -i 's;"terminal": true;"terminal": false;' config.json + sed -i 's;"readonly": true;"readonly": false;' config.json + sed -i 's/"sh"/"sh","-c","for i in `seq 10`; do read xxx || continue; echo ponG $xxx; done"/' config.json + + # The following code creates pipes for stdin and stdout. + # CRIU can't handle fifo-s, so we need all these tricks. + fifo=`mktemp -u /tmp/runc-fifo-XXXXXX` + mkfifo $fifo + + # For lazy migration we need to know when CRIU is ready to serve + # the memory pages via TCP. + lazy_pipe=`mktemp -u /tmp/lazy-pipe-XXXXXX` + mkfifo $lazy_pipe + + # TCP port for lazy migration + port=27277 + + # stdout + cat $fifo | cat $fifo & + pid=$! + exec 50/proc/$pid/fd/0 + + # stdin + cat $fifo | cat $fifo & + pid=$! + exec 60/proc/$pid/fd/0 + + echo -n > $fifo + unlink $fifo + + # run busybox + __runc run -d test_busybox <&60 >&51 2>&51 + [ $? -eq 0 ] + + testcontainer test_busybox running + + # checkpoint the running container + mkdir image-dir + mkdir work-dir + # Double fork taken from helpers.bats + # We need to start 'runc checkpoint --lazy-pages' in the background, + # so we double fork in the shell. 
+ (runc --criu "$CRIU" checkpoint --lazy-pages --page-server 0.0.0.0:${port} --status-fd ${lazy_pipe} --work-path ./work-dir --image-path ./image-dir test_busybox & ) & + # Sleeping here. This is ugly, but not sure how else to handle it. + # The return code of the in the background running runc is needed, if + # there is some basic error. If the lazy migration is ready can + # be handled by $lazy_pipe. Which probably will always be ready + # after sleeping two seconds. + sleep 2 + # Check if inventory.img was written + [ -e image-dir/inventory.img ] + # If the inventory.img exists criu checkpointed some things, let's see + # if there were other errors in the log file. + run grep -B 5 Error ./work-dir/dump.log -q + [ "$status" -eq 1 ] + + # This will block until CRIU is ready to serve memory pages + cat $lazy_pipe + [ "$status" -eq 1 ] + + unlink $lazy_pipe + + # Double fork taken from helpers.bats + # We need to start 'criu lazy-pages' in the background, + # so we double fork in the shell. + # Start CRIU in lazy-daemon mode + $(${CRIU} lazy-pages --page-server --address 127.0.0.1 --port ${port} -D image-dir &) & + + # Restore lazily from checkpoint. + # The restored container needs a different name as the checkpointed + # container is not yet destroyed. It is only destroyed at that point + # in time when the last page is lazily transferred to the destination. + # Killing the CRIU on the checkpoint side will let the container + # continue to run if the migration failed at some point. + __runc --criu "$CRIU" restore -d --work-path ./image-dir --image-path ./image-dir --lazy-pages test_busybox_restore <&60 >&51 2>&51 + ret=$? 
+ [ $ret -eq 0 ] + run grep -B 5 Error ./work-dir/dump.log -q + [ "$status" -eq 1 ] + + # busybox should be back up and running + testcontainer test_busybox_restore running + + runc exec --cwd /bin test_busybox_restore echo ok + [ "$status" -eq 0 ] + [[ ${output} == "ok" ]] + + echo Ping >&61 + exec 61>&- + exec 51>&- + run cat <&50 + [ "$status" -eq 0 ] + [[ "${output}" == *"ponG Ping"* ]] +}