From 60ae7091de607c3ac87c886e90418a527cac5f22 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 24 Jul 2017 15:43:14 +0000 Subject: [PATCH] checkpoint: support lazy migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the help of userfaultfd CRIU supports lazy migration. Lazy migration means that memory pages are only transferred from the migration source to the migration destination on page fault. This makes it possible to reduce the downtime during process or container migration to a minimum as the memory does not need to be transferred during migration. Lazy migration currently depends on userfaultfd being available in the running Linux kernel and on the CRIU version in use supporting lazy migration. Both dependencies can be checked by querying CRIU via RPC whether the lazy migration feature is available. Using feature checking instead of version comparison enables runC to use CRIU features from the criu-dev branch. This way the user can decide if lazy migration should be available by choosing the right kernel and CRIU branch. To use lazy migration the CRIU process during dump needs to dump everything besides the memory pages and then it opens a network port waiting for remote page fault requests: # runc checkpoint httpd --lazy-pages --page-server 0.0.0.0:27 \ --status-fd /tmp/postcopy-pipe In this example CRIU will hang/wait once it has opened the network port and wait for a network connection. As runC waits for CRIU to finish it will also hang until the lazy migration has finished. To know when the restore on the destination side can start the '--status-fd' parameter is used: # runc checkpoint --help | grep status --status-fd value criu writes \0 to this FD once lazy-pages is ready The parameter '--status-fd' is directly from CRIU and this way the process outside of runC which controls the migration knows exactly when to transfer the checkpoint (without memory pages) to the destination and that the restore can be started. 
On the destination side it is necessary to start CRIU in 'lazy-pages' mode like this: # criu lazy-pages --page-server --address 192.168.122.3 --port 27 \ -D checkpoint and tell runC to do a lazy restore: # runc restore -d --image-path checkpoint --work-path checkpoint \ --lazy-pages httpd If both processes on the restore side have the same working directory, 'criu lazy-pages' creates a unix domain socket where it waits for requests from the actual restore. runC starts CRIU restore in lazy restore mode and tells 'criu lazy-pages' that it wants to restore memory pages on demand. CRIU continues to restore the process and once the process is running and accesses the first non-existing memory page the 'criu lazy-pages' server will request the page from the source system. Thus all pages from the source system will be transferred to the destination system. Once all pages have been transferred runC on the source system will exit and the container will have finished migration. This can also be combined with CRIU's pre-copy support. The combination of pre-copy and post-copy (lazy migration) provides the possibility to migrate containers with minimal downtime. 
Some additional background about post-copy migration can be found in these articles: https://lisas.de/~adrian/?p=1253 https://lisas.de/~adrian/?p=1183 Signed-off-by: Adrian Reber --- checkpoint.go | 2 ++ libcontainer/container_linux.go | 39 +++++++++++++++++++++++++++++++++ libcontainer/criu_opts_linux.go | 2 ++ restore.go | 6 +++++ 4 files changed, 49 insertions(+) diff --git a/checkpoint.go b/checkpoint.go index d62816b7..41c2ac56 100644 --- a/checkpoint.go +++ b/checkpoint.go @@ -30,6 +30,8 @@ checkpointed.`, cli.BoolFlag{Name: "tcp-established", Usage: "allow open tcp connections"}, cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external unix sockets"}, cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"}, + cli.BoolFlag{Name: "lazy-pages", Usage: "use userfaultfd to lazily restore memory pages"}, + cli.StringFlag{Name: "status-fd", Value: "", Usage: "criu writes \\0 to this FD once lazy-pages is ready"}, cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"}, cli.BoolFlag{Name: "file-locks", Usage: "handle file locks, for safety"}, cli.BoolFlag{Name: "pre-dump", Usage: "dump container's memory information only, leave the container running after this"}, diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index b034fa0c..d76a3efb 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -773,6 +773,25 @@ func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error { } req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) } + return nil +} + +func waitForCriuLazyServer(r *os.File, status string) error { + + data := make([]byte, 1) + _, err := r.Read(data) + if err != nil { + return err + } + fd, err := os.OpenFile(status, os.O_TRUNC|os.O_WRONLY, os.ModeAppend) + if err != nil { + return err + } + _, err = fd.Write(data) + if err != nil { + return err + } + fd.Close() return nil } @@ -840,6 +859,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { 
EmptyNs: proto.Uint32(criuOpts.EmptyNs), OrphanPtsMaster: proto.Bool(true), AutoDedup: proto.Bool(criuOpts.AutoDedup), + LazyPages: proto.Bool(criuOpts.LazyPages), } fcg := c.cgroupManager.GetPaths()["freezer"] @@ -890,6 +910,24 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { Opts: &rpcOpts, } + if criuOpts.LazyPages { + // lazy migration requested; check if criu supports it + feat := criurpc.CriuFeatures{ + LazyPages: proto.Bool(true), + } + + if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { + return err + } + + statusRead, statusWrite, err := os.Pipe() + if err != nil { + return err + } + rpcOpts.StatusFd = proto.Int32(int32(statusWrite.Fd())) + go waitForCriuLazyServer(statusRead, criuOpts.StatusFd) + } + //no need to dump these information in pre-dump if !criuOpts.PreDump { for _, m := range c.config.Mounts { @@ -1042,6 +1080,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { EmptyNs: proto.Uint32(criuOpts.EmptyNs), OrphanPtsMaster: proto.Bool(true), AutoDedup: proto.Bool(criuOpts.AutoDedup), + LazyPages: proto.Bool(criuOpts.LazyPages), }, } diff --git a/libcontainer/criu_opts_linux.go b/libcontainer/criu_opts_linux.go index 8f142c9f..adcb7cb6 100644 --- a/libcontainer/criu_opts_linux.go +++ b/libcontainer/criu_opts_linux.go @@ -35,4 +35,6 @@ type CriuOpts struct { ManageCgroupsMode cgMode // dump or restore cgroup mode EmptyNs uint32 // don't c/r properties for namespace from this mask AutoDedup bool // auto deduplication for incremental dumps + LazyPages bool // restore memory pages lazily using userfaultfd + StatusFd string // fd for feedback when lazy server is ready } diff --git a/restore.go b/restore.go index 7342f9df..362be62d 100644 --- a/restore.go +++ b/restore.go @@ -86,6 +86,10 @@ using the runc checkpoint command.`, Name: "auto-dedup", Usage: "enable auto deduplication of memory images", }, + cli.BoolFlag{ + Name: "lazy-pages", + Usage: "use userfaultfd to lazily restore 
memory pages", + }, }, Action: func(context *cli.Context) error { if err := checkArgs(context, 1, exactArgs); err != nil { @@ -128,5 +132,7 @@ func criuOptions(context *cli.Context) *libcontainer.CriuOpts { FileLocks: context.Bool("file-locks"), PreDump: context.Bool("pre-dump"), AutoDedup: context.Bool("auto-dedup"), + LazyPages: context.Bool("lazy-pages"), + StatusFd: context.String("status-fd"), } }