diff --git a/.gitignore b/.gitignore index 2e3f79b4..4c9ade07 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ bundles nsinit/nsinit vendor/pkg +runc diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..27448585 --- /dev/null +++ b/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + Copyright 2014 Docker, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..db89b1f9 --- /dev/null +++ b/Makefile @@ -0,0 +1,9 @@ +all: + go build -o runc . + +install: + cp runc /usr/local/bin/runc + rm runc + +clean: + rm runc diff --git a/README.md b/README.md new file mode 100644 index 00000000..dfa1aa0c --- /dev/null +++ b/README.md @@ -0,0 +1,145 @@ +## runc + +`runc` is a CLI tool for spawning and running containers according to the OCF specification. + +### Building: + +```bash +git clone https://github.com/opencontainers/runc +make +sudo make install +``` + +### Using: + +To run a container that you received, just execute `runc` with the path to its JSON specification file as the argument, or have a +`container.json` file in the current working directory. 
+ +```bash +runc +/ $ ps +PID USER COMMAND +1 daemon sh +5 daemon sh +/ $ +``` + +### OCF Container JSON Format; + +```json +{ + "version": "0.1", + "os": "linux", + "arch": "amd64", + "processes": [ + { + "tty": true, + "user": "daemon", + "args": [ + "sh" + ], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "" + } + ], + "root": { + "path": "rootfs", + "readonly": true + }, + "cpus": 1.1, + "memory": 1024, + "hostname": "shell", + "namespaces": [ + { + "type": "process" + }, + { + "type": "network" + }, + { + "type": "mount" + }, + { + "type": "ipc" + }, + { + "type": "uts" + } + ], + "capabilities": [ + "AUDIT_WRITE", + "KILL", + "NET_BIND_SERVICE" + ], + "devices": [ + "null", + "random", + "full", + "tty", + "zero", + "urandom" + ], + "mounts": [ + { + "type": "proc", + "source": "proc", + "destination": "/proc", + "options": "" + }, + { + "type": "tmpfs", + "source": "tmpfs", + "destination": "/dev", + "options": "nosuid,strictatime,mode=755,size=65536k" + }, + { + "type": "devpts", + "source": "devpts", + "destination": "/dev/pts", + "options": "nosuid,noexec,newinstance,ptmxmode=0666,mode=0620,gid=5" + }, + { + "type": "tmpfs", + "source": "shm", + "destination": "/dev/shm", + "options": "nosuid,noexec,nodev,mode=1777,size=65536k" + }, + { + "type": "mqueue", + "source": "mqueue", + "destination": "/dev/mqueue", + "options": "nosuid,noexec,nodev" + }, + { + "type": "sysfs", + "source": "sysfs", + "destination": "/sys", + "options": "nosuid,noexec,nodev" + } + ] +} +``` + +### Examples: + +#### Using runc with systemd + +```service +[Unit] +Description=Minecraft Build Server +Documentation=http://minecraft.net +After=network.target + +[Service] +CPUQuota=200% +MemoryLimit=1536M +ExecStart=/usr/local/bin/runc +Restart=on-failure +WorkingDirectory=/containers/minecraftbuild + +[Install] +WantedBy=multi-user.target +``` diff --git a/checkpoint.go b/checkpoint.go new file mode 100644 index 
00000000..733a145c --- /dev/null +++ b/checkpoint.go @@ -0,0 +1,63 @@ +package main + +import ( + "fmt" + "strconv" + "strings" + + "github.com/codegangsta/cli" + "github.com/opencontainers/runc/libcontainer" +) + +var checkpointCommand = cli.Command{ + Name: "checkpoint", + Usage: "checkpoint a running container", + Flags: []cli.Flag{ + cli.StringFlag{Name: "image-path", Value: "", Usage: "path for saving criu image files"}, + cli.StringFlag{Name: "work-path", Value: "", Usage: "path for saving work files and logs"}, + cli.BoolFlag{Name: "leave-running", Usage: "leave the process running after checkpointing"}, + cli.BoolFlag{Name: "tcp-established", Usage: "allow open tcp connections"}, + cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external unix sockets"}, + cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"}, + cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"}, + }, + Action: func(context *cli.Context) { + container, err := getContainer(context) + if err != nil { + fatal(err) + } + options := criuOptions(context) + // these are the mandatory criu options for a container + setPageServer(context, options) + if err := container.Checkpoint(options); err != nil { + fatal(err) + } + }, +} + +func getCheckpointImagePath(context *cli.Context) string { + imagePath := context.String("image-path") + if imagePath == "" { + imagePath = getDefaultImagePath(context) + } + return imagePath +} + +func setPageServer(context *cli.Context, options *libcontainer.CriuOpts) { + // xxx following criu opts are optional + // The dump image can be sent to a criu page server + if psOpt := context.String("page-server"); psOpt != "" { + addressPort := strings.Split(psOpt, ":") + if len(addressPort) != 2 { + fatal(fmt.Errorf("Use --page-server ADDRESS:PORT to specify page server")) + } + port_int, err := strconv.Atoi(addressPort[1]) + if err != nil { + fatal(fmt.Errorf("Invalid port number")) + } + options.PageServer = 
libcontainer.CriuPageServerInfo{ + Address: addressPort[0], + Port: int32(port_int), + } + } +} diff --git a/events.go b/events.go new file mode 100644 index 00000000..174af906 --- /dev/null +++ b/events.go @@ -0,0 +1,93 @@ +package main + +import ( + "encoding/json" + "os" + "sync" + "time" + + "github.com/Sirupsen/logrus" + "github.com/codegangsta/cli" + "github.com/opencontainers/runc/libcontainer" +) + +// event struct for encoding the event data to json. +type event struct { + Type string `json:"type"` + ID string `json:"id"` + Data interface{} `json:"data,omitempty"` +} + +var eventsCommand = cli.Command{ + Name: "events", + Usage: "display container events such as OOM notifications and cpu, memeory, IO, and network stats", + Flags: []cli.Flag{ + cli.DurationFlag{Name: "interval", Value: 5 * time.Second, Usage: "set the stats collection interval"}, + cli.BoolFlag{Name: "stats", Usage: "display the container's stats then exit"}, + }, + Action: func(context *cli.Context) { + container, err := getContainer(context) + if err != nil { + logrus.Fatal(err) + } + var ( + stats = make(chan *libcontainer.Stats, 1) + events = make(chan *event, 1024) + group = &sync.WaitGroup{} + ) + group.Add(1) + go func() { + defer group.Done() + enc := json.NewEncoder(os.Stdout) + for e := range events { + if err := enc.Encode(e); err != nil { + logrus.Error(err) + } + } + }() + if context.Bool("stats") { + s, err := container.Stats() + if err != nil { + fatal(err) + } + events <- &event{Type: "stats", ID: container.ID(), Data: s} + close(events) + group.Wait() + return + } + go func() { + for _ = range time.Tick(context.Duration("interval")) { + s, err := container.Stats() + if err != nil { + logrus.Error(err) + continue + } + stats <- s + } + }() + n, err := container.NotifyOOM() + if err != nil { + logrus.Fatal(err) + } + for { + select { + case _, ok := <-n: + if ok { + // this means an oom event was received, if it is !ok then + // the channel was closed because the container 
stopped and + // the cgroups no longer exist. + events <- &event{Type: "oom", ID: container.ID()} + } else { + n = nil + } + case s := <-stats: + events <- &event{Type: "stats", ID: container.ID(), Data: s} + } + if n == nil { + close(events) + break + } + } + group.Wait() + }, +} diff --git a/main.go b/main.go new file mode 100644 index 00000000..42517e12 --- /dev/null +++ b/main.go @@ -0,0 +1,99 @@ +package main + +import ( + "os" + "runtime" + + "github.com/Sirupsen/logrus" + "github.com/codegangsta/cli" + "github.com/opencontainers/runc/libcontainer" + _ "github.com/opencontainers/runc/libcontainer/nsenter" +) + +const ( + version = "0.1" + usage = `open container runtime + +runc integrates well with existing process supervisors to provide a production container runtime environment for +applications. It can be used with your existing process monitoring tools and the container will be spawned as direct +child of the process supervisor. nsinit can be used to manage the lifetime of a single container. 
+ +Execute a simple container in your shell by running: + + cd /mycontainer + runc +` +) + +func init() { + if len(os.Args) > 1 && os.Args[1] == "init" { + runtime.GOMAXPROCS(1) + runtime.LockOSThread() + factory, _ := libcontainer.New("") + if err := factory.StartInitialization(); err != nil { + fatal(err) + } + panic("--this line should never been executed, congradulations--") + } +} + +func main() { + app := cli.NewApp() + app.Name = "runc" + app.Usage = usage + app.Version = version + app.Flags = []cli.Flag{ + cli.StringFlag{ + Name: "id", + Value: getDefaultID(), + Usage: "specify the ID to be used for the container", + }, + cli.BoolFlag{ + Name: "debug", + Usage: "enable debug output for logging", + }, + cli.StringFlag{ + Name: "root", + Value: "/var/run/ocf", + Usage: "root directory for storage of container state (this should be located in tmpfs)", + }, + cli.StringFlag{ + Name: "criu", + Value: "criu", + Usage: "path to the criu binary used for checkpoint and restore", + }, + } + app.Commands = []cli.Command{ + checkpointCommand, + eventsCommand, + restoreCommand, + specCommand, + } + app.Before = func(context *cli.Context) error { + if context.GlobalBool("debug") { + logrus.SetLevel(logrus.DebugLevel) + } + return nil + } + // default action is to execute a container + app.Action = func(context *cli.Context) { + if os.Geteuid() != 0 { + cli.ShowAppHelp(context) + logrus.Fatal("runc should be run as root") + } + spec, err := loadSpec(context.Args().First()) + if err != nil { + fatal(err) + } + status, err := execContainer(context, spec) + if err != nil { + fatal(err) + } + // exit with the container's exit status so any external supervisor is + // notified of the exit with the correct exit status. 
+ os.Exit(status) + } + if err := app.Run(os.Args); err != nil { + logrus.Fatal(err) + } +} diff --git a/restore.go b/restore.go new file mode 100644 index 00000000..b186bbf4 --- /dev/null +++ b/restore.go @@ -0,0 +1,129 @@ +package main + +import ( + "os" + "os/signal" + "syscall" + + "github.com/Sirupsen/logrus" + "github.com/codegangsta/cli" + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" +) + +var restoreCommand = cli.Command{ + Name: "restore", + Usage: "restore a container from a previous checkpoint", + Flags: []cli.Flag{ + cli.StringFlag{Name: "image-path", Value: "", Usage: "path to criu image files for restoring"}, + cli.StringFlag{Name: "work-path", Value: "", Usage: "path for saving work files and logs"}, + cli.BoolFlag{Name: "tcp-established", Usage: "allow open tcp connections"}, + cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external unix sockets"}, + cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"}, + }, + Action: func(context *cli.Context) { + imagePath := context.String("image-path") + if imagePath == "" { + imagePath = getDefaultImagePath(context) + } + spec, err := loadSpec(context.Args().First()) + if err != nil { + fatal(err) + } + config, err := createLibcontainerConfig(spec) + if err != nil { + fatal(err) + } + status, err := restoreContainer(context, spec, config, imagePath) + if err != nil { + fatal(err) + } + os.Exit(status) + }, +} + +func restoreContainer(context *cli.Context, spec *LinuxSpec, config *configs.Config, imagePath string) (code int, err error) { + rootuid := 0 + factory, err := loadFactory(context) + if err != nil { + return -1, err + } + container, err := factory.Load(context.GlobalString("id")) + if err != nil { + container, err = factory.Create(context.GlobalString("id"), config) + if err != nil { + return -1, err + } + } + options := criuOptions(context) + // ensure that the container is always 
removed if we were the process + // that created it. + defer func() { + if err != nil { + return + } + status, err := container.Status() + if err != nil { + logrus.Error(err) + } + if status != libcontainer.Checkpointed { + if err := container.Destroy(); err != nil { + logrus.Error(err) + } + if err := os.RemoveAll(options.ImagesDirectory); err != nil { + logrus.Error(err) + } + } + }() + process := &libcontainer.Process{ + Stdin: os.Stdin, + Stdout: os.Stdout, + Stderr: os.Stderr, + } + tty, err := newTty(spec.Processes[0].TTY, process, rootuid) + if err != nil { + return -1, err + } + defer tty.Close() + go handleSignals(process, tty) + if err := container.Restore(process, options); err != nil { + return -1, err + } + status, err := process.Wait() + if err != nil { + return -1, err + } + return utils.ExitStatus(status.Sys().(syscall.WaitStatus)), nil +} + +func criuOptions(context *cli.Context) *libcontainer.CriuOpts { + imagePath := getCheckpointImagePath(context) + if err := os.MkdirAll(imagePath, 0655); err != nil { + fatal(err) + } + return &libcontainer.CriuOpts{ + ImagesDirectory: imagePath, + WorkDirectory: context.String("work-path"), + LeaveRunning: context.Bool("leave-running"), + TcpEstablished: true, // context.Bool("tcp-established"), + ExternalUnixConnections: context.Bool("ext-unix-sk"), + ShellJob: context.Bool("shell-job"), + } +} + +// we have to use this type of signal handler because there is a memory leak if we +// wait and reap with SICHLD. 
+func handleSignals(process *libcontainer.Process, tty *tty) { + sigc := make(chan os.Signal, 10) + signal.Notify(sigc) + tty.resize() + for sig := range sigc { + switch sig { + case syscall.SIGWINCH: + tty.resize() + default: + process.Signal(sig) + } + } +} diff --git a/run.go b/run.go new file mode 100644 index 00000000..98e819c8 --- /dev/null +++ b/run.go @@ -0,0 +1,57 @@ +package main + +import ( + "fmt" + + "github.com/Sirupsen/logrus" + "github.com/codegangsta/cli" + "github.com/opencontainers/runc/libcontainer" +) + +func execContainer(context *cli.Context, spec *LinuxSpec) (int, error) { + if len(spec.Processes) != 1 { + return -1, fmt.Errorf("runc only supports one(1) process for the container") + } + config, err := createLibcontainerConfig(spec) + if err != nil { + return -1, err + } + rootuid, err := config.HostUID() + if err != nil { + return -1, err + } + factory, err := loadFactory(context) + if err != nil { + return -1, err + } + container, err := factory.Create(context.GlobalString("id"), config) + if err != nil { + return -1, err + } + // ensure that the container is always removed if we were the process + // that created it. 
+ defer destroy(container) + process := newProcess(spec.Processes[0]) + tty, err := newTty(spec.Processes[0].TTY, process, rootuid) + if err != nil { + return -1, err + } + handler := newSignalHandler(tty) + defer handler.Close() + if err := container.Start(process); err != nil { + return -1, err + } + return handler.forward(process) +} + +func destroy(container libcontainer.Container) { + status, err := container.Status() + if err != nil { + logrus.Error(err) + } + if status != libcontainer.Checkpointed { + if err := container.Destroy(); err != nil { + logrus.Error(err) + } + } +} diff --git a/signals.go b/signals.go new file mode 100644 index 00000000..4c1463e8 --- /dev/null +++ b/signals.go @@ -0,0 +1,108 @@ +package main + +import ( + "os" + "os/signal" + "syscall" + + "github.com/Sirupsen/logrus" + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/utils" +) + +const signalBufferSize = 2048 + +// newSignalHandler returns a signal handler for processing SIGCHLD and SIGWINCH signals +// while still forwarding all other signals to the process. +func newSignalHandler(tty *tty) *signalHandler { + // ensure that we have a large buffer size so that we do not miss any signals + // incase we are not processing them fast enough. + s := make(chan os.Signal, signalBufferSize) + // handle all signals for the process. + signal.Notify(s) + return &signalHandler{ + tty: tty, + signals: s, + } +} + +// exit models a process exit status with the pid and +// exit status. +type exit struct { + pid int + status int +} + +type signalHandler struct { + signals chan os.Signal + tty *tty +} + +// forward handles the main signal event loop forwarding, resizing, or reaping depeding +// on the signal received. +func (h *signalHandler) forward(process *libcontainer.Process) (int, error) { + // make sure we know the pid of our main process so that we can return + // after it dies. 
+ pid1, err := process.Pid() + if err != nil { + return -1, err + } + // perform the initial tty resize. + h.tty.resize() + for s := range h.signals { + switch s { + case syscall.SIGWINCH: + h.tty.resize() + case syscall.SIGCHLD: + exits, err := h.reap() + if err != nil { + logrus.Error(err) + } + for _, e := range exits { + logrus.WithFields(logrus.Fields{ + "pid": e.pid, + "status": e.status, + }).Debug("process exited") + if e.pid == pid1 { + // call Wait() on the process even though we already have the exit + // status because we must ensure that any of the go specific process + // fun such as flushing pipes are complete before we return. + process.Wait() + return e.status, nil + } + } + default: + logrus.Debugf("sending signal to process %s", s) + if err := syscall.Kill(pid1, s.(syscall.Signal)); err != nil { + logrus.Error(err) + } + } + } + return -1, nil +} + +// reap runs wait4 in a loop until we have finished processing any existing exits +// then returns all exits to the main event loop for further processing. 
+func (h *signalHandler) reap() (exits []exit, err error) { + var ( + ws syscall.WaitStatus + rus syscall.Rusage + ) + for { + pid, err := syscall.Wait4(-1, &ws, syscall.WNOHANG, &rus) + if err != nil { + if err == syscall.ECHILD { + return exits, nil + } + return nil, err + } + exits = append(exits, exit{ + pid: pid, + status: utils.ExitStatus(ws), + }) + } +} + +func (h *signalHandler) Close() error { + return h.tty.Close() +} diff --git a/spec.go b/spec.go new file mode 100644 index 00000000..5fc24115 --- /dev/null +++ b/spec.go @@ -0,0 +1,146 @@ +package main + +import ( + "encoding/json" + "fmt" + "runtime" + + "github.com/codegangsta/cli" +) + +const cpuQuotaMultiplyer = 100000 + +type Mount struct { + Type string `json:"type"` + Source string `json:"source"` + Destination string `json:"destination"` + Options string `json:"options"` +} + +type Process struct { + TTY bool `json:"tty"` + User string `json:"user"` + Args []string `json:"args"` + Env []string `json:"env"` + Cwd string `json:"cwd"` +} + +type Root struct { + Path string `json:"path"` + Readonly bool `json:"readonly"` +} + +type Namespace struct { + Type string `json:"type"` + Path string `json:"path,omitempty"` +} + +type PortableSpec struct { + Version string `json:"version"` + OS string `json:"os"` + Arch string `json:"arch"` + Processes []*Process `json:"processes"` + Root Root `json:"root"` + Cpus float64 `json:"cpus"` // in 1.1 for 110% cpus + Memory int64 `json:"memory"` // in mb; 1024m + Hostname string `json:"hostname"` + Namespaces []Namespace `json:"namespaces"` + Capabilities []string `json:"capabilities"` + Devices []string `json:"devices"` + Mounts []Mount `json:"mounts"` +} + +var specCommand = cli.Command{ + Name: "spec", + Usage: "create a new specification file", + Action: func(context *cli.Context) { + spec := PortableSpec{ + Version: version, + OS: runtime.GOOS, + Arch: runtime.GOARCH, + Root: Root{ + Path: "rootfs", + Readonly: true, + }, + Processes: []*Process{ + { + TTY: 
true,
+					User: "daemon",
+					Args: []string{
+						"sh",
+					},
+					Env: []string{
+						"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+						"TERM=xterm",
+					},
+				},
+			},
+			Cpus:     1.1,
+			Memory:   1024,
+			Hostname: "shell",
+			Capabilities: []string{
+				"AUDIT_WRITE",
+				"KILL",
+				"NET_BIND_SERVICE",
+			},
+			Devices: []string{
+				"null",
+				"random",
+				"full",
+				"tty",
+				"zero",
+				"urandom",
+			},
+			Namespaces: []Namespace{
+				{Type: "process"},
+				{Type: "network"},
+				{Type: "mount"},
+				{Type: "ipc"},
+				{Type: "uts"},
+			},
+			Mounts: []Mount{
+				{
+					Type:        "proc",
+					Source:      "proc",
+					Destination: "/proc",
+					Options:     "",
+				},
+				{
+					Type:        "tmpfs",
+					Source:      "tmpfs",
+					Destination: "/dev",
+					Options:     "nosuid,strictatime,mode=755,size=65536k",
+				},
+				{
+					Type:        "devpts",
+					Source:      "devpts",
+					Destination: "/dev/pts",
+					Options:     "nosuid,noexec,newinstance,ptmxmode=0666,mode=0620,gid=5",
+				},
+				{
+					Type:        "tmpfs",
+					Source:      "shm",
+					Destination: "/dev/shm",
+					Options:     "nosuid,noexec,nodev,mode=1777,size=65536k",
+				},
+				{
+					Type:        "mqueue",
+					Source:      "mqueue",
+					Destination: "/dev/mqueue",
+					Options:     "nosuid,noexec,nodev",
+				},
+				{
+					Type:        "sysfs",
+					Source:      "sysfs",
+					Destination: "/sys",
+					Options:     "nosuid,noexec,nodev",
+				},
+			},
+		}
+		data, err := json.MarshalIndent(&spec, "", "\t")
+		if err != nil {
+			fatal(err)
+		}
+		fmt.Printf("%s", data)
+	},
+}
diff --git a/spec_linux.go b/spec_linux.go
new file mode 100644
index 00000000..c74a383e
--- /dev/null
+++ b/spec_linux.go
@@ -0,0 +1,344 @@
+// +build linux
+
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"syscall"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/devices"
+)
+
+// UserMapping describes one ID mapping range. setupUserNamespace converts it
+// into a configs.IDMap (From -> ContainerID, To -> HostID, Count -> Size).
+type UserMapping struct {
+	From  int `json:"from"`
+	To    int `json:"to"`
+	Count int `json:"count"`
+}
+
+// Rlimit describes a resource limit by type together with its hard and soft
+// values.
+type Rlimit struct {
+	Type int    `json:"type"`
+	Hard uint64 `json:"hard"`
+	Soft uint64 `json:"soft"`
+}
+
+// HugepageLimit is the spec form of a hugetlb limit; createCgroupConfig copies
+// it into a configs.HugepageLimit.
+type HugepageLimit struct {
+	Pagesize string `json:"pageSize"`
+	Limit    int    `json:"limit"`
+}
+
+// IfPrioMap is the spec form of a per-interface network priority;
+// createCgroupConfig copies it into a configs.IfPrioMap.
+type IfPrioMap struct {
+	Interface string `json:"interface"`
+	Priority  int64  `json:"priority"`
+}
+
+// Resources holds the optional cgroup settings of a LinuxSpec. When present,
+// createCgroupConfig copies every field onto the generated configs.Cgroup.
+type Resources struct {
+	// Memory reservation or soft_limit (in bytes)
+	MemoryReservation int64 `json:"memoryReservation"`
+	// Total memory usage (memory + swap); set `-1' to disable swap
+	MemorySwap int64 `json:"memorySwap"`
+	// Kernel memory limit (in bytes)
+	KernelMemory int64 `json:"kernelMemory"`
+	// CPU shares (relative weight vs. other containers)
+	CpuShares int64 `json:"cpuShares"`
+	// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
+	CpuQuota int64 `json:"cpuQuota"`
+	// CPU period to be used for hardcapping (in usecs). 0 to use system default.
+	CpuPeriod int64 `json:"cpuPeriod"`
+	// CPU time allowed for realtime scheduling (in usecs).
+	// The tag must differ from CpuQuota's: encoding/json drops every field
+	// that shares a JSON name with another field at the same level.
+	CpuRtRuntime int64 `json:"cpuRtRuntime"`
+	// CPU period to be used for realtime scheduling (in usecs).
+	// The tag must differ from CpuPeriod's for the same reason.
+	CpuRtPeriod int64 `json:"cpuRtPeriod"`
+	// CPU to use
+	CpusetCpus string `json:"cpusetCpus"`
+	// MEM to use
+	CpusetMems string `json:"cpusetMems"`
+	// IO read rate limit per cgroup per device, bytes per second.
+	BlkioThrottleReadBpsDevice string `json:"blkioThrottleReadBpsDevice"`
+	// IO write rate limit per cgroup per device, bytes per second.
+	BlkioThrottleWriteBpsDevice string `json:"blkioThrottleWriteBpsDevice"`
+	// IO read rate limit per cgroup per device, IO per second.
+	BlkioThrottleReadIOpsDevice string `json:"blkioThrottleReadIopsDevice"`
+	// IO write rate limit per cgroup per device, IO per second.
+	BlkioThrottleWriteIOpsDevice string `json:"blkioThrottleWriteIopsDevice"`
+	// Specifies per cgroup weight, range is from 10 to 1000.
+	BlkioWeight int64 `json:"blkioWeight"`
+	// Weight per cgroup per device, can override BlkioWeight.
+	BlkioWeightDevice string `json:"blkioWeightDevice"`
+	// Hugetlb limit (in bytes)
+	HugetlbLimit []*HugepageLimit `json:"hugetlbLimit"`
+	// Whether to disable OOM Killer
+	DisableOOMKiller bool `json:"disableOOMKiller"`
+	// Set priority of network traffic for container
+	NetPrioIfpriomap []*IfPrioMap `json:"netPrioIfpriomap"`
+	// Set class identifier for container's network packets
+	NetClsClassid string `json:"netClsClassid"`
+}
+
+// LinuxSpec extends the embedded PortableSpec with Linux-only settings.
+type LinuxSpec struct {
+	PortableSpec
+	UserMapping      map[string]UserMapping `json:"userMapping"`
+	Rlimits          []Rlimit               `json:"rlimits"`
+	SystemProperties map[string]string      `json:"systemProperties"`
+	Resources        *Resources             `json:"resources"`
+}
+
+// namespaceMapping translates the namespace type names used in the spec into
+// libcontainer namespace types.
+var namespaceMapping = map[string]configs.NamespaceType{
+	"process": configs.NEWPID,
+	"network": configs.NEWNET,
+	"mount":   configs.NEWNS,
+	"user":    configs.NEWUSER,
+	"ipc":     configs.NEWIPC,
+	"uts":     configs.NEWUTS,
+}
+
+// loadSpec loads the specification from the provided path.
+// If the path is empty then the default path will be "container.json".
+// A missing file is reported with a dedicated error message; any other open
+// or decode error is returned unchanged.
+func loadSpec(path string) (*LinuxSpec, error) {
+	if path == "" {
+		path = "container.json"
+	}
+	f, err := os.Open(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, fmt.Errorf("JSON specification file for %s not found", path)
+		}
+		return nil, err
+	}
+	defer f.Close()
+	// Decode into an allocated value so that a document consisting of the
+	// JSON literal null yields an empty spec instead of a nil pointer that
+	// callers would dereference.
+	s := new(LinuxSpec)
+	if err := json.NewDecoder(f).Decode(s); err != nil {
+		return nil, err
+	}
+	return s, nil
+}
+
+// createLibcontainerConfig translates spec into a libcontainer configuration.
+// The rootfs path is joined onto the current working directory. Namespaces,
+// mounts, devices, user-namespace mappings and the cgroup configuration are
+// filled in, in that order. For a read-only root the sysfs mounts are made
+// read-only and a fixed set of /proc paths is masked or made read-only.
+// It returns an error for an unknown namespace type or when any step fails.
+func createLibcontainerConfig(spec *LinuxSpec) (*configs.Config, error) {
+	cwd, err := os.Getwd()
+	if err != nil {
+		return nil, err
+	}
+	config := &configs.Config{
+		Capabilities: spec.Capabilities,
+		Rootfs:       filepath.Join(cwd, spec.Root.Path),
+		Readonlyfs:   spec.Root.Readonly,
+		Hostname:     spec.Hostname,
+		Privatefs:    true,
+	}
+	for _, ns := range spec.Namespaces {
+		t, exists := namespaceMapping[ns.Type]
+		if !exists {
+			// Report the offending type name rather than the whole struct.
+			return nil, fmt.Errorf("namespace %q does not exist", ns.Type)
+		}
+		config.Namespaces.Add(t, ns.Path)
+	}
+	for _, m := range spec.Mounts {
+		config.Mounts = append(config.Mounts, createLibcontianerMount(cwd, m))
+	}
+	if err := createDevices(spec, config); err != nil {
+		return nil, err
+	}
+	if err := setupUserNamespace(spec, config); err != nil {
+		return nil, err
+	}
+	c, err := createCgroupConfig(spec, config.Devices)
+	if err != nil {
+		return nil, err
+	}
+	config.Cgroups = c
+	if config.Readonlyfs {
+		setReadonly(config)
+		config.MaskPaths = []string{
+			"/proc/kcore",
+		}
+		config.ReadonlyPaths = []string{
+			"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
+		}
+	}
+	return config, nil
+}
+
+// createLibcontianerMount converts a spec Mount into a libcontainer mount.
+// The option string is split into mount flags and fs-specific data by
+// parseMountOptions. For "bind" mounts a relative source is resolved against
+// cwd; every other source is passed through untouched.
+func createLibcontianerMount(cwd string, m Mount) *configs.Mount {
+	flags, data := parseMountOptions(m.Options)
+	source := m.Source
+	if m.Type == "bind" {
+		if !filepath.IsAbs(source) {
+			source = filepath.Join(cwd, m.Source)
+		}
+	}
+	return &configs.Mount{
+		Device:      m.Type,
+		Source:      source,
+		Destination: m.Destination,
+		Data:        data,
+		Flags:       flags,
+	}
+}
+
+// createCgroupConfig builds the cgroup configuration for spec. The cgroup is
+// named after getDefaultID and parented under the "devices" cgroup directory
+// of the current process. devices is extended with the package-level
+// allowedDevices to form the allowed device list.
+//
+// The defaults are derived from spec.Cpus and spec.Memory (multiplied by
+// 1024*1024), with MemorySwap set to -1. When spec.Resources is set, all of
+// its fields are copied over these defaults, including CpuQuota and
+// MemorySwap even when they are zero.
+func createCgroupConfig(spec *LinuxSpec, devices []*configs.Device) (*configs.Cgroup, error) {
+	myCgroupPath, err := cgroups.GetThisCgroupDir("devices")
+	if err != nil {
+		return nil, err
+	}
+	c := &configs.Cgroup{
+		Name:           getDefaultID(),
+		Parent:         myCgroupPath,
+		AllowedDevices: append(devices, allowedDevices...),
+		CpuQuota:       getCPUQuota(spec.Cpus),
+		Memory:         spec.Memory * 1024 * 1024,
+		MemorySwap:     -1,
+	}
+	if r := spec.Resources; r != nil {
+		c.MemoryReservation = r.MemoryReservation
+		c.MemorySwap = r.MemorySwap
+		c.KernelMemory = r.KernelMemory
+		c.CpuShares = r.CpuShares
+		c.CpuQuota = r.CpuQuota
+		c.CpuPeriod = r.CpuPeriod
+		c.CpuRtRuntime = r.CpuRtRuntime
+		c.CpuRtPeriod = r.CpuRtPeriod
+		c.CpusetCpus = r.CpusetCpus
+		c.CpusetMems = r.CpusetMems
+		c.BlkioThrottleReadBpsDevice = r.BlkioThrottleReadBpsDevice
+		c.BlkioThrottleWriteBpsDevice = r.BlkioThrottleWriteBpsDevice
+		c.BlkioThrottleReadIOpsDevice = r.BlkioThrottleReadIOpsDevice
+		c.BlkioThrottleWriteIOpsDevice = r.BlkioThrottleWriteIOpsDevice
+		c.BlkioWeight = r.BlkioWeight
+		c.BlkioWeightDevice = r.BlkioWeightDevice
+		for _, l := range r.HugetlbLimit {
+			c.HugetlbLimit = append(c.HugetlbLimit, &configs.HugepageLimit{
+				Pagesize: l.Pagesize,
+				Limit:    l.Limit,
+			})
+		}
+		c.OomKillDisable = r.DisableOOMKiller
+		for _, m := range r.NetPrioIfpriomap {
+			c.NetPrioIfpriomap = append(c.NetPrioIfpriomap, &configs.IfPrioMap{
+				Interface: m.Interface,
+				Priority:  m.Priority,
+			})
+		}
+		c.NetClsClassid = r.NetClsClassid
+	}
+	return c, nil
+}
+
+// createDevices resolves every name in spec.Devices under /dev through
+// devices.DeviceFromPath with "rwm" permissions and appends the result to
+// config.Devices. It stops at the first device that cannot be resolved.
+func createDevices(spec *LinuxSpec, config *configs.Config) error {
+	for _, name := range spec.Devices {
+		d, err := devices.DeviceFromPath(filepath.Join("/dev", name), "rwm")
+		if err != nil {
+			return err
+		}
+		config.Devices = append(config.Devices, d)
+	}
+	return nil
+}
+
+// setReadonly adds MS_RDONLY to the flags of every sysfs mount in config.
+func setReadonly(config *configs.Config) {
+	for _, m := range config.Mounts {
+		if m.Device == "sysfs" {
+			m.Flags |= syscall.MS_RDONLY
+		}
+	}
+}
+
+// getCPUQuota converts a CPU count into a quota by multiplying it with
+// cpuQuotaMultiplyer (declared elsewhere in the package) and truncating the
+// result to int64.
+func getCPUQuota(cpus float64) int64 {
+	return int64(cpus * cpuQuotaMultiplyer)
+}
+
+// setupUserNamespace enables the user namespace when spec.UserMapping is
+// non-empty. The "uid" and "gid" entries become the UID and GID mappings;
+// other keys are ignored. Every device node in config.Devices is then given
+// the host UID and GID reported by config.HostUID and config.HostGID.
+func setupUserNamespace(spec *LinuxSpec, config *configs.Config) error {
+	if len(spec.UserMapping) == 0 {
+		return nil
+	}
+	config.Namespaces.Add(configs.NEWUSER, "")
+	mappings := make(map[string][]configs.IDMap)
+	for k, v := range spec.UserMapping {
+		mappings[k] = append(mappings[k], configs.IDMap{
+			ContainerID: v.From,
+			HostID:      v.To,
+			Size:        v.Count,
+		})
+	}
+	config.UidMappings = mappings["uid"]
+	config.GidMappings = mappings["gid"]
+	rootUid, err := config.HostUID()
+	if err != nil {
+		return err
+	}
+	rootGid, err := config.HostGID()
+	if err != nil {
+		return err
+	}
+	for _, node := range config.Devices {
+		node.Uid = uint32(rootUid)
+		node.Gid = uint32(rootGid)
+	}
+	return nil
+}
+
+// parseMountOptions parses the string and returns the flags and any mount data that
+// it contains.
+func parseMountOptions(options string) (int, string) {
+	var (
+		flag int
+		data []string
+	)
+	// Each known option either sets (clear == false) or clears (clear == true)
+	// the given flag bits in the accumulated result.
+	flags := map[string]struct {
+		clear bool
+		flag  int
+	}{
+		"defaults":      {false, 0},
+		"ro":            {false, syscall.MS_RDONLY},
+		"rw":            {true, syscall.MS_RDONLY},
+		"suid":          {true, syscall.MS_NOSUID},
+		"nosuid":        {false, syscall.MS_NOSUID},
+		"dev":           {true, syscall.MS_NODEV},
+		"nodev":         {false, syscall.MS_NODEV},
+		"exec":          {true, syscall.MS_NOEXEC},
+		"noexec":        {false, syscall.MS_NOEXEC},
+		"sync":          {false, syscall.MS_SYNCHRONOUS},
+		"async":         {true, syscall.MS_SYNCHRONOUS},
+		"dirsync":       {false, syscall.MS_DIRSYNC},
+		"remount":       {false, syscall.MS_REMOUNT},
+		"mand":          {false, syscall.MS_MANDLOCK},
+		"nomand":        {true, syscall.MS_MANDLOCK},
+		"atime":         {true, syscall.MS_NOATIME},
+		"noatime":       {false, syscall.MS_NOATIME},
+		"diratime":      {true, syscall.MS_NODIRATIME},
+		"nodiratime":    {false, syscall.MS_NODIRATIME},
+		"bind":          {false, syscall.MS_BIND},
+		"rbind":         {false, syscall.MS_BIND | syscall.MS_REC},
+		"unbindable":    {false, syscall.MS_UNBINDABLE},
+		"runbindable":   {false, syscall.MS_UNBINDABLE | syscall.MS_REC},
+		"private":       {false, syscall.MS_PRIVATE},
+		"rprivate":      {false, syscall.MS_PRIVATE | syscall.MS_REC},
+		"shared":        {false, syscall.MS_SHARED},
+		"rshared":       {false, syscall.MS_SHARED | syscall.MS_REC},
+		"slave":         {false, syscall.MS_SLAVE},
+		"rslave":        {false, syscall.MS_SLAVE | syscall.MS_REC},
+		"relatime":      {false, syscall.MS_RELATIME},
+		"norelatime":    {true, syscall.MS_RELATIME},
+		"strictatime":   {false, syscall.MS_STRICTATIME},
+		"nostrictatime": {true, syscall.MS_STRICTATIME},
+	}
+	for _, o := range strings.Split(options, ",") {
+		// If the option does not exist in the flags table or the flag
+		// is not supported on the platform,
+		// then it is a data value for a specific fs type
+		//
+		// NOTE(review): "defaults" is in the table with flag 0, so it fails the
+		// f.flag != 0 test and is appended to the data string like an unknown
+		// option - confirm this is intended.
+		if f, exists := flags[o]; exists && f.flag != 0 {
+			if f.clear {
+				flag &= ^f.flag
+			} else {
+				flag |= f.flag
+			}
+		} else {
+			data = append(data, o)
+		}
+	}
+	return flag, strings.Join(data, ",")
+}
diff --git a/tty.go b/tty.go
new file mode 100644
index 00000000..7dacc1d1
--- /dev/null
+++ b/tty.go
@@ -0,0 +1,97 @@
+package main
+
+import (
+	"io"
+	"os"
+
+	"github.com/docker/docker/pkg/term"
+	"github.com/opencontainers/runc/libcontainer"
+)
+
+// newTty creates a new pty for use with the container. If a tty is not to be
+// created for the process, pipes are created so that the TTY of the parent
+// process is not inherited by the container.
+func newTty(create bool, p *libcontainer.Process, rootuid int) (*tty, error) {
+	if create {
+		return createTty(p, rootuid)
+	}
+	return createStdioPipes(p)
+}
+
+// createStdioPipes sets up standard pipes so that the TTY of the calling
+// process is not inherited by the container. One pipe is created per stdio
+// stream: p receives the read end for stdin and the write ends for stdout and
+// stderr, while background goroutines copy between the other ends and
+// os.Stdin, os.Stdout and os.Stderr. Only those other ends are registered
+// with the returned tty for closing.
+func createStdioPipes(p *libcontainer.Process) (*tty, error) {
+	t := &tty{}
+	r, w, err := os.Pipe()
+	if err != nil {
+		return nil, err
+	}
+	go io.Copy(w, os.Stdin)
+	t.closers = append(t.closers, w)
+	p.Stdin = r
+	if r, w, err = os.Pipe(); err != nil {
+		return nil, err
+	}
+	go io.Copy(os.Stdout, r)
+	p.Stdout = w
+	t.closers = append(t.closers, r)
+	if r, w, err = os.Pipe(); err != nil {
+		return nil, err
+	}
+	go io.Copy(os.Stderr, r)
+	p.Stderr = w
+	t.closers = append(t.closers, r)
+	return t, nil
+
+}
+
+// createTty allocates a console for p through p.NewConsole, starts background
+// copies between that console and os.Stdin/os.Stdout, and calls
+// term.SetRawTerminal on stdin. The returned tty keeps the console and the
+// state returned by SetRawTerminal so that Close can restore it. The stdio
+// fields of p are set to nil.
+func createTty(p *libcontainer.Process, rootuid int) (*tty, error) {
+	console, err := p.NewConsole(rootuid)
+	if err != nil {
+		return nil, err
+	}
+	go io.Copy(console, os.Stdin)
+	go io.Copy(os.Stdout, console)
+	state, err := term.SetRawTerminal(os.Stdin.Fd())
+	if err != nil {
+		return nil, err
+	}
+	t := &tty{
+		console: console,
+		state:   state,
+		closers: []io.Closer{
+			console,
+		},
+	}
+	p.Stderr = nil
+	p.Stdout = nil
+	p.Stdin = nil
+	return t, nil
+}
+
+// tty bundles what has to be cleaned up after the container's stdio setup:
+// the console (nil when plain pipes are used), the terminal state saved by
+// createTty (nil when no terminal was changed), and the closers to release.
+type tty struct {
+	console libcontainer.Console
+	state   *term.State
+	closers []io.Closer
+}
+
+// Close closes every registered closer, ignoring their errors, and restores
+// the saved terminal state on stdin when there is one. It always returns nil.
+func (t *tty) Close() error {
+	for _, c := range t.closers {
+		c.Close()
+	}
+	if t.state != nil {
+		term.RestoreTerminal(os.Stdin.Fd(), t.state)
+	}
+	return nil
+}
+
+// resize reads the window size of the current process's stdin with
+// term.GetWinsize and applies it to the container console with
+// term.SetWinsize. It does nothing when the tty has no console.
+func (t *tty) resize() error {
+	if t.console == nil {
+		return nil
+	}
+	ws, err := term.GetWinsize(os.Stdin.Fd())
+	if err != nil {
+		return err
+	}
+	return term.SetWinsize(t.console.Fd(), ws)
+}
diff --git a/utils.go b/utils.go
new file mode 100644
index 00000000..9ec826b3
--- /dev/null
+++ b/utils.go
@@ -0,0 +1,173 @@
+package main
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"github.com/codegangsta/cli"
+	"github.com/opencontainers/runc/libcontainer"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// wildcard is used as the Major/Minor value of the device rules below.
+// NOTE(review): presumably libcontainer treats -1 as "any device number" -
+// confirm against configs.Device.
+const wildcard = -1
+
+// allowedDevices lists the device rules that are always granted in addition
+// to the devices named in the spec.
+var allowedDevices = []*configs.Device{
+	// allow mknod for any device
+	{
+		Type:        'c',
+		Major:       wildcard,
+		Minor:       wildcard,
+		Permissions: "m",
+	},
+	{
+		Type:        'b',
+		Major:       wildcard,
+		Minor:       wildcard,
+		Permissions: "m",
+	},
+	{
+		Path:        "/dev/console",
+		Type:        'c',
+		Major:       5,
+		Minor:       1,
+		Permissions: "rwm",
+	},
+	{
+		Path:        "/dev/tty0",
+		Type:        'c',
+		Major:       4,
+		Minor:       0,
+		Permissions: "rwm",
+	},
+	{
+		Path:        "/dev/tty1",
+		Type:        'c',
+		Major:       4,
+		Minor:       1,
+		Permissions: "rwm",
+	},
+	// /dev/pts/ - pts namespaces are "coming soon"
+	{
+		Path:        "",
+		Type:        'c',
+		Major:       136,
+		Minor:       wildcard,
+		Permissions: "rwm",
+	},
+	// NOTE(review): char 5:2 is conventionally /dev/ptmx - confirm.
+	{
+		Path:        "",
+		Type:        'c',
+		Major:       5,
+		Minor:       2,
+		Permissions: "rwm",
+	},
+	// tuntap
+	{
+		Path:        "",
+		Type:        'c',
+		Major:       10,
+		Minor:       200,
+		Permissions: "rwm",
+	},
+}
+
+// container holds the container loaded by containerPreload.
+var container libcontainer.Container
+
+// containerPreload loads the container through getContainer and stores it in
+// the package-level container variable. On error the variable is left
+// untouched and the error is returned.
+func containerPreload(context *cli.Context) error {
+	c, err := getContainer(context)
+	if err != nil {
+		return err
+	}
+	container = c
+	return nil
+}
+
+// factory holds the factory loaded by factoryPreload.
+var factory libcontainer.Factory
+
+// factoryPreload loads the factory through loadFactory and stores it in the
+// package-level factory variable. On error the variable is left untouched and
+// the error is returned.
+func factoryPreload(context *cli.Context) error {
+	f, err := loadFactory(context)
+	if err != nil {
+		return err
+	}
+	factory = f
+	return nil
+}
+
+// loadFactory returns the configured factory instance for execing containers.
+func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
+	absRoot, err := filepath.Abs(context.GlobalString("root"))
+	if err != nil {
+		return nil, err
+	}
+	// Passed to libcontainer.New next to libcontainer.Cgroupfs; it copies the
+	// global "criu" flag onto the factory.
+	setCriuPath := func(lf *libcontainer.LinuxFactory) error {
+		lf.CriuPath = context.GlobalString("criu")
+		return nil
+	}
+	return libcontainer.New(absRoot, libcontainer.Cgroupfs, setCriuPath)
+}
+
+// getContainer builds a factory with loadFactory and uses it to load the
+// container named by the global "id" flag. Any error from either step is
+// returned together with a nil container.
+func getContainer(context *cli.Context) (libcontainer.Container, error) {
+	f, err := loadFactory(context)
+	if err != nil {
+		return nil, err
+	}
+	id := context.GlobalString("id")
+	c, err := f.Load(id)
+	if err != nil {
+		return nil, err
+	}
+	return c, nil
+}
+
+// fatal reports err on stderr and exits the program with status 1. Errors of
+// the libcontainer.Error type are written through their Detail method; all
+// other errors are printed as a single line.
+func fatal(err error) {
+	if detailed, ok := err.(libcontainer.Error); ok {
+		detailed.Detail(os.Stderr)
+	} else {
+		fmt.Fprintln(os.Stderr, err)
+	}
+	os.Exit(1)
+}
+
+// fatalf writes the message produced by the template t and the values v to
+// stderr, then exits the program with status 1. No newline is appended.
+func fatalf(t string, v ...interface{}) {
+	fmt.Fprintf(os.Stderr, t, v...)
+	os.Exit(1)
+}
+
+// getDefaultID returns the container id to use by default: the last path
+// element of the current working directory. It panics when the working
+// directory cannot be determined.
+func getDefaultID() string {
+	dir, err := os.Getwd()
+	if err != nil {
+		panic(err)
+	}
+	return filepath.Base(dir)
+}
+
+// getDefaultImagePath returns the "checkpoint" directory below the current
+// working directory. The context argument is not used. It panics when the
+// working directory cannot be determined.
+func getDefaultImagePath(context *cli.Context) string {
+	dir, err := os.Getwd()
+	if err != nil {
+		panic(err)
+	}
+	return filepath.Join(dir, "checkpoint")
+}
+
+// newProcess builds the libcontainer Process for p: arguments, environment,
+// user and working directory come from the spec, stdio from the current process.
+func newProcess(p *Process) *libcontainer.Process {
+	proc := new(libcontainer.Process)
+	// Command description taken from the spec.
+	proc.Args, proc.Env = p.Args, p.Env
+	proc.User, proc.Cwd = p.User, p.Cwd
+	// Start out with the stdio streams of the current process.
+	proc.Stdin, proc.Stdout, proc.Stderr = os.Stdin, os.Stdout, os.Stderr
+	return proc
+}