Import Upstream version 1.0.0~rc10

2022-07-28 16:28:18 +08:00 · 2022-07-28 16:28:18 +08:00 · 4912a38791
commit 4912a38791
823 changed files with 324476 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,6 @@
+vendor/pkg
+/runc
+/runc-*
+contrib/cmd/recvtty/recvtty
+man/man8
+release
--- a/.pullapprove.yml
+++ b/.pullapprove.yml
@ -0,0 +1,10 @@
+approve_by_comment: true
+approve_regex: ^LGTM
+reject_regex: ^Rejected
+reset_on_push: true
+author_approval: ignored
+reviewers:
+  teams:
+    - runc-maintainers
+  name: default
+  required: 2
--- a/.travis.yml
+++ b/.travis.yml
@ -0,0 +1,54 @@
+dist: bionic
+language: go
+go:
+  - 1.11.x
+  - 1.12.x
+  - tip
+
+matrix:
+  include:
+    - go: 1.12.x
+      env:
+        - RUNC_USE_SYSTEMD=1
+      script:
+        - make BUILDTAGS="${BUILDTAGS}" all
+        - sudo PATH="$PATH" make localintegration RUNC_USE_SYSTEMD=1
+    - go: 1.12.x
+      env:
+        - VIRTUALBOX_VERSION=6.0
+        - VAGRANT_VERSION=2.2.6
+        - FEDORA_VERSION=31
+      before_install:
+        - cat /proc/cpuinfo
+        - wget -q https://www.virtualbox.org/download/oracle_vbox_2016.asc -O- | sudo apt-key add - && sudo sh -c "echo deb https://download.virtualbox.org/virtualbox/debian $(lsb_release -cs) contrib >> /etc/apt/sources.list" && sudo apt-get update && sudo apt-get install -yq build-essential gcc make linux-headers-$(uname -r) virtualbox-${VIRTUALBOX_VERSION} && sudo usermod -aG vboxusers $(whoami)
+        - wget https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}_$(uname -m).deb && sudo dpkg -i vagrant_${VAGRANT_VERSION}_$(uname -m).deb
+        - vagrant init bento/fedora-${FEDORA_VERSION} && vagrant up && mkdir -p ~/.ssh && vagrant ssh-config >> ~/.ssh/config
+        - ssh default sudo dnf install -y podman
+      script:
+        - ssh default sudo podman build -t test /vagrant
+        - ssh default sudo podman run --privileged --cgroupns=private test make localunittest
+  allow_failures:
+    - go: tip
+
+go_import_path: github.com/opencontainers/runc
+
+# `make ci` uses Docker.
+sudo: required
+services:
+  - docker
+
+env:
+  global:
+    - BUILDTAGS="seccomp apparmor selinux ambient"
+
+before_install:
+  - sudo apt-get -qq update
+  - sudo apt-get install -y libseccomp-dev
+  - go get -u golang.org/x/lint/golint
+  - go get -u github.com/vbatts/git-validation
+  - env | grep TRAVIS_
+
+script:
+  - git-validation -run DCO,short-subject -v
+  - make BUILDTAGS="${BUILDTAGS}"
+  - make BUILDTAGS="${BUILDTAGS}" clean ci cross
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,124 @@
+## Contribution Guidelines
+
+### Security issues
+
+If you are reporting a security issue, do not create an issue or file a pull
+request on GitHub. Instead, disclose the issue responsibly by sending an email
+to security@opencontainers.org (which is inhabited only by the maintainers of
+the various OCI projects).
+
+### Pull requests are always welcome
+
+We are always thrilled to receive pull requests, and do our best to
+process them as fast as possible. Not sure if that typo is worth a pull
+request? Do it! We will appreciate it.
+
+If your pull request is not accepted on the first try, don't be
+discouraged! If there's a problem with the implementation, hopefully you
+received feedback on what to improve.
+
+We're trying very hard to keep runc lean and focused. We don't want it
+to do everything for everybody. This means that we might decide against
+incorporating a new feature. However, there might be a way to implement
+that feature *on top of* runc.
+
+
+### Conventions
+
+Fork the repo and make changes on your fork in a feature branch:
+
+- If it's a bugfix branch, name it XXX-something where XXX is the number of the
+  issue
+- If it's a feature branch, create an enhancement issue to announce your
+  intentions, and name it XXX-something where XXX is the number of the issue.
+
+Submit unit tests for your changes.  Go has a great test framework built in; use
+it! Take a look at existing tests for inspiration. Run the full test suite on
+your branch before submitting a pull request.
+
+Update the documentation when creating or modifying features. Test
+your documentation changes for clarity, concision, and correctness, as
+well as a clean documentation build. See ``docs/README.md`` for more
+information on building the docs and how docs get released.
+
+Write clean code. Universally formatted code promotes ease of writing, reading,
+and maintenance. Always run `gofmt -s -w file.go` on each changed file before
+committing your changes. Most editors have plugins that do this automatically.
+
+Pull request descriptions should be as clear as possible and include a
+reference to all the issues that they address.
+
+Pull requests must not contain commits from other users or branches.
+
+Commit messages must start with a capitalized and short summary (max. 50
+chars) written in the imperative, followed by an optional, more detailed
+explanatory text which is separated from the summary by an empty line.
+
+Code review comments may be added to your pull request. Discuss, then make the
+suggested modifications and push additional commits to your feature branch. Be
+sure to post a comment after pushing. The new commits will show up in the pull
+request automatically, but the reviewers will not be notified unless you
+comment.
+
+Before the pull request is merged, make sure that you squash your commits into
+logical units of work using `git rebase -i` and `git push -f`. After every
+commit the test suite should be passing. Include documentation changes in the
+same commit so that a revert would remove all traces of the feature or fix.
+
+Commits that fix or close an issue should include a reference like `Closes #XXX`
+or `Fixes #XXX`, which will automatically close the issue when merged.
+
+### Sign your work
+
+The sign-off is a simple line at the end of the explanation for the
+patch, which certifies that you wrote it or otherwise have the right to
+pass it on as an open-source patch.  The rules are pretty simple: if you
+can certify the below (from
+[developercertificate.org](http://developercertificate.org/)):
+
+```
+Developer Certificate of Origin
+Version 1.1
+
+Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+660 York Street, Suite 102,
+San Francisco, CA 94110 USA
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+
+Developer's Certificate of Origin 1.1
+
+By making a contribution to this project, I certify that:
+
+(a) The contribution was created in whole or in part by me and I
+    have the right to submit it under the open source license
+    indicated in the file; or
+
+(b) The contribution is based upon previous work that, to the best
+    of my knowledge, is covered under an appropriate open source
+    license and I have the right under that license to submit that
+    work with modifications, whether created in whole or in part
+    by me, under the same open source license (unless I am
+    permitted to submit under a different license), as indicated
+    in the file; or
+
+(c) The contribution was provided directly to me by some other
+    person who certified (a), (b) or (c) and I have not modified
+    it.
+
+(d) I understand and agree that this project and the contribution
+    are public and that a record of the contribution (including all
+    personal information I submit with it, including my sign-off) is
+    maintained indefinitely and may be redistributed consistent with
+    this project or the open source license(s) involved.
+```
+
+then you just add a line to every git commit message:
+
+    Signed-off-by: Joe Smith <joe@gmail.com>
+
+using your real name (sorry, no pseudonyms or anonymous contributions.)
+
+You can add the sign off when creating the git commit via `git commit -s`.
--- a/Dockerfile
+++ b/Dockerfile
@ -0,0 +1,66 @@
+FROM golang:1.12-stretch
+
+RUN dpkg --add-architecture armel \
+    && dpkg --add-architecture armhf \
+    && dpkg --add-architecture arm64 \
+    && dpkg --add-architecture ppc64el \
+    && apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    sudo \
+    gawk \
+    iptables \
+    jq \
+    pkg-config \
+    libaio-dev \
+    libcap-dev \
+    libprotobuf-dev \
+    libprotobuf-c0-dev \
+    libnl-3-dev \
+    libnet-dev \
+    libseccomp2 \
+    libseccomp-dev \
+    protobuf-c-compiler \
+    protobuf-compiler \
+    python-minimal \
+    uidmap \
+    kmod \
+    crossbuild-essential-armel crossbuild-essential-armhf crossbuild-essential-arm64 crossbuild-essential-ppc64el \
+    libseccomp-dev:armel libseccomp-dev:armhf libseccomp-dev:arm64 libseccomp-dev:ppc64el \
+    --no-install-recommends \
+    && apt-get clean
+
+# Add a dummy user for the rootless integration tests. While runC does
+# not require an entry in /etc/passwd to operate, one of the tests uses
+# `git clone` -- and `git clone` does not allow you to clone a
+# repository if the current uid does not have an entry in /etc/passwd.
+RUN useradd -u1000 -m -d/home/rootless -s/bin/bash rootless
+
+# install bats
+RUN cd /tmp \
+    && git clone https://github.com/sstephenson/bats.git \
+    && cd bats \
+    && git reset --hard 03608115df2071fff4eaaff1605768c275e5f81f \
+    && ./install.sh /usr/local \
+    && rm -rf /tmp/bats
+
+# install criu
+ENV CRIU_VERSION v3.12
+RUN mkdir -p /usr/src/criu \
+    && curl -sSL https://github.com/checkpoint-restore/criu/archive/${CRIU_VERSION}.tar.gz | tar -v -C /usr/src/criu/ -xz --strip-components=1 \
+    && cd /usr/src/criu \
+    && make install-criu \
+    && rm -rf /usr/src/criu
+
+# setup a playground for us to spawn containers in
+ENV ROOTFS /busybox
+RUN mkdir -p ${ROOTFS}
+
+COPY script/tmpmount /
+WORKDIR /go/src/github.com/opencontainers/runc
+ENTRYPOINT ["/tmpmount"]
+
+ADD . /go/src/github.com/opencontainers/runc
+
+RUN . tests/integration/multi-arch.bash \
+    && curl -o- -sSL `get_busybox` | tar xfJC - ${ROOTFS}
--- a/LICENSE
+++ b/LICENSE
@ -0,0 +1,191 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   Copyright 2014 Docker, Inc.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/MAINTAINERS
+++ b/MAINTAINERS
@ -0,0 +1,5 @@
+Michael Crosby <michael@docker.com> (@crosbymichael)
+Mrunal Patel <mpatel@redhat.com> (@mrunalp)
+Daniel, Dao Quang Minh <dqminh89@gmail.com> (@dqminh)
+Qiang Huang <h.huangqiang@huawei.com> (@hqhq)
+Aleksa Sarai <asarai@suse.de> (@cyphar)
--- a/MAINTAINERS_GUIDE.md
+++ b/MAINTAINERS_GUIDE.md
@ -0,0 +1,120 @@
+## Introduction
+
+Dear maintainer. Thank you for investing the time and energy to help
+make runc as useful as possible. Maintaining a project is difficult,
+sometimes unrewarding work.  Sure, you will get to contribute cool
+features to the project. But most of your time will be spent reviewing,
+cleaning up, documenting, answering questions, justifying design
+decisions - while everyone has all the fun! But remember - the quality
+of the maintainers' work is what distinguishes the good projects from the
+great.  So please be proud of your work, even the unglamorous parts,
+and encourage a culture of appreciation and respect for *every* aspect
+of improving the project - not just the hot new features.
+
+This document is a manual for maintainers old and new. It explains what
+is expected of maintainers, how they should work, and what tools are
+available to them.
+
+This is a living document - if you see something out of date or missing,
+speak up!
+
+## What are a maintainer's responsibilities?
+
+It is every maintainer's responsibility to:
+
+* 1) Expose a clear roadmap for improving their component.
+* 2) Deliver prompt feedback and decisions on pull requests.
+* 3) Be available to anyone with questions, bug reports, criticism etc.
+  on their component. This includes IRC and GitHub issues and pull requests.
+* 4) Make sure their component respects the philosophy, design and
+  roadmap of the project.
+
+## How are decisions made?
+
+Short answer: with pull requests to the runc repository.
+
+runc is an open-source project with an open design philosophy. This
+means that the repository is the source of truth for EVERY aspect of the
+project, including its philosophy, design, roadmap and APIs. *If it's
+part of the project, it's in the repo. If it's in the repo, it's part of
+the project.*
+
+As a result, all decisions can be expressed as changes to the
+repository. An implementation change is a change to the source code. An
+API change is a change to the API specification. A philosophy change is
+a change to the philosophy manifesto. And so on.
+
+All decisions affecting runc, big and small, follow the same 3 steps:
+
+* Step 1: Open a pull request. Anyone can do this.
+
+* Step 2: Discuss the pull request. Anyone can do this.
+
+* Step 3: Accept (`LGTM`) or refuse a pull request. The relevant maintainers do 
+this (see below "Who decides what?")
+
+*I'm a maintainer, should I make pull requests too?*
+
+Yes. Nobody should ever push to master directly. All changes should be
+made through a pull request.
+
+## Who decides what?
+
+All decisions are pull requests, and the relevant maintainers make
+decisions by accepting or refusing the pull request. Review and acceptance
+by anyone is denoted by adding a comment in the pull request: `LGTM`.
+However, only currently listed `MAINTAINERS` are counted towards the required
+two LGTMs.
+
+Overall the maintainer system works because of mutual respect across the
+maintainers of the project.  The maintainers trust one another to make decisions
+in the best interests of the project.  Sometimes maintainers can disagree and
+this is part of a healthy project to represent the points of view of various people.
+In the case where maintainers cannot find agreement on a specific change the
+role of a Chief Maintainer comes into play.
+
+The Chief Maintainer for the project is responsible for overall architecture
+of the project to maintain conceptual integrity.  Large decisions and
+architecture changes should be reviewed by the chief maintainer.
+The current chief maintainer for the project is Michael Crosby (@crosbymichael).
+
+Even though the maintainer system is built on trust, if there is a conflict
+with the chief maintainer on a decision, their decision can be challenged
+and brought to the technical oversight board if two-thirds of the
+maintainers vote for an appeal. It is expected that this would be a
+very exceptional event.
+
+
+### How are maintainers added?
+
+The best maintainers have a vested interest in the project.  Maintainers
+are first and foremost contributors that have shown they are committed to
+the long term success of the project.  Contributors wanting to become
+maintainers are expected to be deeply involved in contributing code,
+pull request review, and triage of issues in the project for more than two months.
+
+Just contributing does not make you a maintainer, it is about building trust
+with the current maintainers of the project and being a person that they can
+depend on and trust to make decisions in the best interest of the project.  The
+final vote to add a new maintainer should be approved by over 66% of the current
+maintainers with the chief maintainer having veto power.  In case of a veto,
+conflict resolution rules expressed above apply.  The voting period is
+five business days on the Pull Request to add the new maintainer.
+
+
+### What is expected of maintainers?
+
+Part of a healthy project is to have active maintainers to support the community
+in contributions and perform tasks to keep the project running.  Maintainers are
+expected to be able to respond in a timely manner if their help is required on specific
+issues where they are pinged.  Being a maintainer is a time-consuming commitment and should
+not be taken lightly.
+
+When a maintainer is unable to perform the required duties they can be removed with
+a vote by 66% of the current maintainers with the chief maintainer having veto power.
+The voting period is ten business days.  Issues related to a maintainer's performance should
+be discussed with them among the other maintainers so that they are not surprised by
+a pull request removing them.
+
+
+
--- a/Makefile
+++ b/Makefile
@ -0,0 +1,133 @@
+.PHONY: all shell dbuild man release \
+	    localtest localunittest localintegration \
+	    test unittest integration \
+	    cross localcross
+
+CONTAINER_ENGINE := docker
+GO := go
+
+SOURCES := $(shell find . 2>&1 | grep -E '.*\.(c|h|go)$$')
+PREFIX := $(DESTDIR)/usr/local
+BINDIR := $(PREFIX)/sbin
+GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null)
+GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
+RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
+PROJECT := github.com/opencontainers/runc
+BUILDTAGS ?= seccomp
+COMMIT_NO := $(shell git rev-parse HEAD 2> /dev/null || true)
+COMMIT ?= $(if $(shell git status --porcelain --untracked-files=no),"${COMMIT_NO}-dirty","${COMMIT_NO}")
+
+MAN_DIR := $(CURDIR)/man/man8
+MAN_PAGES = $(shell ls $(MAN_DIR)/*.8)
+MAN_PAGES_BASE = $(notdir $(MAN_PAGES))
+MAN_INSTALL_PATH := ${PREFIX}/share/man/man8/
+
+RELEASE_DIR := $(CURDIR)/release
+
+VERSION := ${shell cat ./VERSION}
+
+SHELL := $(shell command -v bash 2>/dev/null)
+
+.DEFAULT: runc
+
+runc: $(SOURCES)
+	$(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc .
+
+all: runc recvtty
+
+recvtty: contrib/cmd/recvtty/recvtty
+
+contrib/cmd/recvtty/recvtty: $(SOURCES)
+	$(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty
+
+static: $(SOURCES)
+	CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o runc .
+	CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty
+
+release:
+	script/release.sh -r release/$(VERSION) -v $(VERSION)
+
+dbuild: runcimage
+	$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} --rm -v $(CURDIR):/go/src/$(PROJECT) --privileged $(RUNC_IMAGE) make clean all
+
+lint:
+	$(GO) vet $(allpackages)
+	$(GO) fmt $(allpackages)
+
+man:
+	man/md2man-all.sh
+
+runcimage:
+	$(CONTAINER_ENGINE) build ${CONTAINER_ENGINE_BUILD_FLAGS} -t $(RUNC_IMAGE) .
+
+test:
+	make unittest integration rootlessintegration
+
+localtest:
+	make localunittest localintegration localrootlessintegration
+
+unittest: runcimage
+	$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localunittest TESTFLAGS=${TESTFLAGS}
+
+localunittest: all
+	$(GO) test -timeout 3m -tags "$(BUILDTAGS)" ${TESTFLAGS} -v $(allpackages)
+
+integration: runcimage
+	$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localintegration TESTPATH=${TESTPATH}
+
+localintegration: all
+	bats -t tests/integration${TESTPATH}
+
+rootlessintegration: runcimage
+	$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localrootlessintegration
+
+localrootlessintegration: all
+	tests/rootless.sh
+
+shell: runcimage
+	$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -ti --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) bash
+
+install:
+	install -D -m0755 runc $(BINDIR)/runc
+
+install-bash:
+	install -D -m0644 contrib/completions/bash/runc $(PREFIX)/share/bash-completion/completions/runc
+
+install-man:
+	install -d -m 755 $(MAN_INSTALL_PATH)
+	install -m 644 $(MAN_PAGES) $(MAN_INSTALL_PATH)
+
+uninstall:
+	rm -f $(BINDIR)/runc
+
+uninstall-bash:
+	rm -f $(PREFIX)/share/bash-completion/completions/runc
+
+uninstall-man:
+	rm -f $(addprefix $(MAN_INSTALL_PATH),$(MAN_PAGES_BASE))
+
+clean:
+	rm -f runc runc-*
+	rm -f contrib/cmd/recvtty/recvtty
+	rm -rf $(RELEASE_DIR)
+	rm -rf $(MAN_DIR)
+
+validate:
+	script/validate-gofmt
+	script/validate-c
+	$(GO) vet $(allpackages)
+
+ci: validate test release
+
+cross: runcimage
+	$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -e BUILDTAGS="$(BUILDTAGS)" --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localcross
+
+localcross:
+	CGO_ENABLED=1 GOARCH=arm GOARM=6 CC=arm-linux-gnueabi-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armel .
+	CGO_ENABLED=1 GOARCH=arm GOARM=7 CC=arm-linux-gnueabihf-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armhf .
+	CGO_ENABLED=1 GOARCH=arm64 CC=aarch64-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-arm64 .
+	CGO_ENABLED=1 GOARCH=ppc64le CC=powerpc64le-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-ppc64le .
+
+# memoize allpackages, so that it's executed only once and only if used
+_allpackages = $(shell $(GO) list ./... | grep -v vendor)
+allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages)
--- a/NOTICE
+++ b/NOTICE
@ -0,0 +1,17 @@
+runc
+
+Copyright 2012-2015 Docker, Inc.
+
+This product includes software developed at Docker, Inc. (http://www.docker.com).
+
+The following is courtesy of our legal counsel:
+
+
+Use and transfer of Docker may be subject to certain restrictions by the
+United States and other governments.  
+It is your responsibility to ensure that your use and/or transfer does not
+violate applicable laws. 
+
+For more information, please see http://www.bis.doc.gov
+
+See also http://www.apache.org/dev/crypto.html and/or seek legal counsel.
--- a/PRINCIPLES.md
+++ b/PRINCIPLES.md
@ -0,0 +1,19 @@
+# runc principles
+
+In the design and development of runc and libcontainer we try to follow these principles:
+
+(Work in progress)
+
+* Don't try to replace every tool. Instead, be an ingredient to improve them.
+* Less code is better.
+* Fewer components are better. Do you really need to add one more class?
+* 50 lines of straightforward, readable code is better than 10 lines of magic that nobody can understand.
+* Don't do later what you can do now. "//TODO: refactor" is not acceptable in new code.
+* When hesitating between two options, choose the one that is easier to reverse.
+* "No" is temporary; "Yes" is forever. If you're not sure about a new feature, say no. You can change your mind later.
+* Containers must be portable to the greatest possible number of machines. Be suspicious of any change which makes machines less interchangeable.
+* The fewer moving parts in a container, the better.
+* Don't merge it unless you document it.
+* Don't document it unless you can keep it up-to-date.
+* Don't merge it unless you test it!
+* Everyone's problem is slightly different. Focus on the part that is the same for everyone, and solve that.
--- a/README.md
+++ b/README.md
@ -0,0 +1,280 @@
+# runc
+
+[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc)
+[![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
+[![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
+
+## Introduction
+
+`runc` is a CLI tool for spawning and running containers according to the OCI specification.
+
+## Releases
+
+`runc` depends on and tracks the [runtime-spec](https://github.com/opencontainers/runtime-spec) repository.
+We will try to make sure that `runc` and the OCI specification major versions stay in lockstep.
+This means that `runc` 1.0.0 should implement the 1.0 version of the specification.
+
+You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.
+
+Currently, the following features are not considered to be production-ready:
+
+* Support for cgroup v2
+
+## Security
+
+The reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/).
+
+## Building
+
+`runc` currently supports the Linux platform on various architectures.
+It must be built with Go version 1.6 or higher in order for some features to function properly.
+
+In order to enable seccomp support you will need to install `libseccomp` on your platform.
+> e.g. `libseccomp-devel` for CentOS, or `libseccomp-dev` for Ubuntu
+
+Otherwise, if you do not want to build `runc` with seccomp support you can add `BUILDTAGS=""` when running make.
+
+```bash
+# create a 'github.com/opencontainers' in your GOPATH/src
+cd github.com/opencontainers
+git clone https://github.com/opencontainers/runc
+cd runc
+
+make
+sudo make install
+```
+
+You can also use `go get` to install to your `GOPATH`, assuming that you have a `github.com` parent folder already created under `src`:
+
+```bash
+go get github.com/opencontainers/runc
+cd $GOPATH/src/github.com/opencontainers/runc
+make
+sudo make install
+```
+
+`runc` will be installed to `/usr/local/sbin/runc` on your system.
+
+
+#### Build Tags
+
+`runc` supports optional build tags for compiling support of various features.
+To add build tags to the make option the `BUILDTAGS` variable must be set.
+
+```bash
+make BUILDTAGS='seccomp apparmor'
+```
+
+| Build Tag | Feature                            | Dependency  |
+|-----------|------------------------------------|-------------|
+| seccomp   | Syscall filtering                  | libseccomp  |
+| selinux   | selinux process and mount labeling | <none>      |
+| apparmor  | apparmor profile support           | <none>      |
+| ambient   | ambient capability support         | kernel 4.3  |
+| nokmem    | disable kernel memory accounting   | <none>      |
+
+
+### Running the test suite
+
+`runc` currently supports running its test suite via Docker.
+To run the suite just type `make test`.
+
+```bash
+make test
+```
+
+There are additional make targets for running the tests outside of a container but this is not recommended as the tests are written with the expectation that they can write and remove anywhere.
+
+You can run a specific test case by setting the `TESTFLAGS` variable.
+
+```bash
+# make test TESTFLAGS="-run=SomeTestFunction"
+```
+
+You can run a specific integration test by setting the `TESTPATH` variable.
+
+```bash
+# make test TESTPATH="/checkpoint.bats"
+```
+
+You can run a test in your proxy environment by setting `DOCKER_BUILD_PROXY` and `DOCKER_RUN_PROXY` variables.
+
+```bash
+# make test DOCKER_BUILD_PROXY="--build-arg HTTP_PROXY=http://yourproxy/" DOCKER_RUN_PROXY="-e HTTP_PROXY=http://yourproxy/"
+```
+
+### Dependencies Management
+
+`runc` uses [vndr](https://github.com/LK4D4/vndr) for dependency management.
+Please refer to the [vndr](https://github.com/LK4D4/vndr) documentation for how to add
+or update dependencies.
+
+## Using runc
+
+### Creating an OCI Bundle
+
+In order to use runc you must have your container in the format of an OCI bundle.
+If you have Docker installed you can use its `export` method to acquire a root filesystem from an existing Docker container.
+
+```bash
+# create the top most bundle directory
+mkdir /mycontainer
+cd /mycontainer
+
+# create the rootfs directory
+mkdir rootfs
+
+# export busybox via Docker into the rootfs directory
+docker export $(docker create busybox) | tar -C rootfs -xvf -
+```
+
+After a root filesystem is populated you just generate a spec in the format of a `config.json` file inside your bundle.
+`runc` provides a `spec` command to generate a base template spec that you are then able to edit.
+To find features and documentation for fields in the spec please refer to the [specs](https://github.com/opencontainers/runtime-spec) repository.
+
+```bash
+runc spec
+```
+
+### Running Containers
+
+Assuming you have an OCI bundle from the previous step you can execute the container in two different ways.
+
+The first way is to use the convenience command `run` that will handle creating, starting, and deleting the container after it exits.
+
+```bash
+# run as root
+cd /mycontainer
+runc run mycontainerid
+```
+
+If you used the unmodified `runc spec` template this should give you a `sh` session inside the container.
+
+The second way to start a container is to use the spec's lifecycle operations.
+This gives you more power over how the container is created and managed while it is running.
+This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here.
+The `process` field in your `config.json` should look like the example below, with `"terminal": false` and `"args": ["sleep", "5"]`.
+
+
+```json
+        "process": {
+                "terminal": false,
+                "user": {
+                        "uid": 0,
+                        "gid": 0
+                },
+                "args": [
+                        "sleep", "5"
+                ],
+                "env": [
+                        "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+                        "TERM=xterm"
+                ],
+                "cwd": "/",
+                "capabilities": {
+                        "bounding": [
+                                "CAP_AUDIT_WRITE",
+                                "CAP_KILL",
+                                "CAP_NET_BIND_SERVICE"
+                        ],
+                        "effective": [
+                                "CAP_AUDIT_WRITE",
+                                "CAP_KILL",
+                                "CAP_NET_BIND_SERVICE"
+                        ],
+                        "inheritable": [
+                                "CAP_AUDIT_WRITE",
+                                "CAP_KILL",
+                                "CAP_NET_BIND_SERVICE"
+                        ],
+                        "permitted": [
+                                "CAP_AUDIT_WRITE",
+                                "CAP_KILL",
+                                "CAP_NET_BIND_SERVICE"
+                        ],
+                        "ambient": [
+                                "CAP_AUDIT_WRITE",
+                                "CAP_KILL",
+                                "CAP_NET_BIND_SERVICE"
+                        ]
+                },
+                "rlimits": [
+                        {
+                                "type": "RLIMIT_NOFILE",
+                                "hard": 1024,
+                                "soft": 1024
+                        }
+                ],
+                "noNewPrivileges": true
+        },
+```
+
+Now we can go through the lifecycle operations in your shell.
+
+
+```bash
+# run as root
+cd /mycontainer
+runc create mycontainerid
+
+# verify that the container has been created and is in the "created" state
+runc list
+
+# start the process inside the container
+runc start mycontainerid
+
+# after 5 seconds view that the container has exited and is now in the stopped state
+runc list
+
+# now delete the container
+runc delete mycontainerid
+```
+
+This allows higher level systems to augment the container's creation logic with setup of various settings after the container is created and/or before it is deleted. For example, the container's network stack is commonly set up after `create` but before `start`.
+
+#### Rootless containers
+`runc` has the ability to run containers without root privileges. This is called `rootless`. You need to pass some parameters to `runc` in order to run rootless containers. See below and compare with the previous version.
+
+**Note:** In order to use this feature, "User Namespaces" must be compiled and enabled in your kernel. There are various ways to do this depending on your distribution:
+- Confirm `CONFIG_USER_NS=y` is set in your kernel configuration (normally found in `/proc/config.gz`)
+- Arch/Debian: `echo 1 > /proc/sys/kernel/unprivileged_userns_clone`
+- RHEL/CentOS 7: `echo 28633 > /proc/sys/user/max_user_namespaces`
+
+Run the following commands as an ordinary user:
+```bash
+# Same as the first example
+mkdir ~/mycontainer
+cd ~/mycontainer
+mkdir rootfs
+docker export $(docker create busybox) | tar -C rootfs -xvf -
+
+# The --rootless parameter instructs runc spec to generate a configuration for a rootless container, which will allow you to run the container as a non-root user.
+runc spec --rootless
+
+# The --root parameter tells runc where to store the container state. It must be writable by the user.
+runc --root /tmp/runc run mycontainerid
+```
+
+#### Supervisors
+
+`runc` can be used with process supervisors and init systems to ensure that containers are restarted when they exit.
+An example systemd unit file looks something like this.
+
+```systemd
+[Unit]
+Description=Start My Container
+
+[Service]
+Type=forking
+ExecStart=/usr/local/sbin/runc run -d --pid-file /run/mycontainerid.pid mycontainerid
+ExecStopPost=/usr/local/sbin/runc delete mycontainerid
+WorkingDirectory=/mycontainer
+PIDFile=/run/mycontainerid.pid
+
+[Install]
+WantedBy=multi-user.target
+```
+
+## License
+
+The code and docs are released under the [Apache 2.0 license](LICENSE).
--- a/SECURITY.md
+++ b/SECURITY.md
@ -0,0 +1,3 @@
+# Security
+
+The reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/).
--- a/1
+++ b/1
@ -0,0 +1 @@
+1.0.0-rc10
--- a/checkpoint.go
+++ b/checkpoint.go
@ -0,0 +1,137 @@
+// +build linux
+
+package main
+
+import (
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer"
+	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/sirupsen/logrus"
+	"github.com/urfave/cli"
+
+	"golang.org/x/sys/unix"
+)
+
+// checkpointCommand implements "runc checkpoint". Its action validates the
+// argument count and the container state, translates the command-line flags
+// into CRIU options and then asks the container to checkpoint itself.
+var checkpointCommand = cli.Command{
+	Name:  "checkpoint",
+	Usage: "checkpoint a running container",
+	ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container to be
+checkpointed.`,
+	Description: `The checkpoint command saves the state of the container instance.`,
+	Flags: []cli.Flag{
+		cli.StringFlag{Name: "image-path", Value: "", Usage: "path for saving criu image files"},
+		cli.StringFlag{Name: "work-path", Value: "", Usage: "path for saving work files and logs"},
+		cli.StringFlag{Name: "parent-path", Value: "", Usage: "path for previous criu image files in pre-dump"},
+		cli.BoolFlag{Name: "leave-running", Usage: "leave the process running after checkpointing"},
+		cli.BoolFlag{Name: "tcp-established", Usage: "allow open tcp connections"},
+		cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external unix sockets"},
+		cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"},
+		cli.BoolFlag{Name: "lazy-pages", Usage: "use userfaultfd to lazily restore memory pages"},
+		cli.StringFlag{Name: "status-fd", Value: "", Usage: "criu writes \\0 to this FD once lazy-pages is ready"},
+		cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"},
+		cli.BoolFlag{Name: "file-locks", Usage: "handle file locks, for safety"},
+		cli.BoolFlag{Name: "pre-dump", Usage: "dump container's memory information only, leave the container running after this"},
+		cli.StringFlag{Name: "manage-cgroups-mode", Value: "", Usage: "cgroups mode: 'soft' (default), 'full' and 'strict'"},
+		cli.StringSliceFlag{Name: "empty-ns", Usage: "create a namespace, but don't restore its properties"},
+		cli.BoolFlag{Name: "auto-dedup", Usage: "enable auto deduplication of memory images"},
+	},
+	Action: func(context *cli.Context) error {
+		if err := checkArgs(context, 1, exactArgs); err != nil {
+			return err
+		}
+		// XXX: Currently this is untested with rootless containers.
+		if os.Geteuid() != 0 || system.RunningInUserNS() {
+			logrus.Warn("runc checkpoint is untested with rootless containers")
+		}
+
+		container, err := getContainer(context)
+		if err != nil {
+			return err
+		}
+		status, err := container.Status()
+		if err != nil {
+			return err
+		}
+		if status == libcontainer.Created || status == libcontainer.Stopped {
+			fatalf("Container cannot be checkpointed in %s state", status.String())
+		}
+		// Only tear the container down when the checkpoint is expected to
+		// stop it. With --leave-running or --pre-dump the container keeps
+		// running, so destroying it afterwards would be wrong.
+		if !(context.Bool("leave-running") || context.Bool("pre-dump")) {
+			defer destroy(container)
+		}
+		options := criuOptions(context)
+		// these are the mandatory criu options for a container
+		setPageServer(context, options)
+		setManageCgroupsMode(context, options)
+		if err := setEmptyNsMask(context, options); err != nil {
+			return err
+		}
+		return container.Checkpoint(options)
+	},
+}
+
+// getCheckpointImagePath returns the value of the --image-path flag, falling
+// back to getDefaultImagePath when the flag is empty.
+func getCheckpointImagePath(context *cli.Context) string {
+	if explicit := context.String("image-path"); explicit != "" {
+		return explicit
+	}
+	return getDefaultImagePath(context)
+}
+
+// setPageServer parses the optional --page-server ADDRESS:PORT flag and, when
+// it is set, stores the address and port in options.PageServer.
+//
+// A malformed value, a non-numeric port or a port outside 0-65535 aborts the
+// program through fatal. The range check matters because the port is stored
+// as an int32: without it an oversized number would silently wrap around to a
+// different port.
+func setPageServer(context *cli.Context, options *libcontainer.CriuOpts) {
+	// xxx following criu opts are optional
+	// The dump image can be sent to a criu page server
+	psOpt := context.String("page-server")
+	if psOpt == "" {
+		return
+	}
+	addressPort := strings.Split(psOpt, ":")
+	if len(addressPort) != 2 {
+		fatal(fmt.Errorf("Use --page-server ADDRESS:PORT to specify page server"))
+	}
+	portInt, err := strconv.Atoi(addressPort[1])
+	if err != nil || portInt < 0 || portInt > 65535 {
+		fatal(fmt.Errorf("Invalid port number"))
+	}
+	options.PageServer = libcontainer.CriuPageServerInfo{
+		Address: addressPort[0],
+		Port:    int32(portInt),
+	}
+}
+
+// setManageCgroupsMode maps the --manage-cgroups-mode flag ("soft", "full" or
+// "strict") onto options.ManageCgroupsMode. An empty flag leaves the option
+// untouched; any other value aborts the program through fatal.
+func setManageCgroupsMode(context *cli.Context, options *libcontainer.CriuOpts) {
+	switch mode := context.String("manage-cgroups-mode"); mode {
+	case "":
+		// Flag not given: keep whatever mode options already carries.
+	case "soft":
+		options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_SOFT
+	case "full":
+		options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_FULL
+	case "strict":
+		options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_STRICT
+	default:
+		fatal(fmt.Errorf("Invalid manage cgroups mode"))
+	}
+}
+
+// namespaceMapping lists the namespace types accepted by the --empty-ns flag
+// together with the clone flag that setEmptyNsMask ORs into the mask for each
+// of them. Only the network namespace is listed here.
+var namespaceMapping = map[specs.LinuxNamespaceType]int{
+	specs.NetworkNamespace: unix.CLONE_NEWNET,
+}
+
+// setEmptyNsMask computes options.EmptyNs from the --empty-ns flag values.
+//
+// The mask always contains CLONE_NEWNET; every requested namespace is looked
+// up in namespaceMapping and its flag is ORed in. An unknown namespace name
+// yields an error and leaves options unmodified.
+func setEmptyNsMask(context *cli.Context, options *libcontainer.CriuOpts) error {
+	// Runc doesn't manage network devices and their configuration, so the
+	// network namespace is part of the mask unconditionally.
+	mask := unix.CLONE_NEWNET
+
+	requested := context.StringSlice("empty-ns")
+	for i := range requested {
+		name := requested[i]
+		flag, known := namespaceMapping[specs.LinuxNamespaceType(name)]
+		if known {
+			mask |= flag
+			continue
+		}
+		return fmt.Errorf("namespace %q is not supported", name)
+	}
+
+	options.EmptyNs = uint32(mask)
+	return nil
+}
--- a/contrib/cmd/recvtty/recvtty.go
+++ b/contrib/cmd/recvtty/recvtty.go
@ -0,0 +1,238 @@
+/*
+ * Copyright 2016 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package main
+
+import (
+	"fmt"
+	"io"
+	"io/ioutil"
+	"net"
+	"os"
+	"strings"
+
+	"github.com/containerd/console"
+	"github.com/opencontainers/runc/libcontainer/utils"
+	"github.com/urfave/cli"
+)
+
+// version will be populated by the Makefile, read from
+// VERSION file of the source code.
+var version = ""
+
+// gitCommit will be the hash that the binary was built from
+// and will be populated by the Makefile
+var gitCommit = ""
+
+const (
+	usage = `Open Container Initiative contrib/cmd/recvtty
+
+recvtty is a reference implementation of a consumer of runC's --console-socket
+API. It has two main modes of operation:
+
+  * single: Only permit one terminal to be sent to the socket, which is
+	then hooked up to the stdio of the recvtty process. This is useful
+	for rudimentary shell management of a container.
+
+  * null: Permit as many terminals to be sent to the socket, but they
+	are read to /dev/null. This is used for testing, and imitates the
+	old runC API's --console=/dev/pts/ptmx hack which would allow for a
+	similar trick. This is probably not what you want to use, unless
+	you're doing something like our bats integration tests.
+
+To use recvtty, just specify a socket path at which you want to receive
+terminals:
+
+    $ recvtty [--mode <single|null>] socket.sock
+`
+)
+
+// bail reports err on standard error with a "[recvtty] fatal error" prefix
+// and terminates the process with exit status 1.
+func bail(err error) {
+	msg := fmt.Sprintf("[recvtty] fatal error: %v\n", err)
+	os.Stderr.WriteString(msg)
+	os.Exit(1)
+}
+
+// handleSingle implements the "single" mode: it listens on the unix socket at
+// path, accepts exactly one connection, receives a terminal master file
+// descriptor over it and then copies data between that terminal and this
+// process's stdin/stdout until one of the two copy directions finishes.
+//
+// It returns an error if the socket cannot be set up, the descriptor cannot
+// be received, or the terminal cannot be configured.
+func handleSingle(path string) error {
+	// Open a socket.
+	ln, err := net.Listen("unix", path)
+	if err != nil {
+		return err
+	}
+	defer ln.Close()
+
+	// We only accept a single connection, since we can only really have
+	// one reader for os.Stdin. Plus this is all a PoC.
+	conn, err := ln.Accept()
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	// Close ln, to allow for other instances to take over.
+	ln.Close()
+
+	// Get the fd of the connection.
+	unixconn, ok := conn.(*net.UnixConn)
+	if !ok {
+		return fmt.Errorf("failed to cast to unixconn")
+	}
+
+	socket, err := unixconn.File()
+	if err != nil {
+		return err
+	}
+	defer socket.Close()
+
+	// Get the master file descriptor from runC.
+	master, err := utils.RecvFd(socket)
+	if err != nil {
+		return err
+	}
+	c, err := console.ConsoleFromFile(master)
+	if err != nil {
+		// The console never took ownership of the descriptor.
+		master.Close()
+		return err
+	}
+	if err := console.ClearONLCR(c.Fd()); err != nil {
+		c.Close()
+		return err
+	}
+
+	// Copy from our stdio to the master fd. The channel is buffered for
+	// both senders so that the copier finishing second can still deliver
+	// its signal and exit instead of blocking forever.
+	quitChan := make(chan struct{}, 2)
+	go func() {
+		io.Copy(os.Stdout, c)
+		quitChan <- struct{}{}
+	}()
+	go func() {
+		io.Copy(c, os.Stdin)
+		quitChan <- struct{}{}
+	}()
+
+	// Only close the master fd once we've stopped copying.
+	<-quitChan
+	c.Close()
+	return nil
+}
+
+// handleNull implements the "null" mode: it listens on the unix socket at
+// path and, for every connection, receives a terminal master file descriptor
+// and copies everything read from it to /dev/null in a separate goroutine.
+//
+// It only returns when listening or accepting fails; failures while serving
+// a single connection merely end that connection's goroutine.
+func handleNull(path string) error {
+	// Open a socket.
+	ln, err := net.Listen("unix", path)
+	if err != nil {
+		return err
+	}
+	defer ln.Close()
+
+	// As opposed to handleSingle we accept as many connections as we get, but
+	// we don't interact with Stdin at all (and we copy stdout to /dev/null).
+	for {
+		conn, err := ln.Accept()
+		if err != nil {
+			return err
+		}
+		go func(conn net.Conn) {
+			// Don't leave references lying around.
+			defer conn.Close()
+
+			// Get the fd of the connection.
+			unixconn, ok := conn.(*net.UnixConn)
+			if !ok {
+				return
+			}
+
+			socket, err := unixconn.File()
+			if err != nil {
+				return
+			}
+			defer socket.Close()
+
+			// Get the master file descriptor from runC.
+			master, err := utils.RecvFd(socket)
+			if err != nil {
+				return
+			}
+			// Release the received descriptor once this connection is
+			// done; otherwise every connection leaks one fd.
+			defer master.Close()
+
+			// Just do a dumb copy to /dev/null.
+			devnull, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
+			if err != nil {
+				// TODO: Handle this nicely.
+				return
+			}
+			defer devnull.Close()
+
+			io.Copy(devnull, master)
+		}(conn)
+	}
+}
+
+// main builds the recvtty command-line application, wires up its flags and
+// runs the selected mode ("single" or "null") against the socket path given
+// as the only positional argument. Any error is reported through bail.
+func main() {
+	app := cli.NewApp()
+	app.Name = "recvtty"
+	app.Usage = usage
+
+	// Set version to be the same as runC.
+	var versionLines []string
+	if version != "" {
+		versionLines = append(versionLines, version)
+	}
+	if gitCommit != "" {
+		versionLines = append(versionLines, fmt.Sprintf("commit: %s", gitCommit))
+	}
+	app.Version = strings.Join(versionLines, "\n")
+
+	// Set the flags.
+	modeFlag := cli.StringFlag{
+		Name:  "mode, m",
+		Value: "single",
+		Usage: "Mode of operation (single or null)",
+	}
+	pidFileFlag := cli.StringFlag{
+		Name:  "pid-file",
+		Value: "",
+		Usage: "Path to write daemon process ID to",
+	}
+	app.Flags = []cli.Flag{modeFlag, pidFileFlag}
+
+	app.Action = func(ctx *cli.Context) error {
+		args := ctx.Args()
+		if len(args) != 1 {
+			return fmt.Errorf("need to specify a single socket path")
+		}
+		path := args[0]
+
+		// Record our PID first when a pid file was requested.
+		if pidPath := ctx.String("pid-file"); pidPath != "" {
+			contents := []byte(fmt.Sprintf("%d\n", os.Getpid()))
+			if err := ioutil.WriteFile(pidPath, contents, 0644); err != nil {
+				return err
+			}
+		}
+
+		mode := ctx.String("mode")
+		switch mode {
+		case "single":
+			return handleSingle(path)
+		case "null":
+			return handleNull(path)
+		}
+		return fmt.Errorf("need to select a valid mode: %s", mode)
+	}
+
+	err := app.Run(os.Args)
+	if err != nil {
+		bail(err)
+	}
+}
--- a/contrib/completions/bash/runc
+++ b/contrib/completions/bash/runc
@ -0,0 +1,826 @@
+#!/bin/bash
+#
+# bash completion file for runc command
+#
+# This script provides completion of:
+#  - commands and their options
+#  - filepaths
+#
+# To enable the completions either:
+#  - place this file in /usr/share/bash-completion/completions
+#  or
+#  - copy this file to e.g. ~/.runc-completion.sh and add the line
+#    below to your .bashrc after bash completion features are loaded
+#    . ~/.runc-completion.sh
+#
+# Configuration:
+#
+
+# Note for developers:
+# Please arrange options sorted alphabetically by long name with the short
+# options immediately following their corresponding long form.
+# This order should be applied to lists, alternatives and code blocks.
+
+__runc_previous_extglob_setting=$(shopt -p extglob)
+shopt -s extglob
+
+# Completes the current word with the container IDs printed by "runc list -q".
+__runc_list_all() {
+	# $cur is quoted so that the partially typed word reaches compgen
+	# verbatim, without word splitting or pathname expansion.
+	COMPREPLY=($(compgen -W "$(runc list -q)" -- "$cur"))
+}
+
+# Prints the index in "words" of the first word after the (sub)command that is
+# not an option. $1 is a case pattern naming the options that take an
+# argument; the word following such an option is skipped as well. If no such
+# word is found the printed index is greater than $cword.
+__runc_pos_first_nonflag() {
+	local argument_flags=$1
+
+	# Start right after the subcommand if one was found, else after the command.
+	local counter=$((${subcommand_pos:-${command_pos}} + 1))
+	while [ $counter -le $cword ]; do
+		# eval is needed so that $argument_flags is parsed as a case pattern.
+		if [ -n "$argument_flags" ] && eval "case '${words[$counter]}' in $argument_flags) true ;; *) false ;; esac"; then
+			# Option with an argument: skip the argument too.
+			((counter++))
+		else
+			case "${words[$counter]}" in
+			-*) ;;
+			*)
+				# First word that is not an option: stop here.
+				break
+				;;
+			esac
+		fi
+		((counter++))
+	done
+
+	echo $counter
+}
+
+# Transforms a whitespace-separated (typically multiline) list of strings
+# into a single line string with the words separated by "|".
+# This is used to prepare arguments to __runc_pos_first_nonflag().
+__runc_to_alternatives() {
+	# Unquoted on purpose: word splitting turns the list into array elements.
+	local parts=($1)
+	# "${parts[*]}" joins the elements with the first character of IFS.
+	local IFS='|'
+	echo "${parts[*]}"
+}
+
+# Transforms a multiline list of options into an extglob pattern
+# suitable for use in case statements.
+# The result has the form "@(opt1|opt2|...)", built by wrapping the output of
+# __runc_to_alternatives.
+__runc_to_extglob() {
+	local extglob=$(__runc_to_alternatives "$1")
+	echo "@($extglob)"
+}
+
+# Subcommand processing.
+# Locates the first occurrence of any of the subcommands contained in the
+# first argument. In case of a match, calls the corresponding completion
+# function and returns 0.
+# If no match is found, 1 is returned. The calling function can then
+# continue processing its completion.
+#
+# TODO if the preceding command has options that accept arguments and an
+# argument is equal to one of the subcommands, this is falsely detected as
+# a match.
+__runc_subcommands() {
+	local subcommands="$1"
+
+	# Scan the words between the command and the word being completed.
+	local counter=$(($command_pos + 1))
+	while [ $counter -lt $cword ]; do
+		case "${words[$counter]}" in
+		$(__runc_to_extglob "$subcommands"))
+			# Not declared local here, so the position is assigned in
+			# whichever enclosing scope defines subcommand_pos.
+			subcommand_pos=$counter
+			local subcommand=${words[$counter]}
+			local completions_func=_runc_${command}_${subcommand}
+			# Call the completion function only if it is defined; a match
+			# returns 0 either way.
+			declare -F $completions_func >/dev/null && $completions_func
+			return 0
+			;;
+		esac
+		((counter++))
+	done
+	return 1
+}
+
+# Completes the current word with the signal names reported by "kill -l".
+# Only entries containing "SIG" are offered, and they are filtered by the
+# prefix typed so far.
+__runc_list_signals() {
+	COMPREPLY=($(compgen -W "$(for i in $(kill -l | xargs); do echo $i; done | grep SIG)" -- "$cur"))
+}
+
+# suppress trailing whitespace
+# Enables the "nospace" completion option for the current completion, when
+# the compopt builtin is available.
+__runc_nospace() {
+	# compopt is not available in ancient bash versions
+	type compopt &>/dev/null && compopt -o nospace
+}
+
+# The list of capabilities is defined in types.go, ALL was added manually.
+# Completes the current word with the capability names below, filtered by the
+# prefix typed so far.
+__runc_complete_capabilities() {
+	COMPREPLY=($(compgen -W "
+		ALL
+		AUDIT_CONTROL
+		AUDIT_WRITE
+		AUDIT_READ
+		BLOCK_SUSPEND
+		CHOWN
+		DAC_OVERRIDE
+		DAC_READ_SEARCH
+		FOWNER
+		FSETID
+		IPC_LOCK
+		IPC_OWNER
+		KILL
+		LEASE
+		LINUX_IMMUTABLE
+		MAC_ADMIN
+		MAC_OVERRIDE
+		MKNOD
+		NET_ADMIN
+		NET_BIND_SERVICE
+		NET_BROADCAST
+		NET_RAW
+		SETFCAP
+		SETGID
+		SETPCAP
+		SETUID
+		SYS_ADMIN
+		SYS_BOOT
+		SYS_CHROOT
+		SYSLOG
+		SYS_MODULE
+		SYS_NICE
+		SYS_PACCT
+		SYS_PTRACE
+		SYS_RAWIO
+		SYS_RESOURCE
+		SYS_TIME
+		SYS_TTY_CONFIG
+		WAKE_ALARM
+	" -- "$cur"))
+}
+
+# Completion for "runc exec": capability names after --cap, paths after the
+# path-valued options, environment variable names after --env, option names
+# when the current word starts with "-", and container IDs otherwise.
+_runc_exec() {
+	# The option lists are whitespace-separated words. They must not contain
+	# commas: every word is offered verbatim by compgen and is also turned
+	# into a case pattern by __runc_to_extglob.
+	local boolean_options="
+	   --help
+	   --no-new-privs
+	   --tty -t
+	   --detach -d
+	"
+
+	local options_with_args="
+	   --console-socket
+	   --cwd
+	   --env -e
+	   --user -u
+	   --additional-gids -g
+	   --process -p
+	   --pid-file
+	   --process-label
+	   --apparmor
+	   --cap -c
+	   --preserve-fds
+	"
+
+	local all_options="$options_with_args $boolean_options"
+
+	case "$prev" in
+	--cap | -c)
+		__runc_complete_capabilities
+		return
+		;;
+
+	--console-socket | --cwd | --process | --apparmor)
+		case "$cur" in
+		*:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine)
+		'')
+			COMPREPLY=($(compgen -W '/' -- "$cur"))
+			__runc_nospace
+			;;
+		/*)
+			_filedir
+			__runc_nospace
+			;;
+		esac
+		return
+		;;
+	--env | -e)
+		COMPREPLY=($(compgen -e -- "$cur"))
+		__runc_nospace
+		return
+		;;
+	# Any other option that takes an argument: nothing sensible to offer.
+	$(__runc_to_extglob "$options_with_args"))
+		return
+		;;
+	esac
+
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$all_options" -- "$cur"))
+		;;
+	*)
+		__runc_list_all
+		;;
+	esac
+}
+
+# global options that may appear after the runc command
+# Completes paths after --log/--root/--criu, "text"/"json" after
+# --log-format, option names when the current word starts with "-", and the
+# command names (plus "help") at the first non-option position.
+_runc_runc() {
+	local boolean_options="
+		$global_boolean_options
+		--help
+		--version -v
+		--debug
+	"
+	local options_with_args="
+		--log
+		--log-format
+		--root
+		--criu
+		--rootless
+	"
+
+	case "$prev" in
+	--log | --root | --criu)
+		case "$cur" in
+		*:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine)
+		'')
+			COMPREPLY=($(compgen -W '/' -- "$cur"))
+			__runc_nospace
+			;;
+		*)
+			_filedir
+			__runc_nospace
+			;;
+		esac
+		return
+		;;
+
+	--log-format)
+		COMPREPLY=($(compgen -W 'text json' -- "$cur"))
+		return
+		;;
+
+	# Any other option that takes an argument: offer nothing.
+	$(__runc_to_extglob "$options_with_args"))
+		return
+		;;
+	esac
+
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+		;;
+	*)
+		# Offer command names only at the first non-option position.
+		local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args"))
+		if [ $cword -eq $counter ]; then
+			COMPREPLY=($(compgen -W "${commands[*]} help" -- "$cur"))
+		fi
+		;;
+	esac
+}
+
+_runc_pause() {
+	local boolean_options="
+	   --help
+	   -h
+	"
+
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+		;;
+	*)
+		__runc_list_all
+		;;
+	esac
+}
+
+# Completion for "runc ps": the output formats after --format/-f, option
+# names when the current word starts with "-", container IDs otherwise.
+_runc_ps() {
+	local boolean_options="
+	   --help
+	   -h
+	"
+	# Whitespace-separated words only: a comma would become part of the
+	# completed option name.
+	local options_with_args="
+	   --format
+	   -f
+	"
+
+	case "$prev" in
+	--format | -f)
+		COMPREPLY=($(compgen -W 'table json' -- "$cur"))
+		return
+		;;
+	esac
+
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+		;;
+	*)
+		__runc_list_all
+		;;
+	esac
+}
+
+# Completion for "runc delete": option names when the current word starts
+# with "-", container IDs otherwise.
+_runc_delete() {
+	# "runc delete" has --force/-f (not --format); the list is made of
+	# whitespace-separated words, so it must not contain commas.
+	local boolean_options="
+	   --help
+	   -h
+	   --force
+	   -f
+	"
+
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+		;;
+	*)
+		__runc_list_all
+		;;
+	esac
+}
+
+# Completion for "runc kill": option names when the current word starts with
+# "-", container IDs directly after "kill", signal names otherwise.
+_runc_kill() {
+	local boolean_options="
+	   --help
+	   -h
+	   --all
+	   -a
+	"
+
+	# Options are handled first; previously this branch came after a case
+	# statement that returned unconditionally and was therefore unreachable.
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+		return
+		;;
+	esac
+
+	case "$prev" in
+	"kill")
+		__runc_list_all
+		;;
+	*)
+		__runc_list_signals
+		;;
+	esac
+}
+
+# Completion for "runc events": nothing after --interval (it takes a value),
+# option names when the current word starts with "-", container IDs
+# otherwise.
+_runc_events() {
+	local boolean_options="
+	   --help
+	   --stats
+	"
+
+	local options_with_args="
+	   --interval
+	"
+
+	case "$prev" in
+	# The previous word is an option expecting an argument: offer nothing.
+	$(__runc_to_extglob "$options_with_args"))
+		return
+		;;
+	esac
+
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+		;;
+	*)
+		__runc_list_all
+		;;
+	esac
+}
+
+# Completion for "runc list": the output formats after --format/-f and option
+# names when the current word starts with "-". "runc list" takes no
+# positional arguments, so nothing is offered otherwise.
+_runc_list() {
+	local boolean_options="
+	   --help
+	   --quiet
+	   -q
+	"
+
+	local options_with_args="
+	   --format
+	   -f
+	"
+
+	case "$prev" in
+	--format | -f)
+		# "runc list" accepts the formats "table" and "json".
+		COMPREPLY=($(compgen -W 'table json' -- "$cur"))
+		return
+		;;
+
+	$(__runc_to_extglob "$options_with_args"))
+		return
+		;;
+	esac
+
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+		;;
+	esac
+}
+
+# Completion for "runc spec": paths after --bundle/-b and option names when
+# the current word starts with "-". Nothing is offered otherwise.
+_runc_spec() {
+	local boolean_options="
+	   --help
+	   --rootless
+	"
+
+	local options_with_args="
+	   --bundle
+	   -b
+	"
+
+	case "$prev" in
+	--bundle | -b)
+		case "$cur" in
+		'')
+			COMPREPLY=($(compgen -W '/' -- "$cur"))
+			__runc_nospace
+			;;
+		/*)
+			_filedir
+			__runc_nospace
+			;;
+		esac
+		return
+		;;
+
+	$(__runc_to_extglob "$options_with_args"))
+		return
+		;;
+	esac
+
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+		;;
+	esac
+}
+
+# Completion for "runc run": paths after the path-valued options, nothing
+# after other options that take an argument, option names when the current
+# word starts with "-", and container IDs otherwise.
+_runc_run() {
+	local boolean_options="
+	   --help
+	   --detach
+	   -d
+	   --no-subreaper
+	   --no-pivot
+	   --no-new-keyring
+	"
+
+	local options_with_args="
+	   --bundle
+	   -b
+	   --console-socket
+	   --pid-file
+	   --preserve-fds
+	"
+
+	case "$prev" in
+	--bundle | -b | --console-socket | --pid-file)
+		case "$cur" in
+		'')
+			COMPREPLY=($(compgen -W '/' -- "$cur"))
+			__runc_nospace
+			;;
+		/*)
+			_filedir
+			__runc_nospace
+			;;
+		esac
+		return
+		;;
+
+	$(__runc_to_extglob "$options_with_args"))
+		return
+		;;
+	esac
+
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+		;;
+	*)
+		__runc_list_all
+		;;
+	esac
+}
+
+# Completion for `runc checkpoint`. Suggests values for
+# --manage-cgroups-mode, completes paths for the *-path options, completes
+# flags for words starting with "-", and hands any other word to
+# __runc_list_all.
+_runc_checkpoint() {
+	local boolean_options="
+	   --help
+	   -h
+	   --leave-running
+	   --tcp-established
+	   --ext-unix-sk
+	   --shell-job
+	   --lazy-pages
+	   --file-locks
+	   --pre-dump
+	   --auto-dedup
+	"
+
+	local options_with_args="
+	   --image-path
+	   --work-path
+	   --parent-path
+	   --status-fd
+	   --page-server
+	   --manage-cgroups-mode
+	   --empty-ns
+	"
+
+	case "$prev" in
+	# No early return here: control continues with the dispatch on $cur below.
+	--page-server) ;;
+
+	--manage-cgroups-mode)
+		COMPREPLY=($(compgen -W "soft full strict" -- "$cur"))
+		return
+		;;
+
+	--image-path | --work-path | --parent-path)
+		case "$cur" in
+		*:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine)
+		'')
+			COMPREPLY=($(compgen -W '/' -- "$cur"))
+			__runc_nospace
+			;;
+		*)
+			_filedir
+			__runc_nospace
+			;;
+		esac
+		return
+		;;
+
+	$(__runc_to_extglob "$options_with_args"))
+		return
+		;;
+	esac
+
+	if [[ "$cur" == -* ]]; then
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+	else
+		__runc_list_all
+	fi
+}
+# Completion for `runc create`. Completes absolute paths for the path-valued
+# options, flags for words starting with "-", and hands any other word to
+# __runc_list_all.
+_runc_create() {
+	local boolean_options="
+	   --help
+	   --no-pivot
+	   --no-new-keyring
+	"
+
+	local options_with_args="
+	   --bundle
+	   -b
+	   --console-socket
+	   --pid-file
+	   --preserve-fds
+	"
+	case "$prev" in
+	--bundle | -b | --console-socket | --pid-file)
+		case "$cur" in
+		'')
+			# Nothing typed yet: start the path at the filesystem root.
+			COMPREPLY=($(compgen -W '/' -- "$cur"))
+			__runc_nospace
+			;;
+		/*)
+			_filedir
+			__runc_nospace
+			;;
+		esac
+		return
+		;;
+
+	$(__runc_to_extglob "$options_with_args"))
+		return
+		;;
+	esac
+
+	if [[ "$cur" == -* ]]; then
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+	else
+		__runc_list_all
+	fi
+
+}
+
+# Completion for `runc help`. Suggests the subcommand names when the word
+# being completed sits at the position reported by __runc_pos_first_nonflag.
+# `commands`, `cword` and `cur` are not declared here; they are the locals set
+# up by _runc, which calls this function.
+_runc_help() {
+	local counter=$(__runc_pos_first_nonflag)
+	if [ $cword -eq $counter ]; then
+		COMPREPLY=($(compgen -W "${commands[*]}" -- "$cur"))
+	fi
+}
+
+# Completion for `runc restore`. Suggests values for --manage-cgroups-mode,
+# completes absolute paths for the path-valued options, completes flags for
+# words starting with "-", and hands any other word to __runc_list_all.
+_runc_restore() {
+	local boolean_options="
+	   --help
+	   --tcp-established
+	   --ext-unix-sk
+	   --shell-job
+	   --file-locks
+	   --detach
+	   -d
+	   --no-subreaper
+	   --no-pivot
+	   --auto-dedup
+	   --lazy-pages
+	"
+
+	local options_with_args="
+	   -b
+	   --bundle
+	   --image-path
+	   --work-path
+	   --manage-cgroups-mode
+	   --pid-file
+	   --empty-ns
+	"
+
+	# Word list used for flag completion below.
+	local all_options="$options_with_args $boolean_options"
+
+	case "$prev" in
+	--manage-cgroups-mode)
+		COMPREPLY=($(compgen -W "soft full strict" -- "$cur"))
+		return
+		;;
+
+	--pid-file | --image-path | --work-path | --bundle | -b)
+		case "$cur" in
+		*:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine)
+		'')
+			COMPREPLY=($(compgen -W '/' -- "$cur"))
+			__runc_nospace
+			;;
+		/*)
+			_filedir
+			__runc_nospace
+			;;
+		esac
+		return
+		;;
+
+	$(__runc_to_extglob "$options_with_args"))
+		return
+		;;
+	esac
+
+	if [[ "$cur" == -* ]]; then
+		COMPREPLY=($(compgen -W "$all_options" -- "$cur"))
+	else
+		__runc_list_all
+	fi
+}
+
+_runc_resume() {
+	local boolean_options="
+	   --help
+	   -h
+	"
+
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+		;;
+	*)
+		__runc_list_all
+		;;
+	esac
+}
+
+_runc_state() {
+	local boolean_options="
+	   --help
+	   -h
+	"
+
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+		;;
+	*)
+		__runc_list_all
+		;;
+	esac
+}
+_runc_start() {
+	local boolean_options="
+	   --help
+	   -h
+	"
+
+	case "$cur" in
+	-*)
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+		;;
+	*)
+		__runc_list_all
+		;;
+	esac
+}
+# Completion for `runc update`. Nothing is suggested when the previous word
+# matches one of the options that take an argument; a word starting with "-"
+# is completed against the known flags, and any other word is handed to
+# __runc_list_all.
+_runc_update() {
+	local boolean_options="
+	   --help
+	"
+
+	local options_with_args="
+	   --blkio-weight
+	   --cpu-period
+	   --cpu-quota
+	   --cpu-rt-period
+	   --cpu-rt-runtime
+	   --cpu-share
+	   --cpuset-cpus
+	   --cpuset-mems
+	   --kernel-memory
+	   --kernel-memory-tcp
+	   --memory
+	   --memory-reservation
+	   --memory-swap
+	   --pids-limit
+	   --l3-cache-schema
+	   --mem-bw-schema
+	"
+
+	# The pattern is built at match time from the option list above.
+	case "$prev" in
+	$(__runc_to_extglob "$options_with_args"))
+		return
+		;;
+	esac
+
+	if [[ "$cur" == -* ]]; then
+		COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur"))
+	else
+		__runc_list_all
+	fi
+}
+
+# Entry point registered with `complete`. Works out which runc subcommand is
+# being completed and dispatches to the matching _runc_<command> function.
+# The locals declared here (commands, cur, prev, words, cword, command,
+# command_pos, ...) are visible to the function it calls, e.g. _runc_help
+# reads `commands`, `cword` and `cur`.
+_runc() {
+	# Enable extglob for the duration of the completion; the caller's setting
+	# is restored by the eval at the end of this function.
+	local previous_extglob_setting=$(shopt -p extglob)
+	shopt -s extglob
+
+	local commands=(
+		checkpoint
+		create
+		delete
+		events
+		exec
+		init
+		kill
+		list
+		pause
+		ps
+		restore
+		resume
+		run
+		spec
+		start
+		state
+		update
+		help
+		h
+	)
+
+	# These options are valid as global options for all commands.
+	# NOTE(review): this variable is not read within this function -- confirm
+	# whether any helper consumes it.
+	local global_boolean_options="
+		--help -h
+		--version -v
+	"
+
+	COMPREPLY=()
+	local cur prev words cword
+	# bash-completion helper that fills in the four variables named above;
+	# `-n :` presumably keeps ':' from being treated as a word break.
+	_get_comp_words_by_ref -n : cur prev words cword
+
+	# Scan the words before the cursor: the first one that is not a flag is
+	# taken as the subcommand. Without one, `command` stays 'runc'.
+	local command='runc' command_pos=0 subcommand_pos
+	local counter=1
+	while [ $counter -lt $cword ]; do
+		case "${words[$counter]}" in
+		-*) ;;
+		=)
+			# A lone '=' word: skip it together with the word that follows.
+			((counter++))
+			;;
+		*)
+			command="${words[$counter]}"
+			command_pos=$counter
+			break
+			;;
+		esac
+		((counter++))
+	done
+
+	# Call _runc_<command> only if such a function is defined.
+	local completions_func=_runc_${command}
+	declare -F $completions_func >/dev/null && $completions_func
+
+	eval "$previous_extglob_setting"
+	return 0
+}
+
+eval "$__runc_previous_extglob_setting"
+unset __runc_previous_extglob_setting
+
+complete -F _runc runc
--- a/create.go
+++ b/create.go
@ -0,0 +1,74 @@
+package main
+
+import (
+	"os"
+
+	"github.com/urfave/cli"
+)
+
+// createCommand implements "runc create". Its action validates the arguments,
+// revises the pid-file setting, loads the spec and starts the container with
+// the CT_ACT_CREATE action, then exits the process with the returned status.
+var createCommand = cli.Command{
+	Name:  "create",
+	Usage: "create a container",
+	ArgsUsage: `<container-id>
+
+Where "<container-id>" is your name for the instance of the container that you
+are starting. The name you provide for the container instance must be unique on
+your host.`,
+	Description: `The create command creates an instance of a container for a bundle. The bundle
+is a directory with a specification file named "` + specConfig + `" and a root
+filesystem.
+
+The specification file includes an args parameter. The args parameter is used
+to specify command(s) that get run when the container is started. To change the
+command(s) that get executed on start, edit the args parameter of the spec. See
+"runc spec --help" for more explanation.`,
+	Flags: []cli.Flag{
+		cli.StringFlag{
+			Name:  "bundle, b",
+			Value: "",
+			Usage: `path to the root of the bundle directory, defaults to the current directory`,
+		},
+		cli.StringFlag{
+			Name:  "console-socket",
+			Value: "",
+			Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
+		},
+		cli.StringFlag{
+			Name:  "pid-file",
+			Value: "",
+			Usage: "specify the file to write the process id to",
+		},
+		cli.BoolFlag{
+			Name:  "no-pivot",
+			Usage: "do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk",
+		},
+		cli.BoolFlag{
+			Name:  "no-new-keyring",
+			Usage: "do not create a new session keyring for the container.  This will cause the container to inherit the calling processes session key",
+		},
+		cli.IntFlag{
+			Name:  "preserve-fds",
+			Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
+		},
+	},
+	Action: func(context *cli.Context) error {
+		// The command takes the container id as its single argument.
+		if err := checkArgs(context, 1, exactArgs); err != nil {
+			return err
+		}
+		if err := revisePidFile(context); err != nil {
+			return err
+		}
+		spec, err := setupSpec(context)
+		if err != nil {
+			return err
+		}
+		status, err := startContainer(context, spec, CT_ACT_CREATE, nil)
+		if err != nil {
+			return err
+		}
+		// exit with the container's exit status so any external supervisor is
+		// notified of the exit with the correct exit status.
+		os.Exit(status)
+		// Not reached: os.Exit does not return. The statement only satisfies
+		// the function's signature.
+		return nil
+	},
+}
--- a/delete.go
+++ b/delete.go
@ -0,0 +1,89 @@
+// +build !solaris
+
+package main
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"syscall"
+	"time"
+
+	"github.com/opencontainers/runc/libcontainer"
+	"github.com/urfave/cli"
+
+	"golang.org/x/sys/unix"
+)
+
+func killContainer(container libcontainer.Container) error {
+	_ = container.Signal(unix.SIGKILL, false)
+	for i := 0; i < 100; i++ {
+		time.Sleep(100 * time.Millisecond)
+		if err := container.Signal(syscall.Signal(0), false); err != nil {
+			destroy(container)
+			return nil
+		}
+	}
+	return fmt.Errorf("container init still running")
+}
+
+// deleteCommand implements "runc delete". A stopped container is destroyed, a
+// created container is killed and destroyed, and a container in any other
+// state is only killed when --force is given. If the container cannot be
+// loaded because it does not exist, its directory under the global root is
+// removed anyway.
+var deleteCommand = cli.Command{
+	Name:  "delete",
+	Usage: "delete any resources held by the container often used with detached container",
+	ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container.
+
+EXAMPLE:
+For example, if the container id is "ubuntu01" and runc list currently shows the
+status of "ubuntu01" as "stopped" the following will delete resources held for
+"ubuntu01" removing "ubuntu01" from the runc list of containers:
+
+       # runc delete ubuntu01`,
+	Flags: []cli.Flag{
+		cli.BoolFlag{
+			Name:  "force, f",
+			Usage: "Forcibly deletes the container if it is still running (uses SIGKILL)",
+		},
+	},
+	Action: func(context *cli.Context) error {
+		if err := checkArgs(context, 1, exactArgs); err != nil {
+			return err
+		}
+
+		id := context.Args().First()
+		force := context.Bool("force")
+		container, err := getContainer(context)
+		if err != nil {
+			if lerr, ok := err.(libcontainer.Error); ok && lerr.Code() == libcontainer.ContainerNotExists {
+				// if there was an aborted start or something of the sort then the container's directory could exist but
+				// libcontainer does not see it because the state.json file inside that directory was never created.
+				path := filepath.Join(context.GlobalString("root"), id)
+				if e := os.RemoveAll(path); e != nil {
+					fmt.Fprintf(os.Stderr, "remove %s: %v\n", path, e)
+				}
+				// With --force a missing container is not an error.
+				if force {
+					return nil
+				}
+			}
+			return err
+		}
+		s, err := container.Status()
+		if err != nil {
+			return err
+		}
+		switch s {
+		case libcontainer.Stopped:
+			destroy(container)
+		case libcontainer.Created:
+			return killContainer(container)
+		default:
+			if force {
+				return killContainer(container)
+			}
+			// Error strings carry no trailing newline; whoever prints the
+			// error adds the line ending.
+			return fmt.Errorf("cannot delete container %s that is not stopped: %s", id, s)
+		}
+
+		return nil
+	},
+}
--- a/docs/checkpoint-restore.md
+++ b/docs/checkpoint-restore.md
@ -0,0 +1,50 @@
+# Checkpoint and Restore #
+
+For a basic description about checkpointing and restoring containers with
+`runc` please see [runc-checkpoint(8)](../man/runc-checkpoint.8.md) and
+[runc-restore(8)](../man/runc-restore.8.md).
+
+## Checkpoint/Restore Annotations ##
+
+In addition to specifying options on the command-line as described
+in the man-pages (see above), it is also possible to influence CRIU's
+behaviour using CRIU configuration files. For details about CRIU's
+configuration file support please see [CRIU's wiki](https://criu.org/Configuration_files).
+
+In addition to CRIU's default configuration files `runc` tells CRIU to
+also evaluate the file `/etc/criu/runc.conf`. Using the annotation
+`org.criu.config` it is, however, possible to change this additional
+CRIU configuration file.
+
+If the annotation `org.criu.config` is set to an empty string `runc`
+will not pass any additional configuration file to CRIU. With an empty
+string it is therefore possible to disable the additional CRIU configuration
+file. This can be used to make sure that no additional configuration file
+changes CRIU's behaviour accidentally.
+
+If the annotation `org.criu.config` is set to a non-empty string `runc` will
+pass that string to CRIU to be evaluated as an additional configuration file.
+If CRIU cannot open this additional configuration file, it will ignore this
+file and continue.
+
+### Annotation Example to disable additional CRIU configuration file ###
+
+```
+{
+	"ociVersion": "1.0.0",
+	"annotations": {
+		"org.criu.config": ""
+	},
+	"process": {
+```
+
+### Annotation Example to set a specific CRIU configuration file ###
+
+```
+{
+	"ociVersion": "1.0.0",
+	"annotations": {
+		"org.criu.config": "/etc/special-runc-criu-options"
+	},
+	"process": {
+```
--- a/docs/terminals.md
+++ b/docs/terminals.md
@ -0,0 +1,314 @@
+# Terminals and Standard IO #
+
+*Note that the default configuration of `runc` (foreground, new terminal) is
+generally the best option for most users. This document exists to help explain
+what the purpose of the different modes is, and to try to steer users away from
+common mistakes and misunderstandings.*
+
+In general, most processes on Unix (and Unix-like) operating systems have 3
+standard file descriptors provided at the start, collectively referred to as
+"standard IO" (`stdio`):
+
+* `0`: standard-in (`stdin`), the input stream into the process
+* `1`: standard-out (`stdout`), the output stream from the process
+* `2`: standard-error (`stderr`), the error stream from the process
+
+When creating and running a container via `runc`, it is important to take care
+to structure the `stdio` the new container's process receives. In some ways
+containers are just regular processes, while in other ways they're an isolated
+sub-partition of your machine (in a similar sense to a VM). This means that the
+structure of IO is not as simple as with ordinary programs (which generally
+just use the file descriptors you give them).
+
+## Other File Descriptors ##
+
+Before we continue, it is important to note that processes can have more file
+descriptors than just `stdio`. By default in `runc` no other file descriptors
+will be passed to the spawned container process. If you wish to explicitly pass
+file descriptors to the container you have to use the `--preserve-fds` option.
+These ancillary file descriptors don't have any of the strange semantics
+discussed further in this document (those only apply to `stdio`) -- they are
+passed untouched by `runc`.
+
+It should be noted that `--preserve-fds` does not take individual file
+descriptors to preserve. Instead, it takes how many file descriptors (not
+including `stdio` or `LISTEN_FDS`) should be passed to the container. In the
+following example:
+
+```
+% runc run --preserve-fds 5 <container>
+```
+
+`runc` will pass the first `5` file descriptors (`3`, `4`, `5`, `6`, and `7` --
+assuming that `LISTEN_FDS` has not been configured) to the container.
+
+In addition to `--preserve-fds`, `LISTEN_FDS` file descriptors are passed
+automatically to allow for `systemd`-style socket activation. To extend the
+above example:
+
+```
+% LISTEN_PID=$pid_of_runc LISTEN_FDS=3 runc run --preserve-fds 5 <container>
+```
+
+`runc` will now pass the first `8` file descriptors (and it will also pass
+`LISTEN_FDS=3` and `LISTEN_PID=1` to the container). The first `3` (`3`, `4`,
+and `5`) were passed due to `LISTEN_FDS` and the other `5` (`6`, `7`, `8`, `9`,
+and `10`) were passed due to `--preserve-fds`. You should keep this in mind if
+you use `runc` directly in something like a `systemd` unit file. To disable
+this `LISTEN_FDS`-style passing just unset `LISTEN_FDS`.
+
+**Be very careful when passing file descriptors to a container process.** Due
+to some Linux kernel (mis)features, a container with access to certain types of
+file descriptors (such as `O_PATH` descriptors) outside of the container's root
+file system can use these to break out of the container's pivoted mount
+namespace. [This has resulted in CVEs in the past.][CVE-2016-9962]
+
+[CVE-2016-9962]: https://nvd.nist.gov/vuln/detail/CVE-2016-9962
+
+## <a name="terminal-modes" /> Terminal Modes ##
+
+`runc` supports two distinct methods for passing `stdio` to the container's
+primary process:
+
+* [new terminal](#new-terminal) (`terminal: true`)
+* [pass-through](#pass-through) (`terminal: false`)
+
+When first using `runc` these two modes will look incredibly similar, but this
+can be quite deceptive as these different modes have quite different
+characteristics.
+
+By default, `runc spec` will create a configuration that will create a new
+terminal (`terminal: true`). However, if the `terminal: ...` line is not
+present in `config.json` then pass-through is the default.
+
+*In general we recommend using new terminal, because it means that tools like
+`sudo` will work inside your container. But pass-through can be useful if you
+know what you're doing, or if you're using `runc` as part of a non-interactive
+pipeline.*
+
+### <a name="new-terminal" /> New Terminal ###
+
+In new terminal mode, `runc` will create a brand-new "console" (or more
+precisely, a new pseudo-terminal using the container's namespaced
+`/dev/pts/ptmx`) for your contained process to use as its `stdio`.
+
+When you start a process in new terminal mode, `runc` will do the following:
+
+1. Create a new pseudo-terminal.
+2. Pass the slave end to the container's primary process as its `stdio`.
+3. Send the master end to a process to interact with the `stdio` for the
+   container's primary process ([details below](#runc-modes)).
+
+It should be noted that since a new pseudo-terminal is being used for
+communication with the container, some strange properties of pseudo-terminals
+might surprise you. For instance, by default, all new pseudo-terminals
+translate the byte `'\n'` to the sequence `'\r\n'` on both `stdout` and
+`stderr`. In addition there are [a whole range of `ioctls(2)` that can only
+interact with pseudo-terminal `stdio`][tty_ioctl(4)].
+
+> **NOTE**: In new terminal mode, all three `stdio` file descriptors are the
+> same underlying file. The reason for this is to match how a shell's `stdio`
+> looks to a process (as well as remove race condition issues with having to
+> deal with multiple master pseudo-terminal file descriptors). However this
+> means that it is not really possible to uniquely distinguish between `stdout`
+> and `stderr` from the caller's perspective.
+
+[tty_ioctl(4)]: https://linux.die.net/man/4/tty_ioctl
+
+### <a name="pass-through" /> Pass-Through ###
+
+If you have already set up some file handles that you wish your contained
+process to use as its `stdio`, then you can ask `runc` to pass them through to
+the contained process (this is not necessarily the same as `--preserve-fds`'s
+passing of file descriptors -- [details below](#runc-modes)). As an example
+(assuming that `terminal: false` is set in `config.json`):
+
+```
+% echo input | runc run some_container > /tmp/log.out 2> /tmp/log.err
+```
+
+Here the container's various `stdio` file descriptors will be substituted with
+the following:
+
+* `stdin` will be sourced from the `echo input` pipeline.
+* `stdout` will be output into `/tmp/log.out` on the host.
+* `stderr` will be output into `/tmp/log.err` on the host.
+
+It should be noted that the actual file handles seen inside the container may
+be different [based on the mode `runc` is being used in](#runc-modes) (for
+instance, the file referenced by `1` could be `/tmp/log.out` directly or a pipe
+which `runc` is using to buffer output, based on the mode). However the net
+result will be the same in either case. In principle you could use the [new
+terminal mode](#new-terminal) in a pipeline, but the difference will become
+more clear when you are introduced to [`runc`'s detached mode](#runc-modes).
+
+## <a name="runc-modes" /> `runc` Modes ##
+
+`runc` itself runs in two modes:
+
+* [foreground](#foreground)
+* [detached](#detached)
+
+You can use either [terminal mode](#terminal-modes) with either `runc` mode.
+However, there are considerations that may indicate preference for one mode
+over another. It should be noted that while two types of modes (terminal and
+`runc`) are conceptually independent from each other, you should be aware of
+the intricacies of which combination you are using.
+
+*In general we recommend using foreground because it's the most
+straight-forward to use, with the only downside being that you will have a
+long-running `runc` process. Detached mode is difficult to get right and
+generally requires having your own `stdio` management.*
+
+### Foreground ###
+
+The default (and most straight-forward) mode of `runc`. In this mode, your
+`runc` command remains in the foreground with the container process as a child.
+All `stdio` is buffered through the foreground `runc` process (irrespective of
+which terminal mode you are using). This is conceptually quite similar to
+running a normal process interactively in a shell (and if you are using `runc`
+in a shell interactively, this is what you should use).
+
+Because the `stdio` will be buffered in this mode, some very important
+peculiarities of this mode should be kept in mind:
+
+* With [new terminal mode](#new-terminal), the container will see a
+  pseudo-terminal as its `stdio` (as you might expect). However, the `stdio` of
+  the foreground `runc` process will remain the `stdio` that the process was
+  started with -- and `runc` will copy all `stdio` between its `stdio` and the
+  container's `stdio`. This means that while a new pseudo-terminal has been
+  created, the foreground `runc` process manages it over the lifetime of the
+  container.
+
+* With [pass-through mode](#pass-through), the foreground `runc`'s `stdio` is
+  **not** passed to the container. Instead, the container's `stdio` is a set of
+  pipes which are used to copy data between `runc`'s `stdio` and the
+  container's `stdio`. This means that the container never has direct access to
+  host file descriptors (aside from the pipes created by the container runtime,
+  but that shouldn't be an issue).
+
+The main drawback of the foreground mode of operation is that it requires a
+long-running foreground `runc` process. If you kill the foreground `runc`
+process then you will no longer have access to the `stdio` of the container
+(and in most cases this will result in the container dying abnormally due to
+`SIGPIPE` or some other error). By extension this means that any bug in the
+long-running foreground `runc` process (such as a memory leak) or a stray
+OOM-kill sweep could result in your container being killed **through no fault
+of the user**. In addition, there is no way in foreground mode of passing a
+file descriptor directly to the container process as its `stdio` (like
+`--preserve-fds` does).
+
+These shortcomings are obviously sub-optimal and are the reason that `runc` has
+an additional mode called "detached mode".
+
+### Detached ###
+
+In contrast to foreground mode, in detached mode there is no long-running
+foreground `runc` process once the container has started. In fact, there is no
+long-running `runc` process at all. However, this means that it is up to the
+caller to handle the `stdio` after `runc` has set it up for you. In a shell
+this means that the `runc` command will exit and control will return to the
+shell, after the container has been set up.
+
+You can run `runc` in detached mode in one of the following ways:
+
+* `runc run -d ...` which operates similarly to `runc run` but is detached.
+* `runc create` followed by `runc start` which is the standard container
+  lifecycle defined by the OCI runtime specification (`runc create` sets up the
+  container completely, waiting for `runc start` to begin execution of user
+  code).
+
+The main use-case of detached mode is for higher-level tools that want to be
+wrappers around `runc`. By running `runc` in detached mode, those tools have
+far more control over the container's `stdio` without `runc` getting in the
+way (most wrappers around `runc` like `cri-o` or `containerd` use detached mode
+for this reason).
+
+Unfortunately using detached mode is a bit more complicated and requires more
+care than the foreground mode -- mainly because it is now up to the caller to
+handle the `stdio` of the container.
+
+#### Detached Pass-Through ####
+
+In detached mode, pass-through actually does what it says on the tin -- the
+`stdio` file descriptors of the `runc` process are passed through (untouched)
+to the container's `stdio`. The purpose of this option is to allow a user to
+set up `stdio` for a container themselves and then force `runc` to just use
+their pre-prepared `stdio` (without any pseudo-terminal funny business). *If
+you don't see why this would be useful, don't use this option.*
+
+**You must be incredibly careful when using detached pass-through (especially
+in a shell).** The reason for this is that by using detached pass-through you
+are passing host file descriptors to the container. In the case of a shell,
+usually your `stdio` is going to be a pseudo-terminal (on your host). A
+malicious container could take advantage of TTY-specific `ioctls` like
+`TIOCSTI` to fake input into the **host** shell (remember that in detached
+mode, control is returned to your shell and so the terminal you've given the
+container is being read by a shell prompt).
+
+There are also several other issues with running non-malicious containers in a
+shell with detached pass-through (where you pass your shell's `stdio` to the
+container):
+
+* Output from the container will be interleaved with output from your shell (in
+  a non-deterministic way), without any real way of distinguishing where a
+  particular piece of output came from.
+
+* Any input to `stdin` will be non-deterministically split and given to either
+  the container or the shell (because both are blocked on a `read(2)` of the
+  same FIFO-style file descriptor).
+
+They are all related to the fact that there is going to be a race when either
+your host or the container tries to read from (or write to) `stdio`. This
+problem is especially obvious when in a shell, where usually the terminal has
+been put into raw mode (where each individual key-press should cause `read(2)`
+to return).
+
+> **NOTE**: There is also currently a [known problem][issue-1721] where using
+> detached pass-through will result in the container hanging if the `stdout` or
+> `stderr` is a pipe (though this should be a temporary issue).
+
+[issue-1721]: https://github.com/opencontainers/runc/issues/1721
+
+#### Detached New Terminal ####
+
+When creating a new pseudo-terminal in detached mode, a fairly obvious
+problem appears -- how do we use the new terminal that `runc` created? Unlike
+in pass-through, `runc` has created a new set of file descriptors that need to
+be used by *something* in order for container communication to work.
+
+The way this problem is resolved is through the use of Unix domain sockets.
+There is a feature of Unix sockets called `SCM_RIGHTS` which allows a file
+descriptor to be sent through a Unix socket to a completely separate process
+(which can then use that file descriptor as though they opened it). When using
+`runc` in detached new terminal mode, this is how a user gets access to the
+pseudo-terminal's master file descriptor.
+
+To this end, there is a new option (which is required if you want to use `runc`
+in detached new terminal mode): `--console-socket`. This option takes the path
+to a Unix domain socket which `runc` will connect to and send the
+pseudo-terminal master file descriptor down. The general process for getting
+the pseudo-terminal master is as follows:
+
+1. Create a Unix domain socket at some path, `$socket_path`.
+2. Call `runc run` or `runc create` with the argument `--console-socket
+   $socket_path`.
+3. Using `recvmsg(2)` retrieve the file descriptor sent using `SCM_RIGHTS` by
+   `runc`.
+4. Now the manager can interact with the `stdio` of the container, using the
+   retrieved pseudo-terminal master.
+
+After `runc` exits, the only process with a copy of the pseudo-terminal master
+file descriptor is whoever read the file descriptor from the socket.
+
+> **NOTE**: Currently `runc` doesn't support abstract socket addresses (due to
+> it not being possible to pass an `argv` with a null-byte as the first
+> character). In the future this may change, but currently you must use a valid
+> path name.
+
+In order to help users make use of detached new terminal mode, we have provided
+a [Go implementation in the `go-runc` bindings][containerd/go-runc.Socket], as
+well as [a simple client][recvtty].
+
+[containerd/go-runc.Socket]: https://godoc.org/github.com/containerd/go-runc#Socket
+[recvtty]: /contrib/cmd/recvtty
--- a/events.go
+++ b/events.go
@ -0,0 +1,215 @@
+// +build linux
+
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"sync"
+	"time"
+
+	"github.com/opencontainers/runc/libcontainer"
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/intelrdt"
+	"github.com/opencontainers/runc/types"
+
+	"github.com/sirupsen/logrus"
+	"github.com/urfave/cli"
+)
+
+// eventsCommand implements "runc events". It writes JSON-encoded events for
+// one container to stdout: either a single stats event (--stats) or a stream
+// of periodic stats events plus OOM notifications that ends when the OOM
+// notification channel is closed.
+var eventsCommand = cli.Command{
+	Name:  "events",
+	Usage: "display container events such as OOM notifications, cpu, memory, and IO usage statistics",
+	ArgsUsage: `<container-id>
+
+Where "<container-id>" is the name for the instance of the container.`,
+	Description: `The events command displays information about the container. By default the
+information is displayed once every 5 seconds.`,
+	Flags: []cli.Flag{
+		cli.DurationFlag{Name: "interval", Value: 5 * time.Second, Usage: "set the stats collection interval"},
+		cli.BoolFlag{Name: "stats", Usage: "display the container's stats then exit"},
+	},
+	Action: func(context *cli.Context) error {
+		if err := checkArgs(context, 1, exactArgs); err != nil {
+			return err
+		}
+		container, err := getContainer(context)
+		if err != nil {
+			return err
+		}
+		// Reject a zero or negative interval up front.
+		duration := context.Duration("interval")
+		if duration <= 0 {
+			return fmt.Errorf("duration interval must be greater than 0")
+		}
+		status, err := container.Status()
+		if err != nil {
+			return err
+		}
+		if status == libcontainer.Stopped {
+			return fmt.Errorf("container with id %s is not running", container.ID())
+		}
+		var (
+			stats  = make(chan *libcontainer.Stats, 1)
+			events = make(chan *types.Event, 1024)
+			group  = &sync.WaitGroup{}
+		)
+		// Printer goroutine: encodes every event to stdout until the events
+		// channel is closed. Encoding errors are logged, not fatal.
+		group.Add(1)
+		go func() {
+			defer group.Done()
+			enc := json.NewEncoder(os.Stdout)
+			for e := range events {
+				if err := enc.Encode(e); err != nil {
+					logrus.Error(err)
+				}
+			}
+		}()
+		// One-shot mode: emit a single stats event, let the printer drain it
+		// and return.
+		if context.Bool("stats") {
+			s, err := container.Stats()
+			if err != nil {
+				// NOTE(review): events is not closed on this path, so the
+				// printer goroutine is still blocked when we return.
+				return err
+			}
+			events <- &types.Event{Type: "stats", ID: container.ID(), Data: convertLibcontainerStats(s)}
+			close(events)
+			group.Wait()
+			return nil
+		}
+		// Poller goroutine: collects stats once per interval; a failed
+		// collection is logged and skipped.
+		// NOTE(review): this goroutine has no stop condition of its own.
+		go func() {
+			for range time.Tick(context.Duration("interval")) {
+				s, err := container.Stats()
+				if err != nil {
+					logrus.Error(err)
+					continue
+				}
+				stats <- s
+			}
+		}()
+		n, err := container.NotifyOOM()
+		if err != nil {
+			return err
+		}
+		for {
+			select {
+			case _, ok := <-n:
+				if ok {
+					// this means an oom event was received, if it is !ok then
+					// the channel was closed because the container stopped and
+					// the cgroups no longer exist.
+					events <- &types.Event{Type: "oom", ID: container.ID()}
+				} else {
+					n = nil
+				}
+			case s := <-stats:
+				events <- &types.Event{Type: "stats", ID: container.ID(), Data: convertLibcontainerStats(s)}
+			}
+			// This check sits after the select, so the break leaves the
+			// enclosing for loop.
+			if n == nil {
+				close(events)
+				break
+			}
+		}
+		// Wait for the printer to flush the remaining events.
+		group.Wait()
+		return nil
+	},
+}
+
+func convertLibcontainerStats(ls *libcontainer.Stats) *types.Stats {
+	cg := ls.CgroupStats
+	if cg == nil {
+		return nil
+	}
+	var s types.Stats
+	s.Pids.Current = cg.PidsStats.Current
+	s.Pids.Limit = cg.PidsStats.Limit
+
+	s.CPU.Usage.Kernel = cg.CpuStats.CpuUsage.UsageInKernelmode
+	s.CPU.Usage.User = cg.CpuStats.CpuUsage.UsageInUsermode
+	s.CPU.Usage.Total = cg.CpuStats.CpuUsage.TotalUsage
+	s.CPU.Usage.Percpu = cg.CpuStats.CpuUsage.PercpuUsage
+	s.CPU.Throttling.Periods = cg.CpuStats.ThrottlingData.Periods
+	s.CPU.Throttling.ThrottledPeriods = cg.CpuStats.ThrottlingData.ThrottledPeriods
+	s.CPU.Throttling.ThrottledTime = cg.CpuStats.ThrottlingData.ThrottledTime
+
+	s.Memory.Cache = cg.MemoryStats.Cache
+	s.Memory.Kernel = convertMemoryEntry(cg.MemoryStats.KernelUsage)
+	s.Memory.KernelTCP = convertMemoryEntry(cg.MemoryStats.KernelTCPUsage)
+	s.Memory.Swap = convertMemoryEntry(cg.MemoryStats.SwapUsage)
+	s.Memory.Usage = convertMemoryEntry(cg.MemoryStats.Usage)
+	s.Memory.Raw = cg.MemoryStats.Stats
+
+	s.Blkio.IoServiceBytesRecursive = convertBlkioEntry(cg.BlkioStats.IoServiceBytesRecursive)
+	s.Blkio.IoServicedRecursive = convertBlkioEntry(cg.BlkioStats.IoServicedRecursive)
+	s.Blkio.IoQueuedRecursive = convertBlkioEntry(cg.BlkioStats.IoQueuedRecursive)
+	s.Blkio.IoServiceTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoServiceTimeRecursive)
+	s.Blkio.IoWaitTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoWaitTimeRecursive)
+	s.Blkio.IoMergedRecursive = convertBlkioEntry(cg.BlkioStats.IoMergedRecursive)
+	s.Blkio.IoTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoTimeRecursive)
+	s.Blkio.SectorsRecursive = convertBlkioEntry(cg.BlkioStats.SectorsRecursive)
+
+	s.Hugetlb = make(map[string]types.Hugetlb)
+	for k, v := range cg.HugetlbStats {
+		s.Hugetlb[k] = convertHugtlb(v)
+	}
+
+	if is := ls.IntelRdtStats; is != nil {
+		if intelrdt.IsCatEnabled() {
+			s.IntelRdt.L3CacheInfo = convertL3CacheInfo(is.L3CacheInfo)
+			s.IntelRdt.L3CacheSchemaRoot = is.L3CacheSchemaRoot
+			s.IntelRdt.L3CacheSchema = is.L3CacheSchema
+		}
+		if intelrdt.IsMbaEnabled() {
+			s.IntelRdt.MemBwInfo = convertMemBwInfo(is.MemBwInfo)
+			s.IntelRdt.MemBwSchemaRoot = is.MemBwSchemaRoot
+			s.IntelRdt.MemBwSchema = is.MemBwSchema
+		}
+	}
+
+	s.NetworkInterfaces = ls.Interfaces
+	return &s
+}
+
+func convertHugtlb(c cgroups.HugetlbStats) types.Hugetlb {
+	return types.Hugetlb{
+		Usage:   c.Usage,
+		Max:     c.MaxUsage,
+		Failcnt: c.Failcnt,
+	}
+}
+
+func convertMemoryEntry(c cgroups.MemoryData) types.MemoryEntry {
+	return types.MemoryEntry{
+		Limit:   c.Limit,
+		Usage:   c.Usage,
+		Max:     c.MaxUsage,
+		Failcnt: c.Failcnt,
+	}
+}
+
+// convertBlkioEntry converts each cgroups.BlkioStatEntry into a
+// types.BlkioEntry, preserving order. The result is a nil slice when c is
+// empty.
+func convertBlkioEntry(c []cgroups.BlkioStatEntry) []types.BlkioEntry {
+	// Deliberately not pre-sized: an empty input must yield a nil slice.
+	var converted []types.BlkioEntry
+	for i := range c {
+		converted = append(converted, types.BlkioEntry{
+			Major: c[i].Major,
+			Minor: c[i].Minor,
+			Op:    c[i].Op,
+			Value: c[i].Value,
+		})
+	}
+	return converted
+}
+
+// convertL3CacheInfo copies an intelrdt.L3CacheInfo into a newly allocated
+// types.L3CacheInfo. A nil input yields a nil result instead of a nil-pointer
+// panic.
+func convertL3CacheInfo(i *intelrdt.L3CacheInfo) *types.L3CacheInfo {
+	if i == nil {
+		return nil
+	}
+	return &types.L3CacheInfo{
+		CbmMask:    i.CbmMask,
+		MinCbmBits: i.MinCbmBits,
+		NumClosids: i.NumClosids,
+	}
+}
+
+// convertMemBwInfo copies an intelrdt.MemBwInfo into a newly allocated
+// types.MemBwInfo. A nil input yields a nil result instead of a nil-pointer
+// panic.
+func convertMemBwInfo(i *intelrdt.MemBwInfo) *types.MemBwInfo {
+	if i == nil {
+		return nil
+	}
+	return &types.MemBwInfo{
+		BandwidthGran: i.BandwidthGran,
+		DelayLinear:   i.DelayLinear,
+		MinBandwidth:  i.MinBandwidth,
+		NumClosids:    i.NumClosids,
+	}
+}
--- a/exec.go
+++ b/exec.go
@ -0,0 +1,235 @@
+// +build linux
+
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer"
+	"github.com/opencontainers/runc/libcontainer/utils"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/urfave/cli"
+)
+
+// execCommand defines the "exec" subcommand, which runs an additional process
+// inside an existing container. The process is described either by the
+// command-line flags below or by a process.json file given with "-p".
+var execCommand = cli.Command{
+	Name:  "exec",
+	Usage: "execute new process inside the container",
+	ArgsUsage: `<container-id> <command> [command options]  || -p process.json <container-id>
+
+Where "<container-id>" is the name for the instance of the container and
+"<command>" is the command to be executed in the container.
+"<command>" can't be empty unless a "-p" flag provided.
+
+EXAMPLE:
+For example, if the container is configured to run the linux ps command the
+following will output a list of processes running in the container:
+
+       # runc exec <container-id> ps`,
+	Flags: []cli.Flag{
+		cli.StringFlag{
+			Name:  "console-socket",
+			Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
+		},
+		cli.StringFlag{
+			Name:  "cwd",
+			Usage: "current working directory in the container",
+		},
+		cli.StringSliceFlag{
+			Name:  "env, e",
+			Usage: "set environment variables",
+		},
+		cli.BoolFlag{
+			Name:  "tty, t",
+			Usage: "allocate a pseudo-TTY",
+		},
+		cli.StringFlag{
+			Name:  "user, u",
+			Usage: "UID (format: <uid>[:<gid>])",
+		},
+		cli.Int64SliceFlag{
+			Name:  "additional-gids, g",
+			Usage: "additional gids",
+		},
+		cli.StringFlag{
+			Name:  "process, p",
+			Usage: "path to the process.json",
+		},
+		cli.BoolFlag{
+			Name:  "detach,d",
+			Usage: "detach from the container's process",
+		},
+		cli.StringFlag{
+			Name:  "pid-file",
+			Value: "",
+			Usage: "specify the file to write the process id to",
+		},
+		cli.StringFlag{
+			Name:  "process-label",
+			Usage: "set the asm process label for the process commonly used with selinux",
+		},
+		cli.StringFlag{
+			Name:  "apparmor",
+			Usage: "set the apparmor profile for the process",
+		},
+		cli.BoolFlag{
+			Name:  "no-new-privs",
+			Usage: "set the no new privileges value for the process",
+		},
+		cli.StringSliceFlag{
+			Name:  "cap, c",
+			Value: &cli.StringSlice{},
+			Usage: "add a capability to the bounding set for the process",
+		},
+		// Hidden from the help output; execProcess always builds its runner
+		// with enableSubreaper set to false.
+		cli.BoolFlag{
+			Name:   "no-subreaper",
+			Usage:  "disable the use of the subreaper used to reap reparented processes",
+			Hidden: true,
+		},
+		cli.IntFlag{
+			Name:  "preserve-fds",
+			Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
+		},
+	},
+	Action: func(context *cli.Context) error {
+		// NOTE(review): presumably requires at least one positional argument
+		// (the container id); checkArgs is defined elsewhere.
+		if err := checkArgs(context, 1, minArgs); err != nil {
+			return err
+		}
+		if err := revisePidFile(context); err != nil {
+			return err
+		}
+		status, err := execProcess(context)
+		if err == nil {
+			// On success the status returned by execProcess becomes the exit
+			// code of this runc invocation; os.Exit does not return.
+			os.Exit(status)
+		}
+		return fmt.Errorf("exec failed: %v", err)
+	},
+	SkipArgReorder: true,
+}
+
+// execProcess runs a new process inside the container named on the command
+// line and returns the status produced by the runner.
+//
+// It refuses to act on a stopped container and requires either a command on
+// the command line or a process file given through the "process" flag. The
+// bundle path is looked up from the "bundle" label stored in the container
+// state. On any failure the returned status is -1.
+func execProcess(context *cli.Context) (int, error) {
+	container, err := getContainer(context)
+	if err != nil {
+		return -1, err
+	}
+	switch status, err := container.Status(); {
+	case err != nil:
+		return -1, err
+	case status == libcontainer.Stopped:
+		return -1, fmt.Errorf("cannot exec a container that has stopped")
+	}
+	// Without a process file, something besides the container id must be given.
+	if context.String("process") == "" && len(context.Args()) == 1 {
+		return -1, fmt.Errorf("process args cannot be empty")
+	}
+	state, err := container.State()
+	if err != nil {
+		return -1, err
+	}
+	p, err := getProcess(context, utils.SearchLabels(state.Config.Labels, "bundle"))
+	if err != nil {
+		return -1, err
+	}
+
+	level := "info"
+	if context.GlobalBool("debug") {
+		level = "debug"
+	}
+
+	// enableSubreaper, shouldDestroy and init are left at their zero value
+	// (false): an exec'ed process is not the container's init process.
+	execRunner := runner{
+		container:     container,
+		consoleSocket: context.String("console-socket"),
+		detach:        context.Bool("detach"),
+		pidFile:       context.String("pid-file"),
+		action:        CT_ACT_RUN,
+		preserveFDs:   context.Int("preserve-fds"),
+		logLevel:      level,
+	}
+	return (&execRunner).run(p)
+}
+
+// getProcess builds the process specification for "runc exec".
+//
+// If the "process" flag names a file, the specification is decoded from that
+// JSON file and the remaining flags are not consulted. Otherwise the working
+// directory is changed to bundle, the process section of the bundle's spec is
+// loaded, and it is overridden with the command-line arguments and flags
+// (cwd, apparmor, process-label, cap, env, tty, no-new-privs, user,
+// additional-gids).
+//
+// The user and group ids must be non-negative decimal numbers that fit in 32
+// bits; values outside that range are rejected instead of silently wrapping.
+// The returned specification is passed through validateProcessSpec, whose
+// error (if any) is returned alongside it.
+func getProcess(context *cli.Context, bundle string) (*specs.Process, error) {
+	if path := context.String("process"); path != "" {
+		f, err := os.Open(path)
+		if err != nil {
+			return nil, err
+		}
+		defer f.Close()
+		var p specs.Process
+		if err := json.NewDecoder(f).Decode(&p); err != nil {
+			return nil, err
+		}
+		return &p, validateProcessSpec(&p)
+	}
+	// process via cli flags
+	if err := os.Chdir(bundle); err != nil {
+		return nil, err
+	}
+	spec, err := loadSpec(specConfig)
+	if err != nil {
+		return nil, err
+	}
+	p := spec.Process
+	p.Args = context.Args()[1:]
+	// override the cwd, if passed
+	if cwd := context.String("cwd"); cwd != "" {
+		p.Cwd = cwd
+	}
+	if ap := context.String("apparmor"); ap != "" {
+		p.ApparmorProfile = ap
+	}
+	if l := context.String("process-label"); l != "" {
+		p.SelinuxLabel = l
+	}
+	if caps := context.StringSlice("cap"); len(caps) > 0 {
+		// The bundle's spec may carry no capabilities section at all; start
+		// from an empty set rather than dereferencing a nil pointer.
+		if p.Capabilities == nil {
+			p.Capabilities = &specs.LinuxCapabilities{}
+		}
+		// Every requested capability is added to all five sets.
+		p.Capabilities.Bounding = append(p.Capabilities.Bounding, caps...)
+		p.Capabilities.Inheritable = append(p.Capabilities.Inheritable, caps...)
+		p.Capabilities.Effective = append(p.Capabilities.Effective, caps...)
+		p.Capabilities.Permitted = append(p.Capabilities.Permitted, caps...)
+		p.Capabilities.Ambient = append(p.Capabilities.Ambient, caps...)
+	}
+	// append the passed env variables
+	p.Env = append(p.Env, context.StringSlice("env")...)
+
+	// set the tty
+	if context.IsSet("tty") {
+		p.Terminal = context.Bool("tty")
+	}
+	if context.IsSet("no-new-privs") {
+		p.NoNewPrivileges = context.Bool("no-new-privs")
+	}
+	// override the user, if passed (format: <uid>[:<gid>])
+	if user := context.String("user"); user != "" {
+		u := strings.SplitN(user, ":", 2)
+		if len(u) > 1 {
+			// ParseUint with bitSize 32 rejects negative and oversized
+			// values that a plain int -> uint32 conversion would wrap.
+			gid, err := strconv.ParseUint(u[1], 10, 32)
+			if err != nil {
+				return nil, fmt.Errorf("parsing %s as int for gid failed: %v", u[1], err)
+			}
+			p.User.GID = uint32(gid)
+		}
+		uid, err := strconv.ParseUint(u[0], 10, 32)
+		if err != nil {
+			return nil, fmt.Errorf("parsing %s as int for uid failed: %v", u[0], err)
+		}
+		p.User.UID = uint32(uid)
+	}
+	// Largest value representable in the uint32 AdditionalGids entries.
+	const maxGID int64 = 1<<32 - 1
+	for _, gid := range context.Int64Slice("additional-gids") {
+		if gid < 0 {
+			return nil, fmt.Errorf("additional-gids must be a positive number %d", gid)
+		}
+		if gid > maxGID {
+			return nil, fmt.Errorf("additional-gids must not exceed %d: %d", maxGID, gid)
+		}
+		p.User.AdditionalGids = append(p.User.AdditionalGids, uint32(gid))
+	}
+	return p, validateProcessSpec(p)
+}
--- a/init.go
+++ b/init.go
@ -0,0 +1,50 @@
+package main
+
+import (
+	"fmt"
+	"os"
+	"runtime"
+
+	"github.com/opencontainers/runc/libcontainer"
+	"github.com/opencontainers/runc/libcontainer/logs"
+	_ "github.com/opencontainers/runc/libcontainer/nsenter"
+	"github.com/sirupsen/logrus"
+	"github.com/urfave/cli"
+)
+
+// init prepares the process when the binary is invoked with "init" as its
+// first argument: it restricts the Go runtime to a single processor, locks
+// the goroutine to its OS thread, and configures JSON logging from the
+// _LIBCONTAINER_LOGLEVEL and _LIBCONTAINER_LOGPIPE environment variables.
+// For any other invocation it does nothing. Logging setup failures panic.
+func init() {
+	isInit := len(os.Args) > 1 && os.Args[1] == "init"
+	if !isInit {
+		return
+	}
+
+	runtime.GOMAXPROCS(1)
+	runtime.LockOSThread()
+
+	rawLevel := os.Getenv("_LIBCONTAINER_LOGLEVEL")
+	parsedLevel, err := logrus.ParseLevel(rawLevel)
+	if err != nil {
+		panic(fmt.Sprintf("libcontainer: failed to parse log level: %q: %v", rawLevel, err))
+	}
+
+	logConfig := logs.Config{
+		LogPipeFd: os.Getenv("_LIBCONTAINER_LOGPIPE"),
+		LogFormat: "json",
+		LogLevel:  parsedLevel,
+	}
+	if err := logs.ConfigureLogging(logConfig); err != nil {
+		panic(fmt.Sprintf("libcontainer: failed to configure logging: %v", err))
+	}
+	logrus.Debug("child process in init()")
+}
+
+// initCommand defines the "init" subcommand, which hands control to
+// libcontainer's container initialization. The action never returns normally:
+// it either exits with status 1 or panics.
+var initCommand = cli.Command{
+	Name:  "init",
+	Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
+	Action: func(context *cli.Context) error {
+		factory, _ := libcontainer.New("")
+		if err := factory.StartInitialization(); err == nil {
+			// Initialization came back without reporting an error.
+			panic("libcontainer: container init failed to exec")
+		}
+		// The error is sent back to the parent process, which handles it, so
+		// it is neither logged nor written to stderr here.
+		os.Exit(1)
+		return nil // not reached: os.Exit does not return
+	},
+}
--- a/kill.go
+++ b/kill.go
@ -0,0 +1,68 @@
+// +build linux
+
+package main
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"syscall"
+
+	"github.com/urfave/cli"
+)
+
+// killCommand defines the "kill" subcommand, which delivers a signal to a
+// container. The signal is taken from the second positional argument and
+// falls back to SIGTERM when that argument is absent or empty; the "all" flag
+// is forwarded to the container's Signal method.
+var killCommand = cli.Command{
+	Name:  "kill",
+	Usage: "kill sends the specified signal (default: SIGTERM) to the container's init process",
+	ArgsUsage: `<container-id> [signal]
+
+Where "<container-id>" is the name for the instance of the container and
+"[signal]" is the signal to be sent to the init process.
+
+EXAMPLE:
+For example, if the container id is "ubuntu01" the following will send a "KILL"
+signal to the init process of the "ubuntu01" container:
+	 
+       # runc kill ubuntu01 KILL`,
+	Flags: []cli.Flag{
+		cli.BoolFlag{
+			Name:  "all, a",
+			Usage: "send the specified signal to all processes inside the container",
+		},
+	},
+	Action: func(context *cli.Context) error {
+		// Check the lower bound first, then the upper bound, stopping at the
+		// first violation.
+		err := checkArgs(context, 1, minArgs)
+		if err == nil {
+			err = checkArgs(context, 2, maxArgs)
+		}
+		if err != nil {
+			return err
+		}
+
+		target, err := getContainer(context)
+		if err != nil {
+			return err
+		}
+
+		rawSig := "SIGTERM"
+		if arg := context.Args().Get(1); arg != "" {
+			rawSig = arg
+		}
+		sig, err := parseSignal(rawSig)
+		if err != nil {
+			return err
+		}
+		return target.Signal(sig, context.Bool("all"))
+	},
+}
+
+// parseSignal turns a user-supplied signal description into a syscall.Signal.
+//
+// A value that parses as a decimal integer is used as the signal number
+// as-is. Anything else is upper-cased, stripped of a leading "SIG", and
+// looked up in signalMap; an unknown name yields -1 and an error.
+func parseSignal(rawSignal string) (syscall.Signal, error) {
+	if number, convErr := strconv.Atoi(rawSignal); convErr == nil {
+		return syscall.Signal(number), nil
+	}
+
+	name := strings.ToUpper(rawSignal)
+	name = strings.TrimPrefix(name, "SIG")
+	if sig, found := signalMap[name]; found {
+		return sig, nil
+	}
+	return -1, fmt.Errorf("unknown signal %q", rawSignal)
+}
--- a/libcontainer/README.md
+++ b/libcontainer/README.md
@ -0,0 +1,331 @@
+# libcontainer
+
+[![GoDoc](https://godoc.org/github.com/opencontainers/runc/libcontainer?status.svg)](https://godoc.org/github.com/opencontainers/runc/libcontainer)
+
+Libcontainer provides a native Go implementation for creating containers
+with namespaces, cgroups, capabilities, and filesystem access controls.
+It allows you to manage the lifecycle of the container performing additional operations
+after the container is created.
+
+
+#### Container
+A container is a self contained execution environment that shares the kernel of the
+host system and which is (optionally) isolated from other containers in the system.
+
+#### Using libcontainer
+
+Because containers are spawned in a two-step process, you will need a binary that
+will be executed as the init process for the container. In libcontainer, we
+re-execute the current binary (/proc/self/exe) with the argument "init" to serve
+as that init process. We call this first step "bootstrap", so your binary always
+needs an "init" function as the entry point of "bootstrap".
+
+In addition to the go init function the early stage bootstrap is handled by importing
+[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md).
+
+```go
+import (
+	_ "github.com/opencontainers/runc/libcontainer/nsenter"
+)
+
+func init() {
+	if len(os.Args) > 1 && os.Args[1] == "init" {
+		runtime.GOMAXPROCS(1)
+		runtime.LockOSThread()
+		factory, _ := libcontainer.New("")
+		if err := factory.StartInitialization(); err != nil {
+			logrus.Fatal(err)
+		}
+		panic("--this line should have never been executed, congratulations--")
+	}
+}
+```
+
+Then to create a container you first have to initialize an instance of a factory
+that will handle the creation and initialization for a container.
+
+```go
+factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
+if err != nil {
+	logrus.Fatal(err)
+	return
+}
+```
+
+Once you have an instance of the factory created we can create a configuration
+struct describing how the container is to be created. A sample would look similar to this:
+
+```go
+defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
+config := &configs.Config{
+	Rootfs: "/your/path/to/rootfs",
+	Capabilities: &configs.Capabilities{
+                Bounding: []string{
+                        "CAP_CHOWN",
+                        "CAP_DAC_OVERRIDE",
+                        "CAP_FSETID",
+                        "CAP_FOWNER",
+                        "CAP_MKNOD",
+                        "CAP_NET_RAW",
+                        "CAP_SETGID",
+                        "CAP_SETUID",
+                        "CAP_SETFCAP",
+                        "CAP_SETPCAP",
+                        "CAP_NET_BIND_SERVICE",
+                        "CAP_SYS_CHROOT",
+                        "CAP_KILL",
+                        "CAP_AUDIT_WRITE",
+                },
+                Effective: []string{
+                        "CAP_CHOWN",
+                        "CAP_DAC_OVERRIDE",
+                        "CAP_FSETID",
+                        "CAP_FOWNER",
+                        "CAP_MKNOD",
+                        "CAP_NET_RAW",
+                        "CAP_SETGID",
+                        "CAP_SETUID",
+                        "CAP_SETFCAP",
+                        "CAP_SETPCAP",
+                        "CAP_NET_BIND_SERVICE",
+                        "CAP_SYS_CHROOT",
+                        "CAP_KILL",
+                        "CAP_AUDIT_WRITE",
+                },
+                Inheritable: []string{
+                        "CAP_CHOWN",
+                        "CAP_DAC_OVERRIDE",
+                        "CAP_FSETID",
+                        "CAP_FOWNER",
+                        "CAP_MKNOD",
+                        "CAP_NET_RAW",
+                        "CAP_SETGID",
+                        "CAP_SETUID",
+                        "CAP_SETFCAP",
+                        "CAP_SETPCAP",
+                        "CAP_NET_BIND_SERVICE",
+                        "CAP_SYS_CHROOT",
+                        "CAP_KILL",
+                        "CAP_AUDIT_WRITE",
+                },
+                Permitted: []string{
+                        "CAP_CHOWN",
+                        "CAP_DAC_OVERRIDE",
+                        "CAP_FSETID",
+                        "CAP_FOWNER",
+                        "CAP_MKNOD",
+                        "CAP_NET_RAW",
+                        "CAP_SETGID",
+                        "CAP_SETUID",
+                        "CAP_SETFCAP",
+                        "CAP_SETPCAP",
+                        "CAP_NET_BIND_SERVICE",
+                        "CAP_SYS_CHROOT",
+                        "CAP_KILL",
+                        "CAP_AUDIT_WRITE",
+                },
+                Ambient: []string{
+                        "CAP_CHOWN",
+                        "CAP_DAC_OVERRIDE",
+                        "CAP_FSETID",
+                        "CAP_FOWNER",
+                        "CAP_MKNOD",
+                        "CAP_NET_RAW",
+                        "CAP_SETGID",
+                        "CAP_SETUID",
+                        "CAP_SETFCAP",
+                        "CAP_SETPCAP",
+                        "CAP_NET_BIND_SERVICE",
+                        "CAP_SYS_CHROOT",
+                        "CAP_KILL",
+                        "CAP_AUDIT_WRITE",
+                },
+        },
+	Namespaces: configs.Namespaces([]configs.Namespace{
+		{Type: configs.NEWNS},
+		{Type: configs.NEWUTS},
+		{Type: configs.NEWIPC},
+		{Type: configs.NEWPID},
+		{Type: configs.NEWUSER},
+		{Type: configs.NEWNET},
+		{Type: configs.NEWCGROUP},
+	}),
+	Cgroups: &configs.Cgroup{
+		Name:   "test-container",
+		Parent: "system",
+		Resources: &configs.Resources{
+			MemorySwappiness: nil,
+			AllowAllDevices:  nil,
+			AllowedDevices:   configs.DefaultAllowedDevices,
+		},
+	},
+	MaskPaths: []string{
+		"/proc/kcore",
+		"/sys/firmware",
+	},
+	ReadonlyPaths: []string{
+		"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
+	},
+	Devices:  configs.DefaultAutoCreatedDevices,
+	Hostname: "testing",
+	Mounts: []*configs.Mount{
+		{
+			Source:      "proc",
+			Destination: "/proc",
+			Device:      "proc",
+			Flags:       defaultMountFlags,
+		},
+		{
+			Source:      "tmpfs",
+			Destination: "/dev",
+			Device:      "tmpfs",
+			Flags:       unix.MS_NOSUID | unix.MS_STRICTATIME,
+			Data:        "mode=755",
+		},
+		{
+			Source:      "devpts",
+			Destination: "/dev/pts",
+			Device:      "devpts",
+			Flags:       unix.MS_NOSUID | unix.MS_NOEXEC,
+			Data:        "newinstance,ptmxmode=0666,mode=0620,gid=5",
+		},
+		{
+			Device:      "tmpfs",
+			Source:      "shm",
+			Destination: "/dev/shm",
+			Data:        "mode=1777,size=65536k",
+			Flags:       defaultMountFlags,
+		},
+		{
+			Source:      "mqueue",
+			Destination: "/dev/mqueue",
+			Device:      "mqueue",
+			Flags:       defaultMountFlags,
+		},
+		{
+			Source:      "sysfs",
+			Destination: "/sys",
+			Device:      "sysfs",
+			Flags:       defaultMountFlags | unix.MS_RDONLY,
+		},
+	},
+	UidMappings: []configs.IDMap{
+		{
+			ContainerID: 0,
+			HostID: 1000,
+			Size: 65536,
+		},
+	},
+	GidMappings: []configs.IDMap{
+		{
+			ContainerID: 0,
+			HostID: 1000,
+			Size: 65536,
+		},
+	},
+	Networks: []*configs.Network{
+		{
+			Type:    "loopback",
+			Address: "127.0.0.1/0",
+			Gateway: "localhost",
+		},
+	},
+	Rlimits: []configs.Rlimit{
+		{
+			Type: unix.RLIMIT_NOFILE,
+			Hard: uint64(1025),
+			Soft: uint64(1025),
+		},
+	},
+}
+```
+
+Once you have the configuration populated you can create a container:
+
+```go
+container, err := factory.Create("container-id", config)
+if err != nil {
+	logrus.Fatal(err)
+	return
+}
+```
+
+To spawn bash as the initial process inside the container and have the
+process's pid returned in order to wait, signal, or kill the process:
+
+```go
+process := &libcontainer.Process{
+	Args:   []string{"/bin/bash"},
+	Env:    []string{"PATH=/bin"},
+	User:   "daemon",
+	Stdin:  os.Stdin,
+	Stdout: os.Stdout,
+	Stderr: os.Stderr,
+	Init:   true,
+}
+
+err := container.Run(process)
+if err != nil {
+	container.Destroy()
+	logrus.Fatal(err)
+	return
+}
+
+// wait for the process to finish.
+_, err := process.Wait()
+if err != nil {
+	logrus.Fatal(err)
+}
+
+// destroy the container.
+container.Destroy()
+```
+
+Additional ways to interact with a running container are:
+
+```go
+// return all the pids for all processes running inside the container.
+processes, err := container.Processes()
+
+// get detailed cpu, memory, io, and network statistics for the container and
+// its processes.
+stats, err := container.Stats()
+
+// pause all processes inside the container.
+container.Pause()
+
+// resume all paused processes.
+container.Resume()
+
+// send signal to container's init process.
+container.Signal(signal)
+
+// update container resource constraints.
+container.Set(config)
+
+// get current status of the container.
+status, err := container.Status()
+
+// get current container's state information.
+state, err := container.State()
+```
+
+
+#### Checkpoint & Restore
+
+libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
+This lets you save the state of a process running inside a container to disk, and then restore
+that state into a new process, on the same machine or on another machine.
+
+`criu` version 1.5.2 or higher is required to use checkpoint and restore.
+If you don't already have `criu` installed, you can build it from source, following the
+[online instructions](http://criu.org/Installation). `criu` is also installed in the docker image
+generated when building libcontainer with docker.
+
+
+## Copyright and license
+
+Code and documentation copyright 2014 Docker, inc.
+The code and documentation are released under the [Apache 2.0 license](../LICENSE).
+The documentation is also released under Creative Commons Attribution 4.0 International License.
+You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/.
--- a/libcontainer/SPEC.md
+++ b/libcontainer/SPEC.md
@ -0,0 +1,465 @@
+## Container Specification - v1
+
+This is the standard configuration for version 1 containers.  It includes
+namespaces, standard filesystem setup, a default Linux capability set, and
+information about resource reservations.  It also has information about any 
+populated environment settings for the processes running inside a container.
+
+Along with the configuration of how a container is created the standard also
+discusses actions that can be performed on a container to manage and inspect
+information about the processes running inside.
+
+The v1 profile is meant to be able to accommodate the majority of applications
+with a strong security configuration.
+
+### System Requirements and Compatibility
+
+Minimum requirements:
+* Kernel version - 3.10 recommended, 2.6.2x minimum (with backported patches)
+* Mounted cgroups with each subsystem in its own hierarchy
+
+
+### Namespaces
+
+|     Flag        | Enabled |
+| --------------- | ------- |
+| CLONE_NEWPID    |    1    |
+| CLONE_NEWUTS    |    1    |
+| CLONE_NEWIPC    |    1    |
+| CLONE_NEWNET    |    1    |
+| CLONE_NEWNS     |    1    |
+| CLONE_NEWUSER   |    1    |
+| CLONE_NEWCGROUP |    1    |
+
+Namespaces are created for the container via the `unshare` syscall.
+
+
+### Filesystem
+
+A root filesystem must be provided to a container for execution.  The container
+will use this root filesystem (rootfs) to jail and spawn processes inside where
+the binaries and system libraries are local to that directory.  Any binaries
+to be executed must be contained within this rootfs.
+
+Mounts that happen inside the container are automatically cleaned up when the
+container exits as the mount namespace is destroyed and the kernel will 
+unmount all the mounts that were setup within that namespace.
+
+For a container to execute properly there are certain filesystems that 
+are required to be mounted within the rootfs that the runtime will setup.
+
+|     Path    |  Type  |                  Flags                 |                 Data                     |
+| ----------- | ------ | -------------------------------------- | ---------------------------------------- |
+| /proc       | proc   | MS_NOEXEC,MS_NOSUID,MS_NODEV           |                                          |
+| /dev        | tmpfs  | MS_NOEXEC,MS_STRICTATIME               | mode=755                                 |
+| /dev/shm    | tmpfs  | MS_NOEXEC,MS_NOSUID,MS_NODEV           | mode=1777,size=65536k                    |
+| /dev/mqueue | mqueue | MS_NOEXEC,MS_NOSUID,MS_NODEV           |                                          |
+| /dev/pts    | devpts | MS_NOEXEC,MS_NOSUID                    | newinstance,ptmxmode=0666,mode=620,gid=5 |
+| /sys        | sysfs  | MS_NOEXEC,MS_NOSUID,MS_NODEV,MS_RDONLY |                                          |
+
+
+After a container's filesystems are mounted within the newly created 
+mount namespace `/dev` will need to be populated with a set of device nodes.
+It is expected that a rootfs does not need to have any device nodes specified
+for `/dev` within the rootfs as the container will setup the correct devices
+that are required for executing a container's process.
+
+|      Path    | Mode |   Access   |
+| ------------ | ---- | ---------- |
+| /dev/null    | 0666 |  rwm       |
+| /dev/zero    | 0666 |  rwm       |
+| /dev/full    | 0666 |  rwm       |
+| /dev/tty     | 0666 |  rwm       |
+| /dev/random  | 0666 |  rwm       |
+| /dev/urandom | 0666 |  rwm       |
+
+
+**ptmx**
+`/dev/ptmx` will need to be a symlink to the host's `/dev/ptmx` within
+the container.  
+
+The use of a pseudo TTY is optional within a container, and it should support
+running both with and without one.
+If a pseudo TTY is provided to the container, `/dev/console` will need to be
+set up by binding the console in `/dev/` after it has been populated and
+mounted in tmpfs.
+
+|      Source     | Destination  | UID GID | Mode | Type |
+| --------------- | ------------ | ------- | ---- | ---- |
+| *pty host path* | /dev/console | 0 0     | 0600 | bind | 
+
+
+After `/dev/null` has been setup we check for any external links between
+the container's io, STDIN, STDOUT, STDERR.  If the container's io is pointing
+to `/dev/null` outside the container we close and `dup2` the `/dev/null` 
+that is local to the container's rootfs.
+
+
+After the container has `/proc` mounted a few standard symlinks are setup 
+within `/dev/` for the io.
+
+|    Source       | Destination |
+| --------------- | ----------- |
+| /proc/self/fd   | /dev/fd     |
+| /proc/self/fd/0 | /dev/stdin  |
+| /proc/self/fd/1 | /dev/stdout |
+| /proc/self/fd/2 | /dev/stderr |
+
+A `pivot_root` is used to change the root for the process, effectively 
+jailing the process inside the rootfs.
+
+```c
+put_old = mkdir(...);
+pivot_root(rootfs, put_old);
+chdir("/");
+unmount(put_old, MS_DETACH);
+rmdir(put_old);
+```
+
+For containers running with a rootfs inside `ramfs` a `MS_MOVE` combined
+with a `chroot` is required as `pivot_root` is not supported in `ramfs`.
+
+```c
+mount(rootfs, "/", NULL, MS_MOVE, NULL);
+chroot(".");
+chdir("/");
+```
+
+The `umask` is set back to `0022` after the filesystem setup has been completed.
+
+### Resources
+
+Cgroups are used to handle resource allocation for containers.  This includes
+system resources like cpu, memory, and device access.
+
+| Subsystem  | Enabled |
+| ---------- | ------- |
+| devices    | 1       |
+| memory     | 1       |
+| cpu        | 1       |
+| cpuacct    | 1       |
+| cpuset     | 1       |
+| blkio      | 1       |
+| perf_event | 1       |
+| freezer    | 1       |
+| hugetlb    | 1       |
+| pids       | 1       |
+
+
+All cgroup subsystems are joined so that statistics can be collected from
+each of the subsystems.  Freezer does not expose any stats but is joined
+so that containers can be paused and resumed.
+
+The parent process of the container's init must place the init pid inside
+the correct cgroups before the initialization begins.  This is done so
+that no processes or threads escape the cgroups.  This synchronization is
+done via a pipe (specified in the runtime section below) on which the
+container's init process blocks until the parent has finished setup.
+
+### IntelRdt
+
+Intel platforms with new Xeon CPU support Resource Director Technology (RDT).
+Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are
+two sub-features of RDT.
+
+Cache Allocation Technology (CAT) provides a way for the software to restrict
+cache allocation to a defined 'subset' of L3 cache which may be overlapping
+with other 'subsets'. The different subsets are identified by class of
+service (CLOS) and each CLOS has a capacity bitmask (CBM).
+
+Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle
+over memory bandwidth for the software. A user controls the resource by
+indicating the percentage of maximum memory bandwidth or memory bandwidth limit
+in MBps unit if MBA Software Controller is enabled.
+
+It can be used to handle L3 cache and memory bandwidth resources allocation
+for containers if hardware and kernel support Intel RDT CAT and MBA features.
+
+In Linux 4.10 kernel or newer, the interface is defined and exposed via
+"resource control" filesystem, which is a "cgroup-like" interface.
+
+Comparing with cgroups, it has similar process management lifecycle and
+interfaces in a container. But unlike cgroups' hierarchy, it has single level
+filesystem layout.
+
+CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via
+"resource control" filesystem.
+
+Intel RDT "resource control" filesystem hierarchy:
+```
+mount -t resctrl resctrl /sys/fs/resctrl
+tree /sys/fs/resctrl
+/sys/fs/resctrl/
+|-- info
+|   |-- L3
+|   |   |-- cbm_mask
+|   |   |-- min_cbm_bits
+|   |   |-- num_closids
+|   |-- MB
+|       |-- bandwidth_gran
+|       |-- delay_linear
+|       |-- min_bandwidth
+|       |-- num_closids
+|-- ...
+|-- schemata
+|-- tasks
+|-- <container_id>
+    |-- ...
+    |-- schemata
+    |-- tasks
+```
+
+For runc, we can make use of `tasks` and `schemata` configuration for L3
+cache and memory bandwidth resources constraints.
+
+The file `tasks` has a list of tasks that belong to this group (e.g., the
+"<container_id>" group). Tasks can be added to a group by writing the task ID
+to the "tasks" file (which will automatically remove them from the previous
+group to which they belonged). New tasks created by fork(2) and clone(2) are
+added to the same group as their parent.
+
+The file `schemata` has a list of all the resources available to this group.
+Each resource (L3 cache, memory bandwidth) has its own line and format.
+
+L3 cache schema:
+It has allocation bitmasks/values for L3 cache on each socket, which
+contains L3 cache id and capacity bitmask (CBM).
+```
+	Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
+```
+For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0"
+which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
+
+The valid L3 cache CBM is a *contiguous set of bits*, and the number of bits
+that can be set is at most the maximum CBM length. The maximum number of bits
+in the CBM varies among supported Intel CPU models. The kernel checks whether
+the value is valid when it is written. For example, the default value 0xfffff
+in the root group indicates that the maximum CBM length is 20 bits, which maps
+to the entire L3 cache capacity. Some valid CBM values to set in a group are
+0xf, 0xf0, 0x3ff and 0x1f00.
+
+Memory bandwidth schema:
+It has allocation values for memory bandwidth on each socket, which contains
+L3 cache id and memory bandwidth.
+```
+	Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
+```
+For example, on a two-socket machine, the schema line could be "MB:0=20;1=70"
+
+The minimum bandwidth percentage value for each CPU model is predefined and
+can be looked up through "info/MB/min_bandwidth". The bandwidth granularity
+that is allocated is also dependent on the CPU model and can be looked up at
+"info/MB/bandwidth_gran". The available bandwidth control steps are:
+min_bw + N * bw_gran. Intermediate values are rounded to the next control
+step available on the hardware.
+
+If MBA Software Controller is enabled through mount option "-o mba_MBps"
+mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl
+We could specify memory bandwidth in "MBps" (Mega Bytes per second) unit
+instead of "percentages". The kernel underneath would use a software feedback
+mechanism or a "Software Controller" which reads the actual bandwidth using
+MBM counters and adjusts the memory bandwidth percentages to ensure:
+"actual memory bandwidth < user specified memory bandwidth".
+
+For example, on a two-socket machine, the schema line could be
+"MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on socket 0
+and 7000 MBps memory bandwidth limit on socket 1.
+
+For more information about Intel RDT kernel interface:  
+https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
+
+```
+An example for runc:
+Consider a two-socket machine with two L3 caches where the default CBM is
+0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10%
+with a memory bandwidth granularity of 10%.
+
+Tasks inside the container only have access to the "upper" 7/11 of L3 cache
+on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a
+maximum memory bandwidth of 20% on socket 0 and 70% on socket 1.
+
+"linux": {
+    "intelRdt": {
+        "closID": "guaranteed_group",
+        "l3CacheSchema": "L3:0=7f0;1=1f",
+        "memBwSchema": "MB:0=20;1=70"
+    }
+}
+```
+
+### Security 
+
+The standard set of Linux capabilities that are set in a container
+provide a good default for security and flexibility for the applications.
+
+
+|     Capability       | Enabled |
+| -------------------- | ------- |
+| CAP_NET_RAW          | 1       |
+| CAP_NET_BIND_SERVICE | 1       |
+| CAP_AUDIT_READ       | 1       |
+| CAP_AUDIT_WRITE      | 1       |
+| CAP_DAC_OVERRIDE     | 1       |
+| CAP_SETFCAP          | 1       |
+| CAP_SETPCAP          | 1       |
+| CAP_SETGID           | 1       |
+| CAP_SETUID           | 1       |
+| CAP_MKNOD            | 1       |
+| CAP_CHOWN            | 1       |
+| CAP_FOWNER           | 1       |
+| CAP_FSETID           | 1       |
+| CAP_KILL             | 1       |
+| CAP_SYS_CHROOT       | 1       |
+| CAP_NET_BROADCAST    | 0       |
+| CAP_SYS_MODULE       | 0       |
+| CAP_SYS_RAWIO        | 0       |
+| CAP_SYS_PACCT        | 0       |
+| CAP_SYS_ADMIN        | 0       |
+| CAP_SYS_NICE         | 0       |
+| CAP_SYS_RESOURCE     | 0       |
+| CAP_SYS_TIME         | 0       |
+| CAP_SYS_TTY_CONFIG   | 0       |
+| CAP_AUDIT_CONTROL    | 0       |
+| CAP_MAC_OVERRIDE     | 0       |
+| CAP_MAC_ADMIN        | 0       |
+| CAP_NET_ADMIN        | 0       |
+| CAP_SYSLOG           | 0       |
+| CAP_DAC_READ_SEARCH  | 0       |
+| CAP_LINUX_IMMUTABLE  | 0       |
+| CAP_IPC_LOCK         | 0       |
+| CAP_IPC_OWNER        | 0       |
+| CAP_SYS_PTRACE       | 0       |
+| CAP_SYS_BOOT         | 0       |
+| CAP_LEASE            | 0       |
+| CAP_WAKE_ALARM       | 0       |
+| CAP_BLOCK_SUSPEND    | 0       |
+
+
+Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
+and [selinux](http://selinuxproject.org/page/Main_Page) can be used with
+the containers.  A container should support setting an apparmor profile or 
+selinux process and mount labels if provided in the configuration.  
+
+Standard apparmor profile:
+```c
+#include <tunables/global>
+profile <profile_name> flags=(attach_disconnected,mediate_deleted) {
+  #include <abstractions/base>
+  network,
+  capability,
+  file,
+  umount,
+
+  deny @{PROC}/sys/fs/** wklx,
+  deny @{PROC}/sysrq-trigger rwklx,
+  deny @{PROC}/mem rwklx,
+  deny @{PROC}/kmem rwklx,
+  deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx,
+  deny @{PROC}/sys/kernel/*/** wklx,
+
+  deny mount,
+
+  deny /sys/[^f]*/** wklx,
+  deny /sys/f[^s]*/** wklx,
+  deny /sys/fs/[^c]*/** wklx,
+  deny /sys/fs/c[^g]*/** wklx,
+  deny /sys/fs/cg[^r]*/** wklx,
+  deny /sys/firmware/efi/efivars/** rwklx,
+  deny /sys/kernel/security/** rwklx,
+}
+```
+
+*TODO: seccomp work is being done to find a good default config*
+
+### Runtime and Init Process
+
+During container creation the parent process needs to talk to the container's init 
+process and have a form of synchronization.  This is accomplished by creating
+a pipe that is passed to the container's init.  When the init process first spawns 
+it will block on its side of the pipe until the parent closes its side.  This
+allows the parent to have time to set the new process inside a cgroup hierarchy 
+and/or write any uid/gid mappings required for user namespaces.  
+The pipe is passed to the init process via FD 3.
+
+The application consuming libcontainer should be compiled statically.  libcontainer
+does not define any init process and the arguments provided are used to `exec` the
+process inside the application.  There should be no long running init within the 
+container spec.
+
+If a pseudo tty is provided to a container it will open and `dup2` the console
+as the container's STDIN, STDOUT, STDERR as well as mounting the console
+as `/dev/console`.
+
+An extra set of mounts is provided to a container and set up for use.  A container's
+rootfs can contain some non-portable files inside that can cause side effects during
+execution of a process.  These files are usually created and populated with
+container-specific information via the runtime.  
+
+**Extra runtime files:**
+* /etc/hosts 
+* /etc/resolv.conf
+* /etc/hostname
+* /etc/localtime
+
+
+#### Defaults
+
+There are a few defaults that can be overridden by users, but in their omission
+these apply to processes within a container.
+
+|       Type          |             Value              |
+| ------------------- | ------------------------------ |
+| Parent Death Signal | SIGKILL                        | 
+| UID                 | 0                              |
+| GID                 | 0                              |
+| GROUPS              | 0, NULL                        |
+| CWD                 | "/"                            |
+| $HOME               | Current user's home dir or "/" |
+| Readonly rootfs     | false                          |
+| Pseudo TTY          | false                          |
+
+
+## Actions
+
+After a container is created there is a standard set of actions that can
+be done to the container.  These actions are part of the public API for 
+a container.
+
+|     Action     |                         Description                                |
+| -------------- | ------------------------------------------------------------------ |
+| Get processes  | Return all the pids for processes running inside a container       | 
+| Get Stats      | Return resource statistics for the container as a whole            |
+| Wait           | Waits on the container's init process ( pid 1 )                    |
+| Wait Process   | Wait on any of the container's processes returning the exit status | 
+| Destroy        | Kill the container's init process and remove any filesystem state  |
+| Signal         | Send a signal to the container's init process                      |
+| Signal Process | Send a signal to any of the container's processes                  |
+| Pause          | Pause all processes inside the container                           |
+| Resume         | Resume all processes inside the container if paused                |
+| Exec           | Execute a new process inside of the container  ( requires setns )  |
+| Set            | Setup configs of the container after it's created                  |
+
+### Execute a new process inside of a running container
+
+A user can execute a new process inside of a running container. Any binaries to be
+executed must be accessible within the container's rootfs.
+
+The started process will run inside the container's rootfs. Any changes
+made by the process to the container's filesystem will persist after the
+process has finished executing.
+
+The started process will join all the container's existing namespaces. When the
+container is paused, the process will also be paused and will resume when
+the container is unpaused.  The started process will only run when the container's
+primary process (PID 1) is running, and will not be restarted when the container
+is restarted.
+
+#### Planned additions
+
+The started process will have its own cgroups nested inside the container's
+cgroups. This is used for process tracking and optionally resource allocation
+handling for the new process. Freezer cgroup is required, the rest of the cgroups
+are optional. The process executor must place its pid inside the correct
+cgroups before starting the process. This is done so that no child processes or
+threads can escape the cgroups.
+
+When the process is stopped, the process executor will try (in a best-effort way)
+to stop all its children and remove the sub-cgroups.
--- a/libcontainer/apparmor/apparmor.go
+++ b/libcontainer/apparmor/apparmor.go
@ -0,0 +1,60 @@
+// +build apparmor,linux
+
+package apparmor
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+
+	"github.com/opencontainers/runc/libcontainer/utils"
+)
+
+// IsEnabled reports whether apparmor is usable: its securityfs dir and /sbin/apparmor_parser exist, $container is unset, and the module's "enabled" parameter starts with 'Y'.
+func IsEnabled() bool {
+	if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" {
+		if _, err = os.Stat("/sbin/apparmor_parser"); err == nil {
+			buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
+			return err == nil && len(buf) > 1 && buf[0] == 'Y'
+		}
+	}
+	return false
+}
+
+func setProcAttr(attr, value string) error {
+	// Under AppArmor you can only change your own attr, so use /proc/self/
+	// instead of /proc/<tid>/ like libapparmor does
+	path := fmt.Sprintf("/proc/self/attr/%s", attr)
+
+	f, err := os.OpenFile(path, os.O_WRONLY, 0)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	if err := utils.EnsureProcHandle(f); err != nil {
+		return err
+	}
+
+	_, err = fmt.Fprintf(f, "%s", value)
+	return err
+}
+
+// changeOnExec reimplements aa_change_onexec from libapparmor in Go
+func changeOnExec(name string) error {
+	value := "exec " + name
+	if err := setProcAttr("exec", value); err != nil {
+		return fmt.Errorf("apparmor failed to apply profile: %s", err)
+	}
+	return nil
+}
+
+// ApplyProfile will apply the profile with the specified name to the process after
+// the next exec.
+func ApplyProfile(name string) error {
+	if name == "" {
+		return nil
+	}
+
+	return changeOnExec(name)
+}
--- a/libcontainer/apparmor/apparmor_disabled.go
+++ b/libcontainer/apparmor/apparmor_disabled.go
@ -0,0 +1,20 @@
+// +build !apparmor !linux
+
+package apparmor
+
+import (
+	"errors"
+)
+
+var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
+
+func IsEnabled() bool {
+	return false
+}
+
+func ApplyProfile(name string) error {
+	if name != "" {
+		return ErrApparmorNotEnabled
+	}
+	return nil
+}
--- a/libcontainer/capabilities_linux.go
+++ b/libcontainer/capabilities_linux.go
@ -0,0 +1,117 @@
+// +build linux
+
+package libcontainer
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/syndtr/gocapability/capability"
+)
+
+const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS
+
+var capabilityMap map[string]capability.Cap
+
+func init() {
+	capabilityMap = make(map[string]capability.Cap)
+	last := capability.CAP_LAST_CAP
+	// workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
+	if last == capability.Cap(63) {
+		last = capability.CAP_BLOCK_SUSPEND
+	}
+	for _, cap := range capability.List() {
+		if cap > last {
+			continue
+		}
+		capKey := fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String()))
+		capabilityMap[capKey] = cap
+	}
+}
+
+func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) {
+	bounding := []capability.Cap{}
+	for _, c := range capConfig.Bounding {
+		v, ok := capabilityMap[c]
+		if !ok {
+			return nil, fmt.Errorf("unknown capability %q", c)
+		}
+		bounding = append(bounding, v)
+	}
+	effective := []capability.Cap{}
+	for _, c := range capConfig.Effective {
+		v, ok := capabilityMap[c]
+		if !ok {
+			return nil, fmt.Errorf("unknown capability %q", c)
+		}
+		effective = append(effective, v)
+	}
+	inheritable := []capability.Cap{}
+	for _, c := range capConfig.Inheritable {
+		v, ok := capabilityMap[c]
+		if !ok {
+			return nil, fmt.Errorf("unknown capability %q", c)
+		}
+		inheritable = append(inheritable, v)
+	}
+	permitted := []capability.Cap{}
+	for _, c := range capConfig.Permitted {
+		v, ok := capabilityMap[c]
+		if !ok {
+			return nil, fmt.Errorf("unknown capability %q", c)
+		}
+		permitted = append(permitted, v)
+	}
+	ambient := []capability.Cap{}
+	for _, c := range capConfig.Ambient {
+		v, ok := capabilityMap[c]
+		if !ok {
+			return nil, fmt.Errorf("unknown capability %q", c)
+		}
+		ambient = append(ambient, v)
+	}
+	pid, err := capability.NewPid2(0)
+	if err != nil {
+		return nil, err
+	}
+	err = pid.Load()
+	if err != nil {
+		return nil, err
+	}
+	return &containerCapabilities{
+		bounding:    bounding,
+		effective:   effective,
+		inheritable: inheritable,
+		permitted:   permitted,
+		ambient:     ambient,
+		pid:         pid,
+	}, nil
+}
+
+type containerCapabilities struct {
+	pid         capability.Capabilities
+	bounding    []capability.Cap
+	effective   []capability.Cap
+	inheritable []capability.Cap
+	permitted   []capability.Cap
+	ambient     []capability.Cap
+}
+
+// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
+func (c *containerCapabilities) ApplyBoundingSet() error {
+	c.pid.Clear(capability.BOUNDS)
+	c.pid.Set(capability.BOUNDS, c.bounding...)
+	return c.pid.Apply(capability.BOUNDS)
+}
+
+// ApplyCaps clears every capability set and then applies the bounding, permitted, inheritable, effective and ambient sets from the config.
+func (c *containerCapabilities) ApplyCaps() error {
+	c.pid.Clear(allCapabilityTypes)
+	c.pid.Set(capability.BOUNDS, c.bounding...)
+	c.pid.Set(capability.PERMITTED, c.permitted...)
+	c.pid.Set(capability.INHERITABLE, c.inheritable...)
+	c.pid.Set(capability.EFFECTIVE, c.effective...)
+	c.pid.Set(capability.AMBIENT, c.ambient...)
+	return c.pid.Apply(allCapabilityTypes)
+}
--- a/libcontainer/cgroups/cgroups.go
+++ b/libcontainer/cgroups/cgroups.go
@ -0,0 +1,74 @@
+// +build linux
+
+package cgroups
+
+import (
+	"fmt"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+type Manager interface {
+	// Applies cgroup configuration to the process with the specified pid.
+	Apply(pid int) error
+
+	// Returns the PIDs inside the cgroup set.
+	GetPids() ([]int, error)
+
+	// Returns the PIDs inside the cgroup set and all of its sub-cgroups.
+	GetAllPids() ([]int, error)
+
+	// Returns statistics for the cgroup set.
+	GetStats() (*Stats, error)
+
+	// Toggles the freezer cgroup according to the specified state.
+	Freeze(state configs.FreezerState) error
+
+	// Destroys the cgroup set.
+	Destroy() error
+
+	// The option funcs SystemdCgroups() and Cgroupfs() require the following attributes:
+	// 	Paths   map[string]string
+	// 	Cgroups *configs.Cgroup
+	// Paths maps each cgroup subsystem to the path at which it is mounted.
+	// Cgroups specifies the cgroup settings for the various subsystems.
+
+	// Returns cgroup paths to save in a state file so that the object can
+	// be restored later.
+	GetPaths() map[string]string
+
+	// GetUnifiedPath returns the unified path when running in unified mode.
+	// The value corresponds to all the values of the GetPaths() map.
+	//
+	// GetUnifiedPath returns an error when running in hybrid mode as well as
+	// in legacy mode.
+	GetUnifiedPath() (string, error)
+
+	// Sets the cgroup as configured.
+	Set(container *configs.Config) error
+
+	// Gets the cgroup as configured.
+	GetCgroups() (*configs.Cgroup, error)
+}
+
+type NotFoundError struct {
+	Subsystem string
+}
+
+func (e *NotFoundError) Error() string {
+	return fmt.Sprintf("mountpoint for %s not found", e.Subsystem)
+}
+
+func NewNotFoundError(sub string) error {
+	return &NotFoundError{
+		Subsystem: sub,
+	}
+}
+
+func IsNotFound(err error) bool {
+	if err == nil {
+		return false
+	}
+	_, ok := err.(*NotFoundError)
+	return ok
+}
--- a/libcontainer/cgroups/cgroups_test.go
+++ b/libcontainer/cgroups/cgroups_test.go
@ -0,0 +1,20 @@
+// +build linux
+
+package cgroups
+
+import (
+	"testing"
+)
+
+func TestParseCgroups(t *testing.T) {
+	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if IsCgroup2UnifiedMode() {
+		return
+	}
+	if _, ok := cgroups["cpu"]; !ok {
+		t.Fail()
+	}
+}
--- a/libcontainer/cgroups/cgroups_unsupported.go
+++ b/libcontainer/cgroups/cgroups_unsupported.go
@ -0,0 +1,3 @@
+// +build !linux
+
+package cgroups
--- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
+++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
@ -0,0 +1,180 @@
+// Package devicefilter contains the eBPF device filter program.
+//
+// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
+//
+// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
+// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
+package devicefilter
+
+import (
+	"fmt"
+	"math"
+
+	"github.com/cilium/ebpf/asm"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
+)
+
+const (
+	// license string format is same as kernel MODULE_LICENSE macro
+	license = "Apache"
+)
+
+// DeviceFilter returns eBPF device filter program and its license string
+func DeviceFilter(devices []*configs.Device) (asm.Instructions, string, error) {
+	p := &program{}
+	p.init()
+	for i := len(devices) - 1; i >= 0; i-- {
+		if err := p.appendDevice(devices[i]); err != nil {
+			return nil, "", err
+		}
+	}
+	insts, err := p.finalize()
+	return insts, license, err
+}
+
+type program struct {
+	insts       asm.Instructions
+	hasWildCard bool
+	blockID     int
+}
+
+func (p *program) init() {
+	// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
+	/*
+		u32 access_type
+		u32 major
+		u32 minor
+	*/
+	// R2 <- type (lower 16 bit of u32 access_type at R1[0])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R2, asm.R1, 0, asm.Half))
+
+	// R3 <- access (upper 16 bit of u32 access_type at R1[0])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
+		// RSh: bitwise shift right
+		asm.RSh.Imm32(asm.R3, 16))
+
+	// R4 <- major (u32 major at R1[4])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
+
+	// R5 <- minor (u32 minor at R1[8])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
+}
+
+// appendDevice emits the match-and-return block for dev; it must be called from the last element of OCI linux.resources.devices to the head element.
+func (p *program) appendDevice(dev *configs.Device) error {
+	if p.blockID < 0 {
+		return errors.New("the program is finalized")
+	}
+	if p.hasWildCard {
+		// All entries after wildcard entry are ignored
+		return nil
+	}
+
+	bpfType := int32(-1)
+	hasType := true
+	switch dev.Type {
+	case 'c':
+		bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
+	case 'b':
+		bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
+	case 'a':
+		hasType = false
+	default:
+		// if not specified in OCI json, typ is set to DeviceTypeAll
+		return errors.Errorf("invalid DeviceType %q", string(dev.Type))
+	}
+	if dev.Major > math.MaxUint32 {
+		return errors.Errorf("invalid major %d", dev.Major)
+	}
+	if dev.Minor > math.MaxUint32 {
+		return errors.Errorf("invalid minor %d", dev.Minor)
+	}
+	hasMajor := dev.Major >= 0 // if not specified in OCI json, major is set to -1
+	hasMinor := dev.Minor >= 0
+	bpfAccess := int32(0)
+	for _, r := range dev.Permissions {
+		switch r {
+		case 'r':
+			bpfAccess |= unix.BPF_DEVCG_ACC_READ
+		case 'w':
+			bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
+		case 'm':
+			bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
+		default:
+			return errors.Errorf("unknown device access %v", r)
+		}
+	}
+	// If the access is rwm, skip the check.
+	hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
+
+	blockSym := fmt.Sprintf("block-%d", p.blockID)
+	nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1)
+	prevBlockLastIdx := len(p.insts) - 1
+	if hasType {
+		p.insts = append(p.insts,
+			// if (R2 != bpfType) goto next
+			asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
+		)
+	}
+	if hasAccess {
+		p.insts = append(p.insts,
+			// if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next
+			asm.Mov.Reg32(asm.R1, asm.R3),
+			asm.And.Imm32(asm.R1, bpfAccess),
+			asm.JEq.Imm(asm.R1, 0, nextBlockSym),
+		)
+	}
+	if hasMajor {
+		p.insts = append(p.insts,
+			// if (R4 != major) goto next
+			asm.JNE.Imm(asm.R4, int32(dev.Major), nextBlockSym),
+		)
+	}
+	if hasMinor {
+		p.insts = append(p.insts,
+			// if (R5 != minor) goto next
+			asm.JNE.Imm(asm.R5, int32(dev.Minor), nextBlockSym),
+		)
+	}
+	if !hasType && !hasAccess && !hasMajor && !hasMinor {
+		p.hasWildCard = true
+	}
+	p.insts = append(p.insts, acceptBlock(dev.Allow)...)
+	// set blockSym to the first instruction we added in this iteration
+	p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
+	p.blockID++
+	return nil
+}
+
+func (p *program) finalize() (asm.Instructions, error) {
+	if p.hasWildCard {
+		// acceptBlock with asm.Return() is already inserted
+		return p.insts, nil
+	}
+	blockSym := fmt.Sprintf("block-%d", p.blockID)
+	p.insts = append(p.insts,
+		// R0 <- 0
+		asm.Mov.Imm32(asm.R0, 0).Sym(blockSym),
+		asm.Return(),
+	)
+	p.blockID = -1
+	return p.insts, nil
+}
+
+func acceptBlock(accept bool) asm.Instructions {
+	v := int32(0)
+	if accept {
+		v = 1
+	}
+	return []asm.Instruction{
+		// R0 <- v
+		asm.Mov.Imm32(asm.R0, v),
+		asm.Return(),
+	}
+}
--- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go
+++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go
@ -0,0 +1,258 @@
+package devicefilter
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/specconv"
+)
+
+func hash(s, comm string) string {
+	var res []string
+	for _, l := range strings.Split(s, "\n") {
+		trimmed := strings.TrimSpace(l)
+		if trimmed == "" || strings.HasPrefix(trimmed, comm) {
+			continue
+		}
+		res = append(res, trimmed)
+	}
+	return strings.Join(res, "\n")
+}
+
+func testDeviceFilter(t testing.TB, devices []*configs.Device, expectedStr string) {
+	insts, _, err := DeviceFilter(devices)
+	if err != nil {
+		t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices)
+	}
+	s := insts.String()
+	t.Logf("%s: devices: %+v\n%s", t.Name(), devices, s)
+	if expectedStr != "" {
+		hashed := hash(s, "//")
+		expectedHashed := hash(expectedStr, "//")
+		if expectedHashed != hashed {
+			t.Fatalf("expected:\n%q\ngot\n%q", expectedHashed, hashed)
+		}
+	}
+}
+
+func TestDeviceFilter_Nil(t *testing.T) {
+	expected := `
+// load parameters into registers
+        0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+        1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+        2: RSh32Imm dst: r3 imm: 16
+        3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+        4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// return 0 (reject)
+        5: Mov32Imm dst: r0 imm: 0
+        6: Exit
+	`
+	testDeviceFilter(t, nil, expected)
+}
+
+func TestDeviceFilter_BuiltInAllowList(t *testing.T) {
+	expected := `
+// load parameters into registers
+         0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+         1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+         2: RSh32Imm dst: r3 imm: 16
+         3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+         4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// tuntap (c, 10, 200, rwm, allow)
+         5: JNEImm dst: r2 off: -1 imm: 2 <block-1>
+         6: JNEImm dst: r4 off: -1 imm: 10 <block-1>
+         7: JNEImm dst: r5 off: -1 imm: 200 <block-1>
+         8: Mov32Imm dst: r0 imm: 1
+         9: Exit
+block-1:
+        10: JNEImm dst: r2 off: -1 imm: 2 <block-2>
+        11: JNEImm dst: r4 off: -1 imm: 5 <block-2>
+        12: JNEImm dst: r5 off: -1 imm: 2 <block-2>
+        13: Mov32Imm dst: r0 imm: 1
+        14: Exit
+block-2:
+// /dev/pts (c, 136, wildcard, rwm, true)
+        15: JNEImm dst: r2 off: -1 imm: 2 <block-3>
+        16: JNEImm dst: r4 off: -1 imm: 136 <block-3>
+        17: Mov32Imm dst: r0 imm: 1
+        18: Exit
+block-3:
+        19: JNEImm dst: r2 off: -1 imm: 2 <block-4>
+        20: JNEImm dst: r4 off: -1 imm: 5 <block-4>
+        21: JNEImm dst: r5 off: -1 imm: 1 <block-4>
+        22: Mov32Imm dst: r0 imm: 1
+        23: Exit
+block-4:
+        24: JNEImm dst: r2 off: -1 imm: 2 <block-5>
+        25: JNEImm dst: r4 off: -1 imm: 1 <block-5>
+        26: JNEImm dst: r5 off: -1 imm: 9 <block-5>
+        27: Mov32Imm dst: r0 imm: 1
+        28: Exit
+block-5:
+        29: JNEImm dst: r2 off: -1 imm: 2 <block-6>
+        30: JNEImm dst: r4 off: -1 imm: 1 <block-6>
+        31: JNEImm dst: r5 off: -1 imm: 5 <block-6>
+        32: Mov32Imm dst: r0 imm: 1
+        33: Exit
+block-6:
+        34: JNEImm dst: r2 off: -1 imm: 2 <block-7>
+        35: JNEImm dst: r4 off: -1 imm: 5 <block-7>
+        36: JNEImm dst: r5 off: -1 imm: 0 <block-7>
+        37: Mov32Imm dst: r0 imm: 1
+        38: Exit
+block-7:
+        39: JNEImm dst: r2 off: -1 imm: 2 <block-8>
+        40: JNEImm dst: r4 off: -1 imm: 1 <block-8>
+        41: JNEImm dst: r5 off: -1 imm: 7 <block-8>
+        42: Mov32Imm dst: r0 imm: 1
+        43: Exit
+block-8:
+        44: JNEImm dst: r2 off: -1 imm: 2 <block-9>
+        45: JNEImm dst: r4 off: -1 imm: 1 <block-9>
+        46: JNEImm dst: r5 off: -1 imm: 8 <block-9>
+        47: Mov32Imm dst: r0 imm: 1
+        48: Exit
+block-9:
+        49: JNEImm dst: r2 off: -1 imm: 2 <block-10>
+        50: JNEImm dst: r4 off: -1 imm: 1 <block-10>
+        51: JNEImm dst: r5 off: -1 imm: 3 <block-10>
+        52: Mov32Imm dst: r0 imm: 1
+        53: Exit
+block-10:
+// (b, wildcard, wildcard, m, true)
+        54: JNEImm dst: r2 off: -1 imm: 1 <block-11>
+        55: Mov32Reg dst: r1 src: r3
+        56: And32Imm dst: r1 imm: 1
+        57: JEqImm dst: r1 off: -1 imm: 0 <block-11>
+        58: Mov32Imm dst: r0 imm: 1
+        59: Exit
+block-11:
+// (c, wildcard, wildcard, m, true)
+        60: JNEImm dst: r2 off: -1 imm: 2 <block-12>
+        61: Mov32Reg dst: r1 src: r3
+        62: And32Imm dst: r1 imm: 1
+        63: JEqImm dst: r1 off: -1 imm: 0 <block-12>
+        64: Mov32Imm dst: r0 imm: 1
+        65: Exit
+block-12:
+        66: Mov32Imm dst: r0 imm: 0
+        67: Exit
+`
+	testDeviceFilter(t, specconv.AllowedDevices, expected)
+}
+
+func TestDeviceFilter_Privileged(t *testing.T) {
+	devices := []*configs.Device{
+		{
+			Type:        'a',
+			Major:       -1,
+			Minor:       -1,
+			Permissions: "rwm",
+			Allow:       true,
+		},
+	}
+	expected :=
+		`
+// load parameters into registers
+        0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+        1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+        2: RSh32Imm dst: r3 imm: 16
+        3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+        4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// return 1 (accept)
+        5: Mov32Imm dst: r0 imm: 1
+        6: Exit
+	`
+	testDeviceFilter(t, devices, expected)
+}
+
+func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) {
+	devices := []*configs.Device{
+		{
+			Type:        'a',
+			Major:       -1,
+			Minor:       -1,
+			Permissions: "rwm",
+			Allow:       true,
+		},
+		{
+			Type:        'b',
+			Major:       8,
+			Minor:       0,
+			Permissions: "rwm",
+			Allow:       false,
+		},
+	}
+	expected := `
+// load parameters into registers
+         0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+         1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+         2: RSh32Imm dst: r3 imm: 16
+         3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+         4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// return 0 (reject) if type==b && major == 8 && minor == 0
+         5: JNEImm dst: r2 off: -1 imm: 1 <block-1>
+         6: JNEImm dst: r4 off: -1 imm: 8 <block-1>
+         7: JNEImm dst: r5 off: -1 imm: 0 <block-1>
+         8: Mov32Imm dst: r0 imm: 0
+         9: Exit
+block-1:
+// return 1 (accept)
+        10: Mov32Imm dst: r0 imm: 1
+        11: Exit
+`
+	testDeviceFilter(t, devices, expected)
+}
+
+func TestDeviceFilter_Weird(t *testing.T) {
+	devices := []*configs.Device{
+		{
+			Type:        'b',
+			Major:       8,
+			Minor:       1,
+			Permissions: "rwm",
+			Allow:       false,
+		},
+		{
+			Type:        'a',
+			Major:       -1,
+			Minor:       -1,
+			Permissions: "rwm",
+			Allow:       true,
+		},
+		{
+			Type:        'b',
+			Major:       8,
+			Minor:       2,
+			Permissions: "rwm",
+			Allow:       false,
+		},
+	}
+	// 8/1 is allowed, 8/2 is not allowed.
+	// This conforms to runc v1.0.0-rc.9 (cgroup1) behavior.
+	expected := `
+// load parameters into registers
+         0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
+         1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+         2: RSh32Imm dst: r3 imm: 16
+         3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+         4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+block-0:
+// return 0 (reject) if type==b && major == 8 && minor == 2
+         5: JNEImm dst: r2 off: -1 imm: 1 <block-1>
+         6: JNEImm dst: r4 off: -1 imm: 8 <block-1>
+         7: JNEImm dst: r5 off: -1 imm: 2 <block-1>
+         8: Mov32Imm dst: r0 imm: 0
+         9: Exit
+block-1:
+// return 1 (accept)
+        10: Mov32Imm dst: r0 imm: 1
+        11: Exit
+`
+	testDeviceFilter(t, devices, expected)
+}
--- a/libcontainer/cgroups/ebpf/ebpf.go
+++ b/libcontainer/cgroups/ebpf/ebpf.go
@ -0,0 +1,45 @@
+package ebpf
+
+import (
+	"github.com/cilium/ebpf"
+	"github.com/cilium/ebpf/asm"
+	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
+)
+
+// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
+//
+// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
+//
+// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
+func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) {
+	nilCloser := func() error {
+		return nil
+	}
+	// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
+	// This limit is not inherited into the container.
+	memlockLimit := &unix.Rlimit{
+		Cur: unix.RLIM_INFINITY,
+		Max: unix.RLIM_INFINITY,
+	}
+	_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
+	spec := &ebpf.ProgramSpec{
+		Type:         ebpf.CGroupDevice,
+		Instructions: insts,
+		License:      license,
+	}
+	prog, err := ebpf.NewProgram(spec)
+	if err != nil {
+		return nilCloser, err
+	}
+	if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
+		return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
+	}
+	closer := func() error {
+		if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
+			return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
+		}
+		return nil
+	}
+	return closer, nil
+}
--- a/libcontainer/cgroups/fs/apply_raw.go
+++ b/libcontainer/cgroups/fs/apply_raw.go
@ -0,0 +1,411 @@
+// +build linux
+
+package fs
+
+import (
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"sync"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
+)
+
+var (
+	// subsystemsLegacy lists the subsystem handlers that getSubsystems
+	// returns; Apply and Set iterate over them in this order.
+	subsystemsLegacy = subsystemSet{
+		&CpusetGroup{},
+		&DevicesGroup{},
+		&MemoryGroup{},
+		&CpuGroup{},
+		&CpuacctGroup{},
+		&PidsGroup{},
+		&BlkioGroup{},
+		&HugetlbGroup{},
+		&NetClsGroup{},
+		&NetPrioGroup{},
+		&PerfEventGroup{},
+		&FreezerGroup{},
+		&NameGroup{GroupName: "name=systemd", Join: true},
+	}
+	// HugePageSizes holds the first result of cgroups.GetHugePageSize; the
+	// error it returns is discarded here.
+	HugePageSizes, _ = cgroups.GetHugePageSize()
+)
+
+// errSubsystemDoesNotExist is returned by subsystemSet.Get when no subsystem
+// in the set has the requested name.
+var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist")
+
+// subsystemSet is an ordered collection of subsystem handlers.
+type subsystemSet []subsystem
+
+// Get returns the first subsystem in s whose Name equals name, or
+// errSubsystemDoesNotExist when there is none.
+func (s subsystemSet) Get(name string) (subsystem, error) {
+	for i := range s {
+		if candidate := s[i]; candidate.Name() == name {
+			return candidate, nil
+		}
+	}
+	return nil, errSubsystemDoesNotExist
+}
+
+// subsystem is the set of operations implemented by every cgroup subsystem
+// handler stored in a subsystemSet.
+type subsystem interface {
+	// Name returns the name of the subsystem.
+	Name() string
+	// GetStats fills 'stats' with the statistics of the cgroup under 'path'.
+	GetStats(path string, stats *cgroups.Stats) error
+	// Remove removes the cgroup represented by 'cgroupData'.
+	Remove(*cgroupData) error
+	// Apply creates and joins the cgroup represented by 'cgroupData'.
+	Apply(*cgroupData) error
+	// Set applies the settings in 'cgroup' to the cgroup under 'path'.
+	Set(path string, cgroup *configs.Cgroup) error
+}
+
+// Manager applies a cgroup configuration and tracks the resulting
+// per-subsystem cgroup paths.
+type Manager struct {
+	// mu is held by Apply, Destroy, GetPaths and GetStats while they access Paths.
+	mu       sync.Mutex
+	// Cgroups is the cgroup configuration this manager works from.
+	Cgroups  *configs.Cgroup
+	Rootless bool // ignore permission-related errors
+	// Paths maps a subsystem name to its cgroup path; it is filled in by Apply.
+	Paths    map[string]string
+}
+
+var (
+	// cgroupRootLock serializes access to cgroupRoot.
+	cgroupRootLock sync.Mutex
+	// cgroupRoot caches the absolute path to the root of the cgroup
+	// hierarchies once getCgroupRoot has resolved it.
+	cgroupRoot string
+)
+
+// getCgroupRoot returns the cgroup root. On first use it is looked up through
+// cgroups.FindCgroupMountpointDir and cached only after os.Stat on the result
+// succeeds; later calls return the cached value.
+func getCgroupRoot() (string, error) {
+	cgroupRootLock.Lock()
+	defer cgroupRootLock.Unlock()
+
+	if cgroupRoot == "" {
+		dir, err := cgroups.FindCgroupMountpointDir()
+		if err != nil {
+			return "", err
+		}
+		if _, statErr := os.Stat(dir); statErr != nil {
+			return "", statErr
+		}
+		cgroupRoot = dir
+	}
+	return cgroupRoot, nil
+}
+
+// cgroupData bundles what a subsystem needs to locate and join a cgroup.
+type cgroupData struct {
+	// root is the cgroup root returned by getCgroupRoot.
+	root      string
+	// innerPath is the cleaned cgroup Path, or the cleaned Parent joined
+	// with the cleaned Name when Path is empty.
+	innerPath string
+	config    *configs.Cgroup
+	// pid is the process that join writes into the cgroup.
+	pid       int
+}
+
+// isIgnorableError reports whether err may be ignored because it is a
+// permission error in the loose sense of the word: the normal EPERM, plus
+// EACCES and EROFS (which for an unprivileged user are basically permission
+// errors as well). It always returns false unless rootless is set.
+func isIgnorableError(rootless bool, err error) bool {
+	// Errors are never ignored when we are root.
+	if !rootless {
+		return false
+	}
+	cause := errors.Cause(err)
+	// An ordinary permission error?
+	if os.IsPermission(cause) {
+		return true
+	}
+
+	// Otherwise dig the errno out of the wrapper types we know about.
+	var errno error
+	switch e := cause.(type) {
+	case *os.PathError:
+		errno = e.Err
+	case *os.LinkError:
+		errno = e.Err
+	case *os.SyscallError:
+		errno = e.Err
+	}
+	switch errno {
+	case unix.EROFS, unix.EPERM, unix.EACCES:
+		return true
+	}
+	return false
+}
+
+// getSubsystems returns the subsystem handlers used by this manager, which is
+// always the package-level subsystemsLegacy set.
+func (m *Manager) getSubsystems() subsystemSet {
+	return subsystemsLegacy
+}
+
+// Apply places pid into the cgroups described by m.Cgroups and records the
+// resulting per-subsystem paths in m.Paths. It is a no-op when m.Cgroups is
+// nil.
+//
+// When m.Cgroups.Paths is set, those paths are joined as-is through
+// cgroups.EnterPid, skipping subsystems whose lookup reports not-found.
+// Otherwise every subsystem returned by getSubsystems is applied in turn.
+func (m *Manager) Apply(pid int) (err error) {
+	if m.Cgroups == nil {
+		return nil
+	}
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	var c = m.Cgroups
+
+	d, err := getCgroupData(m.Cgroups, pid)
+	if err != nil {
+		return err
+	}
+
+	m.Paths = make(map[string]string)
+	if c.Paths != nil {
+		for name, path := range c.Paths {
+			// d.path is only used as an existence check here; the
+			// configured path is what gets recorded.
+			_, err := d.path(name)
+			if err != nil {
+				if cgroups.IsNotFound(err) {
+					continue
+				}
+				return err
+			}
+			m.Paths[name] = path
+		}
+		return cgroups.EnterPid(m.Paths, pid)
+	}
+
+	for _, sys := range m.getSubsystems() {
+		// TODO: Apply should, ideally, be reentrant or be broken up into a separate
+		// create and join phase so that the cgroup hierarchy for a container can be
+		// created then join consists of writing the process pids to cgroup.procs
+		p, err := d.path(sys.Name())
+		if err != nil {
+			// The non-presence of the devices subsystem is
+			// considered fatal for security reasons.
+			if cgroups.IsNotFound(err) && sys.Name() != "devices" {
+				continue
+			}
+			return err
+		}
+		m.Paths[sys.Name()] = p
+
+		if err := sys.Apply(d); err != nil {
+			// In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't
+			// been set, we don't bail on error in case of permission problems.
+			// Cases where limits have been set (and we couldn't create our own
+			// cgroup) are handled by Set.
+			if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" {
+				// Forget the path so that Set sees this subsystem as not joined.
+				delete(m.Paths, sys.Name())
+				continue
+			}
+			return err
+		}
+
+	}
+	return nil
+}
+
+// Destroy removes the cgroup paths recorded in m.Paths via
+// cgroups.RemovePaths and then resets the map. It does nothing when m.Cgroups
+// is nil or when explicit Paths were supplied in the configuration.
+func (m *Manager) Destroy() error {
+	if m.Cgroups == nil || m.Cgroups.Paths != nil {
+		return nil
+	}
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	if err := cgroups.RemovePaths(m.Paths); err != nil {
+		return err
+	}
+	m.Paths = make(map[string]string)
+	return nil
+}
+
+// GetPaths returns the subsystem-name-to-path map, reading the field while
+// holding m.mu. The map itself is returned without copying.
+func (m *Manager) GetPaths() map[string]string {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	return m.Paths
+}
+
+// GetUnifiedPath always fails for this manager: it returns an empty path and
+// an error stating that a unified path exists only in unified mode.
+func (m *Manager) GetUnifiedPath() (string, error) {
+	return "", errors.New("unified path is only supported when running in unified mode")
+}
+
+// GetStats collects statistics from every recorded cgroup path. Entries in
+// m.Paths whose name matches no known subsystem, or whose path does not
+// exist, are skipped; the first subsystem error aborts the collection.
+func (m *Manager) GetStats() (*cgroups.Stats, error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	result := cgroups.NewStats()
+	subsystems := m.getSubsystems()
+	for name, path := range m.Paths {
+		sys, err := subsystems.Get(name)
+		if err == errSubsystemDoesNotExist {
+			continue
+		}
+		if !cgroups.PathExists(path) {
+			continue
+		}
+		if err := sys.GetStats(path, result); err != nil {
+			return nil, err
+		}
+	}
+	return result, nil
+}
+
+// Set applies the resource settings in container.Cgroups to every subsystem
+// returned by getSubsystems, using the paths recorded by Apply, and finally
+// verifies the cpu shares through CheckCpushares when a "cpu" path is known.
+//
+// It is a no-op when container.Cgroups is nil or when the manager was
+// configured with explicit Paths.
+func (m *Manager) Set(container *configs.Config) error {
+	if container.Cgroups == nil {
+		return nil
+	}
+
+	// If Paths are set, then we are just joining cgroups paths
+	// and there is no need to set any values.
+	if m.Cgroups != nil && m.Cgroups.Paths != nil {
+		return nil
+	}
+
+	paths := m.GetPaths()
+	for _, sys := range m.getSubsystems() {
+		path := paths[sys.Name()]
+		if err := sys.Set(path, container.Cgroups); err != nil {
+			// When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
+			// However, errors from other subsystems are not ignored.
+			// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
+			if m.Rootless && sys.Name() == "devices" {
+				continue
+			}
+			if path == "" {
+				// We never created a path for this cgroup, so we cannot set
+				// limits for it (though we have already tried at this point).
+				return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
+			}
+			return err
+		}
+	}
+
+	// Use the snapshot obtained through GetPaths rather than reading the
+	// m.Paths field again without holding m.mu.
+	if cpuPath := paths["cpu"]; cpuPath != "" {
+		if err := CheckCpushares(cpuPath, container.Cgroups.Resources.CpuShares); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Freeze toggles the container's freezer cgroup depending on the state
+// provided.
+//
+// The requested state is recorded in m.Cgroups.Resources.Freezer and written
+// through the "freezer" subsystem; if that write fails the previously
+// recorded state is restored. An error is returned when m.Cgroups is nil or
+// when the freezer subsystem cannot be looked up.
+func (m *Manager) Freeze(state configs.FreezerState) error {
+	if m.Cgroups == nil {
+		return errors.New("cannot toggle freezer: cgroups not configured for container")
+	}
+
+	// Resolve the subsystem before touching the recorded state, so that a
+	// failed lookup leaves m.Cgroups.Resources.Freezer unchanged.
+	freezer, err := m.getSubsystems().Get("freezer")
+	if err != nil {
+		return err
+	}
+
+	dir := m.GetPaths()["freezer"]
+	prevState := m.Cgroups.Resources.Freezer
+	m.Cgroups.Resources.Freezer = state
+	if err := freezer.Set(dir, m.Cgroups); err != nil {
+		// Roll back so the recorded state keeps matching what was last applied.
+		m.Cgroups.Resources.Freezer = prevState
+		return err
+	}
+	return nil
+}
+
+// GetPids returns what cgroups.GetPids reports for the recorded "devices"
+// cgroup path.
+func (m *Manager) GetPids() ([]int, error) {
+	return cgroups.GetPids(m.GetPaths()["devices"])
+}
+
+// GetAllPids returns what cgroups.GetAllPids reports for the recorded
+// "devices" cgroup path.
+func (m *Manager) GetAllPids() ([]int, error) {
+	return cgroups.GetAllPids(m.GetPaths()["devices"])
+}
+
+// getCgroupData builds the cgroupData for c and pid. It rejects
+// configurations that set Path together with Name or Parent, and passes Path,
+// Parent and Name through libcontainerUtils.CleanPath before combining them
+// into innerPath.
+func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
+	root, err := getCgroupRoot()
+	if err != nil {
+		return nil, err
+	}
+
+	if (c.Name != "" || c.Parent != "") && c.Path != "" {
+		return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
+	}
+
+	// XXX: Do not remove this code. Path safety is important! -- cyphar
+	cgPath := libcontainerUtils.CleanPath(c.Path)
+	cgParent := libcontainerUtils.CleanPath(c.Parent)
+	cgName := libcontainerUtils.CleanPath(c.Name)
+
+	// Path wins; Parent/Name are only consulted when the cleaned Path is empty.
+	innerPath := cgPath
+	if innerPath == "" {
+		innerPath = filepath.Join(cgParent, cgName)
+	}
+
+	return &cgroupData{
+		root:      root,
+		innerPath: innerPath,
+		config:    c,
+		pid:       pid,
+	}, nil
+}
+
+// path returns the directory of this cgroup within the given subsystem.
+//
+// An absolute innerPath is placed under raw.root and the base name of the
+// subsystem's mountpoint; a relative one is appended to the calling process's
+// own cgroup path for that subsystem. An error is returned when the
+// mountpoint or the own cgroup path cannot be determined.
+func (raw *cgroupData) path(subsystem string) (string, error) {
+	mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem)
+	// If we didn't mount the subsystem, there is no point we make the path.
+	if err != nil {
+		return "", err
+	}
+
+	// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
+	if filepath.IsAbs(raw.innerPath) {
+		// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
+		return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
+	}
+
+	// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
+	// process could in container and shared pid namespace with host, and
+	// /proc/1/cgroup could point to whole other world of cgroups.
+	parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	return filepath.Join(parentPath, raw.innerPath), nil
+}
+
+// join creates the cgroup directory for subsystem (mode 0755, including
+// missing parents), writes raw.pid into it through cgroups.WriteCgroupProc
+// and returns the directory. On any failure the returned path is empty.
+func (raw *cgroupData) join(subsystem string) (string, error) {
+	dir, err := raw.path(subsystem)
+	if err != nil {
+		return "", err
+	}
+	err = os.MkdirAll(dir, 0755)
+	if err == nil {
+		err = cgroups.WriteCgroupProc(dir, raw.pid)
+	}
+	if err != nil {
+		return "", err
+	}
+	return dir, nil
+}
+
+// removePath removes p recursively. Its two parameters match the results of
+// cgroupData.path so the call can be chained: a non-nil err is passed through
+// without touching the filesystem, and an empty p is a no-op.
+func removePath(p string, err error) error {
+	switch {
+	case err != nil:
+		return err
+	case p == "":
+		return nil
+	default:
+		return os.RemoveAll(p)
+	}
+}
+
+// CheckCpushares verifies that the cpu.shares file under path holds exactly
+// c. A c of zero skips the check entirely. When the value read from the file
+// is lower or higher than c, an error naming that value as the maximum or
+// minimum allowed cpu-shares is returned.
+func CheckCpushares(path string, c uint64) error {
+	if c == 0 {
+		return nil
+	}
+
+	f, err := os.Open(filepath.Join(path, "cpu.shares"))
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	// An io.EOF from the scan is tolerated and leaves actual at zero.
+	var actual uint64
+	if _, err := fmt.Fscanf(f, "%d", &actual); err != nil && err != io.EOF {
+		return err
+	}
+
+	switch {
+	case c > actual:
+		return fmt.Errorf("The maximum allowed cpu-shares is %d", actual)
+	case c < actual:
+		return fmt.Errorf("The minimum allowed cpu-shares is %d", actual)
+	}
+	return nil
+}
+
+// GetCgroups returns the manager's cgroup configuration; the error is always nil.
+func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
+	return m.Cgroups, nil
+}
--- a/libcontainer/cgroups/fs/apply_raw_test.go
+++ b/libcontainer/cgroups/fs/apply_raw_test.go
@ -0,0 +1,297 @@
+// +build linux
+
+package fs
+
+import (
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// TestInvalidCgroupPath checks that a relative Path made of ".." components
+// cannot place the cgroup outside the cgroup mountpoint.
+func TestInvalidCgroupPath(t *testing.T) {
+	if cgroups.IsCgroup2UnifiedMode() {
+		t.Skip("cgroup v1 is not supported")
+	}
+	// Setup failures are fatal: the checks below are meaningless (or would
+	// dereference a nil cgroupData) without these values.
+	root, err := getCgroupRoot()
+	if err != nil {
+		t.Fatalf("couldn't get cgroup root: %v", err)
+	}
+
+	config := &configs.Cgroup{
+		Path: "../../../../../../../../../../some/path",
+	}
+
+	data, err := getCgroupData(config, 0)
+	if err != nil {
+		t.Fatalf("couldn't get cgroup data: %v", err)
+	}
+
+	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+	if strings.HasPrefix(data.innerPath, "..") {
+		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+	}
+
+	// Double-check, using an actual cgroup.
+	deviceRoot := filepath.Join(root, "devices")
+	devicePath, err := data.path("devices")
+	if err != nil {
+		t.Fatalf("couldn't get cgroup path: %v", err)
+	}
+	if !strings.HasPrefix(devicePath, deviceRoot) {
+		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+	}
+}
+
+// TestInvalidAbsoluteCgroupPath checks that an absolute Path made of ".."
+// components cannot place the cgroup outside the cgroup mountpoint.
+func TestInvalidAbsoluteCgroupPath(t *testing.T) {
+	if cgroups.IsCgroup2UnifiedMode() {
+		t.Skip("cgroup v1 is not supported")
+	}
+	// Setup failures are fatal: the checks below are meaningless (or would
+	// dereference a nil cgroupData) without these values.
+	root, err := getCgroupRoot()
+	if err != nil {
+		t.Fatalf("couldn't get cgroup root: %v", err)
+	}
+
+	config := &configs.Cgroup{
+		Path: "/../../../../../../../../../../some/path",
+	}
+
+	data, err := getCgroupData(config, 0)
+	if err != nil {
+		t.Fatalf("couldn't get cgroup data: %v", err)
+	}
+
+	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+	if strings.HasPrefix(data.innerPath, "..") {
+		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+	}
+
+	// Double-check, using an actual cgroup.
+	deviceRoot := filepath.Join(root, "devices")
+	devicePath, err := data.path("devices")
+	if err != nil {
+		t.Fatalf("couldn't get cgroup path: %v", err)
+	}
+	if !strings.HasPrefix(devicePath, deviceRoot) {
+		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+	}
+}
+
+// TestInvalidCgroupParent checks that a relative Parent made of ".."
+// components cannot place the cgroup outside the cgroup mountpoint.
+//
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+func TestInvalidCgroupParent(t *testing.T) {
+	if cgroups.IsCgroup2UnifiedMode() {
+		t.Skip("cgroup v1 is not supported")
+	}
+	// Setup failures are fatal: the checks below are meaningless (or would
+	// dereference a nil cgroupData) without these values.
+	root, err := getCgroupRoot()
+	if err != nil {
+		t.Fatalf("couldn't get cgroup root: %v", err)
+	}
+
+	config := &configs.Cgroup{
+		Parent: "../../../../../../../../../../some/path",
+		Name:   "name",
+	}
+
+	data, err := getCgroupData(config, 0)
+	if err != nil {
+		t.Fatalf("couldn't get cgroup data: %v", err)
+	}
+
+	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+	if strings.HasPrefix(data.innerPath, "..") {
+		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+	}
+
+	// Double-check, using an actual cgroup.
+	deviceRoot := filepath.Join(root, "devices")
+	devicePath, err := data.path("devices")
+	if err != nil {
+		t.Fatalf("couldn't get cgroup path: %v", err)
+	}
+	if !strings.HasPrefix(devicePath, deviceRoot) {
+		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+	}
+}
+
+// TestInvalidAbsoluteCgroupParent checks that an absolute Parent made of ".."
+// components cannot place the cgroup outside the cgroup mountpoint.
+//
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+func TestInvalidAbsoluteCgroupParent(t *testing.T) {
+	if cgroups.IsCgroup2UnifiedMode() {
+		t.Skip("cgroup v1 is not supported")
+	}
+	// Setup failures are fatal: the checks below are meaningless (or would
+	// dereference a nil cgroupData) without these values.
+	root, err := getCgroupRoot()
+	if err != nil {
+		t.Fatalf("couldn't get cgroup root: %v", err)
+	}
+
+	config := &configs.Cgroup{
+		Parent: "/../../../../../../../../../../some/path",
+		Name:   "name",
+	}
+
+	data, err := getCgroupData(config, 0)
+	if err != nil {
+		t.Fatalf("couldn't get cgroup data: %v", err)
+	}
+
+	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+	if strings.HasPrefix(data.innerPath, "..") {
+		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+	}
+
+	// Double-check, using an actual cgroup.
+	deviceRoot := filepath.Join(root, "devices")
+	devicePath, err := data.path("devices")
+	if err != nil {
+		t.Fatalf("couldn't get cgroup path: %v", err)
+	}
+	if !strings.HasPrefix(devicePath, deviceRoot) {
+		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+	}
+}
+
+// TestInvalidCgroupName checks that a relative Name made of ".." components
+// cannot place the cgroup outside the cgroup mountpoint.
+//
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+func TestInvalidCgroupName(t *testing.T) {
+	if cgroups.IsCgroup2UnifiedMode() {
+		t.Skip("cgroup v1 is not supported")
+	}
+	// Setup failures are fatal: the checks below are meaningless (or would
+	// dereference a nil cgroupData) without these values.
+	root, err := getCgroupRoot()
+	if err != nil {
+		t.Fatalf("couldn't get cgroup root: %v", err)
+	}
+
+	config := &configs.Cgroup{
+		Parent: "parent",
+		Name:   "../../../../../../../../../../some/path",
+	}
+
+	data, err := getCgroupData(config, 0)
+	if err != nil {
+		t.Fatalf("couldn't get cgroup data: %v", err)
+	}
+
+	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+	if strings.HasPrefix(data.innerPath, "..") {
+		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+	}
+
+	// Double-check, using an actual cgroup.
+	deviceRoot := filepath.Join(root, "devices")
+	devicePath, err := data.path("devices")
+	if err != nil {
+		t.Fatalf("couldn't get cgroup path: %v", err)
+	}
+	if !strings.HasPrefix(devicePath, deviceRoot) {
+		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+	}
+}
+
+// TestInvalidAbsoluteCgroupName checks that an absolute Name made of ".."
+// components cannot place the cgroup outside the cgroup mountpoint.
+//
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+func TestInvalidAbsoluteCgroupName(t *testing.T) {
+	if cgroups.IsCgroup2UnifiedMode() {
+		t.Skip("cgroup v1 is not supported")
+	}
+	// Setup failures are fatal: the checks below are meaningless (or would
+	// dereference a nil cgroupData) without these values.
+	root, err := getCgroupRoot()
+	if err != nil {
+		t.Fatalf("couldn't get cgroup root: %v", err)
+	}
+
+	config := &configs.Cgroup{
+		Parent: "parent",
+		Name:   "/../../../../../../../../../../some/path",
+	}
+
+	data, err := getCgroupData(config, 0)
+	if err != nil {
+		t.Fatalf("couldn't get cgroup data: %v", err)
+	}
+
+	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+	if strings.HasPrefix(data.innerPath, "..") {
+		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+	}
+
+	// Double-check, using an actual cgroup.
+	deviceRoot := filepath.Join(root, "devices")
+	devicePath, err := data.path("devices")
+	if err != nil {
+		t.Fatalf("couldn't get cgroup path: %v", err)
+	}
+	if !strings.HasPrefix(devicePath, deviceRoot) {
+		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+	}
+}
+
+// TestInvalidCgroupNameAndParent checks that relative Parent and Name values
+// both made of ".." components cannot place the cgroup outside the cgroup
+// mountpoint.
+//
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+func TestInvalidCgroupNameAndParent(t *testing.T) {
+	if cgroups.IsCgroup2UnifiedMode() {
+		t.Skip("cgroup v1 is not supported")
+	}
+	// Setup failures are fatal: the checks below are meaningless (or would
+	// dereference a nil cgroupData) without these values.
+	root, err := getCgroupRoot()
+	if err != nil {
+		t.Fatalf("couldn't get cgroup root: %v", err)
+	}
+
+	config := &configs.Cgroup{
+		Parent: "../../../../../../../../../../some/path",
+		Name:   "../../../../../../../../../../some/path",
+	}
+
+	data, err := getCgroupData(config, 0)
+	if err != nil {
+		t.Fatalf("couldn't get cgroup data: %v", err)
+	}
+
+	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+	if strings.HasPrefix(data.innerPath, "..") {
+		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+	}
+
+	// Double-check, using an actual cgroup.
+	deviceRoot := filepath.Join(root, "devices")
+	devicePath, err := data.path("devices")
+	if err != nil {
+		t.Fatalf("couldn't get cgroup path: %v", err)
+	}
+	if !strings.HasPrefix(devicePath, deviceRoot) {
+		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+	}
+}
+
+// TestInvalidAbsoluteCgroupNameAndParent checks that absolute Parent and Name
+// values both made of ".." components cannot place the cgroup outside the
+// cgroup mountpoint.
+//
+// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
+func TestInvalidAbsoluteCgroupNameAndParent(t *testing.T) {
+	if cgroups.IsCgroup2UnifiedMode() {
+		t.Skip("cgroup v1 is not supported")
+	}
+	// Setup failures are fatal: the checks below are meaningless (or would
+	// dereference a nil cgroupData) without these values.
+	root, err := getCgroupRoot()
+	if err != nil {
+		t.Fatalf("couldn't get cgroup root: %v", err)
+	}
+
+	config := &configs.Cgroup{
+		Parent: "/../../../../../../../../../../some/path",
+		Name:   "/../../../../../../../../../../some/path",
+	}
+
+	data, err := getCgroupData(config, 0)
+	if err != nil {
+		t.Fatalf("couldn't get cgroup data: %v", err)
+	}
+
+	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
+	if strings.HasPrefix(data.innerPath, "..") {
+		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+	}
+
+	// Double-check, using an actual cgroup.
+	deviceRoot := filepath.Join(root, "devices")
+	devicePath, err := data.path("devices")
+	if err != nil {
+		t.Fatalf("couldn't get cgroup path: %v", err)
+	}
+	if !strings.HasPrefix(devicePath, deviceRoot) {
+		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+	}
+}
--- a/libcontainer/cgroups/fs/blkio.go
+++ b/libcontainer/cgroups/fs/blkio.go
@ -0,0 +1,238 @@
+// +build linux
+
+package fs
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// BlkioGroup is the handler for the "blkio" cgroup subsystem. It carries no state.
+type BlkioGroup struct {
+}
+
+// Name returns the subsystem name, "blkio".
+func (s *BlkioGroup) Name() string {
+	return "blkio"
+}
+
+// Apply joins the "blkio" cgroup described by d. An error from the join is
+// tolerated when cgroups.IsNotFound reports it as not-found.
+func (s *BlkioGroup) Apply(d *cgroupData) error {
+	if _, err := d.join("blkio"); err != nil && !cgroups.IsNotFound(err) {
+		return err
+	}
+	return nil
+}
+
+// Set writes the blkio settings from cgroup.Resources into the control files
+// under path: the weight and leaf weight when non-zero, then the per-device
+// weights and the four per-device throttle lists. The first failed write
+// aborts and is returned.
+func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
+	// write stores value in the named control file under path.
+	write := func(file, value string) error {
+		return fscommon.WriteFile(path, file, value)
+	}
+
+	if w := cgroup.Resources.BlkioWeight; w != 0 {
+		if err := write("blkio.weight", strconv.FormatUint(uint64(w), 10)); err != nil {
+			return err
+		}
+	}
+
+	if lw := cgroup.Resources.BlkioLeafWeight; lw != 0 {
+		if err := write("blkio.leaf_weight", strconv.FormatUint(uint64(lw), 10)); err != nil {
+			return err
+		}
+	}
+
+	for _, dev := range cgroup.Resources.BlkioWeightDevice {
+		if err := write("blkio.weight_device", dev.WeightString()); err != nil {
+			return err
+		}
+		if err := write("blkio.leaf_weight_device", dev.LeafWeightString()); err != nil {
+			return err
+		}
+	}
+
+	for _, dev := range cgroup.Resources.BlkioThrottleReadBpsDevice {
+		if err := write("blkio.throttle.read_bps_device", dev.String()); err != nil {
+			return err
+		}
+	}
+	for _, dev := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
+		if err := write("blkio.throttle.write_bps_device", dev.String()); err != nil {
+			return err
+		}
+	}
+	for _, dev := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
+		if err := write("blkio.throttle.read_iops_device", dev.String()); err != nil {
+			return err
+		}
+	}
+	for _, dev := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
+		if err := write("blkio.throttle.write_iops_device", dev.String()); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// Remove deletes the "blkio" cgroup directory described by d; a failure to
+// resolve its path is returned unchanged by removePath.
+func (s *BlkioGroup) Remove(d *cgroupData) error {
+	return removePath(d.path("blkio"))
+}
+
+/*
+examples:
+
+    blkio.sectors
+    8:0 6792
+
+    blkio.io_service_bytes
+    8:0 Read 1282048
+    8:0 Write 2195456
+    8:0 Sync 2195456
+    8:0 Async 1282048
+    8:0 Total 3477504
+    Total 3477504
+
+    blkio.io_serviced
+    8:0 Read 124
+    8:0 Write 104
+    8:0 Sync 104
+    8:0 Async 124
+    8:0 Total 228
+    Total 228
+
+    blkio.io_queued
+    8:0 Read 0
+    8:0 Write 0
+    8:0 Sync 0
+    8:0 Async 0
+    8:0 Total 0
+    Total 0
+*/
+
+// splitBlkioStatLine reports whether r separates fields in a blkio stat line,
+// i.e. whether it is a space or a colon.
+func splitBlkioStatLine(r rune) bool {
+	switch r {
+	case ' ', ':':
+		return true
+	}
+	return false
+}
+
+// getBlkioStat parses the blkio statistics file at path into a list of
+// entries. Each line is split on spaces and colons and must read
+// "major:minor value" or "major:minor op value"; a two-field "Total <n>"
+// summary line is skipped and any other short line is an error.
+//
+// A missing file yields a nil slice and no error. Parse failures and read
+// errors reported by the scanner are returned with a nil slice.
+func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
+	var blkioStats []cgroups.BlkioStatEntry
+	f, err := os.Open(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return blkioStats, nil
+		}
+		return nil, err
+	}
+	defer f.Close()
+
+	sc := bufio.NewScanner(f)
+	for sc.Scan() {
+		// format: dev type amount
+		fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine)
+		if len(fields) < 3 {
+			if len(fields) == 2 && fields[0] == "Total" {
+				// skip total line
+				continue
+			}
+			return nil, fmt.Errorf("Invalid line found while parsing %s: %s", path, sc.Text())
+		}
+
+		v, err := strconv.ParseUint(fields[0], 10, 64)
+		if err != nil {
+			return nil, err
+		}
+		major := v
+
+		v, err = strconv.ParseUint(fields[1], 10, 64)
+		if err != nil {
+			return nil, err
+		}
+		minor := v
+
+		// With four fields the third one names the operation and the value
+		// moves to the fourth position.
+		op := ""
+		valueField := 2
+		if len(fields) == 4 {
+			op = fields[2]
+			valueField = 3
+		}
+		v, err = strconv.ParseUint(fields[valueField], 10, 64)
+		if err != nil {
+			return nil, err
+		}
+		blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v})
+	}
+	// Scan also stops on a read error or an over-long line; report that
+	// instead of returning silently truncated statistics.
+	if err := sc.Err(); err != nil {
+		return nil, err
+	}
+
+	return blkioStats, nil
+}
+
+// GetStats fills stats with the blkio statistics found under path. The CFQ
+// "*_recursive" files are used when blkio.io_serviced_recursive can be read
+// and yields a non-nil result; otherwise the generic blkio.throttle.* files
+// serve as fallback.
+func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
+	// Probe for CFQ stats, available on all CFQ enabled kernels.
+	probe, err := getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive"))
+	if err != nil || probe == nil {
+		return getStats(path, stats) // Use generic stats as fallback
+	}
+	return getCFQStats(path, stats)
+}
+
+func getCFQStats(path string, stats *cgroups.Stats) error {
+	var blkioStats []cgroups.BlkioStatEntry
+	var err error
+
+	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.sectors_recursive")); err != nil {
+		return err
+	}
+	stats.BlkioStats.SectorsRecursive = blkioStats
+
+	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_bytes_recursive")); err != nil {
+		return err
+	}
+	stats.BlkioStats.IoServiceBytesRecursive = blkioStats
+
+	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err != nil {
+		return err
+	}
+	stats.BlkioStats.IoServicedRecursive = blkioStats
+
+	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_queued_recursive")); err != nil {
+		return err
+	}
+	stats.BlkioStats.IoQueuedRecursive = blkioStats
+
+	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_time_recursive")); err != nil {
+		return err
+	}
+	stats.BlkioStats.IoServiceTimeRecursive = blkioStats
+
+	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_wait_time_recursive")); err != nil {
+		return err
+	}
+	stats.BlkioStats.IoWaitTimeRecursive = blkioStats
+
+	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_merged_recursive")); err != nil {
+		return err
+	}
+	stats.BlkioStats.IoMergedRecursive = blkioStats
+
+	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.time_recursive")); err != nil {
+		return err
+	}
+	stats.BlkioStats.IoTimeRecursive = blkioStats
+
+	return nil
+}
+
+func getStats(path string, stats *cgroups.Stats) error {
+	var blkioStats []cgroups.BlkioStatEntry
+	var err error
+
+	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_service_bytes")); err != nil {
+		return err
+	}
+	stats.BlkioStats.IoServiceBytesRecursive = blkioStats
+
+	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_serviced")); err != nil {
+		return err
+	}
+	stats.BlkioStats.IoServicedRecursive = blkioStats
+
+	return nil
+}
--- a/libcontainer/cgroups/fs/blkio_test.go
+++ b/libcontainer/cgroups/fs/blkio_test.go
@ -0,0 +1,637 @@
+// +build linux
+
+package fs
+
+import (
+	"strconv"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// Fixture contents for the blkio statistics files that the tests in this file
+// write into the test cgroup directory before calling GetStats.
+const (
+	sectorsRecursiveContents      = `8:0 1024`
+	serviceBytesRecursiveContents = `8:0 Read 100
+8:0 Write 200
+8:0 Sync 300
+8:0 Async 500
+8:0 Total 500
+Total 500`
+	servicedRecursiveContents = `8:0 Read 10
+8:0 Write 40
+8:0 Sync 20
+8:0 Async 30
+8:0 Total 50
+Total 50`
+	queuedRecursiveContents = `8:0 Read 1
+8:0 Write 4
+8:0 Sync 2
+8:0 Async 3
+8:0 Total 5
+Total 5`
+	serviceTimeRecursiveContents = `8:0 Read 173959
+8:0 Write 0
+8:0 Sync 0
+8:0 Async 173959
+8:0 Total 17395
+Total 17395`
+	waitTimeRecursiveContents = `8:0 Read 15571
+8:0 Write 0
+8:0 Sync 0
+8:0 Async 15571
+8:0 Total 15571`
+	mergedRecursiveContents = `8:0 Read 5
+8:0 Write 10
+8:0 Sync 0
+8:0 Async 0
+8:0 Total 15
+Total 15`
+	timeRecursiveContents = `8:0 8`
+	throttleServiceBytes  = `8:0 Read 11030528
+8:0 Write 23
+8:0 Sync 42
+8:0 Async 11030528
+8:0 Total 11030528
+252:0 Read 11030528
+252:0 Write 23
+252:0 Sync 42
+252:0 Async 11030528
+252:0 Total 11030528
+Total 22061056`
+	throttleServiced = `8:0 Read 164
+8:0 Write 23
+8:0 Sync 42
+8:0 Async 164
+8:0 Total 164
+252:0 Read 164
+252:0 Write 23
+252:0 Sync 42
+252:0 Async 164
+252:0 Total 164
+Total 328`
+)
+
+// appendBlkioStatEntry appends one entry built from major, minor, value and
+// op to the slice that blkioStatEntries points to.
+func appendBlkioStatEntry(blkioStatEntries *[]cgroups.BlkioStatEntry, major, minor, value uint64, op string) {
+	entry := cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: value}
+	*blkioStatEntries = append(*blkioStatEntries, entry)
+}
+
+// TestBlkioSetWeight checks that BlkioGroup.Set overwrites an existing
+// blkio.weight value with the configured BlkioWeight.
+func TestBlkioSetWeight(t *testing.T) {
+	const (
+		weightBefore = 100
+		weightAfter  = 200
+	)
+
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.weight": strconv.Itoa(weightBefore),
+	})
+
+	helper.CgroupData.config.Resources.BlkioWeight = weightAfter
+	group := &BlkioGroup{}
+	if err := group.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "blkio.weight")
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.weight - %s", err)
+	}
+	if got != weightAfter {
+		t.Fatal("Got the wrong value, set blkio.weight failed.")
+	}
+}
+
+// TestBlkioSetWeightDevice checks that BlkioGroup.Set replaces an existing
+// blkio.weight_device entry with the configured weight device.
+func TestBlkioSetWeightDevice(t *testing.T) {
+	const weightDeviceBefore = "8:0 400"
+
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.weight_device": weightDeviceBefore,
+	})
+
+	device := configs.NewWeightDevice(8, 0, 500, 0)
+	want := device.WeightString()
+
+	helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{device}
+	group := &BlkioGroup{}
+	if err := group.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.weight_device")
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.weight_device - %s", err)
+	}
+	if got != want {
+		t.Fatal("Got the wrong value, set blkio.weight_device failed.")
+	}
+}
+
+// TestBlkioSetMultipleWeightDevice checks that BlkioGroup.Set writes every
+// configured weight device, not only the first one.
+//
+// regression #274
+func TestBlkioSetMultipleWeightDevice(t *testing.T) {
+	const weightDeviceBefore = "8:0 400"
+
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.weight_device": weightDeviceBefore,
+	})
+
+	first := configs.NewWeightDevice(8, 0, 500, 0)
+	second := configs.NewWeightDevice(8, 16, 500, 0)
+	// we cannot actually set and check both because normal ioutil.WriteFile
+	// when writing to cgroup file will overwrite the whole file content instead
+	// of updating it as the kernel is doing. Just check the second device
+	// is present will suffice for the test to ensure multiple writes are done.
+	want := second.WeightString()
+
+	helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{first, second}
+	group := &BlkioGroup{}
+	if err := group.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.weight_device")
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.weight_device - %s", err)
+	}
+	if got != want {
+		t.Fatal("Got the wrong value, set blkio.weight_device failed.")
+	}
+}
+
+// TestBlkioStats writes all eight "*_recursive" fixture files and checks that
+// BlkioGroup.GetStats reports every per-device entry from them. The bare
+// "Total <n>" summary lines of the fixtures are not part of the expectation.
+func TestBlkioStats(t *testing.T) {
+	helper := NewCgroupTestUtil("blkio", t)
+	defer helper.cleanup()
+	helper.writeFileContents(map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+	})
+
+	blkio := &BlkioGroup{}
+	actualStats := *cgroups.NewStats()
+	err := blkio.GetStats(helper.CgroupPath, &actualStats)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Verify expected stats.
+	expectedStats := cgroups.BlkioStats{}
+	appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 1024, "")
+
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 100, "Read")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 200, "Write")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 300, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Async")
+	appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 10, "Read")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 40, "Write")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 20, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 30, "Async")
+	appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 50, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 1, "Read")
+	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Write")
+	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Async")
+	appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read")
+	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write")
+	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Async")
+	appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 17395, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Read")
+	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write")
+	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Async")
+	appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 5, "Read")
+	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 10, "Write")
+	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync")
+	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async")
+	appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 15, "Total")
+
+	appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 8, "")
+
+	expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats)
+}
+
+// TestBlkioStatsNoSectorsFile checks that GetStats still succeeds when the
+// fixture set omits blkio.sectors_recursive.
+func TestBlkioStatsNoSectorsFile(t *testing.T) {
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	fixtures := map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+	}
+	util.writeFileContents(fixtures)
+
+	got := cgroups.NewStats()
+	if err := (&BlkioGroup{}).GetStats(util.CgroupPath, got); err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoServiceBytesFile checks that GetStats still succeeds when
+// the fixture set omits blkio.io_service_bytes_recursive.
+func TestBlkioStatsNoServiceBytesFile(t *testing.T) {
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	fixtures := map[string]string{
+		"blkio.io_serviced_recursive":     servicedRecursiveContents,
+		"blkio.io_queued_recursive":       queuedRecursiveContents,
+		"blkio.sectors_recursive":         sectorsRecursiveContents,
+		"blkio.io_service_time_recursive": serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":    waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":       mergedRecursiveContents,
+		"blkio.time_recursive":            timeRecursiveContents,
+	}
+	util.writeFileContents(fixtures)
+
+	got := cgroups.NewStats()
+	if err := (&BlkioGroup{}).GetStats(util.CgroupPath, got); err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoServicedFile checks that GetStats still succeeds when the
+// fixture set omits blkio.io_serviced_recursive.
+func TestBlkioStatsNoServicedFile(t *testing.T) {
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	fixtures := map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+	}
+	util.writeFileContents(fixtures)
+
+	got := cgroups.NewStats()
+	if err := (&BlkioGroup{}).GetStats(util.CgroupPath, got); err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoQueuedFile checks that GetStats still succeeds when the
+// fixture set omits blkio.io_queued_recursive.
+func TestBlkioStatsNoQueuedFile(t *testing.T) {
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	fixtures := map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+	}
+	util.writeFileContents(fixtures)
+
+	got := cgroups.NewStats()
+	if err := (&BlkioGroup{}).GetStats(util.CgroupPath, got); err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoServiceTimeFile checks that GetStats still succeeds when the
+// fixture set omits blkio.io_service_time_recursive. Skipped in short mode.
+func TestBlkioStatsNoServiceTimeFile(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	fixtures := map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+	}
+	util.writeFileContents(fixtures)
+
+	got := cgroups.NewStats()
+	if err := (&BlkioGroup{}).GetStats(util.CgroupPath, got); err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoWaitTimeFile checks that GetStats still succeeds when the
+// fixture set omits blkio.io_wait_time_recursive. Skipped in short mode.
+func TestBlkioStatsNoWaitTimeFile(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	fixtures := map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+	}
+	util.writeFileContents(fixtures)
+
+	got := cgroups.NewStats()
+	if err := (&BlkioGroup{}).GetStats(util.CgroupPath, got); err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoMergedFile checks that GetStats still succeeds when the
+// fixture set omits blkio.io_merged_recursive. Skipped in short mode.
+func TestBlkioStatsNoMergedFile(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	fixtures := map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+	}
+	util.writeFileContents(fixtures)
+
+	got := cgroups.NewStats()
+	if err := (&BlkioGroup{}).GetStats(util.CgroupPath, got); err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsNoTimeFile checks that GetStats still succeeds when the
+// fixture set omits blkio.time_recursive. Skipped in short mode.
+func TestBlkioStatsNoTimeFile(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping test in short mode.")
+	}
+
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	fixtures := map[string]string{
+		"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+	}
+	util.writeFileContents(fixtures)
+
+	got := cgroups.NewStats()
+	if err := (&BlkioGroup{}).GetStats(util.CgroupPath, got); err != nil {
+		t.Fatalf("Failed unexpectedly: %s", err)
+	}
+}
+
+// TestBlkioStatsUnexpectedNumberOfFields expects GetStats to return an error
+// when blkio.io_service_bytes_recursive holds the line "8:0 Read 100 100".
+func TestBlkioStatsUnexpectedNumberOfFields(t *testing.T) {
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	fixtures := map[string]string{
+		"blkio.io_service_bytes_recursive": "8:0 Read 100 100",
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+	}
+	util.writeFileContents(fixtures)
+
+	got := cgroups.NewStats()
+	if err := (&BlkioGroup{}).GetStats(util.CgroupPath, got); err == nil {
+		t.Fatal("Expected to fail, but did not")
+	}
+}
+
+// TestBlkioStatsUnexpectedFieldType expects GetStats to return an error when
+// blkio.io_service_bytes_recursive holds the line "8:0 Read Write", i.e. a
+// non-numeric token in the last position.
+func TestBlkioStatsUnexpectedFieldType(t *testing.T) {
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	fixtures := map[string]string{
+		"blkio.io_service_bytes_recursive": "8:0 Read Write",
+		"blkio.io_serviced_recursive":      servicedRecursiveContents,
+		"blkio.io_queued_recursive":        queuedRecursiveContents,
+		"blkio.sectors_recursive":          sectorsRecursiveContents,
+		"blkio.io_service_time_recursive":  serviceTimeRecursiveContents,
+		"blkio.io_wait_time_recursive":     waitTimeRecursiveContents,
+		"blkio.io_merged_recursive":        mergedRecursiveContents,
+		"blkio.time_recursive":             timeRecursiveContents,
+	}
+	util.writeFileContents(fixtures)
+
+	got := cgroups.NewStats()
+	if err := (&BlkioGroup{}).GetStats(util.CgroupPath, got); err == nil {
+		t.Fatal("Expected to fail, but did not")
+	}
+}
+
+// TestNonCFQBlkioStats writes empty blkio.*_recursive files alongside populated
+// blkio.throttle.* files and expects GetStats to report the service-bytes and
+// serviced entries listed below for devices 8:0 and 252:0.
+func TestNonCFQBlkioStats(t *testing.T) {
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	fixtures := map[string]string{
+		"blkio.io_service_bytes_recursive": "",
+		"blkio.io_serviced_recursive":      "",
+		"blkio.io_queued_recursive":        "",
+		"blkio.sectors_recursive":          "",
+		"blkio.io_service_time_recursive":  "",
+		"blkio.io_wait_time_recursive":     "",
+		"blkio.io_merged_recursive":        "",
+		"blkio.time_recursive":             "",
+		"blkio.throttle.io_service_bytes":  throttleServiceBytes,
+		"blkio.throttle.io_serviced":       throttleServiced,
+	}
+	util.writeFileContents(fixtures)
+
+	got := cgroups.NewStats()
+	group := &BlkioGroup{}
+	if err := group.GetStats(util.CgroupPath, got); err != nil {
+		t.Fatal(err)
+	}
+
+	// Build the stats expected from the throttle fixtures.
+	var want cgroups.BlkioStats
+
+	appendBlkioStatEntry(&want.IoServiceBytesRecursive, 8, 0, 11030528, "Read")
+	appendBlkioStatEntry(&want.IoServiceBytesRecursive, 8, 0, 23, "Write")
+	appendBlkioStatEntry(&want.IoServiceBytesRecursive, 8, 0, 42, "Sync")
+	appendBlkioStatEntry(&want.IoServiceBytesRecursive, 8, 0, 11030528, "Async")
+	appendBlkioStatEntry(&want.IoServiceBytesRecursive, 8, 0, 11030528, "Total")
+	appendBlkioStatEntry(&want.IoServiceBytesRecursive, 252, 0, 11030528, "Read")
+	appendBlkioStatEntry(&want.IoServiceBytesRecursive, 252, 0, 23, "Write")
+	appendBlkioStatEntry(&want.IoServiceBytesRecursive, 252, 0, 42, "Sync")
+	appendBlkioStatEntry(&want.IoServiceBytesRecursive, 252, 0, 11030528, "Async")
+	appendBlkioStatEntry(&want.IoServiceBytesRecursive, 252, 0, 11030528, "Total")
+
+	appendBlkioStatEntry(&want.IoServicedRecursive, 8, 0, 164, "Read")
+	appendBlkioStatEntry(&want.IoServicedRecursive, 8, 0, 23, "Write")
+	appendBlkioStatEntry(&want.IoServicedRecursive, 8, 0, 42, "Sync")
+	appendBlkioStatEntry(&want.IoServicedRecursive, 8, 0, 164, "Async")
+	appendBlkioStatEntry(&want.IoServicedRecursive, 8, 0, 164, "Total")
+	appendBlkioStatEntry(&want.IoServicedRecursive, 252, 0, 164, "Read")
+	appendBlkioStatEntry(&want.IoServicedRecursive, 252, 0, 23, "Write")
+	appendBlkioStatEntry(&want.IoServicedRecursive, 252, 0, 42, "Sync")
+	appendBlkioStatEntry(&want.IoServicedRecursive, 252, 0, 164, "Async")
+	appendBlkioStatEntry(&want.IoServicedRecursive, 252, 0, 164, "Total")
+
+	expectBlkioStatsEquals(t, want, got.BlkioStats)
+}
+
+// TestBlkioSetThrottleReadBpsDevice seeds blkio.throttle.read_bps_device with
+// "8:0 1024", configures a single throttle device built from (8, 0, 2048),
+// calls Set, and checks the file now holds that device's String() form.
+func TestBlkioSetThrottleReadBpsDevice(t *testing.T) {
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	const (
+		file    = "blkio.throttle.read_bps_device"
+		initial = "8:0 1024"
+	)
+
+	device := configs.NewThrottleDevice(8, 0, 2048)
+	want := device.String()
+
+	util.writeFileContents(map[string]string{file: initial})
+
+	util.CgroupData.config.Resources.BlkioThrottleReadBpsDevice = []*configs.ThrottleDevice{device}
+	group := &BlkioGroup{}
+	if err := group.Set(util.CgroupPath, util.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamString(util.CgroupPath, file)
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.throttle.read_bps_device - %s", err)
+	}
+	if got != want {
+		t.Fatal("Got the wrong value, set blkio.throttle.read_bps_device failed.")
+	}
+}
+// TestBlkioSetThrottleWriteBpsDevice seeds blkio.throttle.write_bps_device with
+// "8:0 1024", configures a single throttle device built from (8, 0, 2048),
+// calls Set, and checks the file now holds that device's String() form.
+func TestBlkioSetThrottleWriteBpsDevice(t *testing.T) {
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	const (
+		file    = "blkio.throttle.write_bps_device"
+		initial = "8:0 1024"
+	)
+
+	device := configs.NewThrottleDevice(8, 0, 2048)
+	want := device.String()
+
+	util.writeFileContents(map[string]string{file: initial})
+
+	util.CgroupData.config.Resources.BlkioThrottleWriteBpsDevice = []*configs.ThrottleDevice{device}
+	group := &BlkioGroup{}
+	if err := group.Set(util.CgroupPath, util.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamString(util.CgroupPath, file)
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.throttle.write_bps_device - %s", err)
+	}
+	if got != want {
+		t.Fatal("Got the wrong value, set blkio.throttle.write_bps_device failed.")
+	}
+}
+// TestBlkioSetThrottleReadIOpsDevice seeds blkio.throttle.read_iops_device with
+// "8:0 1024", configures a single throttle device built from (8, 0, 2048),
+// calls Set, and checks the file now holds that device's String() form.
+func TestBlkioSetThrottleReadIOpsDevice(t *testing.T) {
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	const (
+		file    = "blkio.throttle.read_iops_device"
+		initial = "8:0 1024"
+	)
+
+	device := configs.NewThrottleDevice(8, 0, 2048)
+	want := device.String()
+
+	util.writeFileContents(map[string]string{file: initial})
+
+	util.CgroupData.config.Resources.BlkioThrottleReadIOPSDevice = []*configs.ThrottleDevice{device}
+	group := &BlkioGroup{}
+	if err := group.Set(util.CgroupPath, util.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamString(util.CgroupPath, file)
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.throttle.read_iops_device - %s", err)
+	}
+	if got != want {
+		t.Fatal("Got the wrong value, set blkio.throttle.read_iops_device failed.")
+	}
+}
+// TestBlkioSetThrottleWriteIOpsDevice seeds blkio.throttle.write_iops_device
+// with "8:0 1024", configures a single throttle device built from (8, 0, 2048),
+// calls Set, and checks the file now holds that device's String() form.
+func TestBlkioSetThrottleWriteIOpsDevice(t *testing.T) {
+	util := NewCgroupTestUtil("blkio", t)
+	defer util.cleanup()
+
+	const (
+		file    = "blkio.throttle.write_iops_device"
+		initial = "8:0 1024"
+	)
+
+	device := configs.NewThrottleDevice(8, 0, 2048)
+	want := device.String()
+
+	util.writeFileContents(map[string]string{file: initial})
+
+	util.CgroupData.config.Resources.BlkioThrottleWriteIOPSDevice = []*configs.ThrottleDevice{device}
+	group := &BlkioGroup{}
+	if err := group.Set(util.CgroupPath, util.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamString(util.CgroupPath, file)
+	if err != nil {
+		t.Fatalf("Failed to parse blkio.throttle.write_iops_device - %s", err)
+	}
+	if got != want {
+		t.Fatal("Got the wrong value, set blkio.throttle.write_iops_device failed.")
+	}
+}
--- a/libcontainer/cgroups/fs/cpu.go
+++ b/libcontainer/cgroups/fs/cpu.go
@ -0,0 +1,118 @@
+// +build linux
+
+package fs
+
+import (
+	"bufio"
+	"os"
+	"path/filepath"
+	"strconv"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// CpuGroup is the stateless receiver for the "cpu" cgroup subsystem
+// operations (Apply, Set, Remove, GetStats, ...) defined in this file.
+type CpuGroup struct {
+}
+
+// Name returns the subsystem name, "cpu".
+func (s *CpuGroup) Name() string {
+	return "cpu"
+}
+
+// Apply looks up the cpu cgroup path for d and passes it, with d's config and
+// pid, to ApplyDir. An error from d.path that satisfies cgroups.IsNotFound is
+// tolerated; any other lookup error is returned as is.
+func (s *CpuGroup) Apply(d *cgroupData) error {
+	// We always want to join the cpu group, to allow fair cpu scheduling
+	// on a container basis
+	cpuPath, err := d.path("cpu")
+	if err == nil || cgroups.IsNotFound(err) {
+		return s.ApplyDir(cpuPath, d.config, d.pid)
+	}
+	return err
+}
+
+// ApplyDir creates the cgroup directory at path, writes the real-time
+// scheduling settings from cgroup, and then writes pid into the cgroup via
+// cgroups.WriteCgroupProc. An empty path is a no-op that returns nil.
+func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error {
+	// This might happen if we have no cpu cgroup mounted.
+	// Just do nothing and don't fail.
+	if path == "" {
+		return nil
+	}
+
+	err := os.MkdirAll(path, 0755)
+	if err != nil {
+		return err
+	}
+
+	// We should set the real-Time group scheduling settings before moving
+	// in the process because if the process is already in SCHED_RR mode
+	// and no RT bandwidth is set, adding it will fail.
+	err = s.SetRtSched(path, cgroup)
+	if err != nil {
+		return err
+	}
+
+	// because we are not using d.join we need to place the pid into the procs file
+	// unlike the other subsystems
+	return cgroups.WriteCgroupProc(path, pid)
+}
+
+// SetRtSched writes the real-time scheduling settings from cgroup.Resources
+// into the cgroup directory at path: CpuRtPeriod to cpu.rt_period_us first,
+// then CpuRtRuntime to cpu.rt_runtime_us. A zero value skips the corresponding
+// write. The first write error is returned.
+func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
+	if period := cgroup.Resources.CpuRtPeriod; period != 0 {
+		err := fscommon.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(period, 10))
+		if err != nil {
+			return err
+		}
+	}
+	if runtime := cgroup.Resources.CpuRtRuntime; runtime != 0 {
+		err := fscommon.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(runtime, 10))
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Set writes the non-zero CPU settings from cgroup.Resources into the cgroup
+// directory at path: CpuShares to cpu.shares, CpuPeriod to cpu.cfs_period_us
+// and CpuQuota to cpu.cfs_quota_us, in that order, and finishes by applying
+// the real-time settings through SetRtSched. Zero-valued fields are skipped
+// and the first write error is returned.
+func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
+	if shares := cgroup.Resources.CpuShares; shares != 0 {
+		err := fscommon.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10))
+		if err != nil {
+			return err
+		}
+	}
+	if period := cgroup.Resources.CpuPeriod; period != 0 {
+		err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(period, 10))
+		if err != nil {
+			return err
+		}
+	}
+	if quota := cgroup.Resources.CpuQuota; quota != 0 {
+		err := fscommon.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(quota, 10))
+		if err != nil {
+			return err
+		}
+	}
+	return s.SetRtSched(path, cgroup)
+}
+
+// Remove passes the result of looking up d's cpu cgroup path (path and
+// error) to removePath and returns whatever removePath returns.
+func (s *CpuGroup) Remove(d *cgroupData) error {
+	cpuPath, err := d.path("cpu")
+	return removePath(cpuPath, err)
+}
+
+// GetStats reads cpu.stat under path and copies the nr_periods, nr_throttled
+// and throttled_time values into stats.CpuStats.ThrottlingData. Other keys in
+// the file are ignored.
+//
+// A missing cpu.stat is not an error: nil is returned and stats is left
+// untouched. Any other open failure, a line rejected by
+// fscommon.GetCgroupParamKeyValue, or an error reported by the scanner is
+// returned to the caller.
+func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
+	f, err := os.Open(filepath.Join(path, "cpu.stat"))
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil
+		}
+		return err
+	}
+	defer f.Close()
+
+	sc := bufio.NewScanner(f)
+	for sc.Scan() {
+		t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
+		if err != nil {
+			return err
+		}
+		switch t {
+		case "nr_periods":
+			stats.CpuStats.ThrottlingData.Periods = v
+
+		case "nr_throttled":
+			stats.CpuStats.ThrottlingData.ThrottledPeriods = v
+
+		case "throttled_time":
+			stats.CpuStats.ThrottlingData.ThrottledTime = v
+		}
+	}
+	// Scan returns false both at EOF and on failure; only Err distinguishes
+	// them, so a read error must not be reported as a successful parse.
+	return sc.Err()
+}
--- a/libcontainer/cgroups/fs/cpu_test.go
+++ b/libcontainer/cgroups/fs/cpu_test.go
@ -0,0 +1,210 @@
+// +build linux
+
+package fs
+
+import (
+	"fmt"
+	"strconv"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+)
+
+// TestCpuSetShares seeds cpu.shares with 1024, sets CpuShares to 512 in the
+// config, calls Set, and checks that the file reads back as 512.
+func TestCpuSetShares(t *testing.T) {
+	util := NewCgroupTestUtil("cpu", t)
+	defer util.cleanup()
+
+	const (
+		sharesBefore = 1024
+		sharesAfter  = 512
+	)
+
+	util.writeFileContents(map[string]string{
+		"cpu.shares": strconv.Itoa(sharesBefore),
+	})
+
+	util.CgroupData.config.Resources.CpuShares = sharesAfter
+	group := &CpuGroup{}
+	if err := group.Set(util.CgroupPath, util.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamUint(util.CgroupPath, "cpu.shares")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.shares - %s", err)
+	}
+	if got != sharesAfter {
+		t.Fatal("Got the wrong value, set cpu.shares failed.")
+	}
+}
+
+// TestCpuSetBandWidth seeds the CFS and RT quota/period files, sets new
+// values for all four in the config, calls Set, and checks that each file
+// reads back with its new value.
+func TestCpuSetBandWidth(t *testing.T) {
+	util := NewCgroupTestUtil("cpu", t)
+	defer util.cleanup()
+
+	const (
+		quotaBefore     = 8000
+		quotaAfter      = 5000
+		periodBefore    = 10000
+		periodAfter     = 7000
+		rtRuntimeBefore = 8000
+		rtRuntimeAfter  = 5000
+		rtPeriodBefore  = 10000
+		rtPeriodAfter   = 7000
+	)
+
+	seed := map[string]string{
+		"cpu.cfs_quota_us":  strconv.Itoa(quotaBefore),
+		"cpu.cfs_period_us": strconv.Itoa(periodBefore),
+		"cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore),
+		"cpu.rt_period_us":  strconv.Itoa(rtPeriodBefore),
+	}
+	util.writeFileContents(seed)
+
+	util.CgroupData.config.Resources.CpuQuota = quotaAfter
+	util.CgroupData.config.Resources.CpuPeriod = periodAfter
+	util.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter
+	util.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter
+
+	group := &CpuGroup{}
+	if err := group.Set(util.CgroupPath, util.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	// CFS quota.
+	gotQuota, err := fscommon.GetCgroupParamUint(util.CgroupPath, "cpu.cfs_quota_us")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.cfs_quota_us - %s", err)
+	}
+	if gotQuota != quotaAfter {
+		t.Fatal("Got the wrong value, set cpu.cfs_quota_us failed.")
+	}
+
+	// CFS period.
+	gotPeriod, err := fscommon.GetCgroupParamUint(util.CgroupPath, "cpu.cfs_period_us")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.cfs_period_us - %s", err)
+	}
+	if gotPeriod != periodAfter {
+		t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.")
+	}
+
+	// RT runtime.
+	gotRtRuntime, err := fscommon.GetCgroupParamUint(util.CgroupPath, "cpu.rt_runtime_us")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err)
+	}
+	if gotRtRuntime != rtRuntimeAfter {
+		t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.")
+	}
+
+	// RT period.
+	gotRtPeriod, err := fscommon.GetCgroupParamUint(util.CgroupPath, "cpu.rt_period_us")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.rt_period_us - %s", err)
+	}
+	if gotRtPeriod != rtPeriodAfter {
+		t.Fatal("Got the wrong value, set cpu.rt_period_us failed.")
+	}
+}
+
+// TestCpuStats writes a cpu.stat fixture and checks that GetStats reports the
+// same three throttling counters, including a throttled_time equal to the
+// largest uint64 value.
+func TestCpuStats(t *testing.T) {
+	util := NewCgroupTestUtil("cpu", t)
+	defer util.cleanup()
+
+	const (
+		nrPeriods     = 2000
+		nrThrottled   = 200
+		throttledTime = uint64(18446744073709551615)
+	)
+
+	content := fmt.Sprintf("nr_periods %d\n nr_throttled %d\n throttled_time %d\n",
+		nrPeriods, nrThrottled, throttledTime)
+	util.writeFileContents(map[string]string{"cpu.stat": content})
+
+	got := cgroups.NewStats()
+	group := &CpuGroup{}
+	if err := group.GetStats(util.CgroupPath, got); err != nil {
+		t.Fatal(err)
+	}
+
+	want := cgroups.ThrottlingData{
+		Periods:          nrPeriods,
+		ThrottledPeriods: nrThrottled,
+		ThrottledTime:    throttledTime,
+	}
+	expectThrottlingDataEquals(t, want, got.CpuStats.ThrottlingData)
+}
+
+// TestNoCpuStatFile checks that GetStats returns nil when no cpu.stat fixture
+// has been written to the test cgroup directory.
+func TestNoCpuStatFile(t *testing.T) {
+	util := NewCgroupTestUtil("cpu", t)
+	defer util.cleanup()
+
+	got := cgroups.NewStats()
+	if err := (&CpuGroup{}).GetStats(util.CgroupPath, got); err != nil {
+		t.Fatal("Expected not to fail, but did")
+	}
+}
+
+// TestInvalidCpuStat expects GetStats to return an error when cpu.stat holds a
+// non-numeric value ("fortytwo") for throttled_time.
+func TestInvalidCpuStat(t *testing.T) {
+	util := NewCgroupTestUtil("cpu", t)
+	defer util.cleanup()
+
+	const content = `nr_periods 2000
+	nr_throttled 200
+	throttled_time fortytwo`
+	util.writeFileContents(map[string]string{"cpu.stat": content})
+
+	got := cgroups.NewStats()
+	if err := (&CpuGroup{}).GetStats(util.CgroupPath, got); err == nil {
+		t.Fatal("Expected failed stat parsing.")
+	}
+}
+
+// TestCpuSetRtSchedAtApply seeds the RT runtime/period files with 0, sets new
+// values in the config, calls ApplyDir with pid 1234, and checks that both RT
+// files and cgroup.procs read back with the expected values.
+func TestCpuSetRtSchedAtApply(t *testing.T) {
+	util := NewCgroupTestUtil("cpu", t)
+	defer util.cleanup()
+
+	const (
+		rtRuntimeBefore = 0
+		rtRuntimeAfter  = 5000
+		rtPeriodBefore  = 0
+		rtPeriodAfter   = 7000
+	)
+
+	seed := map[string]string{
+		"cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore),
+		"cpu.rt_period_us":  strconv.Itoa(rtPeriodBefore),
+	}
+	util.writeFileContents(seed)
+
+	util.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter
+	util.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter
+
+	group := &CpuGroup{}
+	if err := group.ApplyDir(util.CgroupPath, util.CgroupData.config, 1234); err != nil {
+		t.Fatal(err)
+	}
+
+	// RT runtime.
+	gotRtRuntime, err := fscommon.GetCgroupParamUint(util.CgroupPath, "cpu.rt_runtime_us")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err)
+	}
+	if gotRtRuntime != rtRuntimeAfter {
+		t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.")
+	}
+
+	// RT period.
+	gotRtPeriod, err := fscommon.GetCgroupParamUint(util.CgroupPath, "cpu.rt_period_us")
+	if err != nil {
+		t.Fatalf("Failed to parse cpu.rt_period_us - %s", err)
+	}
+	if gotRtPeriod != rtPeriodAfter {
+		t.Fatal("Got the wrong value, set cpu.rt_period_us failed.")
+	}
+
+	// The pid passed to ApplyDir.
+	gotPid, err := fscommon.GetCgroupParamUint(util.CgroupPath, "cgroup.procs")
+	if err != nil {
+		t.Fatalf("Failed to parse cgroup.procs - %s", err)
+	}
+	if gotPid != 1234 {
+		t.Fatal("Got the wrong value, set cgroup.procs failed.")
+	}
+}
--- a/libcontainer/cgroups/fs/cpuacct.go
+++ b/libcontainer/cgroups/fs/cpuacct.go
@ -0,0 +1,122 @@
+// +build linux
+
+package fs
+
+import (
+	"fmt"
+	"io/ioutil"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/system"
+)
+
+// cgroupCpuacctStat is the name of the file getCpuUsageBreakdown reads;
+// nanosecondsInSecond is the factor it uses to scale tick counts before
+// dividing by clockTicks.
+const (
+	cgroupCpuacctStat   = "cpuacct.stat"
+	nanosecondsInSecond = 1000000000
+)
+
+// clockTicks holds the result of system.GetClockTicks(), evaluated once at
+// package initialization. Presumably the number of clock ticks per second
+// (USER_HZ) — confirm against the system package.
+var clockTicks = uint64(system.GetClockTicks())
+
+// CpuacctGroup is the stateless receiver for the "cpuacct" cgroup subsystem
+// operations defined in this file.
+type CpuacctGroup struct {
+}
+
+// Name returns the subsystem name, "cpuacct".
+func (s *CpuacctGroup) Name() string {
+	return "cpuacct"
+}
+
+// Apply joins d to the cpuacct cgroup via d.join. A join error that satisfies
+// cgroups.IsNotFound is ignored; any other error is returned.
+func (s *CpuacctGroup) Apply(d *cgroupData) error {
+	// we just want to join this group even though we don't set anything
+	_, err := d.join("cpuacct")
+	if err == nil || cgroups.IsNotFound(err) {
+		return nil
+	}
+	return err
+}
+
+// Set is a no-op for cpuacct: it ignores its arguments and returns nil.
+func (s *CpuacctGroup) Set(path string, cgroup *configs.Cgroup) error {
+	return nil
+}
+
+// Remove passes the result of looking up d's cpuacct cgroup path (path and
+// error) to removePath and returns whatever removePath returns.
+func (s *CpuacctGroup) Remove(d *cgroupData) error {
+	acctPath, err := d.path("cpuacct")
+	return removePath(acctPath, err)
+}
+
+// GetStats fills stats.CpuStats.CpuUsage from the cpuacct files under path:
+// the user/kernel breakdown from getCpuUsageBreakdown, the total from
+// cpuacct.usage, and the per-CPU values from getPercpuUsage. stats is only
+// modified after all three reads have succeeded; the first error is returned.
+func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
+	user, kernel, err := getCpuUsageBreakdown(path)
+	if err != nil {
+		return err
+	}
+
+	total, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage")
+	if err != nil {
+		return err
+	}
+
+	perCPU, err := getPercpuUsage(path)
+	if err != nil {
+		return err
+	}
+
+	stats.CpuStats.CpuUsage.UsageInUsermode = user
+	stats.CpuStats.CpuUsage.UsageInKernelmode = kernel
+	stats.CpuStats.CpuUsage.TotalUsage = total
+	stats.CpuStats.CpuUsage.PercpuUsage = perCPU
+	return nil
+}
+
+// getCpuUsageBreakdown returns the user-mode and kernel-mode CPU usage read
+// from cpuacct.stat under path, converted from clock ticks to nanoseconds
+// using clockTicks.
+//
+// The file must start with the four whitespace-separated fields
+// "user <ticks> system <ticks>". A read failure, too few fields, an
+// unexpected field name or a non-numeric tick count yields (0, 0, err).
+func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
+	const (
+		userField   = "user"
+		systemField = "system"
+	)
+
+	// Expected format:
+	// user <usage in ticks>
+	// system <usage in ticks>
+	data, err := ioutil.ReadFile(filepath.Join(path, cgroupCpuacctStat))
+	if err != nil {
+		return 0, 0, err
+	}
+	fields := strings.Fields(string(data))
+	if len(fields) < 4 {
+		return 0, 0, fmt.Errorf("failure - %s is expected to have at least 4 fields", filepath.Join(path, cgroupCpuacctStat))
+	}
+	if fields[0] != userField {
+		return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField)
+	}
+	if fields[2] != systemField {
+		return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField)
+	}
+	userTicks, err := strconv.ParseUint(fields[1], 10, 64)
+	if err != nil {
+		return 0, 0, err
+	}
+	kernelTicks, err := strconv.ParseUint(fields[3], 10, 64)
+	if err != nil {
+		return 0, 0, err
+	}
+
+	// Convert ticks to nanoseconds without forming ticks*nanosecondsInSecond,
+	// which silently wraps around uint64 for large tick counts. Splitting
+	// into quotient and remainder yields exactly floor(ticks*ns/clockTicks)
+	// whenever that value fits in a uint64.
+	toNanoseconds := func(ticks uint64) uint64 {
+		whole := (ticks / clockTicks) * nanosecondsInSecond
+		frac := (ticks % clockTicks) * nanosecondsInSecond / clockTicks
+		return whole + frac
+	}
+
+	return toNanoseconds(userTicks), toNanoseconds(kernelTicks), nil
+}
+
+// getPercpuUsage reads cpuacct.usage_percpu under path and parses each
+// whitespace-separated field as a base-10 uint64, returning the values in
+// file order. On a read or parse error, the values collected so far (possibly
+// an empty, non-nil slice) are returned together with the error.
+func getPercpuUsage(path string) ([]uint64, error) {
+	usages := []uint64{}
+	raw, err := ioutil.ReadFile(filepath.Join(path, "cpuacct.usage_percpu"))
+	if err != nil {
+		return usages, err
+	}
+
+	fields := strings.Fields(string(raw))
+	for i := 0; i < len(fields); i++ {
+		parsed, parseErr := strconv.ParseUint(fields[i], 10, 64)
+		if parseErr != nil {
+			return usages, fmt.Errorf("Unable to convert param value to uint64: %s", parseErr)
+		}
+		usages = append(usages, parsed)
+	}
+	return usages, nil
+}
--- a/libcontainer/cgroups/fs/cpuset.go
+++ b/libcontainer/cgroups/fs/cpuset.go
@ -0,0 +1,160 @@
+// +build linux
+
+package fs
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+)
+
+// CpusetGroup is the stateless receiver for the "cpuset" cgroup subsystem
+// operations defined in this file.
+type CpusetGroup struct {
+}
+
+// Name returns the subsystem name, "cpuset".
+func (s *CpusetGroup) Name() string {
+	return "cpuset"
+}
+
+// Apply looks up the cpuset cgroup path for d and passes it, with d's config
+// and pid, to ApplyDir. A lookup error that satisfies cgroups.IsNotFound is
+// tolerated; any other lookup error is returned as is.
+func (s *CpusetGroup) Apply(d *cgroupData) error {
+	cpusetDir, err := d.path("cpuset")
+	if err == nil || cgroups.IsNotFound(err) {
+		return s.ApplyDir(cpusetDir, d.config, d.pid)
+	}
+	return err
+}
+
+// Set writes the non-empty cpuset settings from cgroup.Resources into the
+// cgroup directory at path: CpusetCpus to cpuset.cpus first, then CpusetMems
+// to cpuset.mems. Empty strings are skipped; the first write error is returned.
+func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error {
+	if cpus := cgroup.Resources.CpusetCpus; cpus != "" {
+		err := fscommon.WriteFile(path, "cpuset.cpus", cpus)
+		if err != nil {
+			return err
+		}
+	}
+	if mems := cgroup.Resources.CpusetMems; mems != "" {
+		err := fscommon.WriteFile(path, "cpuset.mems", mems)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Remove passes the result of looking up d's cpuset cgroup path (path and
+// error) to removePath and returns whatever removePath returns.
+func (s *CpusetGroup) Remove(d *cgroupData) error {
+	cpusetDir, err := d.path("cpuset")
+	return removePath(cpusetDir, err)
+}
+
+// GetStats is a no-op for cpuset: it leaves stats untouched and returns nil.
+func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
+	return nil
+}
+
+// ApplyDir prepares the cpuset cgroup directory dir and moves pid into it.
+//
+// In order, it: reads /proc/self/mountinfo to determine the root above the
+// closest mountpoint ancestor of dir; ensures dir's ancestors exist and have
+// cpuset.cpus/cpuset.mems populated (ensureParent); creates dir itself;
+// applies the configured cpus/mems or inherits them from the parent
+// (ensureCpusAndMems); and finally writes pid via cgroups.WriteCgroupProc.
+// An empty dir is a no-op that returns nil. The first error is returned.
+func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
+	// This might happen if we have no cpuset cgroup mounted.
+	// Just do nothing and don't fail.
+	if dir == "" {
+		return nil
+	}
+	mountInfo, err := ioutil.ReadFile("/proc/self/mountinfo")
+	if err != nil {
+		return err
+	}
+	// root is the directory containing the closest mountpoint ancestor of dir;
+	// ensureParent stops recursing once it reaches it.
+	root := filepath.Dir(cgroups.GetClosestMountpointAncestor(dir, string(mountInfo)))
+	// 'ensureParent' start with parent because we don't want to
+	// explicitly inherit from parent, it could conflict with
+	// 'cpuset.cpu_exclusive'.
+	if err := s.ensureParent(filepath.Dir(dir), root); err != nil {
+		return err
+	}
+	if err := os.MkdirAll(dir, 0755); err != nil {
+		return err
+	}
+	// We didn't inherit cpuset configs from parent, but we have
+	// to ensure cpuset configs are set before moving task into the
+	// cgroup.
+	// The logic is, if user specified cpuset configs, use these
+	// specified configs, otherwise, inherit from parent. This makes
+	// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
+	// keep backward compatibility.
+	if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
+		return err
+	}
+
+	// because we are not using d.join we need to place the pid into the procs file
+	// unlike the other subsystems
+	return cgroups.WriteCgroupProc(dir, pid)
+}
+
+// getSubsystemSettings reads the raw contents of cpuset.cpus and cpuset.mems
+// from the directory parent. If reading cpuset.cpus fails, mems is nil and the
+// error is returned without attempting the second read.
+func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) {
+	cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus"))
+	if err != nil {
+		return cpus, nil, err
+	}
+	mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems"))
+	if err != nil {
+		return cpus, mems, err
+	}
+	return cpus, mems, nil
+}
+
+// ensureParent makes sure that the directory current is created and
+// populated with the proper cpus and mems files copied from its parent.
+//
+// It recurses toward root first, so ancestors are created and populated
+// before their descendants. Recursion stops, returning nil without touching
+// current, once current's parent (after CleanPath) equals root. If the walk
+// reaches a path that is its own parent without meeting root, an error is
+// returned.
+func (s *CpusetGroup) ensureParent(current, root string) error {
+	parent := filepath.Dir(current)
+	if libcontainerUtils.CleanPath(parent) == root {
+		return nil
+	}
+	// Avoid infinite recursion.
+	if parent == current {
+		return fmt.Errorf("cpuset: cgroup parent path outside cgroup root")
+	}
+	if err := s.ensureParent(parent, root); err != nil {
+		return err
+	}
+	if err := os.MkdirAll(current, 0755); err != nil {
+		return err
+	}
+	return s.copyIfNeeded(current, parent)
+}
+
+// copyIfNeeded fills in cpuset.cpus and cpuset.mems in the current directory
+// from the parent directory, each one only if its current contents are empty
+// according to isEmpty. Both directories' settings are read up front; the
+// first read or write error is returned.
+func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
+	curCpus, curMems, err := s.getSubsystemSettings(current)
+	if err != nil {
+		return err
+	}
+	parCpus, parMems, err := s.getSubsystemSettings(parent)
+	if err != nil {
+		return err
+	}
+
+	if s.isEmpty(curCpus) {
+		err := fscommon.WriteFile(current, "cpuset.cpus", string(parCpus))
+		if err != nil {
+			return err
+		}
+	}
+	if s.isEmpty(curMems) {
+		err := fscommon.WriteFile(current, "cpuset.mems", string(parMems))
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// isEmpty reports whether b has no bytes left once leading and trailing
+// newline characters are trimmed.
+func (s *CpusetGroup) isEmpty(b []byte) bool {
+	trimmed := bytes.Trim(b, "\n")
+	return len(trimmed) == 0
+}
+
+// ensureCpusAndMems first applies the cpuset settings from cgroup to path via
+// Set, then fills any still-empty cpuset.cpus/cpuset.mems from path's parent
+// directory via copyIfNeeded.
+func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
+	err := s.Set(path, cgroup)
+	if err != nil {
+		return err
+	}
+	parentDir := filepath.Dir(path)
+	return s.copyIfNeeded(path, parentDir)
+}
--- a/libcontainer/cgroups/fs/cpuset_test.go
+++ b/libcontainer/cgroups/fs/cpuset_test.go
@ -0,0 +1,67 @@
+// +build linux
+
+package fs
+
+import (
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+)
+
+// TestCpusetSetCpus checks that CpusetGroup.Set replaces a pre-existing
+// cpuset.cpus value with the one taken from the cgroup configuration.
+func TestCpusetSetCpus(t *testing.T) {
+	const (
+		cpusBefore = "0"
+		cpusAfter  = "1-3"
+	)
+
+	helper := NewCgroupTestUtil("cpuset", t)
+	defer helper.cleanup()
+
+	helper.writeFileContents(map[string]string{"cpuset.cpus": cpusBefore})
+	helper.CgroupData.config.Resources.CpusetCpus = cpusAfter
+
+	group := &CpusetGroup{}
+	err := group.Set(helper.CgroupPath, helper.CgroupData.config)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.cpus")
+	if err != nil {
+		t.Fatalf("Failed to parse cpuset.cpus - %s", err)
+	}
+	if got != cpusAfter {
+		t.Fatal("Got the wrong value, set cpuset.cpus failed.")
+	}
+}
+
+// TestCpusetSetMems checks that CpusetGroup.Set replaces a pre-existing
+// cpuset.mems value with the one taken from the cgroup configuration.
+func TestCpusetSetMems(t *testing.T) {
+	const (
+		memsBefore = "0"
+		memsAfter  = "1"
+	)
+
+	helper := NewCgroupTestUtil("cpuset", t)
+	defer helper.cleanup()
+
+	helper.writeFileContents(map[string]string{"cpuset.mems": memsBefore})
+	helper.CgroupData.config.Resources.CpusetMems = memsAfter
+
+	group := &CpusetGroup{}
+	err := group.Set(helper.CgroupPath, helper.CgroupData.config)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.mems")
+	if err != nil {
+		t.Fatalf("Failed to parse cpuset.mems - %s", err)
+	}
+	if got != memsAfter {
+		t.Fatal("Got the wrong value, set cpuset.mems failed.")
+	}
+}
--- a/libcontainer/cgroups/fs/devices.go
+++ b/libcontainer/cgroups/fs/devices.go
@ -0,0 +1,81 @@
+// +build linux
+
+package fs
+
+import (
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/system"
+)
+
+// DevicesGroup carries the methods that operate on the "devices" cgroup
+// subsystem. It holds no state.
+type DevicesGroup struct {
+}
+
+// Name returns the subsystem name, "devices".
+func (s *DevicesGroup) Name() string {
+	return "devices"
+}
+
+// Apply calls d.join for the "devices" subsystem and returns its error
+// unchanged.
+func (s *DevicesGroup) Apply(d *cgroupData) error {
+	// Every failure is propagated, even a `not found` one: the devices
+	// cgroup is a hard requirement for the container's security.
+	_, err := d.join("devices")
+	return err
+}
+
+// Set writes the device rules found in cgroup.Resources to the
+// devices.allow / devices.deny files under path.
+//
+// Nothing is written when running inside a user namespace. Otherwise:
+//   - if Resources.Devices is non-empty, each entry is written to
+//     devices.allow or devices.deny according to its Allow flag, and the
+//     remaining fields are ignored;
+//   - else, if AllowAllDevices points to false, "a" is written to
+//     devices.deny followed by every AllowedDevices entry to devices.allow;
+//   - else, if AllowAllDevices points to true, "a" is written to
+//     devices.allow;
+//   - finally (AllowAllDevices nil or true) every DeniedDevices entry is
+//     written to devices.deny.
+//
+// The first write error aborts the operation and is returned.
+func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
+	if system.RunningInUserNS() {
+		return nil
+	}
+
+	// An explicit rule list takes precedence over the allow/deny lists.
+	if devices := cgroup.Resources.Devices; len(devices) > 0 {
+		for _, dev := range devices {
+			file := "devices.deny"
+			if dev.Allow {
+				file = "devices.allow"
+			}
+			if err := fscommon.WriteFile(path, file, dev.CgroupString()); err != nil {
+				return err
+			}
+		}
+		return nil
+	}
+
+	if allowAll := cgroup.Resources.AllowAllDevices; allowAll != nil {
+		if !*allowAll {
+			// Deny everything first, then open up the allowed devices.
+			if err := fscommon.WriteFile(path, "devices.deny", "a"); err != nil {
+				return err
+			}
+
+			for _, dev := range cgroup.Resources.AllowedDevices {
+				if err := fscommon.WriteFile(path, "devices.allow", dev.CgroupString()); err != nil {
+					return err
+				}
+			}
+			return nil
+		}
+
+		if err := fscommon.WriteFile(path, "devices.allow", "a"); err != nil {
+			return err
+		}
+	}
+
+	for _, dev := range cgroup.Resources.DeniedDevices {
+		if err := fscommon.WriteFile(path, "devices.deny", dev.CgroupString()); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// Remove resolves the "devices" path of d and hands the result (path and
+// lookup error) to removePath.
+func (s *DevicesGroup) Remove(d *cgroupData) error {
+	devicesPath, err := d.path("devices")
+	return removePath(devicesPath, err)
+}
+
+// GetStats is a no-op for the devices subsystem: it leaves stats untouched
+// and always returns nil.
+func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {
+	return nil
+}
--- a/libcontainer/cgroups/fs/devices_test.go
+++ b/libcontainer/cgroups/fs/devices_test.go
@ -0,0 +1,99 @@
+// +build linux
+
+package fs
+
+import (
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// Fixtures shared by the devices tests: one device per list together with
+// the string the tests expect to read back from the cgroup file.
+var (
+	// allowedDevices is assigned to Resources.AllowedDevices in the tests.
+	allowedDevices = []*configs.Device{
+		{
+			Path:        "/dev/zero",
+			Type:        'c',
+			Major:       1,
+			Minor:       5,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+	}
+	// allowedList is the expected devices.allow content for allowedDevices.
+	allowedList   = "c 1:5 rwm"
+	// deniedDevices is assigned to Resources.DeniedDevices in the tests.
+	deniedDevices = []*configs.Device{
+		{
+			Path:        "/dev/null",
+			Type:        'c',
+			Major:       1,
+			Minor:       3,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+	}
+	// deniedList is the expected devices.deny content for deniedDevices.
+	deniedList = "c 1:3 rwm"
+)
+
+// TestDevicesSetAllow checks that, with AllowAllDevices set to false, Set
+// writes the allowed device to devices.allow, and that a second Set call
+// with AllowAllDevices == nil leaves that file unchanged.
+func TestDevicesSetAllow(t *testing.T) {
+	helper := NewCgroupTestUtil("devices", t)
+	defer helper.cleanup()
+
+	helper.writeFileContents(map[string]string{"devices.deny": "a"})
+
+	// readAllowList returns the current devices.allow content or aborts
+	// the test when it cannot be read.
+	readAllowList := func() string {
+		content, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow")
+		if err != nil {
+			t.Fatalf("Failed to parse devices.allow - %s", err)
+		}
+		return content
+	}
+
+	allowAll := false
+	helper.CgroupData.config.Resources.AllowAllDevices = &allowAll
+	helper.CgroupData.config.Resources.AllowedDevices = allowedDevices
+
+	group := &DevicesGroup{}
+	if err := group.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+	if readAllowList() != allowedList {
+		t.Fatal("Got the wrong value, set devices.allow failed.")
+	}
+
+	// When AllowAllDevices is nil, devices.allow file should not be modified.
+	helper.CgroupData.config.Resources.AllowAllDevices = nil
+	if err := group.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+	if readAllowList() != allowedList {
+		t.Fatal("devices policy shouldn't have changed on AllowedAllDevices=nil.")
+	}
+}
+
+// TestDevicesSetDeny checks that, with AllowAllDevices set to true, Set
+// writes the denied device to devices.deny.
+func TestDevicesSetDeny(t *testing.T) {
+	helper := NewCgroupTestUtil("devices", t)
+	defer helper.cleanup()
+
+	helper.writeFileContents(map[string]string{"devices.allow": "a"})
+
+	allowAll := true
+	helper.CgroupData.config.Resources.AllowAllDevices = &allowAll
+	helper.CgroupData.config.Resources.DeniedDevices = deniedDevices
+
+	group := &DevicesGroup{}
+	err := group.Set(helper.CgroupPath, helper.CgroupData.config)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.deny")
+	if err != nil {
+		t.Fatalf("Failed to parse devices.deny - %s", err)
+	}
+	if got != deniedList {
+		t.Fatal("Got the wrong value, set devices.deny failed.")
+	}
+}
--- a/libcontainer/cgroups/fs/freezer.go
+++ b/libcontainer/cgroups/fs/freezer.go
@ -0,0 +1,67 @@
+// +build linux
+
+package fs
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// FreezerGroup carries the methods that operate on the "freezer" cgroup
+// subsystem. It holds no state.
+type FreezerGroup struct {
+}
+
+// Name returns the subsystem name, "freezer".
+func (s *FreezerGroup) Name() string {
+	return "freezer"
+}
+
+// Apply calls d.join for the "freezer" subsystem. An error for which
+// cgroups.IsNotFound reports true is ignored; any other error is returned.
+func (s *FreezerGroup) Apply(d *cgroupData) error {
+	_, err := d.join("freezer")
+	if err == nil || cgroups.IsNotFound(err) {
+		return nil
+	}
+	return err
+}
+
+// Set drives freezer.state under path to the state requested in
+// cgroup.Resources.Freezer.
+//
+// configs.Undefined is a no-op. For configs.Frozen and configs.Thawed the
+// state is written and read back repeatedly, sleeping one millisecond
+// between attempts, until the file reports the requested state; there is
+// no upper bound on the number of attempts. Any other value yields an
+// error, as does a failing read or write.
+func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
+	switch cgroup.Resources.Freezer {
+	case configs.Undefined:
+		return nil
+	case configs.Frozen, configs.Thawed:
+		// Valid target state; handled by the loop below.
+	default:
+		return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
+	}
+
+	for {
+		// The state is rewritten on every pass rather than only once:
+		// if the first write did not take effect, merely polling would
+		// wait for a change that never comes.
+		wanted := string(cgroup.Resources.Freezer)
+		if err := fscommon.WriteFile(path, "freezer.state", wanted); err != nil {
+			return err
+		}
+
+		current, err := fscommon.ReadFile(path, "freezer.state")
+		if err != nil {
+			return err
+		}
+		if strings.TrimSpace(current) == string(cgroup.Resources.Freezer) {
+			return nil
+		}
+
+		time.Sleep(1 * time.Millisecond)
+	}
+}
+
+// Remove resolves the "freezer" path of d and hands the result (path and
+// lookup error) to removePath.
+func (s *FreezerGroup) Remove(d *cgroupData) error {
+	freezerPath, err := d.path("freezer")
+	return removePath(freezerPath, err)
+}
+
+// GetStats is a no-op for the freezer subsystem: it leaves stats untouched
+// and always returns nil.
+func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
+	return nil
+}
--- a/libcontainer/cgroups/fs/freezer_test.go
+++ b/libcontainer/cgroups/fs/freezer_test.go
@ -0,0 +1,48 @@
+// +build linux
+
+package fs
+
+import (
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// TestFreezerSetState checks that FreezerGroup.Set switches freezer.state
+// from Frozen to Thawed.
+func TestFreezerSetState(t *testing.T) {
+	helper := NewCgroupTestUtil("freezer", t)
+	defer helper.cleanup()
+
+	initial := map[string]string{"freezer.state": string(configs.Frozen)}
+	helper.writeFileContents(initial)
+
+	helper.CgroupData.config.Resources.Freezer = configs.Thawed
+	group := &FreezerGroup{}
+	err := group.Set(helper.CgroupPath, helper.CgroupData.config)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamString(helper.CgroupPath, "freezer.state")
+	if err != nil {
+		t.Fatalf("Failed to parse freezer.state - %s", err)
+	}
+	if got != string(configs.Thawed) {
+		t.Fatal("Got the wrong value, set freezer.state failed.")
+	}
+}
+
+// TestFreezerSetInvalidState checks that FreezerGroup.Set rejects a freezer
+// state that is not one of the known constants.
+func TestFreezerSetInvalidState(t *testing.T) {
+	const invalidArg configs.FreezerState = "Invalid"
+
+	helper := NewCgroupTestUtil("freezer", t)
+	defer helper.cleanup()
+
+	helper.CgroupData.config.Resources.Freezer = invalidArg
+	group := &FreezerGroup{}
+	err := group.Set(helper.CgroupPath, helper.CgroupData.config)
+	if err == nil {
+		t.Fatal("Failed to return invalid argument error")
+	}
+}
--- a/libcontainer/cgroups/fs/fs_unsupported.go
+++ b/libcontainer/cgroups/fs/fs_unsupported.go
@ -0,0 +1,3 @@
+// +build !linux
+
+package fs
--- a/libcontainer/cgroups/fs/hugetlb.go
+++ b/libcontainer/cgroups/fs/hugetlb.go
@ -0,0 +1,72 @@
+// +build linux
+
+package fs
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// HugetlbGroup carries the methods that operate on the "hugetlb" cgroup
+// subsystem. It holds no state.
+type HugetlbGroup struct {
+}
+
+// Name returns the subsystem name, "hugetlb".
+func (s *HugetlbGroup) Name() string {
+	return "hugetlb"
+}
+
+// Apply calls d.join for the "hugetlb" subsystem. An error for which
+// cgroups.IsNotFound reports true is ignored; any other error is returned.
+func (s *HugetlbGroup) Apply(d *cgroupData) error {
+	_, err := d.join("hugetlb")
+	if err == nil || cgroups.IsNotFound(err) {
+		return nil
+	}
+	return err
+}
+
+// Set writes every entry of cgroup.Resources.HugetlbLimit to the matching
+// hugetlb.<pagesize>.limit_in_bytes file under path, stopping at the first
+// write error.
+func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
+	for _, entry := range cgroup.Resources.HugetlbLimit {
+		limitFile := strings.Join([]string{"hugetlb", entry.Pagesize, "limit_in_bytes"}, ".")
+		limitValue := strconv.FormatUint(entry.Limit, 10)
+		if err := fscommon.WriteFile(path, limitFile, limitValue); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// Remove resolves the "hugetlb" path of d and hands the result (path and
+// lookup error) to removePath.
+func (s *HugetlbGroup) Remove(d *cgroupData) error {
+	hugetlbPath, err := d.path("hugetlb")
+	return removePath(hugetlbPath, err)
+}
+
+// GetStats reads, for every page size in HugePageSizes, the usage_in_bytes,
+// max_usage_in_bytes and failcnt files under path and stores them in
+// stats.HugetlbStats keyed by page size. The first file that cannot be
+// read or parsed aborts the collection with an error naming that file.
+func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
+	for _, pageSize := range HugePageSizes {
+		usageFile := strings.Join([]string{"hugetlb", pageSize, "usage_in_bytes"}, ".")
+		usage, err := fscommon.GetCgroupParamUint(path, usageFile)
+		if err != nil {
+			return fmt.Errorf("failed to parse %s - %v", usageFile, err)
+		}
+
+		maxUsageFile := strings.Join([]string{"hugetlb", pageSize, "max_usage_in_bytes"}, ".")
+		maxUsage, err := fscommon.GetCgroupParamUint(path, maxUsageFile)
+		if err != nil {
+			return fmt.Errorf("failed to parse %s - %v", maxUsageFile, err)
+		}
+
+		failcntFile := strings.Join([]string{"hugetlb", pageSize, "failcnt"}, ".")
+		failcnt, err := fscommon.GetCgroupParamUint(path, failcntFile)
+		if err != nil {
+			return fmt.Errorf("failed to parse %s - %v", failcntFile, err)
+		}
+
+		stats.HugetlbStats[pageSize] = cgroups.HugetlbStats{
+			Usage:    usage,
+			MaxUsage: maxUsage,
+			Failcnt:  failcnt,
+		}
+	}
+
+	return nil
+}
--- a/libcontainer/cgroups/fs/hugetlb_test.go
+++ b/libcontainer/cgroups/fs/hugetlb_test.go
@ -0,0 +1,155 @@
+// +build linux
+
+package fs
+
+import (
+	"fmt"
+	"strconv"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// Raw file contents written into the fake hugetlb cgroup by the stats tests.
+const (
+	hugetlbUsageContents    = "128\n"
+	hugetlbMaxUsageContents = "256\n"
+	hugetlbFailcnt          = "100\n"
+)
+
+// File name templates for the per-page-size hugetlb files. Each contains a
+// %s verb that must be filled with a page size via fmt.Sprintf before the
+// result is used as a file name.
+var (
+	usage    = "hugetlb.%s.usage_in_bytes"
+	limit    = "hugetlb.%s.limit_in_bytes"
+	maxUsage = "hugetlb.%s.max_usage_in_bytes"
+	failcnt  = "hugetlb.%s.failcnt"
+)
+
+// TestHugetlbSetHugetlb checks that HugetlbGroup.Set overwrites the limit
+// file of every page size in HugePageSizes with the configured value.
+func TestHugetlbSetHugetlb(t *testing.T) {
+	const (
+		hugetlbBefore = 256
+		hugetlbAfter  = 512
+	)
+
+	helper := NewCgroupTestUtil("hugetlb", t)
+	defer helper.cleanup()
+
+	// Seed every limit file with the old value.
+	for _, pageSize := range HugePageSizes {
+		limitFile := fmt.Sprintf(limit, pageSize)
+		helper.writeFileContents(map[string]string{
+			limitFile: strconv.Itoa(hugetlbBefore),
+		})
+	}
+
+	// Apply the new limit, one page size per Set call.
+	for _, pageSize := range HugePageSizes {
+		newLimit := &configs.HugepageLimit{Pagesize: pageSize, Limit: hugetlbAfter}
+		helper.CgroupData.config.Resources.HugetlbLimit = []*configs.HugepageLimit{newLimit}
+
+		group := &HugetlbGroup{}
+		if err := group.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Every limit file must now hold the new value.
+	for _, pageSize := range HugePageSizes {
+		limitFile := fmt.Sprintf(limit, pageSize)
+		got, err := fscommon.GetCgroupParamUint(helper.CgroupPath, limitFile)
+		if err != nil {
+			t.Fatalf("Failed to parse %s - %s", limitFile, err)
+		}
+		if got != hugetlbAfter {
+			t.Fatalf("Set hugetlb.limit_in_bytes failed. Expected: %v, Got: %v", hugetlbAfter, got)
+		}
+	}
+}
+
+// TestHugetlbStats checks that GetStats reports the usage, max usage and
+// fail count written for every page size in HugePageSizes.
+func TestHugetlbStats(t *testing.T) {
+	helper := NewCgroupTestUtil("hugetlb", t)
+	defer helper.cleanup()
+
+	for _, pageSize := range HugePageSizes {
+		files := map[string]string{
+			fmt.Sprintf(usage, pageSize):    hugetlbUsageContents,
+			fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents,
+			fmt.Sprintf(failcnt, pageSize):  hugetlbFailcnt,
+		}
+		helper.writeFileContents(files)
+	}
+
+	group := &HugetlbGroup{}
+	actualStats := *cgroups.NewStats()
+	if err := group.GetStats(helper.CgroupPath, &actualStats); err != nil {
+		t.Fatal(err)
+	}
+
+	want := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100}
+	for _, pageSize := range HugePageSizes {
+		expectHugetlbStatEquals(t, want, actualStats.HugetlbStats[pageSize])
+	}
+}
+
+// TestHugetlbStatsNoUsageFile checks that GetStats fails when the
+// max-usage file exists for every page size but the usage file does not.
+//
+// The max-usage file name is built with fmt.Sprintf per page size; using
+// the raw template as a key would create a file literally named
+// "hugetlb.%s.max_usage_in_bytes" that GetStats never looks at.
+func TestHugetlbStatsNoUsageFile(t *testing.T) {
+	helper := NewCgroupTestUtil("hugetlb", t)
+	defer helper.cleanup()
+	for _, pageSize := range HugePageSizes {
+		helper.writeFileContents(map[string]string{
+			fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents,
+		})
+	}
+
+	hugetlb := &HugetlbGroup{}
+	actualStats := *cgroups.NewStats()
+	err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+	if err == nil {
+		t.Fatal("Expected failure")
+	}
+}
+
+// TestHugetlbStatsNoMaxUsageFile checks that GetStats fails when only the
+// usage file is present for each page size.
+func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
+	helper := NewCgroupTestUtil("hugetlb", t)
+	defer helper.cleanup()
+
+	for _, pageSize := range HugePageSizes {
+		usageFile := fmt.Sprintf(usage, pageSize)
+		helper.writeFileContents(map[string]string{usageFile: hugetlbUsageContents})
+	}
+
+	group := &HugetlbGroup{}
+	actualStats := *cgroups.NewStats()
+	if err := group.GetStats(helper.CgroupPath, &actualStats); err == nil {
+		t.Fatal("Expected failure")
+	}
+}
+
+// TestHugetlbStatsBadUsageFile checks that GetStats fails when the usage
+// file of a page size holds a non-numeric value.
+//
+// Both file names are built with fmt.Sprintf per page size so that the
+// max-usage file is a real, valid file and the only defect in the fixture
+// is the bad usage content.
+func TestHugetlbStatsBadUsageFile(t *testing.T) {
+	helper := NewCgroupTestUtil("hugetlb", t)
+	defer helper.cleanup()
+	for _, pageSize := range HugePageSizes {
+		helper.writeFileContents(map[string]string{
+			fmt.Sprintf(usage, pageSize):    "bad",
+			fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents,
+		})
+	}
+
+	hugetlb := &HugetlbGroup{}
+	actualStats := *cgroups.NewStats()
+	err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+	if err == nil {
+		t.Fatal("Expected failure")
+	}
+}
+
+// TestHugetlbStatsBadMaxUsageFile checks that GetStats fails when the
+// max-usage file of a page size holds a non-numeric value.
+//
+// The fixture writes a valid usage file and a bad max-usage file for every
+// page size, with names built via fmt.Sprintf. With the raw templates as
+// keys neither file would exist under the name GetStats reads, and the
+// test would pass because of the missing usage file instead.
+func TestHugetlbStatsBadMaxUsageFile(t *testing.T) {
+	helper := NewCgroupTestUtil("hugetlb", t)
+	defer helper.cleanup()
+	for _, pageSize := range HugePageSizes {
+		helper.writeFileContents(map[string]string{
+			fmt.Sprintf(usage, pageSize):    hugetlbUsageContents,
+			fmt.Sprintf(maxUsage, pageSize): "bad",
+		})
+	}
+
+	hugetlb := &HugetlbGroup{}
+	actualStats := *cgroups.NewStats()
+	err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+	if err == nil {
+		t.Fatal("Expected failure")
+	}
+}
--- a/libcontainer/cgroups/fs/kmem.go
+++ b/libcontainer/cgroups/fs/kmem.go
@ -0,0 +1,62 @@
+// +build linux,!nokmem
+
+package fs
+
+import (
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strconv"
+	"syscall" // for Errno type only
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"golang.org/x/sys/unix"
+)
+
+// cgroupKernelMemoryLimit is the name of the memory cgroup file through
+// which the kernel memory limit is set.
+const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
+
+// EnableKernelMemoryAccounting turns on kernel memory accounting for the
+// memory cgroup at path by setting the kmem limit to 1 and then to -1.
+// It returns nil without doing anything when the kmem limit file does not
+// exist under path.
+func EnableKernelMemoryAccounting(path string) error {
+	// Kernel memory may be unavailable in this kernel build. That is not
+	// an error here because this function is called automatically for all
+	// memory limits.
+	limitFile := filepath.Join(path, cgroupKernelMemoryLimit)
+	if !cgroups.PathExists(limitFile) {
+		return nil
+	}
+	// Accounting only starts once a limit has been set, and a limit cannot
+	// be set once the cgroup has children or tasks. So set a limit now,
+	// then replace it with -1.
+	if err := setKernelMemory(path, 1); err != nil {
+		return err
+	}
+	return setKernelMemory(path, -1)
+}
+
+// setKernelMemory writes kernelMemoryLimit to the kmem limit file of the
+// memory cgroup at path.
+//
+// It fails when path is empty, when the limit file does not exist, or when
+// the write fails. A write failing with EBUSY gets a dedicated message.
+func setKernelMemory(path string, kernelMemoryLimit int64) error {
+	if path == "" {
+		return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
+	}
+	limitFile := filepath.Join(path, cgroupKernelMemoryLimit)
+	if !cgroups.PathExists(limitFile) {
+		// We have specifically been asked to set a kmem limit. If the kernel
+		// doesn't support it we *must* error out.
+		return errors.New("kernel memory accounting not supported by this kernel")
+	}
+
+	payload := []byte(strconv.FormatInt(kernelMemoryLimit, 10))
+	err := ioutil.WriteFile(limitFile, payload, 0700)
+	if err == nil {
+		return nil
+	}
+
+	// The kernel answers EBUSY to writes on memory.kmem.limit_in_bytes
+	// when the cgroup has children or once tasks have been attached to it.
+	// Unwrap the *os.PathError to look at the underlying errno.
+	if pathErr, ok := err.(*os.PathError); ok {
+		if errNo, ok := pathErr.Err.(syscall.Errno); ok && errNo == unix.EBUSY {
+			return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
+		}
+	}
+	return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
+}
--- a/libcontainer/cgroups/fs/kmem_disabled.go
+++ b/libcontainer/cgroups/fs/kmem_disabled.go
@ -0,0 +1,15 @@
+// +build linux,nokmem
+
+package fs
+
+import (
+	"errors"
+)
+
+// EnableKernelMemoryAccounting is the variant compiled under the nokmem
+// build tag: it does nothing and always returns nil.
+func EnableKernelMemoryAccounting(path string) error {
+	return nil
+}
+
+// setKernelMemory is the variant compiled under the nokmem build tag: it
+// ignores its arguments and always returns an error saying that kernel
+// memory accounting is disabled in this build.
+func setKernelMemory(path string, kernelMemoryLimit int64) error {
+	return errors.New("kernel memory accounting disabled in this runc build")
+}
--- a/libcontainer/cgroups/fs/memory.go
+++ b/libcontainer/cgroups/fs/memory.go
@ -0,0 +1,271 @@
+// +build linux
+
+package fs
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// Names of the memory cgroup files that hold the limits.
+const (
+	// cgroupMemorySwapLimit is the file written with Resources.MemorySwap.
+	cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
+	// cgroupMemoryLimit is the file written with Resources.Memory.
+	cgroupMemoryLimit     = "memory.limit_in_bytes"
+)
+
+// MemoryGroup carries the methods that operate on the "memory" cgroup
+// subsystem. It holds no state.
+type MemoryGroup struct {
+}
+
+// Name returns the subsystem name, "memory".
+func (s *MemoryGroup) Name() string {
+	return "memory"
+}
+
+// Apply prepares the memory cgroup directory for d and then joins it.
+//
+// A path lookup error is returned unless cgroups.IsNotFound reports true
+// for it; an empty path makes Apply return nil without doing anything.
+// When memoryAssigned(d.config) is true and the directory does not exist
+// yet, it is created and kernel memory accounting is enabled on it before
+// joining. If the join fails with an error other than "not found", the
+// deferred cleanup removes the directory tree at path.
+func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
+	path, err := d.path("memory")
+	if err != nil && !cgroups.IsNotFound(err) {
+		return err
+	} else if path == "" {
+		return nil
+	}
+	if memoryAssigned(d.config) {
+		if _, err := os.Stat(path); os.IsNotExist(err) {
+			if err := os.MkdirAll(path, 0755); err != nil {
+				return err
+			}
+			// Only enable kernel memory accounting when this cgroup
+			// is created by libcontainer, otherwise we might get
+			// an error when people use `cgroupsPath` to join an existing
+			// cgroup whose kernel memory is not initialized.
+			if err := EnableKernelMemoryAccounting(path); err != nil {
+				return err
+			}
+		}
+	}
+	// The cleanup is registered only at this point, so the early returns
+	// above leave the directory in place. It inspects the named result err
+	// when the function returns.
+	defer func() {
+		if err != nil {
+			os.RemoveAll(path)
+		}
+	}()
+
+	// We need to join the memory cgroup after setting memory limits, because
+	// kmem.limit_in_bytes can only be set when the cgroup is empty.
+	_, err = d.join("memory")
+	if err != nil && !cgroups.IsNotFound(err) {
+		return err
+	}
+	return nil
+}
+
+// setMemoryAndSwap writes the memory and memory+swap limits from
+// cgroup.Resources to the cgroup at path. A value of 0 means "leave the
+// file alone".
+//
+// When Memory is -1 and the swap limit file exists, MemorySwap is forced to
+// -1 in cgroup.Resources as well. When both limits are to be written, the
+// current limit is read first to decide which file must be written first.
+func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
+	// Unlimited memory implies unlimited swap, provided swap accounting
+	// is enabled in the kernel (i.e. the file exists).
+	if cgroup.Resources.Memory == -1 && cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
+		cgroup.Resources.MemorySwap = -1
+	}
+
+	writeMemory := func() error {
+		return fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10))
+	}
+	writeSwap := func() error {
+		return fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10))
+	}
+
+	memorySet := cgroup.Resources.Memory != 0
+	swapSet := cgroup.Resources.MemorySwap != 0
+
+	switch {
+	case memorySet && swapSet:
+		memoryUsage, err := getMemoryData(path, "")
+		if err != nil {
+			return err
+		}
+
+		// The write order is adapted to the old and new values so that
+		// neither write is rejected by the kernel's validation: swap goes
+		// first when it becomes unlimited or exceeds the current memory
+		// limit, memory goes first otherwise.
+		if cgroup.Resources.MemorySwap == -1 || memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
+			if err := writeSwap(); err != nil {
+				return err
+			}
+			return writeMemory()
+		}
+		if err := writeMemory(); err != nil {
+			return err
+		}
+		return writeSwap()
+	case memorySet:
+		return writeMemory()
+	case swapSet:
+		return writeSwap()
+	}
+
+	return nil
+}
+
+// Set applies the memory settings from cgroup.Resources to the cgroup at
+// path, in this order: memory/swap limits, kernel memory limit, soft limit
+// (reservation), kernel TCP memory limit, OOM-killer disable flag and
+// swappiness. Numeric settings equal to 0 are skipped. Swappiness is
+// skipped when nil or -1 and rejected when above 100. The first failing
+// step aborts the sequence and its error is returned.
+func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
+	if err := setMemoryAndSwap(path, cgroup); err != nil {
+		return err
+	}
+
+	if kmem := cgroup.Resources.KernelMemory; kmem != 0 {
+		if err := setKernelMemory(path, kmem); err != nil {
+			return err
+		}
+	}
+
+	if reservation := cgroup.Resources.MemoryReservation; reservation != 0 {
+		if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(reservation, 10)); err != nil {
+			return err
+		}
+	}
+
+	if kmemTCP := cgroup.Resources.KernelMemoryTCP; kmemTCP != 0 {
+		if err := fscommon.WriteFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(kmemTCP, 10)); err != nil {
+			return err
+		}
+	}
+
+	if cgroup.Resources.OomKillDisable {
+		if err := fscommon.WriteFile(path, "memory.oom_control", "1"); err != nil {
+			return err
+		}
+	}
+
+	swappiness := cgroup.Resources.MemorySwappiness
+	switch {
+	case swappiness == nil || int64(*swappiness) == -1:
+		// Not configured: leave memory.swappiness alone.
+		return nil
+	case *swappiness <= 100:
+		return fscommon.WriteFile(path, "memory.swappiness", strconv.FormatUint(*swappiness, 10))
+	default:
+		return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *swappiness)
+	}
+}
+
+// Remove resolves the "memory" path of d and hands the result (path and
+// lookup error) to removePath.
+func (s *MemoryGroup) Remove(d *cgroupData) error {
+	memoryPath, err := d.path("memory")
+	return removePath(memoryPath, err)
+}
+
+// GetStats fills stats.MemoryStats from the memory cgroup at path.
+//
+// It returns nil without touching stats when memory.stat does not exist.
+// Otherwise every line of memory.stat is stored in MemoryStats.Stats, the
+// "cache" entry is copied to MemoryStats.Cache, and the usage data for
+// memory, memsw, kmem and kmem.tcp plus the use_hierarchy flag are read.
+// An error is returned when memory.stat cannot be opened, read or parsed,
+// or when any of the subsequent reads fails.
+func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
+	// Set stats from memory.stat.
+	statsFile, err := os.Open(filepath.Join(path, "memory.stat"))
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil
+		}
+		return err
+	}
+	defer statsFile.Close()
+
+	sc := bufio.NewScanner(statsFile)
+	for sc.Scan() {
+		t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
+		if err != nil {
+			return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err)
+		}
+		stats.MemoryStats.Stats[t] = v
+	}
+	// Scan returns false on a read error as well as on EOF; report the
+	// former instead of silently returning partial statistics.
+	if err := sc.Err(); err != nil {
+		return fmt.Errorf("failed to read memory.stat - %v", err)
+	}
+	stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]
+
+	memoryUsage, err := getMemoryData(path, "")
+	if err != nil {
+		return err
+	}
+	stats.MemoryStats.Usage = memoryUsage
+	swapUsage, err := getMemoryData(path, "memsw")
+	if err != nil {
+		return err
+	}
+	stats.MemoryStats.SwapUsage = swapUsage
+	kernelUsage, err := getMemoryData(path, "kmem")
+	if err != nil {
+		return err
+	}
+	stats.MemoryStats.KernelUsage = kernelUsage
+	kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
+	if err != nil {
+		return err
+	}
+	stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
+
+	useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".")
+	value, err := fscommon.GetCgroupParamUint(path, useHierarchy)
+	if err != nil {
+		return err
+	}
+	if value == 1 {
+		stats.MemoryStats.UseHierarchy = true
+	}
+	return nil
+}
+
+// memoryAssigned reports whether cgroup carries any memory-related setting:
+// a non-zero Memory or MemoryReservation, a positive MemorySwap,
+// KernelMemory or KernelMemoryTCP, OomKillDisable, or a MemorySwappiness
+// that is neither nil nor -1.
+func memoryAssigned(cgroup *configs.Cgroup) bool {
+	switch {
+	case cgroup.Resources.Memory != 0,
+		cgroup.Resources.MemoryReservation != 0,
+		cgroup.Resources.MemorySwap > 0,
+		cgroup.Resources.KernelMemory > 0,
+		cgroup.Resources.KernelMemoryTCP > 0,
+		cgroup.Resources.OomKillDisable:
+		return true
+	}
+	swappiness := cgroup.Resources.MemorySwappiness
+	return swappiness != nil && int64(*swappiness) != -1
+}
+
+// getMemoryData reads the usage_in_bytes, max_usage_in_bytes, failcnt and
+// limit_in_bytes files of one memory accounting module under path.
+//
+// name selects the module: "" reads memory.*, any other value reads
+// memory.<name>.* (for example "memsw" or "kmem"). For a named module a
+// missing file yields a zero MemoryData and no error; every other failure
+// is reported with the name of the offending file.
+func getMemoryData(path, name string) (cgroups.MemoryData, error) {
+	moduleName := "memory"
+	if name != "" {
+		moduleName = strings.Join([]string{"memory", name}, ".")
+	}
+
+	var data cgroups.MemoryData
+	// The files are read in this order; each value lands in its field.
+	fields := []struct {
+		suffix string
+		dst    *uint64
+	}{
+		{"usage_in_bytes", &data.Usage},
+		{"max_usage_in_bytes", &data.MaxUsage},
+		{"failcnt", &data.Failcnt},
+		{"limit_in_bytes", &data.Limit},
+	}
+
+	for _, field := range fields {
+		file := strings.Join([]string{moduleName, field.suffix}, ".")
+		value, err := fscommon.GetCgroupParamUint(path, file)
+		if err != nil {
+			// Only the plain "memory" module is mandatory.
+			if moduleName != "memory" && os.IsNotExist(err) {
+				return cgroups.MemoryData{}, nil
+			}
+			return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", file, err)
+		}
+		*field.dst = value
+	}
+
+	return data, nil
+}
--- a/libcontainer/cgroups/fs/memory_test.go
+++ b/libcontainer/cgroups/fs/memory_test.go
@ -0,0 +1,456 @@
+// +build linux
+
+package fs
+
+import (
+	"strconv"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+)
+
+// Canned contents written into the mock memory cgroup files by the stats
+// tests in this file.
+const (
+	memoryStatContents = `cache 512
+rss 1024`
+	memoryUsageContents        = "2048\n"
+	memoryMaxUsageContents     = "4096\n"
+	memoryFailcnt              = "100\n"
+	memoryLimitContents        = "8192\n"
+	memoryUseHierarchyContents = "1\n"
+)
+
+// TestMemorySetMemory seeds memory.limit_in_bytes and
+// memory.soft_limit_in_bytes, calls MemoryGroup.Set with a new memory limit
+// and reservation, and requires both files to read back the new values.
+func TestMemorySetMemory(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	const (
+		memoryBefore      = 314572800 // 300M
+		memoryAfter       = 524288000 // 500M
+		reservationBefore = 209715200 // 200M
+		reservationAfter  = 314572800 // 300M
+	)
+
+	// Seed the mock cgroup with the "before" values.
+	seed := map[string]string{
+		"memory.limit_in_bytes":      strconv.Itoa(memoryBefore),
+		"memory.soft_limit_in_bytes": strconv.Itoa(reservationBefore),
+	}
+	helper.writeFileContents(seed)
+
+	resources := helper.CgroupData.config.Resources
+	resources.Memory = memoryAfter
+	resources.MemoryReservation = reservationAfter
+	if err := (&MemoryGroup{}).Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	limit, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
+	if err != nil {
+		t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
+	}
+	if limit != memoryAfter {
+		t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
+	}
+
+	softLimit, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.soft_limit_in_bytes")
+	if err != nil {
+		t.Fatalf("Failed to parse memory.soft_limit_in_bytes - %s", err)
+	}
+	if softLimit != reservationAfter {
+		t.Fatal("Got the wrong value, set memory.soft_limit_in_bytes failed.")
+	}
+}
+
+// TestMemorySetMemoryswap seeds memory.memsw.limit_in_bytes, calls
+// MemoryGroup.Set with a new MemorySwap value, and requires the file to read
+// back that new value.
+func TestMemorySetMemoryswap(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	const (
+		memoryswapBefore = 314572800 // 300M
+		memoryswapAfter  = 524288000 // 500M
+	)
+
+	seed := map[string]string{
+		"memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
+	}
+	helper.writeFileContents(seed)
+
+	helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
+	if err := (&MemoryGroup{}).Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
+	if err != nil {
+		t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
+	}
+	if got != memoryswapAfter {
+		t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
+	}
+}
+
+// TestMemorySetMemoryLargerThanSwap starts from a 300M/500M memory/memsw
+// limit, asks MemoryGroup.Set for 600M/800M (a new memory limit above the old
+// memsw limit), and requires both files to read back the new values.
+func TestMemorySetMemoryLargerThanSwap(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	const (
+		memoryBefore     = 314572800 // 300M
+		memoryswapBefore = 524288000 // 500M
+		memoryAfter      = 629145600 // 600M
+		memoryswapAfter  = 838860800 // 800M
+	)
+
+	seed := map[string]string{
+		"memory.limit_in_bytes":       strconv.Itoa(memoryBefore),
+		"memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
+		// Set will call getMemoryData when memory and swap memory are
+		// both set, fake these fields so we don't get error.
+		"memory.usage_in_bytes":     "0",
+		"memory.max_usage_in_bytes": "0",
+		"memory.failcnt":            "0",
+	}
+	helper.writeFileContents(seed)
+
+	resources := helper.CgroupData.config.Resources
+	resources.Memory = memoryAfter
+	resources.MemorySwap = memoryswapAfter
+	if err := (&MemoryGroup{}).Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	// Read both limits back, memory first and memsw second.
+	for _, c := range []struct {
+		file string
+		want uint64
+	}{
+		{"memory.limit_in_bytes", memoryAfter},
+		{"memory.memsw.limit_in_bytes", memoryswapAfter},
+	} {
+		got, err := fscommon.GetCgroupParamUint(helper.CgroupPath, c.file)
+		if err != nil {
+			t.Fatalf("Failed to parse %s - %s", c.file, err)
+		}
+		if got != c.want {
+			t.Fatal("Got the wrong value, set " + c.file + " failed.")
+		}
+	}
+}
+
+// TestMemorySetSwapSmallerThanMemory starts from a 600M/800M memory/memsw
+// limit, asks MemoryGroup.Set for 300M/500M (a new memsw limit below the old
+// memory limit), and requires both files to read back the new values.
+func TestMemorySetSwapSmallerThanMemory(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	const (
+		memoryBefore     = 629145600 // 600M
+		memoryswapBefore = 838860800 // 800M
+		memoryAfter      = 314572800 // 300M
+		memoryswapAfter  = 524288000 // 500M
+	)
+
+	seed := map[string]string{
+		"memory.limit_in_bytes":       strconv.Itoa(memoryBefore),
+		"memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
+		// Set will call getMemoryData when memory and swap memory are
+		// both set, fake these fields so we don't get error.
+		"memory.usage_in_bytes":     "0",
+		"memory.max_usage_in_bytes": "0",
+		"memory.failcnt":            "0",
+	}
+	helper.writeFileContents(seed)
+
+	resources := helper.CgroupData.config.Resources
+	resources.Memory = memoryAfter
+	resources.MemorySwap = memoryswapAfter
+	if err := (&MemoryGroup{}).Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	// Read both limits back, memory first and memsw second.
+	for _, c := range []struct {
+		file string
+		want uint64
+	}{
+		{"memory.limit_in_bytes", memoryAfter},
+		{"memory.memsw.limit_in_bytes", memoryswapAfter},
+	} {
+		got, err := fscommon.GetCgroupParamUint(helper.CgroupPath, c.file)
+		if err != nil {
+			t.Fatalf("Failed to parse %s - %s", c.file, err)
+		}
+		if got != c.want {
+			t.Fatal("Got the wrong value, set " + c.file + " failed.")
+		}
+	}
+}
+
+// TestMemorySetKernelMemory seeds memory.kmem.limit_in_bytes, calls
+// MemoryGroup.Set with a new KernelMemory value, and requires the file to
+// read back that new value.
+func TestMemorySetKernelMemory(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	const (
+		kernelMemoryBefore = 314572800 // 300M
+		kernelMemoryAfter  = 524288000 // 500M
+	)
+
+	seed := map[string]string{
+		"memory.kmem.limit_in_bytes": strconv.Itoa(kernelMemoryBefore),
+	}
+	helper.writeFileContents(seed)
+
+	helper.CgroupData.config.Resources.KernelMemory = kernelMemoryAfter
+	if err := (&MemoryGroup{}).Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.limit_in_bytes")
+	if err != nil {
+		t.Fatalf("Failed to parse memory.kmem.limit_in_bytes - %s", err)
+	}
+	if got != kernelMemoryAfter {
+		t.Fatal("Got the wrong value, set memory.kmem.limit_in_bytes failed.")
+	}
+}
+
+// TestMemorySetKernelMemoryTCP seeds memory.kmem.tcp.limit_in_bytes, calls
+// MemoryGroup.Set with a new KernelMemoryTCP value, and requires the file to
+// read back that new value.
+func TestMemorySetKernelMemoryTCP(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	const (
+		kernelMemoryTCPBefore = 314572800 // 300M
+		kernelMemoryTCPAfter  = 524288000 // 500M
+	)
+
+	seed := map[string]string{
+		"memory.kmem.tcp.limit_in_bytes": strconv.Itoa(kernelMemoryTCPBefore),
+	}
+	helper.writeFileContents(seed)
+
+	helper.CgroupData.config.Resources.KernelMemoryTCP = kernelMemoryTCPAfter
+	if err := (&MemoryGroup{}).Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.tcp.limit_in_bytes")
+	if err != nil {
+		t.Fatalf("Failed to parse memory.kmem.tcp.limit_in_bytes - %s", err)
+	}
+	if got != kernelMemoryTCPAfter {
+		t.Fatal("Got the wrong value, set memory.kmem.tcp.limit_in_bytes failed.")
+	}
+}
+
+func TestMemorySetMemorySwappinessDefault(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	swappinessBefore := 60 //default is 60
+	swappinessAfter := uint64(0)
+
+	helper.writeFileContents(map[string]string{
+		"memory.swappiness": strconv.Itoa(swappinessBefore),
+	})
+
+	helper.CgroupData.config.Resources.MemorySwappiness = &swappinessAfter
+	memory := &MemoryGroup{}
+	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.swappiness")
+	if err != nil {
+		t.Fatalf("Failed to parse memory.swappiness - %s", err)
+	}
+	if value != swappinessAfter {
+		t.Fatalf("Got the wrong value (%d), set memory.swappiness = %d failed.", value, swappinessAfter)
+	}
+}
+
+// TestMemoryStats writes a full set of memory, memsw and kmem files plus
+// memory.stat and memory.use_hierarchy, then compares the stats returned by
+// MemoryGroup.GetStats against the expected values.
+func TestMemoryStats(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	files := map[string]string{
+		"memory.stat":                     memoryStatContents,
+		"memory.usage_in_bytes":           memoryUsageContents,
+		"memory.limit_in_bytes":           memoryLimitContents,
+		"memory.max_usage_in_bytes":       memoryMaxUsageContents,
+		"memory.failcnt":                  memoryFailcnt,
+		"memory.memsw.usage_in_bytes":     memoryUsageContents,
+		"memory.memsw.max_usage_in_bytes": memoryMaxUsageContents,
+		"memory.memsw.failcnt":            memoryFailcnt,
+		"memory.memsw.limit_in_bytes":     memoryLimitContents,
+		"memory.kmem.usage_in_bytes":      memoryUsageContents,
+		"memory.kmem.max_usage_in_bytes":  memoryMaxUsageContents,
+		"memory.kmem.failcnt":             memoryFailcnt,
+		"memory.kmem.limit_in_bytes":      memoryLimitContents,
+		"memory.use_hierarchy":            memoryUseHierarchyContents,
+	}
+	helper.writeFileContents(files)
+
+	stats := cgroups.NewStats()
+	if err := (&MemoryGroup{}).GetStats(helper.CgroupPath, stats); err != nil {
+		t.Fatal(err)
+	}
+
+	// All three modules are fed the same file contents.
+	data := cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}
+	want := cgroups.MemoryStats{
+		Cache:        512,
+		Usage:        data,
+		SwapUsage:    data,
+		KernelUsage:  data,
+		Stats:        map[string]uint64{"cache": 512, "rss": 1024},
+		UseHierarchy: true,
+	}
+	expectMemoryStatEquals(t, want, stats.MemoryStats)
+}
+
+// TestMemoryStatsNoStatFile writes only the usage, max usage and limit files
+// (no memory.stat) and requires MemoryGroup.GetStats to return no error.
+func TestMemoryStatsNoStatFile(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	files := map[string]string{
+		"memory.usage_in_bytes":     memoryUsageContents,
+		"memory.max_usage_in_bytes": memoryMaxUsageContents,
+		"memory.limit_in_bytes":     memoryLimitContents,
+	}
+	helper.writeFileContents(files)
+
+	stats := cgroups.NewStats()
+	if err := (&MemoryGroup{}).GetStats(helper.CgroupPath, stats); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// TestMemoryStatsNoUsageFile writes memory.stat, max usage and limit but no
+// memory.usage_in_bytes, and requires MemoryGroup.GetStats to return an error.
+func TestMemoryStatsNoUsageFile(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	files := map[string]string{
+		"memory.stat":               memoryStatContents,
+		"memory.max_usage_in_bytes": memoryMaxUsageContents,
+		"memory.limit_in_bytes":     memoryLimitContents,
+	}
+	helper.writeFileContents(files)
+
+	stats := cgroups.NewStats()
+	if err := (&MemoryGroup{}).GetStats(helper.CgroupPath, stats); err == nil {
+		t.Fatal("Expected failure")
+	}
+}
+
+// TestMemoryStatsNoMaxUsageFile writes memory.stat, usage and limit but no
+// memory.max_usage_in_bytes, and requires MemoryGroup.GetStats to return an
+// error.
+func TestMemoryStatsNoMaxUsageFile(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	files := map[string]string{
+		"memory.stat":           memoryStatContents,
+		"memory.usage_in_bytes": memoryUsageContents,
+		"memory.limit_in_bytes": memoryLimitContents,
+	}
+	helper.writeFileContents(files)
+
+	stats := cgroups.NewStats()
+	if err := (&MemoryGroup{}).GetStats(helper.CgroupPath, stats); err == nil {
+		t.Fatal("Expected failure")
+	}
+}
+
+// TestMemoryStatsNoLimitInBytesFile writes memory.stat, usage and max usage
+// but no memory.limit_in_bytes, and requires MemoryGroup.GetStats to return
+// an error.
+func TestMemoryStatsNoLimitInBytesFile(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	files := map[string]string{
+		"memory.stat":               memoryStatContents,
+		"memory.usage_in_bytes":     memoryUsageContents,
+		"memory.max_usage_in_bytes": memoryMaxUsageContents,
+	}
+	helper.writeFileContents(files)
+
+	stats := cgroups.NewStats()
+	if err := (&MemoryGroup{}).GetStats(helper.CgroupPath, stats); err == nil {
+		t.Fatal("Expected failure")
+	}
+}
+
+// TestMemoryStatsBadStatFile writes a memory.stat whose value is not a number
+// ("rss rss") and requires MemoryGroup.GetStats to return an error.
+func TestMemoryStatsBadStatFile(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	files := map[string]string{
+		"memory.stat":               "rss rss",
+		"memory.usage_in_bytes":     memoryUsageContents,
+		"memory.max_usage_in_bytes": memoryMaxUsageContents,
+		"memory.limit_in_bytes":     memoryLimitContents,
+	}
+	helper.writeFileContents(files)
+
+	stats := cgroups.NewStats()
+	if err := (&MemoryGroup{}).GetStats(helper.CgroupPath, stats); err == nil {
+		t.Fatal("Expected failure")
+	}
+}
+
+// TestMemoryStatsBadUsageFile writes "bad" into memory.usage_in_bytes and
+// requires MemoryGroup.GetStats to return an error.
+func TestMemoryStatsBadUsageFile(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	files := map[string]string{
+		"memory.stat":               memoryStatContents,
+		"memory.usage_in_bytes":     "bad",
+		"memory.max_usage_in_bytes": memoryMaxUsageContents,
+		"memory.limit_in_bytes":     memoryLimitContents,
+	}
+	helper.writeFileContents(files)
+
+	stats := cgroups.NewStats()
+	if err := (&MemoryGroup{}).GetStats(helper.CgroupPath, stats); err == nil {
+		t.Fatal("Expected failure")
+	}
+}
+
+// TestMemoryStatsBadMaxUsageFile writes "bad" into memory.max_usage_in_bytes
+// and requires MemoryGroup.GetStats to return an error.
+func TestMemoryStatsBadMaxUsageFile(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	files := map[string]string{
+		"memory.stat":               memoryStatContents,
+		"memory.usage_in_bytes":     memoryUsageContents,
+		"memory.max_usage_in_bytes": "bad",
+		"memory.limit_in_bytes":     memoryLimitContents,
+	}
+	helper.writeFileContents(files)
+
+	stats := cgroups.NewStats()
+	if err := (&MemoryGroup{}).GetStats(helper.CgroupPath, stats); err == nil {
+		t.Fatal("Expected failure")
+	}
+}
+
+// TestMemoryStatsBadLimitInBytesFile writes "bad" into memory.limit_in_bytes
+// and requires MemoryGroup.GetStats to return an error.
+func TestMemoryStatsBadLimitInBytesFile(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	files := map[string]string{
+		"memory.stat":               memoryStatContents,
+		"memory.usage_in_bytes":     memoryUsageContents,
+		"memory.max_usage_in_bytes": memoryMaxUsageContents,
+		"memory.limit_in_bytes":     "bad",
+	}
+	helper.writeFileContents(files)
+
+	stats := cgroups.NewStats()
+	if err := (&MemoryGroup{}).GetStats(helper.CgroupPath, stats); err == nil {
+		t.Fatal("Expected failure")
+	}
+}
+
+// TestMemorySetOomControl seeds memory.oom_control with 1, calls
+// MemoryGroup.Set with the helper's default config, and requires the file to
+// still read back 1.
+//
+// NOTE(review): the test never sets Resources.OomKillDisable, so the value
+// read back may simply be the one seeded below rather than one written by
+// Set - confirm against MemoryGroup.Set and consider seeding 0 and enabling
+// OomKillDisable instead.
+func TestMemorySetOomControl(t *testing.T) {
+	helper := NewCgroupTestUtil("memory", t)
+	defer helper.cleanup()
+
+	const (
+		oomKillDisable = 1 // disable oom killer, default is 0
+	)
+
+	helper.writeFileContents(map[string]string{
+		"memory.oom_control": strconv.Itoa(oomKillDisable),
+	})
+
+	memory := &MemoryGroup{}
+	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.oom_control")
+	if err != nil {
+		t.Fatalf("Failed to parse memory.oom_control - %s", err)
+	}
+
+	if value != oomKillDisable {
+		t.Fatalf("Got the wrong value, set memory.oom_control failed.")
+	}
+}
--- a/libcontainer/cgroups/fs/name.go
+++ b/libcontainer/cgroups/fs/name.go
@ -0,0 +1,40 @@
+// +build linux
+
+package fs
+
+import (
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// NameGroup is a cgroup subsystem identified only by its GroupName. It sets
+// no resources and reports no stats; Join decides whether Apply joins the
+// group and whether Remove removes its path.
+type NameGroup struct {
+	GroupName string
+	Join      bool
+}
+
+// Name returns the configured GroupName.
+func (s *NameGroup) Name() string {
+	return s.GroupName
+}
+
+// Apply joins the named cgroup when Join is set. The result of the join is
+// deliberately discarded, so Apply always returns nil.
+func (s *NameGroup) Apply(d *cgroupData) error {
+	if s.Join {
+		// ignore errors if the named cgroup does not exist
+		d.join(s.GroupName)
+	}
+	return nil
+}
+
+// Set is a no-op and always returns nil.
+func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
+	return nil
+}
+
+// Remove calls removePath on the group's path when Join is set and always
+// returns nil.
+//
+// NOTE(review): the result of removePath is ignored here, whereas the other
+// subsystems in this package return it - confirm this is intentional.
+func (s *NameGroup) Remove(d *cgroupData) error {
+	if s.Join {
+		removePath(d.path(s.GroupName))
+	}
+	return nil
+}
+
+// GetStats is a no-op and always returns nil.
+func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error {
+	return nil
+}
--- a/libcontainer/cgroups/fs/net_cls.go
+++ b/libcontainer/cgroups/fs/net_cls.go
@ -0,0 +1,44 @@
+// +build linux
+
+package fs
+
+import (
+	"strconv"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// NetClsGroup handles the "net_cls" cgroup subsystem.
+type NetClsGroup struct {
+}
+
+// Name returns the subsystem name, "net_cls".
+func (s *NetClsGroup) Name() string {
+	return "net_cls"
+}
+
+// Apply joins the net_cls cgroup. An error for which cgroups.IsNotFound
+// reports true is tolerated; any other error is returned.
+func (s *NetClsGroup) Apply(d *cgroupData) error {
+	if _, err := d.join("net_cls"); err != nil && !cgroups.IsNotFound(err) {
+		return err
+	}
+	return nil
+}
+
+// Set writes Resources.NetClsClassid in decimal to net_cls.classid under
+// path. A classid of 0 leaves the file untouched.
+func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
+	classid := cgroup.Resources.NetClsClassid
+	if classid == 0 {
+		return nil
+	}
+
+	value := strconv.FormatUint(uint64(classid), 10)
+	if err := fscommon.WriteFile(path, "net_cls.classid", value); err != nil {
+		return err
+	}
+	return nil
+}
+
+// Remove removes the net_cls cgroup path and returns removePath's result.
+func (s *NetClsGroup) Remove(d *cgroupData) error {
+	return removePath(d.path("net_cls"))
+}
+
+// GetStats is a no-op and always returns nil.
+func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error {
+	return nil
+}
--- a/libcontainer/cgroups/fs/net_cls_test.go
+++ b/libcontainer/cgroups/fs/net_cls_test.go
@ -0,0 +1,41 @@
+// +build linux
+
+package fs
+
+import (
+	"strconv"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+)
+
+// Class IDs used by TestNetClsSetClassid: the value seeded into
+// net_cls.classid and the value expected after Set.
+const (
+	classidBefore = 0x100002
+	classidAfter  = 0x100001
+)
+
+// TestNetClsSetClassid seeds net_cls.classid with classidBefore, calls
+// NetClsGroup.Set with classidAfter, and requires the file to read back
+// classidAfter.
+func TestNetClsSetClassid(t *testing.T) {
+	helper := NewCgroupTestUtil("net_cls", t)
+	defer helper.cleanup()
+
+	seed := map[string]string{
+		"net_cls.classid": strconv.FormatUint(classidBefore, 10),
+	}
+	helper.writeFileContents(seed)
+
+	helper.CgroupData.config.Resources.NetClsClassid = classidAfter
+	if err := (&NetClsGroup{}).Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	// As we are in a mock environment, we can't get the correct value of
+	// classid from net_cls.classid. So we just check that the classid was
+	// successfully written into the file.
+	got, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "net_cls.classid")
+	if err != nil {
+		t.Fatalf("Failed to parse net_cls.classid - %s", err)
+	}
+	if got != classidAfter {
+		t.Fatal("Got the wrong value, set net_cls.classid failed.")
+	}
+}
--- a/libcontainer/cgroups/fs/net_prio.go
+++ b/libcontainer/cgroups/fs/net_prio.go
@ -0,0 +1,42 @@
+// +build linux
+
+package fs
+
+import (
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// NetPrioGroup handles the "net_prio" cgroup subsystem.
+type NetPrioGroup struct {
+}
+
+// Name returns the subsystem name, "net_prio".
+func (s *NetPrioGroup) Name() string {
+	return "net_prio"
+}
+
+// Apply joins the net_prio cgroup. An error for which cgroups.IsNotFound
+// reports true is tolerated; any other error is returned.
+func (s *NetPrioGroup) Apply(d *cgroupData) error {
+	if _, err := d.join("net_prio"); err != nil && !cgroups.IsNotFound(err) {
+		return err
+	}
+	return nil
+}
+
+// Set writes every entry of Resources.NetPrioIfpriomap, one write per entry,
+// to net_prio.ifpriomap under path, stopping at the first write error.
+func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error {
+	entries := cgroup.Resources.NetPrioIfpriomap
+	for i := range entries {
+		line := entries[i].CgroupString()
+		if err := fscommon.WriteFile(path, "net_prio.ifpriomap", line); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// Remove removes the net_prio cgroup path and returns removePath's result.
+func (s *NetPrioGroup) Remove(d *cgroupData) error {
+	return removePath(d.path("net_prio"))
+}
+
+// GetStats is a no-op and always returns nil.
+func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error {
+	return nil
+}
--- a/libcontainer/cgroups/fs/net_prio_test.go
+++ b/libcontainer/cgroups/fs/net_prio_test.go
@ -0,0 +1,39 @@
+// +build linux
+
+package fs
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// prioMap is the interface priority map fed to NetPrioGroup.Set by
+// TestNetPrioSetIfPrio.
+var (
+	prioMap = []*configs.IfPrioMap{
+		{
+			Interface: "test",
+			Priority:  5,
+		},
+	}
+)
+
+// TestNetPrioSetIfPrio calls NetPrioGroup.Set with prioMap and requires
+// net_prio.ifpriomap to contain "test 5" afterwards.
+func TestNetPrioSetIfPrio(t *testing.T) {
+	helper := NewCgroupTestUtil("net_prio", t)
+	defer helper.cleanup()
+
+	helper.CgroupData.config.Resources.NetPrioIfpriomap = prioMap
+	if err := (&NetPrioGroup{}).Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	written, err := fscommon.GetCgroupParamString(helper.CgroupPath, "net_prio.ifpriomap")
+	if err != nil {
+		t.Fatalf("Failed to parse net_prio.ifpriomap - %s", err)
+	}
+	if found := strings.Contains(written, "test 5"); !found {
+		t.Fatal("Got the wrong value, set net_prio.ifpriomap failed.")
+	}
+}
--- a/libcontainer/cgroups/fs/perf_event.go
+++ b/libcontainer/cgroups/fs/perf_event.go
@ -0,0 +1,35 @@
+// +build linux
+
+package fs
+
+import (
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// PerfEventGroup handles the "perf_event" cgroup subsystem. It only joins and
+// removes the cgroup; it sets no resources and reports no stats.
+type PerfEventGroup struct {
+}
+
+// Name returns the subsystem name, "perf_event".
+func (s *PerfEventGroup) Name() string {
+	return "perf_event"
+}
+
+// Apply joins the perf_event cgroup. An error for which cgroups.IsNotFound
+// reports true is tolerated; any other error is returned.
+func (s *PerfEventGroup) Apply(d *cgroupData) error {
+	// we just want to join this group even though we don't set anything
+	if _, err := d.join("perf_event"); err != nil && !cgroups.IsNotFound(err) {
+		return err
+	}
+	return nil
+}
+
+// Set is a no-op and always returns nil.
+func (s *PerfEventGroup) Set(path string, cgroup *configs.Cgroup) error {
+	return nil
+}
+
+// Remove removes the perf_event cgroup path and returns removePath's result.
+func (s *PerfEventGroup) Remove(d *cgroupData) error {
+	return removePath(d.path("perf_event"))
+}
+
+// GetStats is a no-op and always returns nil.
+func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error {
+	return nil
+}
--- a/libcontainer/cgroups/fs/pids.go
+++ b/libcontainer/cgroups/fs/pids.go
@ -0,0 +1,74 @@
+// +build linux
+
+package fs
+
+import (
+	"fmt"
+	"path/filepath"
+	"strconv"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// PidsGroup handles the "pids" cgroup subsystem.
+type PidsGroup struct {
+}
+
+// Name returns the subsystem name, "pids".
+func (s *PidsGroup) Name() string {
+	return "pids"
+}
+
+// Apply joins the pids cgroup. An error for which cgroups.IsNotFound reports
+// true is tolerated; any other error is returned.
+func (s *PidsGroup) Apply(d *cgroupData) error {
+	if _, err := d.join("pids"); err != nil && !cgroups.IsNotFound(err) {
+		return err
+	}
+	return nil
+}
+
+// Set writes Resources.PidsLimit to pids.max under path. A limit of 0 leaves
+// the file untouched, a positive limit is written in decimal, and a negative
+// limit is written as "max".
+func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
+	pidsLimit := cgroup.Resources.PidsLimit
+	if pidsLimit == 0 {
+		return nil
+	}
+
+	// "max" is the fallback value.
+	value := "max"
+	if pidsLimit > 0 {
+		value = strconv.FormatInt(pidsLimit, 10)
+	}
+
+	if err := fscommon.WriteFile(path, "pids.max", value); err != nil {
+		return err
+	}
+	return nil
+}
+
+// Remove removes the pids cgroup path and returns removePath's result.
+func (s *PidsGroup) Remove(d *cgroupData) error {
+	return removePath(d.path("pids"))
+}
+
+// GetStats fills stats.PidsStats from pids.current and pids.max under path.
+// A pids.max of "max" is reported as a Limit of 0. stats is only modified
+// once both files have been read and parsed successfully.
+func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
+	current, err := fscommon.GetCgroupParamUint(path, "pids.current")
+	if err != nil {
+		return fmt.Errorf("failed to parse pids.current - %s", err)
+	}
+
+	rawLimit, err := fscommon.GetCgroupParamString(path, "pids.max")
+	if err != nil {
+		return fmt.Errorf("failed to parse pids.max - %s", err)
+	}
+
+	// Default if pids.max == "max" is 0 -- which represents "no limit".
+	var limit uint64
+	if rawLimit != "max" {
+		parsed, parseErr := fscommon.ParseUint(rawLimit, 10, 64)
+		if parseErr != nil {
+			return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", rawLimit, filepath.Join(path, "pids.max"))
+		}
+		limit = parsed
+	}
+
+	stats.PidsStats.Current = current
+	stats.PidsStats.Limit = limit
+	return nil
+}
--- a/libcontainer/cgroups/fs/pids_test.go
+++ b/libcontainer/cgroups/fs/pids_test.go
@ -0,0 +1,112 @@
+// +build linux
+
+package fs
+
+import (
+	"strconv"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+)
+
+// PidsLimit values used by the pids tests: a negative value (written as
+// "max" by Set) and a concrete limit.
+const (
+	maxUnlimited = -1
+	maxLimited   = 1024
+)
+
+// TestPidsSetMax seeds pids.max with "max", calls PidsGroup.Set with
+// PidsLimit = maxLimited, and requires the file to read back maxLimited.
+func TestPidsSetMax(t *testing.T) {
+	helper := NewCgroupTestUtil("pids", t)
+	defer helper.cleanup()
+
+	seed := map[string]string{
+		"pids.max": "max",
+	}
+	helper.writeFileContents(seed)
+
+	helper.CgroupData.config.Resources.PidsLimit = maxLimited
+	if err := (&PidsGroup{}).Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "pids.max")
+	if err != nil {
+		t.Fatalf("Failed to parse pids.max - %s", err)
+	}
+
+	if got != maxLimited {
+		t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, got)
+	}
+}
+
+// TestPidsSetUnlimited seeds pids.max with maxLimited, calls PidsGroup.Set
+// with PidsLimit = maxUnlimited, and requires the file to read back "max".
+func TestPidsSetUnlimited(t *testing.T) {
+	helper := NewCgroupTestUtil("pids", t)
+	defer helper.cleanup()
+
+	seed := map[string]string{
+		"pids.max": strconv.Itoa(maxLimited),
+	}
+	helper.writeFileContents(seed)
+
+	helper.CgroupData.config.Resources.PidsLimit = maxUnlimited
+	if err := (&PidsGroup{}).Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := fscommon.GetCgroupParamString(helper.CgroupPath, "pids.max")
+	if err != nil {
+		t.Fatalf("Failed to parse pids.max - %s", err)
+	}
+
+	if got != "max" {
+		t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", got)
+	}
+}
+
+func TestPidsStats(t *testing.T) {
+	helper := NewCgroupTestUtil("pids", t)
+	defer helper.cleanup()
+
+	helper.writeFileContents(map[string]string{
+		"pids.current": strconv.Itoa(1337),
+		"pids.max":     strconv.Itoa(maxLimited),
+	})
+
+	pids := &PidsGroup{}
+	stats := *cgroups.NewStats()
+	if err := pids.GetStats(helper.CgroupPath, &stats); err != nil {
+		t.Fatal(err)
+	}
+
+	if stats.PidsStats.Current != 1337 {
+		t.Fatalf("Expected %d, got %d for pids.current", 1337, stats.PidsStats.Current)
+	}
+
+	if stats.PidsStats.Limit != maxLimited {
+		t.Fatalf("Expected %d, got %d for pids.max", maxLimited, stats.PidsStats.Limit)
+	}
+}
+
+func TestPidsStatsUnlimited(t *testing.T) {
+	helper := NewCgroupTestUtil("pids", t)
+	defer helper.cleanup()
+
+	helper.writeFileContents(map[string]string{
+		"pids.current": strconv.Itoa(4096),
+		"pids.max":     "max",
+	})
+
+	pids := &PidsGroup{}
+	stats := *cgroups.NewStats()
+	if err := pids.GetStats(helper.CgroupPath, &stats); err != nil {
+		t.Fatal(err)
+	}
+
+	if stats.PidsStats.Current != 4096 {
+		t.Fatalf("Expected %d, got %d for pids.current", 4096, stats.PidsStats.Current)
+	}
+
+	if stats.PidsStats.Limit != 0 {
+		t.Fatalf("Expected %d, got %d for pids.max", 0, stats.PidsStats.Limit)
+	}
+}
--- a/libcontainer/cgroups/fs/stats_util_test.go
+++ b/libcontainer/cgroups/fs/stats_util_test.go
@ -0,0 +1,123 @@
+// +build linux
+
+package fs
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+
+	"github.com/sirupsen/logrus"
+)
+
+// blkioStatEntryEquals compares two entry lists element by element. It
+// returns an error when the lengths differ or at the first position whose
+// entries are not equal, and nil otherwise.
+func blkioStatEntryEquals(expected, actual []cgroups.BlkioStatEntry) error {
+	if len(expected) != len(actual) {
+		return fmt.Errorf("blkioStatEntries length do not match")
+	}
+	for i := range expected {
+		if expected[i] != actual[i] {
+			return fmt.Errorf("Expected blkio stat entry %v but found %v", expected[i], actual[i])
+		}
+	}
+	return nil
+}
+
+// expectBlkioStatsEquals compares every entry list of two BlkioStats values.
+// Each mismatching list is logged and marks the test as failed without
+// stopping it.
+func expectBlkioStatsEquals(t *testing.T, expected, actual cgroups.BlkioStats) {
+	// compare checks one named entry list and reports the comparison error.
+	compare := func(field string, exp, act []cgroups.BlkioStatEntry) {
+		if err := blkioStatEntryEquals(exp, act); err != nil {
+			logrus.Printf("blkio %s do not match - %s\n", field, err)
+			t.Fail()
+		}
+	}
+
+	compare("IoServiceBytesRecursive", expected.IoServiceBytesRecursive, actual.IoServiceBytesRecursive)
+	compare("IoServicedRecursive", expected.IoServicedRecursive, actual.IoServicedRecursive)
+	compare("IoQueuedRecursive", expected.IoQueuedRecursive, actual.IoQueuedRecursive)
+	compare("SectorsRecursive", expected.SectorsRecursive, actual.SectorsRecursive)
+	compare("IoServiceTimeRecursive", expected.IoServiceTimeRecursive, actual.IoServiceTimeRecursive)
+	compare("IoWaitTimeRecursive", expected.IoWaitTimeRecursive, actual.IoWaitTimeRecursive)
+
+	// IoMergedRecursive logs both lists instead of the comparison error.
+	if err := blkioStatEntryEquals(expected.IoMergedRecursive, actual.IoMergedRecursive); err != nil {
+		logrus.Printf("blkio IoMergedRecursive do not match - %v vs %v\n", expected.IoMergedRecursive, actual.IoMergedRecursive)
+		t.Fail()
+	}
+
+	compare("IoTimeRecursive", expected.IoTimeRecursive, actual.IoTimeRecursive)
+}
+
+// expectThrottlingDataEquals logs a message and marks the test as failed when
+// the two ThrottlingData values differ.
+func expectThrottlingDataEquals(t *testing.T, expected, actual cgroups.ThrottlingData) {
+	if expected == actual {
+		return
+	}
+	logrus.Printf("Expected throttling data %v but found %v\n", expected, actual)
+	t.Fail()
+}
+
+// expectHugetlbStatEquals logs a message and marks the test as failed when
+// the two HugetlbStats values differ.
+func expectHugetlbStatEquals(t *testing.T, expected, actual cgroups.HugetlbStats) {
+	if expected == actual {
+		return
+	}
+	logrus.Printf("Expected hugetlb stats %v but found %v\n", expected, actual)
+	t.Fail()
+}
+
+// expectMemoryStatEquals compares the Usage, SwapUsage and KernelUsage data,
+// the UseHierarchy flag, and every key of expected.Stats against actual.
+// Each mismatch is logged and marks the test as failed without stopping it.
+// Keys present only in actual.Stats are not reported.
+func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats) {
+	expectMemoryDataEquals(t, expected.Usage, actual.Usage)
+	expectMemoryDataEquals(t, expected.SwapUsage, actual.SwapUsage)
+	expectMemoryDataEquals(t, expected.KernelUsage, actual.KernelUsage)
+
+	if expected.UseHierarchy != actual.UseHierarchy {
+		logrus.Printf("Expected memory use hierarchy %v, but found %v\n", expected.UseHierarchy, actual.UseHierarchy)
+		t.Fail()
+	}
+
+	for key, expValue := range expected.Stats {
+		actValue, ok := actual.Stats[key]
+		if !ok {
+			logrus.Printf("Expected memory stat key %s not found\n", key)
+			t.Fail()
+			// A missing key has no value to compare; skip the value check so
+			// it does not log a second, misleading mismatch against zero.
+			continue
+		}
+		if expValue != actValue {
+			logrus.Printf("Expected memory stat value %d but found %d\n", expValue, actValue)
+			t.Fail()
+		}
+	}
+}
+
+// expectMemoryDataEquals compares the Usage, MaxUsage, Failcnt and Limit
+// fields of two MemoryData values, in that order. Each mismatch is logged and
+// marks the test as failed without stopping it.
+func expectMemoryDataEquals(t *testing.T, expected, actual cgroups.MemoryData) {
+	checks := []struct {
+		label    string
+		exp, act uint64
+	}{
+		{"usage", expected.Usage, actual.Usage},
+		{"max usage", expected.MaxUsage, actual.MaxUsage},
+		{"failcnt", expected.Failcnt, actual.Failcnt},
+		{"limit", expected.Limit, actual.Limit},
+	}
+	for _, c := range checks {
+		if c.exp == c.act {
+			continue
+		}
+		logrus.Printf("Expected memory %s %d but found %d\n", c.label, c.exp, c.act)
+		t.Fail()
+	}
+}
--- a/libcontainer/cgroups/fs/util_test.go
+++ b/libcontainer/cgroups/fs/util_test.go
@ -0,0 +1,68 @@
+// +build linux
+
+/*
+Utility for testing cgroup operations.
+
+Creates a mock of the cgroup filesystem for the duration of the test.
+*/
+package fs
+
+import (
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// cgroupTestUtil bundles the cgroup data, the mock cgroup directory and the
+// owning test for one subsystem test.
+type cgroupTestUtil struct {
+	// cgroup data to use in tests.
+	CgroupData *cgroupData
+
+	// Path to the mock cgroup directory.
+	CgroupPath string
+
+	// Temporary directory to store mock cgroup filesystem.
+	tempDir string
+	// Test that owns this helper; used to abort on setup failures.
+	t       *testing.T
+}
+
+// NewCgroupTestUtil creates a new test util for the specified subsystem.
+//
+// It creates a fresh temporary directory, uses it as the cgroup root, and
+// creates a sub-directory named after subsystem to act as the mock cgroup
+// path. Any setup failure aborts the test through t.Fatal. Callers release
+// the temporary directory with cleanup.
+func NewCgroupTestUtil(subsystem string, t *testing.T) *cgroupTestUtil {
+	d := &cgroupData{
+		config: &configs.Cgroup{},
+	}
+	d.config.Resources = &configs.Resources{}
+	tempDir, err := ioutil.TempDir("", "cgroup_test")
+	if err != nil {
+		t.Fatal(err)
+	}
+	d.root = tempDir
+	testCgroupPath := filepath.Join(d.root, subsystem)
+
+	// Ensure the full mock cgroup path exists.
+	if err := os.MkdirAll(testCgroupPath, 0755); err != nil {
+		// Do not leave the temporary directory behind when setup fails.
+		os.RemoveAll(tempDir)
+		t.Fatal(err)
+	}
+	return &cgroupTestUtil{CgroupData: d, CgroupPath: testCgroupPath, tempDir: tempDir, t: t}
+}
+
+// cleanup removes the temporary directory holding the mock cgroup
+// filesystem. The result of os.RemoveAll is ignored.
+func (c *cgroupTestUtil) cleanup() {
+	os.RemoveAll(c.tempDir)
+}
+
+// writeFileContents writes each entry of fileContents (file name -> contents)
+// into the mock cgroup directory, aborting the test on the first write error.
+func (c *cgroupTestUtil) writeFileContents(fileContents map[string]string) {
+	for name, data := range fileContents {
+		if err := fscommon.WriteFile(c.CgroupPath, name, data); err != nil {
+			c.t.Fatal(err)
+		}
+	}
+}
--- a/libcontainer/cgroups/fs2/cpu.go
+++ b/libcontainer/cgroups/fs2/cpu.go
@ -0,0 +1,56 @@
+// +build linux
+
+package fs2
+
+import (
+	"bufio"
+	"os"
+	"path/filepath"
+	"strconv"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// setCpu applies the CPU settings of cgroup to the cgroup directory dirPath.
+// A non-zero CpuWeight is written to "cpu.weight" and a non-empty CpuMax is
+// written to "cpu.max"; unset values leave the corresponding file untouched.
+// The first write error is returned.
+func setCpu(dirPath string, cgroup *configs.Cgroup) error {
+	res := cgroup.Resources
+
+	if weight := res.CpuWeight; weight != 0 {
+		err := fscommon.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(weight, 10))
+		if err != nil {
+			return err
+		}
+	}
+
+	if quota := res.CpuMax; quota != "" {
+		err := fscommon.WriteFile(dirPath, "cpu.max", quota)
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+// statCpu fills stats.CpuStats.CpuUsage from the "cpu.stat" file in dirPath.
+//
+// The usage_usec, user_usec and system_usec values are multiplied by 1000
+// before being stored, because the CpuUsage fields are in nanoseconds. Keys
+// other than these three are ignored.
+//
+// An error is returned if the file cannot be opened, if a line is not a
+// "key value" pair, or if reading the file fails part-way.
+func statCpu(dirPath string, stats *cgroups.Stats) error {
+	f, err := os.Open(filepath.Join(dirPath, "cpu.stat"))
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	sc := bufio.NewScanner(f)
+	for sc.Scan() {
+		t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
+		if err != nil {
+			return err
+		}
+		switch t {
+		case "usage_usec":
+			stats.CpuStats.CpuUsage.TotalUsage = v * 1000
+
+		case "user_usec":
+			stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000
+
+		case "system_usec":
+			stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000
+		}
+	}
+	// Scan also returns false on a read error; report it instead of
+	// returning partially-filled stats as success.
+	return sc.Err()
+}
--- a/libcontainer/cgroups/fs2/cpuset.go
+++ b/libcontainer/cgroups/fs2/cpuset.go
@ -0,0 +1,22 @@
+// +build linux
+
+package fs2
+
+import (
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// setCpuset applies the cpuset settings of cgroup to the cgroup directory
+// dirPath: a non-empty CpusetCpus goes to "cpuset.cpus" and a non-empty
+// CpusetMems goes to "cpuset.mems", in that order. Empty values are skipped
+// and the first write error is returned.
+func setCpuset(dirPath string, cgroup *configs.Cgroup) error {
+	settings := []struct{ file, value string }{
+		{"cpuset.cpus", cgroup.Resources.CpusetCpus},
+		{"cpuset.mems", cgroup.Resources.CpusetMems},
+	}
+	for _, s := range settings {
+		if s.value == "" {
+			continue
+		}
+		if err := fscommon.WriteFile(dirPath, s.file, s.value); err != nil {
+			return err
+		}
+	}
+	return nil
+}
--- a/libcontainer/cgroups/fs2/defaultpath.go
+++ b/libcontainer/cgroups/fs2/defaultpath.go
@ -0,0 +1,99 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package fs2
+
+import (
+	"bufio"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+	"github.com/pkg/errors"
+)
+
+// UnifiedMountpoint is the root directory under which defaultDirPath places
+// cgroup v2 directories.
+const UnifiedMountpoint = "/sys/fs/cgroup"
+
+// defaultDirPath derives the cgroup v2 directory for c.
+//
+// c must use either Path, or Name and/or Parent, but not both, and must not
+// set Paths; otherwise an error is returned. Path, Parent and Name are passed
+// through CleanPath and then combined by _defaultDirPath with
+// UnifiedMountpoint and the cgroup of the current process, which is read from
+// /proc/self/cgroup.
+func defaultDirPath(c *configs.Cgroup) (string, error) {
+	if (c.Name != "" || c.Parent != "") && c.Path != "" {
+		return "", errors.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c)
+	}
+	if len(c.Paths) != 0 {
+		// never set by specconv
+		return "", errors.Errorf("cgroup: Paths is unsupported, use Path, got %+v", c)
+	}
+
+	// XXX: Do not remove this code. Path safety is important! -- cyphar
+	cgPath := libcontainerUtils.CleanPath(c.Path)
+	cgParent := libcontainerUtils.CleanPath(c.Parent)
+	cgName := libcontainerUtils.CleanPath(c.Name)
+
+	ownCgroup, err := parseCgroupFile("/proc/self/cgroup")
+	if err != nil {
+		return "", err
+	}
+	return _defaultDirPath(UnifiedMountpoint, cgPath, cgParent, cgName, ownCgroup)
+}
+
+// _defaultDirPath joins path components into a cgroup directory below root.
+//
+// cgPath is used when non-empty, otherwise cgParent joined with cgName. If
+// the chosen path is absolute it is placed directly under root; if it is
+// relative it is placed under ownCgroup inside root. Passing cgPath together
+// with cgName or cgParent is an error.
+func _defaultDirPath(root, cgPath, cgParent, cgName, ownCgroup string) (string, error) {
+	if (cgName != "" || cgParent != "") && cgPath != "" {
+		return "", errors.New("cgroup: either Path or Name and Parent should be used")
+	}
+	innerPath := cgPath
+	if innerPath == "" {
+		innerPath = filepath.Join(cgParent, cgName)
+	}
+	if filepath.IsAbs(innerPath) {
+		return filepath.Join(root, innerPath), nil
+	}
+	return filepath.Join(root, ownCgroup, innerPath), nil
+}
+
+// parseCgroupFile opens the /proc/PID/cgroup style file at path and returns
+// the cgroup path that parseCgroupFromReader extracts from it. An error is
+// returned if the file cannot be opened or parsed.
+func parseCgroupFile(path string) (string, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+	return parseCgroupFromReader(f)
+}
+
+// parseCgroupFromReader scans cgroup entries of the form
+// "ID:controllers:path" (one per line) from r and returns the path of the
+// entry whose ID is "0" and whose controller list is empty.
+//
+// It returns an error if a line has fewer than three ":"-separated fields, if
+// reading from r fails, or if no matching entry is present.
+func parseCgroupFromReader(r io.Reader) (string, error) {
+	s := bufio.NewScanner(r)
+	for s.Scan() {
+		var (
+			text  = s.Text()
+			parts = strings.SplitN(text, ":", 3)
+		)
+		if len(parts) < 3 {
+			return "", errors.Errorf("invalid cgroup entry: %q", text)
+		}
+		// text is like "0::/user.slice/user-1001.slice/session-1.scope"
+		if parts[0] == "0" && parts[1] == "" {
+			return parts[2], nil
+		}
+	}
+	// Scan returns false on both EOF and read errors, so the scanner error
+	// must be checked after the loop; while Scan keeps succeeding it is nil.
+	if err := s.Err(); err != nil {
+		return "", err
+	}
+	return "", errors.New("cgroup path not found")
+}
--- a/libcontainer/cgroups/fs2/defaultpath_test.go
+++ b/libcontainer/cgroups/fs2/defaultpath_test.go
@ -0,0 +1,76 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package fs2
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestParseCgroupFromReader feeds sample cgroup file contents to
+// parseCgroupFromReader. A non-empty expectation is the path that must be
+// returned without error; an empty expectation means an error is required.
+func TestParseCgroupFromReader(t *testing.T) {
+	cases := map[string]string{
+		"0::/user.slice/user-1001.slice/session-1.scope\n":                                  "/user.slice/user-1001.slice/session-1.scope",
+		"2:cpuset:/foo\n1:name=systemd:/\n":                                                 "",
+		"2:cpuset:/foo\n1:name=systemd:/\n0::/user.slice/user-1001.slice/session-1.scope\n": "/user.slice/user-1001.slice/session-1.scope",
+	}
+	for input, want := range cases {
+		got, err := parseCgroupFromReader(strings.NewReader(input))
+		if want == "" {
+			// No unified entry in the input: only the error matters.
+			if err == nil {
+				t.Error("error is expected")
+			}
+			continue
+		}
+		if got != want {
+			t.Errorf("expected %q, got %q", want, got)
+		}
+		if err != nil {
+			t.Error(err)
+		}
+	}
+}
+
+// TestDefaultDirPath checks how _defaultDirPath places a cgroup path below
+// the root: an absolute cgPath goes directly under root, while a relative one
+// goes under the caller's own cgroup.
+func TestDefaultDirPath(t *testing.T) {
+	root := "/sys/fs/cgroup"
+	cases := []struct {
+		cgPath    string
+		cgParent  string
+		cgName    string
+		ownCgroup string
+		expected  string
+	}{
+		{
+			cgPath:    "/foo/bar",
+			ownCgroup: "/apple/banana",
+			expected:  "/sys/fs/cgroup/foo/bar",
+		},
+		{
+			cgPath:    "foo/bar",
+			ownCgroup: "/apple/banana",
+			expected:  "/sys/fs/cgroup/apple/banana/foo/bar",
+		},
+	}
+	for _, c := range cases {
+		got, err := _defaultDirPath(root, c.cgPath, c.cgParent, c.cgName, c.ownCgroup)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if got != c.expected {
+			t.Fatalf("expected %q, got %q", c.expected, got)
+		}
+	}
+}
--- a/libcontainer/cgroups/fs2/devices.go
+++ b/libcontainer/cgroups/fs2/devices.go
@ -0,0 +1,73 @@
+// +build linux
+
+package fs2
+
+import (
+	"github.com/opencontainers/runc/libcontainer/cgroups/ebpf"
+	"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
+)
+
+// isRWM reports whether cgroupPermissions contains all three of 'r', 'w' and
+// 'm'. Order and repetition do not matter; other characters are ignored.
+func isRWM(cgroupPermissions string) bool {
+	r := false
+	w := false
+	m := false
+	for _, rn := range cgroupPermissions {
+		switch rn {
+		case 'r':
+			r = true
+		case 'w':
+			w = true
+		case 'm':
+			m = true
+		}
+	}
+	return r && w && m
+}
+
+// canSkipEBPFError reports whether a failure to load or attach the device
+// filter may be ignored. That is the case only when every entry of
+// cgroup.Resources.Devices is a non-allow rule whose permissions include all
+// of r, w and m; an empty device list also yields true.
+//
+// the logic is from crun
+// https://github.com/containers/crun/blob/0.10.2/src/libcrun/cgroup.c#L1644-L1652
+func canSkipEBPFError(cgroup *configs.Cgroup) bool {
+	for _, dev := range cgroup.Resources.Devices {
+		if dev.Allow || !isRWM(dev.Permissions) {
+			return false
+		}
+	}
+	return true
+}
+
+// setDevices applies the device rules of cgroup to the cgroup directory
+// dirPath by building a device filter program from the rules and attaching it
+// to the directory.
+//
+// The rule list starts from cgroup.Devices. When AllowAllDevices is set to
+// false, copies of the AllowedDevices entries, marked as allowed, are
+// appended. AllowAllDevices == true and a non-empty DeniedDevices are
+// rejected as unsupported.
+//
+// A failure to load or attach the filter is ignored when canSkipEBPFError
+// says so; every other failure is returned.
+func setDevices(dirPath string, cgroup *configs.Cgroup) error {
+	devices := cgroup.Devices
+	if allowAllDevices := cgroup.Resources.AllowAllDevices; allowAllDevices != nil {
+		// never set by OCI specconv, but *allowAllDevices=false is still used by the integration test
+		if *allowAllDevices {
+			return errors.New("libcontainer AllowAllDevices is not supported, use Devices")
+		}
+		for _, ad := range cgroup.Resources.AllowedDevices {
+			// Copy the entry so that setting Allow does not modify the config.
+			d := *ad
+			d.Allow = true
+			devices = append(devices, &d)
+		}
+	}
+	if len(cgroup.Resources.DeniedDevices) != 0 {
+		// never set by OCI specconv
+		return errors.New("libcontainer DeniedDevices is not supported, use Devices")
+	}
+	insts, license, err := devicefilter.DeviceFilter(devices)
+	if err != nil {
+		return err
+	}
+	dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0600)
+	if err != nil {
+		// Keep the underlying error so the reason for the failed open is not lost.
+		return errors.Wrapf(err, "cannot get dir FD for %s", dirPath)
+	}
+	defer unix.Close(dirFD)
+	if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
+		if !canSkipEBPFError(cgroup) {
+			return err
+		}
+	}
+	return nil
+}
--- a/libcontainer/cgroups/fs2/freezer.go
+++ b/libcontainer/cgroups/fs2/freezer.go
@ -0,0 +1,53 @@
+// +build linux
+
+package fs2
+
+import (
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/pkg/errors"
+)
+
+// setFreezer puts the cgroup at dirPath into the requested freezer state by
+// writing "cgroup.freeze": 1 for configs.Frozen and 0 for configs.Thawed.
+// configs.Undefined is a no-op and any other state is an error.
+func setFreezer(dirPath string, state configs.FreezerState) error {
+	var desired int
+	switch state {
+	case configs.Undefined:
+		return nil
+	case configs.Frozen:
+		desired = 1
+	case configs.Thawed:
+		desired = 0
+	default:
+		return errors.Errorf("unknown freezer state %+v", state)
+	}
+	supportedErr := supportsFreezer(dirPath)
+	if supportedErr != nil && desired != 0 {
+		// Only a freeze request (desired == 1) is rejected here when
+		// "cgroup.freeze" cannot be read. For a thaw request (desired == 0)
+		// the probe error is ignored and the write below is still attempted.
+		return errors.Wrap(supportedErr, "freezer not supported")
+	}
+	return freezeWithInt(dirPath, desired)
+}
+
+// supportsFreezer probes for freezer support by reading "cgroup.freeze" in
+// dirPath. It returns nil when the file is readable and the read error
+// otherwise.
+func supportsFreezer(dirPath string) error {
+	_, err := fscommon.ReadFile(dirPath, "cgroup.freeze")
+	return err
+}
+
+// freezeWithInt writes desired int to "cgroup.freeze" in dirPath, then reads
+// the file back and returns an error unless its whitespace-trimmed content
+// equals the value that was written.
+func freezeWithInt(dirPath string, desired int) error {
+	desiredS := strconv.Itoa(desired)
+	if err := fscommon.WriteFile(dirPath, "cgroup.freeze", desiredS); err != nil {
+		return err
+	}
+	got, err := fscommon.ReadFile(dirPath, "cgroup.freeze")
+	if err != nil {
+		return err
+	}
+	if gotS := strings.TrimSpace(string(got)); gotS != desiredS {
+		return errors.Errorf("expected \"cgroup.freeze\" in %q to be %q, got %q", dirPath, desiredS, gotS)
+	}
+	return nil
+}
--- a/libcontainer/cgroups/fs2/fs2.go
+++ b/libcontainer/cgroups/fs2/fs2.go
@ -0,0 +1,214 @@
+// +build linux
+
+package fs2
+
+import (
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strings"
+
+	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/pkg/errors"
+)
+
+// NewManager creates a manager for cgroup v2 unified hierarchy.
+// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
+// If dirPath is empty, it is automatically set using config.
+//
+// A non-empty dirPath must be absolute and already in filepath.Clean form,
+// otherwise an error is returned. A nil config is replaced by an empty
+// configs.Cgroup.
+//
+// The enabled controllers are detected from dirPath, which also creates the
+// directory if it is missing. When rootless is true a detection error is
+// ignored and the manager is returned with a nil controller set.
+func NewManager(config *configs.Cgroup, dirPath string, rootless bool) (cgroups.Manager, error) {
+	if config == nil {
+		config = &configs.Cgroup{}
+	}
+	if dirPath != "" {
+		if filepath.Clean(dirPath) != dirPath || !filepath.IsAbs(dirPath) {
+			return nil, errors.Errorf("invalid dir path %q", dirPath)
+		}
+	} else {
+		var err error
+		dirPath, err = defaultDirPath(config)
+		if err != nil {
+			return nil, err
+		}
+	}
+	controllers, err := detectControllers(dirPath)
+	if err != nil && !rootless {
+		return nil, err
+	}
+
+	m := &manager{
+		config:      config,
+		dirPath:     dirPath,
+		controllers: controllers,
+		rootless:    rootless,
+	}
+	return m, nil
+}
+
+// detectControllers returns the set of controller names listed in the
+// "cgroup.controllers" file of dirPath.
+//
+// As a side effect dirPath (including missing parents) is created with mode
+// 0755 if it does not exist. The file content is split on whitespace and each
+// field becomes a key of the returned set.
+func detectControllers(dirPath string) (map[string]struct{}, error) {
+	if err := os.MkdirAll(dirPath, 0755); err != nil {
+		return nil, err
+	}
+	controllersPath, err := securejoin.SecureJoin(dirPath, "cgroup.controllers")
+	if err != nil {
+		return nil, err
+	}
+	controllersData, err := ioutil.ReadFile(controllersPath)
+	if err != nil {
+		return nil, err
+	}
+	controllersFields := strings.Fields(string(controllersData))
+	controllers := make(map[string]struct{}, len(controllersFields))
+	for _, c := range controllersFields {
+		controllers[c] = struct{}{}
+	}
+	return controllers, nil
+}
+
+// manager is the cgroup v2 manager built by NewManager. It operates on a
+// single cgroup directory.
+type manager struct {
+	// config is the cgroup configuration; it is replaced by Set and its
+	// freezer state is updated by Freeze.
+	config *configs.Cgroup
+	// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
+	dirPath string
+	// controllers is content of "cgroup.controllers" file.
+	// excludes pseudo-controllers ("devices" and "freezer").
+	controllers map[string]struct{}
+	rootless    bool
+}
+
+// Apply passes pid to cgroups.WriteCgroupProc for the manager's cgroup
+// directory. In rootless mode a failure is ignored and nil is returned.
+func (m *manager) Apply(pid int) error {
+	if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil && !m.rootless {
+		return err
+	}
+	return nil
+}
+
+// GetPids returns the result of cgroups.GetPids for the manager's cgroup
+// directory.
+func (m *manager) GetPids() ([]int, error) {
+	return cgroups.GetPids(m.dirPath)
+}
+
+// GetAllPids returns the result of cgroups.GetAllPids for the manager's
+// cgroup directory.
+func (m *manager) GetAllPids() ([]int, error) {
+	return cgroups.GetAllPids(m.dirPath)
+}
+
+// GetStats collects pids, memory, io and cpu statistics for the cgroup.
+//
+// memory, io and cpu are only read when the corresponding controller was
+// detected. pids are read from the pids controller when available and
+// otherwise counted by statPidsWithoutController.
+//
+// Errors from the individual readers are accumulated. When not rootless, any
+// accumulated error is returned together with the partially-filled stats;
+// when rootless, the errors are dropped and nil is returned.
+func (m *manager) GetStats() (*cgroups.Stats, error) {
+	// st starts as a zero value, so its map fields are nil; the stat helpers
+	// must not assume otherwise.
+	var (
+		st   cgroups.Stats
+		errs []error
+	)
+	// pids (since kernel 4.5)
+	if _, ok := m.controllers["pids"]; ok {
+		if err := statPids(m.dirPath, &st); err != nil {
+			errs = append(errs, err)
+		}
+	} else {
+		if err := statPidsWithoutController(m.dirPath, &st); err != nil {
+			errs = append(errs, err)
+		}
+	}
+	// memory (since kernel 4.5)
+	if _, ok := m.controllers["memory"]; ok {
+		if err := statMemory(m.dirPath, &st); err != nil {
+			errs = append(errs, err)
+		}
+	}
+	// io (since kernel 4.5)
+	if _, ok := m.controllers["io"]; ok {
+		if err := statIo(m.dirPath, &st); err != nil {
+			errs = append(errs, err)
+		}
+	}
+	// cpu (since kernel 4.15)
+	if _, ok := m.controllers["cpu"]; ok {
+		if err := statCpu(m.dirPath, &st); err != nil {
+			errs = append(errs, err)
+		}
+	}
+	if len(errs) > 0 && !m.rootless {
+		return &st, errors.Errorf("error while statting cgroup v2: %+v", errs)
+	}
+	return &st, nil
+}
+
+// Freeze moves the cgroup to the given freezer state and, on success, records
+// that state in the manager's config. If setting the state fails, the error
+// is returned and the config is left unchanged.
+func (m *manager) Freeze(state configs.FreezerState) error {
+	if err := setFreezer(m.dirPath, state); err != nil {
+		return err
+	}
+	// NewManager substitutes an empty configs.Cgroup for a nil config, which
+	// leaves Resources nil; allocate it so recording the state cannot panic.
+	if m.config.Resources == nil {
+		m.config.Resources = &configs.Resources{}
+	}
+	m.config.Resources.Freezer = state
+	return nil
+}
+
+// Destroy removes the manager's cgroup directory with os.RemoveAll.
+func (m *manager) Destroy() error {
+	return os.RemoveAll(m.dirPath)
+}
+
+// GetPaths is for compatibility purpose and should be removed in future.
+// Every detected controller, plus the "devices" and "freezer"
+// pseudo-controllers, is mapped to the manager's single cgroup directory.
+func (m *manager) GetPaths() map[string]string {
+	paths := make(map[string]string, len(m.controllers)+2)
+	// pseudo-controller for compatibility
+	paths["devices"] = m.dirPath
+	paths["freezer"] = m.dirPath
+	for name := range m.controllers {
+		paths[name] = m.dirPath
+	}
+	return paths
+}
+
+// GetUnifiedPath returns the manager's cgroup directory. The error is always
+// nil.
+func (m *manager) GetUnifiedPath() (string, error) {
+	return m.dirPath, nil
+}
+
+// Set applies the cgroup settings of container to the manager's cgroup
+// directory. A nil container or nil container.Cgroups is a no-op.
+//
+// pids, memory, io, cpu and cpuset are only applied when the corresponding
+// controller was detected; devices and freezer are always applied. Errors are
+// accumulated rather than aborting early, so every setter is attempted.
+//
+// When not rootless, any accumulated error is returned and the stored config
+// is left unchanged. When rootless, the errors are dropped and the stored
+// config is replaced by container.Cgroups.
+func (m *manager) Set(container *configs.Config) error {
+	if container == nil || container.Cgroups == nil {
+		return nil
+	}
+	var errs []error
+	// pids (since kernel 4.5)
+	if _, ok := m.controllers["pids"]; ok {
+		if err := setPids(m.dirPath, container.Cgroups); err != nil {
+			errs = append(errs, err)
+		}
+	}
+	// memory (since kernel 4.5)
+	if _, ok := m.controllers["memory"]; ok {
+		if err := setMemory(m.dirPath, container.Cgroups); err != nil {
+			errs = append(errs, err)
+		}
+	}
+	// io (since kernel 4.5)
+	if _, ok := m.controllers["io"]; ok {
+		if err := setIo(m.dirPath, container.Cgroups); err != nil {
+			errs = append(errs, err)
+		}
+	}
+	// cpu (since kernel 4.15)
+	if _, ok := m.controllers["cpu"]; ok {
+		if err := setCpu(m.dirPath, container.Cgroups); err != nil {
+			errs = append(errs, err)
+		}
+	}
+	// devices (since kernel 4.15, pseudo-controller)
+	if err := setDevices(m.dirPath, container.Cgroups); err != nil {
+		errs = append(errs, err)
+	}
+	// cpuset (since kernel 5.0)
+	if _, ok := m.controllers["cpuset"]; ok {
+		if err := setCpuset(m.dirPath, container.Cgroups); err != nil {
+			errs = append(errs, err)
+		}
+	}
+	// freezer (since kernel 5.2, pseudo-controller)
+	if err := setFreezer(m.dirPath, container.Cgroups.Freezer); err != nil {
+		errs = append(errs, err)
+	}
+	if len(errs) > 0 && !m.rootless {
+		return errors.Errorf("error while setting cgroup v2: %+v", errs)
+	}
+	m.config = container.Cgroups
+	return nil
+}
+
+// GetCgroups returns the cgroup configuration currently held by the manager.
+// The error is always nil.
+func (m *manager) GetCgroups() (*configs.Cgroup, error) {
+	return m.config, nil
+}
--- a/libcontainer/cgroups/fs2/io.go
+++ b/libcontainer/cgroups/fs2/io.go
@ -0,0 +1,124 @@
+// +build linux
+
+package fs2
+
+import (
+	"bufio"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// setIo applies the block I/O settings of cgroup to the cgroup directory
+// dirPath.
+//
+// A non-zero BlkioWeight is written to "io.bfq.weight". Each throttle device
+// entry is written to "io.max" as one write per entry, using the string
+// produced by its StringName method with the key "rbps", "wbps", "riops" or
+// "wiops" respectively. The first write error is returned.
+func setIo(dirPath string, cgroup *configs.Cgroup) error {
+	if cgroup.Resources.BlkioWeight != 0 {
+		filename := "io.bfq.weight"
+		if err := fscommon.WriteFile(dirPath, filename, strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
+			return err
+		}
+	}
+
+	for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
+		if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil {
+			return err
+		}
+	}
+	for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
+		if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil {
+			return err
+		}
+	}
+	for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
+		if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil {
+			return err
+		}
+	}
+	for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
+		if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// readCgroup2MapFile reads the file name in dirPath and returns a map from
+// the first whitespace-separated field of each line to the remaining fields
+// of that line.
+//
+// Lines with fewer than two fields are skipped. If a key occurs on several
+// lines, the last occurrence wins. An error is returned if the file cannot be
+// opened or read.
+func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) {
+	ret := map[string][]string{}
+	p := filepath.Join(dirPath, name)
+	f, err := os.Open(p)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		line := scanner.Text()
+		parts := strings.Fields(line)
+		if len(parts) < 2 {
+			continue
+		}
+		ret[parts[0]] = parts[1:]
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+	return ret, nil
+}
+
+// statIo fills stats.BlkioStats.IoServiceBytesRecursive from the "io.stat"
+// file in dirPath.
+//
+// Each io.stat line is keyed by a "major:minor" device number followed by
+// "name=value" items. One BlkioStatEntry is produced per item; the "rbytes"
+// and "wbytes" names are translated to the cgroup v1 names "read" and
+// "write", and every other name is kept as is. Lines whose key is not of the
+// form "a:b" and items that are not of the form "name=value" are skipped.
+//
+// An error is returned if the file cannot be read or if a device number or an
+// item value is not an unsigned integer. stats.BlkioStats is replaced as a
+// whole on success.
+func statIo(dirPath string, stats *cgroups.Stats) error {
+	// more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
+	var ioServiceBytesRecursive []cgroups.BlkioStatEntry
+	values, err := readCgroup2MapFile(dirPath, "io.stat")
+	if err != nil {
+		return err
+	}
+	for k, v := range values {
+		dev := strings.Split(k, ":")
+		if len(dev) != 2 {
+			continue
+		}
+		// The device key is "major:minor", so the major number comes first.
+		major, err := strconv.ParseUint(dev[0], 10, 0)
+		if err != nil {
+			return err
+		}
+		minor, err := strconv.ParseUint(dev[1], 10, 0)
+		if err != nil {
+			return err
+		}
+
+		for _, item := range v {
+			kv := strings.Split(item, "=")
+			if len(kv) != 2 {
+				continue
+			}
+			op := kv[0]
+
+			// Accommodate the cgroup v1 naming
+			switch op {
+			case "rbytes":
+				op = "read"
+			case "wbytes":
+				op = "write"
+			}
+
+			value, err := strconv.ParseUint(kv[1], 10, 0)
+			if err != nil {
+				return err
+			}
+
+			entry := cgroups.BlkioStatEntry{
+				Op:    op,
+				Major: major,
+				Minor: minor,
+				Value: value,
+			}
+			ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry)
+		}
+	}
+	stats.BlkioStats = cgroups.BlkioStats{IoServiceBytesRecursive: ioServiceBytesRecursive}
+	return nil
+}
--- a/libcontainer/cgroups/fs2/memory.go
+++ b/libcontainer/cgroups/fs2/memory.go
@ -0,0 +1,103 @@
+// +build linux
+
+package fs2
+
+import (
+	"bufio"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/pkg/errors"
+)
+
+// setMemory applies the memory settings of cgroup to the cgroup directory
+// dirPath. Non-zero values are written as decimal integers: MemorySwap to
+// "memory.swap.max", Memory to "memory.max" and MemoryReservation to
+// "memory.low". Zero values leave the corresponding file untouched. The first
+// write error is returned.
+func setMemory(dirPath string, cgroup *configs.Cgroup) error {
+	if cgroup.Resources.MemorySwap != 0 {
+		if err := fscommon.WriteFile(dirPath, "memory.swap.max", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+			return err
+		}
+	}
+	if cgroup.Resources.Memory != 0 {
+		if err := fscommon.WriteFile(dirPath, "memory.max", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+			return err
+		}
+	}
+
+	// cgroup.Resources.KernelMemory is ignored
+
+	if cgroup.Resources.MemoryReservation != 0 {
+		if err := fscommon.WriteFile(dirPath, "memory.low", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// statMemory fills stats.MemoryStats for the cgroup directory dirPath.
+//
+// Every "key value" line of "memory.stat" is stored in MemoryStats.Stats, and
+// Cache is taken from the "cache" key of that map. Usage is read from
+// "memory.current"/"memory.max" and SwapUsage from
+// "memory.swap.current"/"memory.swap.max". UseHierarchy is always set to
+// true.
+//
+// An error is returned if "memory.stat" cannot be opened, read or parsed, or
+// if the usage files cannot be read.
+func statMemory(dirPath string, stats *cgroups.Stats) error {
+	// Set stats from memory.stat.
+	statsFile, err := os.Open(filepath.Join(dirPath, "memory.stat"))
+	if err != nil {
+		return err
+	}
+	defer statsFile.Close()
+
+	// A zero-value cgroups.Stats has a nil Stats map and assigning into a nil
+	// map panics, so allocate it on demand.
+	if stats.MemoryStats.Stats == nil {
+		stats.MemoryStats.Stats = make(map[string]uint64)
+	}
+
+	sc := bufio.NewScanner(statsFile)
+	for sc.Scan() {
+		t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
+		if err != nil {
+			return errors.Wrapf(err, "failed to parse memory.stat (%q)", sc.Text())
+		}
+		stats.MemoryStats.Stats[t] = v
+	}
+	// Scan also returns false on a read error; do not treat that as EOF.
+	if err := sc.Err(); err != nil {
+		return errors.Wrap(err, "failed to read memory.stat")
+	}
+	stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]
+
+	memoryUsage, err := getMemoryDataV2(dirPath, "")
+	if err != nil {
+		return err
+	}
+	stats.MemoryStats.Usage = memoryUsage
+	swapUsage, err := getMemoryDataV2(dirPath, "swap")
+	if err != nil {
+		return err
+	}
+	stats.MemoryStats.SwapUsage = swapUsage
+
+	stats.MemoryStats.UseHierarchy = true
+	return nil
+}
+
+// getMemoryDataV2 reads the usage and limit of one memory resource from the
+// cgroup directory path.
+//
+// With an empty name the files are "memory.current" and "memory.max";
+// otherwise they are "memory.<name>.current" and "memory.<name>.max".
+//
+// For a non-empty name, a missing file is not an error: a zero MemoryData is
+// returned instead. Any other read or parse failure is returned wrapped with
+// the name of the file involved.
+func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
+	memoryData := cgroups.MemoryData{}
+
+	moduleName := "memory"
+	if name != "" {
+		moduleName = strings.Join([]string{"memory", name}, ".")
+	}
+	usage := strings.Join([]string{moduleName, "current"}, ".")
+	limit := strings.Join([]string{moduleName, "max"}, ".")
+
+	value, err := fscommon.GetCgroupParamUint(path, usage)
+	if err != nil {
+		if moduleName != "memory" && os.IsNotExist(err) {
+			return cgroups.MemoryData{}, nil
+		}
+		return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", usage)
+	}
+	memoryData.Usage = value
+
+	value, err = fscommon.GetCgroupParamUint(path, limit)
+	if err != nil {
+		if moduleName != "memory" && os.IsNotExist(err) {
+			return cgroups.MemoryData{}, nil
+		}
+		return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", limit)
+	}
+	memoryData.Limit = value
+
+	return memoryData, nil
+}
--- a/libcontainer/cgroups/fs2/pids.go
+++ b/libcontainer/cgroups/fs2/pids.go
@ -0,0 +1,90 @@
+// +build linux
+
+package fs2
+
+import (
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
+)
+
+// setPids applies the pids limit of cgroup to "pids.max" in dirPath. A
+// PidsLimit of zero leaves the file untouched, a positive value is written as
+// a decimal number, and a negative value is written as "max".
+func setPids(dirPath string, cgroup *configs.Cgroup) error {
+	if cgroup.Resources.PidsLimit != 0 {
+		// "max" is the fallback value.
+		limit := "max"
+
+		if cgroup.Resources.PidsLimit > 0 {
+			limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
+		}
+
+		if err := fscommon.WriteFile(dirPath, "pids.max", limit); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// isNOTSUP reports whether err is an *os.PathError whose underlying error is
+// unix.ENOTSUP. Errors of any other type yield false.
+func isNOTSUP(err error) bool {
+	switch err := err.(type) {
+	case *os.PathError:
+		return err.Err == unix.ENOTSUP
+	default:
+		return false
+	}
+}
+
+// statPidsWithoutController fills stats.PidsStats by counting the entries
+// listed for the cgroup directory dirPath instead of reading the pids
+// controller files.
+//
+// Current is set to the number of distinct non-empty lines and Limit is set
+// to 0.
+func statPidsWithoutController(dirPath string, stats *cgroups.Stats) error {
+	// if the controller is not enabled, let's read PIDS from cgroup.procs
+	// (or threads if cgroup.threads is enabled)
+	contents, err := ioutil.ReadFile(filepath.Join(dirPath, "cgroup.procs"))
+	if err != nil && isNOTSUP(err) {
+		contents, err = ioutil.ReadFile(filepath.Join(dirPath, "cgroup.threads"))
+	}
+	if err != nil {
+		return err
+	}
+	// The map is used as a set so that a repeated line is counted once.
+	pids := make(map[string]string)
+	for _, i := range strings.Split(string(contents), "\n") {
+		if i != "" {
+			pids[i] = i
+		}
+	}
+	stats.PidsStats.Current = uint64(len(pids))
+	stats.PidsStats.Limit = 0
+	return nil
+}
+
+// statPids fills stats.PidsStats from "pids.current" and "pids.max" in
+// dirPath. A "pids.max" content of "max" is reported as a Limit of 0. Read
+// and parse failures are returned wrapped; in that case stats is left
+// unmodified.
+func statPids(dirPath string, stats *cgroups.Stats) error {
+	current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current")
+	if err != nil {
+		return errors.Wrap(err, "failed to parse pids.current")
+	}
+
+	maxString, err := fscommon.GetCgroupParamString(dirPath, "pids.max")
+	if err != nil {
+		return errors.Wrap(err, "failed to parse pids.max")
+	}
+
+	// Default if pids.max == "max" is 0 -- which represents "no limit".
+	var max uint64
+	if maxString != "max" {
+		max, err = fscommon.ParseUint(maxString, 10, 64)
+		if err != nil {
+			return errors.Wrapf(err, "failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q",
+				maxString, filepath.Join(dirPath, "pids.max"))
+		}
+	}
+
+	stats.PidsStats.Current = current
+	stats.PidsStats.Limit = max
+	return nil
+}
--- a/libcontainer/cgroups/fscommon/fscommon.go
+++ b/libcontainer/cgroups/fscommon/fscommon.go
@ -0,0 +1,36 @@
+// +build linux
+
+package fscommon
+
+import (
+	"io/ioutil"
+
+	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/pkg/errors"
+)
+
+// WriteFile writes data to the file named file inside dir.
+//
+// The target path is built with securejoin.SecureJoin. An empty dir is
+// rejected with an error, and a failed write is returned wrapped with the
+// data and the path involved.
+func WriteFile(dir, file, data string) error {
+	if dir == "" {
+		return errors.Errorf("no directory specified for %s", file)
+	}
+	path, err := securejoin.SecureJoin(dir, file)
+	if err != nil {
+		return err
+	}
+	if err := ioutil.WriteFile(path, []byte(data), 0700); err != nil {
+		return errors.Wrapf(err, "failed to write %q to %q", data, path)
+	}
+	return nil
+}
+
+// ReadFile returns the content of the file named file inside dir as a string.
+//
+// The target path is built with securejoin.SecureJoin. An empty dir is
+// rejected with an error. The read error, if any, is returned unwrapped
+// together with whatever data was read.
+func ReadFile(dir, file string) (string, error) {
+	if dir == "" {
+		return "", errors.Errorf("no directory specified for %s", file)
+	}
+	path, err := securejoin.SecureJoin(dir, file)
+	if err != nil {
+		return "", err
+	}
+	data, err := ioutil.ReadFile(path)
+	return string(data), err
+}
--- a/libcontainer/cgroups/fscommon/utils.go
+++ b/libcontainer/cgroups/fscommon/utils.go
@ -0,0 +1,83 @@
+// +build linux
+
+package fscommon
+
+import (
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"math"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+var (
+	// ErrNotValidFormat is returned by GetCgroupParamKeyValue when a line
+	// does not consist of exactly two whitespace-separated fields.
+	ErrNotValidFormat = errors.New("line is not a valid key value format")
+)
+
+// ParseUint parses s like strconv.ParseUint, except that it saturates
+// negative values at zero and returns a uint64.
+// Due to kernel bugs, some of the memory cgroup stats can be negative.
+//
+// Input that is neither a valid unsigned nor a negative integer yields the
+// error from strconv.ParseUint.
+func ParseUint(s string, base, bitSize int) (uint64, error) {
+	value, err := strconv.ParseUint(s, base, bitSize)
+	if err != nil {
+		// Retry as a signed integer to tell a negative number apart from
+		// input that is not a number at all.
+		intValue, intErr := strconv.ParseInt(s, base, bitSize)
+		// 1. Handle negative values greater than MinInt64 (and)
+		// 2. Handle negative values lesser than MinInt64
+		if intErr == nil && intValue < 0 {
+			return 0, nil
+		} else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 {
+			return 0, nil
+		}
+
+		return value, err
+	}
+
+	return value, nil
+}
+
+// GetCgroupParamKeyValue parses a cgroup param and returns it as name, value,
+// i.e. "io_service_bytes 1234" will return as io_service_bytes, 1234.
+//
+// The line must consist of exactly two whitespace-separated fields, otherwise
+// ErrNotValidFormat is returned. The value is parsed with ParseUint, so a
+// negative value is reported as 0.
+func GetCgroupParamKeyValue(t string) (string, uint64, error) {
+	parts := strings.Fields(t)
+	switch len(parts) {
+	case 2:
+		value, err := ParseUint(parts[1], 10, 64)
+		if err != nil {
+			return "", 0, fmt.Errorf("unable to convert param value (%q) to uint64: %v", parts[1], err)
+		}
+
+		return parts[0], value, nil
+	default:
+		return "", 0, ErrNotValidFormat
+	}
+}
+
+// GetCgroupParamUint gets a single uint64 value from the specified cgroup
+// file.
+//
+// Surrounding whitespace is trimmed before parsing. The content "max" is
+// reported as math.MaxUint64, and a negative number is reported as 0 because
+// parsing goes through ParseUint. An error is returned if the file cannot be
+// read or its content is not a number.
+func GetCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) {
+	fileName := filepath.Join(cgroupPath, cgroupFile)
+	contents, err := ioutil.ReadFile(fileName)
+	if err != nil {
+		return 0, err
+	}
+	trimmed := strings.TrimSpace(string(contents))
+	if trimmed == "max" {
+		return math.MaxUint64, nil
+	}
+
+	res, err := ParseUint(trimmed, 10, 64)
+	if err != nil {
+		return res, fmt.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), fileName)
+	}
+	return res, nil
+}
+
+// GetCgroupParamString gets a string value from the specified cgroup file,
+// with leading and trailing whitespace removed.
+func GetCgroupParamString(cgroupPath, cgroupFile string) (string, error) {
+	contents, err := ioutil.ReadFile(filepath.Join(cgroupPath, cgroupFile))
+	if err != nil {
+		return "", err
+	}
+
+	return strings.TrimSpace(string(contents)), nil
+}
--- a/libcontainer/cgroups/fscommon/utils_test.go
+++ b/libcontainer/cgroups/fscommon/utils_test.go
@ -0,0 +1,97 @@
+// +build linux
+
+package fscommon
+
+import (
+	"io/ioutil"
+	"math"
+	"os"
+	"path/filepath"
+	"strconv"
+	"testing"
+)
+
+// Fixtures for TestGetCgroupParamsInt. Despite the "float" names, floatString
+// is parsed as an unsigned integer by GetCgroupParamUint and the result is
+// compared against floatValue.
+const (
+	cgroupFile  = "cgroup.file"
+	floatValue  = 2048.0
+	floatString = "2048"
+)
+
+// TestGetCgroupParamsInt exercises GetCgroupParamUint against a file in a
+// temporary directory: a plain number, a number with a trailing newline,
+// negative numbers (which must be reported as 0), non-numeric content and a
+// missing file (which must both fail).
+func TestGetCgroupParamsInt(t *testing.T) {
+	// Setup tempdir.
+	tempDir, err := ioutil.TempDir("", "cgroup_utils_test")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(tempDir)
+	tempFile := filepath.Join(tempDir, cgroupFile)
+
+	// Success.
+	err = ioutil.WriteFile(tempFile, []byte(floatString), 0755)
+	if err != nil {
+		t.Fatal(err)
+	}
+	value, err := GetCgroupParamUint(tempDir, cgroupFile)
+	if err != nil {
+		t.Fatal(err)
+	} else if value != floatValue {
+		t.Fatalf("Expected %d to equal %f", value, floatValue)
+	}
+
+	// Success with new line.
+	err = ioutil.WriteFile(tempFile, []byte(floatString+"\n"), 0755)
+	if err != nil {
+		t.Fatal(err)
+	}
+	value, err = GetCgroupParamUint(tempDir, cgroupFile)
+	if err != nil {
+		t.Fatal(err)
+	} else if value != floatValue {
+		t.Fatalf("Expected %d to equal %f", value, floatValue)
+	}
+
+	// Success with negative values
+	err = ioutil.WriteFile(tempFile, []byte("-12345"), 0755)
+	if err != nil {
+		t.Fatal(err)
+	}
+	value, err = GetCgroupParamUint(tempDir, cgroupFile)
+	if err != nil {
+		t.Fatal(err)
+	} else if value != 0 {
+		t.Fatalf("Expected %d to equal %d", value, 0)
+	}
+
+	// Success with negative values lesser than min int64
+	s := strconv.FormatFloat(math.MinInt64, 'f', -1, 64)
+	err = ioutil.WriteFile(tempFile, []byte(s), 0755)
+	if err != nil {
+		t.Fatal(err)
+	}
+	value, err = GetCgroupParamUint(tempDir, cgroupFile)
+	if err != nil {
+		t.Fatal(err)
+	} else if value != 0 {
+		t.Fatalf("Expected %d to equal %d", value, 0)
+	}
+
+	// Not a number.
+	err = ioutil.WriteFile(tempFile, []byte("not-a-float"), 0755)
+	if err != nil {
+		t.Fatal(err)
+	}
+	_, err = GetCgroupParamUint(tempDir, cgroupFile)
+	if err == nil {
+		t.Fatal("Expecting error, got none")
+	}
+
+	// Unknown file.
+	err = os.Remove(tempFile)
+	if err != nil {
+		t.Fatal(err)
+	}
+	_, err = GetCgroupParamUint(tempDir, cgroupFile)
+	if err == nil {
+		t.Fatal("Expecting error, got none")
+	}
+}
--- a/libcontainer/cgroups/stats.go
+++ b/libcontainer/cgroups/stats.go
@ -0,0 +1,108 @@
+// +build linux
+
+package cgroups
+
+// ThrottlingData holds the CPU throttling counters of a cgroup.
+// Every field is left out of the JSON encoding when it is zero (omitempty).
+type ThrottlingData struct {
+	// Number of periods with throttling active
+	Periods uint64 `json:"periods,omitempty"`
+	// Number of periods when the container hit its throttling limit.
+	ThrottledPeriods uint64 `json:"throttled_periods,omitempty"`
+	// Aggregate time the container was throttled for in nanoseconds.
+	ThrottledTime uint64 `json:"throttled_time,omitempty"`
+}
+
+// CpuUsage denotes the usage of a CPU.
+// All CPU stats are aggregate since container inception.
+//
+// TotalUsage and PercpuUsage are omitted from the JSON encoding when empty;
+// the kernel-mode and user-mode fields carry no omitempty and are therefore
+// always encoded, even when zero.
+type CpuUsage struct {
+	// Total CPU time consumed.
+	// Units: nanoseconds.
+	TotalUsage uint64 `json:"total_usage,omitempty"`
+	// Total CPU time consumed per core.
+	// Units: nanoseconds.
+	PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
+	// Time spent by tasks of the cgroup in kernel mode.
+	// Units: nanoseconds.
+	UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
+	// Time spent by tasks of the cgroup in user mode.
+	// Units: nanoseconds.
+	UsageInUsermode uint64 `json:"usage_in_usermode"`
+}
+
+// CpuStats groups the CPU usage figures (CpuUsage) and the throttling
+// counters (ThrottlingData) of a cgroup.
+//
+// NOTE: both fields are struct values; encoding/json never considers a struct
+// value empty, so the omitempty option on them has no effect.
+type CpuStats struct {
+	CpuUsage       CpuUsage       `json:"cpu_usage,omitempty"`
+	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
+}
+
+// MemoryData is one set of memory counters: current usage, maximum usage,
+// failure count and limit. MemoryStats uses it for plain memory, memory+swap,
+// kernel memory and kernel TCP memory.
+//
+// Usage and MaxUsage are omitted from the JSON encoding when zero; Failcnt
+// and Limit are always encoded.
+// NOTE(review): units are not stated here (presumably bytes for Usage,
+// MaxUsage and Limit) - confirm against the code that fills these fields.
+type MemoryData struct {
+	Usage    uint64 `json:"usage,omitempty"`
+	MaxUsage uint64 `json:"max_usage,omitempty"`
+	Failcnt  uint64 `json:"failcnt"`
+	Limit    uint64 `json:"limit"`
+}
+
+// MemoryStats aggregates the memory-related statistics of a cgroup.
+type MemoryStats struct {
+	// memory used for cache
+	Cache uint64 `json:"cache,omitempty"`
+	// usage of memory
+	Usage MemoryData `json:"usage,omitempty"`
+	// usage of memory + swap
+	SwapUsage MemoryData `json:"swap_usage,omitempty"`
+	// usage of kernel memory
+	KernelUsage MemoryData `json:"kernel_usage,omitempty"`
+	// usage of kernel TCP memory
+	KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
+	// if true, memory usage is accounted for throughout a hierarchy of cgroups.
+	UseHierarchy bool `json:"use_hierarchy"`
+
+	// Additional named counters. NewStats initialises this map so it can be
+	// written to without a nil check.
+	// NOTE(review): the set of keys is not defined here - confirm with the
+	// code that populates it.
+	Stats map[string]uint64 `json:"stats,omitempty"`
+}
+
+// PidsStats reports the process-count figures of a cgroup.
+// Both fields are omitted from the JSON encoding when zero.
+type PidsStats struct {
+	// number of pids in the cgroup
+	Current uint64 `json:"current,omitempty"`
+	// active pids hard limit
+	Limit uint64 `json:"limit,omitempty"`
+}
+
+// BlkioStatEntry is a single block I/O counter: a Value recorded for the
+// operation Op on the device identified by Major and Minor.
+// All fields are omitted from the JSON encoding when zero or empty.
+// NOTE(review): the set of Op strings and the unit of Value depend on which
+// BlkioStats list the entry belongs to - confirm with the populating code.
+type BlkioStatEntry struct {
+	Major uint64 `json:"major,omitempty"`
+	Minor uint64 `json:"minor,omitempty"`
+	Op    string `json:"op,omitempty"`
+	Value uint64 `json:"value,omitempty"`
+}
+
+// BlkioStats groups the block I/O statistics of a cgroup; each field is a
+// list of per-device, per-operation entries and is omitted from the JSON
+// encoding when empty.
+//
+// NOTE: the JSON key of IoQueuedRecursive is "io_queue_recursive" (without
+// the "d"), unlike the field name; it is part of the serialized format.
+type BlkioStats struct {
+	// number of bytes transferred to and from the block device
+	IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
+	IoServicedRecursive     []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
+	IoQueuedRecursive       []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
+	IoServiceTimeRecursive  []BlkioStatEntry `json:"io_service_time_recursive,omitempty"`
+	IoWaitTimeRecursive     []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"`
+	IoMergedRecursive       []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
+	IoTimeRecursive         []BlkioStatEntry `json:"io_time_recursive,omitempty"`
+	SectorsRecursive        []BlkioStatEntry `json:"sectors_recursive,omitempty"`
+}
+
+// HugetlbStats holds the huge-page counters for one huge page size.
+// Usage and MaxUsage are omitted from the JSON encoding when zero; Failcnt is
+// always encoded.
+type HugetlbStats struct {
+	// current res_counter usage for hugetlb
+	Usage uint64 `json:"usage,omitempty"`
+	// maximum usage ever recorded.
+	MaxUsage uint64 `json:"max_usage,omitempty"`
+	// number of times a hugetlb usage allocation failed.
+	Failcnt uint64 `json:"failcnt"`
+}
+
+// Stats is the top-level container for all statistics collected for a
+// cgroup: CPU, memory, pids, block I/O and huge pages.
+//
+// NOTE: the first four fields are struct values; encoding/json never
+// considers a struct value empty, so omitempty has no effect on them.
+type Stats struct {
+	CpuStats    CpuStats    `json:"cpu_stats,omitempty"`
+	MemoryStats MemoryStats `json:"memory_stats,omitempty"`
+	PidsStats   PidsStats   `json:"pids_stats,omitempty"`
+	BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`
+	// the map is in the format "size of hugepage: stats of the hugepage"
+	HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
+}
+
+// NewStats returns a zeroed Stats whose two map fields (MemoryStats.Stats and
+// HugetlbStats) are already allocated, so callers can insert into them
+// directly.
+func NewStats() *Stats {
+	stats := &Stats{
+		HugetlbStats: make(map[string]HugetlbStats),
+	}
+	stats.MemoryStats.Stats = make(map[string]uint64)
+	return stats
+}
--- a/libcontainer/cgroups/systemd/apply_nosystemd.go
+++ b/libcontainer/cgroups/systemd/apply_nosystemd.go
@ -0,0 +1,67 @@
+// +build !linux
+
+package systemd
+
+import (
+	"fmt"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// Manager is the placeholder systemd cgroup manager compiled on non-Linux
+// platforms (see the !linux build tag of this file). It carries the cgroup
+// configuration and a name-to-path map, but none of the methods defined in
+// this file performs any work.
+type Manager struct {
+	Cgroups *configs.Cgroup
+	Paths   map[string]string
+}
+
+// UseSystemd always reports false in this non-Linux build.
+func UseSystemd() bool {
+	return false
+}
+
+// NewSystemdCgroupsManager never yields a manager factory in this non-Linux
+// build: it returns a nil function together with a "Systemd not supported"
+// error.
+func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) {
+	return nil, fmt.Errorf("Systemd not supported")
+}
+
+// Apply is a stub; it ignores pid and always returns a
+// "Systemd not supported" error.
+func (m *Manager) Apply(pid int) error {
+	return fmt.Errorf("Systemd not supported")
+}
+
+// GetPids is a stub; it always returns a nil slice and a
+// "Systemd not supported" error.
+func (m *Manager) GetPids() ([]int, error) {
+	return nil, fmt.Errorf("Systemd not supported")
+}
+
+// GetAllPids is a stub; it always returns a nil slice and a
+// "Systemd not supported" error.
+func (m *Manager) GetAllPids() ([]int, error) {
+	return nil, fmt.Errorf("Systemd not supported")
+}
+
+// Destroy is a stub; it always returns a "Systemd not supported" error.
+func (m *Manager) Destroy() error {
+	return fmt.Errorf("Systemd not supported")
+}
+
+// GetPaths is a stub; it always returns nil and does not consult m.Paths.
+func (m *Manager) GetPaths() map[string]string {
+	return nil
+}
+
+// GetUnifiedPath is a stub; it always returns an empty path and a
+// "Systemd not supported" error.
+func (m *Manager) GetUnifiedPath() (string, error) {
+	return "", fmt.Errorf("Systemd not supported")
+}
+
+// GetStats is a stub; it always returns nil statistics and a
+// "Systemd not supported" error.
+func (m *Manager) GetStats() (*cgroups.Stats, error) {
+	return nil, fmt.Errorf("Systemd not supported")
+}
+
+// Set is a stub; it ignores container and always returns a
+// "Systemd not supported" error.
+func (m *Manager) Set(container *configs.Config) error {
+	return fmt.Errorf("Systemd not supported")
+}
+
+// Freeze is a stub; it ignores state and always returns a
+// "Systemd not supported" error.
+func (m *Manager) Freeze(state configs.FreezerState) error {
+	return fmt.Errorf("Systemd not supported")
+}
+
+// Freeze is the package-level stub counterpart of Manager.Freeze; it ignores
+// both arguments and always returns a "Systemd not supported" error.
+func Freeze(c *configs.Cgroup, state configs.FreezerState) error {
+	return fmt.Errorf("Systemd not supported")
+}
+
+// GetCgroups is a stub; it always returns nil and a "Systemd not supported"
+// error, without consulting m.Cgroups.
+func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
+	return nil, fmt.Errorf("Systemd not supported")
+}
--- a/libcontainer/cgroups/systemd/apply_systemd.go
+++ b/libcontainer/cgroups/systemd/apply_systemd.go
@ -0,0 +1,534 @@
+// +build linux
+
+package systemd
+
+import (
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"math"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+
+	systemdDbus "github.com/coreos/go-systemd/dbus"
+	"github.com/godbus/dbus"
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fs"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/sirupsen/logrus"
+)
+
+// LegacyManager is the manager returned by NewSystemdCgroupsManager when the
+// host is not in cgroup v2 unified mode.
+//
+// Cgroups is the cgroup configuration; Paths maps a subsystem name to the
+// cgroup directory used for it. Destroy, GetPaths and GetStats hold mu while
+// they access Paths.
+type LegacyManager struct {
+	mu      sync.Mutex
+	Cgroups *configs.Cgroup
+	Paths   map[string]string
+}
+
+// subsystem is the behaviour this file needs from a cgroup controller
+// implementation: a name, a way to read statistics and a way to apply
+// configuration.
+type subsystem interface {
+	// Name returns the name of the subsystem.
+	Name() string
+	// GetStats fills 'stats' with the statistics of the cgroup under 'path'.
+	GetStats(path string, stats *cgroups.Stats) error
+	// Set applies the settings in 'cgroup' to the cgroup under 'path'.
+	Set(path string, cgroup *configs.Cgroup) error
+}
+
+// errSubsystemDoesNotExist is returned by subsystemSet.Get when no subsystem
+// with the requested name is present in the set.
+var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
+
+// subsystemSet is an ordered list of subsystems that can be searched by name.
+type subsystemSet []subsystem
+
+// Get returns the first subsystem in s whose Name equals name, or
+// errSubsystemDoesNotExist when there is none.
+func (s subsystemSet) Get(name string) (subsystem, error) {
+	for i := range s {
+		if candidate := s[i]; candidate.Name() == name {
+			return candidate, nil
+		}
+	}
+	return nil, errSubsystemDoesNotExist
+}
+
+// legacySubsystems lists the subsystem implementations (from the fs package)
+// that LegacyManager iterates over when joining cgroups, recording paths,
+// applying settings and collecting statistics. The last entry is the named
+// "name=systemd" hierarchy.
+var legacySubsystems = subsystemSet{
+	&fs.CpusetGroup{},
+	&fs.DevicesGroup{},
+	&fs.MemoryGroup{},
+	&fs.CpuGroup{},
+	&fs.CpuacctGroup{},
+	&fs.PidsGroup{},
+	&fs.BlkioGroup{},
+	&fs.HugetlbGroup{},
+	&fs.PerfEventGroup{},
+	&fs.FreezerGroup{},
+	&fs.NetPrioGroup{},
+	&fs.NetClsGroup{},
+	&fs.NameGroup{GroupName: "name=systemd"},
+}
+
+// NOTE(review): neither constant is referenced in the visible part of this
+// package; they look like wait durations left over from removed systemd
+// feature probes - confirm before relying on or removing them.
+const (
+	testScopeWait = 4
+	testSliceWait = 4
+)
+
+// theConn is the package-wide systemd D-Bus connection. UseSystemd creates it
+// on first use while holding connLock; the managers in this package call
+// methods on it without checking it for nil.
+var (
+	connLock sync.Mutex
+	theConn  *systemdDbus.Conn
+)
+
+func newProp(name string, units interface{}) systemdDbus.Property {
+	return systemdDbus.Property{
+		Name:  name,
+		Value: dbus.MakeVariant(units),
+	}
+}
+
+// NOTE: This function was taken from github.com/coreos/go-systemd/util and
+// copied here so that this package does not depend on cgo.
+//
+// isRunningSystemd reports whether the host was booted with systemd as its
+// init system. Like systemd's `sd_booted(3)`, it checks that
+// /run/systemd/system exists and is a directory (the path itself is examined,
+// symbolic links are not followed).
+// http://www.freedesktop.org/software/systemd/man/sd_booted.html
+func isRunningSystemd() bool {
+	if info, err := os.Lstat("/run/systemd/system"); err == nil {
+		return info.IsDir()
+	}
+	return false
+}
+
+func UseSystemd() bool {
+	if !isRunningSystemd() {
+		return false
+	}
+
+	connLock.Lock()
+	defer connLock.Unlock()
+
+	if theConn == nil {
+		var err error
+		theConn, err = systemdDbus.New()
+		if err != nil {
+			return false
+		}
+	}
+	return true
+}
+
+// NewSystemdCgroupsManager returns a factory for systemd-backed cgroup
+// managers. The factory produces UnifiedManager values when the host is in
+// cgroup v2 unified mode and LegacyManager values otherwise. An error is
+// returned when the host is not running systemd.
+func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) {
+	if !isRunningSystemd() {
+		return nil, fmt.Errorf("systemd not running on this host, can't use systemd as a cgroups.Manager")
+	}
+
+	newUnified := func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
+		return &UnifiedManager{Cgroups: config, Paths: paths}
+	}
+	newLegacy := func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
+		return &LegacyManager{Cgroups: config, Paths: paths}
+	}
+
+	if cgroups.IsCgroup2UnifiedMode() {
+		return newUnified, nil
+	}
+	return newLegacy, nil
+}
+
+// Apply places pid into the cgroups described by m.Cgroups.
+//
+// When m.Cgroups.Paths is set, no systemd unit is created: the configured
+// paths whose subsystem hierarchy can be resolved are stored in m.Paths and
+// pid is entered into them with cgroups.EnterPid.
+//
+// Otherwise a transient systemd unit (a scope, or a slice when the name ends
+// in ".slice") is started with properties derived from m.Cgroups.Resources,
+// pid is joined to the legacy subsystems via joinCgroups, and the resulting
+// per-subsystem paths are recorded in m.Paths. An "already exists" error from
+// StartTransientUnit is tolerated.
+//
+// NOTE: the package-level theConn is used without a nil check, and m.Paths is
+// assigned without holding m.mu.
+func (m *LegacyManager) Apply(pid int) error {
+	var (
+		c          = m.Cgroups
+		unitName   = getUnitName(c)
+		slice      = "system.slice"
+		properties []systemdDbus.Property
+	)
+
+	// Pre-existing paths were supplied: just join them.
+	if c.Paths != nil {
+		paths := make(map[string]string)
+		for name, path := range c.Paths {
+			// Only the error is of interest here; the configured path is kept.
+			_, err := getSubsystemPath(m.Cgroups, name)
+			if err != nil {
+				// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
+				if cgroups.IsNotFound(err) {
+					continue
+				}
+				return err
+			}
+			paths[name] = path
+		}
+		m.Paths = paths
+		return cgroups.EnterPid(m.Paths, pid)
+	}
+
+	if c.Parent != "" {
+		slice = c.Parent
+	}
+
+	properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
+
+	// if we create a slice, the parent is defined via a Wants=
+	if strings.HasSuffix(unitName, ".slice") {
+		properties = append(properties, systemdDbus.PropWants(slice))
+	} else {
+		// otherwise, we use Slice=
+		properties = append(properties, systemdDbus.PropSlice(slice))
+	}
+
+	// only add pid if its valid, -1 is used w/ general slice creation.
+	if pid != -1 {
+		properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
+	}
+
+	// Check if we can delegate. This is only supported on systemd versions 218 and above.
+	if !strings.HasSuffix(unitName, ".slice") {
+		// Assume scopes always support delegation.
+		properties = append(properties, newProp("Delegate", true))
+	}
+
+	// Always enable accounting, this gets us the same behaviour as the fs implementation,
+	// plus the kernel has some problems with joining the memory cgroup at a later time.
+	properties = append(properties,
+		newProp("MemoryAccounting", true),
+		newProp("CPUAccounting", true),
+		newProp("BlockIOAccounting", true))
+
+	// Assume DefaultDependencies= will always work (the check for it was previously broken.)
+	properties = append(properties,
+		newProp("DefaultDependencies", false))
+
+	if c.Resources.Memory != 0 {
+		properties = append(properties,
+			newProp("MemoryLimit", uint64(c.Resources.Memory)))
+	}
+
+	if c.Resources.CpuShares != 0 {
+		properties = append(properties,
+			newProp("CPUShares", c.Resources.CpuShares))
+	}
+
+	// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
+	if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
+		// corresponds to USEC_INFINITY in systemd
+		// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
+		// always setting a property value ensures we can apply a quota and remove it later
+		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
+		if c.Resources.CpuQuota > 0 {
+			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
+			// (integer percentage of CPU) internally.  This means that if a fractional percent of
+			// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
+			// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
+			cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
+			if cpuQuotaPerSecUSec%10000 != 0 {
+				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
+			}
+		}
+		properties = append(properties,
+			newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
+	}
+
+	if c.Resources.BlkioWeight != 0 {
+		properties = append(properties,
+			newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
+	}
+
+	if c.Resources.PidsLimit > 0 {
+		properties = append(properties,
+			newProp("TasksAccounting", true),
+			newProp("TasksMax", uint64(c.Resources.PidsLimit)))
+	}
+
+	// We have to set kernel memory here, as we can't change it once
+	// processes have been attached to the cgroup.
+	if c.Resources.KernelMemory != 0 {
+		if err := setKernelMemory(c); err != nil {
+			return err
+		}
+	}
+
+	// Start the unit and wait at most one second for the completion signal;
+	// a timeout is only logged, and a unit that already exists is accepted.
+	statusChan := make(chan string, 1)
+	if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
+		select {
+		case <-statusChan:
+		case <-time.After(time.Second):
+			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
+		}
+	} else if !isUnitExists(err) {
+		return err
+	}
+
+	if err := joinCgroups(c, pid); err != nil {
+		return err
+	}
+
+	// Record the path of every legacy subsystem whose hierarchy exists.
+	paths := make(map[string]string)
+	for _, s := range legacySubsystems {
+		subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name())
+		if err != nil {
+			// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
+			if cgroups.IsNotFound(err) {
+				continue
+			}
+			return err
+		}
+		paths[s.Name()] = subsystemPath
+	}
+	m.Paths = paths
+	return nil
+}
+
+// Destroy stops the systemd unit of this cgroup and removes the recorded
+// cgroup paths, leaving m.Paths empty on success. It does nothing when the
+// configuration carries pre-existing paths (m.Cgroups.Paths), since those
+// were only joined, not created.
+func (m *LegacyManager) Destroy() error {
+	if m.Cgroups.Paths != nil {
+		return nil
+	}
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	// The outcome of StopUnit is deliberately not inspected.
+	unit := getUnitName(m.Cgroups)
+	theConn.StopUnit(unit, "replace", nil)
+
+	err := cgroups.RemovePaths(m.Paths)
+	if err == nil {
+		m.Paths = make(map[string]string)
+	}
+	return err
+}
+
+// GetPaths returns the subsystem-to-path map, read while holding m.mu. The
+// map itself is returned, not a copy.
+func (m *LegacyManager) GetPaths() map[string]string {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	return m.Paths
+}
+
+// GetUnifiedPath always fails for the legacy manager: a unified path exists
+// only in cgroup v2 unified mode.
+func (m *LegacyManager) GetUnifiedPath() (string, error) {
+	return "", errors.New("unified path is only supported when running in unified mode")
+}
+
+// join resolves the cgroup directory of subsystem for c, creates it if
+// needed and writes pid into it. It returns the directory on success and an
+// empty string with the first error otherwise.
+func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
+	dir, err := getSubsystemPath(c, subsystem)
+	if err == nil {
+		err = os.MkdirAll(dir, 0755)
+	}
+	if err == nil {
+		err = cgroups.WriteCgroupProc(dir, pid)
+	}
+	if err != nil {
+		return "", err
+	}
+	return dir, nil
+}
+
+// joinCgroups adds pid to the cgroup of every legacy subsystem.
+//
+//   - "name=systemd" is skipped; systemd takes care of it.
+//   - "cpuset" goes through fs.CpusetGroup.ApplyDir; a hierarchy that is not
+//     found is tolerated when resolving its path.
+//   - every other subsystem is joined with join. Any failure for "devices"
+//     is fatal; for the remaining subsystems only errors other than
+//     "not found" are returned.
+func joinCgroups(c *configs.Cgroup, pid int) error {
+	for _, sys := range legacySubsystems {
+		name := sys.Name()
+
+		if name == "name=systemd" {
+			// let systemd handle this
+			continue
+		}
+
+		if name == "cpuset" {
+			dir, err := getSubsystemPath(c, name)
+			if err != nil && !cgroups.IsNotFound(err) {
+				return err
+			}
+			cpuset := &fs.CpusetGroup{}
+			if err := cpuset.ApplyDir(dir, c, pid); err != nil {
+				return err
+			}
+			continue
+		}
+
+		if _, err := join(c, name, pid); err != nil {
+			// The devices cgroup is a hard requirement for container
+			// security, so even a `not found` error is returned for it.
+			// The other subsystems are optional: their `not found`
+			// errors are dropped.
+			if name == "devices" || !cgroups.IsNotFound(err) {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+// ExpandSlice converts a systemd slice name into its cgroup path. systemd
+// encodes the slice hierarchy with `-`, so every dash-separated prefix of the
+// name becomes one path level: test-a-b.slice expands to
+// /test.slice/test-a.slice/test-a-b.slice, and the root slice -.slice
+// expands to "/".
+//
+// An error is returned when the name does not end in ".slice", contains a
+// path separator, or has an empty component (".slice", "test--a.slice",
+// "-test.slice").
+func ExpandSlice(slice string) (string, error) {
+	const suffix = ".slice"
+
+	invalid := func() (string, error) {
+		return "", fmt.Errorf("invalid slice name: %s", slice)
+	}
+
+	// The name must carry the ".slice" suffix and must not contain a path
+	// separator. (A bare ".slice" is rejected below as an empty component.)
+	if !strings.HasSuffix(slice, suffix) || strings.IndexByte(slice, '/') >= 0 {
+		return invalid()
+	}
+
+	base := strings.TrimSuffix(slice, suffix)
+	// if input was -.slice, we should just return root now
+	if base == "-" {
+		return "/", nil
+	}
+
+	parts := strings.Split(base, "-")
+	var expanded strings.Builder
+	for i, part := range parts {
+		// test--a.slice isn't permitted, nor is -test.slice.
+		if part == "" {
+			return invalid()
+		}
+		// Each level is named after all components up to and including
+		// this one, joined by dashes.
+		expanded.WriteString("/" + strings.Join(parts[:i+1], "-") + suffix)
+	}
+	return expanded.String(), nil
+}
+
+// getSubsystemPath computes the cgroup directory of c's systemd unit within
+// the hierarchy of subsystem: the subsystem mountpoint, followed by the
+// cgroup of the init process, the expanded parent slice ("system.slice" when
+// c.Parent is empty) and the unit name.
+func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
+	mountpoint, err := cgroups.FindCgroupMountpoint(c.Path, subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	initPath, err := cgroups.GetInitCgroup(subsystem)
+	if err != nil {
+		return "", err
+	}
+	// if pid 1 is systemd 226 or later, it will be in init.scope, not the root
+	initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")
+
+	parent := c.Parent
+	if parent == "" {
+		parent = "system.slice"
+	}
+	slicePath, err := ExpandSlice(parent)
+	if err != nil {
+		return "", err
+	}
+
+	return filepath.Join(mountpoint, initPath, slicePath, getUnitName(c)), nil
+}
+
+// Freeze switches the freezer cgroup of this manager to state.
+//
+// m.Cgroups.Resources.Freezer is updated to state for the duration of the
+// call and keeps that value on success. On any failure the previous value is
+// left in place, so the configuration never claims a state that was not
+// applied.
+func (m *LegacyManager) Freeze(state configs.FreezerState) error {
+	path, err := getSubsystemPath(m.Cgroups, "freezer")
+	if err != nil {
+		return err
+	}
+	// Resolve the subsystem before touching the configuration: failing here
+	// after the assignment would leave Resources.Freezer changed without the
+	// state ever being written.
+	freezer, err := legacySubsystems.Get("freezer")
+	if err != nil {
+		return err
+	}
+
+	prevState := m.Cgroups.Resources.Freezer
+	m.Cgroups.Resources.Freezer = state
+	if err := freezer.Set(path, m.Cgroups); err != nil {
+		// Roll back so the configuration matches the actual cgroup state.
+		m.Cgroups.Resources.Freezer = prevState
+		return err
+	}
+	return nil
+}
+
+// GetPids returns the result of cgroups.GetPids for this manager's cgroup in
+// the "devices" hierarchy.
+func (m *LegacyManager) GetPids() ([]int, error) {
+	devicesPath, err := getSubsystemPath(m.Cgroups, "devices")
+	if err == nil {
+		return cgroups.GetPids(devicesPath)
+	}
+	return nil, err
+}
+
+// GetAllPids returns the result of cgroups.GetAllPids for this manager's
+// cgroup in the "devices" hierarchy.
+func (m *LegacyManager) GetAllPids() ([]int, error) {
+	devicesPath, err := getSubsystemPath(m.Cgroups, "devices")
+	if err == nil {
+		return cgroups.GetAllPids(devicesPath)
+	}
+	return nil, err
+}
+
+// GetStats collects statistics from every recorded path. Entries of m.Paths
+// whose name is not a known legacy subsystem, or whose directory no longer
+// exists, are skipped; the first error from a subsystem aborts the
+// collection. m.mu is held for the whole call.
+func (m *LegacyManager) GetStats() (*cgroups.Stats, error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	result := cgroups.NewStats()
+	for name, dir := range m.Paths {
+		sys, lookupErr := legacySubsystems.Get(name)
+		if lookupErr == errSubsystemDoesNotExist {
+			continue
+		}
+		if !cgroups.PathExists(dir) {
+			continue
+		}
+		if err := sys.GetStats(dir, result); err != nil {
+			return nil, err
+		}
+	}
+
+	return result, nil
+}
+
+// Set applies the cgroup settings of container to every legacy subsystem and
+// then, when a "cpu" path is recorded, verifies the CPU shares with
+// fs.CheckCpushares. Nothing is done when the manager only joined
+// pre-existing paths (m.Cgroups.Paths is set).
+func (m *LegacyManager) Set(container *configs.Config) error {
+	// If Paths are set, then we are just joining cgroups paths
+	// and there is no need to set any values.
+	if m.Cgroups.Paths != nil {
+		return nil
+	}
+
+	cg := container.Cgroups
+	for _, sys := range legacySubsystems {
+		// A hierarchy that is not found is not an error here; the
+		// subsystem is still asked to apply the settings.
+		dir, err := getSubsystemPath(cg, sys.Name())
+		if err != nil && !cgroups.IsNotFound(err) {
+			return err
+		}
+		if err := sys.Set(dir, cg); err != nil {
+			return err
+		}
+	}
+
+	if cpuPath := m.Paths["cpu"]; cpuPath != "" {
+		return fs.CheckCpushares(cpuPath, cg.Resources.CpuShares)
+	}
+	return nil
+}
+
+// getUnitName returns the systemd unit name for c. A name that already ends
+// in ".slice" is used as is; anything else becomes the scope
+// "<ScopePrefix>-<Name>.scope".
+func getUnitName(c *configs.Cgroup) string {
+	// the user explicitly asked for a slice
+	if strings.HasSuffix(c.Name, ".slice") {
+		return c.Name
+	}
+	// by default, we create a scope
+	return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
+}
+
+// setKernelMemory creates the memory cgroup directory of c and enables
+// kernel memory accounting on it, unless the cgroup's "tasks" file is
+// non-empty, in which case nothing is changed.
+func setKernelMemory(c *configs.Cgroup) error {
+	memPath, err := getSubsystemPath(c, "memory")
+	if err != nil && !cgroups.IsNotFound(err) {
+		return err
+	}
+
+	if err := os.MkdirAll(memPath, 0755); err != nil {
+		return err
+	}
+
+	// do not try to enable the kernel memory if we already have
+	// tasks in the cgroup.
+	tasks, err := ioutil.ReadFile(filepath.Join(memPath, "tasks"))
+	switch {
+	case err != nil:
+		return err
+	case len(tasks) > 0:
+		return nil
+	}
+	return fs.EnableKernelMemoryAccounting(memPath)
+}
+
+// isUnitExists reports whether err is a D-Bus error whose name contains
+// "org.freedesktop.systemd1.UnitExists", i.e. the systemd unit already
+// exists. A nil error or any other error type yields false.
+func isUnitExists(err error) bool {
+	if err == nil {
+		return false
+	}
+	dbusError, ok := err.(dbus.Error)
+	return ok && strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
+}
+
+// GetCgroups returns the cgroup configuration held by the manager (the
+// pointer itself, not a copy); the error is always nil.
+func (m *LegacyManager) GetCgroups() (*configs.Cgroup, error) {
+	return m.Cgroups, nil
+}
--- a/libcontainer/cgroups/systemd/unified_hierarchy.go
+++ b/libcontainer/cgroups/systemd/unified_hierarchy.go
@ -0,0 +1,312 @@
+// +build linux
+
+package systemd
+
+import (
+	"fmt"
+	"io/ioutil"
+	"math"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+	"time"
+
+	systemdDbus "github.com/coreos/go-systemd/dbus"
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+)
+
+// UnifiedManager is the systemd cgroup manager for hosts in cgroup v2
+// unified mode.
+//
+// Cgroups is the cgroup configuration; Paths maps controller names to cgroup
+// directories (Apply stores the same directory for every controller it
+// records). Destroy, GetPaths and GetUnifiedPath hold mu while they access
+// Paths.
+type UnifiedManager struct {
+	mu      sync.Mutex
+	Cgroups *configs.Cgroup
+	Paths   map[string]string
+}
+
+// Apply places pid into the cgroup described by m.Cgroups.
+//
+// When m.Cgroups.Paths is set, no systemd unit is created: the configured
+// paths that can be resolved are stored in m.Paths and pid is entered into
+// them with cgroups.EnterPid.
+//
+// Otherwise a transient systemd unit (a scope, or a slice when the name ends
+// in ".slice") is started with properties derived from m.Cgroups.Resources,
+// the cgroup v2 directory is prepared by joinCgroupsV2, and m.Paths is filled
+// with one and the same path for every controller name listed at the end of
+// the function. An "already exists" error from StartTransientUnit is
+// tolerated.
+//
+// NOTE: the package-level theConn is used without a nil check, and m.Paths is
+// assigned without holding m.mu.
+// NOTE(review): the unit properties are the same ones the legacy manager
+// sends (MemoryLimit, CPUShares, BlockIOWeight); whether systemd maps them
+// as intended in unified mode is not visible here - confirm.
+func (m *UnifiedManager) Apply(pid int) error {
+	var (
+		c          = m.Cgroups
+		unitName   = getUnitName(c)
+		slice      = "system.slice"
+		properties []systemdDbus.Property
+	)
+
+	// Pre-existing paths were supplied: just join them.
+	if c.Paths != nil {
+		paths := make(map[string]string)
+		for name, path := range c.Paths {
+			// Only the error is of interest here; the configured path is kept.
+			_, err := getSubsystemPath(m.Cgroups, name)
+			if err != nil {
+				// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
+				if cgroups.IsNotFound(err) {
+					continue
+				}
+				return err
+			}
+			paths[name] = path
+		}
+		m.Paths = paths
+		return cgroups.EnterPid(m.Paths, pid)
+	}
+
+	if c.Parent != "" {
+		slice = c.Parent
+	}
+
+	properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
+
+	// if we create a slice, the parent is defined via a Wants=
+	if strings.HasSuffix(unitName, ".slice") {
+		properties = append(properties, systemdDbus.PropWants(slice))
+	} else {
+		// otherwise, we use Slice=
+		properties = append(properties, systemdDbus.PropSlice(slice))
+	}
+
+	// only add pid if its valid, -1 is used w/ general slice creation.
+	if pid != -1 {
+		properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
+	}
+
+	// Check if we can delegate. This is only supported on systemd versions 218 and above.
+	if !strings.HasSuffix(unitName, ".slice") {
+		// Assume scopes always support delegation.
+		properties = append(properties, newProp("Delegate", true))
+	}
+
+	// Always enable accounting, this gets us the same behaviour as the fs implementation,
+	// plus the kernel has some problems with joining the memory cgroup at a later time.
+	properties = append(properties,
+		newProp("MemoryAccounting", true),
+		newProp("CPUAccounting", true),
+		newProp("BlockIOAccounting", true))
+
+	// Assume DefaultDependencies= will always work (the check for it was previously broken.)
+	properties = append(properties,
+		newProp("DefaultDependencies", false))
+
+	if c.Resources.Memory != 0 {
+		properties = append(properties,
+			newProp("MemoryLimit", uint64(c.Resources.Memory)))
+	}
+
+	if c.Resources.CpuShares != 0 {
+		properties = append(properties,
+			newProp("CPUShares", c.Resources.CpuShares))
+	}
+
+	// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
+	if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
+		// corresponds to USEC_INFINITY in systemd
+		// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
+		// always setting a property value ensures we can apply a quota and remove it later
+		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
+		if c.Resources.CpuQuota > 0 {
+			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
+			// (integer percentage of CPU) internally.  This means that if a fractional percent of
+			// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
+			// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
+			cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
+			if cpuQuotaPerSecUSec%10000 != 0 {
+				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
+			}
+		}
+		properties = append(properties,
+			newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
+	}
+
+	if c.Resources.BlkioWeight != 0 {
+		properties = append(properties,
+			newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
+	}
+
+	if c.Resources.PidsLimit > 0 {
+		properties = append(properties,
+			newProp("TasksAccounting", true),
+			newProp("TasksMax", uint64(c.Resources.PidsLimit)))
+	}
+
+	// We have to set kernel memory here, as we can't change it once
+	// processes have been attached to the cgroup.
+	if c.Resources.KernelMemory != 0 {
+		if err := setKernelMemory(c); err != nil {
+			return err
+		}
+	}
+
+	// Start the unit and wait at most one second for the completion signal;
+	// a timeout is only logged, and a unit that already exists is accepted.
+	statusChan := make(chan string, 1)
+	if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
+		select {
+		case <-statusChan:
+		case <-time.After(time.Second):
+			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
+		}
+	} else if !isUnitExists(err) {
+		return err
+	}
+
+	if err := joinCgroupsV2(c, pid); err != nil {
+		return err
+	}
+
+	// The path is resolved with an empty subsystem name and then recorded
+	// under every controller name below.
+	path, err := getSubsystemPath(m.Cgroups, "")
+	if err != nil {
+		return err
+	}
+	m.Paths = map[string]string{
+		"pids":    path,
+		"memory":  path,
+		"io":      path,
+		"cpu":     path,
+		"devices": path,
+		"cpuset":  path,
+		"freezer": path,
+	}
+	return nil
+}
+
+// Destroy stops the systemd unit of this cgroup and removes the recorded
+// cgroup paths, leaving m.Paths empty on success. It does nothing when the
+// configuration carries pre-existing paths (m.Cgroups.Paths).
+func (m *UnifiedManager) Destroy() error {
+	if m.Cgroups.Paths != nil {
+		return nil
+	}
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	// The outcome of StopUnit is deliberately not inspected.
+	unit := getUnitName(m.Cgroups)
+	theConn.StopUnit(unit, "replace", nil)
+
+	err := cgroups.RemovePaths(m.Paths)
+	if err == nil {
+		m.Paths = make(map[string]string)
+	}
+	return err
+}
+
+// GetPaths returns the controller-to-path map, read while holding m.mu. The
+// map itself is returned, not a copy.
+func (m *UnifiedManager) GetPaths() map[string]string {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	return m.Paths
+}
+func (m *UnifiedManager) GetUnifiedPath() (string, error) {
+	unifiedPath := ""
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	for k, v := range m.Paths {
+		if unifiedPath == "" {
+			unifiedPath = v
+		} else if v != unifiedPath {
+			return unifiedPath,
+				errors.Errorf("expected %q path to be unified path %q, got %q", k, unifiedPath, v)
+		}
+	}
+	if unifiedPath == "" {
+		// FIXME: unified path could be detected even when no controller is available
+		return unifiedPath, errors.New("cannot detect unified path")
+	}
+	return unifiedPath, nil
+}
+func createCgroupsv2Path(path string) (Err error) {
+	content, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
+	if err != nil {
+		return err
+	}
+	if !filepath.HasPrefix(path, "/sys/fs/cgroup") {
+		return fmt.Errorf("invalid cgroup path %s", path)
+	}
+
+	res := ""
+	for i, c := range strings.Split(strings.TrimSpace(string(content)), " ") {
+		if i == 0 {
+			res = fmt.Sprintf("+%s", c)
+		} else {
+			res = res + fmt.Sprintf(" +%s", c)
+		}
+	}
+	resByte := []byte(res)
+
+	current := "/sys/fs"
+	elements := strings.Split(path, "/")
+	for i, e := range elements[3:] {
+		current = filepath.Join(current, e)
+		if i > 0 {
+			if err := os.Mkdir(current, 0755); err != nil {
+				if !os.IsExist(err) {
+					return err
+				}
+			} else {
+				// If the directory was created, be sure it is not left around on errors.
+				defer func() {
+					if Err != nil {
+						os.Remove(current)
+					}
+				}()
+			}
+		}
+		if i < len(elements[3:])-1 {
+			if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), resByte, 0755); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// joinCgroupsV2 resolves the cgroup directory of c and creates it with
+// createCgroupsv2Path. The pid argument is not used.
+func joinCgroupsV2(c *configs.Cgroup, pid int) error {
+	cgPath, err := getSubsystemPath(c, "memory")
+	if err == nil {
+		err = createCgroupsv2Path(cgPath)
+	}
+	return err
+}
+
+// fsManager builds an fs2 manager for the unified path of this cgroup; the
+// Freeze, GetStats and Set methods delegate to it.
+func (m *UnifiedManager) fsManager() (cgroups.Manager, error) {
+	unified, err := m.GetUnifiedPath()
+	if err == nil {
+		return fs2.NewManager(m.Cgroups, unified, false)
+	}
+	return nil, err
+}
+
+// Freeze delegates the state change to the fs2 manager of this cgroup.
+func (m *UnifiedManager) Freeze(state configs.FreezerState) error {
+	delegate, err := m.fsManager()
+	if err == nil {
+		err = delegate.Freeze(state)
+	}
+	return err
+}
+
+// GetPids returns the result of cgroups.GetPids for the unified path of this
+// cgroup.
+func (m *UnifiedManager) GetPids() ([]int, error) {
+	unified, err := m.GetUnifiedPath()
+	if err == nil {
+		return cgroups.GetPids(unified)
+	}
+	return nil, err
+}
+
+// GetAllPids returns the result of cgroups.GetAllPids for the unified path of
+// this cgroup.
+func (m *UnifiedManager) GetAllPids() ([]int, error) {
+	unified, err := m.GetUnifiedPath()
+	if err == nil {
+		return cgroups.GetAllPids(unified)
+	}
+	return nil, err
+}
+
+// GetStats delegates statistics collection to the fs2 manager of this cgroup.
+func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) {
+	delegate, err := m.fsManager()
+	if err == nil {
+		return delegate.GetStats()
+	}
+	return nil, err
+}
+
+// Set delegates applying the settings of container to the fs2 manager of
+// this cgroup.
+func (m *UnifiedManager) Set(container *configs.Config) error {
+	delegate, err := m.fsManager()
+	if err == nil {
+		err = delegate.Set(container)
+	}
+	return err
+}
+
+// GetCgroups returns the cgroup configuration held by the manager (the
+// pointer itself, not a copy); the error is always nil.
+func (m *UnifiedManager) GetCgroups() (*configs.Cgroup, error) {
+	return m.Cgroups, nil
+}
--- a/libcontainer/cgroups/utils.go
+++ b/libcontainer/cgroups/utils.go
@ -0,0 +1,588 @@
+// +build linux
+
+package cgroups
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"syscall"
+	"time"
+
+	units "github.com/docker/go-units"
+	"golang.org/x/sys/unix"
+)
+
+const (
+	// CgroupNamePrefix is the prefix carried by named-hierarchy entries
+	// (for example "name=systemd"); it is stripped from mount options and
+	// prepended when looking up a controller path.
+	CgroupNamePrefix  = "name="
+	// CgroupProcesses is the name of the per-cgroup file that pids are read
+	// from and written to.
+	CgroupProcesses   = "cgroup.procs"
+	// unifiedMountpoint is the path that is statfs'ed to detect cgroup v2 and
+	// that is reported as the mountpoint in unified mode.
+	unifiedMountpoint = "/sys/fs/cgroup"
+)
+
+var (
+	// isUnifiedOnce guards the one-time detection done by IsCgroup2UnifiedMode.
+	isUnifiedOnce sync.Once
+	// isUnified holds the cached result of that detection.
+	isUnified     bool
+)
+
+// HugePageSizeUnitList is a list of the units used by the linux kernel when
+// naming the HugePage control files.
+// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
+// TODO: Since the kernel only uses KB, MB and GB, TB and PB should be removed;
+// this depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
+var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
+
+// IsCgroup2UnifiedMode reports whether the filesystem mounted at
+// unifiedMountpoint is cgroup v2. Detection runs at most once per process and
+// the outcome is cached in isUnified. It panics when the mountpoint cannot be
+// statfs'ed.
+func IsCgroup2UnifiedMode() bool {
+	detect := func() {
+		var fsInfo syscall.Statfs_t
+		err := syscall.Statfs(unifiedMountpoint, &fsInfo)
+		if err != nil {
+			panic("cannot statfs cgroup root")
+		}
+		isUnified = fsInfo.Type == unix.CGROUP2_SUPER_MAGIC
+	}
+	isUnifiedOnce.Do(detect)
+	return isUnified
+}
+
+// FindCgroupMountpoint returns the mountpoint for the given subsystem. In
+// cgroup v2 unified mode it always returns unifiedMountpoint; otherwise it
+// returns the mountpoint found by FindCgroupMountpointAndRoot, discarding the
+// root.
+//
+// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
+func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
+	if IsCgroup2UnifiedMode() {
+		return unifiedMountpoint, nil
+	}
+	mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem)
+	return mnt, err
+}
+
+// FindCgroupMountpointAndRoot looks up, in /proc/self/mountinfo, the mount
+// point and root of the cgroup mount for subsystem whose mount point starts
+// with cgroupPath. A not-found error is returned when the subsystem is not
+// available. In cgroup v2 unified mode the lookup is done with an empty
+// subsystem name.
+func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) {
+	// mount.GetMounts() is deliberately avoided: it is very inefficient, and
+	// parsing the file directly (without Sscanf) was about ten times faster.
+	// That was one of two major performance drawbacks in container start.
+	if available := isSubsystemAvailable(subsystem); !available {
+		return "", "", NewNotFoundError(subsystem)
+	}
+
+	mountinfo, err := os.Open("/proc/self/mountinfo")
+	if err != nil {
+		return "", "", err
+	}
+	defer mountinfo.Close()
+
+	wanted := subsystem
+	if IsCgroup2UnifiedMode() {
+		wanted = ""
+	}
+	return findCgroupMountpointAndRootFromReader(mountinfo, cgroupPath, wanted)
+}
+
+// findCgroupMountpointAndRootFromReader scans mountinfo-formatted text from
+// reader and returns the mount point (field 5) and root (field 4) of the first
+// entry whose mount point starts with cgroupPath and that either
+//   - has filesystem type "cgroup2" when subsystem is empty, or
+//   - lists subsystem among the comma-separated values of its last field.
+//
+// Lines with fewer than nine fields are skipped. The filesystem type is read
+// from the field that follows the "-" separator instead of a fixed index:
+// the number of optional fields before the separator varies, and a fixed
+// index can both miss cgroup2 mounts and run past the end of a short line.
+// A not-found error is returned when no entry matches.
+func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
+	scanner := bufio.NewScanner(reader)
+	for scanner.Scan() {
+		fields := strings.Fields(scanner.Text())
+		if len(fields) < 9 || !strings.HasPrefix(fields[4], cgroupPath) {
+			continue
+		}
+
+		// The separator can appear no earlier than index 6, after the six
+		// mandatory leading fields.
+		fsType := ""
+		for i := 6; i+1 < len(fields); i++ {
+			if fields[i] == "-" {
+				fsType = fields[i+1]
+				break
+			}
+		}
+		if subsystem == "" && fsType == "cgroup2" {
+			return fields[4], fields[3], nil
+		}
+
+		for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+			if opt == subsystem {
+				return fields[4], fields[3], nil
+			}
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return "", "", err
+	}
+
+	return "", "", NewNotFoundError(subsystem)
+}
+
+// isSubsystemAvailable reports whether subsystem can be used. Outside unified
+// mode that means it appears as a key in /proc/self/cgroup; in cgroup v2
+// unified mode it means it is listed by GetAllSubsystems. Any error while
+// gathering that information yields false.
+func isSubsystemAvailable(subsystem string) bool {
+	if !IsCgroup2UnifiedMode() {
+		known, err := ParseCgroupFile("/proc/self/cgroup")
+		if err != nil {
+			return false
+		}
+		_, found := known[subsystem]
+		return found
+	}
+
+	all, err := GetAllSubsystems()
+	if err != nil {
+		return false
+	}
+	for i := range all {
+		if all[i] == subsystem {
+			return true
+		}
+	}
+	return false
+}
+
+// GetClosestMountpointAncestor walks the newline-separated mountinfo text and
+// returns the mount point (fifth field) that is a string prefix of dir and
+// extends the best candidate seen so far. Lines with fewer than five fields
+// are ignored. It returns "" when nothing matches. Matching is a plain string
+// prefix comparison, not a path-component comparison.
+func GetClosestMountpointAncestor(dir, mountinfo string) string {
+	best := ""
+	for _, line := range strings.Split(mountinfo, "\n") {
+		cols := strings.Fields(line)
+		if len(cols) < 5 {
+			continue
+		}
+		candidate := cols[4]
+		if !strings.HasPrefix(dir, candidate) {
+			continue
+		}
+		if !strings.HasPrefix(candidate, best) {
+			continue
+		}
+		best = candidate
+	}
+	return best
+}
+
+// FindCgroupMountpointDir returns the parent directory of the first cgroup or
+// cgroup2 mount point listed in /proc/self/mountinfo.
+//
+// An error is returned when the file cannot be read, when a line has no
+// " - " separator or no fields after it, when a cgroup line has fewer than
+// three fields after the separator or fewer than five before it, or (as a
+// not-found error) when no cgroup mount is listed at all.
+func FindCgroupMountpointDir() (string, error) {
+	f, err := os.Open("/proc/self/mountinfo")
+	if err != nil {
+		return "", err
+	}
+	defer f.Close()
+
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		text := scanner.Text()
+		// Safe as mountinfo encodes mountpoints with spaces as \040.
+		index := strings.Index(text, " - ")
+		// Without the separator the filesystem type cannot be located, and
+		// slicing at index+3 would silently parse the wrong part of the line.
+		if index == -1 {
+			return "", fmt.Errorf("Found no fields post '-' in %q", text)
+		}
+		postSeparatorFields := strings.Fields(text[index+3:])
+		numPostFields := len(postSeparatorFields)
+
+		// This is an error as we can't detect if the mount is for "cgroup"
+		if numPostFields == 0 {
+			return "", fmt.Errorf("Found no fields post '-' in %q", text)
+		}
+
+		if postSeparatorFields[0] == "cgroup" || postSeparatorFields[0] == "cgroup2" {
+			// Check that the mount is properly formatted.
+			if numPostFields < 3 {
+				return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
+			}
+
+			// The mount point is the fifth field and always precedes the
+			// separator, so only that part of the line is split.
+			fields := strings.Split(text[:index], " ")
+			if len(fields) < 5 {
+				return "", fmt.Errorf("Error found less than 5 fields before '-' in %q", text)
+			}
+			return filepath.Dir(fields[4]), nil
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return "", err
+	}
+
+	return "", NewNotFoundError("cgroup")
+}
+
+// Mount describes one cgroup mount as reported by GetCgroupMounts.
+type Mount struct {
+	// Mountpoint is the fifth field of the mountinfo line; in unified mode it
+	// is set to unifiedMountpoint.
+	Mountpoint string
+	// Root is the fourth field of the mountinfo line; in unified mode it is
+	// set to unifiedMountpoint.
+	Root       string
+	// Subsystems lists the subsystems found for this mount, with any
+	// CgroupNamePrefix removed.
+	Subsystems []string
+}
+
+// GetOwnCgroup looks up the cgroup path for the mount's first subsystem in
+// the given subsystem-to-path map. It returns an error when the mount has no
+// subsystems.
+func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
+	if len(m.Subsystems) == 0 {
+		return "", fmt.Errorf("no subsystem for mount")
+	}
+
+	return getControllerPath(m.Subsystems[0], cgroups)
+}
+
+// getCgroupMountsHelper parses mountinfo-formatted text from mi and returns
+// the cgroup v1 mounts whose options name a subsystem present in ss.
+//
+// ss maps each known subsystem to whether it has already been seen; entries
+// are set to true as they are found, so the caller's map is modified. When
+// all is false a subsystem is reported only for the first mount that lists
+// it; when all is true every cgroup mount is returned. Scanning stops once as
+// many subsystems have been counted as ss has entries.
+//
+// An "invalid mountinfo format" error is returned for a line without the
+// " - " separator or for a cgroup line with fewer than five fields.
+func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
+	res := make([]Mount, 0, len(ss))
+	scanner := bufio.NewScanner(mi)
+	numFound := 0
+	for scanner.Scan() && numFound < len(ss) {
+		txt := scanner.Text()
+		sepIdx := strings.Index(txt, " - ")
+		if sepIdx == -1 {
+			return nil, fmt.Errorf("invalid mountinfo format")
+		}
+		// Prefix tests replace fixed-width slicing, which would panic on a
+		// line with fewer than seven bytes after the separator.
+		post := txt[sepIdx+3:]
+		if !strings.HasPrefix(post, "cgroup") || strings.HasPrefix(post, "cgroup2") {
+			continue
+		}
+		fields := strings.Split(txt, " ")
+		if len(fields) < 5 {
+			return nil, fmt.Errorf("invalid mountinfo format")
+		}
+		m := Mount{
+			Mountpoint: fields[4],
+			Root:       fields[3],
+		}
+		for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+			seen, known := ss[opt]
+			if !known || (!all && seen) {
+				continue
+			}
+			ss[opt] = true
+			opt = strings.TrimPrefix(opt, CgroupNamePrefix)
+			m.Subsystems = append(m.Subsystems, opt)
+			numFound++
+		}
+		if len(m.Subsystems) > 0 || all {
+			res = append(res, m)
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+	return res, nil
+}
+
+// GetCgroupMounts returns the mounts for the cgroup subsystems.
+// all selects between reporting only the first mount found for each
+// subsystem (false) and reporting every cgroup mount (true).
+//
+// In cgroup v2 unified mode a single Mount is returned whose Mountpoint and
+// Root are both unifiedMountpoint and whose Subsystems come from
+// GetAllSubsystems; all is not consulted in that case.
+func GetCgroupMounts(all bool) ([]Mount, error) {
+	if IsCgroup2UnifiedMode() {
+		controllers, err := GetAllSubsystems()
+		if err != nil {
+			return nil, err
+		}
+		unified := Mount{
+			Mountpoint: unifiedMountpoint,
+			Root:       unifiedMountpoint,
+			Subsystems: controllers,
+		}
+		return []Mount{unified}, nil
+	}
+
+	mountinfo, err := os.Open("/proc/self/mountinfo")
+	if err != nil {
+		return nil, err
+	}
+	defer mountinfo.Close()
+
+	known, err := ParseCgroupFile("/proc/self/cgroup")
+	if err != nil {
+		return nil, err
+	}
+
+	// Every known subsystem starts out as "not seen yet".
+	seen := make(map[string]bool)
+	for name := range known {
+		seen[name] = false
+	}
+	return getCgroupMountsHelper(seen, mountinfo, all)
+}
+
+// GetAllSubsystems returns all the cgroup subsystems supported by the kernel.
+//
+// In cgroup v2 unified mode the list is "devices" and "freezer" followed by
+// the whitespace-separated names in /sys/fs/cgroup/cgroup.controllers.
+// Otherwise /proc/cgroups is parsed and the first column is returned for
+// every entry that has at least four columns and whose fourth column is not
+// "0". Blank lines and lines starting with '#' are skipped.
+func GetAllSubsystems() ([]string, error) {
+	// /proc/cgroups is meaningless for v2
+	// https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features
+	if IsCgroup2UnifiedMode() {
+		// "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
+		// - devices: implemented in kernel 4.15
+		// - freezer: implemented in kernel 5.2
+		// We assume these are always available, as it is hard to detect availability.
+		pseudo := []string{"devices", "freezer"}
+		data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
+		if err != nil {
+			return nil, err
+		}
+		subsystems := append(pseudo, strings.Fields(string(data))...)
+		return subsystems, nil
+	}
+	f, err := os.Open("/proc/cgroups")
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	subsystems := []string{}
+
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		text := s.Text()
+		// Guard the empty line first: indexing text[0] on it would panic.
+		if text == "" || text[0] == '#' {
+			continue
+		}
+		parts := strings.Fields(text)
+		if len(parts) >= 4 && parts[3] != "0" {
+			subsystems = append(subsystems, parts[0])
+		}
+	}
+	if err := s.Err(); err != nil {
+		return nil, err
+	}
+	return subsystems, nil
+}
+
+// GetOwnCgroup returns the relative path to the cgroup docker is running in.
+// The path is looked up for subsystem in the parsed contents of
+// /proc/self/cgroup.
+func GetOwnCgroup(subsystem string) (string, error) {
+	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
+	if err != nil {
+		return "", err
+	}
+
+	return getControllerPath(subsystem, cgroups)
+}
+
+// GetOwnCgroupPath converts the result of GetOwnCgroup for subsystem into a
+// filesystem path by way of getCgroupPathHelper.
+func GetOwnCgroupPath(subsystem string) (string, error) {
+	cgroup, err := GetOwnCgroup(subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	return getCgroupPathHelper(subsystem, cgroup)
+}
+
+// GetInitCgroup returns the cgroup path recorded for subsystem in
+// /proc/1/cgroup, i.e. the cgroup of pid 1.
+func GetInitCgroup(subsystem string) (string, error) {
+	cgroups, err := ParseCgroupFile("/proc/1/cgroup")
+	if err != nil {
+		return "", err
+	}
+
+	return getControllerPath(subsystem, cgroups)
+}
+
+// GetInitCgroupPath converts the result of GetInitCgroup for subsystem into a
+// filesystem path by way of getCgroupPathHelper.
+func GetInitCgroupPath(subsystem string) (string, error) {
+	cgroup, err := GetInitCgroup(subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	return getCgroupPathHelper(subsystem, cgroup)
+}
+
+// getCgroupPathHelper turns a cgroup name into a filesystem path: the name is
+// made relative to the root of the subsystem's mount and then joined onto the
+// mount point.
+func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
+	mountpoint, mountRoot, err := FindCgroupMountpointAndRoot("", subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	// Nested containers need this step: /proc/self/cgroup shows paths as seen
+	// from the host, and those do not exist inside the container.
+	rel, err := filepath.Rel(mountRoot, cgroup)
+	if err != nil {
+		return "", err
+	}
+	return filepath.Join(mountpoint, rel), nil
+}
+
+// readProcsFile reads the CgroupProcesses file inside dir and returns the
+// pids it lists, one per line; empty lines are ignored. The returned slice is
+// non-nil (possibly empty) on success.
+//
+// An error is returned when the file cannot be opened, when a line is not a
+// decimal integer, or when reading the file fails part-way.
+func readProcsFile(dir string) ([]int, error) {
+	f, err := os.Open(filepath.Join(dir, CgroupProcesses))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	var (
+		s   = bufio.NewScanner(f)
+		out = []int{}
+	)
+
+	for s.Scan() {
+		if t := s.Text(); t != "" {
+			pid, err := strconv.Atoi(t)
+			if err != nil {
+				return nil, err
+			}
+			out = append(out, pid)
+		}
+	}
+	// Scan also stops on read errors; without this check a truncated pid
+	// list would be reported as a success.
+	if err := s.Err(); err != nil {
+		return nil, err
+	}
+	return out, nil
+}
+
+// ParseCgroupFile parses the given cgroup file, typically from
+// /proc/<pid>/cgroup, into a map of subgroups to cgroup names.
+// It returns an error when the file cannot be opened or when
+// parseCgroupFromReader rejects its contents.
+func ParseCgroupFile(path string) (map[string]string, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	return parseCgroupFromReader(f)
+}
+
+// parseCgroupFromReader does the parsing for ParseCgroupFile; it takes a
+// reader so that it can be tested without touching the filesystem.
+//
+// Per cgroups(7), each line of /proc/[pid]/cgroup has three colon-separated
+// fields:
+//     hierarchy-ID:subsystem-list:cgroup-path
+// Every name in the comma-separated subsystem list is mapped to the cgroup
+// path; the path may itself contain colons. A line with fewer than two colons
+// is an error.
+func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
+	result := map[string]string{}
+	sc := bufio.NewScanner(r)
+
+	for sc.Scan() {
+		line := sc.Text()
+		fields := strings.SplitN(line, ":", 3)
+		// SplitN never yields more than three pieces here, so anything other
+		// than three means a colon is missing.
+		if len(fields) != 3 {
+			return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", line)
+		}
+
+		cgroupPath := fields[2]
+		for _, name := range strings.Split(fields[1], ",") {
+			result[name] = cgroupPath
+		}
+	}
+	if err := sc.Err(); err != nil {
+		return nil, err
+	}
+
+	return result, nil
+}
+
+// getControllerPath returns the cgroup path stored for subsystem in cgroups.
+// The plain subsystem name is tried first, then the name with
+// CgroupNamePrefix prepended. In cgroup v2 unified mode the answer is always
+// "/". A not-found error is returned when neither key is present.
+func getControllerPath(subsystem string, cgroups map[string]string) (string, error) {
+	if IsCgroup2UnifiedMode() {
+		return "/", nil
+	}
+
+	for _, key := range []string{subsystem, CgroupNamePrefix + subsystem} {
+		if p, ok := cgroups[key]; ok {
+			return p, nil
+		}
+	}
+	return "", NewNotFoundError(subsystem)
+}
+
+// PathExists reports whether os.Stat succeeds for path. Any Stat failure,
+// not only "does not exist", yields false.
+func PathExists(path string) bool {
+	_, err := os.Stat(path)
+	return err == nil
+}
+
+// EnterPid writes pid into the cgroup.procs file of every path in
+// cgroupPaths that exists; paths that do not exist are skipped. It stops at
+// and returns the first write error.
+func EnterPid(cgroupPaths map[string]string, pid int) error {
+	for _, dir := range cgroupPaths {
+		if !PathExists(dir) {
+			continue
+		}
+		if err := WriteCgroupProc(dir, pid); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// RemovePaths removes every path in paths, making up to five attempts with a
+// delay between attempts that starts at 10ms and doubles each time. Entries
+// are deleted from the caller's map as soon as their path no longer exists.
+// If some paths still exist after the last attempt, an error naming them is
+// returned.
+func RemovePaths(paths map[string]string) (err error) {
+	const attempts = 5
+	wait := 10 * time.Millisecond
+	for attempt := 0; attempt < attempts; attempt++ {
+		if attempt > 0 {
+			time.Sleep(wait)
+			wait *= 2
+		}
+		for name, dir := range paths {
+			// The RemoveAll result is ignored on purpose: it almost always
+			// reports an error, even for cgroups that are already gone, so
+			// existence is checked with Stat instead.
+			// TODO: here probably should be logging
+			os.RemoveAll(dir)
+			if _, statErr := os.Stat(dir); os.IsNotExist(statErr) {
+				delete(paths, name)
+			}
+		}
+		if len(paths) == 0 {
+			return nil
+		}
+	}
+	return fmt.Errorf("Failed to remove paths: %v", paths)
+}
+
+// GetHugePageSize lists the entries of /sys/kernel/mm/hugepages and converts
+// their names into page-size strings with getHugePageSizeFromFilenames. When
+// the directory cannot be read it returns an empty slice and the error.
+func GetHugePageSize() ([]string, error) {
+	entries, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
+	if err != nil {
+		return []string{}, err
+	}
+
+	var names []string
+	for i := range entries {
+		names = append(names, entries[i].Name())
+	}
+	return getHugePageSizeFromFilenames(names)
+}
+
+// getHugePageSizeFromFilenames converts names of the form "<prefix>-<size>"
+// (for example "hugepages-2048kB") into size strings expressed with the units
+// in HugePageSizeUnitList. The text after the first "-" is parsed with
+// units.RAMInBytes and re-rendered in base 1024.
+//
+// An empty slice and an error are returned when a name contains no "-" or
+// when its size part cannot be parsed.
+func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
+	var pageSizes []string
+	for _, fileName := range fileNames {
+		nameArray := strings.Split(fileName, "-")
+		// A name without "-" has no size part; indexing nameArray[1] would
+		// panic, so report it as an error instead.
+		if len(nameArray) < 2 {
+			return []string{}, fmt.Errorf("invalid hugepage directory name %q: missing '-' separator", fileName)
+		}
+		pageSize, err := units.RAMInBytes(nameArray[1])
+		if err != nil {
+			return []string{}, err
+		}
+		sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList)
+		pageSizes = append(pageSizes, sizeString)
+	}
+
+	return pageSizes, nil
+}
+
+// GetPids returns all pids, that were added to cgroup at path.
+// The pids are read from the cgroup.procs file directly inside path;
+// sub-cgroups are not visited.
+func GetPids(path string) ([]int, error) {
+	return readProcsFile(path)
+}
+
+// GetAllPids returns all pids, that were added to cgroup at path and to all
+// its subcgroups. It walks the tree below path and reads every cgroup.procs
+// file it meets. The pids gathered so far are returned together with any
+// error that stopped the walk.
+func GetAllPids(path string) ([]int, error) {
+	var collected []int
+	visit := func(p string, info os.FileInfo, walkErr error) error {
+		parent, name := filepath.Split(p)
+		if name != CgroupProcesses {
+			// Only cgroup.procs files matter; errors reported for any other
+			// entry are not propagated.
+			return nil
+		}
+		if walkErr != nil {
+			return walkErr
+		}
+		found, err := readProcsFile(parent)
+		if err != nil {
+			return err
+		}
+		collected = append(collected, found...)
+		return nil
+	}
+	err := filepath.Walk(path, visit)
+	return collected, err
+}
+
+// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file.
+//
+// An empty dir is an error, and a pid of -1 is a no-op that returns nil. The
+// write is attempted up to five times: after an EINVAL failure it sleeps 30ms
+// and tries again, while any other failure is returned immediately with
+// context. If all five attempts fail with EINVAL, the last error is returned
+// as is.
+func WriteCgroupProc(dir string, pid int) error {
+	switch {
+	case dir == "":
+		// Normally dir is not empty. One case where it is: the cgroup
+		// subsystem is not mounted, and failing here is what we want.
+		return fmt.Errorf("no such directory for %s", CgroupProcesses)
+	case pid == -1:
+		// -1 means "do not attach any pid to the cgroup".
+		return nil
+	}
+
+	procsPath := filepath.Join(dir, CgroupProcesses)
+	procsFile, err := os.OpenFile(procsPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
+	if err != nil {
+		return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
+	}
+	defer procsFile.Close()
+
+	const maxAttempts = 5
+	pidText := strconv.Itoa(pid)
+	for attempt := 0; attempt < maxAttempts; attempt++ {
+		if _, err = procsFile.WriteString(pidText); err == nil {
+			return nil
+		}
+		if !isEINVAL(err) {
+			return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
+		}
+		// EINVAL might mean that the task being added to cgroup.procs is in
+		// state TASK_NEW, so wait a little and try again.
+		time.Sleep(30 * time.Millisecond)
+	}
+	return err
+}
+
+// isEINVAL reports whether err is an *os.PathError whose underlying error is
+// unix.EINVAL. Every other error, including nil, yields false.
+func isEINVAL(err error) bool {
+	if pathErr, ok := err.(*os.PathError); ok {
+		return pathErr.Err == unix.EINVAL
+	}
+	return false
+}
--- a/libcontainer/cgroups/utils_test.go
+++ b/libcontainer/cgroups/utils_test.go
@ -0,0 +1,459 @@
+// +build linux
+
+package cgroups
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"reflect"
+	"strings"
+	"testing"
+)
+
+// fedoraMountinfo is a mountinfo-format fixture (named after a Fedora host) in
+// which every cgroup hierarchy is mounted once, with root "/", directly under
+// /sys/fs/cgroup. It also carries unrelated entries (device-mapper docker
+// mounts and a CIFS mount whose source contains escaped spaces). It is the
+// input of TestGetCgroupMounts and BenchmarkGetCgroupMounts.
+const fedoraMountinfo = `15 35 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
+16 35 0:14 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel
+17 35 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8056484k,nr_inodes=2014121,mode=755
+18 16 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
+19 16 0:13 / /sys/fs/selinux rw,relatime shared:8 - selinuxfs selinuxfs rw
+20 17 0:16 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel
+21 17 0:10 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000
+22 35 0:17 / /run rw,nosuid,nodev shared:21 - tmpfs tmpfs rw,seclabel,mode=755
+23 16 0:18 / /sys/fs/cgroup rw,nosuid,nodev,noexec shared:9 - tmpfs tmpfs rw,seclabel,mode=755
+24 23 0:19 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
+25 16 0:20 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw
+26 23 0:21 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuset,clone_children
+27 23 0:22 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,cpuacct,cpu,clone_children
+28 23 0:23 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,memory,clone_children
+29 23 0:24 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,devices,clone_children
+30 23 0:25 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,freezer,clone_children
+31 23 0:26 / /sys/fs/cgroup/net_cls rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,net_cls,clone_children
+32 23 0:27 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,blkio,clone_children
+33 23 0:28 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,perf_event,clone_children
+34 23 0:29 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,hugetlb,clone_children
+35 1 253:2 / / rw,relatime shared:1 - ext4 /dev/mapper/ssd-root--f20 rw,seclabel,data=ordered
+36 15 0:30 / /proc/sys/fs/binfmt_misc rw,relatime shared:22 - autofs systemd-1 rw,fd=38,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
+37 17 0:12 / /dev/mqueue rw,relatime shared:23 - mqueue mqueue rw,seclabel
+38 35 0:31 / /tmp rw shared:24 - tmpfs tmpfs rw,seclabel
+39 17 0:32 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw,seclabel
+40 16 0:7 / /sys/kernel/debug rw,relatime shared:26 - debugfs debugfs rw
+41 16 0:33 / /sys/kernel/config rw,relatime shared:27 - configfs configfs rw
+42 35 0:34 / /var/lib/nfs/rpc_pipefs rw,relatime shared:28 - rpc_pipefs sunrpc rw
+43 15 0:35 / /proc/fs/nfsd rw,relatime shared:29 - nfsd sunrpc rw
+45 35 8:17 / /boot rw,relatime shared:30 - ext4 /dev/sdb1 rw,seclabel,data=ordered
+46 35 253:4 / /home rw,relatime shared:31 - ext4 /dev/mapper/ssd-home rw,seclabel,data=ordered
+47 35 253:5 / /var/lib/libvirt/images rw,noatime,nodiratime shared:32 - ext4 /dev/mapper/ssd-virt rw,seclabel,discard,data=ordered
+48 35 253:12 / /mnt/old rw,relatime shared:33 - ext4 /dev/mapper/HelpDeskRHEL6-FedoraRoot rw,seclabel,data=ordered
+121 22 0:36 / /run/user/1000/gvfs rw,nosuid,nodev,relatime shared:104 - fuse.gvfsd-fuse gvfsd-fuse rw,user_id=1000,group_id=1000
+124 16 0:37 / /sys/fs/fuse/connections rw,relatime shared:107 - fusectl fusectl rw
+165 38 253:3 / /tmp/mnt rw,relatime shared:147 - ext4 /dev/mapper/ssd-root rw,seclabel,data=ordered
+167 35 253:15 / /var/lib/docker/devicemapper/mnt/aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,relatime shared:149 - ext4 /dev/mapper/docker-253:2-425882-aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,seclabel,discard,stripe=16,data=ordered
+171 35 253:16 / /var/lib/docker/devicemapper/mnt/c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,relatime shared:153 - ext4 /dev/mapper/docker-253:2-425882-c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,seclabel,discard,stripe=16,data=ordered
+175 35 253:17 / /var/lib/docker/devicemapper/mnt/1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,relatime shared:157 - ext4 /dev/mapper/docker-253:2-425882-1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,seclabel,discard,stripe=16,data=ordered
+179 35 253:18 / /var/lib/docker/devicemapper/mnt/d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,relatime shared:161 - ext4 /dev/mapper/docker-253:2-425882-d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,seclabel,discard,stripe=16,data=ordered
+183 35 253:19 / /var/lib/docker/devicemapper/mnt/6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,relatime shared:165 - ext4 /dev/mapper/docker-253:2-425882-6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,seclabel,discard,stripe=16,data=ordered
+187 35 253:20 / /var/lib/docker/devicemapper/mnt/8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,relatime shared:169 - ext4 /dev/mapper/docker-253:2-425882-8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,seclabel,discard,stripe=16,data=ordered
+191 35 253:21 / /var/lib/docker/devicemapper/mnt/c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,relatime shared:173 - ext4 /dev/mapper/docker-253:2-425882-c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,seclabel,discard,stripe=16,data=ordered
+195 35 253:22 / /var/lib/docker/devicemapper/mnt/2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,relatime shared:177 - ext4 /dev/mapper/docker-253:2-425882-2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,seclabel,discard,stripe=16,data=ordered
+199 35 253:23 / /var/lib/docker/devicemapper/mnt/37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,relatime shared:181 - ext4 /dev/mapper/docker-253:2-425882-37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,seclabel,discard,stripe=16,data=ordered
+203 35 253:24 / /var/lib/docker/devicemapper/mnt/aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,relatime shared:185 - ext4 /dev/mapper/docker-253:2-425882-aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,seclabel,discard,stripe=16,data=ordered
+207 35 253:25 / /var/lib/docker/devicemapper/mnt/928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,relatime shared:189 - ext4 /dev/mapper/docker-253:2-425882-928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,seclabel,discard,stripe=16,data=ordered
+211 35 253:26 / /var/lib/docker/devicemapper/mnt/0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,relatime shared:193 - ext4 /dev/mapper/docker-253:2-425882-0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,seclabel,discard,stripe=16,data=ordered
+215 35 253:27 / /var/lib/docker/devicemapper/mnt/d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,relatime shared:197 - ext4 /dev/mapper/docker-253:2-425882-d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,seclabel,discard,stripe=16,data=ordered
+219 35 253:28 / /var/lib/docker/devicemapper/mnt/bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,relatime shared:201 - ext4 /dev/mapper/docker-253:2-425882-bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,seclabel,discard,stripe=16,data=ordered
+223 35 253:29 / /var/lib/docker/devicemapper/mnt/7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,relatime shared:205 - ext4 /dev/mapper/docker-253:2-425882-7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,seclabel,discard,stripe=16,data=ordered
+227 35 253:30 / /var/lib/docker/devicemapper/mnt/c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,relatime shared:209 - ext4 /dev/mapper/docker-253:2-425882-c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,seclabel,discard,stripe=16,data=ordered
+231 35 253:31 / /var/lib/docker/devicemapper/mnt/8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,relatime shared:213 - ext4 /dev/mapper/docker-253:2-425882-8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,seclabel,discard,stripe=16,data=ordered
+235 35 253:32 / /var/lib/docker/devicemapper/mnt/1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,relatime shared:217 - ext4 /dev/mapper/docker-253:2-425882-1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,seclabel,discard,stripe=16,data=ordered
+239 35 253:33 / /var/lib/docker/devicemapper/mnt/e9aa60c60128cad1 rw,relatime shared:221 - ext4 /dev/mapper/docker-253:2-425882-e9aa60c60128cad1 rw,seclabel,discard,stripe=16,data=ordered
+243 35 253:34 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,relatime shared:225 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,seclabel,discard,stripe=16,data=ordered
+247 35 253:35 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,relatime shared:229 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,seclabel,discard,stripe=16,data=ordered
+31 21 0:23 / /DATA/foo_bla_bla rw,relatime - cifs //foo/BLA\040BLA\040BLA/ rw,sec=ntlm,cache=loose,unc=\\foo\BLA BLA BLA,username=my_login,domain=mydomain.com,uid=12345678,forceuid,gid=12345678,forcegid,addr=10.1.30.10,file_mode=0755,dir_mode=0755,nounix,rsize=61440,wsize=65536,actimeo=1`
+
+// systemdMountinfo is a mountinfo-format fixture in which every cgroup
+// hierarchy under /sys/fs/cgroup is mounted with a non-"/" root, namely the
+// /system.slice/docker-<id>.scope path. TestGetCgroupMounts uses it to check
+// that this root is reported for each subsystem.
+const systemdMountinfo = `115 83 0:32 / / rw,relatime - aufs none rw,si=c0bd3d3,dio,dirperm1
+116 115 0:35 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
+117 115 0:36 / /dev rw,nosuid - tmpfs tmpfs rw,mode=755
+118 117 0:37 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=666
+119 115 0:38 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw
+120 119 0:39 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755
+121 120 0:19 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
+122 120 0:20 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,devices
+123 120 0:21 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
+124 120 0:22 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
+125 120 0:23 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,net_cls,net_prio
+126 120 0:24 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,blkio
+127 120 0:25 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpuset,clone_children
+128 120 0:26 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpu,cpuacct
+129 120 0:27 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,perf_event,release_agent=/run/cgmanager/agents/cgm-release-agent.perf_event
+130 115 43:0 /var/lib/docker/volumes/a44a712176377f57c094397330ee04387284c478364eb25f4c3d25f775f25c26/_data /var/lib/docker rw,relatime - ext4 /dev/nbd0 rw,data=ordered
+131 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/resolv.conf /etc/resolv.conf rw,relatime - ext4 /dev/nbd0 rw,data=ordered
+132 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hostname /etc/hostname rw,relatime - ext4 /dev/nbd0 rw,data=ordered
+133 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hosts /etc/hosts rw,relatime - ext4 /dev/nbd0 rw,data=ordered
+134 117 0:33 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=65536k
+135 117 0:13 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw
+136 117 0:12 /1 /dev/console rw,nosuid,noexec,relatime - devpts none rw,gid=5,mode=620,ptmxmode=000
+84 115 0:40 / /tmp rw,relatime - tmpfs none rw`
+
+// bedrockMountinfo is a mountinfo-format fixture in which every cgroup
+// hierarchy appears five times: once under /sys/fs/cgroup and once more under
+// each /bedrock/strata/<name>/sys/fs/cgroup tree (arch, fallback, gentoo, kde).
+// TestGetCgroupMounts feeds it to getCgroupMountsHelper to exercise parsing
+// when the same subsystem is mounted at several mountpoints.
+const bedrockMountinfo = `120 17 0:28 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
+124 28 0:28 / /bedrock/strata/arch/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
+123 53 0:28 / /bedrock/strata/fallback/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
+122 71 0:28 / /bedrock/strata/gentoo/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
+121 89 0:28 / /bedrock/strata/kde/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755
+125 120 0:29 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
+129 124 0:29 / /bedrock/strata/arch/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
+128 123 0:29 / /bedrock/strata/fallback/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
+127 122 0:29 / /bedrock/strata/gentoo/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
+126 121 0:29 / /bedrock/strata/kde/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd
+140 120 0:32 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
+144 124 0:32 / /bedrock/strata/arch/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
+143 123 0:32 / /bedrock/strata/fallback/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
+142 122 0:32 / /bedrock/strata/gentoo/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
+141 121 0:32 / /bedrock/strata/kde/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio
+145 120 0:33 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
+149 124 0:33 / /bedrock/strata/arch/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
+148 123 0:33 / /bedrock/strata/fallback/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
+147 122 0:33 / /bedrock/strata/gentoo/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
+146 121 0:33 / /bedrock/strata/kde/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio
+150 120 0:34 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
+154 124 0:34 / /bedrock/strata/arch/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
+153 123 0:34 / /bedrock/strata/fallback/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
+152 122 0:34 / /bedrock/strata/gentoo/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
+151 121 0:34 / /bedrock/strata/kde/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct
+155 120 0:35 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
+159 124 0:35 / /bedrock/strata/arch/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
+158 123 0:35 / /bedrock/strata/fallback/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
+157 122 0:35 / /bedrock/strata/gentoo/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
+156 121 0:35 / /bedrock/strata/kde/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset
+160 120 0:36 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
+164 124 0:36 / /bedrock/strata/arch/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
+163 123 0:36 / /bedrock/strata/fallback/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
+162 122 0:36 / /bedrock/strata/gentoo/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
+161 121 0:36 / /bedrock/strata/kde/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices
+165 120 0:37 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
+169 124 0:37 / /bedrock/strata/arch/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
+168 123 0:37 / /bedrock/strata/fallback/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
+167 122 0:37 / /bedrock/strata/gentoo/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
+166 121 0:37 / /bedrock/strata/kde/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory
+170 120 0:38 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
+174 124 0:38 / /bedrock/strata/arch/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
+173 123 0:38 / /bedrock/strata/fallback/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
+172 122 0:38 / /bedrock/strata/gentoo/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
+171 121 0:38 / /bedrock/strata/kde/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer
+175 120 0:39 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
+179 124 0:39 / /bedrock/strata/arch/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
+178 123 0:39 / /bedrock/strata/fallback/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
+177 122 0:39 / /bedrock/strata/gentoo/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
+176 121 0:39 / /bedrock/strata/kde/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids
+180 120 0:40 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
+184 124 0:40 / /bedrock/strata/arch/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
+183 123 0:40 / /bedrock/strata/fallback/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
+182 122 0:40 / /bedrock/strata/gentoo/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event
+181 121 0:40 / /bedrock/strata/kde/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event`
+
+// cgroup2Mountinfo is a mountinfo-format fixture in which the mount at
+// /sys/fs/cgroup/systemd has filesystem type cgroup2 while all other
+// hierarchies under /sys/fs/cgroup have type cgroup. TestIgnoreCgroup2Mount
+// uses it to check that the cgroup2 mount is not returned. Unlike the other
+// fixtures in this file, the text ends with a trailing newline.
+const cgroup2Mountinfo = `18 64 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel
+19 64 0:4 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
+20 64 0:6 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8171204k,nr_inodes=2042801,mode=755
+21 18 0:19 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
+22 20 0:20 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel
+23 20 0:21 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000
+24 64 0:22 / /run rw,nosuid,nodev shared:24 - tmpfs tmpfs rw,seclabel,mode=755
+25 18 0:23 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,seclabel,mode=755
+26 25 0:24 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup2 cgroup rw
+27 18 0:25 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw,seclabel
+28 18 0:26 / /sys/firmware/efi/efivars rw,nosuid,nodev,noexec,relatime shared:21 - efivarfs efivarfs rw
+29 25 0:27 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,cpu,cpuacct
+30 25 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,memory
+31 25 0:29 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,net_cls,net_prio
+32 25 0:30 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,blkio
+33 25 0:31 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,perf_event
+34 25 0:32 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,hugetlb
+35 25 0:33 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,freezer
+36 25 0:34 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset
+37 25 0:35 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices
+38 25 0:36 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids
+61 18 0:37 / /sys/kernel/config rw,relatime shared:22 - configfs configfs rw
+64 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/fedora_dhcp--16--129-root rw,seclabel,data=ordered
+39 18 0:17 / /sys/fs/selinux rw,relatime shared:23 - selinuxfs selinuxfs rw
+40 20 0:16 / /dev/mqueue rw,relatime shared:25 - mqueue mqueue rw,seclabel
+41 20 0:39 / /dev/hugepages rw,relatime shared:26 - hugetlbfs hugetlbfs rw,seclabel
+`
+
+// TestGetCgroupMounts feeds each mountinfo fixture to getCgroupMountsHelper and
+// checks, for every requested subsystem, that a mount was returned for it, that
+// the mount has the expected root, that its mountpoint looks plausible, and
+// that the subsystem is listed in the mount's Subsystems field.
+func TestGetCgroupMounts(t *testing.T) {
+	type fixture struct {
+		mountInfo  string
+		root       string
+		subsystems map[string]bool
+	}
+	// Each case owns its own subsystems map; the maps are deliberately not shared.
+	fixtures := []fixture{
+		{
+			mountInfo: fedoraMountinfo,
+			root:      "/",
+			subsystems: map[string]bool{
+				"cpuset":     false,
+				"cpu":        false,
+				"cpuacct":    false,
+				"memory":     false,
+				"devices":    false,
+				"freezer":    false,
+				"net_cls":    false,
+				"blkio":      false,
+				"perf_event": false,
+				"hugetlb":    false,
+			},
+		},
+		{
+			mountInfo: systemdMountinfo,
+			root:      "/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope",
+			subsystems: map[string]bool{
+				"cpuset":     false,
+				"cpu":        false,
+				"cpuacct":    false,
+				"memory":     false,
+				"devices":    false,
+				"freezer":    false,
+				"net_cls":    false,
+				"blkio":      false,
+				"perf_event": false,
+			},
+		},
+		{
+			mountInfo: bedrockMountinfo,
+			root:      "/",
+			subsystems: map[string]bool{
+				"cpuset":     false,
+				"cpu":        false,
+				"cpuacct":    false,
+				"memory":     false,
+				"devices":    false,
+				"freezer":    false,
+				"net_cls":    false,
+				"blkio":      false,
+				"perf_event": false,
+			},
+		},
+	}
+	for _, fx := range fixtures {
+		reader := bytes.NewBufferString(fx.mountInfo)
+		mounts, err := getCgroupMountsHelper(fx.subsystems, reader, false)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Index the returned mounts by each subsystem they carry.
+		bySubsystem := make(map[string]Mount)
+		for _, mnt := range mounts {
+			for _, name := range mnt.Subsystems {
+				bySubsystem[name] = mnt
+			}
+		}
+
+		for name := range fx.subsystems {
+			mnt, present := bySubsystem[name]
+			if !present {
+				t.Fatalf("%s not found", name)
+			}
+			if mnt.Root != fx.root {
+				t.Fatalf("unexpected root for %s: %s", name, mnt.Root)
+			}
+			// Accept the mountpoint if it is under /sys/fs/cgroup/ or mentions the subsystem.
+			if !(strings.HasPrefix(mnt.Mountpoint, "/sys/fs/cgroup/") || strings.Contains(mnt.Mountpoint, name)) {
+				t.Fatalf("unexpected mountpoint for %s: %s", name, mnt.Mountpoint)
+			}
+			listed := false
+			for _, candidate := range mnt.Subsystems {
+				if candidate == name {
+					listed = true
+					break
+				}
+			}
+			if !listed {
+				t.Fatalf("subsystem %s not found in Subsystems field %v", name, mnt.Subsystems)
+			}
+		}
+	}
+}
+
+// BenchmarkGetCgroupMounts times getCgroupMountsHelper on the fedoraMountinfo
+// fixture. A fresh reader is built for every iteration with the timer stopped,
+// so only the helper call itself is measured.
+func BenchmarkGetCgroupMounts(b *testing.B) {
+	wanted := map[string]bool{
+		"cpuset":     false,
+		"cpu":        false,
+		"cpuacct":    false,
+		"memory":     false,
+		"devices":    false,
+		"freezer":    false,
+		"net_cls":    false,
+		"blkio":      false,
+		"perf_event": false,
+		"hugetlb":    false,
+	}
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		b.StopTimer()
+		reader := bytes.NewBufferString(fedoraMountinfo)
+		b.StartTimer()
+		_, err := getCgroupMountsHelper(wanted, reader, false)
+		if err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// TestParseCgroupString runs parseCgroupFromReader over sample inputs and
+// compares either the returned subsystem-to-path map or the returned error
+// text against the expectation of each case.
+//
+// A case that expects an error fails explicitly when no error is returned,
+// instead of falling through to the output comparison.
+func TestParseCgroupString(t *testing.T) {
+	testCases := []struct {
+		input          string
+		expectedError  error
+		expectedOutput map[string]string
+	}{
+		{
+			// Taken from a CoreOS instance running systemd 225 with CPU/Mem
+			// accounting enabled in systemd
+			input: `9:blkio:/
+8:freezer:/
+7:perf_event:/
+6:devices:/system.slice/system-sshd.slice
+5:cpuset:/
+4:cpu,cpuacct:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service
+3:net_cls,net_prio:/
+2:memory:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service
+1:name=systemd:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service`,
+			expectedOutput: map[string]string{
+				"name=systemd": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
+				"blkio":        "/",
+				"freezer":      "/",
+				"perf_event":   "/",
+				"devices":      "/system.slice/system-sshd.slice",
+				"cpuset":       "/",
+				"cpu":          "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
+				"cpuacct":      "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
+				"net_cls":      "/",
+				"net_prio":     "/",
+				"memory":       "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service",
+			},
+		},
+		{
+			input:         `malformed input`,
+			expectedError: fmt.Errorf(`invalid cgroup entry: must contain at least two colons: malformed input`),
+		},
+	}
+
+	for ndx, testCase := range testCases {
+		out, err := parseCgroupFromReader(strings.NewReader(testCase.input))
+		if err != nil {
+			// Errors are compared by message text.
+			if testCase.expectedError == nil || testCase.expectedError.Error() != err.Error() {
+				t.Errorf("%v: expected error %v, got error %v", ndx, testCase.expectedError, err)
+			}
+			continue
+		}
+		if testCase.expectedError != nil {
+			// The call succeeded although this case demands a failure.
+			t.Errorf("%v: expected error %v, got nil", ndx, testCase.expectedError)
+			continue
+		}
+		if !reflect.DeepEqual(testCase.expectedOutput, out) {
+			t.Errorf("%v: expected output %v, got output %v", ndx, testCase.expectedOutput, out)
+		}
+	}
+}
+
+// TestIgnoreCgroup2Mount parses the cgroup2Mountinfo fixture and fails if any
+// returned mount has the mountpoint /sys/fs/cgroup/systemd, which in that
+// fixture is the one entry of filesystem type cgroup2.
+func TestIgnoreCgroup2Mount(t *testing.T) {
+	wanted := map[string]bool{
+		"cpuset":       false,
+		"cpu":          false,
+		"cpuacct":      false,
+		"memory":       false,
+		"devices":      false,
+		"freezer":      false,
+		"net_cls":      false,
+		"blkio":        false,
+		"perf_event":   false,
+		"pids":         false,
+		"name=systemd": false,
+	}
+
+	mounts, err := getCgroupMountsHelper(wanted, bytes.NewBufferString(cgroup2Mountinfo), false)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, mnt := range mounts {
+		if mnt.Mountpoint != "/sys/fs/cgroup/systemd" {
+			continue
+		}
+		t.Errorf("parsed a cgroup2 mount at /sys/fs/cgroup/systemd instead of ignoring it")
+	}
+}
+
+// TestGetClosestMountpointAncestor checks GetClosestMountpointAncestor against
+// a small fake mountinfo: paths below or equal to a mountpoint must resolve to
+// that mountpoint (not to a sibling sharing a name prefix, such as baz vs.
+// bazza), and a path with no closer match must resolve to "/".
+func TestGetClosestMountpointAncestor(t *testing.T) {
+	fakeMountInfo := ` 18 24 0:17 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw
+100 99 1:31 / /foo/bar rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo/bar/baz2 rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo/bar/baz rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo/bar/bazza rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo/bar/baz3 rw,relatime - fake fake rw,fake
+100 99 1:31 / /foo rw,relatime - fake fake rw,fake
+100 99 1:31 / /unrelated rw,relatime - fake fake rw,fake
+100 99 1:31 / / rw,relatime - fake fake rw,fake
+`
+	lookups := []struct {
+		input  string
+		output string
+	}{
+		{input: "/foo/bar/baz/a/b/c", output: "/foo/bar/baz"},
+		{input: "/foo/bar/baz", output: "/foo/bar/baz"},
+		{input: "/foo/bar/bazza", output: "/foo/bar/bazza"},
+		{input: "/a/b/c/d", output: "/"},
+	}
+
+	for _, lookup := range lookups {
+		if got := GetClosestMountpointAncestor(lookup.input, fakeMountInfo); got != lookup.output {
+			t.Errorf("expected %s, got %s", lookup.output, got)
+		}
+	}
+}
+
+// TestFindCgroupMountpointAndRoot checks which mountpoint
+// findCgroupMountpointAndRootFromReader returns for the "devices" subsystem
+// when two mounts carry it: with a cgroupPath of "/sys/fs" the mount under
+// /sys/fs/cgroup is expected, with an empty cgroupPath the first entry (/foo).
+//
+// An error from the helper is reported as such instead of being discarded.
+func TestFindCgroupMountpointAndRoot(t *testing.T) {
+	fakeMountInfo := `
+35 27 0:29 / /foo rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices
+35 27 0:29 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices
+`
+	testCases := []struct {
+		cgroupPath string
+		output     string
+	}{
+		{cgroupPath: "/sys/fs", output: "/sys/fs/cgroup/devices"},
+		{cgroupPath: "", output: "/foo"},
+	}
+
+	for _, c := range testCases {
+		// The second return value (the root) is not under test here.
+		mountpoint, _, err := findCgroupMountpointAndRootFromReader(strings.NewReader(fakeMountInfo), c.cgroupPath, "devices")
+		if err != nil {
+			t.Errorf("cgroupPath %q: unexpected error: %v", c.cgroupPath, err)
+			continue
+		}
+		if mountpoint != c.output {
+			t.Errorf("expected %s, got %s", c.output, mountpoint)
+		}
+	}
+}
+
+// TestGetHugePageSizeImpl checks getHugePageSizeFromFilenames against
+// hugepages-<size> directory names: valid names must yield the matching
+// human-readable sizes, an empty list must yield no error, and a malformed
+// size must yield the expected error text.
+//
+// The error comparison guards against a nil expected error (which would
+// otherwise panic on c.err.Error()) and against an expected error that never
+// materializes.
+func TestGetHugePageSizeImpl(t *testing.T) {
+	testCases := []struct {
+		inputFiles      []string
+		outputPageSizes []string
+		err             error
+	}{
+		{
+			inputFiles:      []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"},
+			outputPageSizes: []string{"1GB", "2MB", "32MB", "64KB"},
+			err:             nil,
+		},
+		{
+			inputFiles:      []string{},
+			outputPageSizes: []string{},
+			err:             nil,
+		},
+		{
+			inputFiles:      []string{"hugepages-a"},
+			outputPageSizes: []string{},
+			err:             errors.New("invalid size: 'a'"),
+		},
+	}
+
+	for _, c := range testCases {
+		pageSizes, err := getHugePageSizeFromFilenames(c.inputFiles)
+		// Sizes are only compared when both sides are non-empty, so a nil and an
+		// empty slice are treated alike.
+		if len(pageSizes) != 0 && len(c.outputPageSizes) != 0 && !reflect.DeepEqual(pageSizes, c.outputPageSizes) {
+			t.Errorf("expected %s, got %s", c.outputPageSizes, pageSizes)
+		}
+		switch {
+		case err != nil && c.err == nil:
+			t.Errorf("unexpected error for %v: %s", c.inputFiles, err)
+		case err == nil && c.err != nil:
+			t.Errorf("expected error %s, got nil", c.err)
+		case err != nil && err.Error() != c.err.Error():
+			t.Errorf("expected error %s, got %s", c.err, err)
+		}
+	}
+}
--- a/libcontainer/configs/blkio_device.go
+++ b/libcontainer/configs/blkio_device.go
@ -0,0 +1,66 @@
+package configs
+
+import "fmt"
+
+// blockIODevice identifies a block device by its major and minor numbers.
+// It is embedded by WeightDevice and ThrottleDevice, whose formatting methods
+// render it as "major:minor" for the blkio cgroup.
+type blockIODevice struct {
+	// Major is the device's major number
+	Major int64 `json:"major"`
+	// Minor is the device's minor number
+	Minor int64 `json:"minor"`
+}
+
+// WeightDevice pairs a block device (major:minor) with a weight and a leaf
+// weight. WeightString and LeafWeightString render the two
+// `major:minor weight` and `major:minor leaf_weight` forms.
+type WeightDevice struct {
+	blockIODevice
+	// Weight is the bandwidth rate for the device, range is from 10 to 1000
+	Weight uint16 `json:"weight"`
+	// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
+	LeafWeight uint16 `json:"leafWeight"`
+}
+
+// NewWeightDevice builds a WeightDevice for the device major:minor with the
+// given weight and leaf weight and returns a pointer to it.
+func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice {
+	return &WeightDevice{
+		blockIODevice: blockIODevice{
+			Major: major,
+			Minor: minor,
+		},
+		Weight:     weight,
+		LeafWeight: leafWeight,
+	}
+}
+
+// WeightString renders the device as "major:minor weight", the form that is
+// writable to the cgroup specific file.
+func (wd *WeightDevice) WeightString() string {
+	dev := wd.blockIODevice
+	return fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, wd.Weight)
+}
+
+// LeafWeightString renders the device as "major:minor leaf_weight", the form
+// that is writable to the cgroup specific file.
+func (wd *WeightDevice) LeafWeightString() string {
+	dev := wd.blockIODevice
+	return fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, wd.LeafWeight)
+}
+
+// ThrottleDevice pairs a block device (major:minor) with a rate limit.
+// String renders it as `major:minor rate`; StringName renders it as
+// `major:minor name=rate`.
+type ThrottleDevice struct {
+	blockIODevice
+	// Rate is the IO rate limit per cgroup per device
+	Rate uint64 `json:"rate"`
+}
+
+// NewThrottleDevice builds a ThrottleDevice for the device major:minor with
+// the given rate and returns a pointer to it.
+func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice {
+	return &ThrottleDevice{
+		blockIODevice: blockIODevice{
+			Major: major,
+			Minor: minor,
+		},
+		Rate: rate,
+	}
+}
+
+// String renders the device as "major:minor rate", the form that is writable
+// to the cgroup specific file.
+func (td *ThrottleDevice) String() string {
+	dev := td.blockIODevice
+	return fmt.Sprintf("%d:%d %d", dev.Major, dev.Minor, td.Rate)
+}
+
+// StringName renders the device as "major:minor name=rate", where name is the
+// key supplied by the caller, in a form writable to the cgroup specific file.
+func (td *ThrottleDevice) StringName(name string) string {
+	dev := td.blockIODevice
+	return fmt.Sprintf("%d:%d %s=%d", dev.Major, dev.Minor, name, td.Rate)
+}
--- a/libcontainer/configs/cgroup_linux.go
+++ b/libcontainer/configs/cgroup_linux.go
@ -0,0 +1,130 @@
+package configs
+
+// FreezerState is the freezer value requested for a cgroup (see the Freezer
+// field of Resources).
+type FreezerState string
+
+const (
+	// Undefined is the zero value: no freezer state is requested.
+	Undefined FreezerState = ""
+	// Frozen is the "FROZEN" freezer state.
+	Frozen    FreezerState = "FROZEN"
+	// Thawed is the "THAWED" freezer state.
+	Thawed    FreezerState = "THAWED"
+)
+
+// Cgroup describes which cgroup(s) a container is placed into and, through
+// the embedded Resources, the settings to apply there.
+type Cgroup struct {
+	// Deprecated: use Path instead.
+	Name string `json:"name,omitempty"`
+
+	// name of parent of cgroup or slice
+	//
+	// Deprecated: use Path instead.
+	Parent string `json:"parent,omitempty"`
+
+	// Path specifies the path to cgroups that are created and/or joined by the container.
+	// The path is assumed to be relative to the host system cgroup mountpoint.
+	Path string `json:"path"`
+
+	// ScopePrefix describes prefix for the scope name
+	ScopePrefix string `json:"scope_prefix"`
+
+	// Paths represent the absolute cgroups paths to join.
+	// This takes precedence over Path.
+	Paths map[string]string
+
+	// Resources contains various cgroups settings to apply
+	*Resources
+}
+
+// Resources holds the per-subsystem cgroup settings (devices, memory, cpu,
+// cpuset, pids, blkio, freezer, hugetlb, net_prio/net_cls and the cgroups v2
+// cpu knobs) that are embedded in Cgroup.
+type Resources struct {
+	// If this is true allow access to any kind of device within the container.  If false, allow access only to devices explicitly listed in the allowed_devices list.
+	// Deprecated
+	AllowAllDevices *bool `json:"allow_all_devices,omitempty"`
+	// Deprecated
+	AllowedDevices []*Device `json:"allowed_devices,omitempty"`
+	// Deprecated
+	DeniedDevices []*Device `json:"denied_devices,omitempty"`
+
+	// Devices is the list of device rules.
+	Devices []*Device `json:"devices"`
+
+	// Memory limit (in bytes)
+	Memory int64 `json:"memory"`
+
+	// Memory reservation or soft_limit (in bytes)
+	MemoryReservation int64 `json:"memory_reservation"`
+
+	// Total memory usage (memory + swap); set `-1` to enable unlimited swap
+	MemorySwap int64 `json:"memory_swap"`
+
+	// Kernel memory limit (in bytes)
+	KernelMemory int64 `json:"kernel_memory"`
+
+	// Kernel memory limit for TCP use (in bytes)
+	KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
+
+	// CPU shares (relative weight vs. other containers)
+	CpuShares uint64 `json:"cpu_shares"`
+
+	// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
+	CpuQuota int64 `json:"cpu_quota"`
+
+	// CPU period to be used for hardcapping (in usecs). 0 to use system default.
+	CpuPeriod uint64 `json:"cpu_period"`
+
+	// How much CPU time can be used in realtime scheduling (in usecs).
+	// Note that the JSON key is "cpu_rt_quota", not "cpu_rt_runtime".
+	CpuRtRuntime int64 `json:"cpu_rt_quota"`
+
+	// CPU period to be used for realtime scheduling (in usecs).
+	CpuRtPeriod uint64 `json:"cpu_rt_period"`
+
+	// CPU to use
+	CpusetCpus string `json:"cpuset_cpus"`
+
+	// MEM to use
+	CpusetMems string `json:"cpuset_mems"`
+
+	// Process limit; set <= `0' to disable limit.
+	PidsLimit int64 `json:"pids_limit"`
+
+	// Specifies per cgroup weight, range is from 10 to 1000.
+	BlkioWeight uint16 `json:"blkio_weight"`
+
+	// Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
+	BlkioLeafWeight uint16 `json:"blkio_leaf_weight"`
+
+	// Weight per cgroup per device, can override BlkioWeight.
+	BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"`
+
+	// IO read rate limit per cgroup per device, bytes per second.
+	BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
+
+	// IO write rate limit per cgroup per device, bytes per second.
+	BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
+
+	// IO read rate limit per cgroup per device, IO per second.
+	BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"`
+
+	// IO write rate limit per cgroup per device, IO per second.
+	BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"`
+
+	// Freezer sets the freeze value for the process.
+	Freezer FreezerState `json:"freezer"`
+
+	// Hugetlb limit (in bytes)
+	HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"`
+
+	// Whether to disable OOM Killer
+	OomKillDisable bool `json:"oom_kill_disable"`
+
+	// Tuning swappiness behaviour per cgroup
+	MemorySwappiness *uint64 `json:"memory_swappiness"`
+
+	// Set priority of network traffic for container
+	NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
+
+	// Set class identifier for container's network packets
+	NetClsClassid uint32 `json:"net_cls_classid_u"`
+
+	// Used on cgroups v2:
+
+	// CpuWeight sets a proportional bandwidth limit.
+	CpuWeight uint64 `json:"cpu_weight"`
+
+	// CpuMax sets the maximum bandwidth limit (format: max period).
+	CpuMax string `json:"cpu_max"`
+}
--- a/libcontainer/configs/cgroup_unsupported.go
+++ b/libcontainer/configs/cgroup_unsupported.go
@ -0,0 +1,8 @@
+// +build !linux
+
+package configs
+
+// Cgroup is an empty placeholder type used on non-Linux builds (this file is
+// compiled under the `!linux` build constraint).
+//
+// TODO Windows: This can ultimately be entirely factored out on Windows as
+// cgroups are a Unix-specific construct.
+type Cgroup struct {
+}
--- a/libcontainer/configs/config.go
+++ b/libcontainer/configs/config.go
@ -0,0 +1,354 @@
+package configs
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+	"time"
+
+	"github.com/opencontainers/runtime-spec/specs-go"
+
+	"github.com/sirupsen/logrus"
+)
+
+// Rlimit is one resource limit entry of Config.Rlimits: a numeric resource
+// type together with its hard and soft values.
+type Rlimit struct {
+	Type int    `json:"type"`
+	Hard uint64 `json:"hard"`
+	Soft uint64 `json:"soft"`
+}
+
+// IDMap represents UID/GID Mappings for User Namespaces.
+// One entry maps the Size consecutive container IDs starting at ContainerID
+// onto the host IDs starting at HostID (see Config.hostIDFromMapping).
+type IDMap struct {
+	ContainerID int `json:"container_id"`
+	HostID      int `json:"host_id"`
+	Size        int `json:"size"`
+}
+
+// Seccomp represents syscall restrictions: a default action, the list of
+// additional architectures, and per-syscall rules.
+// By default, only the native architecture of the kernel is allowed to be used
+// for syscalls. Additional architectures can be added by specifying them in
+// Architectures.
+type Seccomp struct {
+	DefaultAction Action     `json:"default_action"`
+	Architectures []string   `json:"architectures"`
+	Syscalls      []*Syscall `json:"syscalls"`
+}
+
+// Action is taken upon rule match in Seccomp
+type Action int
+
+// Seccomp actions. Numbering starts at 1 (iota + 1), so the zero value of
+// Action does not correspond to any of these constants.
+const (
+	Kill Action = iota + 1
+	Errno
+	Trap
+	Allow
+	Trace
+	Log
+)
+
+// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
+type Operator int
+
+// Seccomp comparison operators. Numbering starts at 1 (iota + 1), so the zero
+// value of Operator does not correspond to any of these constants.
+const (
+	EqualTo Operator = iota + 1
+	NotEqualTo
+	GreaterThan
+	GreaterThanOrEqualTo
+	LessThan
+	LessThanOrEqualTo
+	MaskEqualTo
+)
+
+// Arg is a rule to match a specific syscall argument in Seccomp.
+// Index selects the argument, Op is the comparison to apply, and Value and
+// ValueTwo are its operands.
+// NOTE(review): ValueTwo is presumably only meaningful for MaskEqualTo — confirm
+// against the seccomp backend.
+type Arg struct {
+	Index    uint     `json:"index"`
+	Value    uint64   `json:"value"`
+	ValueTwo uint64   `json:"value_two"`
+	Op       Operator `json:"op"`
+}
+
+// Syscall is a rule to match a syscall in Seccomp: the syscall name, the
+// Action to take on a match, and optional argument conditions.
+type Syscall struct {
+	Name   string `json:"name"`
+	Action Action `json:"action"`
+	Args   []*Arg `json:"args"`
+}
+
+// TODO Windows. Many of these fields should be factored out into those parts
+// which are common across platforms, and those which are platform specific.
+
+// Config defines configuration options for executing a process inside a contained environment.
+type Config struct {
+	// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
+	// This is a common option when the container is running in ramdisk
+	NoPivotRoot bool `json:"no_pivot_root"`
+
+	// ParentDeathSignal specifies the signal that is sent to the container's process in the case
+	// that the parent process dies.
+	ParentDeathSignal int `json:"parent_death_signal"`
+
+	// Path to a directory containing the container's root filesystem.
+	Rootfs string `json:"rootfs"`
+
+	// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
+	// bind mounts are writable.
+	Readonlyfs bool `json:"readonlyfs"`
+
+	// Specifies the mount propagation flags to be applied to /.
+	RootPropagation int `json:"rootPropagation"`
+
+	// Mounts specify additional source and destination paths that will be mounted inside the container's
+	// rootfs and mount namespace if specified
+	Mounts []*Mount `json:"mounts"`
+
+	// The device nodes that should be automatically created within the container upon container start.  Note, make sure that the node is marked as allowed in the cgroup as well!
+	Devices []*Device `json:"devices"`
+
+	// MountLabel is the label string stored under the "mount_label" key.
+	// NOTE(review): presumably the SELinux label applied to mounts — confirm.
+	MountLabel string `json:"mount_label"`
+
+	// Hostname optionally sets the container's hostname if provided
+	Hostname string `json:"hostname"`
+
+	// Namespaces specifies the container's namespaces that it should setup when cloning the init process
+	// If a namespace is not provided that namespace is shared from the container's parent process
+	Namespaces Namespaces `json:"namespaces"`
+
+	// Capabilities specify the capabilities to keep when executing the process inside the container
+	// All capabilities not specified will be dropped from the process's capability mask
+	Capabilities *Capabilities `json:"capabilities"`
+
+	// Networks specifies the container's network setup to be created
+	Networks []*Network `json:"networks"`
+
+	// Routes can be specified to create entries in the route table as the container is started
+	Routes []*Route `json:"routes"`
+
+	// Cgroups specifies specific cgroup settings for the various subsystems that the container is
+	// placed into to limit the resources the container has available
+	Cgroups *Cgroup `json:"cgroups"`
+
+	// AppArmorProfile specifies the profile to apply to the process running in the container and is
+	// changed at the time the process is exec'd
+	AppArmorProfile string `json:"apparmor_profile,omitempty"`
+
+	// ProcessLabel specifies the label to apply to the process running in the container.  It is
+	// commonly used by selinux
+	ProcessLabel string `json:"process_label,omitempty"`
+
+	// Rlimits specifies the resource limits, such as max open files, to set in the container
+	// If Rlimits are not set, the container will inherit rlimits from the parent process
+	Rlimits []Rlimit `json:"rlimits,omitempty"`
+
+	// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
+	// for a process. Valid values are in the range [-1000, 1000], where processes with
+	// higher scores are preferred for being killed. If it is unset then we don't touch the current
+	// value.
+	// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
+	OomScoreAdj *int `json:"oom_score_adj,omitempty"`
+
+	// UidMappings is an array of User ID mappings for User Namespaces
+	UidMappings []IDMap `json:"uid_mappings"`
+
+	// GidMappings is an array of Group ID mappings for User Namespaces
+	GidMappings []IDMap `json:"gid_mappings"`
+
+	// MaskPaths specifies paths within the container's rootfs to mask over with a bind
+	// mount pointing to /dev/null as to prevent reads of the file.
+	MaskPaths []string `json:"mask_paths"`
+
+	// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
+	// so that these files prevent any writes.
+	ReadonlyPaths []string `json:"readonly_paths"`
+
+	// Sysctl is a map of properties and their values. It is the equivalent of using
+	// sysctl -w my.property.name value in Linux.
+	Sysctl map[string]string `json:"sysctl"`
+
+	// Seccomp allows actions to be taken whenever a syscall is made within the container.
+	// A number of rules are given, each having an action to be taken if a syscall matches it.
+	// A default action to be taken if no rules match is also given.
+	Seccomp *Seccomp `json:"seccomp"`
+
+	// NoNewPrivileges controls whether processes in the container can gain additional privileges.
+	NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
+
+	// Hooks are a collection of actions to perform at various container lifecycle events.
+	// CommandHooks are serialized to JSON, but other hooks are not.
+	Hooks *Hooks
+
+	// Version is the version of opencontainer specification that is supported.
+	Version string `json:"version"`
+
+	// Labels are user defined metadata that is stored in the config and populated on the state
+	Labels []string `json:"labels"`
+
+	// NoNewKeyring will not allocate a new session keyring for the container.  It will use the
+	// caller's keyring in this case.
+	NoNewKeyring bool `json:"no_new_keyring"`
+
+	// IntelRdt specifies settings for Intel RDT group that the container is placed into
+	// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
+	IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
+
+	// RootlessEUID is set when the runc was launched with non-zero EUID.
+	// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
+	// When RootlessEUID is set, runc creates a new userns for the container.
+	// (config.json needs to contain userns settings)
+	RootlessEUID bool `json:"rootless_euid,omitempty"`
+
+	// RootlessCgroups is set when unlikely to have the full access to cgroups.
+	// When RootlessCgroups is set, cgroups errors are ignored.
+	RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
+}
+
+// Hooks groups the Hook lists run at the three container lifecycle points.
+// Its custom MarshalJSON/UnmarshalJSON only round-trip CommandHook entries.
+type Hooks struct {
+	// Prestart commands are executed after the container namespaces are created,
+	// but before the user supplied command is executed from init.
+	Prestart []Hook
+
+	// Poststart commands are executed after the container init process starts.
+	Poststart []Hook
+
+	// Poststop commands are executed after the container init process exits.
+	Poststop []Hook
+}
+
+// Capabilities lists, per capability set, the capability names to keep for
+// the container process (see Config.Capabilities).
+type Capabilities struct {
+	// Bounding is the set of capabilities checked by the kernel.
+	// NOTE(review): this text is identical to the Effective description below
+	// and looks copy-pasted; confirm the intended wording for the bounding set.
+	Bounding []string
+	// Effective is the set of capabilities checked by the kernel.
+	Effective []string
+	// Inheritable is the capabilities preserved across execve.
+	Inheritable []string
+	// Permitted is the limiting superset for effective capabilities.
+	Permitted []string
+	// Ambient is the ambient set of capabilities that are kept.
+	Ambient []string
+}
+
+// UnmarshalJSON decodes b into hooks. Each entry of the prestart, poststart
+// and poststop lists is decoded as a CommandHook and stored as a Hook; a
+// missing or empty list leaves the corresponding field nil.
+func (hooks *Hooks) UnmarshalJSON(b []byte) error {
+	var decoded struct {
+		Prestart  []CommandHook
+		Poststart []CommandHook
+		Poststop  []CommandHook
+	}
+	err := json.Unmarshal(b, &decoded)
+	if err != nil {
+		return err
+	}
+
+	// toHooks widens a CommandHook slice into a Hook slice; an empty input
+	// yields a nil slice.
+	toHooks := func(cmds []CommandHook) []Hook {
+		var out []Hook
+		for i := range cmds {
+			out = append(out, cmds[i])
+		}
+		return out
+	}
+
+	hooks.Prestart = toHooks(decoded.Prestart)
+	hooks.Poststart = toHooks(decoded.Poststart)
+	hooks.Poststop = toHooks(decoded.Poststop)
+	return nil
+}
+
+// MarshalJSON encodes hooks as an object with "prestart", "poststart" and
+// "poststop" keys. Only CommandHook entries are written; any other Hook
+// implementation is skipped with a warning. A list with no serializable
+// entries is encoded as null.
+func (hooks Hooks) MarshalJSON() ([]byte, error) {
+	commandsOf := func(list []Hook) []CommandHook {
+		var cmds []CommandHook
+		for _, h := range list {
+			cmd, ok := h.(CommandHook)
+			if !ok {
+				logrus.Warnf("cannot serialize hook of type %T, skipping", h)
+				continue
+			}
+			cmds = append(cmds, cmd)
+		}
+		return cmds
+	}
+
+	payload := map[string]interface{}{
+		"prestart":  commandsOf(hooks.Prestart),
+		"poststart": commandsOf(hooks.Poststart),
+		"poststop":  commandsOf(hooks.Poststop),
+	}
+	return json.Marshal(payload)
+}
+
+// Hook is a single action run at a container lifecycle event. FuncHook and
+// CommandHook are the implementations defined in this file.
+type Hook interface {
+	// Run executes the hook with the provided state.
+	Run(*specs.State) error
+}
+
+// NewFunctionHook will call the provided function when the hook is run.
+func NewFunctionHook(f func(*specs.State) error) FuncHook {
+	return FuncHook{
+		run: f,
+	}
+}
+
+// FuncHook is a Hook backed by an in-process Go function. It is created with
+// NewFunctionHook and is skipped by Hooks.MarshalJSON.
+type FuncHook struct {
+	// run is the callback invoked by Run.
+	run func(*specs.State) error
+}
+
+// Run invokes the wrapped function with s and returns its error.
+func (f FuncHook) Run(s *specs.State) error {
+	run := f.run
+	return run(s)
+}
+
+// Command describes an external program to execute as a hook.
+type Command struct {
+	// Path is the path of the program to execute.
+	Path string `json:"path"`
+	// Args is the argument vector passed to the program as-is.
+	Args []string `json:"args"`
+	// Env is the environment passed to the program.
+	Env []string `json:"env"`
+	// Dir is serialized with the command.
+	// NOTE(review): Run does not set the working directory from Dir — confirm
+	// whether that is intentional.
+	Dir string `json:"dir"`
+	// Timeout, when non-nil, bounds how long Run waits before killing the
+	// program.
+	Timeout *time.Duration `json:"timeout"`
+}
+
+// NewCommandHook will execute the provided command when the hook is run.
+func NewCommandHook(cmd Command) CommandHook {
+	return CommandHook{
+		Command: cmd,
+	}
+}
+
+// CommandHook is a Hook that executes an external Command; it gets its Run
+// method from the embedded Command. It is the only hook type that
+// Hooks.MarshalJSON serializes.
+type CommandHook struct {
+	Command
+}
+
+// Run executes the command, passing the JSON encoding of s on stdin and
+// capturing stdout and stderr.
+//
+// It returns nil when the program exits successfully. On failure the error
+// includes the captured stdout and stderr. If c.Timeout is non-nil and
+// expires first, the process is killed and a timeout error is returned.
+// Marshalling and start errors are returned unchanged.
+func (c Command) Run(s *specs.State) error {
+	b, err := json.Marshal(s)
+	if err != nil {
+		return err
+	}
+	var stdout, stderr bytes.Buffer
+	cmd := exec.Cmd{
+		Path:   c.Path,
+		Args:   c.Args,
+		Env:    c.Env,
+		Stdin:  bytes.NewReader(b),
+		Stdout: &stdout,
+		Stderr: &stderr,
+	}
+	if err := cmd.Start(); err != nil {
+		return err
+	}
+	// Buffered so the waiter goroutine can always deliver its result and exit.
+	errC := make(chan error, 1)
+	go func() {
+		err := cmd.Wait()
+		if err != nil {
+			err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
+		}
+		errC <- err
+	}()
+	// A nil timerCh never fires, so without a timeout only errC is selectable.
+	var timerCh <-chan time.Time
+	if c.Timeout != nil {
+		timer := time.NewTimer(*c.Timeout)
+		defer timer.Stop()
+		timerCh = timer.C
+	}
+	select {
+	case err := <-errC:
+		return err
+	case <-timerCh:
+		// Best effort: the process may already have exited.
+		cmd.Process.Kill()
+		// The goroutine above is the only caller of cmd.Wait; calling Wait a
+		// second time here would race with it. Wait for its result instead so
+		// the process is reaped before returning.
+		<-errC
+		return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
+	}
+}
--- a/libcontainer/configs/config_linux.go
+++ b/libcontainer/configs/config_linux.go
@ -0,0 +1,61 @@
+package configs
+
+import "fmt"
+
+// HostUID translates a container uid into the uid seen on the host. Without a
+// user namespace the id is returned unchanged; with one, it is looked up in
+// UidMappings and an error is returned when no mapping covers it.
+func (c Config) HostUID(containerId int) (int, error) {
+	if !c.Namespaces.Contains(NEWUSER) {
+		// No user namespace: container and host ids coincide.
+		return containerId, nil
+	}
+	if c.UidMappings == nil {
+		return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.")
+	}
+	hostID, ok := c.hostIDFromMapping(containerId, c.UidMappings)
+	if ok {
+		return hostID, nil
+	}
+	return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.")
+}
+
+// HostRootUID gets the root uid for the process on host which could be non-zero
+// when user namespaces are enabled.
+func (c Config) HostRootUID() (int, error) {
+	return c.HostUID(0)
+}
+
+// HostGID translates a container gid into the gid seen on the host. Without a
+// user namespace the id is returned unchanged; with one, it is looked up in
+// GidMappings and an error is returned when no mapping covers it.
+func (c Config) HostGID(containerId int) (int, error) {
+	if !c.Namespaces.Contains(NEWUSER) {
+		// No user namespace: container and host ids coincide.
+		return containerId, nil
+	}
+	if c.GidMappings == nil {
+		return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.")
+	}
+	hostID, ok := c.hostIDFromMapping(containerId, c.GidMappings)
+	if ok {
+		return hostID, nil
+	}
+	return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.")
+}
+
+// HostRootGID gets the root gid for the process on host which could be non-zero
+// when user namespaces are enabled.
+func (c Config) HostRootGID() (int, error) {
+	return c.HostGID(0)
+}
+
+// hostIDFromMapping looks containerID up in uMap. The first entry whose range
+// [ContainerID, ContainerID+Size-1] contains the id wins, and the host id at
+// the same offset from HostID is returned with true. If no entry matches it
+// returns (-1, false).
+func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
+	for i := range uMap {
+		entry := uMap[i]
+		first := entry.ContainerID
+		last := entry.ContainerID + entry.Size - 1
+		if containerID < first || containerID > last {
+			continue
+		}
+		return entry.HostID + (containerID - first), true
+	}
+	return -1, false
+}
--- a/libcontainer/configs/config_linux_test.go
+++ b/libcontainer/configs/config_linux_test.go
@ -0,0 +1,130 @@
+package configs
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// loadConfig reads ../sample_configs/<name> and decodes it into a Config.
+// It then decodes the same file into a generic map and fails if that map has
+// top-level keys that do not appear when the Config is re-encoded, i.e. fields
+// the Config type does not know about.
+func loadConfig(name string) (*Config, error) {
+	path := filepath.Join("../sample_configs", name)
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	var cfg *Config
+	if err = json.NewDecoder(f).Decode(&cfg); err != nil {
+		return nil, err
+	}
+
+	// Rewind and decode the raw document for the unknown-field check.
+	if _, err = f.Seek(0, 0); err != nil {
+		return nil, err
+	}
+	var raw map[string]interface{}
+	if err = json.NewDecoder(f).Decode(&raw); err != nil {
+		return nil, err
+	}
+
+	// Round-trip the typed config to learn which top-level keys it emits.
+	encoded, err := json.Marshal(&cfg)
+	if err != nil {
+		return nil, err
+	}
+	var known map[string]interface{}
+	if err = json.Unmarshal(encoded, &known); err != nil {
+		return nil, err
+	}
+
+	for key := range known {
+		delete(raw, key)
+	}
+	if len(raw) != 0 {
+		return nil, fmt.Errorf("unknown fields: %s", raw)
+	}
+
+	return cfg, nil
+}
+
+// TestRemoveNamespace checks that removing the only namespace reports success
+// and leaves the list empty.
+func TestRemoveNamespace(t *testing.T) {
+	namespaces := Namespaces{
+		{Type: NEWNET},
+	}
+	removed := namespaces.Remove(NEWNET)
+	if !removed {
+		t.Fatal("NEWNET was not removed")
+	}
+	if remaining := len(namespaces); remaining != 0 {
+		t.Fatalf("namespaces should have 0 items but reports %d", remaining)
+	}
+}
+
+func TestHostRootUIDNoUSERNS(t *testing.T) {
+	config := &Config{
+		Namespaces: Namespaces{},
+	}
+	uid, err := config.HostRootUID()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if uid != 0 {
+		t.Fatalf("expected uid 0 with no USERNS but received %d", uid)
+	}
+}
+
+// TestHostRootUIDWithUSERNS checks that, with a user namespace mapping
+// container uid 0 to host uid 1000, the host root uid is 1000.
+func TestHostRootUIDWithUSERNS(t *testing.T) {
+	config := &Config{
+		Namespaces: Namespaces{{Type: NEWUSER}},
+		UidMappings: []IDMap{
+			{
+				ContainerID: 0,
+				HostID:      1000,
+				Size:        1,
+			},
+		},
+	}
+	uid, err := config.HostRootUID()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if uid != 1000 {
+		// The message previously said "with no USERNS", which is the opposite
+		// of what this test sets up.
+		t.Fatalf("expected uid 1000 with USERNS but received %d", uid)
+	}
+}
+
+func TestHostRootGIDNoUSERNS(t *testing.T) {
+	config := &Config{
+		Namespaces: Namespaces{},
+	}
+	uid, err := config.HostRootGID()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if uid != 0 {
+		t.Fatalf("expected gid 0 with no USERNS but received %d", uid)
+	}
+}
+
+// TestHostRootGIDWithUSERNS checks that, with a user namespace mapping
+// container gid 0 to host gid 1000, the host root gid is 1000.
+func TestHostRootGIDWithUSERNS(t *testing.T) {
+	config := &Config{
+		Namespaces: Namespaces{{Type: NEWUSER}},
+		GidMappings: []IDMap{
+			{
+				ContainerID: 0,
+				HostID:      1000,
+				Size:        1,
+			},
+		},
+	}
+	gid, err := config.HostRootGID()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if gid != 1000 {
+		// The message previously said "with no USERNS", which is the opposite
+		// of what this test sets up.
+		t.Fatalf("expected gid 1000 with USERNS but received %d", gid)
+	}
+}
--- a/libcontainer/configs/config_test.go
+++ b/libcontainer/configs/config_test.go
@ -0,0 +1,195 @@
+package configs_test
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"reflect"
+	"testing"
+	"time"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runtime-spec/specs-go"
+)
+
+// TestUnmarshalHooks checks that a JSON document with one prestart command
+// is decoded into a single CommandHook equal to the original.
+func TestUnmarshalHooks(t *testing.T) {
+	timeout := time.Second
+
+	prestartCmd := configs.NewCommandHook(configs.Command{
+		Path:    "/var/vcap/hooks/prestart",
+		Args:    []string{"--pid=123"},
+		Env:     []string{"FOO=BAR"},
+		Dir:     "/var/vcap",
+		Timeout: &timeout,
+	})
+	prestart, err := json.Marshal(prestartCmd.Command)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	hook := configs.Hooks{}
+	err = hook.UnmarshalJSON([]byte(fmt.Sprintf(`{"Prestart" :[%s]}`, prestart)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Fail cleanly instead of panicking on hook.Prestart[0] if nothing (or too
+	// much) was decoded.
+	if len(hook.Prestart) != 1 {
+		t.Fatalf("Expected exactly 1 prestart hook but got %d", len(hook.Prestart))
+	}
+
+	if !reflect.DeepEqual(hook.Prestart[0], prestartCmd) {
+		t.Errorf("Expected prestart to equal %+v but it was %+v",
+			prestartCmd, hook.Prestart[0])
+	}
+}
+
+func TestUnmarshalHooksWithInvalidData(t *testing.T) {
+	hook := configs.Hooks{}
+	err := hook.UnmarshalJSON([]byte(`{invalid-json}`))
+	if err == nil {
+		t.Error("Expected error to occur but it was nil")
+	}
+}
+
+func TestMarshalHooks(t *testing.T) {
+	timeout := time.Second
+
+	prestartCmd := configs.NewCommandHook(configs.Command{
+		Path:    "/var/vcap/hooks/prestart",
+		Args:    []string{"--pid=123"},
+		Env:     []string{"FOO=BAR"},
+		Dir:     "/var/vcap",
+		Timeout: &timeout,
+	})
+
+	hook := configs.Hooks{
+		Prestart: []configs.Hook{prestartCmd},
+	}
+	hooks, err := hook.MarshalJSON()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	h := `{"poststart":null,"poststop":null,"prestart":[{"path":"/var/vcap/hooks/prestart","args":["--pid=123"],"env":["FOO=BAR"],"dir":"/var/vcap","timeout":1000000000}]}`
+	if string(hooks) != h {
+		t.Errorf("Expected hooks %s to equal %s", string(hooks), h)
+	}
+}
+
+// TestMarshalUnmarshalHooks checks that a prestart CommandHook survives a
+// MarshalJSON -> UnmarshalJSON round trip unchanged.
+func TestMarshalUnmarshalHooks(t *testing.T) {
+	timeout := time.Second
+
+	prestart := configs.NewCommandHook(configs.Command{
+		Path:    "/var/vcap/hooks/prestart",
+		Args:    []string{"--pid=123"},
+		Env:     []string{"FOO=BAR"},
+		Dir:     "/var/vcap",
+		Timeout: &timeout,
+	})
+
+	hook := configs.Hooks{
+		Prestart: []configs.Hook{prestart},
+	}
+	hooks, err := hook.MarshalJSON()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	umMhook := configs.Hooks{}
+	err = umMhook.UnmarshalJSON(hooks)
+	if err != nil {
+		t.Fatal(err)
+	}
+	// Fail cleanly instead of panicking on umMhook.Prestart[0] if the round
+	// trip lost (or duplicated) the hook.
+	if len(umMhook.Prestart) != 1 {
+		t.Fatalf("Expected exactly 1 prestart hook after round trip but got %d", len(umMhook.Prestart))
+	}
+	if !reflect.DeepEqual(umMhook.Prestart[0], prestart) {
+		t.Errorf("Expected hooks to be equal after marshaling -> unmarshaling them: %+v, %+v", umMhook.Prestart[0], prestart)
+	}
+}
+
+func TestMarshalHooksWithUnexpectedType(t *testing.T) {
+	fHook := configs.NewFunctionHook(func(*specs.State) error {
+		return nil
+	})
+	hook := configs.Hooks{
+		Prestart: []configs.Hook{fHook},
+	}
+	hooks, err := hook.MarshalJSON()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	h := `{"poststart":null,"poststop":null,"prestart":null}`
+	if string(hooks) != h {
+		t.Errorf("Expected hooks %s to equal %s", string(hooks), h)
+	}
+}
+
+// TestFuncHookRun checks that a FuncHook passes the given state through to
+// the wrapped function and that Run reports no error.
+func TestFuncHookRun(t *testing.T) {
+	state := &specs.State{
+		Version: "1",
+		ID:      "1",
+		Status:  "created",
+		Pid:     1,
+		Bundle:  "/bundle",
+	}
+
+	fHook := configs.NewFunctionHook(func(s *specs.State) error {
+		if !reflect.DeepEqual(state, s) {
+			t.Errorf("Expected state %+v to equal %+v", state, s)
+		}
+		return nil
+	})
+
+	// The result of Run used to be discarded.
+	if err := fHook.Run(state); err != nil {
+		t.Errorf("Expected error to not occur but it was %+v", err)
+	}
+}
+
+// TestCommandHookRun re-executes the test binary, restricted to
+// TestHelperProcess, as a command hook and expects it to finish without error
+// inside the one-second timeout.
+func TestCommandHookRun(t *testing.T) {
+	state := &specs.State{
+		Version: "1",
+		ID:      "1",
+		Status:  "created",
+		Pid:     1,
+		Bundle:  "/bundle",
+	}
+	timeout := time.Second
+
+	cmdHook := configs.NewCommandHook(configs.Command{
+		Path:    os.Args[0],
+		Args:    []string{os.Args[0], "-test.run=TestHelperProcess"},
+		Env:     []string{"FOO=BAR"},
+		Dir:     "/",
+		Timeout: &timeout,
+	})
+
+	err := cmdHook.Run(state)
+	if err != nil {
+		// Pass the format and argument directly. Pre-formatting with
+		// fmt.Sprintf made the error text itself the format string, so any '%'
+		// in it was misinterpreted.
+		t.Errorf("Expected error to not occur but it was %+v", err)
+	}
+}
+
+// TestCommandHookRunTimeout re-executes the test binary, restricted to
+// TestHelperProcessWithTimeout (which sleeps for a second), under a 10ms
+// timeout and expects Run to report an error.
+func TestCommandHookRunTimeout(t *testing.T) {
+	timeout := (10 * time.Millisecond)
+	state := &specs.State{
+		Version: "1",
+		ID:      "1",
+		Status:  "created",
+		Pid:     1,
+		Bundle:  "/bundle",
+	}
+
+	self := os.Args[0]
+	cmdHook := configs.NewCommandHook(configs.Command{
+		Path:    self,
+		Args:    []string{self, "-test.run=TestHelperProcessWithTimeout"},
+		Env:     []string{"FOO=BAR"},
+		Dir:     "/",
+		Timeout: &timeout,
+	})
+
+	if err := cmdHook.Run(state); err == nil {
+		t.Error("Expected error to occur but it was nil")
+	}
+}
+
+// TestHelperProcess is the body of the child process spawned by
+// TestCommandHookRun: it prints a line and exits with status 0.
+func TestHelperProcess(_ *testing.T) {
+	fmt.Fprintln(os.Stdout, "Helper Process")
+	os.Exit(0)
+}
+func TestHelperProcessWithTimeout(*testing.T) {
+	time.Sleep(time.Second)
+}
--- a/libcontainer/configs/config_windows_test.go
+++ b/libcontainer/configs/config_windows_test.go
@ -0,0 +1,3 @@
+package configs
+
+// All current tests are for Unix-specific functionality
--- a/libcontainer/configs/device.go
+++ b/libcontainer/configs/device.go
@ -0,0 +1,57 @@
+package configs
+
+import (
+	"fmt"
+	"os"
+)
+
+const (
+	// Wildcard is the major/minor value that stands for "any"; it is rendered
+	// as "*" by deviceNumberString.
+	Wildcard = -1
+)
+
+// TODO Windows: This can be factored out in the future
+
+// Device describes a device node and the cgroup rule that goes with it.
+// CgroupString renders the rule as "<type> <major>:<minor> <permissions>".
+type Device struct {
+	// Device type, block, char, etc.
+	Type rune `json:"type"`
+
+	// Path to the device.
+	Path string `json:"path"`
+
+	// Major is the device's major number.
+	Major int64 `json:"major"`
+
+	// Minor is the device's minor number.
+	Minor int64 `json:"minor"`
+
+	// Cgroup permissions format, rwm.
+	Permissions string `json:"permissions"`
+
+	// FileMode permission bits for the device.
+	FileMode os.FileMode `json:"file_mode"`
+
+	// Uid of the device.
+	Uid uint32 `json:"uid"`
+
+	// Gid of the device.
+	Gid uint32 `json:"gid"`
+
+	// Write the file to the allowed list
+	Allow bool `json:"allow"`
+}
+
+// CgroupString renders the device rule as "<type> <major>:<minor> <perms>",
+// with a Wildcard major or minor printed as "*".
+func (d *Device) CgroupString() string {
+	major := deviceNumberString(d.Major)
+	minor := deviceNumberString(d.Minor)
+	return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
+}
+
+// Mkdev packs Major and Minor into a single device number: the major number
+// shifted left by 8, the low 8 bits of the minor in place, and minor bits
+// 8-19 shifted left by a further 12.
+func (d *Device) Mkdev() int {
+	majorBits := d.Major << 8
+	minorLow := d.Minor & 0xff
+	minorHigh := (d.Minor & 0xfff00) << 12
+	return int(majorBits | minorLow | minorHigh)
+}
+
+// deviceNumberString formats a major or minor number for a cgroup device
+// rule: Wildcard becomes "*", anything else its decimal representation.
+func deviceNumberString(number int64) string {
+	switch number {
+	case Wildcard:
+		return "*"
+	default:
+		return fmt.Sprint(number)
+	}
+}
--- a/libcontainer/configs/device_defaults.go
+++ b/libcontainer/configs/device_defaults.go
@ -0,0 +1,111 @@
+// +build linux
+
+package configs
+
+var (
+	// DefaultSimpleDevices are devices that are to be both allowed and created.
+	DefaultSimpleDevices = []*Device{
+		// /dev/null and zero
+		{
+			Path:        "/dev/null",
+			Type:        'c',
+			Major:       1,
+			Minor:       3,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+		{
+			Path:        "/dev/zero",
+			Type:        'c',
+			Major:       1,
+			Minor:       5,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+
+		// /dev/full
+		{
+			Path:        "/dev/full",
+			Type:        'c',
+			Major:       1,
+			Minor:       7,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+
+		// consoles and ttys
+		{
+			Path:        "/dev/tty",
+			Type:        'c',
+			Major:       5,
+			Minor:       0,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+
+		// /dev/urandom,/dev/random
+		{
+			Path:        "/dev/urandom",
+			Type:        'c',
+			Major:       1,
+			Minor:       9,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+		{
+			Path:        "/dev/random",
+			Type:        'c',
+			Major:       1,
+			Minor:       8,
+			Permissions: "rwm",
+			FileMode:    0666,
+		},
+	}
+	// DefaultAllowedDevices is the list of device rules below followed by
+	// every entry of DefaultSimpleDevices (the *Device values are shared, not
+	// copied).
+	DefaultAllowedDevices = append([]*Device{
+		// allow mknod for any device
+		{
+			Type:        'c',
+			Major:       Wildcard,
+			Minor:       Wildcard,
+			Permissions: "m",
+		},
+		{
+			Type:        'b',
+			Major:       Wildcard,
+			Minor:       Wildcard,
+			Permissions: "m",
+		},
+
+		// /dev/console
+		{
+			Path:        "/dev/console",
+			Type:        'c',
+			Major:       5,
+			Minor:       1,
+			Permissions: "rwm",
+		},
+		// /dev/pts/ - pts namespaces are "coming soon"
+		{
+			Path:        "",
+			Type:        'c',
+			Major:       136,
+			Minor:       Wildcard,
+			Permissions: "rwm",
+		},
+		// NOTE(review): char 5:2 is presumably /dev/ptmx — confirm.
+		{
+			Path:        "",
+			Type:        'c',
+			Major:       5,
+			Minor:       2,
+			Permissions: "rwm",
+		},
+
+		// tuntap
+		{
+			Path:        "",
+			Type:        'c',
+			Major:       10,
+			Minor:       200,
+			Permissions: "rwm",
+		},
+	}, DefaultSimpleDevices...)
+	// DefaultAutoCreatedDevices is a fresh slice holding the same *Device
+	// entries as DefaultSimpleDevices.
+	DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...)
+)
--- a/libcontainer/configs/hugepage_limit.go
+++ b/libcontainer/configs/hugepage_limit.go
@ -0,0 +1,9 @@
+package configs
+
+type HugepageLimit struct { // HugepageLimit pairs a hugepage type with the usage limit to apply to it.
+	// Pagesize selects which type of hugepage to limit.
+	Pagesize string `json:"page_size"`
+
+	// Limit is the usage limit for that hugepage type (unit not stated here; presumably bytes — TODO confirm).
+	Limit uint64 `json:"limit"`
+}
--- a/libcontainer/configs/intelrdt.go
+++ b/libcontainer/configs/intelrdt.go
@ -0,0 +1,13 @@
+package configs
+
+type IntelRdt struct { // IntelRdt holds the L3 cache and memory bandwidth schema strings.
+	// L3CacheSchema is the schema for L3 cache id and capacity bitmask (CBM).
+	// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
+	L3CacheSchema string `json:"l3_cache_schema,omitempty"`
+
+	// MemBwSchema is the schema of memory bandwidth per L3 cache id.
+	// Format: "MB:<cache_id0>=<bandwidth0>;<cache_id1>=<bandwidth1>;..."
+	// The unit of memory bandwidth is "percentages" by default, and
+	// "MBps" if the MBA Software Controller is enabled.
+	MemBwSchema string `json:"memBwSchema,omitempty"` // NOTE(review): camelCase JSON key, unlike l3_cache_schema above; left unchanged
+}
--- a/libcontainer/configs/interface_priority_map.go
+++ b/libcontainer/configs/interface_priority_map.go
@ -0,0 +1,14 @@
+package configs
+
+import (
+	"fmt"
+)
+
+type IfPrioMap struct { // IfPrioMap pairs an interface name with a priority value.
+	Interface string `json:"interface"`
+	Priority  int64  `json:"priority"`
+}
+
+func (i *IfPrioMap) CgroupString() string { // CgroupString returns the entry formatted as "<Interface> <Priority>".
+	return fmt.Sprintf("%s %d", i.Interface, i.Priority)
+}
--- a/libcontainer/configs/mount.go
+++ b/libcontainer/configs/mount.go
@ -0,0 +1,39 @@
+package configs
+
+const ( // Extension flag bits (presumably the values stored in Mount.Extensions — TODO confirm).
+	// EXT_COPYUP is a directive to copy up the contents of a directory when
+	// a tmpfs is mounted over it.
+	EXT_COPYUP = 1 << iota // iota is 0 for this first entry, so the value is 1
+)
+
+type Mount struct { // Mount describes a single mount: where it comes from, where it goes, and how it is applied.
+	// Source is the source path for the mount.
+	Source string `json:"source"`
+
+	// Destination is the destination path for the mount inside the container.
+	Destination string `json:"destination"`
+
+	// Device is the device the mount is for.
+	Device string `json:"device"`
+
+	// Flags holds the mount flags.
+	Flags int `json:"flags"`
+
+	// PropagationFlags holds the mount propagation flags.
+	PropagationFlags []int `json:"propagation_flags"`
+
+	// Data is the mount data applied to the mount.
+	Data string `json:"data"`
+
+	// Relabel requests relabeling of the source if set: "z" indicates shared, "Z" indicates unshared.
+	Relabel string `json:"relabel"`
+
+	// Extensions are additional flags that are specific to runc.
+	Extensions int `json:"extensions"`
+
+	// PremountCmds are optional commands to be run before Source is mounted.
+	PremountCmds []Command `json:"premount_cmds"`
+
+	// PostmountCmds are optional commands to be run after Source is mounted.
+	PostmountCmds []Command `json:"postmount_cmds"`
+}
--- a/libcontainer/configs/namespaces.go
+++ b/libcontainer/configs/namespaces.go
@ -0,0 +1,5 @@
+package configs
+
+type NamespaceType string // NamespaceType is a string identifying a kind of namespace; its values are not declared in this file.
+
+type Namespaces []Namespace // Namespaces is a list of Namespace values; Namespace is declared outside this file.
--- a/Show More
+++ b/Show More