2015-10-03 02:16:50 +08:00
|
|
|
// +build linux
|
|
|
|
|
|
|
|
package libcontainer
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"os"
|
|
|
|
"path/filepath"
|
|
|
|
|
|
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
2017-05-10 05:38:27 +08:00
|
|
|
|
2017-07-19 22:28:59 +08:00
|
|
|
"github.com/sirupsen/logrus"
|
2017-05-10 05:38:27 +08:00
|
|
|
"golang.org/x/sys/unix"
|
2015-10-03 02:16:50 +08:00
|
|
|
)
|
|
|
|
|
|
|
|
func newStateTransitionError(from, to containerState) error {
|
|
|
|
return &stateTransitionError{
|
|
|
|
From: from.status().String(),
|
|
|
|
To: to.status().String(),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// stateTransitionError is returned when an invalid state transition is
// requested from one state to another.
type stateTransitionError struct {
	// From is the name of the state the container was in.
	From string
	// To is the name of the state that was requested.
	To string
}

// Error implements the error interface and names both ends of the rejected
// transition.
func (s *stateTransitionError) Error() string {
	return "invalid state transition from " + s.From + " to " + s.To
}
|
|
|
|
|
|
|
|
type containerState interface {
|
|
|
|
transition(containerState) error
|
|
|
|
destroy() error
|
|
|
|
status() Status
|
|
|
|
}
|
|
|
|
|
|
|
|
func destroy(c *linuxContainer) error {
|
|
|
|
if !c.config.Namespaces.Contains(configs.NEWPID) {
|
2017-05-10 05:38:27 +08:00
|
|
|
if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
|
2015-10-03 02:16:50 +08:00
|
|
|
logrus.Warn(err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
err := c.cgroupManager.Destroy()
|
libcontainer: add support for Intel RDT/CAT in runc
About Intel RDT/CAT feature:
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
For more information about Intel RDT/CAT can be found in the section 17.17
of Intel Software Developer Manual.
About Intel RDT/CAT kernel interface:
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
Intel RDT "resource control" filesystem hierarchy:
mount -t resctrl resctrl /sys/fs/resctrl
tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| |-- num_closids
|-- cpus
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- schemata
|-- tasks
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
Is in root group.
The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
}
Signed-off-by: Xiaochen Shen <xiaochen.shen@intel.com>
2017-08-30 19:34:26 +08:00
|
|
|
if c.intelRdtManager != nil {
|
|
|
|
if ierr := c.intelRdtManager.Destroy(); err == nil {
|
|
|
|
err = ierr
|
|
|
|
}
|
|
|
|
}
|
2015-10-03 02:16:50 +08:00
|
|
|
if rerr := os.RemoveAll(c.root); err == nil {
|
|
|
|
err = rerr
|
|
|
|
}
|
|
|
|
c.initProcess = nil
|
|
|
|
if herr := runPoststopHooks(c); err == nil {
|
|
|
|
err = herr
|
|
|
|
}
|
2016-01-22 08:43:33 +08:00
|
|
|
c.state = &stoppedState{c: c}
|
2015-10-03 02:16:50 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
func runPoststopHooks(c *linuxContainer) error {
|
|
|
|
if c.config.Hooks != nil {
|
libcontainer: Set 'status' in hook stdin
Finish off the work started in a344b2d6 (sync up `HookState` with OCI
spec `State`, 2016-12-19, #1201).
And drop HookState, since there's no need for a local alias for
specs.State.
Also set c.initProcess in newInitProcess to support OCIState calls
from within initProcess.start(). I think the cyclic references
between linuxContainer and initProcess are unfortunate, but didn't
want to address that here.
I've also left the timing of the Prestart hooks alone, although the
spec calls for them to happen before start (not as part of creation)
[1,2]. Once the timing gets fixed we can drop the
initProcessStartTime hacks which initProcess.start currently needs.
I'm not sure why we trigger the prestart hooks in response to both
procReady and procHooks. But we've had two prestart rounds in
initProcess.start since 2f276498 (Move pre-start hooks after container
mounts, 2016-02-17, #568). I've left that alone too.
I really think we should have len() guards to avoid computing the
state when .Hooks is non-nil but the particular phase we're looking at
is empty. Aleksa, however, is adamantly against them [3] citing a
risk of sloppy copy/pastes causing the hook slice being len-guarded to
diverge from the hook slice being iterated over within the guard. I
think that ort of thing is very lo-risk, because:
* We shouldn't be copy/pasting this, right? DRY for the win :).
* There's only ever a few lines between the guard and the guarded
loop. That makes broken copy/pastes easy to catch in review.
* We should have test coverage for these. Guarding with the wrong
slice is certainly not the only thing you can break with a sloppy
copy/paste.
But I'm not a maintainer ;).
[1]: https://github.com/opencontainers/runtime-spec/blob/v1.0.0/config.md#prestart
[2]: https://github.com/opencontainers/runc/issues/1710
[3]: https://github.com/opencontainers/runc/pull/1741#discussion_r233331570
Signed-off-by: W. Trevor King <wking@tremily.us>
2018-02-26 06:47:41 +08:00
|
|
|
s, err := c.currentOCIState()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
2015-10-03 02:16:50 +08:00
|
|
|
}
|
|
|
|
for _, hook := range c.config.Hooks.Poststop {
|
|
|
|
if err := hook.Run(s); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// stoppedState represents a container is a stopped/destroyed state.
|
|
|
|
type stoppedState struct {
|
|
|
|
c *linuxContainer
|
|
|
|
}
|
|
|
|
|
|
|
|
func (b *stoppedState) status() Status {
|
2016-05-14 08:01:12 +08:00
|
|
|
return Stopped
|
2015-10-03 02:16:50 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
func (b *stoppedState) transition(s containerState) error {
|
|
|
|
switch s.(type) {
|
2016-10-20 17:57:37 +08:00
|
|
|
case *runningState, *restoredState:
|
2015-10-03 02:16:50 +08:00
|
|
|
b.c.state = s
|
|
|
|
return nil
|
|
|
|
case *stoppedState:
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return newStateTransitionError(b, s)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (b *stoppedState) destroy() error {
|
|
|
|
return destroy(b.c)
|
|
|
|
}
|
|
|
|
|
|
|
|
// runningState represents a container that is currently running.
|
|
|
|
type runningState struct {
|
|
|
|
c *linuxContainer
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *runningState) status() Status {
|
|
|
|
return Running
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *runningState) transition(s containerState) error {
|
|
|
|
switch s.(type) {
|
|
|
|
case *stoppedState:
|
2016-05-14 07:54:16 +08:00
|
|
|
t, err := r.c.runType()
|
2015-10-03 02:16:50 +08:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-05-14 07:54:16 +08:00
|
|
|
if t == Running {
|
2015-10-03 02:16:50 +08:00
|
|
|
return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
|
|
|
|
}
|
|
|
|
r.c.state = s
|
|
|
|
return nil
|
2016-01-25 12:52:52 +08:00
|
|
|
case *pausedState:
|
2015-10-03 02:16:50 +08:00
|
|
|
r.c.state = s
|
|
|
|
return nil
|
2016-01-22 08:43:33 +08:00
|
|
|
case *runningState:
|
2015-10-03 02:16:50 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return newStateTransitionError(r, s)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *runningState) destroy() error {
|
2016-05-14 07:54:16 +08:00
|
|
|
t, err := r.c.runType()
|
2015-10-03 02:16:50 +08:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-05-14 07:54:16 +08:00
|
|
|
if t == Running {
|
2015-10-03 02:16:50 +08:00
|
|
|
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
|
|
|
|
}
|
|
|
|
return destroy(r.c)
|
|
|
|
}
|
|
|
|
|
2016-05-14 08:01:12 +08:00
|
|
|
type createdState struct {
|
2016-05-14 07:54:16 +08:00
|
|
|
c *linuxContainer
|
|
|
|
}
|
|
|
|
|
2016-05-14 08:01:12 +08:00
|
|
|
func (i *createdState) status() Status {
|
|
|
|
return Created
|
2016-05-14 07:54:16 +08:00
|
|
|
}
|
|
|
|
|
2016-05-14 08:01:12 +08:00
|
|
|
func (i *createdState) transition(s containerState) error {
|
2016-05-14 07:54:16 +08:00
|
|
|
switch s.(type) {
|
2016-05-20 08:28:58 +08:00
|
|
|
case *runningState, *pausedState, *stoppedState:
|
2016-05-14 07:54:16 +08:00
|
|
|
i.c.state = s
|
2016-05-20 08:28:58 +08:00
|
|
|
return nil
|
2016-05-14 08:01:12 +08:00
|
|
|
case *createdState:
|
2016-05-14 07:54:16 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return newStateTransitionError(i, s)
|
|
|
|
}
|
|
|
|
|
2016-05-14 08:01:12 +08:00
|
|
|
func (i *createdState) destroy() error {
|
2017-05-10 05:38:27 +08:00
|
|
|
i.c.initProcess.signal(unix.SIGKILL)
|
2016-05-14 07:54:16 +08:00
|
|
|
return destroy(i.c)
|
|
|
|
}
|
|
|
|
|
2015-10-03 02:16:50 +08:00
|
|
|
// pausedState represents a container that is currently pause. It cannot be destroyed in a
|
|
|
|
// paused state and must transition back to running first.
|
|
|
|
type pausedState struct {
|
|
|
|
c *linuxContainer
|
|
|
|
}
|
|
|
|
|
|
|
|
func (p *pausedState) status() Status {
|
|
|
|
return Paused
|
|
|
|
}
|
|
|
|
|
|
|
|
func (p *pausedState) transition(s containerState) error {
|
|
|
|
switch s.(type) {
|
2016-01-22 08:43:33 +08:00
|
|
|
case *runningState, *stoppedState:
|
2015-10-03 02:16:50 +08:00
|
|
|
p.c.state = s
|
|
|
|
return nil
|
|
|
|
case *pausedState:
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return newStateTransitionError(p, s)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (p *pausedState) destroy() error {
|
2016-05-14 07:54:16 +08:00
|
|
|
t, err := p.c.runType()
|
2016-01-22 08:43:33 +08:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-05-14 07:54:16 +08:00
|
|
|
if t != Running && t != Created {
|
2016-01-22 08:43:33 +08:00
|
|
|
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
return destroy(p.c)
|
|
|
|
}
|
2015-10-03 02:16:50 +08:00
|
|
|
return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
|
|
|
|
}
|
|
|
|
|
2017-02-03 23:46:38 +08:00
|
|
|
// restoredState is the same as the running state but also has associated checkpoint
|
2016-06-04 01:05:34 +08:00
|
|
|
// information that maybe need destroyed when the container is stopped and destroy is called.
|
2015-10-03 02:16:50 +08:00
|
|
|
type restoredState struct {
|
|
|
|
imageDir string
|
|
|
|
c *linuxContainer
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *restoredState) status() Status {
|
|
|
|
return Running
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *restoredState) transition(s containerState) error {
|
|
|
|
switch s.(type) {
|
2016-10-20 17:57:37 +08:00
|
|
|
case *stoppedState, *runningState:
|
2015-10-03 02:16:50 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return newStateTransitionError(r, s)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *restoredState) destroy() error {
|
|
|
|
if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
|
|
|
|
if !os.IsNotExist(err) {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return destroy(r.c)
|
|
|
|
}
|
|
|
|
|
2016-05-14 08:01:12 +08:00
|
|
|
// loadedState is used whenever a container is restored, loaded, or setting additional
|
2015-10-03 02:16:50 +08:00
|
|
|
// processes inside and it should not be destroyed when it is exiting.
|
2016-05-14 08:01:12 +08:00
|
|
|
type loadedState struct {
|
2015-10-03 02:16:50 +08:00
|
|
|
c *linuxContainer
|
|
|
|
s Status
|
|
|
|
}
|
|
|
|
|
2016-05-14 08:01:12 +08:00
|
|
|
func (n *loadedState) status() Status {
|
2015-10-03 02:16:50 +08:00
|
|
|
return n.s
|
|
|
|
}
|
|
|
|
|
2016-05-14 08:01:12 +08:00
|
|
|
func (n *loadedState) transition(s containerState) error {
|
2016-01-22 08:48:05 +08:00
|
|
|
n.c.state = s
|
2015-10-03 02:16:50 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2016-05-14 08:01:12 +08:00
|
|
|
func (n *loadedState) destroy() error {
|
2016-01-12 08:57:18 +08:00
|
|
|
if err := n.c.refreshState(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
return n.c.state.destroy()
|
2015-10-03 02:16:50 +08:00
|
|
|
}
|