359 lines
9.1 KiB
Go
359 lines
9.1 KiB
Go
// +build linux
|
|
|
|
package libcontainer
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"net"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
|
|
"github.com/Sirupsen/logrus"
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
"github.com/opencontainers/runc/libcontainer/configs"
|
|
"github.com/opencontainers/runc/libcontainer/system"
|
|
"github.com/opencontainers/runc/libcontainer/user"
|
|
"github.com/opencontainers/runc/libcontainer/utils"
|
|
"github.com/vishvananda/netlink"
|
|
)
|
|
|
|
// initType names the flavour of init process that newContainerInit is asked
// to construct.
type initType string

const (
	// initSetns makes newContainerInit return a linuxSetnsInit.
	initSetns initType = "setns"
	// initStandard makes newContainerInit return a linuxStandardInit.
	initStandard initType = "standard"
)
|
|
|
|
// pid wraps a single process id so that it travels as the JSON object
// {"pid": <n>}.
type pid struct {
	Pid int `json:"pid"`
}
|
|
|
|
// network is an internal struct used to setup container networks.
|
|
type network struct {
|
|
configs.Network
|
|
|
|
// TempVethPeerName is a unique temporary veth peer name that was placed into
|
|
// the container's namespace.
|
|
TempVethPeerName string `json:"temp_veth_peer_name"`
|
|
}
|
|
|
|
// initConfig is used for transferring parameters from Exec() to Init()
|
|
type initConfig struct {
|
|
Args []string `json:"args"`
|
|
Env []string `json:"env"`
|
|
Cwd string `json:"cwd"`
|
|
Capabilities []string `json:"capabilities"`
|
|
User string `json:"user"`
|
|
Config *configs.Config `json:"config"`
|
|
Console string `json:"console"`
|
|
Networks []*network `json:"network"`
|
|
PassedFilesCount int `json:"passed_files_count"`
|
|
}
|
|
|
|
// initer is the common interface of the init flavours returned by
// newContainerInit.
type initer interface {
	// Init runs the flavour-specific initialization and reports any failure.
	Init() error
}
|
|
|
|
func newContainerInit(t initType, pipe *os.File) (initer, error) {
|
|
var config *initConfig
|
|
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
|
|
return nil, err
|
|
}
|
|
if err := populateProcessEnvironment(config.Env); err != nil {
|
|
return nil, err
|
|
}
|
|
switch t {
|
|
case initSetns:
|
|
return &linuxSetnsInit{
|
|
config: config,
|
|
}, nil
|
|
case initStandard:
|
|
return &linuxStandardInit{
|
|
pipe: pipe,
|
|
parentPid: syscall.Getppid(),
|
|
config: config,
|
|
}, nil
|
|
}
|
|
return nil, fmt.Errorf("unknown init type %q", t)
|
|
}
|
|
|
|
// populateProcessEnvironment exports every "KEY=VALUE" entry of env into the
// current process's environment.
//
// Only the first '=' separates key from value, so values may themselves
// contain '='. Processing stops at the first entry that has no '=' or that
// os.Setenv rejects; entries before it have already been applied.
func populateProcessEnvironment(env []string) error {
	for _, entry := range env {
		sep := strings.Index(entry, "=")
		if sep < 0 {
			return fmt.Errorf("invalid environment '%v'", entry)
		}
		name, value := entry[:sep], entry[sep+1:]
		if err := os.Setenv(name, value); err != nil {
			return err
		}
	}
	return nil
}
|
|
|
|
// finalizeNamespace drops the caps, sets the correct user
|
|
// and working dir, and closes any leaked file descriptors
|
|
// before executing the command inside the namespace
|
|
func finalizeNamespace(config *initConfig) error {
|
|
// Ensure that all unwanted fds we may have accidentally
|
|
// inherited are marked close-on-exec so they stay out of the
|
|
// container
|
|
if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
|
|
return err
|
|
}
|
|
|
|
capabilities := config.Config.Capabilities
|
|
if config.Capabilities != nil {
|
|
capabilities = config.Capabilities
|
|
}
|
|
w, err := newCapWhitelist(capabilities)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// drop capabilities in bounding set before changing user
|
|
if err := w.dropBoundingSet(); err != nil {
|
|
return err
|
|
}
|
|
// preserve existing capabilities while we change users
|
|
if err := system.SetKeepCaps(); err != nil {
|
|
return err
|
|
}
|
|
if err := setupUser(config); err != nil {
|
|
return err
|
|
}
|
|
if err := system.ClearKeepCaps(); err != nil {
|
|
return err
|
|
}
|
|
// drop all other capabilities
|
|
if err := w.drop(); err != nil {
|
|
return err
|
|
}
|
|
if config.Cwd != "" {
|
|
if err := syscall.Chdir(config.Cwd); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// syncParentReady sends to the given pipe a JSON payload which indicates that
|
|
// the init is ready to Exec the child process. It then waits for the parent to
|
|
// indicate that it is cleared to Exec.
|
|
func syncParentReady(pipe io.ReadWriter) error {
|
|
// Tell parent.
|
|
if err := json.NewEncoder(pipe).Encode(procReady); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Wait for parent to give the all-clear.
|
|
var procSync syncType
|
|
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
|
|
if err == io.EOF {
|
|
return fmt.Errorf("parent closed synchronisation channel")
|
|
}
|
|
if procSync != procRun {
|
|
return fmt.Errorf("invalid synchronisation flag from parent")
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// joinExistingNamespaces gets all the namespace paths specified for the container and
|
|
// does a setns on the namespace fd so that the current process joins the namespace.
|
|
func joinExistingNamespaces(namespaces []configs.Namespace) error {
|
|
for _, ns := range namespaces {
|
|
if ns.Path != "" {
|
|
f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = system.Setns(f.Fd(), uintptr(ns.Syscall()))
|
|
f.Close()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// setupUser changes the groups, gid, and uid for the user inside the container
|
|
func setupUser(config *initConfig) error {
|
|
// Set up defaults.
|
|
defaultExecUser := user.ExecUser{
|
|
Uid: syscall.Getuid(),
|
|
Gid: syscall.Getgid(),
|
|
Home: "/",
|
|
}
|
|
passwdPath, err := user.GetPasswdPath()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
groupPath, err := user.GetGroupPath()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var addGroups []int
|
|
if len(config.Config.AdditionalGroups) > 0 {
|
|
addGroups, err = user.GetAdditionalGroupsPath(config.Config.AdditionalGroups, groupPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
// before we change to the container's user make sure that the processes STDIO
|
|
// is correctly owned by the user that we are switching to.
|
|
if err := fixStdioPermissions(execUser); err != nil {
|
|
return err
|
|
}
|
|
suppGroups := append(execUser.Sgids, addGroups...)
|
|
if err := syscall.Setgroups(suppGroups); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := system.Setgid(execUser.Gid); err != nil {
|
|
return err
|
|
}
|
|
if err := system.Setuid(execUser.Uid); err != nil {
|
|
return err
|
|
}
|
|
// if we didn't get HOME already, set it based on the user's HOME
|
|
if envHome := os.Getenv("HOME"); envHome == "" {
|
|
if err := os.Setenv("HOME", execUser.Home); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
|
|
// The ownership needs to match because it is created outside of the container and needs to be
|
|
// localized.
|
|
func fixStdioPermissions(u *user.ExecUser) error {
|
|
var null syscall.Stat_t
|
|
if err := syscall.Stat("/dev/null", &null); err != nil {
|
|
return err
|
|
}
|
|
for _, fd := range []uintptr{
|
|
os.Stdin.Fd(),
|
|
os.Stderr.Fd(),
|
|
os.Stdout.Fd(),
|
|
} {
|
|
var s syscall.Stat_t
|
|
if err := syscall.Fstat(int(fd), &s); err != nil {
|
|
return err
|
|
}
|
|
// skip chown of /dev/null if it was used as one of the STDIO fds.
|
|
if s.Rdev == null.Rdev {
|
|
continue
|
|
}
|
|
if err := syscall.Fchown(int(fd), u.Uid, u.Gid); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// setupNetwork sets up and initializes any network interface inside the container.
|
|
func setupNetwork(config *initConfig) error {
|
|
for _, config := range config.Networks {
|
|
strategy, err := getStrategy(config.Type)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := strategy.initialize(config); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func setupRoute(config *configs.Config) error {
|
|
for _, config := range config.Routes {
|
|
_, dst, err := net.ParseCIDR(config.Destination)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
src := net.ParseIP(config.Source)
|
|
if src == nil {
|
|
return fmt.Errorf("Invalid source for route: %s", config.Source)
|
|
}
|
|
gw := net.ParseIP(config.Gateway)
|
|
if gw == nil {
|
|
return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
|
|
}
|
|
l, err := netlink.LinkByName(config.InterfaceName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
route := &netlink.Route{
|
|
Scope: netlink.SCOPE_UNIVERSE,
|
|
Dst: dst,
|
|
Src: src,
|
|
Gw: gw,
|
|
LinkIndex: l.Attrs().Index,
|
|
}
|
|
if err := netlink.RouteAdd(route); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func setupRlimits(config *configs.Config) error {
|
|
for _, rlimit := range config.Rlimits {
|
|
l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
|
|
if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
|
|
return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// setOomScoreAdj writes oomScoreAdj, formatted as a decimal integer, to
// /proc/self/oom_score_adj and returns any write error.
func setOomScoreAdj(oomScoreAdj int) error {
	const oomScoreAdjPath = "/proc/self/oom_score_adj"
	value := []byte(strconv.Itoa(oomScoreAdj))
	return ioutil.WriteFile(oomScoreAdjPath, value, 0700)
}
|
|
|
|
// killCgroupProcesses freezes then iterates over all the processes inside the
|
|
// manager's cgroups sending a SIGKILL to each process then waiting for them to
|
|
// exit.
|
|
func killCgroupProcesses(m cgroups.Manager) error {
|
|
var procs []*os.Process
|
|
if err := m.Freeze(configs.Frozen); err != nil {
|
|
logrus.Warn(err)
|
|
}
|
|
pids, err := m.GetAllPids()
|
|
if err != nil {
|
|
m.Freeze(configs.Thawed)
|
|
return err
|
|
}
|
|
for _, pid := range pids {
|
|
if p, err := os.FindProcess(pid); err == nil {
|
|
procs = append(procs, p)
|
|
if err := p.Kill(); err != nil {
|
|
logrus.Warn(err)
|
|
}
|
|
}
|
|
}
|
|
if err := m.Freeze(configs.Thawed); err != nil {
|
|
logrus.Warn(err)
|
|
}
|
|
for _, p := range procs {
|
|
if _, err := p.Wait(); err != nil {
|
|
logrus.Warn(err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|