// +build linux package libcontainer import ( "encoding/json" "fmt" "io" "io/ioutil" "net" "os" "strconv" "strings" "syscall" "github.com/Sirupsen/logrus" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/user" "github.com/opencontainers/runc/libcontainer/utils" "github.com/vishvananda/netlink" ) type initType string const ( initSetns initType = "setns" initStandard initType = "standard" ) type pid struct { Pid int `json:"pid"` } // network is an internal struct used to setup container networks. type network struct { configs.Network // TempVethPeerName is a unique temporary veth peer name that was placed into // the container's namespace. TempVethPeerName string `json:"temp_veth_peer_name"` } // initConfig is used for transferring parameters from Exec() to Init() type initConfig struct { Args []string `json:"args"` Env []string `json:"env"` Cwd string `json:"cwd"` Capabilities []string `json:"capabilities"` ProcessLabel string `json:"process_label"` AppArmorProfile string `json:"apparmor_profile"` NoNewPrivileges bool `json:"no_new_privileges"` User string `json:"user"` Config *configs.Config `json:"config"` Console string `json:"console"` Networks []*network `json:"network"` PassedFilesCount int `json:"passed_files_count"` ContainerId string `json:"containerid"` Rlimits []configs.Rlimit `json:"rlimits"` } type initer interface { Init(s chan os.Signal) error } func newContainerInit(t initType, pipe *os.File) (initer, error) { var config *initConfig if err := json.NewDecoder(pipe).Decode(&config); err != nil { return nil, err } if err := populateProcessEnvironment(config.Env); err != nil { return nil, err } switch t { case initSetns: return &linuxSetnsInit{ config: config, }, nil case initStandard: return &linuxStandardInit{ pipe: pipe, parentPid: syscall.Getppid(), config: config, }, nil } return nil, fmt.Errorf("unknown init type %q", t) } // populateProcessEnvironment loads the provided environment variables into the // current processes's environment. func populateProcessEnvironment(env []string) error { for _, pair := range env { p := strings.SplitN(pair, "=", 2) if len(p) < 2 { return fmt.Errorf("invalid environment '%v'", pair) } if err := os.Setenv(p[0], p[1]); err != nil { return err } } return nil } // finalizeNamespace drops the caps, sets the correct user // and working dir, and closes any leaked file descriptors // before executing the command inside the namespace func finalizeNamespace(config *initConfig) error { // Ensure that all unwanted fds we may have accidentally // inherited are marked close-on-exec so they stay out of the // container if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil { return err } capabilities := config.Config.Capabilities if config.Capabilities != nil { capabilities = config.Capabilities } w, err := newCapWhitelist(capabilities) if err != nil { return err } // drop capabilities in bounding set before changing user if err := w.dropBoundingSet(); err != nil { return err } // preserve existing capabilities while we change users if err := system.SetKeepCaps(); err != nil { return err } if err := setupUser(config); err != nil { return err } if err := system.ClearKeepCaps(); err != nil { return err } // drop all other capabilities if err := w.drop(); err != nil { return err } if config.Cwd != "" { if err := syscall.Chdir(config.Cwd); err != nil { return err } } return nil } // syncParentReady sends to the given pipe a JSON payload which indicates that // the init is ready to Exec the child process. It then waits for the parent to // indicate that it is cleared to Exec. func syncParentReady(pipe io.ReadWriter) error { // Tell parent. if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil { return err } // Wait for parent to give the all-clear. var procSync syncT if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { if err == io.EOF { return fmt.Errorf("parent closed synchronisation channel") } if procSync.Type != procRun { return fmt.Errorf("invalid synchronisation flag from parent") } } return nil } // syncParentHooks sends to the given pipe a JSON payload which indicates that // the parent should execute pre-start hooks. It then waits for the parent to // indicate that it is cleared to resume. func syncParentHooks(pipe io.ReadWriter) error { // Tell parent. if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil { return err } // Wait for parent to give the all-clear. var procSync syncT if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { if err == io.EOF { return fmt.Errorf("parent closed synchronisation channel") } if procSync.Type != procResume { return fmt.Errorf("invalid synchronisation flag from parent") } } return nil } // setupUser changes the groups, gid, and uid for the user inside the container func setupUser(config *initConfig) error { // Set up defaults. defaultExecUser := user.ExecUser{ Uid: syscall.Getuid(), Gid: syscall.Getgid(), Home: "/", } passwdPath, err := user.GetPasswdPath() if err != nil { return err } groupPath, err := user.GetGroupPath() if err != nil { return err } execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) if err != nil { return err } var addGroups []int if len(config.Config.AdditionalGroups) > 0 { addGroups, err = user.GetAdditionalGroupsPath(config.Config.AdditionalGroups, groupPath) if err != nil { return err } } // before we change to the container's user make sure that the processes STDIO // is correctly owned by the user that we are switching to. if err := fixStdioPermissions(execUser); err != nil { return err } suppGroups := append(execUser.Sgids, addGroups...) if err := syscall.Setgroups(suppGroups); err != nil { return err } if err := system.Setgid(execUser.Gid); err != nil { return err } if err := system.Setuid(execUser.Uid); err != nil { return err } // if we didn't get HOME already, set it based on the user's HOME if envHome := os.Getenv("HOME"); envHome == "" { if err := os.Setenv("HOME", execUser.Home); err != nil { return err } } return nil } // fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user. // The ownership needs to match because it is created outside of the container and needs to be // localized. func fixStdioPermissions(u *user.ExecUser) error { var null syscall.Stat_t if err := syscall.Stat("/dev/null", &null); err != nil { return err } for _, fd := range []uintptr{ os.Stdin.Fd(), os.Stderr.Fd(), os.Stdout.Fd(), } { var s syscall.Stat_t if err := syscall.Fstat(int(fd), &s); err != nil { return err } // skip chown of /dev/null if it was used as one of the STDIO fds. if s.Rdev == null.Rdev { continue } if err := syscall.Fchown(int(fd), u.Uid, u.Gid); err != nil { return err } } return nil } // setupNetwork sets up and initializes any network interface inside the container. func setupNetwork(config *initConfig) error { for _, config := range config.Networks { strategy, err := getStrategy(config.Type) if err != nil { return err } if err := strategy.initialize(config); err != nil { return err } } return nil } func setupRoute(config *configs.Config) error { for _, config := range config.Routes { _, dst, err := net.ParseCIDR(config.Destination) if err != nil { return err } src := net.ParseIP(config.Source) if src == nil { return fmt.Errorf("Invalid source for route: %s", config.Source) } gw := net.ParseIP(config.Gateway) if gw == nil { return fmt.Errorf("Invalid gateway for route: %s", config.Gateway) } l, err := netlink.LinkByName(config.InterfaceName) if err != nil { return err } route := &netlink.Route{ Scope: netlink.SCOPE_UNIVERSE, Dst: dst, Src: src, Gw: gw, LinkIndex: l.Attrs().Index, } if err := netlink.RouteAdd(route); err != nil { return err } } return nil } func setupRlimits(limits []configs.Rlimit, pid int) error { for _, rlimit := range limits { if err := system.Prlimit(pid, rlimit.Type, syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil { return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err) } } return nil } func setOomScoreAdj(oomScoreAdj int, pid int) error { path := fmt.Sprintf("/proc/%d/oom_score_adj", pid) return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600) } // killCgroupProcesses freezes then iterates over all the processes inside the // manager's cgroups sending a SIGKILL to each process then waiting for them to // exit. func killCgroupProcesses(m cgroups.Manager) error { var procs []*os.Process if err := m.Freeze(configs.Frozen); err != nil { logrus.Warn(err) } pids, err := m.GetAllPids() if err != nil { m.Freeze(configs.Thawed) return err } for _, pid := range pids { p, err := os.FindProcess(pid) if err != nil { logrus.Warn(err) continue } procs = append(procs, p) if err := p.Kill(); err != nil { logrus.Warn(err) } } if err := m.Freeze(configs.Thawed); err != nil { logrus.Warn(err) } for _, p := range procs { if _, err := p.Wait(); err != nil { logrus.Warn(err) } } return nil }