*: handle unprivileged operations and !dumpable
Effectively, !dumpable makes implementing rootless containers quite hard, due to a bunch of different operations on /proc/self no longer being possible without reordering everything. !dumpable only really makes sense when you are switching between different security contexts, which is only the case when we are joining namespaces. Unfortunately this means that !dumpable will still have issues in this instance, and it should only be necessary to set !dumpable if we are not joining USER namespaces (new kernels have protections that make !dumpable no longer necessary). But that's a topic for another time. This also includes code to unset and then re-set dumpable when doing the USER namespace mappings. This should also be safe because in principle processes in a container can't see us until after we fork into the PID namespace (which happens after the user mapping). In rootless containers, it is not possible to set a non-dumpable process's /proc/self/oom_score_adj (it's owned by root and thus not writeable). Thus, it needs to be set inside nsexec before we set ourselves as non-dumpable. Signed-off-by: Aleksa Sarai <asarai@suse.de>
This commit is contained in:
parent
ef9a4b3155
commit
6bd4bd9030
|
@ -1455,5 +1455,11 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
|
|||
}
|
||||
}
|
||||
|
||||
// write oom_score_adj
|
||||
r.AddData(&Bytemsg{
|
||||
Type: OomScoreAdjAttr,
|
||||
Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
|
||||
})
|
||||
|
||||
return bytes.NewReader(r.Serialize()), nil
|
||||
}
|
||||
|
|
|
@ -6,10 +6,8 @@ import (
|
|||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"net"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
|
@ -369,12 +367,6 @@ func setupRlimits(limits []configs.Rlimit, pid int) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func setOomScoreAdj(oomScoreAdj int, pid int) error {
|
||||
path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
|
||||
|
||||
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600)
|
||||
}
|
||||
|
||||
const _P_PID = 1
|
||||
|
||||
type siginfo struct {
|
||||
|
|
|
@ -11,12 +11,14 @@ import (
|
|||
// list of known message types we want to send to bootstrap program
|
||||
// The number is randomly chosen to not conflict with known netlink types
|
||||
const (
|
||||
InitMsg uint16 = 62000
|
||||
CloneFlagsAttr uint16 = 27281
|
||||
NsPathsAttr uint16 = 27282
|
||||
UidmapAttr uint16 = 27283
|
||||
GidmapAttr uint16 = 27284
|
||||
SetgroupAttr uint16 = 27285
|
||||
InitMsg uint16 = 62000
|
||||
CloneFlagsAttr uint16 = 27281
|
||||
NsPathsAttr uint16 = 27282
|
||||
UidmapAttr uint16 = 27283
|
||||
GidmapAttr uint16 = 27284
|
||||
SetgroupAttr uint16 = 27285
|
||||
OomScoreAdjAttr uint16 = 27286
|
||||
|
||||
// When syscall.NLA_HDRLEN is in gccgo, take this out.
|
||||
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
|
||||
)
|
||||
|
|
|
@ -72,18 +72,21 @@ struct nlconfig_t {
|
|||
char *namespaces;
|
||||
size_t namespaces_len;
|
||||
uint8_t is_setgroup;
|
||||
char *oom_score_adj;
|
||||
size_t oom_score_adj_len;
|
||||
};
|
||||
|
||||
/*
|
||||
* List of netlink message types sent to us as part of bootstrapping the init.
|
||||
* These constants are defined in libcontainer/message_linux.go.
|
||||
*/
|
||||
#define INIT_MSG 62000
|
||||
#define INIT_MSG 62000
|
||||
#define CLONE_FLAGS_ATTR 27281
|
||||
#define NS_PATHS_ATTR 27282
|
||||
#define UIDMAP_ATTR 27283
|
||||
#define GIDMAP_ATTR 27284
|
||||
#define UIDMAP_ATTR 27283
|
||||
#define GIDMAP_ATTR 27284
|
||||
#define SETGROUP_ATTR 27285
|
||||
#define OOM_SCORE_ADJ_ATTR 27286
|
||||
|
||||
/*
|
||||
* Use the raw syscall for versions of glibc which don't include a function for
|
||||
|
@ -186,7 +189,7 @@ static void update_setgroups(int pid, enum policy_t setgroup)
|
|||
}
|
||||
}
|
||||
|
||||
static void update_uidmap(int pid, char *map, int map_len)
|
||||
static void update_uidmap(int pid, char *map, size_t map_len)
|
||||
{
|
||||
if (map == NULL || map_len <= 0)
|
||||
return;
|
||||
|
@ -195,7 +198,7 @@ static void update_uidmap(int pid, char *map, int map_len)
|
|||
bail("failed to update /proc/%d/uid_map", pid);
|
||||
}
|
||||
|
||||
static void update_gidmap(int pid, char *map, int map_len)
|
||||
static void update_gidmap(int pid, char *map, size_t map_len)
|
||||
{
|
||||
if (map == NULL || map_len <= 0)
|
||||
return;
|
||||
|
@ -204,6 +207,15 @@ static void update_gidmap(int pid, char *map, int map_len)
|
|||
bail("failed to update /proc/%d/gid_map", pid);
|
||||
}
|
||||
|
||||
static void update_oom_score_adj(char *data, size_t len)
|
||||
{
|
||||
if (data == NULL || len <= 0)
|
||||
return;
|
||||
|
||||
if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
|
||||
bail("failed to update /proc/self/oom_score_adj");
|
||||
}
|
||||
|
||||
/* A dummy function that just jumps to the given jumpval. */
|
||||
static int child_func(void *arg) __attribute__ ((noinline));
|
||||
static int child_func(void *arg)
|
||||
|
@ -317,6 +329,10 @@ static void nl_parse(int fd, struct nlconfig_t *config)
|
|||
case CLONE_FLAGS_ATTR:
|
||||
config->cloneflags = readint32(current);
|
||||
break;
|
||||
case OOM_SCORE_ADJ_ATTR:
|
||||
config->oom_score_adj = current;
|
||||
config->oom_score_adj_len = payload_len;
|
||||
break;
|
||||
case NS_PATHS_ATTR:
|
||||
config->namespaces = current;
|
||||
config->namespaces_len = payload_len;
|
||||
|
@ -425,14 +441,32 @@ void nsexec(void)
|
|||
if (pipenum == -1)
|
||||
return;
|
||||
|
||||
/* make the process non-dumpable */
|
||||
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) != 0) {
|
||||
bail("failed to set process as non-dumpable");
|
||||
}
|
||||
|
||||
/* Parse all of the netlink configuration. */
|
||||
nl_parse(pipenum, &config);
|
||||
|
||||
/* Set oom_score_adj. This has to be done before !dumpable because
|
||||
* /proc/self/oom_score_adj is not writeable unless you're a privileged
|
||||
* user (if !dumpable is set). All children inherit their parent's
|
||||
* oom_score_adj value on fork(2) so this will always be propagated
|
||||
* properly.
|
||||
*/
|
||||
update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
|
||||
|
||||
/*
|
||||
* Make the process non-dumpable, to avoid various race conditions that
|
||||
* could cause processes in namespaces we're joining to access host
|
||||
* resources (or potentially execute code).
|
||||
*
|
||||
* However, if the number of namespaces we are joining is 0, we are not
|
||||
* going to be switching to a different security context. Thus setting
|
||||
* ourselves to be non-dumpable only breaks things (like rootless
|
||||
* containers); skipping it in that case is what the kernel folks recommend.
|
||||
*/
|
||||
if (config.namespaces) {
|
||||
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
|
||||
bail("failed to set process as non-dumpable");
|
||||
}
|
||||
|
||||
/* Pipe so we can tell the child when we've finished setting up. */
|
||||
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
|
||||
bail("failed to setup sync pipe between parent and child");
|
||||
|
@ -681,6 +715,11 @@ void nsexec(void)
|
|||
* clone_parent rant). So signal our parent to hook us up.
|
||||
*/
|
||||
|
||||
/* Switching is only necessary if we joined namespaces. */
|
||||
if (config.namespaces) {
|
||||
if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
|
||||
bail("failed to set process as dumpable");
|
||||
}
|
||||
s = SYNC_USERMAP_PLS;
|
||||
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
|
||||
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
|
||||
|
@ -691,6 +730,11 @@ void nsexec(void)
|
|||
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
|
||||
if (s != SYNC_USERMAP_ACK)
|
||||
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
|
||||
/* Switching is only necessary if we joined namespaces. */
|
||||
if (config.namespaces) {
|
||||
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
|
||||
bail("failed to set process as dumpable");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -85,10 +85,6 @@ func (p *setnsProcess) start() (err error) {
|
|||
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
|
||||
}
|
||||
}
|
||||
// set oom_score_adj
|
||||
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
|
||||
return newSystemErrorWithCause(err, "setting oom score")
|
||||
}
|
||||
// set rlimits, this has to be done here because we lose permissions
|
||||
// to raise the limits once we enter a user-namespace
|
||||
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
|
||||
|
@ -285,10 +281,6 @@ func (p *initProcess) start() error {
|
|||
if err := p.manager.Set(p.config.Config); err != nil {
|
||||
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
|
||||
}
|
||||
// set oom_score_adj
|
||||
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
|
||||
return newSystemErrorWithCause(err, "setting oom score for ready process")
|
||||
}
|
||||
// set rlimits, this has to be done here because we lose permissions
|
||||
// to raise the limits once we enter a user-namespace
|
||||
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
|
||||
|
|
Loading…
Reference in New Issue