*: handle unprivileged operations and !dumpable

Effectively, !dumpable makes implementing rootless containers quite
hard, due to a bunch of different operations on /proc/self no longer
being possible without reordering everything.

!dumpable only really makes sense when you are switching between
different security contexts, which is only the case when we are joining
namespaces. Unfortunately this means that !dumpable will still have
issues in this instance, and it should only be necessary to set
!dumpable if we are not joining USER namespaces (new kernels have
protections that make !dumpable no longer necessary). But that's a topic
for another time.

This also includes code to unset and then re-set dumpable when doing the
USER namespace mappings. This should also be safe because in principle
processes in a container can't see us until after we fork into the PID
namespace (which happens after the user mapping).

In rootless containers, it is not possible to set a non-dumpable
process's /proc/self/oom_score_adj (it's owned by root and thus not
writeable). Thus, it needs to be set inside nsexec before we set
ourselves as non-dumpable.

Signed-off-by: Aleksa Sarai <asarai@suse.de>
This commit is contained in:
Aleksa Sarai 2017-01-17 12:25:21 +11:00
parent ef9a4b3155
commit 6bd4bd9030
No known key found for this signature in database
GPG Key ID: 9E18AA267DDB8DB4
5 changed files with 68 additions and 32 deletions

View File

@ -1455,5 +1455,11 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
}
}
// write oom_score_adj
r.AddData(&Bytemsg{
Type: OomScoreAdjAttr,
Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
})
return bytes.NewReader(r.Serialize()), nil
}

View File

@ -6,10 +6,8 @@ import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net"
"os"
"strconv"
"strings"
"syscall"
"unsafe"
@ -369,12 +367,6 @@ func setupRlimits(limits []configs.Rlimit, pid int) error {
return nil
}
func setOomScoreAdj(oomScoreAdj int, pid int) error {
path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600)
}
const _P_PID = 1
type siginfo struct {

View File

@ -11,12 +11,14 @@ import (
// list of known message types we want to send to bootstrap program
// The number is randomly chosen to not conflict with known netlink types
const (
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
NsPathsAttr uint16 = 27282
UidmapAttr uint16 = 27283
GidmapAttr uint16 = 27284
SetgroupAttr uint16 = 27285
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
NsPathsAttr uint16 = 27282
UidmapAttr uint16 = 27283
GidmapAttr uint16 = 27284
SetgroupAttr uint16 = 27285
OomScoreAdjAttr uint16 = 27286
// When syscall.NLA_HDRLEN is in gccgo, take this out.
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
)

View File

@ -72,18 +72,21 @@ struct nlconfig_t {
char *namespaces;
size_t namespaces_len;
uint8_t is_setgroup;
char *oom_score_adj;
size_t oom_score_adj_len;
};
/*
* List of netlink message types sent to us as part of bootstrapping the init.
* These constants are defined in libcontainer/message_linux.go.
*/
#define INIT_MSG 62000
#define INIT_MSG 62000
#define CLONE_FLAGS_ATTR 27281
#define NS_PATHS_ATTR 27282
#define UIDMAP_ATTR 27283
#define GIDMAP_ATTR 27284
#define UIDMAP_ATTR 27283
#define GIDMAP_ATTR 27284
#define SETGROUP_ATTR 27285
#define OOM_SCORE_ADJ_ATTR 27286
/*
* Use the raw syscall for versions of glibc which don't include a function for
@ -186,7 +189,7 @@ static void update_setgroups(int pid, enum policy_t setgroup)
}
}
static void update_uidmap(int pid, char *map, int map_len)
static void update_uidmap(int pid, char *map, size_t map_len)
{
if (map == NULL || map_len <= 0)
return;
@ -195,7 +198,7 @@ static void update_uidmap(int pid, char *map, int map_len)
bail("failed to update /proc/%d/uid_map", pid);
}
static void update_gidmap(int pid, char *map, int map_len)
static void update_gidmap(int pid, char *map, size_t map_len)
{
if (map == NULL || map_len <= 0)
return;
@ -204,6 +207,15 @@ static void update_gidmap(int pid, char *map, int map_len)
bail("failed to update /proc/%d/gid_map", pid);
}
static void update_oom_score_adj(char *data, size_t len)
{
if (data == NULL || len <= 0)
return;
if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
bail("failed to update /proc/self/oom_score_adj");
}
/* A dummy function that just jumps to the given jumpval. */
static int child_func(void *arg) __attribute__ ((noinline));
static int child_func(void *arg)
@ -317,6 +329,10 @@ static void nl_parse(int fd, struct nlconfig_t *config)
case CLONE_FLAGS_ATTR:
config->cloneflags = readint32(current);
break;
case OOM_SCORE_ADJ_ATTR:
config->oom_score_adj = current;
config->oom_score_adj_len = payload_len;
break;
case NS_PATHS_ATTR:
config->namespaces = current;
config->namespaces_len = payload_len;
@ -425,14 +441,32 @@ void nsexec(void)
if (pipenum == -1)
return;
/* make the process non-dumpable */
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) != 0) {
bail("failed to set process as non-dumpable");
}
/* Parse all of the netlink configuration. */
nl_parse(pipenum, &config);
/* Set oom_score_adj. This has to be done before !dumpable because
* /proc/self/oom_score_adj is not writeable unless you're an privileged
* user (if !dumpable is set). All children inherit their parent's
* oom_score_adj value on fork(2) so this will always be propagated
* properly.
*/
update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
/*
* Make the process non-dumpable, to avoid various race conditions that
* could cause processes in namespaces we're joining to access host
* resources (or potentially execute code).
*
* However, if the number of namespaces we are joining is 0, we are not
* going to be switching to a different security context. Thus setting
* ourselves to be non-dumpable only breaks things (like rootless
* containers), which is the recommendation from the kernel folks.
*/
if (config.namespaces) {
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to set process as non-dumpable");
}
/* Pipe so we can tell the child when we've finished setting up. */
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
bail("failed to setup sync pipe between parent and child");
@ -681,6 +715,11 @@ void nsexec(void)
* clone_parent rant). So signal our parent to hook us up.
*/
/* Switching is only necessary if we joined namespaces. */
if (config.namespaces) {
if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
bail("failed to set process as dumpable");
}
s = SYNC_USERMAP_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
@ -691,6 +730,11 @@ void nsexec(void)
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
if (s != SYNC_USERMAP_ACK)
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
/* Switching is only necessary if we joined namespaces. */
if (config.namespaces) {
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to set process as dumpable");
}
}
/*

View File

@ -85,10 +85,6 @@ func (p *setnsProcess) start() (err error) {
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
}
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score")
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
@ -285,10 +281,6 @@ func (p *initProcess) start() error {
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score for ready process")
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {