nsenter: cloned_binary: expand and add pre-3.11 fallbacks
In order to get around the memfd_create(2) requirement, 0a8e4117e7
("nsenter: clone /proc/self/exe to avoid exposing host binary to
container") added an O_TMPFILE fallback. However, this fallback was
flawed in two ways:
* It required O_TMPFILE, which is relatively new (having been added in
Linux 3.11).
* The fallback choice was made at compile-time, not runtime. This
results in several complications when it comes to running binaries
on different machines to the ones they were built on.
The easiest way to resolve these things is to have fallbacks work in a
more procedural way (though it does make the code unfortunately more
complicated) and to add a new fallback that uses mkostemp(3).
Signed-off-by: Aleksa Sarai <asarai@suse.de>
This commit is contained in:
parent
5b775bf297
commit
2429d59352
|
@ -36,18 +36,21 @@
|
||||||
#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
|
#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
|
||||||
# define SYS_memfd_create __NR_memfd_create
|
# define SYS_memfd_create __NR_memfd_create
|
||||||
#endif
|
#endif
|
||||||
#ifdef SYS_memfd_create
|
|
||||||
# define HAVE_MEMFD_CREATE
|
|
||||||
/* memfd_create(2) flags -- copied from <linux/memfd.h>. */
|
/* memfd_create(2) flags -- copied from <linux/memfd.h>. */
|
||||||
# ifndef MFD_CLOEXEC
|
#ifndef MFD_CLOEXEC
|
||||||
# define MFD_CLOEXEC 0x0001U
|
# define MFD_CLOEXEC 0x0001U
|
||||||
# define MFD_ALLOW_SEALING 0x0002U
|
# define MFD_ALLOW_SEALING 0x0002U
|
||||||
# endif
|
#endif
|
||||||
int memfd_create(const char *name, unsigned int flags)
|
int memfd_create(const char *name, unsigned int flags)
|
||||||
{
|
{
|
||||||
|
#ifdef SYS_memfd_create
|
||||||
return syscall(SYS_memfd_create, name, flags);
|
return syscall(SYS_memfd_create, name, flags);
|
||||||
}
|
#else
|
||||||
|
errno = ENOSYS;
|
||||||
|
return -1;
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* This comes directly from <linux/fcntl.h>. */
|
/* This comes directly from <linux/fcntl.h>. */
|
||||||
#ifndef F_LINUX_SPECIFIC_BASE
|
#ifndef F_LINUX_SPECIFIC_BASE
|
||||||
|
@ -64,11 +67,9 @@ int memfd_create(const char *name, unsigned int flags)
|
||||||
# define F_SEAL_WRITE 0x0008 /* prevent writes */
|
# define F_SEAL_WRITE 0x0008 /* prevent writes */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef HAVE_MEMFD_CREATE
|
#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
|
||||||
# define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
|
#define RUNC_MEMFD_SEALS \
|
||||||
# define RUNC_MEMFD_SEALS \
|
|
||||||
(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
|
(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
|
||||||
#endif
|
|
||||||
|
|
||||||
static void *must_realloc(void *ptr, size_t size)
|
static void *must_realloc(void *ptr, size_t size)
|
||||||
{
|
{
|
||||||
|
@ -92,23 +93,29 @@ static int is_self_cloned(void)
|
||||||
if (fd < 0)
|
if (fd < 0)
|
||||||
return -ENOTRECOVERABLE;
|
return -ENOTRECOVERABLE;
|
||||||
|
|
||||||
#ifdef HAVE_MEMFD_CREATE
|
/* First check memfd. */
|
||||||
ret = fcntl(fd, F_GET_SEALS);
|
ret = fcntl(fd, F_GET_SEALS);
|
||||||
is_cloned = (ret == RUNC_MEMFD_SEALS);
|
if (ret >= 0) {
|
||||||
#else
|
is_cloned = (ret == RUNC_MEMFD_SEALS);
|
||||||
struct stat statbuf = {0};
|
} else {
|
||||||
ret = fstat(fd, &statbuf);
|
/*
|
||||||
if (ret >= 0)
|
* Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
|
||||||
is_cloned = (statbuf.st_nlink == 0);
|
* which appears to have a borked backport of F_GET_SEALS. Either way,
|
||||||
#endif
|
* having a file which has no hardlinks indicates that we aren't using
|
||||||
|
* a host-side "runc" binary and this is something that a container
|
||||||
|
* cannot fake (because unlinking requires being able to resolve the
|
||||||
|
* path that you want to unlink).
|
||||||
|
*/
|
||||||
|
struct stat statbuf = {};
|
||||||
|
if (fstat(fd, &statbuf) >= 0)
|
||||||
|
is_cloned = (statbuf.st_nlink == 0);
|
||||||
|
}
|
||||||
|
|
||||||
close(fd);
|
close(fd);
|
||||||
return is_cloned;
|
return is_cloned;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* Read a given file into a new buffer, and provide the length. */
|
||||||
* Basic wrapper around mmap(2) that gives you the file length so you can
|
|
||||||
* safely treat it as an ordinary buffer. Only gives you read access.
|
|
||||||
*/
|
|
||||||
static char *read_file(char *path, size_t *length)
|
static char *read_file(char *path, size_t *length)
|
||||||
{
|
{
|
||||||
int fd;
|
int fd;
|
||||||
|
@ -191,18 +198,127 @@ error:
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
enum {
|
||||||
|
EFD_NONE = 0,
|
||||||
|
EFD_MEMFD,
|
||||||
|
EFD_FILE,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This comes from <linux/fcntl.h>. We can't hard-code __O_TMPFILE because it
|
||||||
|
* changes depending on the architecture. If we don't have O_TMPFILE we always
|
||||||
|
* have the mkostemp(3) fallback.
|
||||||
|
*/
|
||||||
|
#ifndef O_TMPFILE
|
||||||
|
# if defined(__O_TMPFILE) && defined(O_DIRECTORY)
|
||||||
|
# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static int make_execfd(int *fdtype)
|
||||||
|
{
|
||||||
|
int fd;
|
||||||
|
char template[] = "/tmp/runc-cloned-binary.XXXXXX";
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Try memfd first, it's much nicer since it's easily detected thanks to
|
||||||
|
* sealing and also doesn't require assumptions like /tmp.
|
||||||
|
*/
|
||||||
|
*fdtype = EFD_MEMFD;
|
||||||
|
fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
|
||||||
|
if (fd >= 0)
|
||||||
|
return fd;
|
||||||
|
if (errno != ENOSYS)
|
||||||
|
goto err;
|
||||||
|
|
||||||
|
#ifdef O_TMPFILE
|
||||||
|
/*
|
||||||
|
* Try O_TMPFILE to avoid races where someone might snatch our file. Note
|
||||||
|
* that O_EXCL isn't actually a security measure here (since you can just
|
||||||
|
* fd re-open it and clear O_EXCL).
|
||||||
|
*/
|
||||||
|
*fdtype = EFD_FILE;
|
||||||
|
fd = open("/tmp", O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
|
||||||
|
if (fd >= 0) {
|
||||||
|
struct stat statbuf = {};
|
||||||
|
bool working_otmpfile = false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* open(2) ignores unknown O_* flags -- yeah, I was surprised when I
|
||||||
|
* found this out too. As a result we can't check for EINVAL. However,
|
||||||
|
* if we get nlink != 0 (or EISDIR) then we know that this kernel
|
||||||
|
* doesn't support O_TMPFILE.
|
||||||
|
*/
|
||||||
|
if (fstat(fd, &statbuf) >= 0)
|
||||||
|
working_otmpfile = (statbuf.st_nlink == 0);
|
||||||
|
|
||||||
|
if (working_otmpfile)
|
||||||
|
return fd;
|
||||||
|
|
||||||
|
/* Pretend that we got EISDIR since O_TMPFILE failed. */
|
||||||
|
close(fd);
|
||||||
|
errno = EISDIR;
|
||||||
|
}
|
||||||
|
if (errno != EISDIR)
|
||||||
|
goto err;
|
||||||
|
#endif /* defined(O_TMPFILE) */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Our final option is to create a temporary file the old-school way, and
|
||||||
|
* then unlink it so that nothing else sees it by accident.
|
||||||
|
*/
|
||||||
|
*fdtype = EFD_FILE;
|
||||||
|
fd = mkostemp(template, O_CLOEXEC);
|
||||||
|
if (fd >= 0) {
|
||||||
|
if (unlink(template) >= 0)
|
||||||
|
return fd;
|
||||||
|
close(fd);
|
||||||
|
}
|
||||||
|
|
||||||
|
err:
|
||||||
|
*fdtype = EFD_NONE;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int seal_execfd(int *fd, int fdtype)
|
||||||
|
{
|
||||||
|
switch (fdtype) {
|
||||||
|
case EFD_MEMFD:
|
||||||
|
return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
|
||||||
|
case EFD_FILE: {
|
||||||
|
/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
|
||||||
|
int newfd;
|
||||||
|
char fdpath[PATH_MAX] = {0};
|
||||||
|
|
||||||
|
if (fchmod(*fd, 0100) < 0)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
newfd = open(fdpath, O_PATH | O_CLOEXEC);
|
||||||
|
if (newfd < 0)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
close(*fd);
|
||||||
|
*fd = newfd;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
static int clone_binary(void)
|
static int clone_binary(void)
|
||||||
{
|
{
|
||||||
int binfd, memfd;
|
int binfd, memfd;
|
||||||
struct stat statbuf = {};
|
struct stat statbuf = {};
|
||||||
size_t sent = 0;
|
size_t sent = 0;
|
||||||
|
int fdtype = EFD_NONE;
|
||||||
|
|
||||||
#ifdef HAVE_MEMFD_CREATE
|
memfd = make_execfd(&fdtype);
|
||||||
memfd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
|
if (memfd < 0 || fdtype == EFD_NONE)
|
||||||
#else
|
|
||||||
memfd = open("/tmp", O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0711);
|
|
||||||
#endif
|
|
||||||
if (memfd < 0)
|
|
||||||
return -ENOTRECOVERABLE;
|
return -ENOTRECOVERABLE;
|
||||||
|
|
||||||
binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
|
binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
|
||||||
|
@ -222,25 +338,8 @@ static int clone_binary(void)
|
||||||
if (sent != statbuf.st_size)
|
if (sent != statbuf.st_size)
|
||||||
goto error;
|
goto error;
|
||||||
|
|
||||||
#ifdef HAVE_MEMFD_CREATE
|
if (seal_execfd(&memfd, fdtype) < 0)
|
||||||
int err = fcntl(memfd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
|
|
||||||
if (err < 0)
|
|
||||||
goto error;
|
goto error;
|
||||||
#else
|
|
||||||
/* Need to re-open "memfd" as read-only to avoid execve(2) giving -ETXTBSY. */
|
|
||||||
int newfd;
|
|
||||||
char *fdpath = NULL;
|
|
||||||
|
|
||||||
if (asprintf(&fdpath, "/proc/self/fd/%d", memfd) < 0)
|
|
||||||
goto error;
|
|
||||||
newfd = open(fdpath, O_RDONLY | O_CLOEXEC);
|
|
||||||
free(fdpath);
|
|
||||||
if (newfd < 0)
|
|
||||||
goto error;
|
|
||||||
|
|
||||||
close(memfd);
|
|
||||||
memfd = newfd;
|
|
||||||
#endif
|
|
||||||
return memfd;
|
return memfd;
|
||||||
|
|
||||||
error_binfd:
|
error_binfd:
|
||||||
|
|
Loading…
Reference in New Issue