From 6dbb716877728ce4eb51619885ef6ef4ada9565f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 1 May 2020 15:06:43 +0100 Subject: [PATCH 1/6] virtiofsd: add --rlimit-nofile=NUM option Make it possible to specify the RLIMIT_NOFILE on the command-line. Users running multiple virtiofsd processes should allocate a certain number to each process so that the system-wide limit can never be exhausted. When this option is set to 0 the rlimit is left at its current value. This is useful when a management tool wants to configure the rlimit itself. The default behavior remains unchanged: try to set the limit to 1,000,000 file descriptors if the current rlimit is lower. Signed-off-by: Stefan Hajnoczi Reviewed-by: Dr. David Alan Gilbert Message-Id: <20200501140644.220940-2-stefanha@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/fuse_lowlevel.h | 1 + tools/virtiofsd/helper.c | 23 +++++++++++++++++++++++ tools/virtiofsd/passthrough_ll.c | 22 ++++++++-------------- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h index 8f6d705b5c..562fd5241e 100644 --- a/tools/virtiofsd/fuse_lowlevel.h +++ b/tools/virtiofsd/fuse_lowlevel.h @@ -1777,6 +1777,7 @@ struct fuse_cmdline_opts { int syslog; int log_level; unsigned int max_idle_threads; + unsigned long rlimit_nofile; }; /** diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c index 819c2bc13c..dc59f38af0 100644 --- a/tools/virtiofsd/helper.c +++ b/tools/virtiofsd/helper.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #define FUSE_HELPER_OPT(t, p) \ @@ -53,6 +55,7 @@ static const struct fuse_opt fuse_helper_opts[] = { FUSE_HELPER_OPT("subtype=", nodefault_subtype), FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), + FUSE_HELPER_OPT("--rlimit-nofile=%lu", rlimit_nofile), FUSE_HELPER_OPT("--syslog", syslog), FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG), FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO), @@ -171,6 +174,9 @@ void fuse_cmdline_help(void) " default: no_writeback\n" " -o xattr|no_xattr enable/disable xattr\n" " default: no_xattr\n" + " --rlimit-nofile= set maximum number of file descriptors\n" + " (0 leaves rlimit unchanged)\n" + " default: 1,000,000 if the current rlimit is lower\n" ); } @@ -191,11 +197,28 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key, } } +static unsigned long get_default_rlimit_nofile(void) +{ + rlim_t max_fds = 1000000; /* our default RLIMIT_NOFILE target */ + struct rlimit rlim; + + if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { + fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); + exit(1); + } + + if (rlim.rlim_cur >= max_fds) { + return 0; /* we have more fds available than required! */ + } + return max_fds; +} + int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts) { memset(opts, 0, sizeof(struct fuse_cmdline_opts)); opts->max_idle_threads = 10; + opts->rlimit_nofile = get_default_rlimit_nofile(); opts->foreground = 1; if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) == diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 4c35c95b25..f7b9c1d20c 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -2707,24 +2707,18 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, setup_seccomp(enable_syslog); } -/* Raise the maximum number of open file descriptors */ -static void setup_nofile_rlimit(void) +/* Set the maximum number of open file descriptors */ +static void setup_nofile_rlimit(unsigned long rlimit_nofile) { - const rlim_t max_fds = 1000000; - struct rlimit rlim; + struct rlimit rlim = { + .rlim_cur = rlimit_nofile, + .rlim_max = rlimit_nofile, + }; - if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { - fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); - exit(1); - } - - if (rlim.rlim_cur >= max_fds) { + if (rlimit_nofile == 0) { return; /* nothing to do */ } - rlim.rlim_cur = max_fds; - rlim.rlim_max = max_fds; - if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) { /* Ignore SELinux denials */ if (errno == EPERM) { @@ -2977,7 +2971,7 @@ int main(int argc, char *argv[]) fuse_daemonize(opts.foreground); - setup_nofile_rlimit(); + setup_nofile_rlimit(opts.rlimit_nofile); /* Must be before sandbox since it wants /proc */ setup_capng(); From 8c1d353d107b4fc344e27f2f08ea7fa25de2eea2 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 1 May 2020 15:06:44 +0100 Subject: [PATCH 2/6] virtiofsd: stay below fs.file-max sysctl value (CVE-2020-10717) The system-wide fs.file-max sysctl value determines how many files can be open. It defaults to a value calculated based on the machine's RAM size. Previously virtiofsd would try to set RLIMIT_NOFILE to 1,000,000 and this allowed the FUSE client to exhaust the number of open files system-wide on Linux hosts with less than 10 GB of RAM! Take fs.file-max into account when choosing the default RLIMIT_NOFILE value. Fixes: CVE-2020-10717 Reported-by: Yuval Avrahami Signed-off-by: Stefan Hajnoczi Reviewed-by: Dr. David Alan Gilbert Message-Id: <20200501140644.220940-3-stefanha@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/helper.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c index dc59f38af0..00a1ef666a 100644 --- a/tools/virtiofsd/helper.c +++ b/tools/virtiofsd/helper.c @@ -176,7 +176,8 @@ void fuse_cmdline_help(void) " default: no_xattr\n" " --rlimit-nofile= set maximum number of file descriptors\n" " (0 leaves rlimit unchanged)\n" - " default: 1,000,000 if the current rlimit is lower\n" + " default: min(1000000, fs.file-max - 16384)\n" + " if the current rlimit is lower\n" ); } @@ -199,9 +200,32 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key, static unsigned long get_default_rlimit_nofile(void) { + g_autofree gchar *file_max_str = NULL; + const rlim_t reserved_fds = 16384; /* leave at least this many fds free */ rlim_t max_fds = 1000000; /* our default RLIMIT_NOFILE target */ + rlim_t file_max; struct rlimit rlim; + /* + * Reduce max_fds below the system-wide maximum, if necessary. This + * ensures there are fds available for other processes so we don't + * cause resource exhaustion. + */ + if (!g_file_get_contents("/proc/sys/fs/file-max", &file_max_str, + NULL, NULL)) { + fuse_log(FUSE_LOG_ERR, "can't read /proc/sys/fs/file-max\n"); + exit(1); + } + file_max = g_ascii_strtoull(file_max_str, NULL, 10); + if (file_max < 2 * reserved_fds) { + fuse_log(FUSE_LOG_ERR, + "The fs.file-max sysctl is too low (%lu) to allow a " + "reasonable number of open files.\n", + (unsigned long)file_max); + exit(1); + } + max_fds = MIN(file_max - reserved_fds, max_fds); + if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); exit(1); From 397ae982f4df46e7d4b2625c431062c9146f3b83 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 29 Apr 2020 14:47:33 +0200 Subject: [PATCH 3/6] virtiofsd: jail lo->proc_self_fd While it's not possible to escape the proc filesystem through lo->proc_self_fd, it is possible to escape to the root of the proc filesystem itself through "../..". Use a temporary mount for opening lo->proc_self_fd, that has it's root at /proc/self/fd/, preventing access to the ancestor directories. Signed-off-by: Miklos Szeredi Message-Id: <20200429124733.22488-1-mszeredi@redhat.com> Reviewed-by: Stefan Hajnoczi Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/passthrough_ll.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index f7b9c1d20c..d7a6474b6e 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -2536,6 +2536,8 @@ static void print_capabilities(void) static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) { pid_t child; + char template[] = "virtiofsd-XXXXXX"; + char *tmpdir; /* * Create a new pid namespace for *child* processes. We'll have to @@ -2597,12 +2599,33 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) exit(1); } - /* Now we can get our /proc/self/fd directory file descriptor */ - lo->proc_self_fd = open("/proc/self/fd", O_PATH); - if (lo->proc_self_fd == -1) { - fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n"); + tmpdir = mkdtemp(template); + if (!tmpdir) { + fuse_log(FUSE_LOG_ERR, "tmpdir(%s): %m\n", template); exit(1); } + + if (mount("/proc/self/fd", tmpdir, NULL, MS_BIND, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, %s, MS_BIND): %m\n", + tmpdir); + exit(1); + } + + /* Now we can get our /proc/self/fd directory file descriptor */ + lo->proc_self_fd = open(tmpdir, O_PATH); + if (lo->proc_self_fd == -1) { + fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", tmpdir); + exit(1); + } + + if (umount2(tmpdir, MNT_DETACH) < 0) { + fuse_log(FUSE_LOG_ERR, "umount2(%s, MNT_DETACH): %m\n", tmpdir); + exit(1); + } + + if (rmdir(tmpdir) < 0) { + fuse_log(FUSE_LOG_ERR, "rmdir(%s): %m\n", tmpdir); + } } /* From ace0829c0d08f0e5f1451e402e94495bc2166772 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 24 Apr 2020 15:35:16 +0200 Subject: [PATCH 4/6] virtiofsd: Show submounts Currently, setup_mounts() bind-mounts the shared directory without MS_REC. This makes all submounts disappear. Pass MS_REC so that the guest can see submounts again. Fixes: 5baa3b8e95064c2434bd9e2f312edd5e9ae275dc Signed-off-by: Max Reitz Message-Id: <20200424133516.73077-1-mreitz@redhat.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. David Alan Gilbert Changed Fixes to point to the commit with the problem rather than the commit that turned it on --- tools/virtiofsd/passthrough_ll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index d7a6474b6e..7873692168 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -2666,7 +2666,7 @@ static void setup_mounts(const char *source) int oldroot; int newroot; - if (mount(source, source, NULL, MS_BIND, NULL) < 0) { + if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) { fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); exit(1); } From a59feb483b8fae24d043569ccfcc97ea23d54a02 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 16 Apr 2020 17:49:06 +0100 Subject: [PATCH 5/6] virtiofsd: only retain file system capabilities virtiofsd runs as root but only needs a subset of root's Linux capabilities(7). As a file server its purpose is to create and access files on behalf of a client. It needs to be able to access files with arbitrary uid/gid owners. It also needs to be create device nodes. Introduce a Linux capabilities(7) whitelist and drop all capabilities that we don't need, making the virtiofsd process less powerful than a regular uid root process. # cat /proc/PID/status ... Before After CapInh: 0000000000000000 0000000000000000 CapPrm: 0000003fffffffff 00000000880000df CapEff: 0000003fffffffff 00000000880000df CapBnd: 0000003fffffffff 0000000000000000 CapAmb: 0000000000000000 0000000000000000 Note that file capabilities cannot be used to achieve the same effect on the virtiofsd executable because mount is used during sandbox setup. Therefore we drop capabilities programmatically at the right point during startup. This patch only affects the sandboxed child process. The parent process that sits in waitpid(2) still has full root capabilities and will be addressed in the next patch. Signed-off-by: Stefan Hajnoczi Message-Id: <20200416164907.244868-2-stefanha@redhat.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/passthrough_ll.c | 38 ++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 7873692168..e49650b63d 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -2718,6 +2718,43 @@ static void setup_mounts(const char *source) close(oldroot); } +/* + * Only keep whitelisted capabilities that are needed for file system operation + */ +static void setup_capabilities(void) +{ + pthread_mutex_lock(&cap.mutex); + capng_restore_state(&cap.saved); + + /* + * Whitelist file system-related capabilities that are needed for a file + * server to act like root. Drop everything else like networking and + * sysadmin capabilities. + * + * Exclusions: + * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl + * and we don't support that. + * 2. CAP_MAC_OVERRIDE is not included because it only seems to be + * used by the Smack LSM. Omit it until there is demand for it. + */ + capng_setpid(syscall(SYS_gettid)); + capng_clear(CAPNG_SELECT_BOTH); + capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE, + CAP_CHOWN, + CAP_DAC_OVERRIDE, + CAP_DAC_READ_SEARCH, + CAP_FOWNER, + CAP_FSETID, + CAP_SETGID, + CAP_SETUID, + CAP_MKNOD, + CAP_SETFCAP); + capng_apply(CAPNG_SELECT_BOTH); + + cap.saved = capng_save_state(); + pthread_mutex_unlock(&cap.mutex); +} + /* * Lock down this process to prevent access to other processes or files outside * source directory. This reduces the impact of arbitrary code execution bugs. @@ -2728,6 +2765,7 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, setup_namespaces(lo, se); setup_mounts(lo->source); setup_seccomp(enable_syslog); + setup_capabilities(); } /* Set the maximum number of open file descriptors */ From 66502bbca37ca7a3bfa57e82cfc03b89a7a11eae Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 16 Apr 2020 17:49:07 +0100 Subject: [PATCH 6/6] virtiofsd: drop all capabilities in the wait parent process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All this process does is wait for its child. No capabilities are needed. Signed-off-by: Stefan Hajnoczi Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/passthrough_ll.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index e49650b63d..3ba1d90984 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -2530,6 +2530,17 @@ static void print_capabilities(void) printf("}\n"); } +/* + * Drop all Linux capabilities because the wait parent process only needs to + * sit in waitpid(2) and terminate. + */ +static void setup_wait_parent_capabilities(void) +{ + capng_setpid(syscall(SYS_gettid)); + capng_clear(CAPNG_SELECT_BOTH); + capng_apply(CAPNG_SELECT_BOTH); +} + /* * Move to a new mount, net, and pid namespaces to isolate this process. */ @@ -2563,6 +2574,8 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) pid_t waited; int wstatus; + setup_wait_parent_capabilities(); + /* The parent waits for the child */ do { waited = waitpid(child, &wstatus, 0);