From 0958ee89b6fb8542867ca68203626d6aef522e8f Mon Sep 17 00:00:00 2001 From: Wainer dos Santos Moschetta Date: Mon, 1 Feb 2021 18:14:56 -0300 Subject: [PATCH 1/6] virtiofsd: Allow to build it without the tools This changed the Meson build script to allow virtiofsd be built even though the tools build is disabled, thus honoring the --enable-virtiofsd option. Fixes: cece116c939d219070b250338439c2d16f94e3da (configure: add option for virtiofsd) Signed-off-by: Wainer dos Santos Moschetta Message-Id: <20210201211456.1133364-2-wainersm@redhat.com> Reviewed-by: Misono Tomohiro Reviewed-by: Stefan Hajnoczi Signed-off-by: Dr. David Alan Gilbert --- tools/meson.build | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/meson.build b/tools/meson.build index fdce66857d..3e5a0abfa2 100644 --- a/tools/meson.build +++ b/tools/meson.build @@ -10,8 +10,11 @@ if get_option('virtiofsd').enabled() error('virtiofsd requires Linux') elif not seccomp.found() or not libcap_ng.found() error('virtiofsd requires libcap-ng-devel and seccomp-devel') - elif not have_tools or 'CONFIG_VHOST_USER' not in config_host - error('virtiofsd needs tools and vhost-user support') + elif 'CONFIG_VHOST_USER' not in config_host + error('virtiofsd needs vhost-user support') + else + # Disabled all the tools but virtiofsd. + have_virtiofsd = true endif endif elif get_option('virtiofsd').disabled() or not have_system From 525a3030a82714693c2045d390b698dd4e2090bd Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Wed, 3 Feb 2021 19:24:34 +0100 Subject: [PATCH 2/6] virtiofsd: vu_dispatch locking should never fail pthread_rwlock_rdlock() and pthread_rwlock_wrlock() can fail if a deadlock condition is detected or the current thread already owns the lock. They can also fail, like pthread_rwlock_unlock(), if the mutex wasn't properly initialized. None of these are ever expected to happen with fv_VuDev::vu_dispatch_rwlock. Some users already check the return value and assert, some others don't. Introduce rdlock/wrlock/unlock wrappers that just do the former and use them everywhere for improved consistency and robustness. This is just cleanup. It doesn't fix any actual issue. Signed-off-by: Greg Kurz Message-Id: <20210203182434.93870-1-groug@kaod.org> Reviewed-by: Vivek Goyal Reviewed-by: Stefan Hajnoczi Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/fuse_virtio.c | 49 +++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c index ddcefee427..523ee64fb7 100644 --- a/tools/virtiofsd/fuse_virtio.c +++ b/tools/virtiofsd/fuse_virtio.c @@ -187,6 +187,31 @@ static void copy_iov(struct iovec *src_iov, int src_count, } } +/* + * pthread_rwlock_rdlock() and pthread_rwlock_wrlock can fail if + * a deadlock condition is detected or the current thread already + * owns the lock. They can also fail, like pthread_rwlock_unlock(), + * if the mutex wasn't properly initialized. None of these are ever + * expected to happen. + */ +static void vu_dispatch_rdlock(struct fv_VuDev *vud) +{ + int ret = pthread_rwlock_rdlock(&vud->vu_dispatch_rwlock); + assert(ret == 0); +} + +static void vu_dispatch_wrlock(struct fv_VuDev *vud) +{ + int ret = pthread_rwlock_wrlock(&vud->vu_dispatch_rwlock); + assert(ret == 0); +} + +static void vu_dispatch_unlock(struct fv_VuDev *vud) +{ + int ret = pthread_rwlock_unlock(&vud->vu_dispatch_rwlock); + assert(ret == 0); +} + /* * Called back by ll whenever it wants to send a reply/message back * The 1st element of the iov starts with the fuse_out_header @@ -240,12 +265,12 @@ int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, copy_iov(iov, count, in_sg, in_num, tosend_len); - pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_rdlock(qi->virtio_dev); pthread_mutex_lock(&qi->vq_lock); vu_queue_push(dev, q, elem, tosend_len); vu_queue_notify(dev, q); pthread_mutex_unlock(&qi->vq_lock); - pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_unlock(qi->virtio_dev); req->reply_sent = true; @@ -403,12 +428,12 @@ int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, ret = 0; - pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_rdlock(qi->virtio_dev); pthread_mutex_lock(&qi->vq_lock); vu_queue_push(dev, q, elem, tosend_len); vu_queue_notify(dev, q); pthread_mutex_unlock(&qi->vq_lock); - pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_unlock(qi->virtio_dev); err: if (ret == 0) { @@ -558,12 +583,12 @@ out: fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__, elem->index); - pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_rdlock(qi->virtio_dev); pthread_mutex_lock(&qi->vq_lock); vu_queue_push(dev, q, elem, 0); vu_queue_notify(dev, q); pthread_mutex_unlock(&qi->vq_lock); - pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_unlock(qi->virtio_dev); } pthread_mutex_destroy(&req->ch.lock); @@ -596,7 +621,6 @@ static void *fv_queue_thread(void *opaque) qi->qidx, qi->kick_fd); while (1) { struct pollfd pf[2]; - int ret; pf[0].fd = qi->kick_fd; pf[0].events = POLLIN; @@ -645,8 +669,7 @@ static void *fv_queue_thread(void *opaque) break; } /* Mutual exclusion with virtio_loop() */ - ret = pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock); - assert(ret == 0); /* there is no possible error case */ + vu_dispatch_rdlock(qi->virtio_dev); pthread_mutex_lock(&qi->vq_lock); /* out is from guest, in is too guest */ unsigned int in_bytes, out_bytes; @@ -672,7 +695,7 @@ static void *fv_queue_thread(void *opaque) } pthread_mutex_unlock(&qi->vq_lock); - pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_unlock(qi->virtio_dev); /* Process all the requests. */ if (!se->thread_pool_size && req_list != NULL) { @@ -799,7 +822,6 @@ int virtio_loop(struct fuse_session *se) while (!fuse_session_exited(se)) { struct pollfd pf[1]; bool ok; - int ret; pf[0].fd = se->vu_socketfd; pf[0].events = POLLIN; pf[0].revents = 0; @@ -825,12 +847,11 @@ int virtio_loop(struct fuse_session *se) assert(pf[0].revents & POLLIN); fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__); /* Mutual exclusion with fv_queue_thread() */ - ret = pthread_rwlock_wrlock(&se->virtio_dev->vu_dispatch_rwlock); - assert(ret == 0); /* there is no possible error case */ + vu_dispatch_wrlock(se->virtio_dev); ok = vu_dispatch(&se->virtio_dev->dev); - pthread_rwlock_unlock(&se->virtio_dev->vu_dispatch_rwlock); + vu_dispatch_unlock(se->virtio_dev); if (!ok) { fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__); From a65963efa3a8533e8c9fc62e899147612d913058 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Fri, 5 Feb 2021 18:18:11 +0100 Subject: [PATCH 3/6] tools/virtiofsd: Replace the word 'whitelist' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow the inclusive terminology from the "Conscious Language in your Open Source Projects" guidelines [*] and replace the words "whitelist" appropriately. [*] https://github.com/conscious-lang/conscious-lang-docs/blob/main/faq.md Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Daniel P. Berrangé Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20210205171817.2108907-3-philmd@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/passthrough_ll.c | 6 +++--- tools/virtiofsd/passthrough_seccomp.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 147b59338a..5f3afe8557 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -3204,7 +3204,7 @@ static void setup_mounts(const char *source) } /* - * Only keep whitelisted capabilities that are needed for file system operation + * Only keep capabilities in allowlist that are needed for file system operation * The (possibly NULL) modcaps_in string passed in is free'd before exit. */ static void setup_capabilities(char *modcaps_in) @@ -3214,8 +3214,8 @@ static void setup_capabilities(char *modcaps_in) capng_restore_state(&cap.saved); /* - * Whitelist file system-related capabilities that are needed for a file - * server to act like root. Drop everything else like networking and + * Add to allowlist file system-related capabilities that are needed for a + * file server to act like root. Drop everything else like networking and * sysadmin capabilities. * * Exclusions: diff --git a/tools/virtiofsd/passthrough_seccomp.c b/tools/virtiofsd/passthrough_seccomp.c index ea852e2e33..62441cfcdb 100644 --- a/tools/virtiofsd/passthrough_seccomp.c +++ b/tools/virtiofsd/passthrough_seccomp.c @@ -21,7 +21,7 @@ #endif #endif -static const int syscall_whitelist[] = { +static const int syscall_allowlist[] = { /* TODO ireg sem*() syscalls */ SCMP_SYS(brk), SCMP_SYS(capget), /* For CAP_FSETID */ @@ -117,12 +117,12 @@ static const int syscall_whitelist[] = { }; /* Syscalls used when --syslog is enabled */ -static const int syscall_whitelist_syslog[] = { +static const int syscall_allowlist_syslog[] = { SCMP_SYS(send), SCMP_SYS(sendto), }; -static void add_whitelist(scmp_filter_ctx ctx, const int syscalls[], size_t len) +static void add_allowlist(scmp_filter_ctx ctx, const int syscalls[], size_t len) { size_t i; @@ -153,10 +153,10 @@ void setup_seccomp(bool enable_syslog) exit(1); } - add_whitelist(ctx, syscall_whitelist, G_N_ELEMENTS(syscall_whitelist)); + add_allowlist(ctx, syscall_allowlist, G_N_ELEMENTS(syscall_allowlist)); if (enable_syslog) { - add_whitelist(ctx, syscall_whitelist_syslog, - G_N_ELEMENTS(syscall_whitelist_syslog)); + add_allowlist(ctx, syscall_allowlist_syslog, + G_N_ELEMENTS(syscall_allowlist_syslog)); } /* libvhost-user calls this for post-copy migration, we don't need it */ From 1e08f164e9fdc9528ad6990012301b9a04b0bc90 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 8 Feb 2021 17:40:23 -0500 Subject: [PATCH 4/6] virtiofsd: Save error code early at the failure callsite Change error code handling slightly in lo_setattr(). Right now we seem to jump to out_err and assume that "errno" is valid and use that to send reply. But if caller has to do some other operations before jumping to out_err, then it does the dance of first saving errno to saverr and the restore errno before jumping to out_err. This makes it more confusing. I am about to make more changes where caller will have to do some work after error before jumping to out_err. I found it easier to change the convention a bit. That is caller saves error in "saverr" before jumping to out_err. And out_err uses "saverr" to send error back and does not rely on "errno" having actual error. v3: Resolved conflicts in lo_setattr() due to lo_inode_open() changes. Signed-off-by: Vivek Goyal Reviewed-by: Dr. David Alan Gilbert Message-Id: <20210208224024.43555-2-vgoyal@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/passthrough_ll.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 5f3afe8557..216c0bc026 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -698,6 +698,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0); } if (res == -1) { + saverr = errno; goto out_err; } } @@ -707,6 +708,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); if (res == -1) { + saverr = errno; goto out_err; } } @@ -718,16 +720,15 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, } else { truncfd = lo_inode_open(lo, inode, O_RDWR); if (truncfd < 0) { - errno = -truncfd; + saverr = -truncfd; goto out_err; } } res = ftruncate(truncfd, attr->st_size); + saverr = res == -1 ? errno : 0; if (!fi) { - saverr = errno; close(truncfd); - errno = saverr; } if (res == -1) { goto out_err; @@ -760,6 +761,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, res = utimensat(lo->proc_self_fd, procname, tv, 0); } if (res == -1) { + saverr = errno; goto out_err; } } @@ -768,7 +770,6 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, return lo_getattr(req, ino, fi); out_err: - saverr = errno; lo_inode_put(lo, &inode); fuse_reply_err(req, saverr); } From d64907acbf6e436099fd26fbb6312fd56f9fb29d Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 8 Feb 2021 17:40:24 -0500 Subject: [PATCH 5/6] viriofsd: Add support for FUSE_HANDLE_KILLPRIV_V2 This patch adds basic support for FUSE_HANDLE_KILLPRIV_V2. virtiofsd can enable/disable this by specifying option "-o killpriv_v2/no_killpriv_v2". By default this is enabled as long as client supports it Enabling this option helps with performance in write path. Without this option, currently every write is first preceeded with a getxattr() operation to find out if security.capability is set. (Write is supposed to clear security.capability). With this option enabled, server is signing up for clearing security.capability on every WRITE and also clearing suid/sgid subject to certain rules. This gets rid of extra getxattr() call for every WRITE and improves performance. This is true when virtiofsd is run with option -o xattr. What does enabling FUSE_HANDLE_KILLPRIV_V2 mean for file server implementation. It needs to adhere to following rules. Thanks to Miklos for this summary. - clear "security.capability" on write, truncate and chown unconditionally - clear suid/sgid in case of following. Note, sgid is cleared only if group executable bit is set. o setattr has FATTR_SIZE and FATTR_KILL_SUIDGID set. o setattr has FATTR_UID or FATTR_GID o open has O_TRUNC and FUSE_OPEN_KILL_SUIDGID o create has O_TRUNC and FUSE_OPEN_KILL_SUIDGID flag set. o write has FUSE_WRITE_KILL_SUIDGID >From Linux VFS client perspective, here are the requirements. - caps are always cleared on chown/write/truncate - suid is always cleared on chown, while for truncate/write it is cleared only if caller does not have CAP_FSETID. - sgid is always cleared on chown, while for truncate/write it is cleared only if caller does not have CAP_FSETID as well as file has group execute permission. virtiofsd implementation has not changed much to adhere to above ruls. And reason being that current assumption is that we are running on Linux and on top of filesystems like ext4/xfs which already follow above rules. On write, truncate, chown, seucurity.capability is cleared. And virtiofsd drops CAP_FSETID if need be and that will lead to clearing of suid/sgid. But if virtiofsd is running on top a filesystem which breaks above assumptions, then it will have to take extra actions to emulate above. That's a TODO for later when need arises. Note: create normally is supposed to be called only when file does not exist. So generally there should not be any question of clearing setuid/setgid. But it is possible that after client checks that file is not present, some other client creates file on server and this race can trigger sending FUSE_CREATE. In that case, if O_TRUNC is set, we should clear suid/sgid if FUSE_OPEN_KILL_SUIDGID is also set. v3: - Resolved conflicts due to lo_inode_open() changes. - Moved capability code in lo_do_open() so that both lo_open() and lo_create() can benefit from common code. - Dropped changes to kernel headers as these are part of qemu already. Signed-off-by: Vivek Goyal Acked-by: Stefan Hajnoczi Reviewed-by: Dr. David Alan Gilbert Message-Id: <20210208224024.43555-3-vgoyal@redhat.com> Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/fuse_common.h | 15 ++++++ tools/virtiofsd/fuse_lowlevel.c | 11 ++++- tools/virtiofsd/fuse_lowlevel.h | 1 + tools/virtiofsd/passthrough_ll.c | 84 +++++++++++++++++++++++++++++--- 4 files changed, 103 insertions(+), 8 deletions(-) diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h index a090040bb2..fa9671872e 100644 --- a/tools/virtiofsd/fuse_common.h +++ b/tools/virtiofsd/fuse_common.h @@ -357,6 +357,21 @@ struct fuse_file_info { */ #define FUSE_CAP_SUBMOUNTS (1 << 27) +/** + * Indicates that the filesystem is responsible for clearing + * security.capability xattr and clearing setuid and setgid bits. Following + * are the rules. + * - clear "security.capability" on write, truncate and chown unconditionally + * - clear suid/sgid if following is true. Note, sgid is cleared only if + * group executable bit is set. + * o setattr has FATTR_SIZE and FATTR_KILL_SUIDGID set. + * o setattr has FATTR_UID or FATTR_GID + * o open has O_TRUNC and FUSE_OPEN_KILL_SUIDGID + * o create has O_TRUNC and FUSE_OPEN_KILL_SUIDGID flag set. + * o write has FUSE_WRITE_KILL_SUIDGID + */ +#define FUSE_CAP_HANDLE_KILLPRIV_V2 (1 << 28) + /** * Ioctl flags * diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c index e94b71110b..f78692ef66 100644 --- a/tools/virtiofsd/fuse_lowlevel.c +++ b/tools/virtiofsd/fuse_lowlevel.c @@ -855,7 +855,7 @@ static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, FUSE_SET_ATTR_GID | FUSE_SET_ATTR_SIZE | FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME | FUSE_SET_ATTR_ATIME_NOW | FUSE_SET_ATTR_MTIME_NOW | - FUSE_SET_ATTR_CTIME; + FUSE_SET_ATTR_CTIME | FUSE_SET_ATTR_KILL_SUIDGID; req->se->op.setattr(req, nodeid, &stbuf, arg->valid, fi); } else { @@ -1069,6 +1069,7 @@ static void do_create(fuse_req_t req, fuse_ino_t nodeid, memset(&fi, 0, sizeof(fi)); fi.flags = arg->flags; + fi.kill_priv = arg->open_flags & FUSE_OPEN_KILL_SUIDGID; req->ctx.umask = arg->umask; @@ -1092,6 +1093,7 @@ static void do_open(fuse_req_t req, fuse_ino_t nodeid, memset(&fi, 0, sizeof(fi)); fi.flags = arg->flags; + fi.kill_priv = arg->open_flags & FUSE_OPEN_KILL_SUIDGID; if (req->se->op.open) { req->se->op.open(req, nodeid, &fi); @@ -1983,6 +1985,9 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, if (arg->flags & FUSE_SUBMOUNTS) { se->conn.capable |= FUSE_CAP_SUBMOUNTS; } + if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) { + se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV_V2; + } #ifdef HAVE_SPLICE #ifdef HAVE_VMSPLICE se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; @@ -2114,6 +2119,10 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, outarg.congestion_threshold = se->conn.congestion_threshold; outarg.time_gran = se->conn.time_gran; + if (se->conn.want & FUSE_CAP_HANDLE_KILLPRIV_V2) { + outarg.flags |= FUSE_HANDLE_KILLPRIV_V2; + } + fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor); fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead); diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h index 0e10a14bc9..3bf786b034 100644 --- a/tools/virtiofsd/fuse_lowlevel.h +++ b/tools/virtiofsd/fuse_lowlevel.h @@ -143,6 +143,7 @@ struct fuse_forget_data { #define FUSE_SET_ATTR_ATIME_NOW (1 << 7) #define FUSE_SET_ATTR_MTIME_NOW (1 << 8) #define FUSE_SET_ATTR_CTIME (1 << 10) +#define FUSE_SET_ATTR_KILL_SUIDGID (1 << 11) /* * Request methods and replies diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 216c0bc026..58d24c0010 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -168,6 +168,7 @@ struct lo_data { /* An O_PATH file descriptor to /proc/self/fd/ */ int proc_self_fd; + int user_killpriv_v2, killpriv_v2; }; static const struct fuse_opt lo_opts[] = { @@ -198,6 +199,8 @@ static const struct fuse_opt lo_opts[] = { { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 }, { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 }, { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 }, + { "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 }, + { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 }, FUSE_OPT_END }; static bool use_syslog = false; @@ -630,6 +633,34 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) "does not support it\n"); lo->announce_submounts = false; } + + if (lo->user_killpriv_v2 == 1) { + /* + * User explicitly asked for this option. Enable it unconditionally. + * If connection does not have this capability, it should fail + * in fuse_lowlevel.c + */ + fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n"); + conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2; + lo->killpriv_v2 = 1; + } else if (lo->user_killpriv_v2 == -1 && + conn->capable & FUSE_CAP_HANDLE_KILLPRIV_V2) { + /* + * User did not specify a value for killpriv_v2. By default enable it + * if connection offers this capability + */ + fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n"); + conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2; + lo->killpriv_v2 = 1; + } else { + /* + * Either user specified to disable killpriv_v2, or connection does + * not offer this capability. Disable killpriv_v2 in both the cases + */ + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling killpriv_v2\n"); + conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2; + lo->killpriv_v2 = 0; + } } static void lo_getattr(fuse_req_t req, fuse_ino_t ino, @@ -714,7 +745,10 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, } if (valid & FUSE_SET_ATTR_SIZE) { int truncfd; + bool kill_suidgid; + bool cap_fsetid_dropped = false; + kill_suidgid = lo->killpriv_v2 && (valid & FUSE_SET_ATTR_KILL_SUIDGID); if (fi) { truncfd = fd; } else { @@ -725,8 +759,25 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, } } + if (kill_suidgid) { + res = drop_effective_cap("FSETID", &cap_fsetid_dropped); + if (res != 0) { + saverr = res; + if (!fi) { + close(truncfd); + } + goto out_err; + } + } + res = ftruncate(truncfd, attr->st_size); saverr = res == -1 ? errno : 0; + + if (cap_fsetid_dropped) { + if (gain_effective_cap("FSETID")) { + fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); + } + } if (!fi) { close(truncfd); } @@ -1709,11 +1760,27 @@ static int lo_do_open(struct lo_data *lo, struct lo_inode *inode, { ssize_t fh; int fd = existing_fd; + int err; + bool cap_fsetid_dropped = false; + bool kill_suidgid = lo->killpriv_v2 && fi->kill_priv; update_open_flags(lo->writeback, lo->allow_direct_io, fi); if (fd < 0) { + if (kill_suidgid) { + err = drop_effective_cap("FSETID", &cap_fsetid_dropped); + if (err) { + return err; + } + } + fd = lo_inode_open(lo, inode, fi->flags); + + if (cap_fsetid_dropped) { + if (gain_effective_cap("FSETID")) { + fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); + } + } if (fd < 0) { return -fd; } @@ -1747,8 +1814,8 @@ static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, int err; struct lo_cred old = {}; - fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)\n", parent, - name); + fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)" + " kill_priv=%d\n", parent, name, fi->kill_priv); if (!is_safe_path_component(name)) { fuse_reply_err(req, EINVAL); @@ -1981,8 +2048,8 @@ static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) struct lo_inode *inode = lo_inode(req, ino); int err; - fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d)\n", ino, - fi->flags); + fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d, kill_priv=%d)" + "\n", ino, fi->flags, fi->kill_priv); if (!inode) { fuse_reply_err(req, EBADF); @@ -2121,12 +2188,14 @@ static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, out_buf.buf[0].pos = off; fuse_log(FUSE_LOG_DEBUG, - "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu)\n", ino, - out_buf.buf[0].size, (unsigned long)off); + "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu kill_priv=%d)\n", + ino, out_buf.buf[0].size, (unsigned long)off, fi->kill_priv); /* * If kill_priv is set, drop CAP_FSETID which should lead to kernel - * clearing setuid/setgid on file. + * clearing setuid/setgid on file. Note, for WRITE, we need to do + * this even if killpriv_v2 is not enabled. fuse direct write path + * relies on this. */ if (fi->kill_priv) { res = drop_effective_cap("FSETID", &cap_fsetid_dropped); @@ -3534,6 +3603,7 @@ int main(int argc, char *argv[]) .posix_lock = 0, .allow_direct_io = 0, .proc_self_fd = -1, + .user_killpriv_v2 = -1, }; struct lo_map_elem *root_elem; struct lo_map_elem *reserve_elem; From 26ec1909648e0c06ff06ebc3ddb2f88ebeeaa6a9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 10 Feb 2021 13:27:44 -0500 Subject: [PATCH 6/6] virtiofsd: Do not use a thread pool by default Currently we created a thread pool (With 64 max threads per pool) for each virtqueue. We hoped that this will provide us with better scalability and performance. But in practice, we are getting better numbers in most of the cases when we don't create a thread pool at all and a single thread per virtqueue receives the request and processes it. Hence, I am proposing that we switch to no thread pool by default (equivalent of --thread-pool-size=0). This will provide out of box better performance to most of the users. In fact other users have confirmed that not using a thread pool gives them better numbers. So why not use this as default. It can be changed when somebody can fix the issues with thread pool performance. Signed-off-by: Vivek Goyal Message-Id: <20210210182744.27324-2-vgoyal@redhat.com> Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/fuse_lowlevel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c index f78692ef66..1aa26c6333 100644 --- a/tools/virtiofsd/fuse_lowlevel.c +++ b/tools/virtiofsd/fuse_lowlevel.c @@ -18,7 +18,7 @@ #include -#define THREAD_POOL_SIZE 64 +#define THREAD_POOL_SIZE 0 #define OFFSET_MAX 0x7fffffffffffffffLL