From 2acf4f8fdd1fb1d7d76fa26e67b39af898df0aed Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 27 Aug 2020 16:36:52 +0100 Subject: [PATCH 1/6] virtiofsd: Silence gcc warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gcc worries fd might be used unset, in reality it's always set if fi is set, and only used if fi is set so it's safe. Initialise it to -1 just to keep gcc happy for now. Signed-off-by: Dr. David Alan Gilbert Message-Id: <20200827153657.111098-2-dgilbert@redhat.com> Reviewed-by: Ján Tomko Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/passthrough_ll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 0b229ebd57..36ad46e0c0 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -620,7 +620,7 @@ static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, struct lo_inode *inode; int ifd; int res; - int fd; + int fd = -1; inode = lo_inode(req, ino); if (!inode) { From f6698f2b03b0db76aff7298ed4de1f9a0e22cc26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 25 Sep 2020 13:51:29 +0100 Subject: [PATCH 2/6] tools/virtiofsd: add support for --socket-group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If you like running QEMU as a normal user (very common for TCG runs) but you have to run virtiofsd as a root user you run into connection problems. Adding support for an optional --socket-group allows the users to keep using the command line. Signed-off-by: Alex Bennée Reviewed-by: Stefan Hajnoczi Message-Id: <20200925125147.26943-2-alex.bennee@linaro.org> Signed-off-by: Dr. David Alan Gilbert dgilbert: Split long line --- docs/tools/virtiofsd.rst | 4 ++++ tools/virtiofsd/fuse_i.h | 1 + tools/virtiofsd/fuse_lowlevel.c | 6 ++++++ tools/virtiofsd/fuse_virtio.c | 21 +++++++++++++++++++-- 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/docs/tools/virtiofsd.rst b/docs/tools/virtiofsd.rst index ae02938a95..7ecee49834 100644 --- a/docs/tools/virtiofsd.rst +++ b/docs/tools/virtiofsd.rst @@ -87,6 +87,10 @@ Options Listen on vhost-user UNIX domain socket at PATH. +.. option:: --socket-group=GROUP + + Set the vhost-user UNIX domain socket gid to GROUP. + .. option:: --fd=FDNUM Accept connections from vhost-user UNIX domain socket file descriptor FDNUM. diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h index 1240828208..492e002181 100644 --- a/tools/virtiofsd/fuse_i.h +++ b/tools/virtiofsd/fuse_i.h @@ -68,6 +68,7 @@ struct fuse_session { size_t bufsize; int error; char *vu_socket_path; + char *vu_socket_group; int vu_listen_fd; int vu_socketfd; struct fv_VuDev *virtio_dev; diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c index 2dd36ec03b..4d1ba2925d 100644 --- a/tools/virtiofsd/fuse_lowlevel.c +++ b/tools/virtiofsd/fuse_lowlevel.c @@ -2523,6 +2523,7 @@ static const struct fuse_opt fuse_ll_opts[] = { LL_OPTION("--debug", debug, 1), LL_OPTION("allow_root", deny_others, 1), LL_OPTION("--socket-path=%s", vu_socket_path, 0), + LL_OPTION("--socket-group=%s", vu_socket_group, 0), LL_OPTION("--fd=%d", vu_listen_fd, 0), LL_OPTION("--thread-pool-size=%d", thread_pool_size, 0), FUSE_OPT_END @@ -2630,6 +2631,11 @@ struct fuse_session *fuse_session_new(struct fuse_args *args, "fuse: --socket-path and --fd cannot be given together\n"); goto out4; } + if (se->vu_socket_group && !se->vu_socket_path) { + fuse_log(FUSE_LOG_ERR, + "fuse: --socket-group can only be used with --socket-path\n"); + goto out4; + } se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + FUSE_BUFFER_HEADER_SIZE; diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c index d5c8e98253..89f537f79b 100644 --- a/tools/virtiofsd/fuse_virtio.c +++ b/tools/virtiofsd/fuse_virtio.c @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include #include "contrib/libvhost-user/libvhost-user.h" @@ -924,15 +926,30 @@ static int fv_create_listen_socket(struct fuse_session *se) /* * Unfortunately bind doesn't let you set the mask on the socket, - * so set umask to 077 and restore it later. + * so set umask appropriately and restore it later. */ - old_umask = umask(0077); + if (se->vu_socket_group) { + old_umask = umask(S_IROTH | S_IWOTH | S_IXOTH); + } else { + old_umask = umask(S_IRGRP | S_IWGRP | S_IXGRP | + S_IROTH | S_IWOTH | S_IXOTH); + } if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) { fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n"); close(listen_sock); umask(old_umask); return -1; } + if (se->vu_socket_group) { + struct group *g = getgrnam(se->vu_socket_group); + if (g) { + if (!chown(se->vu_socket_path, -1, g->gr_gid)) { + fuse_log(FUSE_LOG_WARNING, + "vhost socket failed to set group to %s (%d)\n", + se->vu_socket_group, g->gr_gid); + } + } + } umask(old_umask); if (listen(listen_sock, 1) == -1) { From ff3995e2f0e12770dfa73d9e95c0461024840b9a Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 2 Oct 2020 13:40:15 +0100 Subject: [PATCH 3/6] virtiofsd: Call qemu_init_exec_dir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since fcb4f59c879 qemu_get_local_state_pathname relies on the init_exec_dir, and virtiofsd asserts because we never set it. Set it. Reported-by: Alex Bennée Signed-off-by: Dr. David Alan Gilbert Message-Id: <20201002124015.44820-1-dgilbert@redhat.com> Tested-by: Alex Bennée Reviewed-by: Alex Bennée Reviewed-by: Stefan Hajnoczi Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/passthrough_ll.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 36ad46e0c0..477e6ee0b5 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -2839,6 +2839,8 @@ int main(int argc, char *argv[]) /* Don't mask creation mode, kernel already did that */ umask(0); + qemu_init_exec_dir(argv[0]); + pthread_mutex_init(&lo.mutex, NULL); lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal); lo.root.fd = -1; From ebf101955ce8f8d72fba103b5151115a4335de2c Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 6 Oct 2020 10:58:26 +0100 Subject: [PATCH 4/6] virtiofsd: avoid /proc/self/fd tempdir In order to prevent /proc/self/fd escapes a temporary directory is created where /proc/self/fd is bind-mounted. This doesn't work on read-only file systems. Avoid the temporary directory by bind-mounting /proc/self/fd over /proc. This does not affect other processes since we remounted / with MS_REC | MS_SLAVE. /proc must exist and virtiofsd does not use it so it's safe to do this. Path traversal can be tested with the following function: static void test_proc_fd_escape(struct lo_data *lo) { int fd; int level = 0; ino_t last_ino = 0; fd = lo->proc_self_fd; for (;;) { struct stat st; if (fstat(fd, &st) != 0) { perror("fstat"); return; } if (last_ino && st.st_ino == last_ino) { fprintf(stderr, "inode number unchanged, stopping\n"); return; } last_ino = st.st_ino; fprintf(stderr, "Level %d dev %lu ino %lu\n", level, (unsigned long)st.st_dev, (unsigned long)last_ino); fd = openat(fd, "..", O_PATH | O_DIRECTORY | O_NOFOLLOW); level++; } } Before and after this patch only Level 0 is displayed. Without /proc/self/fd bind-mount protection it is possible to traverse parent directories. Fixes: 397ae982f4df4 ("virtiofsd: jail lo->proc_self_fd") Cc: Miklos Szeredi Cc: Jens Freimann Signed-off-by: Stefan Hajnoczi Message-Id: <20201006095826.59813-1-stefanha@redhat.com> Reviewed-by: Dr. David Alan Gilbert Tested-by: Jens Freimann Reviewed-by: Jens Freimann Signed-off-by: Dr. David Alan Gilbert --- tools/virtiofsd/passthrough_ll.c | 34 +++++++++++--------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 477e6ee0b5..ff53df4451 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -2393,8 +2393,6 @@ static void setup_wait_parent_capabilities(void) static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) { pid_t child; - char template[] = "virtiofsd-XXXXXX"; - char *tmpdir; /* * Create a new pid namespace for *child* processes. We'll have to @@ -2458,33 +2456,23 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) exit(1); } - tmpdir = mkdtemp(template); - if (!tmpdir) { - fuse_log(FUSE_LOG_ERR, "tmpdir(%s): %m\n", template); + /* + * We only need /proc/self/fd. Prevent ".." from accessing parent + * directories of /proc/self/fd by bind-mounting it over /proc. Since / was + * previously remounted with MS_REC | MS_SLAVE this mount change only + * affects our process. + */ + if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n"); exit(1); } - if (mount("/proc/self/fd", tmpdir, NULL, MS_BIND, NULL) < 0) { - fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, %s, MS_BIND): %m\n", - tmpdir); - exit(1); - } - - /* Now we can get our /proc/self/fd directory file descriptor */ - lo->proc_self_fd = open(tmpdir, O_PATH); + /* Get the /proc (actually /proc/self/fd, see above) file descriptor */ + lo->proc_self_fd = open("/proc", O_PATH); if (lo->proc_self_fd == -1) { - fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", tmpdir); + fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n"); exit(1); } - - if (umount2(tmpdir, MNT_DETACH) < 0) { - fuse_log(FUSE_LOG_ERR, "umount2(%s, MNT_DETACH): %m\n", tmpdir); - exit(1); - } - - if (rmdir(tmpdir) < 0) { - fuse_log(FUSE_LOG_ERR, "rmdir(%s): %m\n", tmpdir); - } } /* From aa84b506f75db0bdab2ef0d68a4c5e266eab05e6 Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Tue, 29 Sep 2020 11:42:17 +0800 Subject: [PATCH 5/6] migration/dirtyrate: record start_time and calc_time while at the measuring state Querying could include both the start-time and the calc-time while at the measuring state, allowing a caller to determine when they should expect to come back looking for a result. Signed-off-by: Chuan Zheng Message-Id: <1601350938-128320-2-git-send-email-zhengchuan@huawei.com> Reviewed-by: David Edmondson Signed-off-by: Dr. David Alan Gilbert --- migration/dirtyrate.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c index 68577ef250..40e41e793e 100644 --- a/migration/dirtyrate.c +++ b/migration/dirtyrate.c @@ -83,14 +83,14 @@ static struct DirtyRateInfo *query_dirty_rate_info(void) return info; } -static void reset_dirtyrate_stat(void) +static void init_dirtyrate_stat(int64_t start_time, int64_t calc_time) { DirtyStat.total_dirty_samples = 0; DirtyStat.total_sample_count = 0; DirtyStat.total_block_mem_MB = 0; DirtyStat.dirty_rate = -1; - DirtyStat.start_time = 0; - DirtyStat.calc_time = 0; + DirtyStat.start_time = start_time; + DirtyStat.calc_time = calc_time; } static void update_dirtyrate_stat(struct RamblockDirtyInfo *info) @@ -335,7 +335,6 @@ static void calculate_dirtyrate(struct DirtyRateConfig config) int64_t initial_time; rcu_register_thread(); - reset_dirtyrate_stat(); rcu_read_lock(); initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); if (!record_ramblock_hash_info(&block_dinfo, config, &block_count)) { @@ -365,6 +364,8 @@ void *get_dirtyrate_thread(void *arg) { struct DirtyRateConfig config = *(struct DirtyRateConfig *)arg; int ret; + int64_t start_time; + int64_t calc_time; ret = dirtyrate_set_state(&CalculatingState, DIRTY_RATE_STATUS_UNSTARTED, DIRTY_RATE_STATUS_MEASURING); @@ -373,6 +374,10 @@ void *get_dirtyrate_thread(void *arg) return NULL; } + start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) / 1000; + calc_time = config.sample_period_seconds; + init_dirtyrate_stat(start_time, calc_time); + calculate_dirtyrate(config); ret = dirtyrate_set_state(&CalculatingState, DIRTY_RATE_STATUS_MEASURING, From b1a859cfb04db1d2b80dd9979ce6081cb9c00d75 Mon Sep 17 00:00:00 2001 From: Chuan Zheng Date: Tue, 29 Sep 2020 11:42:18 +0800 Subject: [PATCH 6/6] migration/dirtyrate: present dirty rate only when querying the rate has completed Make dirty_rate field optional, present dirty rate only when querying the rate has completed. The qmp results is shown as follow: @unstarted: {"return":{"status":"unstarted","start-time":0,"calc-time":0},"id":"libvirt-12"} @measuring: {"return":{"status":"measuring","start-time":102931,"calc-time":1},"id":"libvirt-85"} @measured: {"return":{"status":"measured","dirty-rate":4,"start-time":150146,"calc-time":1},"id":"libvirt-15"} Signed-off-by: Chuan Zheng Reviewed-by: David Edmondson Message-Id: <1601350938-128320-3-git-send-email-zhengchuan@huawei.com> Signed-off-by: Dr. David Alan Gilbert --- migration/dirtyrate.c | 3 +-- qapi/migration.json | 8 +++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c index 40e41e793e..ab9e1301f6 100644 --- a/migration/dirtyrate.c +++ b/migration/dirtyrate.c @@ -69,9 +69,8 @@ static struct DirtyRateInfo *query_dirty_rate_info(void) struct DirtyRateInfo *info = g_malloc0(sizeof(DirtyRateInfo)); if (qatomic_read(&CalculatingState) == DIRTY_RATE_STATUS_MEASURED) { + info->has_dirty_rate = true; info->dirty_rate = dirty_rate; - } else { - info->dirty_rate = -1; } info->status = CalculatingState; diff --git a/qapi/migration.json b/qapi/migration.json index 7f5e6fd681..974021a5c8 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -1743,10 +1743,8 @@ # # Information about current dirty page rate of vm. # -# @dirty-rate: @dirtyrate describing the dirty page rate of vm -# in units of MB/s. -# If this field returns '-1', it means querying has not -# yet started or completed. +# @dirty-rate: an estimate of the dirty page rate of the VM in units of +# MB/s, present only when estimating the rate has completed. # # @status: status containing dirtyrate query status includes # 'unstarted' or 'measuring' or 'measured' @@ -1759,7 +1757,7 @@ # ## { 'struct': 'DirtyRateInfo', - 'data': {'dirty-rate': 'int64', + 'data': {'*dirty-rate': 'int64', 'status': 'DirtyRateStatus', 'start-time': 'int64', 'calc-time': 'int64'} }