diff --git a/MAINTAINERS b/MAINTAINERS index 3ca814850e..fe4dc51b08 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1517,7 +1517,7 @@ F: tests/virtio-balloon-test.c virtio-9p M: Greg Kurz -S: Supported +S: Odd Fixes F: hw/9pfs/ X: hw/9pfs/xen-9p* F: fsdev/ diff --git a/fsdev/9p-marshal.h b/fsdev/9p-marshal.h index c8823d878f..8f3babb60a 100644 --- a/fsdev/9p-marshal.h +++ b/fsdev/9p-marshal.h @@ -9,9 +9,9 @@ typedef struct V9fsString typedef struct V9fsQID { - int8_t type; - int32_t version; - int64_t path; + uint8_t type; + uint32_t version; + uint64_t path; } V9fsQID; typedef struct V9fsStat diff --git a/fsdev/file-op-9p.h b/fsdev/file-op-9p.h index c757c8099f..f2f7772c86 100644 --- a/fsdev/file-op-9p.h +++ b/fsdev/file-op-9p.h @@ -59,6 +59,11 @@ typedef struct ExtendedOps { #define V9FS_RDONLY 0x00000040 #define V9FS_PROXY_SOCK_FD 0x00000080 #define V9FS_PROXY_SOCK_NAME 0x00000100 +/* + * multidevs option (either one of the two applies exclusively) + */ +#define V9FS_REMAP_INODES 0x00000200 +#define V9FS_FORBID_MULTIDEVS 0x00000400 #define V9FS_SEC_MASK 0x0000003C diff --git a/fsdev/qemu-fsdev-opts.c b/fsdev/qemu-fsdev-opts.c index 7c31ffffaf..07a18c6e48 100644 --- a/fsdev/qemu-fsdev-opts.c +++ b/fsdev/qemu-fsdev-opts.c @@ -31,7 +31,9 @@ static QemuOptsList qemu_fsdev_opts = { }, { .name = "readonly", .type = QEMU_OPT_BOOL, - + }, { + .name = "multidevs", + .type = QEMU_OPT_STRING, }, { .name = "socket", .type = QEMU_OPT_STRING, @@ -75,6 +77,9 @@ static QemuOptsList qemu_virtfs_opts = { }, { .name = "readonly", .type = QEMU_OPT_BOOL, + }, { + .name = "multidevs", + .type = QEMU_OPT_STRING, }, { .name = "socket", .type = QEMU_OPT_STRING, diff --git a/fsdev/qemu-fsdev-throttle.c b/fsdev/qemu-fsdev-throttle.c index fe1d76dc76..5c83a1cc09 100644 --- a/fsdev/qemu-fsdev-throttle.c +++ b/fsdev/qemu-fsdev-throttle.c @@ -31,7 +31,7 @@ static void fsdev_throttle_write_timer_cb(void *opaque) qemu_co_enter_next(&fst->throttled_reqs[true], NULL); } -void 
fsdev_throttle_parse_opts(QemuOpts *opts, FsThrottle *fst, Error **errp) +int fsdev_throttle_parse_opts(QemuOpts *opts, FsThrottle *fst, Error **errp) { throttle_config_init(&fst->cfg); fst->cfg.buckets[THROTTLE_BPS_TOTAL].avg = @@ -75,7 +75,7 @@ void fsdev_throttle_parse_opts(QemuOpts *opts, FsThrottle *fst, Error **errp) fst->cfg.op_size = qemu_opt_get_number(opts, "throttling.iops-size", 0); - throttle_is_valid(&fst->cfg, errp); + return throttle_is_valid(&fst->cfg, errp) ? 0 : -1; } void fsdev_throttle_init(FsThrottle *fst) diff --git a/fsdev/qemu-fsdev-throttle.h b/fsdev/qemu-fsdev-throttle.h index c98e2feab5..a21aecddc7 100644 --- a/fsdev/qemu-fsdev-throttle.h +++ b/fsdev/qemu-fsdev-throttle.h @@ -26,7 +26,7 @@ typedef struct FsThrottle { CoQueue throttled_reqs[2]; } FsThrottle; -void fsdev_throttle_parse_opts(QemuOpts *, FsThrottle *, Error **); +int fsdev_throttle_parse_opts(QemuOpts *, FsThrottle *, Error **); void fsdev_throttle_init(FsThrottle *); diff --git a/fsdev/qemu-fsdev.c b/fsdev/qemu-fsdev.c index 077a8c4e2b..a9e069c0c7 100644 --- a/fsdev/qemu-fsdev.c +++ b/fsdev/qemu-fsdev.c @@ -58,6 +58,7 @@ static FsDriverTable FsDrivers[] = { "writeout", "fmode", "dmode", + "multidevs", "throttling.bps-total", "throttling.bps-read", "throttling.bps-write", diff --git a/hw/9pfs/9p-local.c b/hw/9pfs/9p-local.c index 08e673a79c..4708c0bd89 100644 --- a/hw/9pfs/9p-local.c +++ b/hw/9pfs/9p-local.c @@ -1465,6 +1465,10 @@ static void local_cleanup(FsContext *ctx) { LocalData *data = ctx->private; + if (!data) { + return; + } + close(data->mountfd); g_free(data); } @@ -1479,6 +1483,7 @@ static int local_parse_opts(QemuOpts *opts, FsDriverEntry *fse, Error **errp) { const char *sec_model = qemu_opt_get(opts, "security_model"); const char *path = qemu_opt_get(opts, "path"); + const char *multidevs = qemu_opt_get(opts, "multidevs"); Error *local_err = NULL; if (!sec_model) { @@ -1502,13 +1507,32 @@ static int local_parse_opts(QemuOpts *opts, FsDriverEntry *fse, Error 
**errp) return -1; } + if (multidevs) { + if (!strcmp(multidevs, "remap")) { + fse->export_flags &= ~V9FS_FORBID_MULTIDEVS; + fse->export_flags |= V9FS_REMAP_INODES; + } else if (!strcmp(multidevs, "forbid")) { + fse->export_flags &= ~V9FS_REMAP_INODES; + fse->export_flags |= V9FS_FORBID_MULTIDEVS; + } else if (!strcmp(multidevs, "warn")) { + fse->export_flags &= ~V9FS_FORBID_MULTIDEVS; + fse->export_flags &= ~V9FS_REMAP_INODES; + } else { + error_setg(&local_err, "invalid multidevs property '%s'", + multidevs); + error_append_hint(&local_err, "Valid options are: multidevs=" + "[remap|forbid|warn]\n"); + error_propagate(errp, local_err); + return -1; + } + } + if (!path) { error_setg(errp, "path property not set"); return -1; } - fsdev_throttle_parse_opts(opts, &fse->fst, &local_err); - if (local_err) { + if (fsdev_throttle_parse_opts(opts, &fse->fst, &local_err)) { error_propagate_prepend(errp, local_err, "invalid throttle configuration: "); return -1; diff --git a/hw/9pfs/9p-proxy.c b/hw/9pfs/9p-proxy.c index 57a8c1c808..97ab9c58a5 100644 --- a/hw/9pfs/9p-proxy.c +++ b/hw/9pfs/9p-proxy.c @@ -1185,6 +1185,10 @@ static void proxy_cleanup(FsContext *ctx) { V9fsProxy *proxy = ctx->private; + if (!proxy) { + return; + } + g_free(proxy->out_iovec.iov_base); g_free(proxy->in_iovec.iov_base); if (ctx->export_flags & V9FS_PROXY_SOCK_NAME) { diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index cce2366219..37abcdb71e 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -26,6 +26,8 @@ #include "trace.h" #include "migration/blocker.h" #include "sysemu/qtest.h" +#include "qemu/xxhash.h" +#include <math.h> int open_fd_hw; int total_open_fd; @@ -572,14 +574,374 @@ static void coroutine_fn virtfs_reset(V9fsPDU *pdu) P9_STAT_MODE_NAMED_PIPE | \ P9_STAT_MODE_SOCKET) -/* This is the algorithm from ufs in spfs */ -static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp) +/* Mirrors all bits of a byte. So e.g. binary 10100000 would become 00000101.
*/ +static inline uint8_t mirror8bit(uint8_t byte) { + return (byte * 0x0202020202ULL & 0x010884422010ULL) % 1023; +} + +/* Same as mirror8bit() just for a 64 bit data type instead of a byte. */ +static inline uint64_t mirror64bit(uint64_t value) +{ + return ((uint64_t)mirror8bit(value & 0xff) << 56) | + ((uint64_t)mirror8bit((value >> 8) & 0xff) << 48) | + ((uint64_t)mirror8bit((value >> 16) & 0xff) << 40) | + ((uint64_t)mirror8bit((value >> 24) & 0xff) << 32) | + ((uint64_t)mirror8bit((value >> 32) & 0xff) << 24) | + ((uint64_t)mirror8bit((value >> 40) & 0xff) << 16) | + ((uint64_t)mirror8bit((value >> 48) & 0xff) << 8) | + ((uint64_t)mirror8bit((value >> 56) & 0xff)); +} + +/** + * @brief Parameter k for the Exponential Golomb algorithm to be used. + * + * The smaller this value, the smaller the minimum bit count for the Exp. + * Golomb generated affixes will be (at lowest index) however for the + * price of having higher maximum bit count of generated affixes (at highest + * index). Likewise increasing this parameter yields in smaller maximum bit + * count for the price of having higher minimum bit count. + * + * In practice that means: a good value for k depends on the expected amount + * of devices to be exposed by one export. For a small amount of devices k + * should be small, for a large amount of devices k might be increased + * instead. The default of k=0 should be fine for most users though. + * + * @b IMPORTANT: In case this ever becomes a runtime parameter; the value of + * k should not change as long as guest is still running! Because that would + * cause completely different inode numbers to be generated on guest. + */ +#define EXP_GOLOMB_K 0 + +/** + * @brief Exponential Golomb algorithm for arbitrary k (including k=0). + * + * The Exponential Golomb algorithm generates @b prefixes (@b not suffixes!) + * with growing length and with the mathematical property of being + * "prefix-free".
The latter means the generated prefixes can be prepended + * in front of arbitrary numbers and the resulting concatenated numbers are + * guaranteed to be always unique. + * + * This is a minor adjustment to the original Exp. Golomb algorithm in the + * sense that lowest allowed index (@param n) starts with 1, not with zero. + * + * @param n - natural number (or index) of the prefix to be generated + * (1, 2, 3, ...) + * @param k - parameter k of Exp. Golomb algorithm to be used + * (see comment on EXP_GOLOMB_K macro for details about k) + */ +static VariLenAffix expGolombEncode(uint64_t n, int k) +{ + const uint64_t value = n + (1 << k) - 1; + const int bits = (int) log2(value) + 1; + return (VariLenAffix) { + .type = AffixType_Prefix, + .value = value, + .bits = bits + MAX((bits - 1 - k), 0) + }; +} + +/** + * @brief Converts a suffix into a prefix, or a prefix into a suffix. + * + * Simply mirror all bits of the affix value, for the purpose to preserve + * respectively the mathematical "prefix-free" or "suffix-free" property + * after the conversion. + * + * If a passed prefix is suitable to create unique numbers, then the + * returned suffix is suitable to create unique numbers as well (and vice + * versa). + */ +static VariLenAffix invertAffix(const VariLenAffix *affix) +{ + return (VariLenAffix) { + .type = + (affix->type == AffixType_Suffix) ? + AffixType_Prefix : AffixType_Suffix, + .value = + mirror64bit(affix->value) >> + ((sizeof(affix->value) * 8) - affix->bits), + .bits = affix->bits + }; +} + +/** + * @brief Generates suffix numbers with "suffix-free" property. + * + * This is just a wrapper function on top of the Exp. Golomb algorithm. + * + * Since the Exp. Golomb algorithm generates prefixes, but we need suffixes, + * this function converts the Exp. Golomb prefixes into appropriate suffixes + * which are still suitable for generating unique numbers. + * + * @param index - natural number (or index) of the suffix to be generated + * (1, 2, 3, ...)
+ */ +static VariLenAffix affixForIndex(uint64_t index) +{ + VariLenAffix prefix; + prefix = expGolombEncode(index, EXP_GOLOMB_K); + return invertAffix(&prefix); /* convert prefix to suffix */ +} + +/* creative abuse of tb_hash_func7, which is based on xxhash */ +static uint32_t qpp_hash(QppEntry e) +{ + return qemu_xxhash7(e.ino_prefix, e.dev, 0, 0, 0); +} + +static uint32_t qpf_hash(QpfEntry e) +{ + return qemu_xxhash7(e.ino, e.dev, 0, 0, 0); +} + +static bool qpd_cmp_func(const void *obj, const void *userp) +{ + const QpdEntry *e1 = obj, *e2 = userp; + return e1->dev == e2->dev; +} + +static bool qpp_cmp_func(const void *obj, const void *userp) +{ + const QppEntry *e1 = obj, *e2 = userp; + return e1->dev == e2->dev && e1->ino_prefix == e2->ino_prefix; +} + +static bool qpf_cmp_func(const void *obj, const void *userp) +{ + const QpfEntry *e1 = obj, *e2 = userp; + return e1->dev == e2->dev && e1->ino == e2->ino; +} + +static void qp_table_remove(void *p, uint32_t h, void *up) +{ + g_free(p); +} + +static void qp_table_destroy(struct qht *ht) +{ + if (!ht || !ht->map) { + return; + } + qht_iter(ht, qp_table_remove, NULL); + qht_destroy(ht); +} + +static void qpd_table_init(struct qht *ht) +{ + qht_init(ht, qpd_cmp_func, 1, QHT_MODE_AUTO_RESIZE); +} + +static void qpp_table_init(struct qht *ht) +{ + qht_init(ht, qpp_cmp_func, 1, QHT_MODE_AUTO_RESIZE); +} + +static void qpf_table_init(struct qht *ht) +{ + qht_init(ht, qpf_cmp_func, 1 << 16, QHT_MODE_AUTO_RESIZE); +} + +/* + * Returns how many (high end) bits of inode numbers of the passed fs + * device shall be used (in combination with the device number) to + * generate hash values for qpp_table entries. + * + * This function is required if variable length suffixes are used for inode + * number mapping on guest level. 
Since a device may end up having multiple + * entries in qpp_table, each entry most probably with a different suffix + * length, we thus need this function in conjunction with qpd_table to + * "agree" about a fixed amount of bits (per device) to be always used for + * generating hash values for the purpose of accessing qpp_table in order + * to get consistent behaviour when accessing qpp_table. + */ +static int qid_inode_prefix_hash_bits(V9fsPDU *pdu, dev_t dev) +{ + QpdEntry lookup = { + .dev = dev + }, *val; + uint32_t hash = dev; + VariLenAffix affix; + + val = qht_lookup(&pdu->s->qpd_table, &lookup, hash); + if (!val) { + val = g_malloc0(sizeof(QpdEntry)); + *val = lookup; + affix = affixForIndex(pdu->s->qp_affix_next); + val->prefix_bits = affix.bits; + qht_insert(&pdu->s->qpd_table, val, hash, NULL); + pdu->s->qp_ndevices++; + } + return val->prefix_bits; +} + +/** + * @brief Slow / full mapping host inode nr -> guest inode nr. + * + * This function performs a slower and much more costly remapping of an + * original file inode number on host to an appropriate different inode + * number on guest. For every (dev, inode) combination on host a new + * sequential number is generated, cached and exposed as inode number on + * guest. + * + * This is just a "last resort" fallback solution if the much faster/cheaper + * qid_path_suffixmap() failed. In practice this slow / full mapping is not + * expected ever to be used at all though. + * + * @see qid_path_suffixmap() for details + * + */ +static int qid_path_fullmap(V9fsPDU *pdu, const struct stat *stbuf, + uint64_t *path) +{ + QpfEntry lookup = { + .dev = stbuf->st_dev, + .ino = stbuf->st_ino + }, *val; + uint32_t hash = qpf_hash(lookup); + VariLenAffix affix; + + val = qht_lookup(&pdu->s->qpf_table, &lookup, hash); + + if (!val) { + if (pdu->s->qp_fullpath_next == 0) { + /* no more files can be mapped :'( */ + error_report_once( + "9p: No more prefixes available for remapping inodes from " + "host to guest."
+ ); + return -ENFILE; + } + + val = g_malloc0(sizeof(QpfEntry)); + *val = lookup; + + /* new unique inode and device combo */ + affix = affixForIndex( + 1ULL << (sizeof(pdu->s->qp_affix_next) * 8) + ); + val->path = (pdu->s->qp_fullpath_next++ << affix.bits) | affix.value; + pdu->s->qp_fullpath_next &= ((1ULL << (64 - affix.bits)) - 1); + qht_insert(&pdu->s->qpf_table, val, hash, NULL); + } + + *path = val->path; + return 0; +} + +/** + * @brief Quick mapping host inode nr -> guest inode nr. + * + * This function performs quick remapping of an original file inode number + * on host to an appropriate different inode number on guest. This remapping + * of inodes is required to avoid inode nr collisions on guest which would + * happen if the 9p export contains more than 1 exported file system (or + * more than 1 file system data set), because unlike on host level where the + * files would have different device nrs, all files exported by 9p would + * share the same device nr on guest (the device nr of the virtual 9p device + * that is). + * + * Inode remapping is performed by chopping off high end bits of the original + * inode number from host, shifting the result upwards and then assigning a + * generated suffix number for the low end bits, where the same suffix number + * will be shared by all inodes with the same device id AND the same high end + * bits that have been chopped off. That approach utilizes the fact that inode + * numbers very likely share the same high end bits (i.e. due to their common + * sequential generation by file systems) and hence we only have to generate + * and track a very limited amount of suffixes in practice due to that. + * + * We generate variable size suffixes for that purpose. The 1st generated + * suffix will only have 1 bit and hence we only need to chop off 1 bit from + * the original inode number. The subsequent suffixes being generated will + * grow in (bit) size subsequently, i.e.
the 2nd and 3rd suffix being + * generated will have 3 bits and hence we have to chop off 3 bits from their + * original inodes, and so on. That approach of using variable length suffixes + * (i.e. over fixed size ones) utilizes the fact that in practice only a very + * limited amount of devices are shared by the same export (e.g. typically + * less than 2 dozen devices per 9p export), so in practice we need to chop + * off less bits than with fixed size prefixes and yet are flexible to add + * new devices at runtime below host's export directory at any time without + * having to reboot guest nor requiring to reconfigure guest for that. And due + * to the very limited amount of original high end bits that we chop off that + * way, the total amount of suffixes we need to generate is less than by using + * fixed size prefixes and hence it also improves performance of the inode + * remapping algorithm, and finally has the nice side effect that the inode + * numbers on guest will be much smaller & human friendly. 
;-) + */ +static int qid_path_suffixmap(V9fsPDU *pdu, const struct stat *stbuf, + uint64_t *path) +{ + const int ino_hash_bits = qid_inode_prefix_hash_bits(pdu, stbuf->st_dev); + QppEntry lookup = { + .dev = stbuf->st_dev, + .ino_prefix = (uint16_t) (stbuf->st_ino >> (64 - ino_hash_bits)) + }, *val; + uint32_t hash = qpp_hash(lookup); + + val = qht_lookup(&pdu->s->qpp_table, &lookup, hash); + + if (!val) { + if (pdu->s->qp_affix_next == 0) { + /* we ran out of affixes */ + warn_report_once( + "9p: Potential degraded performance of inode remapping" + ); + return -ENFILE; + } + + val = g_malloc0(sizeof(QppEntry)); + *val = lookup; + + /* new unique inode affix and device combo */ + val->qp_affix_index = pdu->s->qp_affix_next++; + val->qp_affix = affixForIndex(val->qp_affix_index); + qht_insert(&pdu->s->qpp_table, val, hash, NULL); + } + /* assuming generated affix to be suffix type, not prefix */ + *path = (stbuf->st_ino << val->qp_affix.bits) | val->qp_affix.value; + return 0; +} + +static int stat_to_qid(V9fsPDU *pdu, const struct stat *stbuf, V9fsQID *qidp) +{ + int err; size_t size; - memset(&qidp->path, 0, sizeof(qidp->path)); - size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path)); - memcpy(&qidp->path, &stbuf->st_ino, size); + if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) { + /* map inode+device to qid path (fast path) */ + err = qid_path_suffixmap(pdu, stbuf, &qidp->path); + if (err == -ENFILE) { + /* fast path didn't work, fall back to full map */ + err = qid_path_fullmap(pdu, stbuf, &qidp->path); + } + if (err) { + return err; + } + } else { + if (pdu->s->dev_id != stbuf->st_dev) { + if (pdu->s->ctx.export_flags & V9FS_FORBID_MULTIDEVS) { + error_report_once( + "9p: Multiple devices detected in same VirtFS export. " + "Access of guest to additional devices is (partly) " + "denied due to virtfs option 'multidevs=forbid' being " + "effective." 
+ ); + return -ENODEV; + } else { + warn_report_once( + "9p: Multiple devices detected in same VirtFS export, " + "which might lead to file ID collisions and severe " + "misbehaviours on guest! You should either use a " + "separate export for each device shared from host or " + "use virtfs option 'multidevs=remap'!" + ); + } + } + memset(&qidp->path, 0, sizeof(qidp->path)); + size = MIN(sizeof(stbuf->st_ino), sizeof(qidp->path)); + memcpy(&qidp->path, &stbuf->st_ino, size); + } + qidp->version = stbuf->st_mtime ^ (stbuf->st_size << 8); qidp->type = 0; if (S_ISDIR(stbuf->st_mode)) { @@ -588,6 +950,8 @@ static void stat_to_qid(const struct stat *stbuf, V9fsQID *qidp) if (S_ISLNK(stbuf->st_mode)) { qidp->type |= P9_QID_TYPE_SYMLINK; } + + return 0; } static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp, @@ -600,10 +964,37 @@ static int coroutine_fn fid_to_qid(V9fsPDU *pdu, V9fsFidState *fidp, if (err < 0) { return err; } - stat_to_qid(&stbuf, qidp); + err = stat_to_qid(pdu, &stbuf, qidp); + if (err < 0) { + return err; + } return 0; } +static int coroutine_fn dirent_to_qid(V9fsPDU *pdu, V9fsFidState *fidp, + struct dirent *dent, V9fsQID *qidp) +{ + struct stat stbuf; + V9fsPath path; + int err; + + v9fs_path_init(&path); + + err = v9fs_co_name_to_path(pdu, &fidp->path, dent->d_name, &path); + if (err < 0) { + goto out; + } + err = v9fs_co_lstat(pdu, &path, &stbuf); + if (err < 0) { + goto out; + } + err = stat_to_qid(pdu, &stbuf, qidp); + +out: + v9fs_path_free(&path); + return err; +} + V9fsPDU *pdu_alloc(V9fsState *s) { V9fsPDU *pdu = NULL; @@ -744,9 +1135,9 @@ static int donttouch_stat(V9fsStat *stat) { if (stat->type == -1 && stat->dev == -1 && - stat->qid.type == -1 && - stat->qid.version == -1 && - stat->qid.path == -1 && + stat->qid.type == 0xff && + stat->qid.version == (uint32_t) -1 && + stat->qid.path == (uint64_t) -1 && stat->mode == -1 && stat->atime == -1 && stat->mtime == -1 && @@ -831,7 +1222,10 @@ static int coroutine_fn 
stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path, memset(v9stat, 0, sizeof(*v9stat)); - stat_to_qid(stbuf, &v9stat->qid); + err = stat_to_qid(pdu, stbuf, &v9stat->qid); + if (err < 0) { + return err; + } v9stat->mode = stat_to_v9mode(stbuf); v9stat->atime = stbuf->st_atime; v9stat->mtime = stbuf->st_mtime; @@ -892,7 +1286,7 @@ static int coroutine_fn stat_to_v9stat(V9fsPDU *pdu, V9fsPath *path, #define P9_STATS_ALL 0x00003fffULL /* Mask for All fields above */ -static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf, +static int stat_to_v9stat_dotl(V9fsPDU *pdu, const struct stat *stbuf, V9fsStatDotl *v9lstat) { memset(v9lstat, 0, sizeof(*v9lstat)); @@ -914,7 +1308,7 @@ static void stat_to_v9stat_dotl(V9fsState *s, const struct stat *stbuf, /* Currently we only support BASIC fields in stat */ v9lstat->st_result_mask = P9_STATS_BASIC; - stat_to_qid(stbuf, &v9lstat->qid); + return stat_to_qid(pdu, stbuf, &v9lstat->qid); } static void print_sg(struct iovec *sg, int cnt) @@ -1116,7 +1510,6 @@ static void coroutine_fn v9fs_getattr(void *opaque) uint64_t request_mask; V9fsStatDotl v9stat_dotl; V9fsPDU *pdu = opaque; - V9fsState *s = pdu->s; retval = pdu_unmarshal(pdu, offset, "dq", &fid, &request_mask); if (retval < 0) { @@ -1137,7 +1530,10 @@ static void coroutine_fn v9fs_getattr(void *opaque) if (retval < 0) { goto out; } - stat_to_v9stat_dotl(s, &stbuf, &v9stat_dotl); + retval = stat_to_v9stat_dotl(pdu, &stbuf, &v9stat_dotl); + if (retval < 0) { + goto out; + } /* fill st_gen if requested and supported by underlying fs */ if (request_mask & P9_STATS_GEN) { @@ -1382,7 +1778,10 @@ static void coroutine_fn v9fs_walk(void *opaque) if (err < 0) { goto out; } - stat_to_qid(&stbuf, &qid); + err = stat_to_qid(pdu, &stbuf, &qid); + if (err < 0) { + goto out; + } v9fs_path_copy(&dpath, &path); } memcpy(&qids[name_idx], &qid, sizeof(qid)); @@ -1484,7 +1883,10 @@ static void coroutine_fn v9fs_open(void *opaque) if (err < 0) { goto out; } - stat_to_qid(&stbuf, &qid); + 
err = stat_to_qid(pdu, &stbuf, &qid); + if (err < 0) { + goto out; + } if (S_ISDIR(stbuf.st_mode)) { err = v9fs_co_opendir(pdu, fidp); if (err < 0) { @@ -1594,7 +1996,10 @@ static void coroutine_fn v9fs_lcreate(void *opaque) fidp->flags |= FID_NON_RECLAIMABLE; } iounit = get_iounit(pdu, &fidp->path); - stat_to_qid(&stbuf, &qid); + err = stat_to_qid(pdu, &stbuf, &qid); + if (err < 0) { + goto out; + } err = pdu_marshal(pdu, offset, "Qd", &qid, iounit); if (err < 0) { goto out; @@ -1938,16 +2343,39 @@ static int coroutine_fn v9fs_do_readdir(V9fsPDU *pdu, V9fsFidState *fidp, v9fs_string_free(&name); return count; } - /* - * Fill up just the path field of qid because the client uses - * only that. To fill the entire qid structure we will have - * to stat each dirent found, which is expensive - */ - size = MIN(sizeof(dent->d_ino), sizeof(qid.path)); - memcpy(&qid.path, &dent->d_ino, size); - /* Fill the other fields with dummy values */ - qid.type = 0; - qid.version = 0; + + if (pdu->s->ctx.export_flags & V9FS_REMAP_INODES) { + /* + * dirent_to_qid() implies expensive stat call for each entry, + * we must do that here though since inode remapping requires + * the device id, which in turn might be different for + * different entries; we cannot make any assumption to avoid + * that here. + */ + err = dirent_to_qid(pdu, fidp, dent, &qid); + if (err < 0) { + v9fs_readdir_unlock(&fidp->fs.dir); + v9fs_co_seekdir(pdu, fidp, saved_dir_pos); + v9fs_string_free(&name); + return err; + } + } else { + /* + * Fill up just the path field of qid because the client uses + * only that. To fill the entire qid structure we will have + * to stat each dirent found, which is expensive. For the + * latter reason we don't call dirent_to_qid() here. Only drawback + * is that no multi-device export detection of stat_to_qid() + * would be done and provided as error to the user here. But + * user would get that error anyway when accessing those + * files/dirs through other ways. 
+ */ + size = MIN(sizeof(dent->d_ino), sizeof(qid.path)); + memcpy(&qid.path, &dent->d_ino, size); + /* Fill the other fields with dummy values */ + qid.type = 0; + qid.version = 0; + } /* 11 = 7 + 4 (7 = start offset, 4 = space for storing count) */ len = pdu_marshal(pdu, 11 + count, "Qqbs", @@ -2328,7 +2756,10 @@ static void coroutine_fn v9fs_create(void *opaque) } } iounit = get_iounit(pdu, &fidp->path); - stat_to_qid(&stbuf, &qid); + err = stat_to_qid(pdu, &stbuf, &qid); + if (err < 0) { + goto out; + } err = pdu_marshal(pdu, offset, "Qd", &qid, iounit); if (err < 0) { goto out; @@ -2385,7 +2816,10 @@ static void coroutine_fn v9fs_symlink(void *opaque) if (err < 0) { goto out; } - stat_to_qid(&stbuf, &qid); + err = stat_to_qid(pdu, &stbuf, &qid); + if (err < 0) { + goto out; + } err = pdu_marshal(pdu, offset, "Q", &qid); if (err < 0) { goto out; @@ -3065,7 +3499,10 @@ static void coroutine_fn v9fs_mknod(void *opaque) if (err < 0) { goto out; } - stat_to_qid(&stbuf, &qid); + err = stat_to_qid(pdu, &stbuf, &qid); + if (err < 0) { + goto out; + } err = pdu_marshal(pdu, offset, "Q", &qid); if (err < 0) { goto out; @@ -3223,7 +3660,10 @@ static void coroutine_fn v9fs_mkdir(void *opaque) if (err < 0) { goto out; } - stat_to_qid(&stbuf, &qid); + err = stat_to_qid(pdu, &stbuf, &qid); + if (err < 0) { + goto out; + } err = pdu_marshal(pdu, offset, "Q", &qid); if (err < 0) { goto out; @@ -3634,31 +4074,43 @@ int v9fs_device_realize_common(V9fsState *s, const V9fsTransport *t, goto out; } + s->dev_id = stat.st_dev; + + /* init inode remapping : */ + /* hash table for variable length inode suffixes */ + qpd_table_init(&s->qpd_table); + /* hash table for slow/full inode remapping (most users won't need it) */ + qpf_table_init(&s->qpf_table); + /* hash table for quick inode remapping */ + qpp_table_init(&s->qpp_table); + s->qp_ndevices = 0; + s->qp_affix_next = 1; /* reserve 0 to detect overflow */ + s->qp_fullpath_next = 1; + s->ctx.fst = &fse->fst; 
fsdev_throttle_init(s->ctx.fst); - v9fs_path_free(&path); - rc = 0; out: if (rc) { - if (s->ops && s->ops->cleanup && s->ctx.private) { - s->ops->cleanup(&s->ctx); - } - g_free(s->tag); - g_free(s->ctx.fs_root); - v9fs_path_free(&path); + v9fs_device_unrealize_common(s, NULL); } + v9fs_path_free(&path); return rc; } void v9fs_device_unrealize_common(V9fsState *s, Error **errp) { - if (s->ops->cleanup) { + if (s->ops && s->ops->cleanup) { s->ops->cleanup(&s->ctx); } - fsdev_throttle_cleanup(s->ctx.fst); + if (s->ctx.fst) { + fsdev_throttle_cleanup(s->ctx.fst); + } g_free(s->tag); + qp_table_destroy(&s->qpd_table); + qp_table_destroy(&s->qpp_table); + qp_table_destroy(&s->qpf_table); g_free(s->ctx.fs_root); } diff --git a/hw/9pfs/9p.h b/hw/9pfs/9p.h index 8883761b2c..3904f82901 100644 --- a/hw/9pfs/9p.h +++ b/hw/9pfs/9p.h @@ -8,6 +8,7 @@ #include "fsdev/9p-iov-marshal.h" #include "qemu/thread.h" #include "qemu/coroutine.h" +#include "qemu/qht.h" enum { P9_TLERROR = 6, @@ -235,6 +236,58 @@ struct V9fsFidState V9fsFidState *rclm_lst; }; +typedef enum AffixType_t { + AffixType_Prefix, + AffixType_Suffix, /* A.k.a. postfix. */ +} AffixType_t; + +/** + * @brief Unique affix of variable length. + * + * An affix is (currently) either a suffix or a prefix, which is either + * going to be prepended (prefix) or appended (suffix) with some other + * number for the goal to generate unique numbers. Accordingly the + * suffixes (or prefixes) we generate @b must all have the mathematical + * property of being suffix-free (or prefix-free in case of prefixes) + * so that no matter what number we concatenate the affix with, that we + * always reliably get unique numbers as result after concatenation. + */ +typedef struct VariLenAffix { + AffixType_t type; /* Whether this affix is a suffix or a prefix. */ + uint64_t value; /* Actual numerical value of this affix. 
*/ + /* + * Length of the affix, that is how many (of the lowest) bits of @c value + * must be used for appending/prepending this affix to its final resulting, + * unique number. + */ + int bits; +} VariLenAffix; + +/* See qid_inode_prefix_hash_bits(). */ +typedef struct { + dev_t dev; /* FS device on host. */ + /* + * How many (high) bits of the original inode number shall be used for + * hashing. + */ + int prefix_bits; +} QpdEntry; + +/* QID path prefix entry, see stat_to_qid */ +typedef struct { + dev_t dev; + uint16_t ino_prefix; + uint32_t qp_affix_index; + VariLenAffix qp_affix; +} QppEntry; + +/* QID path full entry, as above */ +typedef struct { + dev_t dev; + ino_t ino; + uint64_t path; +} QpfEntry; + struct V9fsState { QLIST_HEAD(, V9fsPDU) free_list; @@ -256,6 +309,13 @@ struct V9fsState Error *migration_blocker; V9fsConf fsconf; V9fsQID root_qid; + dev_t dev_id; + struct qht qpd_table; + struct qht qpp_table; + struct qht qpf_table; + uint64_t qp_ndevices; /* Amount of entries in qpd_table.
*/ + uint16_t qp_affix_next; + uint64_t qp_fullpath_next; }; /* 9p2000.L open flags */ diff --git a/hw/9pfs/trace-events b/hw/9pfs/trace-events index c0a0a4ab5d..10188daf7f 100644 --- a/hw/9pfs/trace-events +++ b/hw/9pfs/trace-events @@ -6,7 +6,7 @@ v9fs_rerror(uint16_t tag, uint8_t id, int err) "tag %d id %d err %d" v9fs_version(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s" v9fs_version_return(uint16_t tag, uint8_t id, int32_t msize, char* version) "tag %d id %d msize %d version %s" v9fs_attach(uint16_t tag, uint8_t id, int32_t fid, int32_t afid, char* uname, char* aname) "tag %u id %u fid %d afid %d uname %s aname %s" -v9fs_attach_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d type %d version %d path %"PRId64 +v9fs_attach_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %u id %u type %u version %u path %"PRIu64 v9fs_stat(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d" v9fs_stat_return(uint16_t tag, uint8_t id, int32_t mode, int32_t atime, int32_t mtime, int64_t length) "tag %d id %d stat={mode %d atime %d mtime %d length %"PRId64"}" v9fs_getattr(uint16_t tag, uint8_t id, int32_t fid, uint64_t request_mask) "tag %d id %d fid %d request_mask %"PRIu64 @@ -14,9 +14,9 @@ v9fs_getattr_return(uint16_t tag, uint8_t id, uint64_t result_mask, uint32_t mod v9fs_walk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, uint16_t nwnames) "tag %d id %d fid %d newfid %d nwnames %d" v9fs_walk_return(uint16_t tag, uint8_t id, uint16_t nwnames, void* qids) "tag %d id %d nwnames %d qids %p" v9fs_open(uint16_t tag, uint8_t id, int32_t fid, int32_t mode) "tag %d id %d fid %d mode %d" -v9fs_open_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d" +v9fs_open_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) 
"tag %u id %u qid={type %u version %u path %"PRIu64"} iounit %d" v9fs_lcreate(uint16_t tag, uint8_t id, int32_t dfid, int32_t flags, int32_t mode, uint32_t gid) "tag %d id %d dfid %d flags %d mode %d gid %u" -v9fs_lcreate_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int32_t iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d" +v9fs_lcreate_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int32_t iounit) "tag %u id %u qid={type %u version %u path %"PRIu64"} iounit %d" v9fs_fsync(uint16_t tag, uint8_t id, int32_t fid, int datasync) "tag %d id %d fid %d datasync %d" v9fs_clunk(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d" v9fs_read(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t max_count) "tag %d id %d fid %d off %"PRIu64" max_count %u" @@ -26,21 +26,21 @@ v9fs_readdir_return(uint16_t tag, uint8_t id, uint32_t count, ssize_t retval) "t v9fs_write(uint16_t tag, uint8_t id, int32_t fid, uint64_t off, uint32_t count, int cnt) "tag %d id %d fid %d off %"PRIu64" count %u cnt %d" v9fs_write_return(uint16_t tag, uint8_t id, int32_t total, ssize_t err) "tag %d id %d total %d err %zd" v9fs_create(uint16_t tag, uint8_t id, int32_t fid, char* name, int32_t perm, int8_t mode) "tag %d id %d fid %d name %s perm %d mode %d" -v9fs_create_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int iounit) "tag %d id %d qid={type %d version %d path %"PRId64"} iounit %d" +v9fs_create_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int iounit) "tag %u id %u qid={type %u version %u path %"PRIu64"} iounit %d" v9fs_symlink(uint16_t tag, uint8_t id, int32_t fid, char* name, char* symname, uint32_t gid) "tag %d id %d fid %d name %s symname %s gid %u" -v9fs_symlink_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}" +v9fs_symlink_return(uint16_t tag, 
uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %u id %u qid={type %u version %u path %"PRIu64"}" v9fs_flush(uint16_t tag, uint8_t id, int16_t flush_tag) "tag %d id %d flush_tag %d" v9fs_link(uint16_t tag, uint8_t id, int32_t dfid, int32_t oldfid, char* name) "tag %d id %d dfid %d oldfid %d name %s" v9fs_remove(uint16_t tag, uint8_t id, int32_t fid) "tag %d id %d fid %d" v9fs_wstat(uint16_t tag, uint8_t id, int32_t fid, int32_t mode, int32_t atime, int32_t mtime) "tag %u id %u fid %d stat={mode %d atime %d mtime %d}" v9fs_mknod(uint16_t tag, uint8_t id, int32_t fid, int mode, int major, int minor) "tag %d id %d fid %d mode %d major %d minor %d" -v9fs_mknod_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path) "tag %d id %d qid={type %d version %d path %"PRId64"}" +v9fs_mknod_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path) "tag %u id %u qid={type %u version %u path %"PRIu64"}" v9fs_lock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length) "tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64 v9fs_lock_return(uint16_t tag, uint8_t id, int8_t status) "tag %d id %d status %d" v9fs_getlock(uint16_t tag, uint8_t id, int32_t fid, uint8_t type, uint64_t start, uint64_t length)"tag %d id %d fid %d type %d start %"PRIu64" length %"PRIu64 v9fs_getlock_return(uint16_t tag, uint8_t id, uint8_t type, uint64_t start, uint64_t length, uint32_t proc_id) "tag %d id %d type %d start %"PRIu64" length %"PRIu64" proc_id %u" v9fs_mkdir(uint16_t tag, uint8_t id, int32_t fid, char* name, int mode, uint32_t gid) "tag %u id %u fid %d name %s mode %d gid %u" -v9fs_mkdir_return(uint16_t tag, uint8_t id, int8_t type, int32_t version, int64_t path, int err) "tag %u id %u qid={type %d version %d path %"PRId64"} err %d" +v9fs_mkdir_return(uint16_t tag, uint8_t id, uint8_t type, uint32_t version, uint64_t path, int err) "tag %u id %u qid={type %u version %u path %"PRIu64"} err %d" 
v9fs_xattrwalk(uint16_t tag, uint8_t id, int32_t fid, int32_t newfid, char* name) "tag %d id %d fid %d newfid %d name %s" v9fs_xattrwalk_return(uint16_t tag, uint8_t id, int64_t size) "tag %d id %d size %"PRId64 v9fs_xattrcreate(uint16_t tag, uint8_t id, int32_t fid, char* name, uint64_t size, int flags) "tag %d id %d fid %d name %s size %"PRIu64" flags %d" diff --git a/qemu-options.hx b/qemu-options.hx index 2a04ca6ac5..793d70ff93 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1339,7 +1339,7 @@ ETEXI DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs, "-virtfs local,path=path,mount_tag=tag,security_model=mapped-xattr|mapped-file|passthrough|none\n" - " [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode]\n" + " [,id=id][,writeout=immediate][,readonly][,fmode=fmode][,dmode=dmode][,multidevs=remap|forbid|warn]\n" "-virtfs proxy,mount_tag=tag,socket=socket[,id=id][,writeout=immediate][,readonly]\n" "-virtfs proxy,mount_tag=tag,sock_fd=sock_fd[,id=id][,writeout=immediate][,readonly]\n" "-virtfs synth,mount_tag=tag[,id=id][,readonly]\n", @@ -1347,7 +1347,7 @@ DEF("virtfs", HAS_ARG, QEMU_OPTION_virtfs, STEXI -@item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}] +@item -virtfs local,path=@var{path},mount_tag=@var{mount_tag} ,security_model=@var{security_model}[,writeout=@var{writeout}][,readonly] [,fmode=@var{fmode}][,dmode=@var{dmode}][,multidevs=@var{multidevs}] @itemx -virtfs proxy,socket=@var{socket},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly] @itemx -virtfs proxy,sock_fd=@var{sock_fd},mount_tag=@var{mount_tag} [,writeout=@var{writeout}][,readonly] @itemx -virtfs synth,mount_tag=@var{mount_tag} @@ -1403,6 +1403,28 @@ Specifies the default mode for newly created directories on the host. Works only with security models "mapped-xattr" and "mapped-file". 
@item mount_tag=@var{mount_tag} Specifies the tag name to be used by the guest to mount this export point. +@item multidevs=@var{multidevs} +Specifies how to deal with multiple devices being shared with a 9p export. +Supported behaviours are "remap", "forbid" and "warn". The last one is +the default behaviour, in which virtfs 9p expects only one device to be +shared with the same export; if more than one device is shared and +accessed via the same 9p export then only a warning message is logged +(once) by qemu on the host side. To avoid file ID collisions on the guest +you should either create a separate virtfs export for each device to be +shared with guests (the recommended way) or use "remap" instead, which +allows you to share multiple devices with only one export. This is +achieved by remapping the original inode numbers from host to guest in a +way that prevents such collisions. Remapping inodes in such use cases +is required because the original device IDs from the host are never passed +to or exposed on the guest. Instead, all files of an export shared with +virtfs always share the same device ID on the guest. So two files with +identical inode numbers but actually from different devices on the host would +otherwise cause a file ID collision and hence potential misbehaviours on +the guest. "forbid", on the other hand, assumes like "warn" that only one +device is shared by the same export; however, it will not only log a +warning message but also deny access to additional devices on the guest. +Note though that "forbid" does not currently block all possible file access +operations (e.g. readdir() would still return entries from other devices).
@end table ETEXI diff --git a/vl.c b/vl.c index 002bf4919e..0a295e5d77 100644 --- a/vl.c +++ b/vl.c @@ -3335,7 +3335,8 @@ int main(int argc, char **argv, char **envp) case QEMU_OPTION_virtfs: { QemuOpts *fsdev; QemuOpts *device; - const char *writeout, *sock_fd, *socket, *path, *security_model; + const char *writeout, *sock_fd, *socket, *path, *security_model, + *multidevs; olist = qemu_find_opts("virtfs"); if (!olist) { @@ -3395,6 +3396,10 @@ int main(int argc, char **argv, char **envp) qemu_opt_set_bool(fsdev, "readonly", qemu_opt_get_bool(opts, "readonly", 0), &error_abort); + multidevs = qemu_opt_get(opts, "multidevs"); + if (multidevs) { + qemu_opt_set(fsdev, "multidevs", multidevs, &error_abort); + } device = qemu_opts_create(qemu_find_opts("device"), NULL, 0, &error_abort); qemu_opt_set(device, "driver", "virtio-9p-pci", &error_abort);