mirror of https://github.com/xemu-project/xemu.git
Pull request
This pull request contain's Sam Li's zoned storage support in the QEMU block layer and virtio-blk emulation. v2: - Sam fixed the CI failures. CI passes for me now. [Richard] -----BEGIN PGP SIGNATURE----- iQEzBAABCAAdFiEEhpWov9P5fNqsNXdanKSrs4Grc8gFAmRiWCgACgkQnKSrs4Gr c8h/7gf+MMm2cGEaf376t8HMwTc6wbXVfbmAlZrge2EXPZfFvEaxj7HClcEraOgV yJsGWeU6mOw4r68ICJ/4KhrY1cdv+VZym/LsMLMcFUTXFHnyX4pyU3am31FPOI4K +wrDYJOJhc4DkAESWGgEWiMKpuO/uUEgBmHdW+qPFCl77Yl/eP6H5uNP6nGFn55p QpS/l8iha7PDkc81EsrjA+e/YI0ubfNSP7+zZElhQ98354CQ0MCfmZ6h9bT+o2bu R7SBUj80e+2X0a1b9s/2Jz/x8l4TEsl8kr48/Q1usq3GVVkbjEgqsk6wTN13Q/4g CeIR7E61ZeYzmpb4tLFRIqK2Jw+NEQ== =Q8xW -----END PGP SIGNATURE----- Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into staging Pull request This pull request contain's Sam Li's zoned storage support in the QEMU block layer and virtio-blk emulation. v2: - Sam fixed the CI failures. CI passes for me now. [Richard] # -----BEGIN PGP SIGNATURE----- # # iQEzBAABCAAdFiEEhpWov9P5fNqsNXdanKSrs4Grc8gFAmRiWCgACgkQnKSrs4Gr # c8h/7gf+MMm2cGEaf376t8HMwTc6wbXVfbmAlZrge2EXPZfFvEaxj7HClcEraOgV # yJsGWeU6mOw4r68ICJ/4KhrY1cdv+VZym/LsMLMcFUTXFHnyX4pyU3am31FPOI4K # +wrDYJOJhc4DkAESWGgEWiMKpuO/uUEgBmHdW+qPFCl77Yl/eP6H5uNP6nGFn55p # QpS/l8iha7PDkc81EsrjA+e/YI0ubfNSP7+zZElhQ98354CQ0MCfmZ6h9bT+o2bu # R7SBUj80e+2X0a1b9s/2Jz/x8l4TEsl8kr48/Q1usq3GVVkbjEgqsk6wTN13Q/4g # CeIR7E61ZeYzmpb4tLFRIqK2Jw+NEQ== # =Q8xW # -----END PGP SIGNATURE----- # gpg: Signature made Mon 15 May 2023 09:04:56 AM PDT # gpg: using RSA key 8695A8BFD3F97CDAAC35775A9CA4ABB381AB73C8 # gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>" [full] # gpg: aka "Stefan Hajnoczi <stefanha@gmail.com>" [full] * tag 'block-pull-request' of https://gitlab.com/stefanha/qemu: docs/zoned-storage:add zoned emulation use case virtio-blk: add some trace events for zoned emulation block: add accounting for zone append operation virtio-blk: add zoned storage emulation for zoned devices block: add some trace events for zone 
append qemu-iotests: test zone append operation block: introduce zone append write for zoned devices file-posix: add tracking of the zone write pointers docs/zoned-storage: add zoned device documentation block: add some trace events for new block layer APIs iotests: test new zone operations block: add zoned BlockDriver check to block layer block/raw-format: add zone operations to pass through requests block/block-backend: add block layer APIs resembling Linux ZonedBlockDevice ioctls block/file-posix: introduce helper functions for sysfs attributes block/block-common: add zoned device structs Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
commit
ab4c44d657
19
block.c
19
block.c
|
@ -7982,6 +7982,25 @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
|
|||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Non-zoned block drivers do not follow zoned storage constraints
|
||||
* (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
|
||||
* drivers in a graph.
|
||||
*/
|
||||
if (!parent_bs->drv->supports_zoned_children &&
|
||||
child_bs->bl.zoned == BLK_Z_HM) {
|
||||
/*
|
||||
* The host-aware model allows zoned storage constraints and random
|
||||
* write. Allow mixing host-aware and non-zoned drivers. Using
|
||||
* host-aware device as a regular device.
|
||||
*/
|
||||
error_setg(errp, "Cannot add a %s child to a %s parent",
|
||||
child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
|
||||
parent_bs->drv->supports_zoned_children ?
|
||||
"support zoned children" : "not support zoned children");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!QLIST_EMPTY(&child_bs->parents)) {
|
||||
error_setg(errp, "The node %s already has a parent",
|
||||
child_bs->node_name);
|
||||
|
|
|
@ -1845,6 +1845,204 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Coroutine body used by blk_aio_zone_report(): runs the zone report
 * described by the AIOCB and then completes the AIOCB.
 */
static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    /* acb->bytes holds the caller's nr_zones pointer stored as an integer. */
    unsigned int *nr_zones = (unsigned int*)(uintptr_t)acb->bytes;

    acb->rwco.ret = blk_co_zone_report(acb->rwco.blk, acb->rwco.offset,
                                       nr_zones, acb->rwco.iobuf);
    blk_aio_complete(acb);
}
|
||||
|
||||
BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
|
||||
unsigned int *nr_zones,
|
||||
BlockZoneDescriptor *zones,
|
||||
BlockCompletionFunc *cb, void *opaque)
|
||||
{
|
||||
BlkAioEmAIOCB *acb;
|
||||
Coroutine *co;
|
||||
IO_CODE();
|
||||
|
||||
blk_inc_in_flight(blk);
|
||||
acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
|
||||
acb->rwco = (BlkRwCo) {
|
||||
.blk = blk,
|
||||
.offset = offset,
|
||||
.iobuf = zones,
|
||||
.ret = NOT_DONE,
|
||||
};
|
||||
acb->bytes = (int64_t)(uintptr_t)nr_zones,
|
||||
acb->has_returned = false;
|
||||
|
||||
co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
|
||||
aio_co_enter(blk_get_aio_context(blk), co);
|
||||
|
||||
acb->has_returned = true;
|
||||
if (acb->rwco.ret != NOT_DONE) {
|
||||
replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
|
||||
blk_aio_complete_bh, acb);
|
||||
}
|
||||
|
||||
return &acb->common;
|
||||
}
|
||||
|
||||
/*
 * Coroutine body used by blk_aio_zone_mgmt(): runs the zone management
 * operation described by the AIOCB and then completes the AIOCB.
 */
static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    /* The zone operation is carried in rwco.iobuf as a pointer-sized integer. */
    BlockZoneOp op = (BlockZoneOp)(uintptr_t)acb->rwco.iobuf;

    acb->rwco.ret = blk_co_zone_mgmt(acb->rwco.blk, op,
                                     acb->rwco.offset, acb->bytes);
    blk_aio_complete(acb);
}
|
||||
|
||||
BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
|
||||
int64_t offset, int64_t len,
|
||||
BlockCompletionFunc *cb, void *opaque) {
|
||||
BlkAioEmAIOCB *acb;
|
||||
Coroutine *co;
|
||||
IO_CODE();
|
||||
|
||||
blk_inc_in_flight(blk);
|
||||
acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
|
||||
acb->rwco = (BlkRwCo) {
|
||||
.blk = blk,
|
||||
.offset = offset,
|
||||
.iobuf = (void *)(uintptr_t)op,
|
||||
.ret = NOT_DONE,
|
||||
};
|
||||
acb->bytes = len;
|
||||
acb->has_returned = false;
|
||||
|
||||
co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
|
||||
aio_co_enter(blk_get_aio_context(blk), co);
|
||||
|
||||
acb->has_returned = true;
|
||||
if (acb->rwco.ret != NOT_DONE) {
|
||||
replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
|
||||
blk_aio_complete_bh, acb);
|
||||
}
|
||||
|
||||
return &acb->common;
|
||||
}
|
||||
|
||||
/*
 * Coroutine body used by blk_aio_zone_append(): runs the zone append
 * described by the AIOCB and then completes the AIOCB.
 */
static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    /* acb->bytes holds the caller's offset pointer stored as an integer. */
    int64_t *offset = (int64_t *)(uintptr_t)acb->bytes;

    acb->rwco.ret = blk_co_zone_append(acb->rwco.blk, offset,
                                       acb->rwco.iobuf, acb->rwco.flags);
    blk_aio_complete(acb);
}
|
||||
|
||||
BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
|
||||
QEMUIOVector *qiov, BdrvRequestFlags flags,
|
||||
BlockCompletionFunc *cb, void *opaque) {
|
||||
BlkAioEmAIOCB *acb;
|
||||
Coroutine *co;
|
||||
IO_CODE();
|
||||
|
||||
blk_inc_in_flight(blk);
|
||||
acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
|
||||
acb->rwco = (BlkRwCo) {
|
||||
.blk = blk,
|
||||
.ret = NOT_DONE,
|
||||
.flags = flags,
|
||||
.iobuf = qiov,
|
||||
};
|
||||
acb->bytes = (int64_t)(uintptr_t)offset;
|
||||
acb->has_returned = false;
|
||||
|
||||
co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
|
||||
aio_co_enter(blk_get_aio_context(blk), co);
|
||||
acb->has_returned = true;
|
||||
if (acb->rwco.ret != NOT_DONE) {
|
||||
replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
|
||||
blk_aio_complete_bh, acb);
|
||||
}
|
||||
|
||||
return &acb->common;
|
||||
}
|
||||
|
||||
/*
|
||||
* Send a zone_report command.
|
||||
* offset is a byte offset from the start of the device. No alignment
|
||||
* required for offset.
|
||||
* nr_zones represents IN maximum and OUT actual.
|
||||
*/
|
||||
int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
|
||||
unsigned int *nr_zones,
|
||||
BlockZoneDescriptor *zones)
|
||||
{
|
||||
int ret;
|
||||
IO_CODE();
|
||||
|
||||
blk_inc_in_flight(blk); /* increase before waiting */
|
||||
blk_wait_while_drained(blk);
|
||||
GRAPH_RDLOCK_GUARD();
|
||||
if (!blk_is_available(blk)) {
|
||||
blk_dec_in_flight(blk);
|
||||
return -ENOMEDIUM;
|
||||
}
|
||||
ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
|
||||
blk_dec_in_flight(blk);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Send a zone_management command.
|
||||
* op is the zone operation;
|
||||
* offset is the byte offset from the start of the zoned device;
|
||||
* len is the maximum number of bytes the command should operate on. It
|
||||
* should be aligned with the device zone size.
|
||||
*/
|
||||
int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
|
||||
int64_t offset, int64_t len)
|
||||
{
|
||||
int ret;
|
||||
IO_CODE();
|
||||
|
||||
blk_inc_in_flight(blk);
|
||||
blk_wait_while_drained(blk);
|
||||
GRAPH_RDLOCK_GUARD();
|
||||
|
||||
ret = blk_check_byte_request(blk, offset, len);
|
||||
if (ret < 0) {
|
||||
blk_dec_in_flight(blk);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
|
||||
blk_dec_in_flight(blk);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Send a zone_append command.
|
||||
*/
|
||||
int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
|
||||
QEMUIOVector *qiov, BdrvRequestFlags flags)
|
||||
{
|
||||
int ret;
|
||||
IO_CODE();
|
||||
|
||||
blk_inc_in_flight(blk);
|
||||
blk_wait_while_drained(blk);
|
||||
GRAPH_RDLOCK_GUARD();
|
||||
if (!blk_is_available(blk)) {
|
||||
blk_dec_in_flight(blk);
|
||||
return -ENOMEDIUM;
|
||||
}
|
||||
|
||||
ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
|
||||
blk_dec_in_flight(blk);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void blk_drain(BlockBackend *blk)
|
||||
{
|
||||
BlockDriverState *bs = blk_bs(blk);
|
||||
|
|
|
@ -68,6 +68,9 @@
|
|||
#include <sys/param.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/vfs.h>
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
#include <linux/blkzoned.h>
|
||||
#endif
|
||||
#include <linux/cdrom.h>
|
||||
#include <linux/fd.h>
|
||||
#include <linux/fs.h>
|
||||
|
@ -157,6 +160,7 @@ typedef struct BDRVRawState {
|
|||
bool has_write_zeroes:1;
|
||||
bool use_linux_aio:1;
|
||||
bool use_linux_io_uring:1;
|
||||
int64_t *offset; /* offset of zone append operation */
|
||||
int page_cache_inconsistent; /* errno from fdatasync failure */
|
||||
bool has_fallocate;
|
||||
bool needs_alignment;
|
||||
|
@ -216,6 +220,13 @@ typedef struct RawPosixAIOData {
|
|||
PreallocMode prealloc;
|
||||
Error **errp;
|
||||
} truncate;
|
||||
struct {
|
||||
unsigned int *nr_zones;
|
||||
BlockZoneDescriptor *zones;
|
||||
} zone_report;
|
||||
struct {
|
||||
unsigned long op;
|
||||
} zone_mgmt;
|
||||
};
|
||||
} RawPosixAIOData;
|
||||
|
||||
|
@ -766,6 +777,18 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
|
|||
goto fail;
|
||||
}
|
||||
}
|
||||
#ifdef CONFIG_BLKZONED
|
||||
/*
|
||||
* The kernel page cache does not reliably work for writes to SWR zones
|
||||
* of zoned block device because it can not guarantee the order of writes.
|
||||
*/
|
||||
if ((bs->bl.zoned != BLK_Z_NONE) &&
|
||||
(!(s->open_flags & O_DIRECT))) {
|
||||
error_setg(errp, "The driver supports zoned devices, and it requires "
|
||||
"cache.direct=on, which was not specified.");
|
||||
return -EINVAL; /* No host kernel page cache */
|
||||
}
|
||||
#endif
|
||||
|
||||
if (S_ISBLK(st.st_mode)) {
|
||||
#ifdef __linux__
|
||||
|
@ -1202,15 +1225,91 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
|
|||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Get a sysfs attribute value as character string.
|
||||
*/
|
||||
#ifdef CONFIG_LINUX
|
||||
static int get_sysfs_str_val(struct stat *st, const char *attribute,
|
||||
char **val) {
|
||||
g_autofree char *sysfspath = NULL;
|
||||
int ret;
|
||||
size_t len;
|
||||
|
||||
if (!S_ISBLK(st->st_mode)) {
|
||||
return -ENOTSUP;
|
||||
}
|
||||
|
||||
sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
|
||||
major(st->st_rdev), minor(st->st_rdev),
|
||||
attribute);
|
||||
ret = g_file_get_contents(sysfspath, val, &len, NULL);
|
||||
if (ret == -1) {
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
/* The file is ended with '\n' */
|
||||
char *p;
|
||||
p = *val;
|
||||
if (*(p + len - 1) == '\n') {
|
||||
*(p + len - 1) = '\0';
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
/*
 * Read the "zoned" sysfs queue attribute of the device described by @st
 * and translate it into a BlockZoneModel stored in *@zoned.
 *
 * Returns 0 on success, the error from get_sysfs_str_val() if the
 * attribute cannot be read, or -ENOTSUP for an unrecognised model string.
 */
static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
{
    static const struct {
        const char *name;
        BlockZoneModel model;
    } models[] = {
        { "host-managed", BLK_Z_HM },
        { "host-aware",   BLK_Z_HA },
        { "none",         BLK_Z_NONE },
    };
    g_autofree char *val = NULL;
    size_t k;
    int ret = get_sysfs_str_val(st, "zoned", &val);

    if (ret < 0) {
        return ret;
    }

    for (k = 0; k < sizeof(models) / sizeof(models[0]); k++) {
        if (strcmp(val, models[k].name) == 0) {
            *zoned = models[k].model;
            return 0;
        }
    }
    return -ENOTSUP;
}
|
||||
#endif /* defined(CONFIG_BLKZONED) */
|
||||
|
||||
/*
|
||||
* Get a sysfs attribute value as a long integer.
|
||||
*/
|
||||
#ifdef CONFIG_LINUX
|
||||
static long get_sysfs_long_val(struct stat *st, const char *attribute)
|
||||
{
|
||||
g_autofree char *str = NULL;
|
||||
const char *end;
|
||||
long val;
|
||||
int ret;
|
||||
|
||||
ret = get_sysfs_str_val(st, attribute, &str);
|
||||
if (ret < 0) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* The file is ended with '\n', pass 'end' to accept that. */
|
||||
ret = qemu_strtol(str, &end, 10, &val);
|
||||
if (ret == 0 && end && *end == '\0') {
|
||||
ret = val;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int hdev_get_max_segments(int fd, struct stat *st)
|
||||
{
|
||||
#ifdef CONFIG_LINUX
|
||||
char buf[32];
|
||||
const char *end;
|
||||
char *sysfspath = NULL;
|
||||
int ret;
|
||||
int sysfd = -1;
|
||||
long max_segments;
|
||||
|
||||
if (S_ISCHR(st->st_mode)) {
|
||||
if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
|
||||
|
@ -1218,44 +1317,176 @@ static int hdev_get_max_segments(int fd, struct stat *st)
|
|||
}
|
||||
return -ENOTSUP;
|
||||
}
|
||||
|
||||
if (!S_ISBLK(st->st_mode)) {
|
||||
return -ENOTSUP;
|
||||
}
|
||||
|
||||
sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
|
||||
major(st->st_rdev), minor(st->st_rdev));
|
||||
sysfd = open(sysfspath, O_RDONLY);
|
||||
if (sysfd == -1) {
|
||||
ret = -errno;
|
||||
goto out;
|
||||
}
|
||||
ret = RETRY_ON_EINTR(read(sysfd, buf, sizeof(buf) - 1));
|
||||
if (ret < 0) {
|
||||
ret = -errno;
|
||||
goto out;
|
||||
} else if (ret == 0) {
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
buf[ret] = 0;
|
||||
/* The file is ended with '\n', pass 'end' to accept that. */
|
||||
ret = qemu_strtol(buf, &end, 10, &max_segments);
|
||||
if (ret == 0 && end && *end == '\n') {
|
||||
ret = max_segments;
|
||||
}
|
||||
|
||||
out:
|
||||
if (sysfd != -1) {
|
||||
close(sysfd);
|
||||
}
|
||||
g_free(sysfspath);
|
||||
return ret;
|
||||
return get_sysfs_long_val(st, "max_segments");
|
||||
#else
|
||||
return -ENOTSUP;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
/*
|
||||
* If the reset_all flag is true, then the wps of zone whose state is
|
||||
* not readonly or offline should be all reset to the start sector.
|
||||
* Else, take the real wp of the device.
|
||||
*/
|
||||
static int get_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
|
||||
unsigned int nrz, bool reset_all)
|
||||
{
|
||||
struct blk_zone *blkz;
|
||||
size_t rep_size;
|
||||
uint64_t sector = offset >> BDRV_SECTOR_BITS;
|
||||
BlockZoneWps *wps = bs->wps;
|
||||
unsigned int j = offset / bs->bl.zone_size;
|
||||
unsigned int n = 0, i = 0;
|
||||
int ret;
|
||||
rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
|
||||
g_autofree struct blk_zone_report *rep = NULL;
|
||||
|
||||
rep = g_malloc(rep_size);
|
||||
blkz = (struct blk_zone *)(rep + 1);
|
||||
while (n < nrz) {
|
||||
memset(rep, 0, rep_size);
|
||||
rep->sector = sector;
|
||||
rep->nr_zones = nrz - n;
|
||||
|
||||
do {
|
||||
ret = ioctl(fd, BLKREPORTZONE, rep);
|
||||
} while (ret != 0 && errno == EINTR);
|
||||
if (ret != 0) {
|
||||
error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
|
||||
fd, offset, errno);
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (!rep->nr_zones) {
|
||||
break;
|
||||
}
|
||||
|
||||
for (i = 0; i < rep->nr_zones; ++i, ++n, ++j) {
|
||||
/*
|
||||
* The wp tracking cares only about sequential writes required and
|
||||
* sequential write preferred zones so that the wp can advance to
|
||||
* the right location.
|
||||
* Use the most significant bit of the wp location to indicate the
|
||||
* zone type: 0 for SWR/SWP zones and 1 for conventional zones.
|
||||
*/
|
||||
if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
|
||||
wps->wp[j] |= 1ULL << 63;
|
||||
} else {
|
||||
switch(blkz[i].cond) {
|
||||
case BLK_ZONE_COND_FULL:
|
||||
case BLK_ZONE_COND_READONLY:
|
||||
/* Zone not writable */
|
||||
wps->wp[j] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
|
||||
break;
|
||||
case BLK_ZONE_COND_OFFLINE:
|
||||
/* Zone not writable nor readable */
|
||||
wps->wp[j] = (blkz[i].start) << BDRV_SECTOR_BITS;
|
||||
break;
|
||||
default:
|
||||
if (reset_all) {
|
||||
wps->wp[j] = blkz[i].start << BDRV_SECTOR_BITS;
|
||||
} else {
|
||||
wps->wp[j] = blkz[i].wp << BDRV_SECTOR_BITS;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
sector = blkz[i - 1].start + blkz[i - 1].len;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Re-read the device write pointers for @nrz zones starting at byte
 * @offset (without resetting them) and log a message if that fails.
 */
static void update_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
                            unsigned int nrz)
{
    int rc = get_zones_wp(bs, fd, offset, nrz, false);

    if (rc >= 0) {
        return;
    }
    error_report("update zone wp failed");
}
|
||||
|
||||
static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
|
||||
Error **errp)
|
||||
{
|
||||
BDRVRawState *s = bs->opaque;
|
||||
BlockZoneModel zoned;
|
||||
int ret;
|
||||
|
||||
bs->bl.zoned = BLK_Z_NONE;
|
||||
|
||||
ret = get_sysfs_zoned_model(st, &zoned);
|
||||
if (ret < 0 || zoned == BLK_Z_NONE) {
|
||||
return;
|
||||
}
|
||||
bs->bl.zoned = zoned;
|
||||
|
||||
ret = get_sysfs_long_val(st, "max_open_zones");
|
||||
if (ret >= 0) {
|
||||
bs->bl.max_open_zones = ret;
|
||||
}
|
||||
|
||||
ret = get_sysfs_long_val(st, "max_active_zones");
|
||||
if (ret >= 0) {
|
||||
bs->bl.max_active_zones = ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* The zoned device must at least have zone size and nr_zones fields.
|
||||
*/
|
||||
ret = get_sysfs_long_val(st, "chunk_sectors");
|
||||
if (ret < 0) {
|
||||
error_setg_errno(errp, -ret, "Unable to read chunk_sectors "
|
||||
"sysfs attribute");
|
||||
return;
|
||||
} else if (!ret) {
|
||||
error_setg(errp, "Read 0 from chunk_sectors sysfs attribute");
|
||||
return;
|
||||
}
|
||||
bs->bl.zone_size = ret << BDRV_SECTOR_BITS;
|
||||
|
||||
ret = get_sysfs_long_val(st, "nr_zones");
|
||||
if (ret < 0) {
|
||||
error_setg_errno(errp, -ret, "Unable to read nr_zones "
|
||||
"sysfs attribute");
|
||||
return;
|
||||
} else if (!ret) {
|
||||
error_setg(errp, "Read 0 from nr_zones sysfs attribute");
|
||||
return;
|
||||
}
|
||||
bs->bl.nr_zones = ret;
|
||||
|
||||
ret = get_sysfs_long_val(st, "zone_append_max_bytes");
|
||||
if (ret > 0) {
|
||||
bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
|
||||
}
|
||||
|
||||
ret = get_sysfs_long_val(st, "physical_block_size");
|
||||
if (ret >= 0) {
|
||||
bs->bl.write_granularity = ret;
|
||||
}
|
||||
|
||||
/* The refresh_limits() function can be called multiple times. */
|
||||
g_free(bs->wps);
|
||||
bs->wps = g_malloc(sizeof(BlockZoneWps) +
|
||||
sizeof(int64_t) * bs->bl.nr_zones);
|
||||
ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0);
|
||||
if (ret < 0) {
|
||||
error_setg_errno(errp, -ret, "report wps failed");
|
||||
bs->wps = NULL;
|
||||
return;
|
||||
}
|
||||
qemu_co_mutex_init(&bs->wps->colock);
|
||||
}
|
||||
#else /* !defined(CONFIG_BLKZONED) */
|
||||
static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
|
||||
Error **errp)
|
||||
{
|
||||
bs->bl.zoned = BLK_Z_NONE;
|
||||
}
|
||||
#endif /* !defined(CONFIG_BLKZONED) */
|
||||
|
||||
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
|
||||
{
|
||||
BDRVRawState *s = bs->opaque;
|
||||
|
@ -1297,6 +1528,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
|
|||
bs->bl.max_hw_iov = ret;
|
||||
}
|
||||
}
|
||||
|
||||
raw_refresh_zoned_limits(bs, &st, errp);
|
||||
}
|
||||
|
||||
static int check_for_dasd(int fd)
|
||||
|
@ -1320,9 +1553,12 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
|
|||
BDRVRawState *s = bs->opaque;
|
||||
int ret;
|
||||
|
||||
/* If DASD, get blocksizes */
|
||||
/* If DASD or zoned devices, get blocksizes */
|
||||
if (check_for_dasd(s->fd) < 0) {
|
||||
return -ENOTSUP;
|
||||
/* zoned devices are not DASD */
|
||||
if (bs->bl.zoned == BLK_Z_NONE) {
|
||||
return -ENOTSUP;
|
||||
}
|
||||
}
|
||||
ret = probe_logical_blocksize(s->fd, &bsz->log);
|
||||
if (ret < 0) {
|
||||
|
@ -1463,7 +1699,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
|
|||
ssize_t len;
|
||||
|
||||
len = RETRY_ON_EINTR(
|
||||
(aiocb->aio_type & QEMU_AIO_WRITE) ?
|
||||
(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
|
||||
qemu_pwritev(aiocb->aio_fildes,
|
||||
aiocb->io.iov,
|
||||
aiocb->io.niov,
|
||||
|
@ -1492,7 +1728,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
|
|||
ssize_t len;
|
||||
|
||||
while (offset < aiocb->aio_nbytes) {
|
||||
if (aiocb->aio_type & QEMU_AIO_WRITE) {
|
||||
if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
|
||||
len = pwrite(aiocb->aio_fildes,
|
||||
(const char *)buf + offset,
|
||||
aiocb->aio_nbytes - offset,
|
||||
|
@ -1585,7 +1821,7 @@ static int handle_aiocb_rw(void *opaque)
|
|||
}
|
||||
|
||||
nbytes = handle_aiocb_rw_linear(aiocb, buf);
|
||||
if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
|
||||
if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
|
||||
char *p = buf;
|
||||
size_t count = aiocb->aio_nbytes, copy;
|
||||
int i;
|
||||
|
@ -1790,6 +2026,147 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
|
|||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* parse_zone - Fill a zone descriptor
|
||||
*/
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
static inline int parse_zone(struct BlockZoneDescriptor *zone,
|
||||
const struct blk_zone *blkz) {
|
||||
zone->start = blkz->start << BDRV_SECTOR_BITS;
|
||||
zone->length = blkz->len << BDRV_SECTOR_BITS;
|
||||
zone->wp = blkz->wp << BDRV_SECTOR_BITS;
|
||||
|
||||
#ifdef HAVE_BLK_ZONE_REP_CAPACITY
|
||||
zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
|
||||
#else
|
||||
zone->cap = blkz->len << BDRV_SECTOR_BITS;
|
||||
#endif
|
||||
|
||||
switch (blkz->type) {
|
||||
case BLK_ZONE_TYPE_SEQWRITE_REQ:
|
||||
zone->type = BLK_ZT_SWR;
|
||||
break;
|
||||
case BLK_ZONE_TYPE_SEQWRITE_PREF:
|
||||
zone->type = BLK_ZT_SWP;
|
||||
break;
|
||||
case BLK_ZONE_TYPE_CONVENTIONAL:
|
||||
zone->type = BLK_ZT_CONV;
|
||||
break;
|
||||
default:
|
||||
error_report("Unsupported zone type: 0x%x", blkz->type);
|
||||
return -ENOTSUP;
|
||||
}
|
||||
|
||||
switch (blkz->cond) {
|
||||
case BLK_ZONE_COND_NOT_WP:
|
||||
zone->state = BLK_ZS_NOT_WP;
|
||||
break;
|
||||
case BLK_ZONE_COND_EMPTY:
|
||||
zone->state = BLK_ZS_EMPTY;
|
||||
break;
|
||||
case BLK_ZONE_COND_IMP_OPEN:
|
||||
zone->state = BLK_ZS_IOPEN;
|
||||
break;
|
||||
case BLK_ZONE_COND_EXP_OPEN:
|
||||
zone->state = BLK_ZS_EOPEN;
|
||||
break;
|
||||
case BLK_ZONE_COND_CLOSED:
|
||||
zone->state = BLK_ZS_CLOSED;
|
||||
break;
|
||||
case BLK_ZONE_COND_READONLY:
|
||||
zone->state = BLK_ZS_RDONLY;
|
||||
break;
|
||||
case BLK_ZONE_COND_FULL:
|
||||
zone->state = BLK_ZS_FULL;
|
||||
break;
|
||||
case BLK_ZONE_COND_OFFLINE:
|
||||
zone->state = BLK_ZS_OFFLINE;
|
||||
break;
|
||||
default:
|
||||
error_report("Unsupported zone state: 0x%x", blkz->cond);
|
||||
return -ENOTSUP;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
static int handle_aiocb_zone_report(void *opaque)
|
||||
{
|
||||
RawPosixAIOData *aiocb = opaque;
|
||||
int fd = aiocb->aio_fildes;
|
||||
unsigned int *nr_zones = aiocb->zone_report.nr_zones;
|
||||
BlockZoneDescriptor *zones = aiocb->zone_report.zones;
|
||||
/* zoned block devices use 512-byte sectors */
|
||||
uint64_t sector = aiocb->aio_offset / 512;
|
||||
|
||||
struct blk_zone *blkz;
|
||||
size_t rep_size;
|
||||
unsigned int nrz;
|
||||
int ret;
|
||||
unsigned int n = 0, i = 0;
|
||||
|
||||
nrz = *nr_zones;
|
||||
rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
|
||||
g_autofree struct blk_zone_report *rep = NULL;
|
||||
rep = g_malloc(rep_size);
|
||||
|
||||
blkz = (struct blk_zone *)(rep + 1);
|
||||
while (n < nrz) {
|
||||
memset(rep, 0, rep_size);
|
||||
rep->sector = sector;
|
||||
rep->nr_zones = nrz - n;
|
||||
|
||||
do {
|
||||
ret = ioctl(fd, BLKREPORTZONE, rep);
|
||||
} while (ret != 0 && errno == EINTR);
|
||||
if (ret != 0) {
|
||||
error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
|
||||
fd, sector, errno);
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (!rep->nr_zones) {
|
||||
break;
|
||||
}
|
||||
|
||||
for (i = 0; i < rep->nr_zones; i++, n++) {
|
||||
ret = parse_zone(&zones[n], &blkz[i]);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* The next report should start after the last zone reported */
|
||||
sector = blkz[i].start + blkz[i].len;
|
||||
}
|
||||
}
|
||||
|
||||
*nr_zones = n;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
static int handle_aiocb_zone_mgmt(void *opaque)
|
||||
{
|
||||
RawPosixAIOData *aiocb = opaque;
|
||||
int fd = aiocb->aio_fildes;
|
||||
uint64_t sector = aiocb->aio_offset / 512;
|
||||
int64_t nr_sectors = aiocb->aio_nbytes / 512;
|
||||
struct blk_zone_range range;
|
||||
int ret;
|
||||
|
||||
/* Execute the operation */
|
||||
range.sector = sector;
|
||||
range.nr_sectors = nr_sectors;
|
||||
do {
|
||||
ret = ioctl(fd, aiocb->zone_mgmt.op, &range);
|
||||
} while (ret != 0 && errno == EINTR);
|
||||
|
||||
return ret < 0 ? -errno : ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int handle_aiocb_copy_range(void *opaque)
|
||||
{
|
||||
RawPosixAIOData *aiocb = opaque;
|
||||
|
@ -2072,9 +2449,19 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
|
|||
{
|
||||
BDRVRawState *s = bs->opaque;
|
||||
RawPosixAIOData acb;
|
||||
int ret;
|
||||
|
||||
if (fd_open(bs) < 0)
|
||||
return -EIO;
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && bs->wps) {
|
||||
qemu_co_mutex_lock(&bs->wps->colock);
|
||||
if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) {
|
||||
int index = offset / bs->bl.zone_size;
|
||||
offset = bs->wps->wp[index];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When using O_DIRECT, the request must be aligned to be able to use
|
||||
|
@ -2087,12 +2474,15 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
|
|||
#ifdef CONFIG_LINUX_IO_URING
|
||||
} else if (s->use_linux_io_uring) {
|
||||
assert(qiov->size == bytes);
|
||||
return luring_co_submit(bs, s->fd, offset, qiov, type);
|
||||
ret = luring_co_submit(bs, s->fd, offset, qiov, type);
|
||||
goto out;
|
||||
#endif
|
||||
#ifdef CONFIG_LINUX_AIO
|
||||
} else if (s->use_linux_aio) {
|
||||
assert(qiov->size == bytes);
|
||||
return laio_co_submit(s->fd, offset, qiov, type, s->aio_max_batch);
|
||||
ret = laio_co_submit(s->fd, offset, qiov, type,
|
||||
s->aio_max_batch);
|
||||
goto out;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -2109,7 +2499,41 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
|
|||
};
|
||||
|
||||
assert(qiov->size == bytes);
|
||||
return raw_thread_pool_submit(handle_aiocb_rw, &acb);
|
||||
ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
|
||||
goto out; /* Avoid the compiler err of unused label */
|
||||
|
||||
out:
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
{
|
||||
BlockZoneWps *wps = bs->wps;
|
||||
if (ret == 0) {
|
||||
if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
|
||||
&& wps && bs->bl.zone_size) {
|
||||
uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
|
||||
if (!BDRV_ZT_IS_CONV(*wp)) {
|
||||
if (type & QEMU_AIO_ZONE_APPEND) {
|
||||
*s->offset = *wp;
|
||||
trace_zbd_zone_append_complete(bs, *s->offset
|
||||
>> BDRV_SECTOR_BITS);
|
||||
}
|
||||
/* Advance the wp if needed */
|
||||
if (offset + bytes > *wp) {
|
||||
*wp = offset + bytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
|
||||
update_zones_wp(bs, s->fd, 0, 1);
|
||||
}
|
||||
}
|
||||
|
||||
if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && wps) {
|
||||
qemu_co_mutex_unlock(&wps->colock);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
|
||||
|
@ -2212,6 +2636,9 @@ static void raw_close(BlockDriverState *bs)
|
|||
BDRVRawState *s = bs->opaque;
|
||||
|
||||
if (s->fd >= 0) {
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
g_free(bs->wps);
|
||||
#endif
|
||||
qemu_close(s->fd);
|
||||
s->fd = -1;
|
||||
}
|
||||
|
@ -2969,6 +3396,171 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* zone report - Get a zone block device's information in the form
|
||||
* of an array of zone descriptors.
|
||||
* zones is an array of zone descriptors to hold zone information on reply;
|
||||
* offset can be any byte within the entire size of the device;
|
||||
* nr_zones is the maximum number of zones to report on input; on return it
* holds the number of zones actually reported.
|
||||
*/
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
/*
 * Build a QEMU_AIO_ZONE_REPORT request for @bs, emit the zone report
 * trace event and hand the request to handle_aiocb_zone_report() through
 * raw_thread_pool_submit().
 *
 * Returns whatever raw_thread_pool_submit() returns.
 */
static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
                                           unsigned int *nr_zones,
                                           BlockZoneDescriptor *zones)
{
    BDRVRawState *s = bs->opaque;
    RawPosixAIOData acb = {
        .bs             = bs,
        .aio_fildes     = s->fd,
        .aio_type       = QEMU_AIO_ZONE_REPORT,
        .aio_offset     = offset,
        .zone_report    = {
            .nr_zones       = nr_zones,
            .zones          = zones,
        },
    };

    /* The trace event takes the offset in sectors. */
    trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS);
    return raw_thread_pool_submit(handle_aiocb_zone_report, &acb);
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* zone management operations - Execute an operation on a zone
|
||||
*/
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
/*
 * Run a zone management operation (open, close, finish or reset) on the
 * byte range [offset, offset + len) of @bs and keep the cached write
 * pointers in bs->wps in sync with the result.
 *
 * @offset must be aligned to the zone size. @len must be a multiple of the
 * zone size unless the range ends exactly at the end of the device.
 *
 * Returns 0 on success, -EINVAL for a misaligned or out-of-range request,
 * -EIO when the first zone is conventional (unless the whole device is
 * targeted), -ENOTSUP for an unknown @op, or the non-zero result of the
 * submitted zone management job.
 */
static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
                                         int64_t offset, int64_t len)
{
    BDRVRawState *s = bs->opaque;
    RawPosixAIOData acb;
    int64_t zone_size, zone_size_mask;
    const char *op_name;
    unsigned long zo;
    int ret;
    BlockZoneWps *wps = bs->wps;
    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;

    zone_size = bs->bl.zone_size;
    zone_size_mask = zone_size - 1;
    if (offset & zone_size_mask) {
        error_report("sector offset %" PRId64 " is not aligned to zone size "
                     "%" PRId64 "", offset / 512, zone_size / 512);
        return -EINVAL;
    }

    if (((offset + len) < capacity && (len & zone_size_mask)) ||
        offset + len > capacity) {
        error_report("number of sectors %" PRId64 " is not aligned to zone size"
                     " %" PRId64 "", len / 512, zone_size / 512);
        return -EINVAL;
    }

    /* Index of the first affected zone and number of whole zones covered. */
    uint32_t i = offset / bs->bl.zone_size;
    uint32_t nrz = len / bs->bl.zone_size;
    uint64_t *wp = &wps->wp[i];
    if (BDRV_ZT_IS_CONV(*wp) && len != capacity) {
        error_report("zone mgmt operations are not allowed for conventional zones");
        return -EIO;
    }

    /* Translate the block layer operation into the matching ioctl request. */
    switch (op) {
    case BLK_ZO_OPEN:
        op_name = "BLKOPENZONE";
        zo = BLKOPENZONE;
        break;
    case BLK_ZO_CLOSE:
        op_name = "BLKCLOSEZONE";
        zo = BLKCLOSEZONE;
        break;
    case BLK_ZO_FINISH:
        op_name = "BLKFINISHZONE";
        zo = BLKFINISHZONE;
        break;
    case BLK_ZO_RESET:
        op_name = "BLKRESETZONE";
        zo = BLKRESETZONE;
        break;
    default:
        error_report("Unsupported zone op: 0x%x", op);
        return -ENOTSUP;
    }

    acb = (RawPosixAIOData) {
        .bs             = bs,
        .aio_fildes     = s->fd,
        .aio_type       = QEMU_AIO_ZONE_MGMT,
        .aio_offset     = offset,
        .aio_nbytes     = len,
        .zone_mgmt  = {
            .op = zo,
        },
    };

    trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS,
                        len >> BDRV_SECTOR_BITS);
    ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
    if (ret != 0) {
        /*
         * The operation failed, so the cached write pointers of the targeted
         * zones may be stale. Refresh exactly those zones: the last argument
         * is a zone count (nrz), not the index of the first zone.
         */
        update_zones_wp(bs, s->fd, offset, nrz);
        error_report("ioctl %s failed %d", op_name, ret);
        return ret;
    }

    if (zo == BLKRESETZONE && len == capacity) {
        /* Whole-device reset: re-read every write pointer from the device. */
        ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 1);
        if (ret < 0) {
            error_report("reporting single wp failed");
            return ret;
        }
    } else if (zo == BLKRESETZONE) {
        /* A reset zone has its write pointer back at the zone start. */
        for (unsigned int j = 0; j < nrz; ++j) {
            wp[j] = offset + j * zone_size;
        }
    } else if (zo == BLKFINISHZONE) {
        for (unsigned int j = 0; j < nrz; ++j) {
            /*
             * The zoned device allows the last zone to be smaller than the
             * zone size, so never move a write pointer past the range end.
             */
            wp[j] = MIN(offset + (j + 1) * zone_size, offset + len);
        }
    }

    return ret;
}
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
                                           int64_t *offset,
                                           QEMUIOVector *qiov,
                                           BdrvRequestFlags flags)
{
    BDRVRawState *state = bs->opaque;
    int64_t zone_mask = bs->bl.zone_size - 1;
    int64_t granularity;
    int64_t granularity_mask;
    int64_t total_len = 0;
    int idx;

    assert(flags == 0);

    /* Remember where the caller wants the resulting position reported. */
    state->offset = offset;

    /* An append must be addressed to the first byte of a zone. */
    if (*offset & zone_mask) {
        error_report("sector offset %" PRId64 " is not aligned to zone size "
                     "%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
        return -EINVAL;
    }

    /* Every vector element must be a multiple of the write granularity. */
    granularity = bs->bl.write_granularity;
    granularity_mask = granularity - 1;
    for (idx = 0; idx < qiov->niov; idx++) {
        int64_t seg_len = qiov->iov[idx].iov_len;

        if (seg_len & granularity_mask) {
            error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
                         "block size %" PRId64 "", idx, seg_len, granularity);
            return -EINVAL;
        }
        total_len += seg_len;
    }

    trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);

    return raw_co_prw(bs, *offset, total_len, qiov, QEMU_AIO_ZONE_APPEND);
}
|
||||
#endif
|
||||
|
||||
static coroutine_fn int
|
||||
raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
|
||||
bool blkdev)
|
||||
|
@ -3724,6 +4316,14 @@ static BlockDriver bdrv_host_device = {
|
|||
#ifdef __linux__
|
||||
.bdrv_co_ioctl = hdev_co_ioctl,
|
||||
#endif
|
||||
|
||||
/* zoned device */
|
||||
#if defined(CONFIG_BLKZONED)
|
||||
/* zone management operations */
|
||||
.bdrv_co_zone_report = raw_co_zone_report,
|
||||
.bdrv_co_zone_mgmt = raw_co_zone_mgmt,
|
||||
.bdrv_co_zone_append = raw_co_zone_append,
|
||||
#endif
|
||||
};
|
||||
|
||||
#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
|
||||
|
|
68
block/io.c
68
block/io.c
|
@ -3113,6 +3113,74 @@ out:
|
|||
return co.ret;
|
||||
}
|
||||
|
||||
/*
 * Forward a zone report request to the driver of @bs.
 *
 * Returns -ENOTSUP when @bs has no driver, the driver has no zone report
 * callback, or the node is not zoned; otherwise the driver's return value.
 * The request is counted as in flight for the duration of the call.
 */
int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
                                     unsigned int *nr_zones,
                                     BlockZoneDescriptor *zones)
{
    BlockDriver *driver = bs->drv;
    CoroutineIOCompletion completion = {
        .coroutine = qemu_coroutine_self(),
    };
    IO_CODE();

    bdrv_inc_in_flight(bs);
    if (driver && driver->bdrv_co_zone_report && bs->bl.zoned != BLK_Z_NONE) {
        completion.ret = driver->bdrv_co_zone_report(bs, offset, nr_zones,
                                                     zones);
    } else {
        completion.ret = -ENOTSUP;
    }
    bdrv_dec_in_flight(bs);

    return completion.ret;
}
|
||||
|
||||
/*
 * Forward a zone management operation @op on [offset, offset + len) to the
 * driver of @bs.
 *
 * Returns -ENOTSUP when @bs has no driver, the driver has no zone management
 * callback, or the node is not zoned; otherwise the driver's return value.
 * The request is counted as in flight for the duration of the call.
 */
int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
                                   int64_t offset, int64_t len)
{
    BlockDriver *driver = bs->drv;
    CoroutineIOCompletion completion = {
        .coroutine = qemu_coroutine_self(),
    };
    IO_CODE();

    bdrv_inc_in_flight(bs);
    if (driver && driver->bdrv_co_zone_mgmt && bs->bl.zoned != BLK_Z_NONE) {
        completion.ret = driver->bdrv_co_zone_mgmt(bs, op, offset, len);
    } else {
        completion.ret = -ENOTSUP;
    }
    bdrv_dec_in_flight(bs);

    return completion.ret;
}
|
||||
|
||||
/*
 * Forward a zone append of @qiov at *@offset to the driver of @bs.
 *
 * The request is validated with bdrv_check_qiov_request() first; a negative
 * result from that check is returned immediately, before the request is
 * counted as in flight. Returns -ENOTSUP when @bs has no driver, the driver
 * has no zone append callback, or the node is not zoned; otherwise the
 * driver's return value.
 */
int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
                                     QEMUIOVector *qiov,
                                     BdrvRequestFlags flags)
{
    BlockDriver *driver = bs->drv;
    CoroutineIOCompletion completion = {
        .coroutine = qemu_coroutine_self(),
    };
    int check;
    IO_CODE();

    check = bdrv_check_qiov_request(*offset, qiov->size, qiov, 0, NULL);
    if (check < 0) {
        return check;
    }

    bdrv_inc_in_flight(bs);
    if (driver && driver->bdrv_co_zone_append && bs->bl.zoned != BLK_Z_NONE) {
        completion.ret = driver->bdrv_co_zone_append(bs, offset, qiov, flags);
    } else {
        completion.ret = -ENOTSUP;
    }
    bdrv_dec_in_flight(bs);

    return completion.ret;
}
|
||||
|
||||
void *qemu_blockalign(BlockDriverState *bs, size_t size)
|
||||
{
|
||||
IO_CODE();
|
||||
|
|
|
@ -350,6 +350,10 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
|
|||
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
|
||||
luringcb->qiov->niov, offset);
|
||||
break;
|
||||
case QEMU_AIO_ZONE_APPEND:
|
||||
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
|
||||
luringcb->qiov->niov, offset);
|
||||
break;
|
||||
case QEMU_AIO_READ:
|
||||
io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
|
||||
luringcb->qiov->niov, offset);
|
||||
|
|
|
@ -394,6 +394,9 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
|
|||
case QEMU_AIO_WRITE:
|
||||
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
|
||||
break;
|
||||
case QEMU_AIO_ZONE_APPEND:
|
||||
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
|
||||
break;
|
||||
case QEMU_AIO_READ:
|
||||
io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
|
||||
break;
|
||||
|
|
|
@ -517,6 +517,7 @@ void qmp_block_latency_histogram_set(
|
|||
bool has_boundaries, uint64List *boundaries,
|
||||
bool has_boundaries_read, uint64List *boundaries_read,
|
||||
bool has_boundaries_write, uint64List *boundaries_write,
|
||||
bool has_boundaries_append, uint64List *boundaries_append,
|
||||
bool has_boundaries_flush, uint64List *boundaries_flush,
|
||||
Error **errp)
|
||||
{
|
||||
|
@ -557,6 +558,16 @@ void qmp_block_latency_histogram_set(
|
|||
}
|
||||
}
|
||||
|
||||
if (has_boundaries || has_boundaries_append) {
|
||||
ret = block_latency_histogram_set(
|
||||
stats, BLOCK_ACCT_ZONE_APPEND,
|
||||
has_boundaries_append ? boundaries_append : boundaries);
|
||||
if (ret) {
|
||||
error_setg(errp, "Device '%s' set append write boundaries fail", id);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (has_boundaries || has_boundaries_flush) {
|
||||
ret = block_latency_histogram_set(
|
||||
stats, BLOCK_ACCT_FLUSH,
|
||||
|
|
18
block/qapi.c
18
block/qapi.c
|
@ -533,27 +533,36 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
|
|||
|
||||
ds->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
|
||||
ds->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
|
||||
ds->zone_append_bytes = stats->nr_bytes[BLOCK_ACCT_ZONE_APPEND];
|
||||
ds->unmap_bytes = stats->nr_bytes[BLOCK_ACCT_UNMAP];
|
||||
ds->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
|
||||
ds->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
|
||||
ds->zone_append_operations = stats->nr_ops[BLOCK_ACCT_ZONE_APPEND];
|
||||
ds->unmap_operations = stats->nr_ops[BLOCK_ACCT_UNMAP];
|
||||
|
||||
ds->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
|
||||
ds->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
|
||||
ds->failed_zone_append_operations =
|
||||
stats->failed_ops[BLOCK_ACCT_ZONE_APPEND];
|
||||
ds->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
|
||||
ds->failed_unmap_operations = stats->failed_ops[BLOCK_ACCT_UNMAP];
|
||||
|
||||
ds->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
|
||||
ds->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
|
||||
ds->invalid_zone_append_operations =
|
||||
stats->invalid_ops[BLOCK_ACCT_ZONE_APPEND];
|
||||
ds->invalid_flush_operations =
|
||||
stats->invalid_ops[BLOCK_ACCT_FLUSH];
|
||||
ds->invalid_unmap_operations = stats->invalid_ops[BLOCK_ACCT_UNMAP];
|
||||
|
||||
ds->rd_merged = stats->merged[BLOCK_ACCT_READ];
|
||||
ds->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
|
||||
ds->zone_append_merged = stats->merged[BLOCK_ACCT_ZONE_APPEND];
|
||||
ds->unmap_merged = stats->merged[BLOCK_ACCT_UNMAP];
|
||||
ds->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
|
||||
ds->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
|
||||
ds->zone_append_total_time_ns =
|
||||
stats->total_time_ns[BLOCK_ACCT_ZONE_APPEND];
|
||||
ds->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
|
||||
ds->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
|
||||
ds->unmap_total_time_ns = stats->total_time_ns[BLOCK_ACCT_UNMAP];
|
||||
|
@ -571,6 +580,7 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
|
|||
|
||||
TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
|
||||
TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
|
||||
TimedAverage *zap = &ts->latency[BLOCK_ACCT_ZONE_APPEND];
|
||||
TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];
|
||||
|
||||
dev_stats->interval_length = ts->interval_length;
|
||||
|
@ -583,6 +593,10 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
|
|||
dev_stats->max_wr_latency_ns = timed_average_max(wr);
|
||||
dev_stats->avg_wr_latency_ns = timed_average_avg(wr);
|
||||
|
||||
dev_stats->min_zone_append_latency_ns = timed_average_min(zap);
|
||||
dev_stats->max_zone_append_latency_ns = timed_average_max(zap);
|
||||
dev_stats->avg_zone_append_latency_ns = timed_average_avg(zap);
|
||||
|
||||
dev_stats->min_flush_latency_ns = timed_average_min(fl);
|
||||
dev_stats->max_flush_latency_ns = timed_average_max(fl);
|
||||
dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
|
||||
|
@ -591,6 +605,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
|
|||
block_acct_queue_depth(ts, BLOCK_ACCT_READ);
|
||||
dev_stats->avg_wr_queue_depth =
|
||||
block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
|
||||
dev_stats->avg_zone_append_queue_depth =
|
||||
block_acct_queue_depth(ts, BLOCK_ACCT_ZONE_APPEND);
|
||||
|
||||
QAPI_LIST_PREPEND(ds->timed_stats, dev_stats);
|
||||
}
|
||||
|
@ -600,6 +616,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
|
|||
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_READ]);
|
||||
ds->wr_latency_histogram
|
||||
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_WRITE]);
|
||||
ds->zone_append_latency_histogram
|
||||
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_ZONE_APPEND]);
|
||||
ds->flush_latency_histogram
|
||||
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_FLUSH]);
|
||||
}
|
||||
|
|
|
@ -317,6 +317,28 @@ raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
|
|||
return bdrv_co_pdiscard(bs->file, offset, bytes);
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
raw_co_zone_report(BlockDriverState *bs, int64_t offset,
|
||||
unsigned int *nr_zones,
|
||||
BlockZoneDescriptor *zones)
|
||||
{
|
||||
return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
|
||||
int64_t offset, int64_t len)
|
||||
{
|
||||
return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
|
||||
}
|
||||
|
||||
static int coroutine_fn GRAPH_RDLOCK
|
||||
raw_co_zone_append(BlockDriverState *bs,int64_t *offset, QEMUIOVector *qiov,
|
||||
BdrvRequestFlags flags)
|
||||
{
|
||||
return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
|
||||
}
|
||||
|
||||
static int64_t coroutine_fn GRAPH_RDLOCK
|
||||
raw_co_getlength(BlockDriverState *bs)
|
||||
{
|
||||
|
@ -608,6 +630,7 @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild *c,
|
|||
BlockDriver bdrv_raw = {
|
||||
.format_name = "raw",
|
||||
.instance_size = sizeof(BDRVRawState),
|
||||
.supports_zoned_children = true,
|
||||
.bdrv_probe = &raw_probe,
|
||||
.bdrv_reopen_prepare = &raw_reopen_prepare,
|
||||
.bdrv_reopen_commit = &raw_reopen_commit,
|
||||
|
@ -619,6 +642,9 @@ BlockDriver bdrv_raw = {
|
|||
.bdrv_co_pwritev = &raw_co_pwritev,
|
||||
.bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
|
||||
.bdrv_co_pdiscard = &raw_co_pdiscard,
|
||||
.bdrv_co_zone_report = &raw_co_zone_report,
|
||||
.bdrv_co_zone_mgmt = &raw_co_zone_mgmt,
|
||||
.bdrv_co_zone_append = &raw_co_zone_append,
|
||||
.bdrv_co_block_status = &raw_co_block_status,
|
||||
.bdrv_co_copy_range_from = &raw_co_copy_range_from,
|
||||
.bdrv_co_copy_range_to = &raw_co_copy_range_to,
|
||||
|
|
|
@ -209,6 +209,10 @@ file_FindEjectableOpticalMedia(const char *media) "Matching using %s"
|
|||
file_setup_cdrom(const char *partition) "Using %s as optical disc"
|
||||
file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d"
|
||||
file_flush_fdatasync_failed(int err) "errno %d"
|
||||
zbd_zone_report(void *bs, unsigned int nr_zones, int64_t sector) "bs %p report %d zones starting at sector offset 0x%" PRIx64 ""
|
||||
zbd_zone_mgmt(void *bs, const char *op_name, int64_t sector, int64_t len) "bs %p %s starts at sector offset 0x%" PRIx64 " over a range of 0x%" PRIx64 " sectors"
|
||||
zbd_zone_append(void *bs, int64_t sector) "bs %p append at sector offset 0x%" PRIx64 ""
|
||||
zbd_zone_append_complete(void *bs, int64_t sector) "bs %p returns append sector 0x%" PRIx64 ""
|
||||
|
||||
# ssh.c
|
||||
sftp_error(const char *op, const char *ssh_err, int ssh_err_code, int sftp_err_code) "%s failed: %s (libssh error code: %d, sftp error code: %d)"
|
||||
|
|
|
@ -12,3 +12,4 @@ generated from in-code annotations to function prototypes.
|
|||
memory
|
||||
modules
|
||||
ui
|
||||
zoned-storage
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
=============
|
||||
zoned-storage
|
||||
=============
|
||||
|
||||
Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones
|
||||
that are larger than the LBA size. They can only allow sequential writes, which
|
||||
can reduce write amplification in SSDs, and potentially lead to higher
|
||||
throughput and increased capacity. More details about ZBDs can be found at:
|
||||
|
||||
https://zonedstorage.io/docs/introduction/zoned-storage
|
||||
|
||||
1. Block layer APIs for zoned storage
|
||||
-------------------------------------
|
||||
QEMU block layer supports three zoned storage models:
|
||||
- BLK_Z_HM: The host-managed zoned model only allows sequential write access
|
||||
to zones. It supports ZBD-specific I/O commands that can be used by a host to
|
||||
manage the zones of a device.
|
||||
- BLK_Z_HA: The host-aware zoned model allows random write operations in
|
||||
zones, making it backward compatible with regular block devices.
|
||||
- BLK_Z_NONE: The non-zoned model has no zones support. It includes both
|
||||
regular and drive-managed ZBD devices. ZBD-specific I/O commands are not
|
||||
supported.
|
||||
|
||||
The block device information resides inside BlockDriverState. QEMU uses
|
||||
BlockLimits struct(BlockDriverState::bl) that is continuously accessed by the
|
||||
block layer while processing I/O requests. A BlockBackend has a root pointer to
|
||||
a BlockDriverState graph(for example, raw format on top of file-posix). The
|
||||
zoned storage information can be propagated from the leaf BlockDriverState all
|
||||
the way up to the BlockBackend. If the zoned storage model in file-posix is
|
||||
set to BLK_Z_HM, then block drivers will declare support for zoned host device.
|
||||
|
||||
The block layer APIs support commands needed for zoned storage devices,
|
||||
including report zones, four zone operations, and zone append.
|
||||
|
||||
2. Emulating zoned storage controllers
|
||||
--------------------------------------
|
||||
When the BlockBackend's BlockLimits model reports a zoned storage device, users
|
||||
like the virtio-blk emulation or the qemu-io-cmds.c utility can use block layer
|
||||
APIs for zoned storage emulation or testing.
|
||||
|
||||
For example, the command to test zone_report on a null_blk device using qemu-io is::
|
||||
|
||||
$ path/to/qemu-io --image-opts -n driver=host_device,filename=/dev/nullb0 -c "zrp offset nr_zones"
|
||||
|
||||
To expose the host's zoned block device through virtio-blk, the command line
|
||||
can be (includes the -device parameter)::
|
||||
|
||||
-blockdev node-name=drive0,driver=host_device,filename=/dev/nullb0,cache.direct=on \
|
||||
-device virtio-blk-pci,drive=drive0
|
||||
|
||||
Or only use the -drive parameter::
|
||||
|
||||
-drive driver=host_device,file=/dev/nullb0,if=virtio,cache.direct=on
|
||||
|
||||
Additionally, QEMU has several ways of supporting zoned storage, including:
|
||||
(1) Using virtio-scsi: --device scsi-block allows for the passing through of
|
||||
SCSI ZBC devices, enabling the attachment of ZBC or ZAC HDDs to QEMU.
|
||||
(2) PCI device pass-through: While NVMe ZNS emulation is available for testing
|
||||
purposes, it cannot yet pass through a zoned device from the host. To pass on
|
||||
the NVMe ZNS device to the guest, use VFIO PCI to pass the entire NVMe PCI adapter
|
||||
through to the guest. Likewise, an HDD HBA can be passed on to QEMU with all HDDs
|
||||
attached to the HBA.
|
|
@ -430,6 +430,12 @@ Hard disks
|
|||
you may corrupt your host data (use the ``-snapshot`` command
|
||||
line option or modify the device permissions accordingly).
|
||||
|
||||
Zoned block devices
|
||||
Zoned block devices can be passed through to the guest if the emulated storage
|
||||
controller supports zoned storage. Use ``--blockdev host_device,
|
||||
node-name=drive0,filename=/dev/nullb0,cache.direct=on`` to pass through
|
||||
``/dev/nullb0`` as ``drive0``.
|
||||
|
||||
Windows
|
||||
^^^^^^^
|
||||
|
||||
|
|
|
@ -44,9 +44,16 @@ pflash_write_unknown(const char *name, uint8_t cmd) "%s: unknown command 0x%02x"
|
|||
# virtio-blk.c
|
||||
virtio_blk_req_complete(void *vdev, void *req, int status) "vdev %p req %p status %d"
|
||||
virtio_blk_rw_complete(void *vdev, void *req, int ret) "vdev %p req %p ret %d"
|
||||
virtio_blk_zone_report_complete(void *vdev, void *req, unsigned int nr_zones, int ret) "vdev %p req %p nr_zones %u ret %d"
|
||||
virtio_blk_zone_mgmt_complete(void *vdev, void *req, int ret) "vdev %p req %p ret %d"
|
||||
virtio_blk_zone_append_complete(void *vdev, void *req, int64_t sector, int ret) "vdev %p req %p, append sector 0x%" PRIx64 " ret %d"
|
||||
virtio_blk_handle_write(void *vdev, void *req, uint64_t sector, size_t nsectors) "vdev %p req %p sector %"PRIu64" nsectors %zu"
|
||||
virtio_blk_handle_read(void *vdev, void *req, uint64_t sector, size_t nsectors) "vdev %p req %p sector %"PRIu64" nsectors %zu"
|
||||
virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint64_t offset, size_t size, bool is_write) "vdev %p mrb %p start %d num_reqs %d offset %"PRIu64" size %zu is_write %d"
|
||||
virtio_blk_handle_zone_report(void *vdev, void *req, int64_t sector, unsigned int nr_zones) "vdev %p req %p sector 0x%" PRIx64 " nr_zones %u"
|
||||
virtio_blk_handle_zone_mgmt(void *vdev, void *req, uint8_t op, int64_t sector, int64_t len) "vdev %p req %p op 0x%x sector 0x%" PRIx64 " len 0x%" PRIx64 ""
|
||||
virtio_blk_handle_zone_reset_all(void *vdev, void *req, int64_t sector, int64_t len) "vdev %p req %p sector 0x%" PRIx64 " cap 0x%" PRIx64 ""
|
||||
virtio_blk_handle_zone_append(void *vdev, void *req, int64_t sector) "vdev %p req %p, append sector 0x%" PRIx64 ""
|
||||
|
||||
# hd-geometry.c
|
||||
hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"
|
||||
|
|
|
@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
|
|||
.end = endof(struct virtio_blk_config, discard_sector_alignment)},
|
||||
{.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
|
||||
.end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
|
||||
{.flags = 1ULL << VIRTIO_BLK_F_ZONED,
|
||||
.end = endof(struct virtio_blk_config, zoned)},
|
||||
{}
|
||||
};
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include "qemu/module.h"
|
||||
#include "qemu/error-report.h"
|
||||
#include "qemu/main-loop.h"
|
||||
#include "block/block_int.h"
|
||||
#include "trace.h"
|
||||
#include "hw/block/block.h"
|
||||
#include "hw/qdev-properties.h"
|
||||
|
@ -601,6 +602,351 @@ err:
|
|||
return err_status;
|
||||
}
|
||||
|
||||
typedef struct ZoneCmdData {
|
||||
VirtIOBlockReq *req;
|
||||
struct iovec *in_iov;
|
||||
unsigned in_num;
|
||||
union {
|
||||
struct {
|
||||
unsigned int nr_zones;
|
||||
BlockZoneDescriptor *zones;
|
||||
} zone_report_data;
|
||||
struct {
|
||||
int64_t offset;
|
||||
} zone_append_data;
|
||||
};
|
||||
} ZoneCmdData;
|
||||
|
||||
/*
|
||||
* check zoned_request: error checking before issuing requests. If all checks
|
||||
* passed, return true.
|
||||
* append: true if only zone append requests issued.
|
||||
*/
|
||||
static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
                                bool append, uint8_t *status)
{
    BlockDriverState *bs = blk_bs(s->blk);
    /* Device size in bytes. */
    int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
    int index;

    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
        *status = VIRTIO_BLK_S_UNSUPP;
        return false;
    }

    /*
     * The range must lie inside the device. Comparing against
     * "capacity - len" avoids overflowing "offset + len".
     */
    if (offset < 0 || len < 0 || len > capacity || offset > capacity - len) {
        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
        return false;
    }

    if (append) {
        if (bs->bl.write_granularity) {
            if ((offset % bs->bl.write_granularity) != 0) {
                *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
                return false;
            }
        }

        /*
         * A zero-length request at offset == capacity passes the range check
         * above, but that offset belongs to no zone. Reject it here so the
         * write pointer lookup below cannot index past the last zone.
         */
        if (offset >= capacity) {
            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
            return false;
        }

        /* Appending to a conventional zone is not allowed. */
        index = offset / bs->bl.zone_size;
        if (BDRV_ZT_IS_CONV(bs->wps->wp[index])) {
            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
            return false;
        }

        if (len / 512 > bs->bl.max_append_sectors) {
            /* A limit of zero is reported as "unsupported". */
            if (bs->bl.max_append_sectors == 0) {
                *status = VIRTIO_BLK_S_UNSUPP;
            } else {
                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
            }
            return false;
        }
    }
    return true;
}
|
||||
|
||||
static void virtio_blk_zone_report_complete(void *opaque, int ret)
|
||||
{
|
||||
ZoneCmdData *data = opaque;
|
||||
VirtIOBlockReq *req = data->req;
|
||||
VirtIOBlock *s = req->dev;
|
||||
VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
|
||||
struct iovec *in_iov = data->in_iov;
|
||||
unsigned in_num = data->in_num;
|
||||
int64_t zrp_size, n, j = 0;
|
||||
int64_t nz = data->zone_report_data.nr_zones;
|
||||
int8_t err_status = VIRTIO_BLK_S_OK;
|
||||
|
||||
trace_virtio_blk_zone_report_complete(vdev, req, nz, ret);
|
||||
if (ret) {
|
||||
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
|
||||
goto out;
|
||||
}
|
||||
|
||||
struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
|
||||
.nr_zones = cpu_to_le64(nz),
|
||||
};
|
||||
zrp_size = sizeof(struct virtio_blk_zone_report)
|
||||
+ sizeof(struct virtio_blk_zone_descriptor) * nz;
|
||||
n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
|
||||
if (n != sizeof(zrp_hdr)) {
|
||||
virtio_error(vdev, "Driver provided input buffer that is too small!");
|
||||
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (size_t i = sizeof(zrp_hdr); i < zrp_size;
|
||||
i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
|
||||
struct virtio_blk_zone_descriptor desc =
|
||||
(struct virtio_blk_zone_descriptor) {
|
||||
.z_start = cpu_to_le64(data->zone_report_data.zones[j].start
|
||||
>> BDRV_SECTOR_BITS),
|
||||
.z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
|
||||
>> BDRV_SECTOR_BITS),
|
||||
.z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
|
||||
>> BDRV_SECTOR_BITS),
|
||||
};
|
||||
|
||||
switch (data->zone_report_data.zones[j].type) {
|
||||
case BLK_ZT_CONV:
|
||||
desc.z_type = VIRTIO_BLK_ZT_CONV;
|
||||
break;
|
||||
case BLK_ZT_SWR:
|
||||
desc.z_type = VIRTIO_BLK_ZT_SWR;
|
||||
break;
|
||||
case BLK_ZT_SWP:
|
||||
desc.z_type = VIRTIO_BLK_ZT_SWP;
|
||||
break;
|
||||
default:
|
||||
g_assert_not_reached();
|
||||
}
|
||||
|
||||
switch (data->zone_report_data.zones[j].state) {
|
||||
case BLK_ZS_RDONLY:
|
||||
desc.z_state = VIRTIO_BLK_ZS_RDONLY;
|
||||
break;
|
||||
case BLK_ZS_OFFLINE:
|
||||
desc.z_state = VIRTIO_BLK_ZS_OFFLINE;
|
||||
break;
|
||||
case BLK_ZS_EMPTY:
|
||||
desc.z_state = VIRTIO_BLK_ZS_EMPTY;
|
||||
break;
|
||||
case BLK_ZS_CLOSED:
|
||||
desc.z_state = VIRTIO_BLK_ZS_CLOSED;
|
||||
break;
|
||||
case BLK_ZS_FULL:
|
||||
desc.z_state = VIRTIO_BLK_ZS_FULL;
|
||||
break;
|
||||
case BLK_ZS_EOPEN:
|
||||
desc.z_state = VIRTIO_BLK_ZS_EOPEN;
|
||||
break;
|
||||
case BLK_ZS_IOPEN:
|
||||
desc.z_state = VIRTIO_BLK_ZS_IOPEN;
|
||||
break;
|
||||
case BLK_ZS_NOT_WP:
|
||||
desc.z_state = VIRTIO_BLK_ZS_NOT_WP;
|
||||
break;
|
||||
default:
|
||||
g_assert_not_reached();
|
||||
}
|
||||
|
||||
/* TODO: it takes O(n^2) time complexity. Optimizations required. */
|
||||
n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
|
||||
if (n != sizeof(desc)) {
|
||||
virtio_error(vdev, "Driver provided input buffer "
|
||||
"for descriptors that is too small!");
|
||||
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
|
||||
virtio_blk_req_complete(req, err_status);
|
||||
virtio_blk_free_request(req);
|
||||
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
|
||||
g_free(data->zone_report_data.zones);
|
||||
g_free(data);
|
||||
}
|
||||
|
||||
/*
 * Handle a zone report request.
 *
 * The number of zones to report is derived from how many zone descriptors
 * fit into the input buffer after the in-header and the report header. The
 * report itself is produced asynchronously; virtio_blk_zone_report_complete()
 * writes the reply and releases the state allocated here.
 */
static void virtio_blk_handle_zone_report(VirtIOBlockReq *req,
                                          struct iovec *in_iov,
                                          unsigned in_num)
{
    VirtIOBlock *s = req->dev;
    VirtIODevice *vdev = VIRTIO_DEVICE(s);
    unsigned int nr_zones;
    ZoneCmdData *data;
    int64_t zones_buf_size, offset;
    uint8_t err_status;

    /* The buffer must have room for at least one zone descriptor. */
    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
            sizeof(struct virtio_blk_zone_report) +
            sizeof(struct virtio_blk_zone_descriptor)) {
        virtio_error(vdev, "in buffer too small for zone report");
        return;
    }

    /* start byte offset of the zone report */
    offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
        goto out;
    }
    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
                sizeof(struct virtio_blk_zone_report)) /
               sizeof(struct virtio_blk_zone_descriptor);
    trace_virtio_blk_handle_zone_report(vdev, req,
                                        offset >> BDRV_SECTOR_BITS, nr_zones);

    /* Size in bytes of the descriptor array handed to the block layer. */
    zones_buf_size = sizeof(BlockZoneDescriptor) * nr_zones;
    data = g_malloc(sizeof(ZoneCmdData));
    data->req = req;
    data->in_iov = in_iov;
    data->in_num = in_num;
    data->zone_report_data.nr_zones = nr_zones;
    data->zone_report_data.zones = g_malloc(zones_buf_size);

    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
                        data->zone_report_data.zones,
                        virtio_blk_zone_report_complete, data);
    return;
out:
    virtio_blk_req_complete(req, err_status);
    virtio_blk_free_request(req);
}
|
||||
|
||||
static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
|
||||
{
|
||||
VirtIOBlockReq *req = opaque;
|
||||
VirtIOBlock *s = req->dev;
|
||||
VirtIODevice *vdev = VIRTIO_DEVICE(s);
|
||||
int8_t err_status = VIRTIO_BLK_S_OK;
|
||||
trace_virtio_blk_zone_mgmt_complete(vdev, req,ret);
|
||||
|
||||
if (ret) {
|
||||
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
|
||||
}
|
||||
|
||||
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
|
||||
virtio_blk_req_complete(req, err_status);
|
||||
virtio_blk_free_request(req);
|
||||
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
|
||||
}
|
||||
|
||||
/*
 * Handle a zone management request (@op) from the guest.
 *
 * A reset-all request covers the entire device. Any other request covers
 * the single zone starting at the sector given in the request header. The
 * operation runs asynchronously and is finished by
 * virtio_blk_zone_mgmt_complete().
 *
 * Returns 0 once the operation has been submitted. If validation fails, the
 * request is completed and freed here and the virtio status is returned.
 */
static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
{
    VirtIOBlock *s = req->dev;
    VirtIODevice *vdev = VIRTIO_DEVICE(s);
    BlockDriverState *bs = blk_bs(s->blk);
    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
    uint64_t len;
    uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
    uint8_t err_status = VIRTIO_BLK_S_OK;

    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
        /* Entire drive capacity */
        offset = 0;
        len = capacity;
        trace_virtio_blk_handle_zone_reset_all(vdev, req, 0,
                                               bs->total_sectors);
    } else {
        if (bs->bl.zone_size > capacity - offset) {
            /*
             * The zoned device allows the last smaller zone.
             * Widen to 64 bits before multiplying so that the product
             * cannot wrap if zone_size and nr_zones are 32-bit fields.
             */
            len = capacity -
                  (uint64_t)bs->bl.zone_size * (bs->bl.nr_zones - 1);
        } else {
            len = bs->bl.zone_size;
        }
        trace_virtio_blk_handle_zone_mgmt(vdev, req, op,
                                          offset >> BDRV_SECTOR_BITS,
                                          len >> BDRV_SECTOR_BITS);
    }

    if (!check_zoned_request(s, offset, len, false, &err_status)) {
        goto out;
    }

    blk_aio_zone_mgmt(s->blk, op, offset, len,
                      virtio_blk_zone_mgmt_complete, req);

    return 0;
out:
    virtio_blk_req_complete(req, err_status);
    virtio_blk_free_request(req);
    return err_status;
}
|
||||
|
||||
static void virtio_blk_zone_append_complete(void *opaque, int ret)
|
||||
{
|
||||
ZoneCmdData *data = opaque;
|
||||
VirtIOBlockReq *req = data->req;
|
||||
VirtIOBlock *s = req->dev;
|
||||
VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
|
||||
int64_t append_sector, n;
|
||||
uint8_t err_status = VIRTIO_BLK_S_OK;
|
||||
|
||||
if (ret) {
|
||||
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
|
||||
goto out;
|
||||
}
|
||||
|
||||
virtio_stq_p(vdev, &append_sector,
|
||||
data->zone_append_data.offset >> BDRV_SECTOR_BITS);
|
||||
n = iov_from_buf(data->in_iov, data->in_num, 0, &append_sector,
|
||||
sizeof(append_sector));
|
||||
if (n != sizeof(append_sector)) {
|
||||
virtio_error(vdev, "Driver provided input buffer less than size of "
|
||||
"append_sector");
|
||||
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
|
||||
goto out;
|
||||
}
|
||||
trace_virtio_blk_zone_append_complete(vdev, req, append_sector, ret);
|
||||
|
||||
out:
|
||||
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
|
||||
virtio_blk_req_complete(req, err_status);
|
||||
virtio_blk_free_request(req);
|
||||
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
|
||||
g_free(data);
|
||||
}
|
||||
|
||||
/*
 * Submit a zone append request.
 *
 * @req:     the virtio-blk request; its out header supplies the sector.
 * @out_iov: driver-readable buffers holding the data to append.
 * @in_iov:  driver-writable buffers that receive the resulting sector.
 * @out_num: number of entries in @out_iov.
 * @in_num:  number of entries in @in_iov.
 *
 * Returns 0 once the asynchronous append has been submitted; completion is
 * reported through virtio_blk_zone_append_complete().  When
 * check_zoned_request() rejects the request, it is completed and released
 * immediately and the virtio status code is returned.
 */
static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
                                         struct iovec *out_iov,
                                         struct iovec *in_iov,
                                         uint64_t out_num,
                                         unsigned in_num)
{
    VirtIOBlock *s = req->dev;
    VirtIODevice *vdev = VIRTIO_DEVICE(s);
    uint8_t err_status = VIRTIO_BLK_S_OK;
    /* The header carries a sector number; the block layer wants bytes. */
    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
    int64_t len = iov_size(out_iov, out_num);
    ZoneCmdData *cmd;

    trace_virtio_blk_handle_zone_append(vdev, req, offset >> BDRV_SECTOR_BITS);

    if (!check_zoned_request(s, offset, len, true, &err_status)) {
        /* Rejected: finish the request right away with the error status. */
        aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
        virtio_blk_req_complete(req, err_status);
        virtio_blk_free_request(req);
        aio_context_release(blk_get_aio_context(s->conf.conf.blk));
        return err_status;
    }

    /* Per-request state; released by the completion callback. */
    cmd = g_new(ZoneCmdData, 1);
    cmd->req = req;
    cmd->in_iov = in_iov;
    cmd->in_num = in_num;
    cmd->zone_append_data.offset = offset;
    qemu_iovec_init_external(&req->qiov, out_iov, out_num);

    block_acct_start(blk_get_stats(s->blk), &req->acct, len,
                     BLOCK_ACCT_ZONE_APPEND);

    /* The block layer updates the offset in place with the write position. */
    blk_aio_zone_append(s->blk, &cmd->zone_append_data.offset, &req->qiov, 0,
                        virtio_blk_zone_append_complete, cmd);
    return 0;
}
|
||||
|
||||
static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
|
||||
{
|
||||
uint32_t type;
|
||||
|
@ -687,6 +1033,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
|
|||
case VIRTIO_BLK_T_FLUSH:
|
||||
virtio_blk_handle_flush(req, mrb);
|
||||
break;
|
||||
case VIRTIO_BLK_T_ZONE_REPORT:
|
||||
virtio_blk_handle_zone_report(req, in_iov, in_num);
|
||||
break;
|
||||
case VIRTIO_BLK_T_ZONE_OPEN:
|
||||
virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
|
||||
break;
|
||||
case VIRTIO_BLK_T_ZONE_CLOSE:
|
||||
virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
|
||||
break;
|
||||
case VIRTIO_BLK_T_ZONE_FINISH:
|
||||
virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
|
||||
break;
|
||||
case VIRTIO_BLK_T_ZONE_RESET:
|
||||
virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
|
||||
break;
|
||||
case VIRTIO_BLK_T_ZONE_RESET_ALL:
|
||||
virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
|
||||
break;
|
||||
case VIRTIO_BLK_T_SCSI_CMD:
|
||||
virtio_blk_handle_scsi(req);
|
||||
break;
|
||||
|
@ -705,6 +1069,14 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
|
|||
virtio_blk_free_request(req);
|
||||
break;
|
||||
}
|
||||
case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
|
||||
/*
|
||||
* Pass out_iov/out_num and in_iov/in_num explicitly: it is not safe
|
||||
* to access req->elem.out_sg directly because it may be
|
||||
* modified by virtio_blk_handle_request().
|
||||
*/
|
||||
virtio_blk_handle_zone_append(req, out_iov, in_iov, out_num, in_num);
|
||||
break;
|
||||
/*
|
||||
* VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
|
||||
* VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
|
||||
|
@ -890,6 +1262,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
|
|||
{
|
||||
VirtIOBlock *s = VIRTIO_BLK(vdev);
|
||||
BlockConf *conf = &s->conf.conf;
|
||||
BlockDriverState *bs = blk_bs(s->blk);
|
||||
struct virtio_blk_config blkcfg;
|
||||
uint64_t capacity;
|
||||
int64_t length;
|
||||
|
@ -954,6 +1327,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
|
|||
blkcfg.write_zeroes_may_unmap = 1;
|
||||
virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
|
||||
}
|
||||
if (bs->bl.zoned != BLK_Z_NONE) {
|
||||
switch (bs->bl.zoned) {
|
||||
case BLK_Z_HM:
|
||||
blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
|
||||
break;
|
||||
case BLK_Z_HA:
|
||||
blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
|
||||
break;
|
||||
default:
|
||||
g_assert_not_reached();
|
||||
}
|
||||
|
||||
virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
|
||||
bs->bl.zone_size / 512);
|
||||
virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
|
||||
bs->bl.max_active_zones);
|
||||
virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
|
||||
bs->bl.max_open_zones);
|
||||
virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
|
||||
virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
|
||||
bs->bl.max_append_sectors);
|
||||
} else {
|
||||
blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
|
||||
}
|
||||
memcpy(config, &blkcfg, s->config_size);
|
||||
}
|
||||
|
||||
|
@ -1163,6 +1560,14 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
|
|||
return;
|
||||
}
|
||||
|
||||
BlockDriverState *bs = blk_bs(conf->conf.blk);
|
||||
if (bs->bl.zoned != BLK_Z_NONE) {
|
||||
virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
|
||||
if (bs->bl.zoned == BLK_Z_HM) {
|
||||
virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
|
||||
}
|
||||
}
|
||||
|
||||
if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
|
||||
(!conf->max_discard_sectors ||
|
||||
conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
|
||||
|
|
|
@ -176,6 +176,8 @@ static const qmp_virtio_feature_map_t virtio_blk_feature_map[] = {
|
|||
"VIRTIO_BLK_F_DISCARD: Discard command supported"),
|
||||
FEATURE_ENTRY(VIRTIO_BLK_F_WRITE_ZEROES, \
|
||||
"VIRTIO_BLK_F_WRITE_ZEROES: Write zeroes command supported"),
|
||||
FEATURE_ENTRY(VIRTIO_BLK_F_ZONED, \
|
||||
"VIRTIO_BLK_F_ZONED: Zoned block devices"),
|
||||
#ifndef VIRTIO_BLK_NO_LEGACY
|
||||
FEATURE_ENTRY(VIRTIO_BLK_F_BARRIER, \
|
||||
"VIRTIO_BLK_F_BARRIER: Request barriers supported"),
|
||||
|
|
|
@ -37,6 +37,7 @@ enum BlockAcctType {
|
|||
BLOCK_ACCT_READ,
|
||||
BLOCK_ACCT_WRITE,
|
||||
BLOCK_ACCT_FLUSH,
|
||||
BLOCK_ACCT_ZONE_APPEND,
|
||||
BLOCK_ACCT_UNMAP,
|
||||
BLOCK_MAX_IOTYPE,
|
||||
};
|
||||
|
|
|
@ -75,6 +75,57 @@ typedef struct BlockDriver BlockDriver;
|
|||
typedef struct BdrvChild BdrvChild;
|
||||
typedef struct BdrvChildClass BdrvChildClass;
|
||||
|
||||
/* Zone management operations accepted by the zone_mgmt interfaces. */
typedef enum BlockZoneOp {
    BLK_ZO_OPEN   = 0,
    BLK_ZO_CLOSE  = 1,
    BLK_ZO_FINISH = 2,
    BLK_ZO_RESET  = 3,
} BlockZoneOp;
|
||||
|
||||
/* Zone model of a block device. */
typedef enum BlockZoneModel {
    /* Regular block device */
    BLK_Z_NONE = 0x0,
    /* Host-managed zoned block device */
    BLK_Z_HM = 0x1,
    /* Host-aware zoned block device */
    BLK_Z_HA = 0x2,
} BlockZoneModel;
|
||||
|
||||
/*
 * Condition of a single zone.
 * NOTE(review): the numeric values look like they mirror the host kernel's
 * zone condition codes -- confirm before renumbering.
 */
typedef enum BlockZoneState {
    BLK_ZS_NOT_WP  = 0x0,   /* presumably: zone has no write pointer */
    BLK_ZS_EMPTY   = 0x1,
    BLK_ZS_IOPEN   = 0x2,   /* presumably: implicitly opened */
    BLK_ZS_EOPEN   = 0x3,   /* presumably: explicitly opened */
    BLK_ZS_CLOSED  = 0x4,
    BLK_ZS_RDONLY  = 0xD,
    BLK_ZS_FULL    = 0xE,
    BLK_ZS_OFFLINE = 0xF,
} BlockZoneState;
|
||||
|
||||
/* Write constraints of a single zone. */
typedef enum BlockZoneType {
    /* Conventional random writes supported */
    BLK_ZT_CONV = 0x1,
    /* Sequential writes required */
    BLK_ZT_SWR = 0x2,
    /* Sequential writes preferred */
    BLK_ZT_SWP = 0x3,
} BlockZoneType;
|
||||
|
||||
/*
|
||||
* Zone descriptor data structure.
|
||||
* Provides information on a zone with all position and size values in bytes.
|
||||
*/
|
||||
typedef struct BlockZoneDescriptor {
|
||||
uint64_t start;
|
||||
uint64_t length;
|
||||
uint64_t cap;
|
||||
uint64_t wp;
|
||||
BlockZoneType type;
|
||||
BlockZoneState state;
|
||||
} BlockZoneDescriptor;
|
||||
|
||||
/*
|
||||
* Track write pointers of a zone in bytes.
|
||||
*/
|
||||
typedef struct BlockZoneWps {
|
||||
CoMutex colock;
|
||||
uint64_t wp[];
|
||||
} BlockZoneWps;
|
||||
|
||||
typedef struct BlockDriverInfo {
|
||||
/* in bytes, 0 if irrelevant */
|
||||
int cluster_size;
|
||||
|
@ -197,6 +248,12 @@ typedef enum {
|
|||
#define BDRV_SECTOR_BITS 9
|
||||
#define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS)
|
||||
|
||||
/*
 * Test the most significant bit (bit 63) of the write pointer value @wp.
 * Evaluates to non-zero when the bit is set; if it is zero, then the zone
 * type is SWR.
 *
 * The argument is parenthesized so that an expression such as "a | b" is
 * masked as a whole instead of being split by operator precedence.
 */
#define BDRV_ZT_IS_CONV(wp)    ((wp) & (1ULL << 63))
|
||||
|
||||
#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
|
||||
INT_MAX >> BDRV_SECTOR_BITS)
|
||||
#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
|
||||
|
|
|
@ -114,6 +114,19 @@ int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs);
|
|||
int coroutine_fn GRAPH_RDLOCK bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
|
||||
int64_t bytes);
|
||||
|
||||
/* Report zone information of zone block device. */
|
||||
int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_report(BlockDriverState *bs,
|
||||
int64_t offset,
|
||||
unsigned int *nr_zones,
|
||||
BlockZoneDescriptor *zones);
|
||||
int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_mgmt(BlockDriverState *bs,
|
||||
BlockZoneOp op,
|
||||
int64_t offset, int64_t len);
|
||||
int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_append(BlockDriverState *bs,
|
||||
int64_t *offset,
|
||||
QEMUIOVector *qiov,
|
||||
BdrvRequestFlags flags);
|
||||
|
||||
bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
|
||||
int bdrv_block_status(BlockDriverState *bs, int64_t offset,
|
||||
int64_t bytes, int64_t *pnum, int64_t *map,
|
||||
|
|
|
@ -137,6 +137,11 @@ struct BlockDriver {
|
|||
*/
|
||||
bool is_format;
|
||||
|
||||
/*
|
||||
* Set to true if the BlockDriver supports zoned children.
|
||||
*/
|
||||
bool supports_zoned_children;
|
||||
|
||||
/*
|
||||
* Drivers not implementing bdrv_parse_filename nor bdrv_open should have
|
||||
* this field set to true, except ones that are defined only by their
|
||||
|
@ -713,6 +718,15 @@ struct BlockDriver {
|
|||
int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_load_vmstate)(
|
||||
BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
|
||||
|
||||
int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
|
||||
int64_t offset, unsigned int *nr_zones,
|
||||
BlockZoneDescriptor *zones);
|
||||
int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
|
||||
int64_t offset, int64_t len);
|
||||
int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
|
||||
int64_t *offset, QEMUIOVector *qiov,
|
||||
BdrvRequestFlags flags);
|
||||
|
||||
/* removable device specific */
|
||||
bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_is_inserted)(
|
||||
BlockDriverState *bs);
|
||||
|
@ -862,6 +876,26 @@ typedef struct BlockLimits {
|
|||
* an explicit monitor command to load the disk inside the guest).
|
||||
*/
|
||||
bool has_variable_length;
|
||||
|
||||
/* device zone model */
|
||||
BlockZoneModel zoned;
|
||||
|
||||
/* zone size expressed in bytes */
|
||||
uint32_t zone_size;
|
||||
|
||||
/* total number of zones */
|
||||
uint32_t nr_zones;
|
||||
|
||||
/* maximum sectors of a zone append write operation */
|
||||
uint32_t max_append_sectors;
|
||||
|
||||
/* maximum number of open zones */
|
||||
uint32_t max_open_zones;
|
||||
|
||||
/* maximum number of active zones */
|
||||
uint32_t max_active_zones;
|
||||
|
||||
uint32_t write_granularity;
|
||||
} BlockLimits;
|
||||
|
||||
typedef struct BdrvOpBlocker BdrvOpBlocker;
|
||||
|
@ -1223,6 +1257,9 @@ struct BlockDriverState {
|
|||
CoMutex bsc_modify_lock;
|
||||
/* Always non-NULL, but must only be dereferenced under an RCU read guard */
|
||||
BdrvBlockStatusCache *block_status_cache;
|
||||
|
||||
/* array of write pointers' location of each zone in the zoned device. */
|
||||
BlockZoneWps *wps;
|
||||
};
|
||||
|
||||
struct BlockBackendRootState {
|
||||
|
|
|
@ -28,6 +28,9 @@
|
|||
#define QEMU_AIO_WRITE_ZEROES 0x0020
|
||||
#define QEMU_AIO_COPY_RANGE 0x0040
|
||||
#define QEMU_AIO_TRUNCATE 0x0080
|
||||
#define QEMU_AIO_ZONE_REPORT 0x0100
|
||||
#define QEMU_AIO_ZONE_MGMT 0x0200
|
||||
#define QEMU_AIO_ZONE_APPEND 0x0400
|
||||
#define QEMU_AIO_TYPE_MASK \
|
||||
(QEMU_AIO_READ | \
|
||||
QEMU_AIO_WRITE | \
|
||||
|
@ -36,7 +39,10 @@
|
|||
QEMU_AIO_DISCARD | \
|
||||
QEMU_AIO_WRITE_ZEROES | \
|
||||
QEMU_AIO_COPY_RANGE | \
|
||||
QEMU_AIO_TRUNCATE)
|
||||
QEMU_AIO_TRUNCATE | \
|
||||
QEMU_AIO_ZONE_REPORT | \
|
||||
QEMU_AIO_ZONE_MGMT | \
|
||||
QEMU_AIO_ZONE_APPEND)
|
||||
|
||||
/* AIO flags */
|
||||
#define QEMU_AIO_MISALIGNED 0x1000
|
||||
|
|
|
@ -46,6 +46,16 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
|
|||
BlockCompletionFunc *cb, void *opaque);
|
||||
BlockAIOCB *blk_aio_flush(BlockBackend *blk,
|
||||
BlockCompletionFunc *cb, void *opaque);
|
||||
BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
|
||||
unsigned int *nr_zones,
|
||||
BlockZoneDescriptor *zones,
|
||||
BlockCompletionFunc *cb, void *opaque);
|
||||
BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
|
||||
int64_t offset, int64_t len,
|
||||
BlockCompletionFunc *cb, void *opaque);
|
||||
BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
|
||||
QEMUIOVector *qiov, BdrvRequestFlags flags,
|
||||
BlockCompletionFunc *cb, void *opaque);
|
||||
BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
|
||||
BlockCompletionFunc *cb, void *opaque);
|
||||
void blk_aio_cancel_async(BlockAIOCB *acb);
|
||||
|
@ -191,6 +201,23 @@ int co_wrapper_mixed blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
|
|||
int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
|
||||
int64_t bytes, BdrvRequestFlags flags);
|
||||
|
||||
int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
|
||||
unsigned int *nr_zones,
|
||||
BlockZoneDescriptor *zones);
|
||||
int co_wrapper_mixed blk_zone_report(BlockBackend *blk, int64_t offset,
|
||||
unsigned int *nr_zones,
|
||||
BlockZoneDescriptor *zones);
|
||||
int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
|
||||
int64_t offset, int64_t len);
|
||||
int co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
|
||||
int64_t offset, int64_t len);
|
||||
int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
|
||||
QEMUIOVector *qiov,
|
||||
BdrvRequestFlags flags);
|
||||
int co_wrapper_mixed blk_zone_append(BlockBackend *blk, int64_t *offset,
|
||||
QEMUIOVector *qiov,
|
||||
BdrvRequestFlags flags);
|
||||
|
||||
int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset,
|
||||
int64_t bytes);
|
||||
int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,
|
||||
|
|
|
@ -2025,6 +2025,8 @@ if rdma.found()
|
|||
endif
|
||||
|
||||
# has_header_symbol
|
||||
config_host_data.set('CONFIG_BLKZONED',
|
||||
cc.has_header_symbol('linux/blkzoned.h', 'BLKOPENZONE'))
|
||||
config_host_data.set('CONFIG_EPOLL_CREATE1',
|
||||
cc.has_header_symbol('sys/epoll.h', 'epoll_create1'))
|
||||
config_host_data.set('CONFIG_FALLOCATE_PUNCH_HOLE',
|
||||
|
@ -2060,6 +2062,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID',
|
|||
config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM',
|
||||
cc.has_member('struct stat', 'st_atim',
|
||||
prefix: '#include <sys/stat.h>'))
|
||||
config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY',
|
||||
cc.has_member('struct blk_zone', 'capacity',
|
||||
prefix: '#include <linux/blkzoned.h>'))
|
||||
|
||||
# has_type
|
||||
config_host_data.set('CONFIG_IOVEC',
|
||||
|
|
|
@ -854,6 +854,10 @@
|
|||
# @min_wr_latency_ns: Minimum latency of write operations in the
|
||||
# defined interval, in nanoseconds.
|
||||
#
|
||||
# @min_zone_append_latency_ns: Minimum latency of zone append operations
|
||||
# in the defined interval, in nanoseconds
|
||||
# (since 8.1)
|
||||
#
|
||||
# @min_flush_latency_ns: Minimum latency of flush operations in the
|
||||
# defined interval, in nanoseconds.
|
||||
#
|
||||
|
@ -863,6 +867,10 @@
|
|||
# @max_wr_latency_ns: Maximum latency of write operations in the
|
||||
# defined interval, in nanoseconds.
|
||||
#
|
||||
# @max_zone_append_latency_ns: Maximum latency of zone append operations
|
||||
# in the defined interval, in nanoseconds
|
||||
# (since 8.1)
|
||||
#
|
||||
# @max_flush_latency_ns: Maximum latency of flush operations in the
|
||||
# defined interval, in nanoseconds.
|
||||
#
|
||||
|
@ -872,6 +880,10 @@
|
|||
# @avg_wr_latency_ns: Average latency of write operations in the
|
||||
# defined interval, in nanoseconds.
|
||||
#
|
||||
# @avg_zone_append_latency_ns: Average latency of zone append operations
|
||||
# in the defined interval, in nanoseconds
|
||||
# (since 8.1)
|
||||
#
|
||||
# @avg_flush_latency_ns: Average latency of flush operations in the
|
||||
# defined interval, in nanoseconds.
|
||||
#
|
||||
|
@ -881,15 +893,23 @@
|
|||
# @avg_wr_queue_depth: Average number of pending write operations in
|
||||
# the defined interval.
|
||||
#
|
||||
# @avg_zone_append_queue_depth: Average number of pending zone append
|
||||
# operations in the defined interval
|
||||
# (since 8.1).
|
||||
#
|
||||
# Since: 2.5
|
||||
##
|
||||
{ 'struct': 'BlockDeviceTimedStats',
|
||||
'data': { 'interval_length': 'int', 'min_rd_latency_ns': 'int',
|
||||
'max_rd_latency_ns': 'int', 'avg_rd_latency_ns': 'int',
|
||||
'min_wr_latency_ns': 'int', 'max_wr_latency_ns': 'int',
|
||||
'avg_wr_latency_ns': 'int', 'min_flush_latency_ns': 'int',
|
||||
'max_flush_latency_ns': 'int', 'avg_flush_latency_ns': 'int',
|
||||
'avg_rd_queue_depth': 'number', 'avg_wr_queue_depth': 'number' } }
|
||||
'avg_wr_latency_ns': 'int', 'min_zone_append_latency_ns': 'int',
|
||||
'max_zone_append_latency_ns': 'int',
|
||||
'avg_zone_append_latency_ns': 'int',
|
||||
'min_flush_latency_ns': 'int', 'max_flush_latency_ns': 'int',
|
||||
'avg_flush_latency_ns': 'int', 'avg_rd_queue_depth': 'number',
|
||||
'avg_wr_queue_depth': 'number',
|
||||
'avg_zone_append_queue_depth': 'number' } }
|
||||
|
||||
##
|
||||
# @BlockDeviceStats:
|
||||
|
@ -900,6 +920,9 @@
|
|||
#
|
||||
# @wr_bytes: The number of bytes written by the device.
|
||||
#
|
||||
# @zone_append_bytes: The number of bytes appended by the zoned devices
|
||||
# (since 8.1)
|
||||
#
|
||||
# @unmap_bytes: The number of bytes unmapped by the device (Since 4.2)
|
||||
#
|
||||
# @rd_operations: The number of read operations performed by the
|
||||
|
@ -908,6 +931,9 @@
|
|||
# @wr_operations: The number of write operations performed by the
|
||||
# device.
|
||||
#
|
||||
# @zone_append_operations: The number of zone append operations performed
|
||||
# by the zoned devices (since 8.1)
|
||||
#
|
||||
# @flush_operations: The number of cache flush operations performed by
|
||||
# the device (since 0.15)
|
||||
#
|
||||
|
@ -920,6 +946,9 @@
|
|||
# @wr_total_time_ns: Total time spent on writes in nanoseconds (since
|
||||
# 0.15).
|
||||
#
|
||||
# @zone_append_total_time_ns: Total time spent on zone append writes
|
||||
# in nanoseconds (since 8.1)
|
||||
#
|
||||
# @flush_total_time_ns: Total time spent on cache flushes in
|
||||
# nanoseconds (since 0.15).
|
||||
#
|
||||
|
@ -937,6 +966,9 @@
|
|||
# @wr_merged: Number of write requests that have been merged into
|
||||
# another request (Since 2.3).
|
||||
#
|
||||
# @zone_append_merged: Number of zone append requests that have been merged
|
||||
# into another request (since 8.1)
|
||||
#
|
||||
# @unmap_merged: Number of unmap requests that have been merged into
|
||||
# another request (Since 4.2)
|
||||
#
|
||||
|
@ -950,6 +982,10 @@
|
|||
# @failed_wr_operations: The number of failed write operations
|
||||
# performed by the device (Since 2.5)
|
||||
#
|
||||
# @failed_zone_append_operations: The number of failed zone append write
|
||||
# operations performed by the zoned devices
|
||||
# (since 8.1)
|
||||
#
|
||||
# @failed_flush_operations: The number of failed flush operations
|
||||
# performed by the device (Since 2.5)
|
||||
#
|
||||
|
@ -962,6 +998,9 @@
|
|||
# @invalid_wr_operations: The number of invalid write operations
|
||||
# performed by the device (Since 2.5)
|
||||
#
|
||||
# @invalid_zone_append_operations: The number of invalid zone append operations
|
||||
# performed by the zoned device (since 8.1)
|
||||
#
|
||||
# @invalid_flush_operations: The number of invalid flush operations
|
||||
# performed by the device (Since 2.5)
|
||||
#
|
||||
|
@ -981,27 +1020,34 @@
|
|||
#
|
||||
# @wr_latency_histogram: @BlockLatencyHistogramInfo. (Since 4.0)
|
||||
#
|
||||
# @zone_append_latency_histogram: @BlockLatencyHistogramInfo. (since 8.1)
|
||||
#
|
||||
# @flush_latency_histogram: @BlockLatencyHistogramInfo. (Since 4.0)
|
||||
#
|
||||
# Since: 0.14
|
||||
##
|
||||
{ 'struct': 'BlockDeviceStats',
|
||||
'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'unmap_bytes' : 'int',
|
||||
'rd_operations': 'int', 'wr_operations': 'int',
|
||||
'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'zone_append_bytes': 'int',
|
||||
'unmap_bytes' : 'int', 'rd_operations': 'int',
|
||||
'wr_operations': 'int', 'zone_append_operations': 'int',
|
||||
'flush_operations': 'int', 'unmap_operations': 'int',
|
||||
'rd_total_time_ns': 'int', 'wr_total_time_ns': 'int',
|
||||
'flush_total_time_ns': 'int', 'unmap_total_time_ns': 'int',
|
||||
'wr_highest_offset': 'int',
|
||||
'rd_merged': 'int', 'wr_merged': 'int', 'unmap_merged': 'int',
|
||||
'*idle_time_ns': 'int',
|
||||
'zone_append_total_time_ns': 'int', 'flush_total_time_ns': 'int',
|
||||
'unmap_total_time_ns': 'int', 'wr_highest_offset': 'int',
|
||||
'rd_merged': 'int', 'wr_merged': 'int', 'zone_append_merged': 'int',
|
||||
'unmap_merged': 'int', '*idle_time_ns': 'int',
|
||||
'failed_rd_operations': 'int', 'failed_wr_operations': 'int',
|
||||
'failed_flush_operations': 'int', 'failed_unmap_operations': 'int',
|
||||
'invalid_rd_operations': 'int', 'invalid_wr_operations': 'int',
|
||||
'failed_zone_append_operations': 'int',
|
||||
'failed_flush_operations': 'int',
|
||||
'failed_unmap_operations': 'int', 'invalid_rd_operations': 'int',
|
||||
'invalid_wr_operations': 'int',
|
||||
'invalid_zone_append_operations': 'int',
|
||||
'invalid_flush_operations': 'int', 'invalid_unmap_operations': 'int',
|
||||
'account_invalid': 'bool', 'account_failed': 'bool',
|
||||
'timed_stats': ['BlockDeviceTimedStats'],
|
||||
'*rd_latency_histogram': 'BlockLatencyHistogramInfo',
|
||||
'*wr_latency_histogram': 'BlockLatencyHistogramInfo',
|
||||
'*zone_append_latency_histogram': 'BlockLatencyHistogramInfo',
|
||||
'*flush_latency_histogram': 'BlockLatencyHistogramInfo' } }
|
||||
|
||||
##
|
||||
|
|
|
@ -534,6 +534,9 @@
|
|||
# @boundaries-write: list of interval boundary values for write
|
||||
# latency histogram.
|
||||
#
|
||||
# @boundaries-zap: list of interval boundary values for zone append write
|
||||
# latency histogram.
|
||||
#
|
||||
# @boundaries-flush: list of interval boundary values for flush
|
||||
# latency histogram.
|
||||
#
|
||||
|
@ -587,5 +590,6 @@
|
|||
'*boundaries': ['uint64'],
|
||||
'*boundaries-read': ['uint64'],
|
||||
'*boundaries-write': ['uint64'],
|
||||
'*boundaries-zap': ['uint64'],
|
||||
'*boundaries-flush': ['uint64'] },
|
||||
'allow-preconfig': true }
|
||||
|
|
224
qemu-io-cmds.c
224
qemu-io-cmds.c
|
@ -1730,6 +1730,224 @@ static const cmdinfo_t flush_cmd = {
|
|||
.oneline = "flush all in-core file state to disk",
|
||||
};
|
||||
|
||||
static inline int64_t tosector(int64_t bytes)
|
||||
{
|
||||
return bytes >> BDRV_SECTOR_BITS;
|
||||
}
|
||||
|
||||
static int zone_report_f(BlockBackend *blk, int argc, char **argv)
|
||||
{
|
||||
int ret;
|
||||
int64_t offset;
|
||||
unsigned int nr_zones;
|
||||
|
||||
++optind;
|
||||
offset = cvtnum(argv[optind]);
|
||||
++optind;
|
||||
nr_zones = cvtnum(argv[optind]);
|
||||
|
||||
g_autofree BlockZoneDescriptor *zones = NULL;
|
||||
zones = g_new(BlockZoneDescriptor, nr_zones);
|
||||
ret = blk_zone_report(blk, offset, &nr_zones, zones);
|
||||
if (ret < 0) {
|
||||
printf("zone report failed: %s\n", strerror(-ret));
|
||||
} else {
|
||||
for (int i = 0; i < nr_zones; ++i) {
|
||||
printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", "
|
||||
"cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", "
|
||||
"zcond:%u, [type: %u]\n",
|
||||
tosector(zones[i].start), tosector(zones[i].length),
|
||||
tosector(zones[i].cap), tosector(zones[i].wp),
|
||||
zones[i].state, zones[i].type);
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const cmdinfo_t zone_report_cmd = {
|
||||
.name = "zone_report",
|
||||
.altname = "zrp",
|
||||
.cfunc = zone_report_f,
|
||||
.argmin = 2,
|
||||
.argmax = 2,
|
||||
.args = "offset number",
|
||||
.oneline = "report zone information",
|
||||
};
|
||||
|
||||
static int zone_open_f(BlockBackend *blk, int argc, char **argv)
|
||||
{
|
||||
int ret;
|
||||
int64_t offset, len;
|
||||
++optind;
|
||||
offset = cvtnum(argv[optind]);
|
||||
++optind;
|
||||
len = cvtnum(argv[optind]);
|
||||
ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len);
|
||||
if (ret < 0) {
|
||||
printf("zone open failed: %s\n", strerror(-ret));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const cmdinfo_t zone_open_cmd = {
|
||||
.name = "zone_open",
|
||||
.altname = "zo",
|
||||
.cfunc = zone_open_f,
|
||||
.argmin = 2,
|
||||
.argmax = 2,
|
||||
.args = "offset len",
|
||||
.oneline = "explicit open a range of zones in zone block device",
|
||||
};
|
||||
|
||||
static int zone_close_f(BlockBackend *blk, int argc, char **argv)
|
||||
{
|
||||
int ret;
|
||||
int64_t offset, len;
|
||||
++optind;
|
||||
offset = cvtnum(argv[optind]);
|
||||
++optind;
|
||||
len = cvtnum(argv[optind]);
|
||||
ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len);
|
||||
if (ret < 0) {
|
||||
printf("zone close failed: %s\n", strerror(-ret));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const cmdinfo_t zone_close_cmd = {
|
||||
.name = "zone_close",
|
||||
.altname = "zc",
|
||||
.cfunc = zone_close_f,
|
||||
.argmin = 2,
|
||||
.argmax = 2,
|
||||
.args = "offset len",
|
||||
.oneline = "close a range of zones in zone block device",
|
||||
};
|
||||
|
||||
static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
|
||||
{
|
||||
int ret;
|
||||
int64_t offset, len;
|
||||
++optind;
|
||||
offset = cvtnum(argv[optind]);
|
||||
++optind;
|
||||
len = cvtnum(argv[optind]);
|
||||
ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len);
|
||||
if (ret < 0) {
|
||||
printf("zone finish failed: %s\n", strerror(-ret));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const cmdinfo_t zone_finish_cmd = {
|
||||
.name = "zone_finish",
|
||||
.altname = "zf",
|
||||
.cfunc = zone_finish_f,
|
||||
.argmin = 2,
|
||||
.argmax = 2,
|
||||
.args = "offset len",
|
||||
.oneline = "finish a range of zones in zone block device",
|
||||
};
|
||||
|
||||
static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
|
||||
{
|
||||
int ret;
|
||||
int64_t offset, len;
|
||||
++optind;
|
||||
offset = cvtnum(argv[optind]);
|
||||
++optind;
|
||||
len = cvtnum(argv[optind]);
|
||||
ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len);
|
||||
if (ret < 0) {
|
||||
printf("zone reset failed: %s\n", strerror(-ret));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const cmdinfo_t zone_reset_cmd = {
|
||||
.name = "zone_reset",
|
||||
.altname = "zrs",
|
||||
.cfunc = zone_reset_f,
|
||||
.argmin = 2,
|
||||
.argmax = 2,
|
||||
.args = "offset len",
|
||||
.oneline = "reset a zone write pointer in zone block device",
|
||||
};
|
||||
|
||||
/*
 * Issue an asynchronous zone append and run the main loop until the
 * completion callback has stored a result.
 *
 * @offset is handed to blk_aio_zone_append() by pointer.  *@total is set to
 * the size of @qiov.  Returns the negative completion status on failure and
 * 1 otherwise.
 */
static int do_aio_zone_append(BlockBackend *blk, QEMUIOVector *qiov,
                              int64_t *offset, int flags, int *total)
{
    int status = NOT_DONE;

    blk_aio_zone_append(blk, offset, qiov, flags, aio_rw_done, &status);

    /* aio_rw_done() overwrites the NOT_DONE marker once the I/O is over. */
    for (;;) {
        if (status != NOT_DONE) {
            break;
        }
        main_loop_wait(false);
    }

    *total = qiov->size;
    if (status < 0) {
        return status;
    }
    return 1;
}
|
||||
|
||||
static int zone_append_f(BlockBackend *blk, int argc, char **argv)
|
||||
{
|
||||
int ret;
|
||||
bool pflag = false;
|
||||
int flags = 0;
|
||||
int total = 0;
|
||||
int64_t offset;
|
||||
char *buf;
|
||||
int c, nr_iov;
|
||||
int pattern = 0xcd;
|
||||
QEMUIOVector qiov;
|
||||
|
||||
if (optind > argc - 3) {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if ((c = getopt(argc, argv, "p")) != -1) {
|
||||
pflag = true;
|
||||
}
|
||||
|
||||
offset = cvtnum(argv[optind]);
|
||||
if (offset < 0) {
|
||||
print_cvtnum_err(offset, argv[optind]);
|
||||
return offset;
|
||||
}
|
||||
optind++;
|
||||
nr_iov = argc - optind;
|
||||
buf = create_iovec(blk, &qiov, &argv[optind], nr_iov, pattern,
|
||||
flags & BDRV_REQ_REGISTERED_BUF);
|
||||
if (buf == NULL) {
|
||||
return -EINVAL;
|
||||
}
|
||||
ret = do_aio_zone_append(blk, &qiov, &offset, flags, &total);
|
||||
if (ret < 0) {
|
||||
printf("zone append failed: %s\n", strerror(-ret));
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (pflag) {
|
||||
printf("After zap done, the append sector is 0x%" PRIx64 "\n",
|
||||
tosector(offset));
|
||||
}
|
||||
|
||||
out:
|
||||
qemu_io_free(blk, buf, qiov.size,
|
||||
flags & BDRV_REQ_REGISTERED_BUF);
|
||||
qemu_iovec_destroy(&qiov);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const cmdinfo_t zone_append_cmd = {
|
||||
.name = "zone_append",
|
||||
.altname = "zap",
|
||||
.cfunc = zone_append_f,
|
||||
.argmin = 3,
|
||||
.argmax = 4,
|
||||
.args = "offset len [len..]",
|
||||
.oneline = "append write a number of bytes at a specified offset",
|
||||
};
|
||||
|
||||
static int truncate_f(BlockBackend *blk, int argc, char **argv);
|
||||
static const cmdinfo_t truncate_cmd = {
|
||||
.name = "truncate",
|
||||
|
@ -2523,6 +2741,12 @@ static void __attribute((constructor)) init_qemuio_commands(void)
|
|||
qemuio_add_command(&aio_write_cmd);
|
||||
qemuio_add_command(&aio_flush_cmd);
|
||||
qemuio_add_command(&flush_cmd);
|
||||
qemuio_add_command(&zone_report_cmd);
|
||||
qemuio_add_command(&zone_open_cmd);
|
||||
qemuio_add_command(&zone_close_cmd);
|
||||
qemuio_add_command(&zone_finish_cmd);
|
||||
qemuio_add_command(&zone_reset_cmd);
|
||||
qemuio_add_command(&zone_append_cmd);
|
||||
qemuio_add_command(&truncate_cmd);
|
||||
qemuio_add_command(&length_cmd);
|
||||
qemuio_add_command(&info_cmd);
|
||||
|
|
|
@ -17,6 +17,7 @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio
|
|||
"stats": {
|
||||
"unmap_operations": 0,
|
||||
"unmap_merged": 0,
|
||||
"failed_zone_append_operations": 0,
|
||||
"flush_total_time_ns": 0,
|
||||
"wr_highest_offset": 0,
|
||||
"wr_total_time_ns": 0,
|
||||
|
@ -27,6 +28,7 @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio
|
|||
"timed_stats": [
|
||||
],
|
||||
"failed_unmap_operations": 0,
|
||||
"zone_append_merged": 0,
|
||||
"failed_flush_operations": 0,
|
||||
"account_invalid": true,
|
||||
"rd_total_time_ns": 0,
|
||||
|
@ -39,7 +41,11 @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio
|
|||
"unmap_total_time_ns": 0,
|
||||
"invalid_flush_operations": 0,
|
||||
"account_failed": true,
|
||||
"zone_append_total_time_ns": 0,
|
||||
"zone_append_operations": 0,
|
||||
"rd_operations": 0,
|
||||
"zone_append_bytes": 0,
|
||||
"invalid_zone_append_operations": 0,
|
||||
"invalid_wr_operations": 0,
|
||||
"invalid_rd_operations": 0
|
||||
},
|
||||
|
@ -82,6 +88,7 @@ Testing: -drive driver=null-co,if=none
|
|||
"stats": {
|
||||
"unmap_operations": 0,
|
||||
"unmap_merged": 0,
|
||||
"failed_zone_append_operations": 0,
|
||||
"flush_total_time_ns": 0,
|
||||
"wr_highest_offset": 0,
|
||||
"wr_total_time_ns": 0,
|
||||
|
@ -92,6 +99,7 @@ Testing: -drive driver=null-co,if=none
|
|||
"timed_stats": [
|
||||
],
|
||||
"failed_unmap_operations": 0,
|
||||
"zone_append_merged": 0,
|
||||
"failed_flush_operations": 0,
|
||||
"account_invalid": true,
|
||||
"rd_total_time_ns": 0,
|
||||
|
@ -104,7 +112,11 @@ Testing: -drive driver=null-co,if=none
|
|||
"unmap_total_time_ns": 0,
|
||||
"invalid_flush_operations": 0,
|
||||
"account_failed": true,
|
||||
"zone_append_total_time_ns": 0,
|
||||
"zone_append_operations": 0,
|
||||
"rd_operations": 0,
|
||||
"zone_append_bytes": 0,
|
||||
"invalid_zone_append_operations": 0,
|
||||
"invalid_wr_operations": 0,
|
||||
"invalid_rd_operations": 0
|
||||
},
|
||||
|
@ -177,6 +189,7 @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b
|
|||
"stats": {
|
||||
"unmap_operations": 0,
|
||||
"unmap_merged": 0,
|
||||
"failed_zone_append_operations": 0,
|
||||
"flush_total_time_ns": 0,
|
||||
"wr_highest_offset": 0,
|
||||
"wr_total_time_ns": 0,
|
||||
|
@ -187,6 +200,7 @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b
|
|||
"timed_stats": [
|
||||
],
|
||||
"failed_unmap_operations": 0,
|
||||
"zone_append_merged": 0,
|
||||
"failed_flush_operations": 0,
|
||||
"account_invalid": true,
|
||||
"rd_total_time_ns": 0,
|
||||
|
@ -199,7 +213,11 @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b
|
|||
"unmap_total_time_ns": 0,
|
||||
"invalid_flush_operations": 0,
|
||||
"account_failed": true,
|
||||
"zone_append_total_time_ns": 0,
|
||||
"zone_append_operations": 0,
|
||||
"rd_operations": 0,
|
||||
"zone_append_bytes": 0,
|
||||
"invalid_zone_append_operations": 0,
|
||||
"invalid_wr_operations": 0,
|
||||
"invalid_rd_operations": 0
|
||||
},
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
# Test zone management operations.
|
||||
#
|
||||
|
||||
# Quote "$0" so a script path containing whitespace is not word-split
# before basename sees it.
seq="$(basename "$0")"
|
||||
echo "QA output created by $seq"
|
||||
status=1 # failure is the default!
|
||||
|
||||
# Exit handler (installed by the trap below): calls _cleanup_test_img and
# then unloads the null_blk kernel module that this test loads via modprobe.
_cleanup()
|
||||
{
|
||||
_cleanup_test_img
|
||||
sudo -n rmmod null_blk
|
||||
}
|
||||
trap "_cleanup; exit \$status" 0 1 2 3 15
|
||||
|
||||
# get standard environment, filters and checks
|
||||
. ../common.rc
|
||||
. ../common.filter
|
||||
. ../common.qemu
|
||||
|
||||
# This test only runs on Linux hosts with raw image files.
|
||||
_supported_fmt raw
|
||||
_supported_proto file
|
||||
_supported_os Linux
|
||||
|
||||
sudo -n true || \
|
||||
_notrun 'Password-less sudo required'
|
||||
|
||||
IMG="--image-opts -n driver=host_device,filename=/dev/nullb0"
|
||||
QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
|
||||
|
||||
echo "Testing a null_blk device:"
|
||||
echo "case 1: if the operations work"
|
||||
# Skip (rather than fail on every later qemu-io call) when the zoned
# null_blk module cannot be loaded on this host.
sudo -n modprobe null_blk nr_devices=1 zoned=1 || \
    _notrun 'null_blk kernel module with zoned support required'
|
||||
sudo -n chmod 0666 /dev/nullb0
|
||||
|
||||
echo "(1) report the first zone:"
|
||||
$QEMU_IO $IMG -c "zrp 0 1"
|
||||
echo
|
||||
echo "report the first 10 zones"
|
||||
$QEMU_IO $IMG -c "zrp 0 10"
|
||||
echo
|
||||
echo "report the last zone:"
|
||||
$QEMU_IO $IMG -c "zrp 0x3e70000000 2" # 0x3e70000000 / 512 = 0x1f380000
|
||||
echo
|
||||
echo
|
||||
echo "(2) opening the first zone"
|
||||
$QEMU_IO $IMG -c "zo 0 268435456" # 268435456 / 512 = 524288
|
||||
echo "report after:"
|
||||
$QEMU_IO $IMG -c "zrp 0 1"
|
||||
echo
|
||||
echo "opening the second zone"
|
||||
$QEMU_IO $IMG -c "zo 268435456 268435456" # second zone starts at byte 268435456 (sector 0x80000)
|
||||
echo "report after:"
|
||||
$QEMU_IO $IMG -c "zrp 268435456 1"
|
||||
echo
|
||||
echo "opening the last zone"
|
||||
$QEMU_IO $IMG -c "zo 0x3e70000000 268435456"
|
||||
echo "report after:"
|
||||
$QEMU_IO $IMG -c "zrp 0x3e70000000 2"
|
||||
echo
|
||||
echo
|
||||
echo "(3) closing the first zone"
|
||||
$QEMU_IO $IMG -c "zc 0 268435456"
|
||||
echo "report after:"
|
||||
$QEMU_IO $IMG -c "zrp 0 1"
|
||||
echo
|
||||
echo "closing the last zone"
|
||||
$QEMU_IO $IMG -c "zc 0x3e70000000 268435456"
|
||||
echo "report after:"
|
||||
$QEMU_IO $IMG -c "zrp 0x3e70000000 2"
|
||||
echo
|
||||
echo
|
||||
echo "(4) finishing the second zone"
|
||||
$QEMU_IO $IMG -c "zf 268435456 268435456"
|
||||
echo "After finishing a zone:"
|
||||
$QEMU_IO $IMG -c "zrp 268435456 1"
|
||||
echo
|
||||
echo
|
||||
echo "(5) resetting the second zone"
|
||||
$QEMU_IO $IMG -c "zrs 268435456 268435456"
|
||||
echo "After resetting a zone:"
|
||||
$QEMU_IO $IMG -c "zrp 268435456 1"
|
||||
echo
|
||||
echo
|
||||
echo "(6) append write" # the physical block size of the device is 4096
|
||||
$QEMU_IO $IMG -c "zrp 0 1"
|
||||
$QEMU_IO $IMG -c "zap -p 0 0x1000 0x2000"
|
||||
echo "After appending the first zone firstly:"
|
||||
$QEMU_IO $IMG -c "zrp 0 1"
|
||||
$QEMU_IO $IMG -c "zap -p 0 0x1000 0x2000"
|
||||
echo "After appending the first zone secondly:"
|
||||
$QEMU_IO $IMG -c "zrp 0 1"
|
||||
$QEMU_IO $IMG -c "zap -p 268435456 0x1000 0x2000"
|
||||
echo "After appending the second zone firstly:"
|
||||
$QEMU_IO $IMG -c "zrp 268435456 1"
|
||||
$QEMU_IO $IMG -c "zap -p 268435456 0x1000 0x2000"
|
||||
echo "After appending the second zone secondly:"
|
||||
$QEMU_IO $IMG -c "zrp 268435456 1"
|
||||
|
||||
# success, all done
|
||||
echo "*** done"
|
||||
# Quote the expansion so a test name containing whitespace still removes
# exactly one file.
rm -f "$seq.full"
|
||||
status=0
|
|
@ -0,0 +1,69 @@
|
|||
QA output created by zoned
|
||||
Testing a null_blk device:
|
||||
case 1: if the operations work
|
||||
(1) report the first zone:
|
||||
start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
|
||||
|
||||
report the first 10 zones
|
||||
start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
|
||||
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
|
||||
start: 0x100000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:1, [type: 2]
|
||||
start: 0x180000, len 0x80000, cap 0x80000, wptr 0x180000, zcond:1, [type: 2]
|
||||
start: 0x200000, len 0x80000, cap 0x80000, wptr 0x200000, zcond:1, [type: 2]
|
||||
start: 0x280000, len 0x80000, cap 0x80000, wptr 0x280000, zcond:1, [type: 2]
|
||||
start: 0x300000, len 0x80000, cap 0x80000, wptr 0x300000, zcond:1, [type: 2]
|
||||
start: 0x380000, len 0x80000, cap 0x80000, wptr 0x380000, zcond:1, [type: 2]
|
||||
start: 0x400000, len 0x80000, cap 0x80000, wptr 0x400000, zcond:1, [type: 2]
|
||||
start: 0x480000, len 0x80000, cap 0x80000, wptr 0x480000, zcond:1, [type: 2]
|
||||
|
||||
report the last zone:
|
||||
start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:1, [type: 2]
|
||||
|
||||
|
||||
(2) opening the first zone
|
||||
report after:
|
||||
start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:3, [type: 2]
|
||||
|
||||
opening the second zone
|
||||
report after:
|
||||
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:3, [type: 2]
|
||||
|
||||
opening the last zone
|
||||
report after:
|
||||
start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:3, [type: 2]
|
||||
|
||||
|
||||
(3) closing the first zone
|
||||
report after:
|
||||
start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
|
||||
|
||||
closing the last zone
|
||||
report after:
|
||||
start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:1, [type: 2]
|
||||
|
||||
|
||||
(4) finishing the second zone
|
||||
After finishing a zone:
|
||||
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:14, [type: 2]
|
||||
|
||||
|
||||
(5) resetting the second zone
|
||||
After resetting a zone:
|
||||
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
|
||||
|
||||
|
||||
(6) append write
|
||||
start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
|
||||
After zap done, the append sector is 0x0
|
||||
After appending the first zone firstly:
|
||||
start: 0x0, len 0x80000, cap 0x80000, wptr 0x18, zcond:2, [type: 2]
|
||||
After zap done, the append sector is 0x18
|
||||
After appending the first zone secondly:
|
||||
start: 0x0, len 0x80000, cap 0x80000, wptr 0x30, zcond:2, [type: 2]
|
||||
After zap done, the append sector is 0x80000
|
||||
After appending the second zone firstly:
|
||||
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80018, zcond:2, [type: 2]
|
||||
After zap done, the append sector is 0x80018
|
||||
After appending the second zone secondly:
|
||||
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80030, zcond:2, [type: 2]
|
||||
*** done
|
Loading…
Reference in New Issue