Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into staging

Pull request

This pull request contains Sam Li's zoned storage support in the QEMU block
layer and virtio-blk emulation.

v2:
- Sam fixed the CI failures. CI passes for me now. [Richard]

# -----BEGIN PGP SIGNATURE-----
#
# iQEzBAABCAAdFiEEhpWov9P5fNqsNXdanKSrs4Grc8gFAmRiWCgACgkQnKSrs4Gr
# c8h/7gf+MMm2cGEaf376t8HMwTc6wbXVfbmAlZrge2EXPZfFvEaxj7HClcEraOgV
# yJsGWeU6mOw4r68ICJ/4KhrY1cdv+VZym/LsMLMcFUTXFHnyX4pyU3am31FPOI4K
# +wrDYJOJhc4DkAESWGgEWiMKpuO/uUEgBmHdW+qPFCl77Yl/eP6H5uNP6nGFn55p
# QpS/l8iha7PDkc81EsrjA+e/YI0ubfNSP7+zZElhQ98354CQ0MCfmZ6h9bT+o2bu
# R7SBUj80e+2X0a1b9s/2Jz/x8l4TEsl8kr48/Q1usq3GVVkbjEgqsk6wTN13Q/4g
# CeIR7E61ZeYzmpb4tLFRIqK2Jw+NEQ==
# =Q8xW
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 15 May 2023 09:04:56 AM PDT
# gpg:                using RSA key 8695A8BFD3F97CDAAC35775A9CA4ABB381AB73C8
# gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>" [full]
# gpg:                 aka "Stefan Hajnoczi <stefanha@gmail.com>" [full]

* tag 'block-pull-request' of https://gitlab.com/stefanha/qemu:
  docs/zoned-storage:add zoned emulation use case
  virtio-blk: add some trace events for zoned emulation
  block: add accounting for zone append operation
  virtio-blk: add zoned storage emulation for zoned devices
  block: add some trace events for zone append
  qemu-iotests: test zone append operation
  block: introduce zone append write for zoned devices
  file-posix: add tracking of the zone write pointers
  docs/zoned-storage: add zoned device documentation
  block: add some trace events for new block layer APIs
  iotests: test new zone operations
  block: add zoned BlockDriver check to block layer
  block/raw-format: add zone operations to pass through requests
  block/block-backend: add block layer APIs resembling Linux ZonedBlockDevice ioctls
  block/file-posix: introduce helper functions for sysfs attributes
  block/block-common: add zoned device structs

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Richard Henderson 2023-05-15 13:54:33 -07:00
commit ab4c44d657
30 changed files with 2106 additions and 58 deletions

block.c

@@ -7982,6 +7982,25 @@ void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
return;
}
/*
* Non-zoned block drivers do not follow zoned storage constraints
* (i.e. sequential writes to zones). Refuse mixing zoned and non-zoned
* drivers in a graph.
*/
if (!parent_bs->drv->supports_zoned_children &&
child_bs->bl.zoned == BLK_Z_HM) {
/*
* The host-aware model allows zoned storage constraints as well as random
* writes. Allow mixing host-aware and non-zoned drivers: a host-aware
* device can be used as a regular device.
*/
error_setg(errp, "Cannot add a %s child to a %s parent",
child_bs->bl.zoned == BLK_Z_HM ? "zoned" : "non-zoned",
parent_bs->drv->supports_zoned_children ?
"support zoned children" : "not support zoned children");
return;
}
if (!QLIST_EMPTY(&child_bs->parents)) {
error_setg(errp, "The node %s already has a parent",
child_bs->node_name);

block/block-backend.c

@@ -1845,6 +1845,204 @@ int coroutine_fn blk_co_flush(BlockBackend *blk)
return ret;
}
static void coroutine_fn blk_aio_zone_report_entry(void *opaque)
{
BlkAioEmAIOCB *acb = opaque;
BlkRwCo *rwco = &acb->rwco;
rwco->ret = blk_co_zone_report(rwco->blk, rwco->offset,
(unsigned int*)(uintptr_t)acb->bytes,
rwco->iobuf);
blk_aio_complete(acb);
}
BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
unsigned int *nr_zones,
BlockZoneDescriptor *zones,
BlockCompletionFunc *cb, void *opaque)
{
BlkAioEmAIOCB *acb;
Coroutine *co;
IO_CODE();
blk_inc_in_flight(blk);
acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
acb->rwco = (BlkRwCo) {
.blk = blk,
.offset = offset,
.iobuf = zones,
.ret = NOT_DONE,
};
acb->bytes = (int64_t)(uintptr_t)nr_zones;
acb->has_returned = false;
co = qemu_coroutine_create(blk_aio_zone_report_entry, acb);
aio_co_enter(blk_get_aio_context(blk), co);
acb->has_returned = true;
if (acb->rwco.ret != NOT_DONE) {
replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
blk_aio_complete_bh, acb);
}
return &acb->common;
}
static void coroutine_fn blk_aio_zone_mgmt_entry(void *opaque)
{
BlkAioEmAIOCB *acb = opaque;
BlkRwCo *rwco = &acb->rwco;
rwco->ret = blk_co_zone_mgmt(rwco->blk,
(BlockZoneOp)(uintptr_t)rwco->iobuf,
rwco->offset, acb->bytes);
blk_aio_complete(acb);
}
BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
int64_t offset, int64_t len,
BlockCompletionFunc *cb, void *opaque) {
BlkAioEmAIOCB *acb;
Coroutine *co;
IO_CODE();
blk_inc_in_flight(blk);
acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
acb->rwco = (BlkRwCo) {
.blk = blk,
.offset = offset,
.iobuf = (void *)(uintptr_t)op,
.ret = NOT_DONE,
};
acb->bytes = len;
acb->has_returned = false;
co = qemu_coroutine_create(blk_aio_zone_mgmt_entry, acb);
aio_co_enter(blk_get_aio_context(blk), co);
acb->has_returned = true;
if (acb->rwco.ret != NOT_DONE) {
replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
blk_aio_complete_bh, acb);
}
return &acb->common;
}
static void coroutine_fn blk_aio_zone_append_entry(void *opaque)
{
BlkAioEmAIOCB *acb = opaque;
BlkRwCo *rwco = &acb->rwco;
rwco->ret = blk_co_zone_append(rwco->blk, (int64_t *)(uintptr_t)acb->bytes,
rwco->iobuf, rwco->flags);
blk_aio_complete(acb);
}
BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
QEMUIOVector *qiov, BdrvRequestFlags flags,
BlockCompletionFunc *cb, void *opaque) {
BlkAioEmAIOCB *acb;
Coroutine *co;
IO_CODE();
blk_inc_in_flight(blk);
acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
acb->rwco = (BlkRwCo) {
.blk = blk,
.ret = NOT_DONE,
.flags = flags,
.iobuf = qiov,
};
acb->bytes = (int64_t)(uintptr_t)offset;
acb->has_returned = false;
co = qemu_coroutine_create(blk_aio_zone_append_entry, acb);
aio_co_enter(blk_get_aio_context(blk), co);
acb->has_returned = true;
if (acb->rwco.ret != NOT_DONE) {
replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
blk_aio_complete_bh, acb);
}
return &acb->common;
}
/*
* Send a zone_report command.
* offset is a byte offset from the start of the device. No alignment
* required for offset.
* nr_zones represents IN maximum and OUT actual.
*/
int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
unsigned int *nr_zones,
BlockZoneDescriptor *zones)
{
int ret;
IO_CODE();
blk_inc_in_flight(blk); /* increase before waiting */
blk_wait_while_drained(blk);
GRAPH_RDLOCK_GUARD();
if (!blk_is_available(blk)) {
blk_dec_in_flight(blk);
return -ENOMEDIUM;
}
ret = bdrv_co_zone_report(blk_bs(blk), offset, nr_zones, zones);
blk_dec_in_flight(blk);
return ret;
}
/*
* Send a zone_management command.
* op is the zone operation;
* offset is the byte offset from the start of the zoned device;
* len is the maximum number of bytes the command should operate on. It
* should be aligned with the device zone size.
*/
int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
int64_t offset, int64_t len)
{
int ret;
IO_CODE();
blk_inc_in_flight(blk);
blk_wait_while_drained(blk);
GRAPH_RDLOCK_GUARD();
ret = blk_check_byte_request(blk, offset, len);
if (ret < 0) {
blk_dec_in_flight(blk);
return ret;
}
ret = bdrv_co_zone_mgmt(blk_bs(blk), op, offset, len);
blk_dec_in_flight(blk);
return ret;
}
/*
* Send a zone_append command.
*/
int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
QEMUIOVector *qiov, BdrvRequestFlags flags)
{
int ret;
IO_CODE();
blk_inc_in_flight(blk);
blk_wait_while_drained(blk);
GRAPH_RDLOCK_GUARD();
if (!blk_is_available(blk)) {
blk_dec_in_flight(blk);
return -ENOMEDIUM;
}
ret = bdrv_co_zone_append(blk_bs(blk), offset, qiov, flags);
blk_dec_in_flight(blk);
return ret;
}
void blk_drain(BlockBackend *blk)
{
BlockDriverState *bs = blk_bs(blk);

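The doc comments above capture the API contract: blk_co_zone_report() treats *nr_zones as both input (the capacity of the zones[] array) and output (the number of descriptors actually filled in), and blk_co_zone_mgmt() takes a byte range aligned to the zone size. A minimal, hypothetical caller sketch (not part of this series; assumes the usual QEMU and GLib headers):

static int coroutine_fn dump_and_reset_first_zone(BlockBackend *blk)
{
    unsigned int nr_zones = 16;          /* IN: room for 16 descriptors */
    g_autofree BlockZoneDescriptor *zones = g_new0(BlockZoneDescriptor,
                                                   nr_zones);
    int ret = blk_co_zone_report(blk, 0, &nr_zones, zones);

    if (ret < 0) {
        return ret;
    }
    for (unsigned int i = 0; i < nr_zones; i++) { /* OUT: zones returned */
        printf("zone start 0x%" PRIx64 " wp 0x%" PRIx64 "\n",
               zones[i].start, zones[i].wp);
    }
    /* Reset the first zone; len must be aligned to the zone size. */
    return blk_co_zone_mgmt(blk, BLK_ZO_RESET, 0, blk_bs(blk)->bl.zone_size);
}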
block/file-posix.c

@@ -68,6 +68,9 @@
#include <sys/param.h>
#include <sys/syscall.h>
#include <sys/vfs.h>
#if defined(CONFIG_BLKZONED)
#include <linux/blkzoned.h>
#endif
#include <linux/cdrom.h>
#include <linux/fd.h>
#include <linux/fs.h>
@@ -157,6 +160,7 @@ typedef struct BDRVRawState {
bool has_write_zeroes:1;
bool use_linux_aio:1;
bool use_linux_io_uring:1;
int64_t *offset; /* offset of zone append operation */
int page_cache_inconsistent; /* errno from fdatasync failure */
bool has_fallocate;
bool needs_alignment;
@@ -216,6 +220,13 @@ typedef struct RawPosixAIOData {
PreallocMode prealloc;
Error **errp;
} truncate;
struct {
unsigned int *nr_zones;
BlockZoneDescriptor *zones;
} zone_report;
struct {
unsigned long op;
} zone_mgmt;
};
} RawPosixAIOData;
@@ -766,6 +777,18 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
goto fail;
}
}
#ifdef CONFIG_BLKZONED
/*
* The kernel page cache does not reliably work for writes to SWR zones
* of a zoned block device because it cannot guarantee the order of writes.
*/
if ((bs->bl.zoned != BLK_Z_NONE) &&
(!(s->open_flags & O_DIRECT))) {
error_setg(errp, "The driver supports zoned devices, and it requires "
"cache.direct=on, which was not specified.");
return -EINVAL; /* No host kernel page cache */
}
#endif
if (S_ISBLK(st.st_mode)) {
#ifdef __linux__
@@ -1202,15 +1225,91 @@ static int hdev_get_max_hw_transfer(int fd, struct stat *st)
#endif
}
/*
* Get a sysfs attribute value as character string.
*/
#ifdef CONFIG_LINUX
static int get_sysfs_str_val(struct stat *st, const char *attribute,
char **val) {
g_autofree char *sysfspath = NULL;
int ret;
size_t len;
if (!S_ISBLK(st->st_mode)) {
return -ENOTSUP;
}
sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/%s",
major(st->st_rdev), minor(st->st_rdev),
attribute);
ret = g_file_get_contents(sysfspath, val, &len, NULL);
if (ret == -1) {
return -ENOENT;
}
/* The file content ends with '\n' */
char *p;
p = *val;
if (*(p + len - 1) == '\n') {
*(p + len - 1) = '\0';
}
return ret;
}
#endif
#if defined(CONFIG_BLKZONED)
static int get_sysfs_zoned_model(struct stat *st, BlockZoneModel *zoned)
{
g_autofree char *val = NULL;
int ret;
ret = get_sysfs_str_val(st, "zoned", &val);
if (ret < 0) {
return ret;
}
if (strcmp(val, "host-managed") == 0) {
*zoned = BLK_Z_HM;
} else if (strcmp(val, "host-aware") == 0) {
*zoned = BLK_Z_HA;
} else if (strcmp(val, "none") == 0) {
*zoned = BLK_Z_NONE;
} else {
return -ENOTSUP;
}
return 0;
}
#endif /* defined(CONFIG_BLKZONED) */
/*
* Get a sysfs attribute value as a long integer.
*/
#ifdef CONFIG_LINUX
static long get_sysfs_long_val(struct stat *st, const char *attribute)
{
g_autofree char *str = NULL;
const char *end;
long val;
int ret;
ret = get_sysfs_str_val(st, attribute, &str);
if (ret < 0) {
return ret;
}
/* The file content ends with '\n'; pass 'end' to accept that. */
ret = qemu_strtol(str, &end, 10, &val);
if (ret == 0 && end && *end == '\0') {
ret = val;
}
return ret;
}
#endif
static int hdev_get_max_segments(int fd, struct stat *st)
{
#ifdef CONFIG_LINUX
char buf[32];
const char *end;
char *sysfspath = NULL;
int ret;
int sysfd = -1;
long max_segments;
if (S_ISCHR(st->st_mode)) {
if (ioctl(fd, SG_GET_SG_TABLESIZE, &ret) == 0) {
@@ -1218,44 +1317,176 @@ static int hdev_get_max_segments(int fd, struct stat *st)
}
return -ENOTSUP;
}
if (!S_ISBLK(st->st_mode)) {
return -ENOTSUP;
}
sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
major(st->st_rdev), minor(st->st_rdev));
sysfd = open(sysfspath, O_RDONLY);
if (sysfd == -1) {
ret = -errno;
goto out;
}
ret = RETRY_ON_EINTR(read(sysfd, buf, sizeof(buf) - 1));
if (ret < 0) {
ret = -errno;
goto out;
} else if (ret == 0) {
ret = -EIO;
goto out;
}
buf[ret] = 0;
/* The file is ended with '\n', pass 'end' to accept that. */
ret = qemu_strtol(buf, &end, 10, &max_segments);
if (ret == 0 && end && *end == '\n') {
ret = max_segments;
}
out:
if (sysfd != -1) {
close(sysfd);
}
g_free(sysfspath);
return ret;
return get_sysfs_long_val(st, "max_segments");
#else
return -ENOTSUP;
#endif
}
#if defined(CONFIG_BLKZONED)
/*
* If the reset_all flag is true, then the write pointers of all zones whose
* state is not read-only or offline are reset to the zone start sector.
* Otherwise, take the real write pointer of the device.
*/
static int get_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
unsigned int nrz, bool reset_all)
{
struct blk_zone *blkz;
size_t rep_size;
uint64_t sector = offset >> BDRV_SECTOR_BITS;
BlockZoneWps *wps = bs->wps;
unsigned int j = offset / bs->bl.zone_size;
unsigned int n = 0, i = 0;
int ret;
rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
g_autofree struct blk_zone_report *rep = NULL;
rep = g_malloc(rep_size);
blkz = (struct blk_zone *)(rep + 1);
while (n < nrz) {
memset(rep, 0, rep_size);
rep->sector = sector;
rep->nr_zones = nrz - n;
do {
ret = ioctl(fd, BLKREPORTZONE, rep);
} while (ret != 0 && errno == EINTR);
if (ret != 0) {
error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
fd, offset, errno);
return -errno;
}
if (!rep->nr_zones) {
break;
}
for (i = 0; i < rep->nr_zones; ++i, ++n, ++j) {
/*
* The wp tracking cares only about sequential-write-required and
* sequential-write-preferred zones, so that the wp can advance to
* the right location.
* Use the most significant bit of the wp location to indicate the
* zone type: 0 for SWR/SWP zones and 1 for conventional zones.
*/
if (blkz[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
wps->wp[j] |= 1ULL << 63;
} else {
switch(blkz[i].cond) {
case BLK_ZONE_COND_FULL:
case BLK_ZONE_COND_READONLY:
/* Zone not writable */
wps->wp[j] = (blkz[i].start + blkz[i].len) << BDRV_SECTOR_BITS;
break;
case BLK_ZONE_COND_OFFLINE:
/* Zone not writable nor readable */
wps->wp[j] = (blkz[i].start) << BDRV_SECTOR_BITS;
break;
default:
if (reset_all) {
wps->wp[j] = blkz[i].start << BDRV_SECTOR_BITS;
} else {
wps->wp[j] = blkz[i].wp << BDRV_SECTOR_BITS;
}
break;
}
}
}
sector = blkz[i - 1].start + blkz[i - 1].len;
}
return 0;
}
static void update_zones_wp(BlockDriverState *bs, int fd, int64_t offset,
unsigned int nrz)
{
if (get_zones_wp(bs, fd, offset, nrz, 0) < 0) {
error_report("update zone wp failed");
}
}
static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
Error **errp)
{
BDRVRawState *s = bs->opaque;
BlockZoneModel zoned;
int ret;
bs->bl.zoned = BLK_Z_NONE;
ret = get_sysfs_zoned_model(st, &zoned);
if (ret < 0 || zoned == BLK_Z_NONE) {
return;
}
bs->bl.zoned = zoned;
ret = get_sysfs_long_val(st, "max_open_zones");
if (ret >= 0) {
bs->bl.max_open_zones = ret;
}
ret = get_sysfs_long_val(st, "max_active_zones");
if (ret >= 0) {
bs->bl.max_active_zones = ret;
}
/*
* The zoned device must at least have zone size and nr_zones fields.
*/
ret = get_sysfs_long_val(st, "chunk_sectors");
if (ret < 0) {
error_setg_errno(errp, -ret, "Unable to read chunk_sectors "
"sysfs attribute");
return;
} else if (!ret) {
error_setg(errp, "Read 0 from chunk_sectors sysfs attribute");
return;
}
bs->bl.zone_size = ret << BDRV_SECTOR_BITS;
ret = get_sysfs_long_val(st, "nr_zones");
if (ret < 0) {
error_setg_errno(errp, -ret, "Unable to read nr_zones "
"sysfs attribute");
return;
} else if (!ret) {
error_setg(errp, "Read 0 from nr_zones sysfs attribute");
return;
}
bs->bl.nr_zones = ret;
ret = get_sysfs_long_val(st, "zone_append_max_bytes");
if (ret > 0) {
bs->bl.max_append_sectors = ret >> BDRV_SECTOR_BITS;
}
ret = get_sysfs_long_val(st, "physical_block_size");
if (ret >= 0) {
bs->bl.write_granularity = ret;
}
/* The refresh_limits() function can be called multiple times. */
g_free(bs->wps);
bs->wps = g_malloc(sizeof(BlockZoneWps) +
sizeof(int64_t) * bs->bl.nr_zones);
ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 0);
if (ret < 0) {
error_setg_errno(errp, -ret, "report wps failed");
bs->wps = NULL;
return;
}
qemu_co_mutex_init(&bs->wps->colock);
}
#else /* !defined(CONFIG_BLKZONED) */
static void raw_refresh_zoned_limits(BlockDriverState *bs, struct stat *st,
Error **errp)
{
bs->bl.zoned = BLK_Z_NONE;
}
#endif /* !defined(CONFIG_BLKZONED) */
static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
{
BDRVRawState *s = bs->opaque;
@@ -1297,6 +1528,8 @@ static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
bs->bl.max_hw_iov = ret;
}
}
raw_refresh_zoned_limits(bs, &st, errp);
}
static int check_for_dasd(int fd)
@@ -1320,9 +1553,12 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
BDRVRawState *s = bs->opaque;
int ret;
/* If DASD, get blocksizes */
/* If DASD or zoned devices, get blocksizes */
if (check_for_dasd(s->fd) < 0) {
return -ENOTSUP;
/* zoned devices are not DASD */
if (bs->bl.zoned == BLK_Z_NONE) {
return -ENOTSUP;
}
}
ret = probe_logical_blocksize(s->fd, &bsz->log);
if (ret < 0) {
@@ -1463,7 +1699,7 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
ssize_t len;
len = RETRY_ON_EINTR(
(aiocb->aio_type & QEMU_AIO_WRITE) ?
(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
qemu_pwritev(aiocb->aio_fildes,
aiocb->io.iov,
aiocb->io.niov,
@@ -1492,7 +1728,7 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
ssize_t len;
while (offset < aiocb->aio_nbytes) {
if (aiocb->aio_type & QEMU_AIO_WRITE) {
if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
len = pwrite(aiocb->aio_fildes,
(const char *)buf + offset,
aiocb->aio_nbytes - offset,
@@ -1585,7 +1821,7 @@ static int handle_aiocb_rw(void *opaque)
}
nbytes = handle_aiocb_rw_linear(aiocb, buf);
if (!(aiocb->aio_type & QEMU_AIO_WRITE)) {
if (!(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))) {
char *p = buf;
size_t count = aiocb->aio_nbytes, copy;
int i;
@@ -1790,6 +2026,147 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
}
#endif
/*
* parse_zone - Fill a zone descriptor
*/
#if defined(CONFIG_BLKZONED)
static inline int parse_zone(struct BlockZoneDescriptor *zone,
const struct blk_zone *blkz) {
zone->start = blkz->start << BDRV_SECTOR_BITS;
zone->length = blkz->len << BDRV_SECTOR_BITS;
zone->wp = blkz->wp << BDRV_SECTOR_BITS;
#ifdef HAVE_BLK_ZONE_REP_CAPACITY
zone->cap = blkz->capacity << BDRV_SECTOR_BITS;
#else
zone->cap = blkz->len << BDRV_SECTOR_BITS;
#endif
switch (blkz->type) {
case BLK_ZONE_TYPE_SEQWRITE_REQ:
zone->type = BLK_ZT_SWR;
break;
case BLK_ZONE_TYPE_SEQWRITE_PREF:
zone->type = BLK_ZT_SWP;
break;
case BLK_ZONE_TYPE_CONVENTIONAL:
zone->type = BLK_ZT_CONV;
break;
default:
error_report("Unsupported zone type: 0x%x", blkz->type);
return -ENOTSUP;
}
switch (blkz->cond) {
case BLK_ZONE_COND_NOT_WP:
zone->state = BLK_ZS_NOT_WP;
break;
case BLK_ZONE_COND_EMPTY:
zone->state = BLK_ZS_EMPTY;
break;
case BLK_ZONE_COND_IMP_OPEN:
zone->state = BLK_ZS_IOPEN;
break;
case BLK_ZONE_COND_EXP_OPEN:
zone->state = BLK_ZS_EOPEN;
break;
case BLK_ZONE_COND_CLOSED:
zone->state = BLK_ZS_CLOSED;
break;
case BLK_ZONE_COND_READONLY:
zone->state = BLK_ZS_RDONLY;
break;
case BLK_ZONE_COND_FULL:
zone->state = BLK_ZS_FULL;
break;
case BLK_ZONE_COND_OFFLINE:
zone->state = BLK_ZS_OFFLINE;
break;
default:
error_report("Unsupported zone state: 0x%x", blkz->cond);
return -ENOTSUP;
}
return 0;
}
#endif
#if defined(CONFIG_BLKZONED)
static int handle_aiocb_zone_report(void *opaque)
{
RawPosixAIOData *aiocb = opaque;
int fd = aiocb->aio_fildes;
unsigned int *nr_zones = aiocb->zone_report.nr_zones;
BlockZoneDescriptor *zones = aiocb->zone_report.zones;
/* zoned block devices use 512-byte sectors */
uint64_t sector = aiocb->aio_offset / 512;
struct blk_zone *blkz;
size_t rep_size;
unsigned int nrz;
int ret;
unsigned int n = 0, i = 0;
nrz = *nr_zones;
rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
g_autofree struct blk_zone_report *rep = NULL;
rep = g_malloc(rep_size);
blkz = (struct blk_zone *)(rep + 1);
while (n < nrz) {
memset(rep, 0, rep_size);
rep->sector = sector;
rep->nr_zones = nrz - n;
do {
ret = ioctl(fd, BLKREPORTZONE, rep);
} while (ret != 0 && errno == EINTR);
if (ret != 0) {
error_report("%d: ioctl BLKREPORTZONE at %" PRId64 " failed %d",
fd, sector, errno);
return -errno;
}
if (!rep->nr_zones) {
break;
}
for (i = 0; i < rep->nr_zones; i++, n++) {
ret = parse_zone(&zones[n], &blkz[i]);
if (ret != 0) {
return ret;
}
/* The next report should start after the last zone reported */
sector = blkz[i].start + blkz[i].len;
}
}
*nr_zones = n;
return 0;
}
#endif
#if defined(CONFIG_BLKZONED)
static int handle_aiocb_zone_mgmt(void *opaque)
{
RawPosixAIOData *aiocb = opaque;
int fd = aiocb->aio_fildes;
uint64_t sector = aiocb->aio_offset / 512;
int64_t nr_sectors = aiocb->aio_nbytes / 512;
struct blk_zone_range range;
int ret;
/* Execute the operation */
range.sector = sector;
range.nr_sectors = nr_sectors;
do {
ret = ioctl(fd, aiocb->zone_mgmt.op, &range);
} while (ret != 0 && errno == EINTR);
return ret < 0 ? -errno : ret;
}
#endif
static int handle_aiocb_copy_range(void *opaque)
{
RawPosixAIOData *aiocb = opaque;
@@ -2072,9 +2449,19 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
{
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb;
int ret;
if (fd_open(bs) < 0)
return -EIO;
#if defined(CONFIG_BLKZONED)
if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && bs->wps) {
qemu_co_mutex_lock(&bs->wps->colock);
if (type & QEMU_AIO_ZONE_APPEND && bs->bl.zone_size) {
int index = offset / bs->bl.zone_size;
offset = bs->wps->wp[index];
}
}
#endif
/*
* When using O_DIRECT, the request must be aligned to be able to use
@@ -2087,12 +2474,15 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
#ifdef CONFIG_LINUX_IO_URING
} else if (s->use_linux_io_uring) {
assert(qiov->size == bytes);
return luring_co_submit(bs, s->fd, offset, qiov, type);
ret = luring_co_submit(bs, s->fd, offset, qiov, type);
goto out;
#endif
#ifdef CONFIG_LINUX_AIO
} else if (s->use_linux_aio) {
assert(qiov->size == bytes);
return laio_co_submit(s->fd, offset, qiov, type, s->aio_max_batch);
ret = laio_co_submit(s->fd, offset, qiov, type,
s->aio_max_batch);
goto out;
#endif
}
@@ -2109,7 +2499,41 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, uint64_t offset,
};
assert(qiov->size == bytes);
return raw_thread_pool_submit(handle_aiocb_rw, &acb);
ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
goto out; /* Avoid a compiler error about an unused label */
out:
#if defined(CONFIG_BLKZONED)
{
BlockZoneWps *wps = bs->wps;
if (ret == 0) {
if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND))
&& wps && bs->bl.zone_size) {
uint64_t *wp = &wps->wp[offset / bs->bl.zone_size];
if (!BDRV_ZT_IS_CONV(*wp)) {
if (type & QEMU_AIO_ZONE_APPEND) {
*s->offset = *wp;
trace_zbd_zone_append_complete(bs, *s->offset
>> BDRV_SECTOR_BITS);
}
/* Advance the wp if needed */
if (offset + bytes > *wp) {
*wp = offset + bytes;
}
}
}
} else {
if (type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
update_zones_wp(bs, s->fd, 0, 1);
}
}
if ((type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) && wps) {
qemu_co_mutex_unlock(&wps->colock);
}
}
#endif
return ret;
}
static int coroutine_fn raw_co_preadv(BlockDriverState *bs, int64_t offset,
@@ -2212,6 +2636,9 @@ static void raw_close(BlockDriverState *bs)
BDRVRawState *s = bs->opaque;
if (s->fd >= 0) {
#if defined(CONFIG_BLKZONED)
g_free(bs->wps);
#endif
qemu_close(s->fd);
s->fd = -1;
}
@@ -2969,6 +3396,171 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
}
}
/*
* zone report - Get a zoned block device's information in the form
* of an array of zone descriptors.
* zones is an array of zone descriptors to hold zone information on reply;
* offset can be any byte within the entire size of the device;
* nr_zones is the maximum number of zones the command should operate on.
*/
#if defined(CONFIG_BLKZONED)
static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
unsigned int *nr_zones,
BlockZoneDescriptor *zones) {
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb = (RawPosixAIOData) {
.bs = bs,
.aio_fildes = s->fd,
.aio_type = QEMU_AIO_ZONE_REPORT,
.aio_offset = offset,
.zone_report = {
.nr_zones = nr_zones,
.zones = zones,
},
};
trace_zbd_zone_report(bs, *nr_zones, offset >> BDRV_SECTOR_BITS);
return raw_thread_pool_submit(handle_aiocb_zone_report, &acb);
}
#endif
/*
* zone management operations - Execute an operation on a zone
*/
#if defined(CONFIG_BLKZONED)
static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
int64_t offset, int64_t len) {
BDRVRawState *s = bs->opaque;
RawPosixAIOData acb;
int64_t zone_size, zone_size_mask;
const char *op_name;
unsigned long zo;
int ret;
BlockZoneWps *wps = bs->wps;
int64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
zone_size = bs->bl.zone_size;
zone_size_mask = zone_size - 1;
if (offset & zone_size_mask) {
error_report("sector offset %" PRId64 " is not aligned to zone size "
"%" PRId64 "", offset / 512, zone_size / 512);
return -EINVAL;
}
if (((offset + len) < capacity && len & zone_size_mask) ||
offset + len > capacity) {
error_report("number of sectors %" PRId64 " is not aligned to zone size"
" %" PRId64 "", len / 512, zone_size / 512);
return -EINVAL;
}
uint32_t i = offset / bs->bl.zone_size;
uint32_t nrz = len / bs->bl.zone_size;
uint64_t *wp = &wps->wp[i];
if (BDRV_ZT_IS_CONV(*wp) && len != capacity) {
error_report("zone mgmt operations are not allowed for conventional zones");
return -EIO;
}
switch (op) {
case BLK_ZO_OPEN:
op_name = "BLKOPENZONE";
zo = BLKOPENZONE;
break;
case BLK_ZO_CLOSE:
op_name = "BLKCLOSEZONE";
zo = BLKCLOSEZONE;
break;
case BLK_ZO_FINISH:
op_name = "BLKFINISHZONE";
zo = BLKFINISHZONE;
break;
case BLK_ZO_RESET:
op_name = "BLKRESETZONE";
zo = BLKRESETZONE;
break;
default:
error_report("Unsupported zone op: 0x%x", op);
return -ENOTSUP;
}
acb = (RawPosixAIOData) {
.bs = bs,
.aio_fildes = s->fd,
.aio_type = QEMU_AIO_ZONE_MGMT,
.aio_offset = offset,
.aio_nbytes = len,
.zone_mgmt = {
.op = zo,
},
};
trace_zbd_zone_mgmt(bs, op_name, offset >> BDRV_SECTOR_BITS,
len >> BDRV_SECTOR_BITS);
ret = raw_thread_pool_submit(handle_aiocb_zone_mgmt, &acb);
if (ret != 0) {
update_zones_wp(bs, s->fd, offset, i);
error_report("ioctl %s failed %d", op_name, ret);
return ret;
}
if (zo == BLKRESETZONE && len == capacity) {
ret = get_zones_wp(bs, s->fd, 0, bs->bl.nr_zones, 1);
if (ret < 0) {
error_report("reporting single wp failed");
return ret;
}
} else if (zo == BLKRESETZONE) {
for (unsigned int j = 0; j < nrz; ++j) {
wp[j] = offset + j * zone_size;
}
} else if (zo == BLKFINISHZONE) {
for (unsigned int j = 0; j < nrz; ++j) {
/* The zoned device allows the last zone to be smaller than
* the zone size. */
wp[j] = MIN(offset + (j + 1) * zone_size, offset + len);
}
}
return ret;
}
#endif
#if defined(CONFIG_BLKZONED)
static int coroutine_fn raw_co_zone_append(BlockDriverState *bs,
int64_t *offset,
QEMUIOVector *qiov,
BdrvRequestFlags flags) {
assert(flags == 0);
int64_t zone_size_mask = bs->bl.zone_size - 1;
int64_t iov_len = 0;
int64_t len = 0;
BDRVRawState *s = bs->opaque;
s->offset = offset;
if (*offset & zone_size_mask) {
error_report("sector offset %" PRId64 " is not aligned to zone size "
"%" PRId32 "", *offset / 512, bs->bl.zone_size / 512);
return -EINVAL;
}
int64_t wg = bs->bl.write_granularity;
int64_t wg_mask = wg - 1;
for (int i = 0; i < qiov->niov; i++) {
iov_len = qiov->iov[i].iov_len;
if (iov_len & wg_mask) {
error_report("len of IOVector[%d] %" PRId64 " is not aligned to "
"block size %" PRId64 "", i, iov_len, wg);
return -EINVAL;
}
len += iov_len;
}
trace_zbd_zone_append(bs, *offset >> BDRV_SECTOR_BITS);
return raw_co_prw(bs, *offset, len, qiov, QEMU_AIO_ZONE_APPEND);
}
#endif
static coroutine_fn int
raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
bool blkdev)
@@ -3724,6 +4316,14 @@ static BlockDriver bdrv_host_device = {
#ifdef __linux__
.bdrv_co_ioctl = hdev_co_ioctl,
#endif
/* zoned device */
#if defined(CONFIG_BLKZONED)
/* zone management operations */
.bdrv_co_zone_report = raw_co_zone_report,
.bdrv_co_zone_mgmt = raw_co_zone_mgmt,
.bdrv_co_zone_append = raw_co_zone_append,
#endif
};
#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)

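As a worked example of the sysfs probing above (illustrative values, not from this patch): an 8 GiB null_blk disk with 256 MiB zones reports its geometry in 512-byte sectors, and raw_refresh_zoned_limits() converts it to the byte values the block layer stores:

long chunk_sectors = 524288;  /* queue/chunk_sectors, in 512-byte sectors */
int64_t zone_size = chunk_sectors << BDRV_SECTOR_BITS; /* 268435456 bytes */
long nr_zones = 32;           /* queue/nr_zones: 8 GiB / 256 MiB          */
/* One tracked write pointer per zone, guarded by bs->wps->colock: */
size_t wps_size = sizeof(BlockZoneWps) + sizeof(int64_t) * nr_zones;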
block/io.c

@@ -3113,6 +3113,74 @@ out:
return co.ret;
}
int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset,
unsigned int *nr_zones,
BlockZoneDescriptor *zones)
{
BlockDriver *drv = bs->drv;
CoroutineIOCompletion co = {
.coroutine = qemu_coroutine_self(),
};
IO_CODE();
bdrv_inc_in_flight(bs);
if (!drv || !drv->bdrv_co_zone_report || bs->bl.zoned == BLK_Z_NONE) {
co.ret = -ENOTSUP;
goto out;
}
co.ret = drv->bdrv_co_zone_report(bs, offset, nr_zones, zones);
out:
bdrv_dec_in_flight(bs);
return co.ret;
}
int coroutine_fn bdrv_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
int64_t offset, int64_t len)
{
BlockDriver *drv = bs->drv;
CoroutineIOCompletion co = {
.coroutine = qemu_coroutine_self(),
};
IO_CODE();
bdrv_inc_in_flight(bs);
if (!drv || !drv->bdrv_co_zone_mgmt || bs->bl.zoned == BLK_Z_NONE) {
co.ret = -ENOTSUP;
goto out;
}
co.ret = drv->bdrv_co_zone_mgmt(bs, op, offset, len);
out:
bdrv_dec_in_flight(bs);
return co.ret;
}
int coroutine_fn bdrv_co_zone_append(BlockDriverState *bs, int64_t *offset,
QEMUIOVector *qiov,
BdrvRequestFlags flags)
{
int ret;
BlockDriver *drv = bs->drv;
CoroutineIOCompletion co = {
.coroutine = qemu_coroutine_self(),
};
IO_CODE();
ret = bdrv_check_qiov_request(*offset, qiov->size, qiov, 0, NULL);
if (ret < 0) {
return ret;
}
bdrv_inc_in_flight(bs);
if (!drv || !drv->bdrv_co_zone_append || bs->bl.zoned == BLK_Z_NONE) {
co.ret = -ENOTSUP;
goto out;
}
co.ret = drv->bdrv_co_zone_append(bs, offset, qiov, flags);
out:
bdrv_dec_in_flight(bs);
return co.ret;
}
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
IO_CODE();

block/io_uring.c

@@ -350,6 +350,10 @@ static int luring_do_submit(int fd, LuringAIOCB *luringcb, LuringState *s,
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
break;
case QEMU_AIO_ZONE_APPEND:
io_uring_prep_writev(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);
break;
case QEMU_AIO_READ:
io_uring_prep_readv(sqes, fd, luringcb->qiov->iov,
luringcb->qiov->niov, offset);

block/linux-aio.c

@@ -394,6 +394,9 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
case QEMU_AIO_WRITE:
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
break;
case QEMU_AIO_ZONE_APPEND:
io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
break;
case QEMU_AIO_READ:
io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
break;

block/qapi-sysemu.c

@@ -517,6 +517,7 @@ void qmp_block_latency_histogram_set(
bool has_boundaries, uint64List *boundaries,
bool has_boundaries_read, uint64List *boundaries_read,
bool has_boundaries_write, uint64List *boundaries_write,
bool has_boundaries_append, uint64List *boundaries_append,
bool has_boundaries_flush, uint64List *boundaries_flush,
Error **errp)
{
@@ -557,6 +558,16 @@ void qmp_block_latency_histogram_set(
}
}
if (has_boundaries || has_boundaries_append) {
ret = block_latency_histogram_set(
stats, BLOCK_ACCT_ZONE_APPEND,
has_boundaries_append ? boundaries_append : boundaries);
if (ret) {
error_setg(errp, "Device '%s' set append write boundaries fail", id);
return;
}
}
if (has_boundaries || has_boundaries_flush) {
ret = block_latency_histogram_set(
stats, BLOCK_ACCT_FLUSH,

block/qapi.c

@@ -533,27 +533,36 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
ds->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
ds->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
ds->zone_append_bytes = stats->nr_bytes[BLOCK_ACCT_ZONE_APPEND];
ds->unmap_bytes = stats->nr_bytes[BLOCK_ACCT_UNMAP];
ds->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
ds->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
ds->zone_append_operations = stats->nr_ops[BLOCK_ACCT_ZONE_APPEND];
ds->unmap_operations = stats->nr_ops[BLOCK_ACCT_UNMAP];
ds->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
ds->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
ds->failed_zone_append_operations =
stats->failed_ops[BLOCK_ACCT_ZONE_APPEND];
ds->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
ds->failed_unmap_operations = stats->failed_ops[BLOCK_ACCT_UNMAP];
ds->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
ds->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
ds->invalid_zone_append_operations =
stats->invalid_ops[BLOCK_ACCT_ZONE_APPEND];
ds->invalid_flush_operations =
stats->invalid_ops[BLOCK_ACCT_FLUSH];
ds->invalid_unmap_operations = stats->invalid_ops[BLOCK_ACCT_UNMAP];
ds->rd_merged = stats->merged[BLOCK_ACCT_READ];
ds->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
ds->zone_append_merged = stats->merged[BLOCK_ACCT_ZONE_APPEND];
ds->unmap_merged = stats->merged[BLOCK_ACCT_UNMAP];
ds->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
ds->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
ds->zone_append_total_time_ns =
stats->total_time_ns[BLOCK_ACCT_ZONE_APPEND];
ds->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
ds->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
ds->unmap_total_time_ns = stats->total_time_ns[BLOCK_ACCT_UNMAP];
@@ -571,6 +580,7 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
TimedAverage *zap = &ts->latency[BLOCK_ACCT_ZONE_APPEND];
TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];
dev_stats->interval_length = ts->interval_length;
@@ -583,6 +593,10 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
dev_stats->max_wr_latency_ns = timed_average_max(wr);
dev_stats->avg_wr_latency_ns = timed_average_avg(wr);
dev_stats->min_zone_append_latency_ns = timed_average_min(zap);
dev_stats->max_zone_append_latency_ns = timed_average_max(zap);
dev_stats->avg_zone_append_latency_ns = timed_average_avg(zap);
dev_stats->min_flush_latency_ns = timed_average_min(fl);
dev_stats->max_flush_latency_ns = timed_average_max(fl);
dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
@@ -591,6 +605,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
block_acct_queue_depth(ts, BLOCK_ACCT_READ);
dev_stats->avg_wr_queue_depth =
block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
dev_stats->avg_zone_append_queue_depth =
block_acct_queue_depth(ts, BLOCK_ACCT_ZONE_APPEND);
QAPI_LIST_PREPEND(ds->timed_stats, dev_stats);
}
@@ -600,6 +616,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk)
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_READ]);
ds->wr_latency_histogram
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_WRITE]);
ds->zone_append_latency_histogram
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_ZONE_APPEND]);
ds->flush_latency_histogram
= bdrv_latency_histogram_stats(&hgram[BLOCK_ACCT_FLUSH]);
}

block/raw-format.c

@@ -317,6 +317,28 @@ raw_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
return bdrv_co_pdiscard(bs->file, offset, bytes);
}
static int coroutine_fn GRAPH_RDLOCK
raw_co_zone_report(BlockDriverState *bs, int64_t offset,
unsigned int *nr_zones,
BlockZoneDescriptor *zones)
{
return bdrv_co_zone_report(bs->file->bs, offset, nr_zones, zones);
}
static int coroutine_fn GRAPH_RDLOCK
raw_co_zone_mgmt(BlockDriverState *bs, BlockZoneOp op,
int64_t offset, int64_t len)
{
return bdrv_co_zone_mgmt(bs->file->bs, op, offset, len);
}
static int coroutine_fn GRAPH_RDLOCK
raw_co_zone_append(BlockDriverState *bs, int64_t *offset, QEMUIOVector *qiov,
BdrvRequestFlags flags)
{
return bdrv_co_zone_append(bs->file->bs, offset, qiov, flags);
}
static int64_t coroutine_fn GRAPH_RDLOCK
raw_co_getlength(BlockDriverState *bs)
{
@@ -608,6 +630,7 @@ static void raw_child_perm(BlockDriverState *bs, BdrvChild *c,
BlockDriver bdrv_raw = {
.format_name = "raw",
.instance_size = sizeof(BDRVRawState),
.supports_zoned_children = true,
.bdrv_probe = &raw_probe,
.bdrv_reopen_prepare = &raw_reopen_prepare,
.bdrv_reopen_commit = &raw_reopen_commit,
@@ -619,6 +642,9 @@ BlockDriver bdrv_raw = {
.bdrv_co_pwritev = &raw_co_pwritev,
.bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
.bdrv_co_pdiscard = &raw_co_pdiscard,
.bdrv_co_zone_report = &raw_co_zone_report,
.bdrv_co_zone_mgmt = &raw_co_zone_mgmt,
.bdrv_co_zone_append = &raw_co_zone_append,
.bdrv_co_block_status = &raw_co_block_status,
.bdrv_co_copy_range_from = &raw_co_copy_range_from,
.bdrv_co_copy_range_to = &raw_co_copy_range_to,

block/trace-events

@@ -209,6 +209,10 @@ file_FindEjectableOpticalMedia(const char *media) "Matching using %s"
file_setup_cdrom(const char *partition) "Using %s as optical disc"
file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d"
file_flush_fdatasync_failed(int err) "errno %d"
zbd_zone_report(void *bs, unsigned int nr_zones, int64_t sector) "bs %p report %d zones starting at sector offset 0x%" PRIx64 ""
zbd_zone_mgmt(void *bs, const char *op_name, int64_t sector, int64_t len) "bs %p %s starts at sector offset 0x%" PRIx64 " over a range of 0x%" PRIx64 " sectors"
zbd_zone_append(void *bs, int64_t sector) "bs %p append at sector offset 0x%" PRIx64 ""
zbd_zone_append_complete(void *bs, int64_t sector) "bs %p returns append sector 0x%" PRIx64 ""
# ssh.c
sftp_error(const char *op, const char *ssh_err, int ssh_err_code, int sftp_err_code) "%s failed: %s (libssh error code: %d, sftp error code: %d)"

docs/devel/index-api.rst

@@ -12,3 +12,4 @@ generated from in-code annotations to function prototypes.
memory
modules
ui
zoned-storage

docs/devel/zoned-storage.rst

@@ -0,0 +1,62 @@
=============
zoned-storage
=============
Zoned Block Devices (ZBDs) divide the LBA space into block regions called zones
that are larger than the LBA size. Zones can only be written sequentially,
which reduces write amplification in SSDs and can potentially lead to higher
throughput and increased capacity. More details about ZBDs can be found at:
https://zonedstorage.io/docs/introduction/zoned-storage
1. Block layer APIs for zoned storage
-------------------------------------
The QEMU block layer supports three zoned storage models:
- BLK_Z_HM: The host-managed zoned model only allows sequential write access
to zones. It supports ZBD-specific I/O commands that can be used by a host to
manage the zones of a device.
- BLK_Z_HA: The host-aware zoned model allows random write operations in
zones, making it backward compatible with regular block devices.
- BLK_Z_NONE: The non-zoned model has no zone support. It includes both
regular and drive-managed ZBD devices. ZBD-specific I/O commands are not
supported.
The block device information resides inside BlockDriverState. QEMU uses the
BlockLimits struct (BlockDriverState::bl) that is continuously accessed by the
block layer while processing I/O requests. A BlockBackend has a root pointer to
a BlockDriverState graph (for example, raw format on top of file-posix). The
zoned storage information can be propagated from the leaf BlockDriverState all
the way up to the BlockBackend. If the zoned storage model in file-posix is
set to BLK_Z_HM, then the block drivers will declare support for zoned host
devices.
The block layer APIs support commands needed for zoned storage devices,
including report zones, four zone operations, and zone append.
2. Emulating zoned storage controllers
--------------------------------------
When the BlockBackend's BlockLimits model reports a zoned storage device, users
like the virtio-blk emulation or the qemu-io-cmds.c utility can use block layer
APIs for zoned storage emulation or testing.
For example, to test zone_report on a null_blk device using qemu-io::
$ path/to/qemu-io --image-opts -n driver=host_device,filename=/dev/nullb0 -c "zrp offset nr_zones"
To expose the host's zoned block device through virtio-blk, the command line
can be as follows (including the -device parameter)::
-blockdev node-name=drive0,driver=host_device,filename=/dev/nullb0,cache.direct=on \
-device virtio-blk-pci,drive=drive0
Or only use the -drive parameter::
-drive driver=host_device,file=/dev/nullb0,if=virtio,cache.direct=on
Additionally, QEMU has several ways of supporting zoned storage, including:
(1) Using virtio-scsi: --device scsi-block allows for the passing through of
SCSI ZBC devices, enabling the attachment of ZBC or ZAC HDDs to QEMU.
(2) PCI device pass-through: While NVMe ZNS emulation is available for testing
purposes, it cannot yet pass through a zoned device from the host. To pass an
NVMe ZNS device through to the guest, use VFIO PCI to pass the entire NVMe PCI
adapter through. Likewise, an HDD HBA can be passed through to QEMU together
with all of the HDDs attached to it.

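To connect the documentation back to the code in this series, here is a sketch of the BlockLimits fields a zoned driver fills in. The field names come from this pull request; the values are illustrative assumptions, and file-posix derives the real ones from sysfs attributes:

/* In a driver's refresh_limits callback, for a host-managed device: */
bs->bl.zoned = BLK_Z_HM;
bs->bl.zone_size = 256 * MiB;      /* MiB from qemu/units.h              */
bs->bl.nr_zones = 32;              /* 8 GiB capacity / 256 MiB zones     */
bs->bl.max_open_zones = 0;         /* mirrors the sysfs "no limit" value */
bs->bl.max_active_zones = 0;
bs->bl.max_append_sectors = 4096;  /* zone append limit in 512B sectors  */
bs->bl.write_granularity = 512;    /* physical block size in bytes       */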
docs/system/qemu-block-drivers.rst.inc

@@ -430,6 +430,12 @@ Hard disks
you may corrupt your host data (use the ``-snapshot`` command
line option or modify the device permissions accordingly).
Zoned block devices
Zoned block devices can be passed through to the guest if the emulated storage
controller supports zoned storage. Use
``--blockdev host_device,node-name=drive0,filename=/dev/nullb0,cache.direct=on``
to pass through
``/dev/nullb0`` as ``drive0``.
Windows
^^^^^^^

hw/block/trace-events

@@ -44,9 +44,16 @@ pflash_write_unknown(const char *name, uint8_t cmd) "%s: unknown command 0x%02x"
# virtio-blk.c
virtio_blk_req_complete(void *vdev, void *req, int status) "vdev %p req %p status %d"
virtio_blk_rw_complete(void *vdev, void *req, int ret) "vdev %p req %p ret %d"
virtio_blk_zone_report_complete(void *vdev, void *req, unsigned int nr_zones, int ret) "vdev %p req %p nr_zones %u ret %d"
virtio_blk_zone_mgmt_complete(void *vdev, void *req, int ret) "vdev %p req %p ret %d"
virtio_blk_zone_append_complete(void *vdev, void *req, int64_t sector, int ret) "vdev %p req %p, append sector 0x%" PRIx64 " ret %d"
virtio_blk_handle_write(void *vdev, void *req, uint64_t sector, size_t nsectors) "vdev %p req %p sector %"PRIu64" nsectors %zu"
virtio_blk_handle_read(void *vdev, void *req, uint64_t sector, size_t nsectors) "vdev %p req %p sector %"PRIu64" nsectors %zu"
virtio_blk_submit_multireq(void *vdev, void *mrb, int start, int num_reqs, uint64_t offset, size_t size, bool is_write) "vdev %p mrb %p start %d num_reqs %d offset %"PRIu64" size %zu is_write %d"
virtio_blk_handle_zone_report(void *vdev, void *req, int64_t sector, unsigned int nr_zones) "vdev %p req %p sector 0x%" PRIx64 " nr_zones %u"
virtio_blk_handle_zone_mgmt(void *vdev, void *req, uint8_t op, int64_t sector, int64_t len) "vdev %p req %p op 0x%x sector 0x%" PRIx64 " len 0x%" PRIx64 ""
virtio_blk_handle_zone_reset_all(void *vdev, void *req, int64_t sector, int64_t len) "vdev %p req %p sector 0x%" PRIx64 " cap 0x%" PRIx64 ""
virtio_blk_handle_zone_append(void *vdev, void *req, int64_t sector) "vdev %p req %p, append sector 0x%" PRIx64 ""
# hd-geometry.c
hd_geometry_lchs_guess(void *blk, int cyls, int heads, int secs) "blk %p LCHS %d %d %d"

hw/block/virtio-blk-common.c

@@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
.end = endof(struct virtio_blk_config, discard_sector_alignment)},
{.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
.end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
{.flags = 1ULL << VIRTIO_BLK_F_ZONED,
.end = endof(struct virtio_blk_config, zoned)},
{}
};

hw/block/virtio-blk.c

@@ -17,6 +17,7 @@
#include "qemu/module.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "block/block_int.h"
#include "trace.h"
#include "hw/block/block.h"
#include "hw/qdev-properties.h"
@@ -601,6 +602,351 @@ err:
return err_status;
}
typedef struct ZoneCmdData {
VirtIOBlockReq *req;
struct iovec *in_iov;
unsigned in_num;
union {
struct {
unsigned int nr_zones;
BlockZoneDescriptor *zones;
} zone_report_data;
struct {
int64_t offset;
} zone_append_data;
};
} ZoneCmdData;
/*
* check_zoned_request: error checking before issuing requests. If all checks
* pass, return true.
* append: true if the request is a zone append write.
*/
static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
bool append, uint8_t *status) {
BlockDriverState *bs = blk_bs(s->blk);
int index;
if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
*status = VIRTIO_BLK_S_UNSUPP;
return false;
}
if (offset < 0 || len < 0 || len > (bs->total_sectors << BDRV_SECTOR_BITS)
|| offset > (bs->total_sectors << BDRV_SECTOR_BITS) - len) {
*status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
return false;
}
if (append) {
if (bs->bl.write_granularity) {
if ((offset % bs->bl.write_granularity) != 0) {
*status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
return false;
}
}
index = offset / bs->bl.zone_size;
if (BDRV_ZT_IS_CONV(bs->wps->wp[index])) {
*status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
return false;
}
if (len / 512 > bs->bl.max_append_sectors) {
if (bs->bl.max_append_sectors == 0) {
*status = VIRTIO_BLK_S_UNSUPP;
} else {
*status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
}
return false;
}
}
return true;
}
static void virtio_blk_zone_report_complete(void *opaque, int ret)
{
ZoneCmdData *data = opaque;
VirtIOBlockReq *req = data->req;
VirtIOBlock *s = req->dev;
VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
struct iovec *in_iov = data->in_iov;
unsigned in_num = data->in_num;
int64_t zrp_size, n, j = 0;
int64_t nz = data->zone_report_data.nr_zones;
int8_t err_status = VIRTIO_BLK_S_OK;
trace_virtio_blk_zone_report_complete(vdev, req, nz, ret);
if (ret) {
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
goto out;
}
struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
.nr_zones = cpu_to_le64(nz),
};
zrp_size = sizeof(struct virtio_blk_zone_report)
+ sizeof(struct virtio_blk_zone_descriptor) * nz;
n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
if (n != sizeof(zrp_hdr)) {
virtio_error(vdev, "Driver provided input buffer that is too small!");
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
goto out;
}
for (size_t i = sizeof(zrp_hdr); i < zrp_size;
i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
struct virtio_blk_zone_descriptor desc =
(struct virtio_blk_zone_descriptor) {
.z_start = cpu_to_le64(data->zone_report_data.zones[j].start
>> BDRV_SECTOR_BITS),
.z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
>> BDRV_SECTOR_BITS),
.z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
>> BDRV_SECTOR_BITS),
};
switch (data->zone_report_data.zones[j].type) {
case BLK_ZT_CONV:
desc.z_type = VIRTIO_BLK_ZT_CONV;
break;
case BLK_ZT_SWR:
desc.z_type = VIRTIO_BLK_ZT_SWR;
break;
case BLK_ZT_SWP:
desc.z_type = VIRTIO_BLK_ZT_SWP;
break;
default:
g_assert_not_reached();
}
switch (data->zone_report_data.zones[j].state) {
case BLK_ZS_RDONLY:
desc.z_state = VIRTIO_BLK_ZS_RDONLY;
break;
case BLK_ZS_OFFLINE:
desc.z_state = VIRTIO_BLK_ZS_OFFLINE;
break;
case BLK_ZS_EMPTY:
desc.z_state = VIRTIO_BLK_ZS_EMPTY;
break;
case BLK_ZS_CLOSED:
desc.z_state = VIRTIO_BLK_ZS_CLOSED;
break;
case BLK_ZS_FULL:
desc.z_state = VIRTIO_BLK_ZS_FULL;
break;
case BLK_ZS_EOPEN:
desc.z_state = VIRTIO_BLK_ZS_EOPEN;
break;
case BLK_ZS_IOPEN:
desc.z_state = VIRTIO_BLK_ZS_IOPEN;
break;
case BLK_ZS_NOT_WP:
desc.z_state = VIRTIO_BLK_ZS_NOT_WP;
break;
default:
g_assert_not_reached();
}
/* TODO: this loop takes O(n^2) time; optimizations are required. */
n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
if (n != sizeof(desc)) {
virtio_error(vdev, "Driver provided input buffer "
"for descriptors that is too small!");
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
}
}
out:
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
g_free(data->zone_report_data.zones);
g_free(data);
}
static void virtio_blk_handle_zone_report(VirtIOBlockReq *req,
struct iovec *in_iov,
unsigned in_num)
{
VirtIOBlock *s = req->dev;
VirtIODevice *vdev = VIRTIO_DEVICE(s);
unsigned int nr_zones;
ZoneCmdData *data;
int64_t zone_size, offset;
uint8_t err_status;
if (req->in_len < sizeof(struct virtio_blk_inhdr) +
sizeof(struct virtio_blk_zone_report) +
sizeof(struct virtio_blk_zone_descriptor)) {
virtio_error(vdev, "in buffer too small for zone report");
return;
}
/* start byte offset of the zone report */
offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
if (!check_zoned_request(s, offset, 0, false, &err_status)) {
goto out;
}
nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
sizeof(struct virtio_blk_zone_report)) /
sizeof(struct virtio_blk_zone_descriptor);
trace_virtio_blk_handle_zone_report(vdev, req,
offset >> BDRV_SECTOR_BITS, nr_zones);
zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
data = g_malloc(sizeof(ZoneCmdData));
data->req = req;
data->in_iov = in_iov;
data->in_num = in_num;
data->zone_report_data.nr_zones = nr_zones;
data->zone_report_data.zones = g_malloc(zone_size);
blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
data->zone_report_data.zones,
virtio_blk_zone_report_complete, data);
return;
out:
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
}
static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
{
VirtIOBlockReq *req = opaque;
VirtIOBlock *s = req->dev;
VirtIODevice *vdev = VIRTIO_DEVICE(s);
int8_t err_status = VIRTIO_BLK_S_OK;
trace_virtio_blk_zone_mgmt_complete(vdev, req, ret);
if (ret) {
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
}
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
}
static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
{
VirtIOBlock *s = req->dev;
VirtIODevice *vdev = VIRTIO_DEVICE(s);
BlockDriverState *bs = blk_bs(s->blk);
int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
uint64_t len;
uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
uint8_t err_status = VIRTIO_BLK_S_OK;
uint32_t type = virtio_ldl_p(vdev, &req->out.type);
if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
/* Entire drive capacity */
offset = 0;
len = capacity;
trace_virtio_blk_handle_zone_reset_all(vdev, req, 0,
bs->total_sectors);
} else {
if (bs->bl.zone_size > capacity - offset) {
/* The zoned device allows the last zone to be smaller than the zone size. */
len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
} else {
len = bs->bl.zone_size;
}
trace_virtio_blk_handle_zone_mgmt(vdev, req, op,
offset >> BDRV_SECTOR_BITS,
len >> BDRV_SECTOR_BITS);
}
if (!check_zoned_request(s, offset, len, false, &err_status)) {
goto out;
}
blk_aio_zone_mgmt(s->blk, op, offset, len,
virtio_blk_zone_mgmt_complete, req);
return 0;
out:
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
return err_status;
}
static void virtio_blk_zone_append_complete(void *opaque, int ret)
{
ZoneCmdData *data = opaque;
VirtIOBlockReq *req = data->req;
VirtIOBlock *s = req->dev;
VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
int64_t append_sector, n;
uint8_t err_status = VIRTIO_BLK_S_OK;
if (ret) {
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
goto out;
}
virtio_stq_p(vdev, &append_sector,
data->zone_append_data.offset >> BDRV_SECTOR_BITS);
n = iov_from_buf(data->in_iov, data->in_num, 0, &append_sector,
sizeof(append_sector));
if (n != sizeof(append_sector)) {
virtio_error(vdev, "Driver provided input buffer less than size of "
"append_sector");
err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
goto out;
}
trace_virtio_blk_zone_append_complete(vdev, req, append_sector, ret);
out:
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
g_free(data);
}
static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
struct iovec *out_iov,
struct iovec *in_iov,
uint64_t out_num,
unsigned in_num) {
VirtIOBlock *s = req->dev;
VirtIODevice *vdev = VIRTIO_DEVICE(s);
uint8_t err_status = VIRTIO_BLK_S_OK;
int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
int64_t len = iov_size(out_iov, out_num);
trace_virtio_blk_handle_zone_append(vdev, req, offset >> BDRV_SECTOR_BITS);
if (!check_zoned_request(s, offset, len, true, &err_status)) {
goto out;
}
ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
data->req = req;
data->in_iov = in_iov;
data->in_num = in_num;
data->zone_append_data.offset = offset;
qemu_iovec_init_external(&req->qiov, out_iov, out_num);
block_acct_start(blk_get_stats(s->blk), &req->acct, len,
BLOCK_ACCT_ZONE_APPEND);
blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
virtio_blk_zone_append_complete, data);
return 0;
out:
aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
virtio_blk_req_complete(req, err_status);
virtio_blk_free_request(req);
aio_context_release(blk_get_aio_context(s->conf.conf.blk));
return err_status;
}
static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
uint32_t type;
@@ -687,6 +1033,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
case VIRTIO_BLK_T_FLUSH:
virtio_blk_handle_flush(req, mrb);
break;
case VIRTIO_BLK_T_ZONE_REPORT:
virtio_blk_handle_zone_report(req, in_iov, in_num);
break;
case VIRTIO_BLK_T_ZONE_OPEN:
virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
break;
case VIRTIO_BLK_T_ZONE_CLOSE:
virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
break;
case VIRTIO_BLK_T_ZONE_FINISH:
virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
break;
case VIRTIO_BLK_T_ZONE_RESET:
virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
break;
case VIRTIO_BLK_T_ZONE_RESET_ALL:
virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
break;
case VIRTIO_BLK_T_SCSI_CMD:
virtio_blk_handle_scsi(req);
break;
@@ -705,6 +1069,14 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
virtio_blk_free_request(req);
break;
}
case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
/*
* Pass out_iov/out_num and in_iov/in_num here: it is not safe
* to access req->elem.out_sg directly because it may be
* modified by virtio_blk_handle_request().
*/
virtio_blk_handle_zone_append(req, out_iov, in_iov, out_num, in_num);
break;
/*
* VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
* VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
@@ -890,6 +1262,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
{
VirtIOBlock *s = VIRTIO_BLK(vdev);
BlockConf *conf = &s->conf.conf;
BlockDriverState *bs = blk_bs(s->blk);
struct virtio_blk_config blkcfg;
uint64_t capacity;
int64_t length;
@@ -954,6 +1327,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
blkcfg.write_zeroes_may_unmap = 1;
virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
}
if (bs->bl.zoned != BLK_Z_NONE) {
switch (bs->bl.zoned) {
case BLK_Z_HM:
blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
break;
case BLK_Z_HA:
blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
break;
default:
g_assert_not_reached();
}
virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
bs->bl.zone_size / 512);
virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
bs->bl.max_active_zones);
virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
bs->bl.max_open_zones);
virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
bs->bl.max_append_sectors);
} else {
blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
}
memcpy(config, &blkcfg, s->config_size);
}
@@ -1163,6 +1560,14 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
return;
}
BlockDriverState *bs = blk_bs(conf->conf.blk);
if (bs->bl.zoned != BLK_Z_NONE) {
virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
if (bs->bl.zoned == BLK_Z_HM) {
virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
}
}
if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
(!conf->max_discard_sectors ||
conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {

hw/virtio/virtio-qmp.c

@@ -176,6 +176,8 @@ static const qmp_virtio_feature_map_t virtio_blk_feature_map[] = {
"VIRTIO_BLK_F_DISCARD: Discard command supported"),
FEATURE_ENTRY(VIRTIO_BLK_F_WRITE_ZEROES, \
"VIRTIO_BLK_F_WRITE_ZEROES: Write zeroes command supported"),
FEATURE_ENTRY(VIRTIO_BLK_F_ZONED, \
"VIRTIO_BLK_F_ZONED: Zoned block devices"),
#ifndef VIRTIO_BLK_NO_LEGACY
FEATURE_ENTRY(VIRTIO_BLK_F_BARRIER, \
"VIRTIO_BLK_F_BARRIER: Request barriers supported"),

include/block/accounting.h

@@ -37,6 +37,7 @@ enum BlockAcctType {
BLOCK_ACCT_READ,
BLOCK_ACCT_WRITE,
BLOCK_ACCT_FLUSH,
BLOCK_ACCT_ZONE_APPEND,
BLOCK_ACCT_UNMAP,
BLOCK_MAX_IOTYPE,
};

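The new BLOCK_ACCT_ZONE_APPEND counter is driven like the existing ones. A hedged sketch of the pattern (virtio-blk in this series issues the block_acct_start() call; the settling calls shown here follow the usual accounting convention and run in coroutine context):

BlockAcctCookie acct;   /* illustrative; blk, offset, qiov, len assumed */
block_acct_start(blk_get_stats(blk), &acct, len, BLOCK_ACCT_ZONE_APPEND);
if (blk_co_zone_append(blk, &offset, &qiov, 0) == 0) {
    block_acct_done(blk_get_stats(blk), &acct);
} else {
    block_acct_failed(blk_get_stats(blk), &acct);
}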
include/block/block-common.h

@@ -75,6 +75,57 @@ typedef struct BlockDriver BlockDriver;
typedef struct BdrvChild BdrvChild;
typedef struct BdrvChildClass BdrvChildClass;
typedef enum BlockZoneOp {
BLK_ZO_OPEN,
BLK_ZO_CLOSE,
BLK_ZO_FINISH,
BLK_ZO_RESET,
} BlockZoneOp;
typedef enum BlockZoneModel {
BLK_Z_NONE = 0x0, /* Regular block device */
BLK_Z_HM = 0x1, /* Host-managed zoned block device */
BLK_Z_HA = 0x2, /* Host-aware zoned block device */
} BlockZoneModel;
typedef enum BlockZoneState {
BLK_ZS_NOT_WP = 0x0,
BLK_ZS_EMPTY = 0x1,
BLK_ZS_IOPEN = 0x2,
BLK_ZS_EOPEN = 0x3,
BLK_ZS_CLOSED = 0x4,
BLK_ZS_RDONLY = 0xD,
BLK_ZS_FULL = 0xE,
BLK_ZS_OFFLINE = 0xF,
} BlockZoneState;
typedef enum BlockZoneType {
BLK_ZT_CONV = 0x1, /* Conventional random writes supported */
BLK_ZT_SWR = 0x2, /* Sequential writes required */
BLK_ZT_SWP = 0x3, /* Sequential writes preferred */
} BlockZoneType;
/*
* Zone descriptor data structure.
* Provides information on a zone with all position and size values in bytes.
*/
typedef struct BlockZoneDescriptor {
uint64_t start;
uint64_t length;
uint64_t cap;
uint64_t wp;
BlockZoneType type;
BlockZoneState state;
} BlockZoneDescriptor;
/*
* Tracks the write pointer of each zone in the device, in bytes.
*/
typedef struct BlockZoneWps {
CoMutex colock;
uint64_t wp[];
} BlockZoneWps;
typedef struct BlockDriverInfo {
/* in bytes, 0 if irrelevant */
int cluster_size;
@ -197,6 +248,12 @@ typedef enum {
#define BDRV_SECTOR_BITS 9
#define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS)
/*
* Test the most significant bit of a cached wp value. If it is set,
* the zone is conventional; if it is zero, the zone type is SWR
* (sequential writes required).
*/
#define BDRV_ZT_IS_CONV(wp) ((wp) & (1ULL << 63))
#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
INT_MAX >> BDRV_SECTOR_BITS)
#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
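Combined with the BlockZoneWps cache above, BDRV_ZT_IS_CONV is meant to be applied to cached wp entries before using them as byte offsets. A sketch with a hypothetical helper (zone_wp_for_offset is not part of this series; it assumes bs->wps is populated and that all zones share bs->bl.zone_size):

static uint64_t coroutine_fn zone_wp_for_offset(BlockDriverState *bs,
                                                int64_t offset)
{
    uint32_t index = offset / bs->bl.zone_size;
    uint64_t wp;

    qemu_co_mutex_lock(&bs->wps->colock);
    wp = bs->wps->wp[index];
    qemu_co_mutex_unlock(&bs->wps->colock);

    if (BDRV_ZT_IS_CONV(wp)) {
        wp &= ~(1ULL << 63); /* clear the conventional-zone flag bit */
    }
    return wp; /* write pointer as a byte offset */
}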

View File

@ -114,6 +114,19 @@ int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs);
int coroutine_fn GRAPH_RDLOCK bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
int64_t bytes);
/* Report zone information of a zoned block device. */
int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_report(BlockDriverState *bs,
int64_t offset,
unsigned int *nr_zones,
BlockZoneDescriptor *zones);
int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_mgmt(BlockDriverState *bs,
BlockZoneOp op,
int64_t offset, int64_t len);
int coroutine_fn GRAPH_RDLOCK bdrv_co_zone_append(BlockDriverState *bs,
int64_t *offset,
QEMUIOVector *qiov,
BdrvRequestFlags flags);
bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
int bdrv_block_status(BlockDriverState *bs, int64_t offset,
int64_t bytes, int64_t *pnum, int64_t *map,

View File

@ -137,6 +137,11 @@ struct BlockDriver {
*/
bool is_format;
/*
* Set to true if the BlockDriver supports zoned children.
*/
bool supports_zoned_children;
/*
* Drivers not implementing bdrv_parse_filename nor bdrv_open should have
* this field set to true, except ones that are defined only by their
@ -713,6 +718,15 @@ struct BlockDriver {
int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_load_vmstate)(
BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
int64_t offset, unsigned int *nr_zones,
BlockZoneDescriptor *zones);
int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, BlockZoneOp op,
int64_t offset, int64_t len);
int coroutine_fn (*bdrv_co_zone_append)(BlockDriverState *bs,
int64_t *offset, QEMUIOVector *qiov,
BdrvRequestFlags flags);
/* removable device specific */
bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_is_inserted)(
BlockDriverState *bs);
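A sketch of how a driver wires up the new callbacks (the driver name and function names are hypothetical; in this series file-posix implements the three callbacks and raw-format sets supports_zoned_children):

static BlockDriver bdrv_example_zoned = {
    .format_name             = "example-zoned",  /* hypothetical */
    /* allow BLK_Z_HM children to be attached under this driver */
    .supports_zoned_children = true,
    /* callbacks with the prototypes declared above */
    .bdrv_co_zone_report     = example_co_zone_report,
    .bdrv_co_zone_mgmt       = example_co_zone_mgmt,
    .bdrv_co_zone_append     = example_co_zone_append,
};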
@ -862,6 +876,26 @@ typedef struct BlockLimits {
* an explicit monitor command to load the disk inside the guest).
*/
bool has_variable_length;
/* device zone model */
BlockZoneModel zoned;
/* zone size expressed in bytes */
uint32_t zone_size;
/* total number of zones */
uint32_t nr_zones;
/* maximum sectors of a zone append write operation */
uint32_t max_append_sectors;
/* maximum number of open zones */
uint32_t max_open_zones;
/* maximum number of active zones */
uint32_t max_active_zones;
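/* write granularity in bytes (typically the device's block size) */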
uint32_t write_granularity;
} BlockLimits;
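A sketch of how a driver's limits code might publish this geometry (the helper and the literal values are hypothetical; in this series file-posix derives them from sysfs attributes):

static void example_refresh_zoned_limits(BlockDriverState *bs)
{
    bs->bl.zoned              = BLK_Z_HM;
    bs->bl.zone_size          = 256 * 1024 * 1024; /* 256 MiB zones */
    /* error handling elided: bdrv_getlength() can return -errno */
    bs->bl.nr_zones           = bdrv_getlength(bs) / bs->bl.zone_size;
    bs->bl.max_open_zones     = 12;
    bs->bl.max_active_zones   = 12;
    bs->bl.max_append_sectors = 65536;
    bs->bl.write_granularity  = 4096;
}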
typedef struct BdrvOpBlocker BdrvOpBlocker;
@ -1223,6 +1257,9 @@ struct BlockDriverState {
CoMutex bsc_modify_lock;
/* Always non-NULL, but must only be dereferenced under an RCU read guard */
BdrvBlockStatusCache *block_status_cache;
/* Array of write pointer locations, one per zone, for the zoned device. */
BlockZoneWps *wps;
};
struct BlockBackendRootState {

View File

@ -28,6 +28,9 @@
#define QEMU_AIO_WRITE_ZEROES 0x0020
#define QEMU_AIO_COPY_RANGE 0x0040
#define QEMU_AIO_TRUNCATE 0x0080
#define QEMU_AIO_ZONE_REPORT 0x0100
#define QEMU_AIO_ZONE_MGMT 0x0200
#define QEMU_AIO_ZONE_APPEND 0x0400
#define QEMU_AIO_TYPE_MASK \
(QEMU_AIO_READ | \
QEMU_AIO_WRITE | \
@ -36,7 +39,10 @@
QEMU_AIO_DISCARD | \
QEMU_AIO_WRITE_ZEROES | \
QEMU_AIO_COPY_RANGE | \
QEMU_AIO_TRUNCATE)
QEMU_AIO_TRUNCATE | \
QEMU_AIO_ZONE_REPORT | \
QEMU_AIO_ZONE_MGMT | \
QEMU_AIO_ZONE_APPEND)
/* AIO flags */
#define QEMU_AIO_MISALIGNED 0x1000

View File

@ -46,6 +46,16 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
BlockCompletionFunc *cb, void *opaque);
BlockAIOCB *blk_aio_flush(BlockBackend *blk,
BlockCompletionFunc *cb, void *opaque);
BlockAIOCB *blk_aio_zone_report(BlockBackend *blk, int64_t offset,
unsigned int *nr_zones,
BlockZoneDescriptor *zones,
BlockCompletionFunc *cb, void *opaque);
BlockAIOCB *blk_aio_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
int64_t offset, int64_t len,
BlockCompletionFunc *cb, void *opaque);
BlockAIOCB *blk_aio_zone_append(BlockBackend *blk, int64_t *offset,
QEMUIOVector *qiov, BdrvRequestFlags flags,
BlockCompletionFunc *cb, void *opaque);
BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes,
BlockCompletionFunc *cb, void *opaque);
void blk_aio_cancel_async(BlockAIOCB *acb);
@ -191,6 +201,23 @@ int co_wrapper_mixed blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
int64_t bytes, BdrvRequestFlags flags);
int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
unsigned int *nr_zones,
BlockZoneDescriptor *zones);
int co_wrapper_mixed blk_zone_report(BlockBackend *blk, int64_t offset,
unsigned int *nr_zones,
BlockZoneDescriptor *zones);
int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
int64_t offset, int64_t len);
int co_wrapper_mixed blk_zone_mgmt(BlockBackend *blk, BlockZoneOp op,
int64_t offset, int64_t len);
int coroutine_fn blk_co_zone_append(BlockBackend *blk, int64_t *offset,
QEMUIOVector *qiov,
BdrvRequestFlags flags);
int co_wrapper_mixed blk_zone_append(BlockBackend *blk, int64_t *offset,
QEMUIOVector *qiov,
BdrvRequestFlags flags);
int co_wrapper_mixed blk_pdiscard(BlockBackend *blk, int64_t offset,
int64_t bytes);
int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset,

View File

@ -2025,6 +2025,8 @@ if rdma.found()
endif
# has_header_symbol
config_host_data.set('CONFIG_BLKZONED',
cc.has_header_symbol('linux/blkzoned.h', 'BLKOPENZONE'))
config_host_data.set('CONFIG_EPOLL_CREATE1',
cc.has_header_symbol('sys/epoll.h', 'epoll_create1'))
config_host_data.set('CONFIG_FALLOCATE_PUNCH_HOLE',
@ -2060,6 +2062,9 @@ config_host_data.set('HAVE_SIGEV_NOTIFY_THREAD_ID',
config_host_data.set('HAVE_STRUCT_STAT_ST_ATIM',
cc.has_member('struct stat', 'st_atim',
prefix: '#include <sys/stat.h>'))
config_host_data.set('HAVE_BLK_ZONE_REP_CAPACITY',
cc.has_member('struct blk_zone', 'capacity',
prefix: '#include <linux/blkzoned.h>'))
# has_type
config_host_data.set('CONFIG_IOVEC',

View File

@ -854,6 +854,10 @@
# @min_wr_latency_ns: Minimum latency of write operations in the
# defined interval, in nanoseconds.
#
# @min_zone_append_latency_ns: Minimum latency of zone append operations
# in the defined interval, in nanoseconds
# (since 8.1)
#
# @min_flush_latency_ns: Minimum latency of flush operations in the
# defined interval, in nanoseconds.
#
@ -863,6 +867,10 @@
# @max_wr_latency_ns: Maximum latency of write operations in the
# defined interval, in nanoseconds.
#
# @max_zone_append_latency_ns: Maximum latency of zone append operations
# in the defined interval, in nanoseconds
# (since 8.1)
#
# @max_flush_latency_ns: Maximum latency of flush operations in the
# defined interval, in nanoseconds.
#
@ -872,6 +880,10 @@
# @avg_wr_latency_ns: Average latency of write operations in the
# defined interval, in nanoseconds.
#
# @avg_zone_append_latency_ns: Average latency of zone append operations
# in the defined interval, in nanoseconds
# (since 8.1)
#
# @avg_flush_latency_ns: Average latency of flush operations in the
# defined interval, in nanoseconds.
#
@ -881,15 +893,23 @@
# @avg_wr_queue_depth: Average number of pending write operations in
# the defined interval.
#
# @avg_zone_append_queue_depth: Average number of pending zone append
# operations in the defined interval
# (since 8.1).
#
# Since: 2.5
##
{ 'struct': 'BlockDeviceTimedStats',
'data': { 'interval_length': 'int', 'min_rd_latency_ns': 'int',
'max_rd_latency_ns': 'int', 'avg_rd_latency_ns': 'int',
'min_wr_latency_ns': 'int', 'max_wr_latency_ns': 'int',
'avg_wr_latency_ns': 'int', 'min_flush_latency_ns': 'int',
'max_flush_latency_ns': 'int', 'avg_flush_latency_ns': 'int',
'avg_rd_queue_depth': 'number', 'avg_wr_queue_depth': 'number' } }
'avg_wr_latency_ns': 'int', 'min_zone_append_latency_ns': 'int',
'max_zone_append_latency_ns': 'int',
'avg_zone_append_latency_ns': 'int',
'min_flush_latency_ns': 'int', 'max_flush_latency_ns': 'int',
'avg_flush_latency_ns': 'int', 'avg_rd_queue_depth': 'number',
'avg_wr_queue_depth': 'number',
'avg_zone_append_queue_depth': 'number' } }
##
# @BlockDeviceStats:
@ -900,6 +920,9 @@
#
# @wr_bytes: The number of bytes written by the device.
#
# @zone_append_bytes: The number of bytes appended by the zoned device
# (since 8.1)
#
# @unmap_bytes: The number of bytes unmapped by the device (Since 4.2)
#
# @rd_operations: The number of read operations performed by the
@ -908,6 +931,9 @@
# @wr_operations: The number of write operations performed by the
# device.
#
# @zone_append_operations: The number of zone append operations performed
# by the zoned device (since 8.1)
#
# @flush_operations: The number of cache flush operations performed by
# the device (since 0.15)
#
@ -920,6 +946,9 @@
# @wr_total_time_ns: Total time spent on writes in nanoseconds (since
# 0.15).
#
# @zone_append_total_time_ns: Total time spent on zone append writes
# in nanoseconds (since 8.1)
#
# @flush_total_time_ns: Total time spent on cache flushes in
# nanoseconds (since 0.15).
#
@ -937,6 +966,9 @@
# @wr_merged: Number of write requests that have been merged into
# another request (Since 2.3).
#
# @zone_append_merged: Number of zone append requests that have been merged
# into another request (since 8.1)
#
# @unmap_merged: Number of unmap requests that have been merged into
# another request (Since 4.2)
#
@ -950,6 +982,10 @@
# @failed_wr_operations: The number of failed write operations
# performed by the device (Since 2.5)
#
# @failed_zone_append_operations: The number of failed zone append write
# operations performed by the zoned device
# (since 8.1)
#
# @failed_flush_operations: The number of failed flush operations
# performed by the device (Since 2.5)
#
@ -962,6 +998,9 @@
# @invalid_wr_operations: The number of invalid write operations
# performed by the device (Since 2.5)
#
# @invalid_zone_append_operations: The number of invalid zone append operations
# performed by the zoned device (since 8.1)
#
# @invalid_flush_operations: The number of invalid flush operations
# performed by the device (Since 2.5)
#
@ -981,27 +1020,34 @@
#
# @wr_latency_histogram: @BlockLatencyHistogramInfo. (Since 4.0)
#
# @zone_append_latency_histogram: @BlockLatencyHistogramInfo. (since 8.1)
#
# @flush_latency_histogram: @BlockLatencyHistogramInfo. (Since 4.0)
#
# Since: 0.14
##
{ 'struct': 'BlockDeviceStats',
'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'unmap_bytes' : 'int',
'rd_operations': 'int', 'wr_operations': 'int',
'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'zone_append_bytes': 'int',
'unmap_bytes' : 'int', 'rd_operations': 'int',
'wr_operations': 'int', 'zone_append_operations': 'int',
'flush_operations': 'int', 'unmap_operations': 'int',
'rd_total_time_ns': 'int', 'wr_total_time_ns': 'int',
'flush_total_time_ns': 'int', 'unmap_total_time_ns': 'int',
'wr_highest_offset': 'int',
'rd_merged': 'int', 'wr_merged': 'int', 'unmap_merged': 'int',
'*idle_time_ns': 'int',
'zone_append_total_time_ns': 'int', 'flush_total_time_ns': 'int',
'unmap_total_time_ns': 'int', 'wr_highest_offset': 'int',
'rd_merged': 'int', 'wr_merged': 'int', 'zone_append_merged': 'int',
'unmap_merged': 'int', '*idle_time_ns': 'int',
'failed_rd_operations': 'int', 'failed_wr_operations': 'int',
'failed_flush_operations': 'int', 'failed_unmap_operations': 'int',
'invalid_rd_operations': 'int', 'invalid_wr_operations': 'int',
'failed_zone_append_operations': 'int',
'failed_flush_operations': 'int',
'failed_unmap_operations': 'int', 'invalid_rd_operations': 'int',
'invalid_wr_operations': 'int',
'invalid_zone_append_operations': 'int',
'invalid_flush_operations': 'int', 'invalid_unmap_operations': 'int',
'account_invalid': 'bool', 'account_failed': 'bool',
'timed_stats': ['BlockDeviceTimedStats'],
'*rd_latency_histogram': 'BlockLatencyHistogramInfo',
'*wr_latency_histogram': 'BlockLatencyHistogramInfo',
'*zone_append_latency_histogram': 'BlockLatencyHistogramInfo',
'*flush_latency_histogram': 'BlockLatencyHistogramInfo' } }
##

View File

@ -534,6 +534,9 @@
# @boundaries-write: list of interval boundary values for write
# latency histogram.
#
# @boundaries-zap: list of interval boundary values for zone append write
# latency histogram.
#
# @boundaries-flush: list of interval boundary values for flush
# latency histogram.
#
@ -587,5 +590,6 @@
'*boundaries': ['uint64'],
'*boundaries-read': ['uint64'],
'*boundaries-write': ['uint64'],
'*boundaries-zap': ['uint64'],
'*boundaries-flush': ['uint64'] },
'allow-preconfig': true }

View File

@ -1730,6 +1730,224 @@ static const cmdinfo_t flush_cmd = {
.oneline = "flush all in-core file state to disk",
};
static inline int64_t tosector(int64_t bytes)
{
return bytes >> BDRV_SECTOR_BITS;
}
static int zone_report_f(BlockBackend *blk, int argc, char **argv)
{
int ret;
int64_t offset, val;
unsigned int nr_zones;
++optind;
offset = cvtnum(argv[optind]);
if (offset < 0) {
print_cvtnum_err(offset, argv[optind]);
return offset;
}
++optind;
val = cvtnum(argv[optind]);
if (val < 0) {
print_cvtnum_err(val, argv[optind]);
return val;
}
nr_zones = val;
g_autofree BlockZoneDescriptor *zones = g_new(BlockZoneDescriptor, nr_zones);
ret = blk_zone_report(blk, offset, &nr_zones, zones);
if (ret < 0) {
printf("zone report failed: %s\n", strerror(-ret));
} else {
for (int i = 0; i < nr_zones; ++i) {
printf("start: 0x%" PRIx64 ", len 0x%" PRIx64 ", "
"cap"" 0x%" PRIx64 ", wptr 0x%" PRIx64 ", "
"zcond:%u, [type: %u]\n",
tosector(zones[i].start), tosector(zones[i].length),
tosector(zones[i].cap), tosector(zones[i].wp),
zones[i].state, zones[i].type);
}
}
return ret;
}
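The zcond and type fields printed here are the raw BlockZoneState and BlockZoneType values, so zcond:1 is EMPTY and zcond:14 is FULL in the expected output of the new iotest below. A hypothetical decoding helper, for illustration only:

static const char *zone_state_name(BlockZoneState state)
{
    switch (state) {
    case BLK_ZS_EMPTY:   return "empty";
    case BLK_ZS_IOPEN:   return "implicitly open";
    case BLK_ZS_EOPEN:   return "explicitly open";
    case BLK_ZS_CLOSED:  return "closed";
    case BLK_ZS_FULL:    return "full";
    case BLK_ZS_RDONLY:  return "read-only";
    case BLK_ZS_OFFLINE: return "offline";
    default:             return "other";
    }
}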
static const cmdinfo_t zone_report_cmd = {
.name = "zone_report",
.altname = "zrp",
.cfunc = zone_report_f,
.argmin = 2,
.argmax = 2,
.args = "offset number",
.oneline = "report zone information",
};
static int zone_open_f(BlockBackend *blk, int argc, char **argv)
{
int ret;
int64_t offset, len;
++optind;
offset = cvtnum(argv[optind]);
++optind;
len = cvtnum(argv[optind]);
ret = blk_zone_mgmt(blk, BLK_ZO_OPEN, offset, len);
if (ret < 0) {
printf("zone open failed: %s\n", strerror(-ret));
}
return ret;
}
static const cmdinfo_t zone_open_cmd = {
.name = "zone_open",
.altname = "zo",
.cfunc = zone_open_f,
.argmin = 2,
.argmax = 2,
.args = "offset len",
.oneline = "explicitly open a range of zones in a zoned block device",
};
static int zone_close_f(BlockBackend *blk, int argc, char **argv)
{
int ret;
int64_t offset, len;
++optind;
offset = cvtnum(argv[optind]);
++optind;
len = cvtnum(argv[optind]);
ret = blk_zone_mgmt(blk, BLK_ZO_CLOSE, offset, len);
if (ret < 0) {
printf("zone close failed: %s\n", strerror(-ret));
}
return ret;
}
static const cmdinfo_t zone_close_cmd = {
.name = "zone_close",
.altname = "zc",
.cfunc = zone_close_f,
.argmin = 2,
.argmax = 2,
.args = "offset len",
.oneline = "close a range of zones in a zoned block device",
};
static int zone_finish_f(BlockBackend *blk, int argc, char **argv)
{
int ret;
int64_t offset, len;
++optind;
offset = cvtnum(argv[optind]);
++optind;
len = cvtnum(argv[optind]);
ret = blk_zone_mgmt(blk, BLK_ZO_FINISH, offset, len);
if (ret < 0) {
printf("zone finish failed: %s\n", strerror(-ret));
}
return ret;
}
static const cmdinfo_t zone_finish_cmd = {
.name = "zone_finish",
.altname = "zf",
.cfunc = zone_finish_f,
.argmin = 2,
.argmax = 2,
.args = "offset len",
.oneline = "finish a range of zones in a zoned block device",
};
static int zone_reset_f(BlockBackend *blk, int argc, char **argv)
{
int ret;
int64_t offset, len;
++optind;
offset = cvtnum(argv[optind]);
++optind;
len = cvtnum(argv[optind]);
ret = blk_zone_mgmt(blk, BLK_ZO_RESET, offset, len);
if (ret < 0) {
printf("zone reset failed: %s\n", strerror(-ret));
}
return ret;
}
static const cmdinfo_t zone_reset_cmd = {
.name = "zone_reset",
.altname = "zrs",
.cfunc = zone_reset_f,
.argmin = 2,
.argmax = 2,
.args = "offset len",
.oneline = "reset write pointers of a range of zones in a zoned block device",
};
static int do_aio_zone_append(BlockBackend *blk, QEMUIOVector *qiov,
int64_t *offset, int flags, int *total)
{
int async_ret = NOT_DONE;
blk_aio_zone_append(blk, offset, qiov, flags, aio_rw_done, &async_ret);
while (async_ret == NOT_DONE) {
main_loop_wait(false);
}
*total = qiov->size;
return async_ret < 0 ? async_ret : 1;
}
static int zone_append_f(BlockBackend *blk, int argc, char **argv)
{
int ret;
bool pflag = false;
int flags = 0;
int total = 0;
int64_t offset;
char *buf;
int c, nr_iov;
int pattern = 0xcd;
QEMUIOVector qiov;
if (optind > argc - 3) {
return -EINVAL;
}
while ((c = getopt(argc, argv, "p")) != -1) {
switch (c) {
case 'p':
pflag = true;
break;
default:
return -EINVAL;
}
}
offset = cvtnum(argv[optind]);
if (offset < 0) {
print_cvtnum_err(offset, argv[optind]);
return offset;
}
optind++;
nr_iov = argc - optind;
buf = create_iovec(blk, &qiov, &argv[optind], nr_iov, pattern,
flags & BDRV_REQ_REGISTERED_BUF);
if (buf == NULL) {
return -EINVAL;
}
ret = do_aio_zone_append(blk, &qiov, &offset, flags, &total);
if (ret < 0) {
printf("zone append failed: %s\n", strerror(-ret));
goto out;
}
if (pflag) {
printf("After zap done, the append sector is 0x%" PRIx64 "\n",
tosector(offset));
}
out:
qemu_io_free(blk, buf, qiov.size,
flags & BDRV_REQ_REGISTERED_BUF);
qemu_iovec_destroy(&qiov);
return ret;
}
static const cmdinfo_t zone_append_cmd = {
.name = "zone_append",
.altname = "zap",
.cfunc = zone_append_f,
.argmin = 3,
.argmax = 4,
.args = "offset len [len..]",
.oneline = "zone-append a number of bytes to the zone at a specified offset",
};
static int truncate_f(BlockBackend *blk, int argc, char **argv);
static const cmdinfo_t truncate_cmd = {
.name = "truncate",
@ -2523,6 +2741,12 @@ static void __attribute((constructor)) init_qemuio_commands(void)
qemuio_add_command(&aio_write_cmd);
qemuio_add_command(&aio_flush_cmd);
qemuio_add_command(&flush_cmd);
qemuio_add_command(&zone_report_cmd);
qemuio_add_command(&zone_open_cmd);
qemuio_add_command(&zone_close_cmd);
qemuio_add_command(&zone_finish_cmd);
qemuio_add_command(&zone_reset_cmd);
qemuio_add_command(&zone_append_cmd);
qemuio_add_command(&truncate_cmd);
qemuio_add_command(&length_cmd);
qemuio_add_command(&info_cmd);

View File

@ -17,6 +17,7 @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio
"stats": {
"unmap_operations": 0,
"unmap_merged": 0,
"failed_zone_append_operations": 0,
"flush_total_time_ns": 0,
"wr_highest_offset": 0,
"wr_total_time_ns": 0,
@ -27,6 +28,7 @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio
"timed_stats": [
],
"failed_unmap_operations": 0,
"zone_append_merged": 0,
"failed_flush_operations": 0,
"account_invalid": true,
"rd_total_time_ns": 0,
@ -39,7 +41,11 @@ Testing: -drive driver=null-co,read-zeroes=on,if=virtio
"unmap_total_time_ns": 0,
"invalid_flush_operations": 0,
"account_failed": true,
"zone_append_total_time_ns": 0,
"zone_append_operations": 0,
"rd_operations": 0,
"zone_append_bytes": 0,
"invalid_zone_append_operations": 0,
"invalid_wr_operations": 0,
"invalid_rd_operations": 0
},
@ -82,6 +88,7 @@ Testing: -drive driver=null-co,if=none
"stats": {
"unmap_operations": 0,
"unmap_merged": 0,
"failed_zone_append_operations": 0,
"flush_total_time_ns": 0,
"wr_highest_offset": 0,
"wr_total_time_ns": 0,
@ -92,6 +99,7 @@ Testing: -drive driver=null-co,if=none
"timed_stats": [
],
"failed_unmap_operations": 0,
"zone_append_merged": 0,
"failed_flush_operations": 0,
"account_invalid": true,
"rd_total_time_ns": 0,
@ -104,7 +112,11 @@ Testing: -drive driver=null-co,if=none
"unmap_total_time_ns": 0,
"invalid_flush_operations": 0,
"account_failed": true,
"zone_append_total_time_ns": 0,
"zone_append_operations": 0,
"rd_operations": 0,
"zone_append_bytes": 0,
"invalid_zone_append_operations": 0,
"invalid_wr_operations": 0,
"invalid_rd_operations": 0
},
@ -177,6 +189,7 @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b
"stats": {
"unmap_operations": 0,
"unmap_merged": 0,
"failed_zone_append_operations": 0,
"flush_total_time_ns": 0,
"wr_highest_offset": 0,
"wr_total_time_ns": 0,
@ -187,6 +200,7 @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b
"timed_stats": [
],
"failed_unmap_operations": 0,
"zone_append_merged": 0,
"failed_flush_operations": 0,
"account_invalid": true,
"rd_total_time_ns": 0,
@ -199,7 +213,11 @@ Testing: -blockdev driver=null-co,read-zeroes=on,node-name=null -device virtio-b
"unmap_total_time_ns": 0,
"invalid_flush_operations": 0,
"account_failed": true,
"zone_append_total_time_ns": 0,
"zone_append_operations": 0,
"rd_operations": 0,
"zone_append_bytes": 0,
"invalid_zone_append_operations": 0,
"invalid_wr_operations": 0,
"invalid_rd_operations": 0
},

105
tests/qemu-iotests/tests/zoned Executable file
View File

@ -0,0 +1,105 @@
#!/usr/bin/env bash
#
# Test zone management operations.
#
seq="$(basename $0)"
echo "QA output created by $seq"
status=1 # failure is the default!
_cleanup()
{
_cleanup_test_img
sudo -n rmmod null_blk
}
trap "_cleanup; exit \$status" 0 1 2 3 15
# get standard environment, filters and checks
. ../common.rc
. ../common.filter
. ../common.qemu
# This test only runs on Linux hosts with raw image files.
_supported_fmt raw
_supported_proto file
_supported_os Linux
sudo -n true || \
_notrun 'Password-less sudo required'
IMG="--image-opts -n driver=host_device,filename=/dev/nullb0"
QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
echo "Testing a null_blk device:"
echo "case 1: if the operations work"
sudo -n modprobe null_blk nr_devices=1 zoned=1
sudo -n chmod 0666 /dev/nullb0
echo "(1) report the first zone:"
$QEMU_IO $IMG -c "zrp 0 1"
echo
echo "report the first 10 zones"
$QEMU_IO $IMG -c "zrp 0 10"
echo
echo "report the last zone:"
$QEMU_IO $IMG -c "zrp 0x3e70000000 2" # 0x3e70000000 / 512 = 0x1f380000
echo
echo
echo "(2) opening the first zone"
$QEMU_IO $IMG -c "zo 0 268435456" # 268435456 / 512 = 524288
echo "report after:"
$QEMU_IO $IMG -c "zrp 0 1"
echo
echo "opening the second zone"
$QEMU_IO $IMG -c "zo 268435456 268435456" #
echo "report after:"
$QEMU_IO $IMG -c "zrp 268435456 1"
echo
echo "opening the last zone"
$QEMU_IO $IMG -c "zo 0x3e70000000 268435456"
echo "report after:"
$QEMU_IO $IMG -c "zrp 0x3e70000000 2"
echo
echo
echo "(3) closing the first zone"
$QEMU_IO $IMG -c "zc 0 268435456"
echo "report after:"
$QEMU_IO $IMG -c "zrp 0 1"
echo
echo "closing the last zone"
$QEMU_IO $IMG -c "zc 0x3e70000000 268435456"
echo "report after:"
$QEMU_IO $IMG -c "zrp 0x3e70000000 2"
echo
echo
echo "(4) finishing the second zone"
$QEMU_IO $IMG -c "zf 268435456 268435456"
echo "After finishing a zone:"
$QEMU_IO $IMG -c "zrp 268435456 1"
echo
echo
echo "(5) resetting the second zone"
$QEMU_IO $IMG -c "zrs 268435456 268435456"
echo "After resetting a zone:"
$QEMU_IO $IMG -c "zrp 268435456 1"
echo
echo
echo "(6) append write" # the physical block size of the device is 4096
$QEMU_IO $IMG -c "zrp 0 1"
$QEMU_IO $IMG -c "zap -p 0 0x1000 0x2000"
echo "After appending the first zone firstly:"
$QEMU_IO $IMG -c "zrp 0 1"
$QEMU_IO $IMG -c "zap -p 0 0x1000 0x2000"
echo "After appending the first zone secondly:"
$QEMU_IO $IMG -c "zrp 0 1"
$QEMU_IO $IMG -c "zap -p 268435456 0x1000 0x2000"
echo "After appending the second zone firstly:"
$QEMU_IO $IMG -c "zrp 268435456 1"
$QEMU_IO $IMG -c "zap -p 268435456 0x1000 0x2000"
echo "After appending the second zone secondly:"
$QEMU_IO $IMG -c "zrp 268435456 1"
# success, all done
echo "*** done"
rm -f $seq.full
status=0

View File

@ -0,0 +1,69 @@
QA output created by zoned
Testing a null_blk device:
case 1: test that the zone operations work
(1) report the first zone:
start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
report the first 10 zones
start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
start: 0x100000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:1, [type: 2]
start: 0x180000, len 0x80000, cap 0x80000, wptr 0x180000, zcond:1, [type: 2]
start: 0x200000, len 0x80000, cap 0x80000, wptr 0x200000, zcond:1, [type: 2]
start: 0x280000, len 0x80000, cap 0x80000, wptr 0x280000, zcond:1, [type: 2]
start: 0x300000, len 0x80000, cap 0x80000, wptr 0x300000, zcond:1, [type: 2]
start: 0x380000, len 0x80000, cap 0x80000, wptr 0x380000, zcond:1, [type: 2]
start: 0x400000, len 0x80000, cap 0x80000, wptr 0x400000, zcond:1, [type: 2]
start: 0x480000, len 0x80000, cap 0x80000, wptr 0x480000, zcond:1, [type: 2]
report the last zone:
start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:1, [type: 2]
(2) opening the first zone
report after:
start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:3, [type: 2]
opening the second zone
report after:
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:3, [type: 2]
opening the last zone
report after:
start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:3, [type: 2]
(3) closing the first zone
report after:
start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
closing the last zone
report after:
start: 0x1f380000, len 0x80000, cap 0x80000, wptr 0x1f380000, zcond:1, [type: 2]
(4) finishing the second zone
After finishing a zone:
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x100000, zcond:14, [type: 2]
(5) resetting the second zone
After resetting a zone:
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80000, zcond:1, [type: 2]
(6) append write
start: 0x0, len 0x80000, cap 0x80000, wptr 0x0, zcond:1, [type: 2]
After zap done, the append sector is 0x0
After the first append to the first zone:
start: 0x0, len 0x80000, cap 0x80000, wptr 0x18, zcond:2, [type: 2]
After zap done, the append sector is 0x18
After the second append to the first zone:
start: 0x0, len 0x80000, cap 0x80000, wptr 0x30, zcond:2, [type: 2]
After zap done, the append sector is 0x80000
After the first append to the second zone:
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80018, zcond:2, [type: 2]
After zap done, the append sector is 0x80018
After the second append to the second zone:
start: 0x80000, len 0x80000, cap 0x80000, wptr 0x80030, zcond:2, [type: 2]
*** done