From c519d9d55e701acf4d502c4aabad8644b75a0115 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 9 Nov 2020 12:23:18 +0100 Subject: [PATCH 01/56] hw/block/nvme: remove superfluous NvmeCtrl parameter nvme_check_bounds has no use of the NvmeCtrl parameter; remove it. Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im --- hw/block/nvme.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 27d2c72716..b0816d19ef 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -866,8 +866,8 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len) return NVME_SUCCESS; } -static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns, - uint64_t slba, uint32_t nlb) +static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba, + uint32_t nlb) { uint64_t nsze = le64_to_cpu(ns->id_ns.nsze); @@ -943,7 +943,7 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb); - status = nvme_check_bounds(n, ns, slba, nlb); + status = nvme_check_bounds(ns, slba, nlb); if (status) { trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); return status; @@ -979,7 +979,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) goto invalid; } - status = nvme_check_bounds(n, ns, slba, nlb); + status = nvme_check_bounds(ns, slba, nlb); if (status) { trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); goto invalid; From 54eea8d947869856133a4eba0c0cb0b17cb9a5cd Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 10 Nov 2020 08:53:20 +0100 Subject: [PATCH 02/56] hw/block/nvme: pull aio error handling Add a new function, nvme_aio_err, to handle errors resulting from AIOs and use this from the callbacks. Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 61 +++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index b0816d19ef..a245ff8ceb 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -878,6 +878,41 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba, return NVME_SUCCESS; } +static void nvme_aio_err(NvmeRequest *req, int ret) +{ + uint16_t status = NVME_SUCCESS; + Error *local_err = NULL; + + switch (req->cmd.opcode) { + case NVME_CMD_READ: + status = NVME_UNRECOVERED_READ; + break; + case NVME_CMD_FLUSH: + case NVME_CMD_WRITE: + case NVME_CMD_WRITE_ZEROES: + status = NVME_WRITE_FAULT; + break; + default: + status = NVME_INTERNAL_DEV_ERROR; + break; + } + + trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret), status); + + error_setg_errno(&local_err, -ret, "aio failed"); + error_report_err(local_err); + + /* + * Set the command status code to the first encountered error but allow a + * subsequent Internal Device Error to trump it. 
+ */ + if (req->status && status != NVME_INTERNAL_DEV_ERROR) { + return; + } + + req->status = status; +} + static void nvme_rw_cb(void *opaque, int ret) { NvmeRequest *req = opaque; @@ -887,37 +922,13 @@ static void nvme_rw_cb(void *opaque, int ret) BlockAcctCookie *acct = &req->acct; BlockAcctStats *stats = blk_get_stats(blk); - Error *local_err = NULL; - trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk)); if (!ret) { block_acct_done(stats, acct); } else { - uint16_t status; - block_acct_failed(stats, acct); - - switch (req->cmd.opcode) { - case NVME_CMD_READ: - status = NVME_UNRECOVERED_READ; - break; - case NVME_CMD_FLUSH: - case NVME_CMD_WRITE: - case NVME_CMD_WRITE_ZEROES: - status = NVME_WRITE_FAULT; - break; - default: - status = NVME_INTERNAL_DEV_ERROR; - break; - } - - trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret), status); - - error_setg_errno(&local_err, -ret, "aio failed"); - error_report_err(local_err); - - req->status = status; + nvme_aio_err(req, ret); } nvme_enqueue_req_completion(nvme_cq(req), req); From 54064e51d1e2c85e08ab64f05d12957ff65dfcd3 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 14 Oct 2020 09:55:08 +0200 Subject: [PATCH 03/56] hw/block/nvme: add dulbe support Add support for reporting the Deallocated or Unwritten Logical Block Error (DULBE). Rely on the block status flags reported by the block layer and consider any block with the BDRV_BLOCK_ZERO flag to be deallocated. Multiple factors affect when a Write Zeroes command results in deallocation of blocks. * the underlying file system block size * the blockdev format * the 'discard' and 'logical_block_size' parameters format | discard | wz (512B) wz (4KiB) wz (64KiB) ----------------------------------------------------- qcow2 ignore n n y qcow2 unmap n n y raw ignore n y y raw unmap n y y So, this works best with an image in raw format and 4KiB LBAs, since holes can then be punched on a per-block basis (this assumes a file system with a 4KiB block size, YMMV). A qcow2 image uses a cluster size of 64KiB by default and blocks will only be marked deallocated if a full cluster is zeroed or discarded. However, this *is* consistent with the spec since Write Zeroes "should" deallocate the block if the Deallocate attribute is set and "may" deallocate if the Deallocate attribute is not set. Thus, we always try to deallocate (the BDRV_REQ_MAY_UNMAP flag is always set).
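For reference, this is roughly how the pieces interact once the patch is applied (a condensed, illustrative sketch of the code in the diff below, not a verbatim excerpt):

    /* The host opts in per namespace by setting bit 16 (DULBE) of the
     * Error Recovery feature (FID 0x05); reads of a deallocated range
     * then fail with the new NVME_DULB status code: */
    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            goto invalid;
        }
    }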
Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme-ns.c | 8 ++-- hw/block/nvme-ns.h | 4 ++ hw/block/nvme.c | 91 ++++++++++++++++++++++++++++++++++++++++++- hw/block/trace-events | 4 ++ include/block/nvme.h | 5 +++ 5 files changed, 107 insertions(+), 5 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 2670787d26..53ded46034 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -33,9 +33,7 @@ static void nvme_ns_init(NvmeNamespace *ns) NvmeIdNs *id_ns = &ns->id_ns; int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas); - if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) { - ns->id_ns.dlfeat = 0x9; - } + ns->id_ns.dlfeat = 0x9; id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size); @@ -44,6 +42,9 @@ static void nvme_ns_init(NvmeNamespace *ns) /* no thin provisioning */ id_ns->ncap = id_ns->nsze; id_ns->nuse = id_ns->ncap; + + /* support DULBE */ + id_ns->nsfeat |= 0x4; } static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) @@ -93,6 +94,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) } nvme_ns_init(ns); + if (nvme_register_namespace(n, ns, errp)) { return -1; } diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 83734f4606..44bf6271b7 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -31,6 +31,10 @@ typedef struct NvmeNamespace { NvmeIdNs id_ns; NvmeNamespaceParams params; + + struct { + uint32_t err_rec; + } features; } NvmeNamespace; static inline uint32_t nvme_nsid(NvmeNamespace *ns) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index a245ff8ceb..6e6bdb338a 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -105,6 +105,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = { static const uint32_t nvme_feature_cap[NVME_FID_MAX] = { [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE, + [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS, [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE, [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE, [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE, @@ -878,6 +879,49 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba, return NVME_SUCCESS; } +static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba, + uint32_t nlb) +{ + BlockDriverState *bs = blk_bs(ns->blkconf.blk); + + int64_t pnum = 0, bytes = nvme_l2b(ns, nlb); + int64_t offset = nvme_l2b(ns, slba); + bool zeroed; + int ret; + + Error *local_err = NULL; + + /* + * `pnum` holds the number of bytes after offset that shares the same + * allocation status as the byte at offset. If `pnum` is different from + * `bytes`, we should check the allocation status of the next range and + * continue this until all bytes have been checked. 
+ */ + do { + bytes -= pnum; + + ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL); + if (ret < 0) { + error_setg_errno(&local_err, -ret, "unable to get block status"); + error_report_err(local_err); + + return NVME_INTERNAL_DEV_ERROR; + } + + zeroed = !!(ret & BDRV_BLOCK_ZERO); + + trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed); + + if (zeroed) { + return NVME_DULB; + } + + offset += pnum; + } while (pnum != bytes); + + return NVME_SUCCESS; +} + static void nvme_aio_err(NvmeRequest *req, int ret) { uint16_t status = NVME_SUCCESS; @@ -996,6 +1040,15 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) goto invalid; } + if (acct == BLOCK_ACCT_READ) { + if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { + status = nvme_check_dulbe(ns, slba, nlb); + if (status) { + goto invalid; + } + } + } + status = nvme_map_dptr(n, data_size, req); if (status) { goto invalid; @@ -1641,6 +1694,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) uint8_t fid = NVME_GETSETFEAT_FID(dw10); NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10); uint16_t iv; + NvmeNamespace *ns; static const uint32_t nvme_feature_default[NVME_FID_MAX] = { [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT, @@ -1703,6 +1757,18 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) } return NVME_INVALID_FIELD | NVME_DNR; + case NVME_ERROR_RECOVERY: + if (!nvme_nsid_valid(n, nsid)) { + return NVME_INVALID_NSID | NVME_DNR; + } + + ns = nvme_ns(n, nsid); + if (unlikely(!ns)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + result = ns->features.err_rec; + goto out; case NVME_VOLATILE_WRITE_CACHE: result = n->features.vwc; trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled"); @@ -1775,7 +1841,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) { - NvmeNamespace *ns; + NvmeNamespace *ns = NULL; NvmeCmd *cmd = &req->cmd; uint32_t dw10 = le32_to_cpu(cmd->cdw10); @@ -1783,6 +1849,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) uint32_t nsid = le32_to_cpu(cmd->nsid); uint8_t fid = NVME_GETSETFEAT_FID(dw10); uint8_t save = NVME_SETFEAT_SAVE(dw10); + int i; trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11); @@ -1842,11 +1909,31 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) NVME_LOG_SMART_INFO); } + break; + case NVME_ERROR_RECOVERY: + if (nsid == NVME_NSID_BROADCAST) { + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + + if (!ns) { + continue; + } + + if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) { + ns->features.err_rec = dw11; + } + } + + break; + } + + assert(ns); + ns->features.err_rec = dw11; break; case NVME_VOLATILE_WRITE_CACHE: n->features.vwc = dw11 & 0x1; - for (int i = 1; i <= n->num_namespaces; i++) { + for (i = 1; i <= n->num_namespaces; i++) { ns = nvme_ns(n, i); if (!ns) { continue; diff --git a/hw/block/trace-events b/hw/block/trace-events index c1537e3ac0..1ffe0b3f76 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -43,6 +43,10 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opna pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 
%"PRIu64" nlb %"PRIu32"" +pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d" +pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32"" +pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" +pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" diff --git a/include/block/nvme.h b/include/block/nvme.h index 3e02d9ca98..b663d11e60 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -693,6 +693,7 @@ enum NvmeStatusCodes { NVME_E2E_REF_ERROR = 0x0284, NVME_CMP_FAILURE = 0x0285, NVME_ACCESS_DENIED = 0x0286, + NVME_DULB = 0x0287, NVME_MORE = 0x2000, NVME_DNR = 0x4000, NVME_NO_COMPLETE = 0xffff, @@ -909,6 +910,9 @@ enum NvmeIdCtrlLpa { #define NVME_AEC_NS_ATTR(aec) ((aec >> 8) & 0x1) #define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1) +#define NVME_ERR_REC_TLER(err_rec) (err_rec & 0xffff) +#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x10000) + enum NvmeFeatureIds { NVME_ARBITRATION = 0x1, NVME_POWER_MANAGEMENT = 0x2, @@ -1029,6 +1033,7 @@ enum NvmeNsIdentifierType { #define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1)) +#define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1) #define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1) #define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf)) #define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1) From 6fd704a59af99056ee6ed1284784b092725a9416 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 23 Oct 2020 08:07:50 +0200 Subject: [PATCH 04/56] nvme: add namespace I/O optimization fields to shared header This adds the NPWG, NPWA, NPDG, NPDA and NOWS family of fields to the shared nvme.h header for use by later patches. Signed-off-by: Klaus Jensen Cc: Stefan Hajnoczi Cc: Fam Zheng Reviewed-by: Stefan Hajnoczi Reviewed-by: Minwoo Im --- include/block/nvme.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/block/nvme.h b/include/block/nvme.h index b663d11e60..11ac1c2b7d 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -996,7 +996,12 @@ typedef struct QEMU_PACKED NvmeIdNs { uint16_t nabspf; uint16_t noiob; uint8_t nvmcap[16]; - uint8_t rsvd64[40]; + uint16_t npwg; + uint16_t npwa; + uint16_t npdg; + uint16_t npda; + uint16_t nows; + uint8_t rsvd74[30]; uint8_t nguid[16]; uint64_t eui64; NvmeLBAF lbaf[16]; From 2605257a26b873eb8a0917391063df9c7ed7a976 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 21 Oct 2020 14:03:19 +0200 Subject: [PATCH 05/56] hw/block/nvme: add the dataset management command Add support for the Dataset Management command and the Deallocate attribute. Deallocation results in discards being sent to the underlying block device. Whether of not the blocks are actually deallocated is affected by the same factors as Write Zeroes (see previous commit). 
format | discard | dsm (512B) dsm (4KiB) dsm (64KiB) -------------------------------------------------------- qcow2 ignore n n n qcow2 unmap n n y raw ignore n n n raw unmap n y y Again, a raw format and 4KiB LBAs are preferable. In order to set the Namespace Preferred Deallocate Granularity and Alignment fields (NPDG and NPDA), choose a sane minimum discard granularity of 4KiB. If we are using a passthru device supporting discard at a 512B granularity, the user should set the discard_granularity property explicitly. NPDG and NPDA will also account for the cluster_size of the block driver if required (i.e. for QCOW2). See NVM Express 1.3d, Section 6.7 ("Dataset Management command"). Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme-ns.c | 30 ++++++++++++-- hw/block/nvme.c | 98 +++++++++++++++++++++++++++++++++++++++++++++- hw/block/nvme.h | 2 + 3 files changed, 125 insertions(+), 5 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 53ded46034..37f95951a6 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -28,10 +28,14 @@ #include "nvme.h" #include "nvme-ns.h" -static void nvme_ns_init(NvmeNamespace *ns) +#define MIN_DISCARD_GRANULARITY (4 * KiB) + +static int nvme_ns_init(NvmeNamespace *ns, Error **errp) { + BlockDriverInfo bdi; NvmeIdNs *id_ns = &ns->id_ns; int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas); + int npdg; ns->id_ns.dlfeat = 0x9; @@ -43,8 +47,19 @@ static void nvme_ns_init(NvmeNamespace *ns) id_ns->ncap = id_ns->nsze; id_ns->nuse = id_ns->ncap; - /* support DULBE */ - id_ns->nsfeat |= 0x4; + /* support DULBE and I/O optimization fields */ + id_ns->nsfeat |= (0x4 | 0x10); + + npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size; + + if (bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi) >= 0 && + bdi.cluster_size > ns->blkconf.discard_granularity) { + npdg = bdi.cluster_size / ns->blkconf.logical_block_size; + } + + id_ns->npda = id_ns->npdg = npdg - 1; + + return 0; } static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) @@ -60,6 +75,11 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) return -1; } + if (ns->blkconf.discard_granularity == -1) { + ns->blkconf.discard_granularity = + MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY); + } + ns->size = blk_getlength(ns->blkconf.blk); if (ns->size < 0) { error_setg_errno(errp, -ns->size, "could not get blockdev size"); @@ -93,7 +113,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) return -1; } - nvme_ns_init(ns); + if (nvme_ns_init(ns, errp)) { + return -1; + } if (nvme_register_namespace(n, ns, errp)) { return -1; diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 6e6bdb338a..f019d43788 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -242,6 +242,7 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq) static void nvme_req_clear(NvmeRequest *req) { req->ns = NULL; + req->opaque = NULL; memset(&req->cqe, 0x0, sizeof(req->cqe)); req->status = NVME_SUCCESS; } @@ -978,6 +979,99 @@ static void nvme_rw_cb(void *opaque, int ret) nvme_enqueue_req_completion(nvme_cq(req), req); } +static void nvme_aio_discard_cb(void *opaque, int ret) +{ + NvmeRequest *req = opaque; + uintptr_t *discards = (uintptr_t *)&req->opaque; + + trace_pci_nvme_aio_discard_cb(nvme_cid(req)); + + if (ret) { + nvme_aio_err(req, ret); + } + + (*discards)--; + + if (*discards) { + return; + } + + nvme_enqueue_req_completion(nvme_cq(req), req); +} + +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) +{ +
NvmeNamespace *ns = req->ns; + NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd; + + uint32_t attr = le32_to_cpu(dsm->attributes); + uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1; + + uint16_t status = NVME_SUCCESS; + + trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr); + + if (attr & NVME_DSMGMT_AD) { + int64_t offset; + size_t len; + NvmeDsmRange range[nr]; + uintptr_t *discards = (uintptr_t *)&req->opaque; + + status = nvme_dma(n, (uint8_t *)range, sizeof(range), + DMA_DIRECTION_TO_DEVICE, req); + if (status) { + return status; + } + + /* + * AIO callbacks may be called immediately, so initialize discards to 1 + * to make sure the callback does not complete the request before + * all discards have been issued. + */ + *discards = 1; + + for (int i = 0; i < nr; i++) { + uint64_t slba = le64_to_cpu(range[i].slba); + uint32_t nlb = le32_to_cpu(range[i].nlb); + + if (nvme_check_bounds(ns, slba, nlb)) { + trace_pci_nvme_err_invalid_lba_range(slba, nlb, + ns->id_ns.nsze); + continue; + } + + trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba, + nlb); + + offset = nvme_l2b(ns, slba); + len = nvme_l2b(ns, nlb); + + while (len) { + size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len); + + (*discards)++; + + blk_aio_pdiscard(ns->blkconf.blk, offset, bytes, + nvme_aio_discard_cb, req); + + offset += bytes; + len -= bytes; + } + } + + /* account for the 1-initialization */ + (*discards)--; + + if (*discards) { + status = NVME_NO_COMPLETE; + } else { + status = req->status; + } + } + + return status; +} + static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) { block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, @@ -1107,6 +1201,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) case NVME_CMD_WRITE: case NVME_CMD_READ: return nvme_rw(n, req); + case NVME_CMD_DSM: + return nvme_dsm(n, req); default: trace_pci_nvme_err_invalid_opc(req->cmd.opcode); return NVME_INVALID_OPCODE | NVME_DNR; @@ -2829,7 +2925,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->cqes = (0x4 << 4) | 0x4; id->nn = cpu_to_le32(n->num_namespaces); id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | - NVME_ONCS_FEATURES); + NVME_ONCS_FEATURES | NVME_ONCS_DSM); id->vwc = 0x1; id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | diff --git a/hw/block/nvme.h b/hw/block/nvme.h index e080a2318a..574333caa3 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -28,6 +28,7 @@ typedef struct NvmeRequest { struct NvmeNamespace *ns; BlockAIOCB *aiocb; uint16_t status; + void *opaque; NvmeCqe cqe; NvmeCmd cmd; BlockAcctCookie acct; @@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc) case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE"; case NVME_CMD_READ: return "NVME_NVM_CMD_READ"; case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES"; + case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM"; default: return "NVME_NVM_CMD_UNKNOWN"; } } From 0a384f923f519b4229c07816900e87f4a28d3abb Mon Sep 17 00:00:00 2001 From: Gollu Appalanaidu Date: Mon, 16 Nov 2020 11:14:02 +0100 Subject: [PATCH 06/56] hw/block/nvme: add compare command Add the Compare command. This implementation uses a bounce buffer to read in the data from storage and then compare it with the host-supplied buffer.
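The flow is roughly as follows (a condensed sketch of the code added below, not a verbatim excerpt):

    /* 1. Read the target LBA range into a controller-side bounce buffer. */
    bounce = g_malloc(len);
    qemu_iovec_init(&ctx->iov, 1);
    qemu_iovec_add(&ctx->iov, bounce, len);
    blk_aio_preadv(blk, offset, &ctx->iov, 0, nvme_compare_cb, req);

    /* 2. In nvme_compare_cb(), DMA in the host buffer and memcmp() it
     *    against the bounce buffer; any mismatch completes the command
     *    with NVME_CMP_FAILURE. */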
Signed-off-by: Gollu Appalanaidu [k.jensen: rebased] Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im Reviewed-by: Keith Busch --- hw/block/nvme.c | 101 +++++++++++++++++++++++++++++++++++++++++- hw/block/trace-events | 2 + 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index f019d43788..b9313fdc47 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -999,6 +999,51 @@ static void nvme_aio_discard_cb(void *opaque, int ret) nvme_enqueue_req_completion(nvme_cq(req), req); } +struct nvme_compare_ctx { + QEMUIOVector iov; + uint8_t *bounce; + size_t len; +}; + +static void nvme_compare_cb(void *opaque, int ret) +{ + NvmeRequest *req = opaque; + NvmeNamespace *ns = req->ns; + struct nvme_compare_ctx *ctx = req->opaque; + g_autofree uint8_t *buf = NULL; + uint16_t status; + + trace_pci_nvme_compare_cb(nvme_cid(req)); + + if (!ret) { + block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct); + } else { + block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct); + nvme_aio_err(req, ret); + goto out; + } + + buf = g_malloc(ctx->len); + + status = nvme_dma(nvme_ctrl(req), buf, ctx->len, DMA_DIRECTION_TO_DEVICE, + req); + if (status) { + req->status = status; + goto out; + } + + if (memcmp(buf, ctx->bounce, ctx->len)) { + req->status = NVME_CMP_FAILURE; + } + +out: + qemu_iovec_destroy(&ctx->iov); + g_free(ctx->bounce); + g_free(ctx); + + nvme_enqueue_req_completion(nvme_cq(req), req); +} + static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) { NvmeNamespace *ns = req->ns; @@ -1072,6 +1117,57 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req) return status; } +static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; + NvmeNamespace *ns = req->ns; + BlockBackend *blk = ns->blkconf.blk; + uint64_t slba = le64_to_cpu(rw->slba); + uint32_t nlb = le16_to_cpu(rw->nlb) + 1; + size_t len = nvme_l2b(ns, nlb); + int64_t offset = nvme_l2b(ns, slba); + uint8_t *bounce = NULL; + struct nvme_compare_ctx *ctx = NULL; + uint16_t status; + + trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb); + + status = nvme_check_mdts(n, len); + if (status) { + trace_pci_nvme_err_mdts(nvme_cid(req), len); + return status; + } + + status = nvme_check_bounds(ns, slba, nlb); + if (status) { + trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); + return status; + } + + if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { + status = nvme_check_dulbe(ns, slba, nlb); + if (status) { + return status; + } + } + + bounce = g_malloc(len); + + ctx = g_new(struct nvme_compare_ctx, 1); + ctx->bounce = bounce; + ctx->len = len; + + req->opaque = ctx; + + qemu_iovec_init(&ctx->iov, 1); + qemu_iovec_add(&ctx->iov, bounce, len); + + block_acct_start(blk_get_stats(blk), &req->acct, len, BLOCK_ACCT_READ); + blk_aio_preadv(blk, offset, &ctx->iov, 0, nvme_compare_cb, req); + + return NVME_NO_COMPLETE; +} + static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) { block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, @@ -1201,6 +1297,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) case NVME_CMD_WRITE: case NVME_CMD_READ: return nvme_rw(n, req); + case NVME_CMD_COMPARE: + return nvme_compare(n, req); case NVME_CMD_DSM: return nvme_dsm(n, req); default: @@ -2925,7 +3023,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->cqes = (0x4 << 4) | 0x4; id->nn = cpu_to_le32(n->num_namespaces); id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP | - 
NVME_ONCS_FEATURES | NVME_ONCS_DSM); + NVME_ONCS_FEATURES | NVME_ONCS_DSM | + NVME_ONCS_COMPARE); id->vwc = 0x1; id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN | diff --git a/hw/block/trace-events b/hw/block/trace-events index 1ffe0b3f76..68a4c8ed35 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -46,6 +46,8 @@ pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d" pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32"" pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" +pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32"" +pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" From e1f81c1478398713f14c1b6ba011d4bb841dea27 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 8 Dec 2020 08:43:04 +0100 Subject: [PATCH 07/56] hw/block/nvme: fix bad clearing of CAP Commit 37712e00b1f0 ("hw/block/nvme: factor out pmr setup") changed the control flow such that the CAP register is erroneously cleared after nvme_init_pmr() has configured it. Since the entire NvmeCtrl structure is zero-filled initially, there is no need for the explicit clearing, so just remove it. Fixes: 37712e00b1f0 ("hw/block/nvme: factor out pmr setup") Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch Reviewed-by: Minwoo Im --- hw/block/nvme.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index b9313fdc47..de52487aaf 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3038,7 +3038,6 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->psd[0].enlat = cpu_to_le32(0x10); id->psd[0].exlat = cpu_to_le32(0x4); - n->bar.cap = 0; NVME_CAP_SET_MQES(n->bar.cap, 0x7ff); NVME_CAP_SET_CQR(n->bar.cap, 1); NVME_CAP_SET_TO(n->bar.cap, 0xf); From ba69f224817437b5280f4c8ef511d09f17ab1305 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Wed, 9 Dec 2020 05:03:58 +0900 Subject: [PATCH 08/56] hw/block/nvme: Process controller reset and shutdown differently Controller reset and subsystem shutdown are handled very much the same in the current code, but some of the steps should be different in these two cases. Introduce two new functions, nvme_ctrl_reset() and nvme_ctrl_shutdown(), to separate some portions of the code from nvme_clear_ctrl(). The steps that differ between reset and shutdown are that BAR.CC is not reset to zero upon shutdown, and namespace data is flushed to backing storage as a part of shutdown handling, but not upon reset.
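From the host's point of view, the two paths now behave as follows (an illustrative summary; the CC field names are per the NVMe specification, not QEMU macros):

    /*
     * reset:    host writes CC.EN = 0 -> controller state cleared,
     *                                    CC zeroed, no flush performed
     * shutdown: host writes CC.SHN    -> controller state cleared,
     *                                    CC preserved, namespace data
     *                                    flushed to backing storage
     */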
Suggested-by: Klaus Jensen Signed-off-by: Dmitry Fomichev Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 2 +- hw/block/nvme-ns.h | 2 +- hw/block/nvme.c | 24 ++++++++++++++++++------ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 37f95951a6..a0de53e718 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -129,7 +129,7 @@ void nvme_ns_drain(NvmeNamespace *ns) blk_drain(ns->blkconf.blk); } -void nvme_ns_flush(NvmeNamespace *ns) +void nvme_ns_shutdown(NvmeNamespace *ns) { blk_flush(ns->blkconf.blk); } diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 44bf6271b7..ed3d7e65d5 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -73,6 +73,6 @@ typedef struct NvmeCtrl NvmeCtrl; int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp); void nvme_ns_drain(NvmeNamespace *ns); -void nvme_ns_flush(NvmeNamespace *ns); +void nvme_ns_shutdown(NvmeNamespace *ns); #endif /* NVME_NS_H */ diff --git a/hw/block/nvme.c b/hw/block/nvme.c index de52487aaf..f54c5c6ea4 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2295,6 +2295,20 @@ static void nvme_clear_ctrl(NvmeCtrl *n) n->aer_queued = 0; n->outstanding_aers = 0; n->qs_created = false; +} + +static void nvme_ctrl_reset(NvmeCtrl *n) +{ + nvme_clear_ctrl(n); + n->bar.cc = 0; +} + +static void nvme_ctrl_shutdown(NvmeCtrl *n) +{ + NvmeNamespace *ns; + int i; + + nvme_clear_ctrl(n); for (i = 1; i <= n->num_namespaces; i++) { ns = nvme_ns(n, i); @@ -2302,10 +2316,8 @@ static void nvme_clear_ctrl(NvmeCtrl *n) continue; } - nvme_ns_flush(ns); + nvme_ns_shutdown(ns); } - - n->bar.cc = 0; } static int nvme_start_ctrl(NvmeCtrl *n) @@ -2472,12 +2484,12 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, } } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) { trace_pci_nvme_mmio_stopped(); - nvme_clear_ctrl(n); + nvme_ctrl_reset(n); n->bar.csts &= ~NVME_CSTS_READY; } if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) { trace_pci_nvme_mmio_shutdown_set(); - nvme_clear_ctrl(n); + nvme_ctrl_shutdown(n); n->bar.cc = data; n->bar.csts |= NVME_CSTS_SHST_COMPLETE; } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) { @@ -3088,7 +3100,7 @@ static void nvme_exit(PCIDevice *pci_dev) { NvmeCtrl *n = NVME(pci_dev); - nvme_clear_ctrl(n); + nvme_ctrl_shutdown(n); g_free(n->cq); g_free(n->sq); g_free(n->aer_reqs); From b52f26cd1f743a63682d33a7e3d427b9610e9545 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Wed, 9 Dec 2020 05:03:59 +0900 Subject: [PATCH 09/56] hw/block/nvme: Generate namespace UUIDs In NVMe 1.4, a namespace must report an ID descriptor of UUID type if it doesn't support EUI64 or NGUID. Add a new namespace property, "uuid", that provides the user the option to either specify the UUID explicitly or have a UUID generated automatically every time a namespace is initialized. 
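For example (an illustrative command-line fragment; the rest of the configuration is omitted and the UUID value is arbitrary):

    -device nvme-ns,drive=nvm,nsid=1,uuid=f81d4fae-7dec-11d0-a765-00a0c91e6bf6

Leaving the "uuid" property unset results in a UUID being generated automatically, as described above.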
Suggested-by: Klaus Jensen Signed-off-by: Dmitry Fomichev Reviewed-by: Keith Busch Reviewed-by: Niklas Cassel Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 1 + hw/block/nvme-ns.h | 1 + hw/block/nvme.c | 9 +++++---- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index a0de53e718..f6d752b714 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -151,6 +151,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) static Property nvme_ns_props[] = { DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf), DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0), + DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index ed3d7e65d5..aeca810fc7 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -21,6 +21,7 @@ typedef struct NvmeNamespaceParams { uint32_t nsid; + QemuUUID uuid; } NvmeNamespaceParams; typedef struct NvmeNamespace { diff --git a/hw/block/nvme.c b/hw/block/nvme.c index f54c5c6ea4..7b243a56ef 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1765,6 +1765,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) { + NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint32_t nsid = le32_to_cpu(c->nsid); uint8_t list[NVME_IDENTIFY_DATA_SIZE]; @@ -1784,7 +1785,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_NSID | NVME_DNR; } - if (unlikely(!nvme_ns(n, nsid))) { + ns = nvme_ns(n, nsid); + if (unlikely(!ns)) { return NVME_INVALID_FIELD | NVME_DNR; } @@ -1793,12 +1795,11 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) /* * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data * structure, a Namespace UUID (nidt = 0x3) must be reported in the - * Namespace Identification Descriptor. Add a very basic Namespace UUID - * here. + * Namespace Identification Descriptor. Add the namespace UUID here. */ ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID; ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN; - stl_be_p(&ns_descrs->uuid.v, nsid); + memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDT_UUID_LEN); return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE, DMA_DIRECTION_FROM_DEVICE, req); From 13a7b6539df527e89ec59ca6b90f9e73d208a81a Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Wed, 9 Dec 2020 05:04:00 +0900 Subject: [PATCH 10/56] hw/block/nvme: Separate read and write handlers The majority of code in nvme_rw() is becoming read- or write-specific. Move these parts to two separate handlers, nvme_read() and nvme_write(), to make the code more readable and to remove the multiple is_write checks that have been present in the i/o path. This is a refactoring patch; there is no change in functionality.
Signed-off-by: Dmitry Fomichev Reviewed-by: Niklas Cassel Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 107 ++++++++++++++++++++++++++++-------------- hw/block/trace-events | 3 +- 2 files changed, 74 insertions(+), 36 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 7b243a56ef..905fd1ba93 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1176,6 +1176,61 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) return NVME_NO_COMPLETE; } +static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; + NvmeNamespace *ns = req->ns; + uint64_t slba = le64_to_cpu(rw->slba); + uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; + uint64_t data_size = nvme_l2b(ns, nlb); + uint64_t data_offset; + BlockBackend *blk = ns->blkconf.blk; + uint16_t status; + + trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, data_size, slba); + + status = nvme_check_mdts(n, data_size); + if (status) { + trace_pci_nvme_err_mdts(nvme_cid(req), data_size); + goto invalid; + } + + status = nvme_check_bounds(ns, slba, nlb); + if (status) { + trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); + goto invalid; + } + + status = nvme_map_dptr(n, data_size, req); + if (status) { + goto invalid; + } + + if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { + status = nvme_check_dulbe(ns, slba, nlb); + if (status) { + goto invalid; + } + } + + data_offset = nvme_l2b(ns, slba); + + block_acct_start(blk_get_stats(blk), &req->acct, data_size, + BLOCK_ACCT_READ); + if (req->qsg.sg) { + req->aiocb = dma_blk_read(blk, &req->qsg, data_offset, + BDRV_SECTOR_SIZE, nvme_rw_cb, req); + } else { + req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0, + nvme_rw_cb, req); + } + return NVME_NO_COMPLETE; + +invalid: + block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ); + return status | NVME_DNR; +} + static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; @@ -1201,22 +1256,19 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) return NVME_NO_COMPLETE; } -static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; NvmeNamespace *ns = req->ns; - uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; uint64_t slba = le64_to_cpu(rw->slba); - + uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; uint64_t data_size = nvme_l2b(ns, nlb); - uint64_t data_offset = nvme_l2b(ns, slba); - enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ? 
- BLOCK_ACCT_WRITE : BLOCK_ACCT_READ; + uint64_t data_offset; BlockBackend *blk = ns->blkconf.blk; uint16_t status; - trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode), - nvme_nsid(ns), nlb, data_size, slba); + trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode), + nvme_nsid(ns), nlb, data_size, slba); status = nvme_check_mdts(n, data_size); if (status) { @@ -1230,43 +1282,27 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req) goto invalid; } - if (acct == BLOCK_ACCT_READ) { - if (NVME_ERR_REC_DULBE(ns->features.err_rec)) { - status = nvme_check_dulbe(ns, slba, nlb); - if (status) { - goto invalid; - } - } - } - status = nvme_map_dptr(n, data_size, req); if (status) { goto invalid; } - block_acct_start(blk_get_stats(blk), &req->acct, data_size, acct); + data_offset = nvme_l2b(ns, slba); + + block_acct_start(blk_get_stats(blk), &req->acct, data_size, + BLOCK_ACCT_WRITE); if (req->qsg.sg) { - if (acct == BLOCK_ACCT_WRITE) { - req->aiocb = dma_blk_write(blk, &req->qsg, data_offset, - BDRV_SECTOR_SIZE, nvme_rw_cb, req); - } else { - req->aiocb = dma_blk_read(blk, &req->qsg, data_offset, - BDRV_SECTOR_SIZE, nvme_rw_cb, req); - } + req->aiocb = dma_blk_write(blk, &req->qsg, data_offset, + BDRV_SECTOR_SIZE, nvme_rw_cb, req); } else { - if (acct == BLOCK_ACCT_WRITE) { - req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0, - nvme_rw_cb, req); - } else { - req->aiocb = blk_aio_preadv(blk, data_offset, &req->iov, 0, - nvme_rw_cb, req); - } + req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0, + nvme_rw_cb, req); } return NVME_NO_COMPLETE; invalid: - block_acct_invalid(blk_get_stats(ns->blkconf.blk), acct); - return status; + block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE); + return status | NVME_DNR; } static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) @@ -1295,8 +1331,9 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) case NVME_CMD_WRITE_ZEROES: return nvme_write_zeroes(n, req); case NVME_CMD_WRITE: + return nvme_write(n, req); case NVME_CMD_READ: - return nvme_rw(n, req); + return nvme_read(n, req); case NVME_CMD_COMPARE: return nvme_compare(n, req); case NVME_CMD_DSM: diff --git a/hw/block/trace-events b/hw/block/trace-events index 68a4c8ed35..ec1b43220e 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -40,7 +40,8 @@ pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, pci_nvme_map_sgl(uint16_t cid, uint8_t typ, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" len %"PRIu64"" pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'" -pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" +pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" +pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid 
%"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d" From 3ec1d547a5b70482aac3a446c848f1a9eed165e6 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Wed, 9 Dec 2020 05:04:01 +0900 Subject: [PATCH 11/56] hw/block/nvme: Combine nvme_write_zeroes() and nvme_write() Move write processing to nvme_do_write() that now handles both WRITE and WRITE ZEROES. Both nvme_write() and nvme_write_zeroes() become inline helper functions. Signed-off-by: Dmitry Fomichev Reviewed-by: Niklas Cassel Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 78 ++++++++++++++++++++----------------------- hw/block/trace-events | 1 - 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 905fd1ba93..4d1ca8c466 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1231,32 +1231,7 @@ invalid: return status | NVME_DNR; } -static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) -{ - NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; - NvmeNamespace *ns = req->ns; - uint64_t slba = le64_to_cpu(rw->slba); - uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; - uint64_t offset = nvme_l2b(ns, slba); - uint32_t count = nvme_l2b(ns, nlb); - uint16_t status; - - trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb); - - status = nvme_check_bounds(ns, slba, nlb); - if (status) { - trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze); - return status; - } - - block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, - BLOCK_ACCT_WRITE); - req->aiocb = blk_aio_pwrite_zeroes(req->ns->blkconf.blk, offset, count, - BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req); - return NVME_NO_COMPLETE; -} - -static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req) +static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool wrz) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; NvmeNamespace *ns = req->ns; @@ -1270,10 +1245,12 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode), nvme_nsid(ns), nlb, data_size, slba); - status = nvme_check_mdts(n, data_size); - if (status) { - trace_pci_nvme_err_mdts(nvme_cid(req), data_size); - goto invalid; + if (!wrz) { + status = nvme_check_mdts(n, data_size); + if (status) { + trace_pci_nvme_err_mdts(nvme_cid(req), data_size); + goto invalid; + } } status = nvme_check_bounds(ns, slba, nlb); @@ -1282,21 +1259,28 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req) goto invalid; } - status = nvme_map_dptr(n, data_size, req); - if (status) { - goto invalid; - } - data_offset = nvme_l2b(ns, slba); - block_acct_start(blk_get_stats(blk), &req->acct, data_size, - BLOCK_ACCT_WRITE); - if (req->qsg.sg) { - req->aiocb = dma_blk_write(blk, &req->qsg, data_offset, - BDRV_SECTOR_SIZE, nvme_rw_cb, req); + if (!wrz) { + status = nvme_map_dptr(n, data_size, req); + if (status) { + goto invalid; + } + + block_acct_start(blk_get_stats(blk), &req->acct, data_size, + BLOCK_ACCT_WRITE); + if (req->qsg.sg) { + req->aiocb = dma_blk_write(blk, &req->qsg, data_offset, + BDRV_SECTOR_SIZE, nvme_rw_cb, req); + } else { + req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0, + nvme_rw_cb, req); + } } else { - req->aiocb = blk_aio_pwritev(blk, data_offset, &req->iov, 0, - nvme_rw_cb, req); + block_acct_start(blk_get_stats(blk), &req->acct, 0, BLOCK_ACCT_WRITE); + req->aiocb = blk_aio_pwrite_zeroes(blk, 
data_offset, data_size, + BDRV_REQ_MAY_UNMAP, nvme_rw_cb, + req); } return NVME_NO_COMPLETE; @@ -1305,6 +1289,16 @@ invalid: return status | NVME_DNR; } +static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req) +{ + return nvme_do_write(n, req, false); +} + +static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) +{ + return nvme_do_write(n, req, true); +} + static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) { uint32_t nsid = le32_to_cpu(req->cmd.nsid); diff --git a/hw/block/trace-events b/hw/block/trace-events index ec1b43220e..60262b03c9 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -43,7 +43,6 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opna pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64"" pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'" -pci_nvme_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d" pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32"" pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32"" From 62e8faa468109ae66b55a038cbd268ad6e40472b Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Wed, 9 Dec 2020 05:04:02 +0900 Subject: [PATCH 12/56] hw/block/nvme: Add Commands Supported and Effects log Implementing this log page is necessary to allow checking for Zone Append command support in the Zoned Namespace Command Set. This commit adds the code to report this log page for the NVM Command Set only. The parts that are specific to zoned operation will be added later in the series. All incoming admin and i/o commands are now only processed if their corresponding support bits are set in this log. This provides an easy way to control which commands to support and which not to, depending on the set CC.CSS value.
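With the log in place, opcode gating in the submission path reduces to a table lookup (condensed from the code added below):

    /* nvme_io_cmd(): reject opcodes whose support bit is not set in the
     * command set table selected for the namespace. */
    if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
        trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }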
Signed-off-by: Dmitry Fomichev Reviewed-by: Niklas Cassel Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.h | 1 + hw/block/nvme.c | 102 ++++++++++++++++++++++++++++++++++++++---- hw/block/trace-events | 1 + include/block/nvme.h | 19 ++++++++ 4 files changed, 114 insertions(+), 9 deletions(-) diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index aeca810fc7..bdeaf1c0de 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -30,6 +30,7 @@ typedef struct NvmeNamespace { int32_t bootindex; int64_t size; NvmeIdNs id_ns; + const uint32_t *iocs; NvmeNamespaceParams params; diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 4d1ca8c466..05e799623c 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -112,6 +112,30 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = { [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE, }; +static const uint32_t nvme_cse_acs[256] = { + [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP, + [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP, +}; + +static const uint32_t nvme_cse_iocs_none[256]; + +static const uint32_t nvme_cse_iocs_nvm[256] = { + [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, +}; + static void nvme_process_sq(void *opaque); static uint16_t nvme_cid(NvmeRequest *req) @@ -1306,10 +1330,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req), req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode)); - if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) { - return NVME_INVALID_OPCODE | NVME_DNR; - } - if (!nvme_nsid_valid(n, nsid)) { return NVME_INVALID_NSID | NVME_DNR; } @@ -1319,6 +1339,11 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } + if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { + trace_pci_nvme_err_invalid_opc(req->cmd.opcode); + return NVME_INVALID_OPCODE | NVME_DNR; + } + switch (req->cmd.opcode) { case NVME_CMD_FLUSH: return nvme_flush(n, req); @@ -1333,9 +1358,10 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) case NVME_CMD_DSM: return nvme_dsm(n, req); default: - trace_pci_nvme_err_invalid_opc(req->cmd.opcode); - return NVME_INVALID_OPCODE | NVME_DNR; + assert(false); } + + return NVME_INVALID_OPCODE | NVME_DNR; } static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n) @@ -1570,6 +1596,37 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, DMA_DIRECTION_FROM_DEVICE, req); } +static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len, + uint64_t off, NvmeRequest *req) +{ + NvmeEffectsLog log = {}; + const uint32_t *src_iocs = NULL; + uint32_t trans_len; + + if (off >= sizeof(log)) { + trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log)); + return NVME_INVALID_FIELD | NVME_DNR; + } + + switch (NVME_CC_CSS(n->bar.cc)) { + case NVME_CC_CSS_NVM: + src_iocs = nvme_cse_iocs_nvm; 
+ case NVME_CC_CSS_ADMIN_ONLY: + break; + } + + memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs)); + + if (src_iocs) { + memcpy(log.iocs, src_iocs, sizeof(log.iocs)); + } + + trans_len = MIN(sizeof(log) - off, buf_len); + + return nvme_dma(n, ((uint8_t *)&log) + off, trans_len, + DMA_DIRECTION_FROM_DEVICE, req); +} + static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) { NvmeCmd *cmd = &req->cmd; @@ -1613,6 +1670,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) return nvme_smart_info(n, rae, len, off, req); case NVME_LOG_FW_SLOT_INFO: return nvme_fw_log_info(n, len, off, req); + case NVME_LOG_CMD_EFFECTS: + return nvme_cmd_effects(n, len, off, req); default: trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid); return NVME_INVALID_FIELD | NVME_DNR; @@ -2229,6 +2288,11 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode, nvme_adm_opc_str(req->cmd.opcode)); + if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) { + trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode); + return NVME_INVALID_OPCODE | NVME_DNR; + } + switch (req->cmd.opcode) { case NVME_ADM_CMD_DELETE_SQ: return nvme_del_sq(n, req); @@ -2251,9 +2315,10 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req) case NVME_ADM_CMD_ASYNC_EV_REQ: return nvme_aer(n, req); default: - trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode); - return NVME_INVALID_OPCODE | NVME_DNR; + assert(false); } + + return NVME_INVALID_OPCODE | NVME_DNR; } static void nvme_process_sq(void *opaque) @@ -2352,6 +2417,23 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n) } } +static void nvme_select_ns_iocs(NvmeCtrl *n) +{ + NvmeNamespace *ns; + int i; + + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + ns->iocs = nvme_cse_iocs_none; + if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) { + ns->iocs = nvme_cse_iocs_nvm; + } + } +} + static int nvme_start_ctrl(NvmeCtrl *n) { uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12; @@ -2450,6 +2532,8 @@ static int nvme_start_ctrl(NvmeCtrl *n) QTAILQ_INIT(&n->aer_queue); + nvme_select_ns_iocs(n); + return 0; } @@ -3057,7 +3141,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) id->acl = 3; id->aerl = n->params.aerl; id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO; - id->lpa = NVME_LPA_NS_SMART | NVME_LPA_EXTENDED; + id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED; /* recommended default value (~70 C) */ id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING); diff --git a/hw/block/trace-events b/hw/block/trace-events index 60262b03c9..9e1a17e627 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -108,6 +108,7 @@ pci_nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PR pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" +pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "must be <= %"PRIu64", got %"PRIu64"" pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16"" pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16"" pci_nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16"" diff --git 
a/include/block/nvme.h b/include/block/nvme.h index 11ac1c2b7d..397f7ca3b5 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -752,10 +752,27 @@ enum NvmeSmartWarn { NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4, }; +typedef struct NvmeEffectsLog { + uint32_t acs[256]; + uint32_t iocs[256]; + uint8_t resv[2048]; +} NvmeEffectsLog; + +enum { + NVME_CMD_EFF_CSUPP = 1 << 0, + NVME_CMD_EFF_LBCC = 1 << 1, + NVME_CMD_EFF_NCC = 1 << 2, + NVME_CMD_EFF_NIC = 1 << 3, + NVME_CMD_EFF_CCC = 1 << 4, + NVME_CMD_EFF_CSE_MASK = 3 << 16, + NVME_CMD_EFF_UUID_SEL = 1 << 19, +}; + enum NvmeLogIdentifier { NVME_LOG_ERROR_INFO = 0x01, NVME_LOG_SMART_INFO = 0x02, NVME_LOG_FW_SLOT_INFO = 0x03, + NVME_LOG_CMD_EFFECTS = 0x05, }; typedef struct QEMU_PACKED NvmePSD { @@ -868,6 +885,7 @@ enum NvmeIdCtrlFrmw { enum NvmeIdCtrlLpa { NVME_LPA_NS_SMART = 1 << 0, + NVME_LPA_CSE = 1 << 1, NVME_LPA_EXTENDED = 1 << 2, }; @@ -1076,6 +1094,7 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512); QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512); + QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16); From 141354d55ba92768ea8fad54d441d3ba90d2b59e Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 9 Dec 2020 05:04:03 +0900 Subject: [PATCH 13/56] hw/block/nvme: Add support for Namespace Types Define the structures and constants required to implement Namespace Types support. Namespace Types introduce a new command set, "I/O Command Sets", that allows the host to retrieve the command sets associated with a namespace. Introduce support for the command set and enable detection for the NVM Command Set. The new workflows for identify commands rely heavily on zero-filled identify structs. E.g., certain CNS commands are defined to return a zero-filled identify struct when an inactive namespace NSID is supplied. Add a helper function in order to avoid code duplication when reporting zero-filled identify structures. 
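For orientation, the new CNS values dispatch as follows (a summary of the code below; the numeric values are taken from the NVMe specification and shown here for illustration only):

    /*
     * NVME_ID_CNS_CS_NS             (0x05) -> nvme_identify_ns_csi()
     * NVME_ID_CNS_CS_CTRL           (0x06) -> nvme_identify_ctrl_csi()
     * NVME_ID_CNS_CS_NS_ACTIVE_LIST (0x07) -> nvme_identify_nslist_csi()
     * NVME_ID_CNS_IO_COMMAND_SET    (0x1c) -> nvme_identify_cmd_set()
     */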
Signed-off-by: Niklas Cassel Signed-off-by: Dmitry Fomichev Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 2 + hw/block/nvme-ns.h | 1 + hw/block/nvme.c | 188 +++++++++++++++++++++++++++++++++++------- hw/block/trace-events | 6 ++ include/block/nvme.h | 64 ++++++++++---- 5 files changed, 217 insertions(+), 44 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index f6d752b714..f5f6284a4d 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -43,6 +43,8 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns)); + ns->csi = NVME_CSI_NVM; + /* no thin provisioning */ id_ns->ncap = id_ns->nsze; id_ns->nuse = id_ns->ncap; diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index bdeaf1c0de..bdbc98c2ec 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -31,6 +31,7 @@ typedef struct NvmeNamespace { int64_t size; NvmeIdNs id_ns; const uint32_t *iocs; + uint8_t csi; NvmeNamespaceParams params; diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 05e799623c..27679c8be8 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1596,7 +1596,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, DMA_DIRECTION_FROM_DEVICE, req); } -static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len, +static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, uint64_t off, NvmeRequest *req) { NvmeEffectsLog log = {}; @@ -1611,8 +1611,15 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len, switch (NVME_CC_CSS(n->bar.cc)) { case NVME_CC_CSS_NVM: src_iocs = nvme_cse_iocs_nvm; + /* fall through */ case NVME_CC_CSS_ADMIN_ONLY: break; + case NVME_CC_CSS_CSI: + switch (csi) { + case NVME_CSI_NVM: + src_iocs = nvme_cse_iocs_nvm; + break; + } } memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs)); @@ -1638,6 +1645,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) uint8_t lid = dw10 & 0xff; uint8_t lsp = (dw10 >> 8) & 0xf; uint8_t rae = (dw10 >> 15) & 0x1; + uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24; uint32_t numdl, numdu; uint64_t off, lpol, lpou; size_t len; @@ -1671,7 +1679,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req) case NVME_LOG_FW_SLOT_INFO: return nvme_fw_log_info(n, len, off, req); case NVME_LOG_CMD_EFFECTS: - return nvme_cmd_effects(n, len, off, req); + return nvme_cmd_effects(n, csi, len, off, req); default: trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid); return NVME_INVALID_FIELD | NVME_DNR; @@ -1784,6 +1792,13 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) return NVME_SUCCESS; } +static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req) +{ + uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {}; + + return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req); +} + static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) { trace_pci_nvme_identify_ctrl(); @@ -1792,11 +1807,23 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) DMA_DIRECTION_FROM_DEVICE, req); } +static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; + + trace_pci_nvme_identify_ctrl_csi(c->csi); + + if (c->csi == NVME_CSI_NVM) { + return nvme_rpt_empty_id_struct(n, req); + } + + return NVME_INVALID_FIELD | NVME_DNR; +} + static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) { NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; - NvmeIdNs *id_ns, inactive = { 0 }; uint32_t nsid = le32_to_cpu(c->nsid); 
trace_pci_nvme_identify_ns(nsid); @@ -1807,23 +1834,46 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) ns = nvme_ns(n, nsid); if (unlikely(!ns)) { - id_ns = &inactive; - } else { - id_ns = &ns->id_ns; + return nvme_rpt_empty_id_struct(n, req); } - return nvme_dma(n, (uint8_t *)id_ns, sizeof(NvmeIdNs), + return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), DMA_DIRECTION_FROM_DEVICE, req); } +static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeNamespace *ns; + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; + uint32_t nsid = le32_to_cpu(c->nsid); + + trace_pci_nvme_identify_ns_csi(nsid, c->csi); + + if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) { + return NVME_INVALID_NSID | NVME_DNR; + } + + ns = nvme_ns(n, nsid); + if (unlikely(!ns)) { + return nvme_rpt_empty_id_struct(n, req); + } + + if (c->csi == NVME_CSI_NVM) { + return nvme_rpt_empty_id_struct(n, req); + } + + return NVME_INVALID_FIELD | NVME_DNR; +} + static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) { + NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; - static const int data_len = NVME_IDENTIFY_DATA_SIZE; uint32_t min_nsid = le32_to_cpu(c->nsid); - uint32_t *list; - uint16_t ret; - int j = 0; + uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; + static const int data_len = sizeof(list); + uint32_t *list_ptr = (uint32_t *)list; + int i, j = 0; trace_pci_nvme_identify_nslist(min_nsid); @@ -1837,20 +1887,61 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_NSID | NVME_DNR; } - list = g_malloc0(data_len); - for (int i = 1; i <= n->num_namespaces; i++) { - if (i <= min_nsid || !nvme_ns(n, i)) { + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { continue; } - list[j++] = cpu_to_le32(i); + if (ns->params.nsid <= min_nsid) { + continue; + } + list_ptr[j++] = cpu_to_le32(ns->params.nsid); if (j == data_len / sizeof(uint32_t)) { break; } } - ret = nvme_dma(n, (uint8_t *)list, data_len, DMA_DIRECTION_FROM_DEVICE, - req); - g_free(list); - return ret; + + return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req); +} + +static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeNamespace *ns; + NvmeIdentify *c = (NvmeIdentify *)&req->cmd; + uint32_t min_nsid = le32_to_cpu(c->nsid); + uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; + static const int data_len = sizeof(list); + uint32_t *list_ptr = (uint32_t *)list; + int i, j = 0; + + trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi); + + /* + * Same as in nvme_identify_nslist(), 0xffffffff/0xfffffffe are invalid. 
+ */ + if (min_nsid >= NVME_NSID_BROADCAST - 1) { + return NVME_INVALID_NSID | NVME_DNR; + } + + if (c->csi != NVME_CSI_NVM) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + if (ns->params.nsid <= min_nsid) { + continue; + } + list_ptr[j++] = cpu_to_le32(ns->params.nsid); + if (j == data_len / sizeof(uint32_t)) { + break; + } + } + + return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) @@ -1858,13 +1949,17 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) NvmeNamespace *ns; NvmeIdentify *c = (NvmeIdentify *)&req->cmd; uint32_t nsid = le32_to_cpu(c->nsid); - uint8_t list[NVME_IDENTIFY_DATA_SIZE]; + uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; struct data { struct { NvmeIdNsDescr hdr; - uint8_t v[16]; + uint8_t v[NVME_NIDL_UUID]; } uuid; + struct { + NvmeIdNsDescr hdr; + uint8_t v; + } csi; }; struct data *ns_descrs = (struct data *)list; @@ -1880,19 +1975,31 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } - memset(list, 0x0, sizeof(list)); - /* * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data * structure, a Namespace UUID (nidt = 0x3) must be reported in the * Namespace Identification Descriptor. Add the namespace UUID here. */ ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID; - ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN; - memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDT_UUID_LEN); + ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID; + memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID); - return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE, - DMA_DIRECTION_FROM_DEVICE, req); + ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI; + ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI; + ns_descrs->csi.v = ns->csi; + + return nvme_dma(n, list, sizeof(list), DMA_DIRECTION_FROM_DEVICE, req); +} + +static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req) +{ + uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {}; + static const int data_len = sizeof(list); + + trace_pci_nvme_identify_cmd_set(); + + NVME_SET_CSI(*list, NVME_CSI_NVM); + return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req); } static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) @@ -1902,12 +2009,20 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) switch (le32_to_cpu(c->cns)) { case NVME_ID_CNS_NS: return nvme_identify_ns(n, req); + case NVME_ID_CNS_CS_NS: + return nvme_identify_ns_csi(n, req); case NVME_ID_CNS_CTRL: return nvme_identify_ctrl(n, req); + case NVME_ID_CNS_CS_CTRL: + return nvme_identify_ctrl_csi(n, req); case NVME_ID_CNS_NS_ACTIVE_LIST: return nvme_identify_nslist(n, req); + case NVME_ID_CNS_CS_NS_ACTIVE_LIST: + return nvme_identify_nslist_csi(n, req); case NVME_ID_CNS_NS_DESCR_LIST: return nvme_identify_ns_descr_list(n, req); + case NVME_ID_CNS_IO_COMMAND_SET: + return nvme_identify_cmd_set(n, req); default: trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns)); return NVME_INVALID_FIELD | NVME_DNR; @@ -2096,7 +2211,9 @@ defaults: if (iv == n->admin_cq.vector) { result |= NVME_INTVC_NOCOALESCING; } - + break; + case NVME_COMMAND_SET_PROFILE: + result = 0; break; default: result = nvme_feature_default[fid]; @@ -2258,6 +2375,12 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) break; case NVME_TIMESTAMP: return nvme_set_feature_timestamp(n, req); + case 
NVME_COMMAND_SET_PROFILE: + if (dw11 & 0x1ff) { + trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff); + return NVME_CMD_SET_CMB_REJECTED | NVME_DNR; + } + break; default: return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR; } @@ -2428,8 +2551,12 @@ static void nvme_select_ns_iocs(NvmeCtrl *n) continue; } ns->iocs = nvme_cse_iocs_none; - if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) { - ns->iocs = nvme_cse_iocs_nvm; + switch (ns->csi) { + case NVME_CSI_NVM: + if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) { + ns->iocs = nvme_cse_iocs_nvm; + } + break; } } } @@ -3170,6 +3297,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CAP_SET_CQR(n->bar.cap, 1); NVME_CAP_SET_TO(n->bar.cap, 0xf); NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM); + NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP); NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY); NVME_CAP_SET_MPSMAX(n->bar.cap, 4); diff --git a/hw/block/trace-events b/hw/block/trace-events index 9e1a17e627..e56e2b66b0 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -54,8 +54,12 @@ pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16"" pci_nvme_identify_ctrl(void) "identify controller" +pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8"" pci_nvme_identify_ns(uint32_t ns) "nsid %"PRIu32"" +pci_nvme_identify_ns_csi(uint32_t ns, uint8_t csi) "nsid=%"PRIu32", csi=0x%"PRIx8"" pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32"" +pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", csi=0x%"PRIx8"" +pci_nvme_identify_cmd_set(void) "identify i/o command set" pci_nvme_identify_ns_descr_list(uint32_t ns) "nsid %"PRIu32"" pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64"" pci_nvme_getfeat(uint16_t cid, uint32_t nsid, uint8_t fid, uint8_t sel, uint32_t cdw11) "cid %"PRIu16" nsid 0x%"PRIx32" fid 0x%"PRIx8" sel 0x%"PRIx8" cdw11 0x%"PRIx32"" @@ -109,6 +113,7 @@ pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "must be <= %"PRIu64", got %"PRIu64"" +pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination index %"PRIu32"" pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16"" pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16"" pci_nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16"" @@ -165,6 +170,7 @@ pci_nvme_ub_db_wr_invalid_cq(uint32_t qid) "completion queue doorbell write for pci_nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion queue doorbell write value beyond queue size, cqid=%"PRIu32", new_head=%"PRIu16", ignoring" pci_nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring" pci_nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", 
new_head=%"PRIu16", ignoring" +pci_nvme_ub_unknown_css_value(void) "unknown value in cc.css field" # xen-block.c xen_block_realize(const char *type, uint32_t disk, uint32_t partition) "%s d%up%u" diff --git a/include/block/nvme.h b/include/block/nvme.h index 397f7ca3b5..19347cf69e 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -84,6 +84,7 @@ enum NvmeCapMask { enum NvmeCapCss { NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_CSI_SUPP = 1 << 6, NVME_CAP_CSS_ADMIN_ONLY = 1 << 7, }; @@ -117,9 +118,25 @@ enum NvmeCcMask { enum NvmeCcCss { NVME_CC_CSS_NVM = 0x0, + NVME_CC_CSS_CSI = 0x6, NVME_CC_CSS_ADMIN_ONLY = 0x7, }; +#define NVME_SET_CC_EN(cc, val) \ + (cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT) +#define NVME_SET_CC_CSS(cc, val) \ + (cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT) +#define NVME_SET_CC_MPS(cc, val) \ + (cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT) +#define NVME_SET_CC_AMS(cc, val) \ + (cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT) +#define NVME_SET_CC_SHN(cc, val) \ + (cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT) +#define NVME_SET_CC_IOSQES(cc, val) \ + (cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT) +#define NVME_SET_CC_IOCQES(cc, val) \ + (cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT) + enum NvmeCstsShift { CSTS_RDY_SHIFT = 0, CSTS_CFS_SHIFT = 1, @@ -540,8 +557,13 @@ typedef struct QEMU_PACKED NvmeIdentify { uint64_t rsvd2[2]; uint64_t prp1; uint64_t prp2; - uint32_t cns; - uint32_t rsvd11[5]; + uint8_t cns; + uint8_t rsvd10; + uint16_t ctrlid; + uint16_t nvmsetid; + uint8_t rsvd11; + uint8_t csi; + uint32_t rsvd12[4]; } NvmeIdentify; typedef struct QEMU_PACKED NvmeRwCmd { @@ -662,6 +684,7 @@ enum NvmeStatusCodes { NVME_SGL_DESCR_TYPE_INVALID = 0x0011, NVME_INVALID_USE_OF_CMB = 0x0012, NVME_INVALID_PRP_OFFSET = 0x0013, + NVME_CMD_SET_CMB_REJECTED = 0x002b, NVME_LBA_RANGE = 0x0080, NVME_CAP_EXCEEDED = 0x0081, NVME_NS_NOT_READY = 0x0082, @@ -789,11 +812,15 @@ typedef struct QEMU_PACKED NvmePSD { #define NVME_IDENTIFY_DATA_SIZE 4096 -enum { - NVME_ID_CNS_NS = 0x0, - NVME_ID_CNS_CTRL = 0x1, - NVME_ID_CNS_NS_ACTIVE_LIST = 0x2, - NVME_ID_CNS_NS_DESCR_LIST = 0x3, +enum NvmeIdCns { + NVME_ID_CNS_NS = 0x00, + NVME_ID_CNS_CTRL = 0x01, + NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_DESCR_LIST = 0x03, + NVME_ID_CNS_CS_NS = 0x05, + NVME_ID_CNS_CS_CTRL = 0x06, + NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07, + NVME_ID_CNS_IO_COMMAND_SET = 0x1c, }; typedef struct QEMU_PACKED NvmeIdCtrl { @@ -944,6 +971,7 @@ enum NvmeFeatureIds { NVME_WRITE_ATOMICITY = 0xa, NVME_ASYNCHRONOUS_EVENT_CONF = 0xb, NVME_TIMESTAMP = 0xe, + NVME_COMMAND_SET_PROFILE = 0x19, NVME_SOFTWARE_PROGRESS_MARKER = 0x80, NVME_FID_MAX = 0x100, }; @@ -1033,18 +1061,26 @@ typedef struct QEMU_PACKED NvmeIdNsDescr { uint8_t rsvd2[2]; } NvmeIdNsDescr; -enum { - NVME_NIDT_EUI64_LEN = 8, - NVME_NIDT_NGUID_LEN = 16, - NVME_NIDT_UUID_LEN = 16, +enum NvmeNsIdentifierLength { + NVME_NIDL_EUI64 = 8, + NVME_NIDL_NGUID = 16, + NVME_NIDL_UUID = 16, + NVME_NIDL_CSI = 1, }; enum NvmeNsIdentifierType { - NVME_NIDT_EUI64 = 0x1, - NVME_NIDT_NGUID = 0x2, - NVME_NIDT_UUID = 0x3, + NVME_NIDT_EUI64 = 0x01, + NVME_NIDT_NGUID = 0x02, + NVME_NIDT_UUID = 0x03, + NVME_NIDT_CSI = 0x04, }; +enum NvmeCsi { + NVME_CSI_NVM = 0x00, +}; + +#define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi))) + /*Deallocate Logical Block Features*/ #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat) ((dlfeat) & 0x10) #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat) ((dlfeat) & 0x08) From 
922e6f4ebd09c69c354b7b008552f0002521f5c8 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 9 Dec 2020 05:04:04 +0900 Subject: [PATCH 14/56] hw/block/nvme: Support allocated CNS command variants

Many CNS commands have "allocated" command variants. These variants include a namespace as long as it is allocated; that is, a namespace is included regardless of whether it is active (attached) or not.

While these commands are optional (they are only mandatory for controllers that support the Namespace Attachment command), supporting these CNS values makes our QEMU implementation more complete. However, since our QEMU model currently does not support the Namespace Attachment command, the new allocated CNS commands will return the same results as the active CNS command variants.

The attachment command is not hooked up completely because the NVMe specification requires the Namespace Management command to be supported whenever the Namespace Attachment command is supported.

Signed-off-by: Niklas Cassel Signed-off-by: Dmitry Fomichev Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 8 ++++++++ include/block/nvme.h | 20 ++++++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 27679c8be8..f1cc66d381 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2008,16 +2008,24 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req) switch (le32_to_cpu(c->cns)) { case NVME_ID_CNS_NS: + /* fall through */ + case NVME_ID_CNS_NS_PRESENT: return nvme_identify_ns(n, req); case NVME_ID_CNS_CS_NS: + /* fall through */ + case NVME_ID_CNS_CS_NS_PRESENT: return nvme_identify_ns_csi(n, req); case NVME_ID_CNS_CTRL: return nvme_identify_ctrl(n, req); case NVME_ID_CNS_CS_CTRL: return nvme_identify_ctrl_csi(n, req); case NVME_ID_CNS_NS_ACTIVE_LIST: + /* fall through */ + case NVME_ID_CNS_NS_PRESENT_LIST: return nvme_identify_nslist(n, req); case NVME_ID_CNS_CS_NS_ACTIVE_LIST: + /* fall through */ + case NVME_ID_CNS_CS_NS_PRESENT_LIST: return nvme_identify_nslist_csi(n, req); case NVME_ID_CNS_NS_DESCR_LIST: return nvme_identify_ns_descr_list(n, req);

diff --git a/include/block/nvme.h b/include/block/nvme.h index 19347cf69e..adb5806365 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -813,14 +813,18 @@ typedef struct QEMU_PACKED NvmePSD { #define NVME_IDENTIFY_DATA_SIZE 4096 enum NvmeIdCns { - NVME_ID_CNS_NS = 0x00, - NVME_ID_CNS_CTRL = 0x01, - NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, - NVME_ID_CNS_NS_DESCR_LIST = 0x03, - NVME_ID_CNS_CS_NS = 0x05, - NVME_ID_CNS_CS_CTRL = 0x06, - NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07, - NVME_ID_CNS_IO_COMMAND_SET = 0x1c, + NVME_ID_CNS_NS = 0x00, + NVME_ID_CNS_CTRL = 0x01, + NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_DESCR_LIST = 0x03, + NVME_ID_CNS_CS_NS = 0x05, + NVME_ID_CNS_CS_CTRL = 0x06, + NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07, + NVME_ID_CNS_NS_PRESENT_LIST = 0x10, + NVME_ID_CNS_NS_PRESENT = 0x11, + NVME_ID_CNS_CS_NS_PRESENT_LIST = 0x1a, + NVME_ID_CNS_CS_NS_PRESENT = 0x1b, + NVME_ID_CNS_IO_COMMAND_SET = 0x1c, }; typedef struct QEMU_PACKED NvmeIdCtrl {

From e9ba46eeafd7062724d21b92750c7919951e16a8 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Wed, 9 Dec 2020 05:04:05 +0900 Subject: [PATCH 15/56] nvme: Make ZNS-related definitions

Define values and structures that are needed to support the Zoned Namespace Command Set (NVMe TP 4053).
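As a quick illustration of how the new structures compose (a sketch only, not part of the patch; the helper name is hypothetical): a Zone Management Receive report buffer is a 64-byte NvmeZoneReportHeader followed by one 64-byte NvmeZoneDescr per reported zone, so a buffer able to hold nr_zones descriptors is sized as below. Both sizes are locked in by the QEMU_BUILD_BUG_ON() checks added in this patch.

    static inline size_t nvme_zone_report_size(uint64_t nr_zones)
    {
        /* 64-byte report header plus one 64-byte descriptor per zone */
        return sizeof(NvmeZoneReportHeader) + nr_zones * sizeof(NvmeZoneDescr);
    }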
Signed-off-by: Dmitry Fomichev Acked-by: Stefan Hajnoczi Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- include/block/nvme.h | 114 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 1 deletion(-) diff --git a/include/block/nvme.h b/include/block/nvme.h index adb5806365..9494246f1f 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -489,6 +489,9 @@ enum NvmeIoCommands { NVME_CMD_COMPARE = 0x05, NVME_CMD_WRITE_ZEROES = 0x08, NVME_CMD_DSM = 0x09, + NVME_CMD_ZONE_MGMT_SEND = 0x79, + NVME_CMD_ZONE_MGMT_RECV = 0x7a, + NVME_CMD_ZONE_APPEND = 0x7d, }; typedef struct QEMU_PACKED NvmeDeleteQ { @@ -654,9 +657,13 @@ typedef struct QEMU_PACKED NvmeAerResult { uint8_t resv; } NvmeAerResult; +typedef struct QEMU_PACKED NvmeZonedResult { + uint64_t slba; +} NvmeZonedResult; + typedef struct QEMU_PACKED NvmeCqe { uint32_t result; - uint32_t rsvd; + uint32_t dw1; uint16_t sq_head; uint16_t sq_id; uint16_t cid; @@ -685,6 +692,7 @@ enum NvmeStatusCodes { NVME_INVALID_USE_OF_CMB = 0x0012, NVME_INVALID_PRP_OFFSET = 0x0013, NVME_CMD_SET_CMB_REJECTED = 0x002b, + NVME_INVALID_CMD_SET = 0x002c, NVME_LBA_RANGE = 0x0080, NVME_CAP_EXCEEDED = 0x0081, NVME_NS_NOT_READY = 0x0082, @@ -709,6 +717,14 @@ enum NvmeStatusCodes { NVME_CONFLICTING_ATTRS = 0x0180, NVME_INVALID_PROT_INFO = 0x0181, NVME_WRITE_TO_RO = 0x0182, + NVME_ZONE_BOUNDARY_ERROR = 0x01b8, + NVME_ZONE_FULL = 0x01b9, + NVME_ZONE_READ_ONLY = 0x01ba, + NVME_ZONE_OFFLINE = 0x01bb, + NVME_ZONE_INVALID_WRITE = 0x01bc, + NVME_ZONE_TOO_MANY_ACTIVE = 0x01bd, + NVME_ZONE_TOO_MANY_OPEN = 0x01be, + NVME_ZONE_INVAL_TRANSITION = 0x01bf, NVME_WRITE_FAULT = 0x0280, NVME_UNRECOVERED_READ = 0x0281, NVME_E2E_GUARD_ERROR = 0x0282, @@ -894,6 +910,11 @@ typedef struct QEMU_PACKED NvmeIdCtrl { uint8_t vs[1024]; } NvmeIdCtrl; +typedef struct NvmeIdCtrlZoned { + uint8_t zasl; + uint8_t rsvd1[4095]; +} NvmeIdCtrlZoned; + enum NvmeIdCtrlOacs { NVME_OACS_SECURITY = 1 << 0, NVME_OACS_FORMAT = 1 << 1, @@ -1022,6 +1043,12 @@ typedef struct QEMU_PACKED NvmeLBAF { uint8_t rp; } NvmeLBAF; +typedef struct QEMU_PACKED NvmeLBAFE { + uint64_t zsze; + uint8_t zdes; + uint8_t rsvd9[7]; +} NvmeLBAFE; + #define NVME_NSID_BROADCAST 0xffffffff typedef struct QEMU_PACKED NvmeIdNs { @@ -1081,10 +1108,24 @@ enum NvmeNsIdentifierType { enum NvmeCsi { NVME_CSI_NVM = 0x00, + NVME_CSI_ZONED = 0x02, }; #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi))) +typedef struct QEMU_PACKED NvmeIdNsZoned { + uint16_t zoc; + uint16_t ozcs; + uint32_t mar; + uint32_t mor; + uint32_t rrl; + uint32_t frl; + uint8_t rsvd20[2796]; + NvmeLBAFE lbafe[16]; + uint8_t rsvd3072[768]; + uint8_t vs[256]; +} NvmeIdNsZoned; + /*Deallocate Logical Block Features*/ #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat) ((dlfeat) & 0x10) #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat) ((dlfeat) & 0x08) @@ -1117,10 +1158,76 @@ enum NvmeIdNsDps { DPS_FIRST_EIGHT = 8, }; +enum NvmeZoneAttr { + NVME_ZA_FINISHED_BY_CTLR = 1 << 0, + NVME_ZA_FINISH_RECOMMENDED = 1 << 1, + NVME_ZA_RESET_RECOMMENDED = 1 << 2, + NVME_ZA_ZD_EXT_VALID = 1 << 7, +}; + +typedef struct QEMU_PACKED NvmeZoneReportHeader { + uint64_t nr_zones; + uint8_t rsvd[56]; +} NvmeZoneReportHeader; + +enum NvmeZoneReceiveAction { + NVME_ZONE_REPORT = 0, + NVME_ZONE_REPORT_EXTENDED = 1, +}; + +enum NvmeZoneReportType { + NVME_ZONE_REPORT_ALL = 0, + NVME_ZONE_REPORT_EMPTY = 1, + NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2, + NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3, + NVME_ZONE_REPORT_CLOSED = 4, + NVME_ZONE_REPORT_FULL = 5, + 
NVME_ZONE_REPORT_READ_ONLY = 6, + NVME_ZONE_REPORT_OFFLINE = 7, +}; + +enum NvmeZoneType { + NVME_ZONE_TYPE_RESERVED = 0x00, + NVME_ZONE_TYPE_SEQ_WRITE = 0x02, +}; + +enum NvmeZoneSendAction { + NVME_ZONE_ACTION_RSD = 0x00, + NVME_ZONE_ACTION_CLOSE = 0x01, + NVME_ZONE_ACTION_FINISH = 0x02, + NVME_ZONE_ACTION_OPEN = 0x03, + NVME_ZONE_ACTION_RESET = 0x04, + NVME_ZONE_ACTION_OFFLINE = 0x05, + NVME_ZONE_ACTION_SET_ZD_EXT = 0x10, +}; + +typedef struct QEMU_PACKED NvmeZoneDescr { + uint8_t zt; + uint8_t zs; + uint8_t za; + uint8_t rsvd3[5]; + uint64_t zcap; + uint64_t zslba; + uint64_t wp; + uint8_t rsvd32[32]; +} NvmeZoneDescr; + +enum NvmeZoneState { + NVME_ZONE_STATE_RESERVED = 0x00, + NVME_ZONE_STATE_EMPTY = 0x01, + NVME_ZONE_STATE_IMPLICITLY_OPEN = 0x02, + NVME_ZONE_STATE_EXPLICITLY_OPEN = 0x03, + NVME_ZONE_STATE_CLOSED = 0x04, + NVME_ZONE_STATE_READ_ONLY = 0x0D, + NVME_ZONE_STATE_FULL = 0x0E, + NVME_ZONE_STATE_OFFLINE = 0x0F, +}; + static inline void _nvme_check_size(void) { QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeZonedResult) != 8); QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64); @@ -1136,8 +1243,13 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64); QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512); QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512); QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsZoned) != 4096); QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsDescr) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64); } #endif

From a479335bfaf3aa286924c14fbd75307456ebe1fa Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Wed, 9 Dec 2020 05:04:06 +0900 Subject: [PATCH 16/56] hw/block/nvme: Support Zoned Namespace Command Set

The emulation code has been changed to advertise the NVM Command Set when the "zoned" device property is not set (the default) and the Zoned Namespace Command Set otherwise.

Define values and structures that are needed to support the Zoned Namespace Command Set (NVMe TP 4053) in the PCI NVMe controller emulator. Define trace events where needed in newly introduced code.

In order to improve scalability, all open, closed and full zones are organized in separate linked lists. Consequently, almost all zone operations don't require scanning the entire zone array (which can potentially be quite large); it is only necessary to enumerate one or more zone lists.

Handlers for the three new NVMe commands introduced in the Zoned Namespace Command Set specification are added, namely for Zone Management Receive, Zone Management Send and Zone Append. Device initialization code has been extended to create a proper configuration for zoned operation using device properties.

The Read/Write command handler is modified to only allow writes at the write pointer if the namespace is zoned. For the Zone Append command, writes implicitly happen at the write pointer and the starting write pointer value is returned as the result of the command. The Write Zeroes handler is modified to add zoned checks that are identical to those done as part of the Write flow.

Subsequent commits in this series add ZDE support and checks for active and open zone limits.
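For illustration, a zoned namespace can be configured with the new properties along these lines (an example invocation; the drive id and image file name are arbitrary, and the controller/namespace split follows the existing nvme and nvme-ns devices):

    -drive file=zns.raw,id=nvmezns0,format=raw,if=none \
    -device nvme,serial=deadbeef,zoned.append_size_limit=128K \
    -device nvme-ns,drive=nvmezns0,zoned=true,zoned.zone_size=128M,zoned.zone_capacity=128M,zoned.cross_read=true

With these values each zone is 128 MiB with a fully usable capacity, reads may cross zone boundaries, and Zone Append data transfers are capped at 128 KiB (the default limit).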
Signed-off-by: Niklas Cassel Signed-off-by: Hans Holmberg Signed-off-by: Ajay Joshi Signed-off-by: Chaitanya Kulkarni Signed-off-by: Matias Bjorling Signed-off-by: Aravind Ramesh Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Adam Manzanares Signed-off-by: Dmitry Fomichev Reviewed-by: Niklas Cassel Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 166 +++++++++ hw/block/nvme-ns.h | 52 +++ hw/block/nvme.c | 807 +++++++++++++++++++++++++++++++++++++++++- hw/block/nvme.h | 6 + hw/block/trace-events | 17 + 5 files changed, 1040 insertions(+), 8 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index f5f6284a4d..d79452c627 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -25,6 +25,7 @@ #include "hw/qdev-properties.h" #include "hw/qdev-core.h" +#include "trace.h" #include "nvme.h" #include "nvme-ns.h" @@ -95,6 +96,147 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) return 0; } +static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp) +{ + uint64_t zone_size, zone_cap; + uint32_t lbasz = ns->blkconf.logical_block_size; + + /* Make sure that the values of ZNS properties are sane */ + if (ns->params.zone_size_bs) { + zone_size = ns->params.zone_size_bs; + } else { + zone_size = NVME_DEFAULT_ZONE_SIZE; + } + if (ns->params.zone_cap_bs) { + zone_cap = ns->params.zone_cap_bs; + } else { + zone_cap = zone_size; + } + if (zone_cap > zone_size) { + error_setg(errp, "zone capacity %"PRIu64"B exceeds " + "zone size %"PRIu64"B", zone_cap, zone_size); + return -1; + } + if (zone_size < lbasz) { + error_setg(errp, "zone size %"PRIu64"B too small, " + "must be at least %"PRIu32"B", zone_size, lbasz); + return -1; + } + if (zone_cap < lbasz) { + error_setg(errp, "zone capacity %"PRIu64"B too small, " + "must be at least %"PRIu32"B", zone_cap, lbasz); + return -1; + } + + /* + * Save the main zone geometry values to avoid + * calculating them later again. + */ + ns->zone_size = zone_size / lbasz; + ns->zone_capacity = zone_cap / lbasz; + ns->num_zones = ns->size / lbasz / ns->zone_size; + return 0; +} + +static void nvme_ns_zoned_init_state(NvmeNamespace *ns) +{ + uint64_t start = 0, zone_size = ns->zone_size; + uint64_t capacity = ns->num_zones * zone_size; + NvmeZone *zone; + int i; + + ns->zone_array = g_new0(NvmeZone, ns->num_zones); + + QTAILQ_INIT(&ns->exp_open_zones); + QTAILQ_INIT(&ns->imp_open_zones); + QTAILQ_INIT(&ns->closed_zones); + QTAILQ_INIT(&ns->full_zones); + + zone = ns->zone_array; + for (i = 0; i < ns->num_zones; i++, zone++) { + if (start + zone_size > capacity) { + zone_size = capacity - start; + } + zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE; + nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY); + zone->d.za = 0; + zone->d.zcap = ns->zone_capacity; + zone->d.zslba = start; + zone->d.wp = start; + zone->w_ptr = start; + start += zone_size; + } + + ns->zone_size_log2 = 0; + if (is_power_of_2(ns->zone_size)) { + ns->zone_size_log2 = 63 - clz64(ns->zone_size); + } +} + +static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index) +{ + NvmeIdNsZoned *id_ns_z; + + nvme_ns_zoned_init_state(ns); + + id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned)); + + /* MAR/MOR are zeroes-based, 0xffffffff means no limit */ + id_ns_z->mar = 0xffffffff; + id_ns_z->mor = 0xffffffff; + id_ns_z->zoc = 0; + id_ns_z->ozcs = ns->params.cross_zone_read ? 
0x01 : 0x00; + + id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size); + id_ns_z->lbafe[lba_index].zdes = 0; + + ns->csi = NVME_CSI_ZONED; + ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size); + ns->id_ns.ncap = ns->id_ns.nsze; + ns->id_ns.nuse = ns->id_ns.ncap; + + ns->id_ns_zoned = id_ns_z; +} + +static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone) +{ + uint8_t state; + + zone->w_ptr = zone->d.wp; + state = nvme_get_zone_state(zone); + if (zone->d.wp != zone->d.zslba) { + if (state != NVME_ZONE_STATE_CLOSED) { + trace_pci_nvme_clear_ns_close(state, zone->d.zslba); + nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED); + } + QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry); + } else { + trace_pci_nvme_clear_ns_reset(state, zone->d.zslba); + nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY); + } +} + +/* + * Close all the zones that are currently open. + */ +static void nvme_zoned_ns_shutdown(NvmeNamespace *ns) +{ + NvmeZone *zone, *next; + + QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { + QTAILQ_REMOVE(&ns->closed_zones, zone, entry); + nvme_clear_zone(ns, zone); + } + QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { + QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); + nvme_clear_zone(ns, zone); + } + QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { + QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry); + nvme_clear_zone(ns, zone); + } +} + static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp) { if (!ns->blkconf.blk) { @@ -118,6 +260,12 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) if (nvme_ns_init(ns, errp)) { return -1; } + if (ns->params.zoned) { + if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) { + return -1; + } + nvme_ns_init_zoned(n, ns, 0); + } if (nvme_register_namespace(n, ns, errp)) { return -1; @@ -134,6 +282,17 @@ void nvme_ns_drain(NvmeNamespace *ns) void nvme_ns_shutdown(NvmeNamespace *ns) { blk_flush(ns->blkconf.blk); + if (ns->params.zoned) { + nvme_zoned_ns_shutdown(ns); + } +} + +void nvme_ns_cleanup(NvmeNamespace *ns) +{ + if (ns->params.zoned) { + g_free(ns->id_ns_zoned); + g_free(ns->zone_array); + } } static void nvme_ns_realize(DeviceState *dev, Error **errp) @@ -154,6 +313,13 @@ static Property nvme_ns_props[] = { DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf), DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0), DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid), + DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false), + DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs, + NVME_DEFAULT_ZONE_SIZE), + DEFINE_PROP_SIZE("zoned.zone_capacity", NvmeNamespace, params.zone_cap_bs, + 0), + DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace, + params.cross_zone_read, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index bdbc98c2ec..388381dda0 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -19,9 +19,20 @@ #define NVME_NS(obj) \ OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS) +typedef struct NvmeZone { + NvmeZoneDescr d; + uint64_t w_ptr; + QTAILQ_ENTRY(NvmeZone) entry; +} NvmeZone; + typedef struct NvmeNamespaceParams { uint32_t nsid; QemuUUID uuid; + + bool zoned; + bool cross_zone_read; + uint64_t zone_size_bs; + uint64_t zone_cap_bs; } NvmeNamespaceParams; typedef struct NvmeNamespace { @@ -33,6 +44,17 @@ typedef struct NvmeNamespace { const uint32_t *iocs; uint8_t csi; + NvmeIdNsZoned *id_ns_zoned; + NvmeZone *zone_array; + QTAILQ_HEAD(, NvmeZone) exp_open_zones; + 
QTAILQ_HEAD(, NvmeZone) imp_open_zones; + QTAILQ_HEAD(, NvmeZone) closed_zones; + QTAILQ_HEAD(, NvmeZone) full_zones; + uint32_t num_zones; + uint64_t zone_size; + uint64_t zone_capacity; + uint32_t zone_size_log2; + NvmeNamespaceParams params; struct { @@ -74,8 +96,38 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba) typedef struct NvmeCtrl NvmeCtrl; +static inline enum NvmeZoneState nvme_get_zone_state(NvmeZone *zone) +{ + return zone->d.zs >> 4; +} + +static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState state) +{ + zone->d.zs = state << 4; +} + +static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone) +{ + return zone->d.zslba + ns->zone_size; +} + +static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone) +{ + return zone->d.zslba + zone->d.zcap; +} + +static inline bool nvme_wp_is_valid(NvmeZone *zone) +{ + uint8_t st = nvme_get_zone_state(zone); + + return st != NVME_ZONE_STATE_FULL && + st != NVME_ZONE_STATE_READ_ONLY && + st != NVME_ZONE_STATE_OFFLINE; +} + int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp); void nvme_ns_drain(NvmeNamespace *ns); void nvme_ns_shutdown(NvmeNamespace *ns); +void nvme_ns_cleanup(NvmeNamespace *ns); #endif /* NVME_NS_H */ diff --git a/hw/block/nvme.c b/hw/block/nvme.c index f1cc66d381..be83be3fb5 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -136,6 +136,18 @@ static const uint32_t nvme_cse_iocs_nvm[256] = { [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, }; +static const uint32_t nvme_cse_iocs_zoned[256] = { + [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP, + [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC, + [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP, +}; + static void nvme_process_sq(void *opaque); static uint16_t nvme_cid(NvmeRequest *req) @@ -152,6 +164,48 @@ static uint16_t nvme_sqid(NvmeRequest *req) return le16_to_cpu(req->sq->sqid); } +static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone, + enum NvmeZoneState state) +{ + if (QTAILQ_IN_USE(zone, entry)) { + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry); + break; + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); + break; + case NVME_ZONE_STATE_CLOSED: + QTAILQ_REMOVE(&ns->closed_zones, zone, entry); + break; + case NVME_ZONE_STATE_FULL: + QTAILQ_REMOVE(&ns->full_zones, zone, entry); + default: + ; + } + } + + nvme_set_zone_state(zone, state); + + switch (state) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry); + break; + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry); + break; + case NVME_ZONE_STATE_CLOSED: + QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry); + break; + case NVME_ZONE_STATE_FULL: + QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry); + case NVME_ZONE_STATE_READ_ONLY: + break; + default: + zone->d.za = 0; + } +} + static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) { hwaddr low = n->ctrl_mem.addr; @@ -959,6 +1013,7 @@ static void nvme_aio_err(NvmeRequest *req, int ret) case NVME_CMD_FLUSH: case NVME_CMD_WRITE: 
case NVME_CMD_WRITE_ZEROES: + case NVME_CMD_ZONE_APPEND: status = NVME_WRITE_FAULT; break; default: @@ -982,6 +1037,201 @@ static void nvme_aio_err(NvmeRequest *req, int ret) req->status = status; } +static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba) +{ + return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 : + slba / ns->zone_size; +} + +static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba) +{ + uint32_t zone_idx = nvme_zone_idx(ns, slba); + + assert(zone_idx < ns->num_zones); + return &ns->zone_array[zone_idx]; +} + +static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone) +{ + uint16_t status; + + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EMPTY: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_CLOSED: + status = NVME_SUCCESS; + break; + case NVME_ZONE_STATE_FULL: + status = NVME_ZONE_FULL; + break; + case NVME_ZONE_STATE_OFFLINE: + status = NVME_ZONE_OFFLINE; + break; + case NVME_ZONE_STATE_READ_ONLY: + status = NVME_ZONE_READ_ONLY; + break; + default: + assert(false); + } + + return status; +} + +static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns, + NvmeZone *zone, uint64_t slba, + uint32_t nlb, bool append) +{ + uint16_t status; + + if (unlikely((slba + nlb) > nvme_zone_wr_boundary(zone))) { + status = NVME_ZONE_BOUNDARY_ERROR; + } else { + status = nvme_check_zone_state_for_write(zone); + } + + if (status != NVME_SUCCESS) { + trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status); + } else { + assert(nvme_wp_is_valid(zone)); + if (append) { + if (unlikely(slba != zone->d.zslba)) { + trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba); + status = NVME_ZONE_INVALID_WRITE; + } + if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) { + trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl); + status = NVME_INVALID_FIELD; + } + } else if (unlikely(slba != zone->w_ptr)) { + trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, + zone->w_ptr); + status = NVME_ZONE_INVALID_WRITE; + } + } + + return status; +} + +static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone) +{ + uint16_t status; + + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EMPTY: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_FULL: + case NVME_ZONE_STATE_CLOSED: + case NVME_ZONE_STATE_READ_ONLY: + status = NVME_SUCCESS; + break; + case NVME_ZONE_STATE_OFFLINE: + status = NVME_ZONE_OFFLINE; + break; + default: + assert(false); + } + + return status; +} + +static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, + uint32_t nlb) +{ + NvmeZone *zone = nvme_get_zone_by_slba(ns, slba); + uint64_t bndry = nvme_zone_rd_boundary(ns, zone); + uint64_t end = slba + nlb; + uint16_t status; + + status = nvme_check_zone_state_for_read(zone); + if (status != NVME_SUCCESS) { + ; + } else if (unlikely(end > bndry)) { + if (!ns->params.cross_zone_read) { + status = NVME_ZONE_BOUNDARY_ERROR; + } else { + /* + * Read across zone boundary - check that all subsequent + * zones that are being read have an appropriate state. 
+ */ + do { + zone++; + status = nvme_check_zone_state_for_read(zone); + if (status != NVME_SUCCESS) { + break; + } + } while (end > nvme_zone_rd_boundary(ns, zone)); + } + } + + return status; +} + +static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req, + bool failed) +{ + NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; + NvmeZone *zone; + NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe; + uint64_t slba; + uint32_t nlb; + + slba = le64_to_cpu(rw->slba); + nlb = le16_to_cpu(rw->nlb) + 1; + zone = nvme_get_zone_by_slba(ns, slba); + + if (failed) { + res->slba = 0; + zone->d.wp += nlb; + } else if (zone->w_ptr == nvme_zone_wr_boundary(zone)) { + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_CLOSED: + case NVME_ZONE_STATE_EMPTY: + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); + /* fall through */ + case NVME_ZONE_STATE_FULL: + break; + default: + assert(false); + } + zone->d.wp = zone->w_ptr; + } else { + zone->d.wp += nlb; + } +} + +static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, + uint32_t nlb) +{ + uint64_t result = zone->w_ptr; + uint8_t zs; + + zone->w_ptr += nlb; + + if (zone->w_ptr < nvme_zone_wr_boundary(zone)) { + zs = nvme_get_zone_state(zone); + switch (zs) { + case NVME_ZONE_STATE_EMPTY: + case NVME_ZONE_STATE_CLOSED: + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN); + } + } + + return result; +} + +static inline bool nvme_is_write(NvmeRequest *req) +{ + NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; + + return rw->opcode == NVME_CMD_WRITE || + rw->opcode == NVME_CMD_ZONE_APPEND || + rw->opcode == NVME_CMD_WRITE_ZEROES; +} + static void nvme_rw_cb(void *opaque, int ret) { NvmeRequest *req = opaque; @@ -993,6 +1243,10 @@ static void nvme_rw_cb(void *opaque, int ret) trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk)); + if (ns->params.zoned && nvme_is_write(req)) { + nvme_finalize_zoned_write(ns, req, ret != 0); + } + if (!ret) { block_acct_done(stats, acct); } else { @@ -1225,6 +1479,14 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req) goto invalid; } + if (ns->params.zoned) { + status = nvme_check_zone_read(ns, slba, nlb); + if (status != NVME_SUCCESS) { + trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status); + goto invalid; + } + } + status = nvme_map_dptr(n, data_size, req); if (status) { goto invalid; @@ -1255,7 +1517,8 @@ invalid: return status | NVME_DNR; } -static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool wrz) +static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, + bool wrz) { NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd; NvmeNamespace *ns = req->ns; @@ -1263,6 +1526,8 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool wrz) uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1; uint64_t data_size = nvme_l2b(ns, nlb); uint64_t data_offset; + NvmeZone *zone; + NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe; BlockBackend *blk = ns->blkconf.blk; uint16_t status; @@ -1283,6 +1548,25 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool wrz) goto invalid; } + if (ns->params.zoned) { + zone = nvme_get_zone_by_slba(ns, slba); + + status = nvme_check_zone_write(n, ns, zone, slba, nlb, append); + if (status != NVME_SUCCESS) { + goto invalid; + } + + if (append) { + slba = zone->w_ptr; + } + + res->slba = nvme_advance_zone_wp(ns, zone, nlb); + } else if (append) { + trace_pci_nvme_err_invalid_opc(rw->opcode); + status = NVME_INVALID_OPCODE; + goto invalid; 
+ } + data_offset = nvme_l2b(ns, slba); if (!wrz) { @@ -1315,12 +1599,445 @@ invalid: static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req) { - return nvme_do_write(n, req, false); + return nvme_do_write(n, req, false, false); } static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req) { - return nvme_do_write(n, req, true); + return nvme_do_write(n, req, false, true); +} + +static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req) +{ + return nvme_do_write(n, req, true, false); +} + +static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c, + uint64_t *slba, uint32_t *zone_idx) +{ + uint32_t dw10 = le32_to_cpu(c->cdw10); + uint32_t dw11 = le32_to_cpu(c->cdw11); + + if (!ns->params.zoned) { + trace_pci_nvme_err_invalid_opc(c->opcode); + return NVME_INVALID_OPCODE | NVME_DNR; + } + + *slba = ((uint64_t)dw11) << 32 | dw10; + if (unlikely(*slba >= ns->id_ns.nsze)) { + trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze); + *slba = 0; + return NVME_LBA_RANGE | NVME_DNR; + } + + *zone_idx = nvme_zone_idx(ns, *slba); + assert(*zone_idx < ns->num_zones); + + return NVME_SUCCESS; +} + +typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, + enum NvmeZoneState); + +enum NvmeZoneProcessingMask { + NVME_PROC_CURRENT_ZONE = 0, + NVME_PROC_IMP_OPEN_ZONES = 1 << 0, + NVME_PROC_EXP_OPEN_ZONES = 1 << 1, + NVME_PROC_CLOSED_ZONES = 1 << 2, + NVME_PROC_READ_ONLY_ZONES = 1 << 3, + NVME_PROC_FULL_ZONES = 1 << 4, +}; + +static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, + enum NvmeZoneState state) +{ + switch (state) { + case NVME_ZONE_STATE_EMPTY: + case NVME_ZONE_STATE_CLOSED: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN); + /* fall through */ + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + return NVME_SUCCESS; + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, + enum NvmeZoneState state) +{ + switch (state) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); + /* fall through */ + case NVME_ZONE_STATE_CLOSED: + return NVME_SUCCESS; + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, + enum NvmeZoneState state) +{ + switch (state) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_CLOSED: + case NVME_ZONE_STATE_EMPTY: + zone->w_ptr = nvme_zone_wr_boundary(zone); + zone->d.wp = zone->w_ptr; + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); + /* fall through */ + case NVME_ZONE_STATE_FULL: + return NVME_SUCCESS; + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, + enum NvmeZoneState state) +{ + switch (state) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_CLOSED: + case NVME_ZONE_STATE_FULL: + zone->w_ptr = zone->d.zslba; + zone->d.wp = zone->w_ptr; + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY); + /* fall through */ + case NVME_ZONE_STATE_EMPTY: + return NVME_SUCCESS; + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone, + enum NvmeZoneState state) +{ + switch (state) { + case NVME_ZONE_STATE_READ_ONLY: + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE); 
+ /* fall through */ + case NVME_ZONE_STATE_OFFLINE: + return NVME_SUCCESS; + default: + return NVME_ZONE_INVAL_TRANSITION; + } +} + +static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, + enum NvmeZoneProcessingMask proc_mask, + op_handler_t op_hndlr) +{ + uint16_t status = NVME_SUCCESS; + enum NvmeZoneState zs = nvme_get_zone_state(zone); + bool proc_zone; + + switch (zs) { + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + proc_zone = proc_mask & NVME_PROC_IMP_OPEN_ZONES; + break; + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + proc_zone = proc_mask & NVME_PROC_EXP_OPEN_ZONES; + break; + case NVME_ZONE_STATE_CLOSED: + proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES; + break; + case NVME_ZONE_STATE_READ_ONLY: + proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES; + break; + case NVME_ZONE_STATE_FULL: + proc_zone = proc_mask & NVME_PROC_FULL_ZONES; + break; + default: + proc_zone = false; + } + + if (proc_zone) { + status = op_hndlr(ns, zone, zs); + } + + return status; +} + +static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone, + enum NvmeZoneProcessingMask proc_mask, + op_handler_t op_hndlr) +{ + NvmeZone *next; + uint16_t status = NVME_SUCCESS; + int i; + + if (!proc_mask) { + status = op_hndlr(ns, zone, nvme_get_zone_state(zone)); + } else { + if (proc_mask & NVME_PROC_CLOSED_ZONES) { + QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); + if (status != NVME_SUCCESS) { + goto out; + } + } + } + if (proc_mask & NVME_PROC_IMP_OPEN_ZONES) { + QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); + if (status != NVME_SUCCESS) { + goto out; + } + } + } + if (proc_mask & NVME_PROC_EXP_OPEN_ZONES) { + QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); + if (status != NVME_SUCCESS) { + goto out; + } + } + } + if (proc_mask & NVME_PROC_FULL_ZONES) { + QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); + if (status != NVME_SUCCESS) { + goto out; + } + } + } + + if (proc_mask & NVME_PROC_READ_ONLY_ZONES) { + for (i = 0; i < ns->num_zones; i++, zone++) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); + if (status != NVME_SUCCESS) { + goto out; + } + } + } + } + +out: + return status; +} + +static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeCmd *cmd = (NvmeCmd *)&req->cmd; + NvmeNamespace *ns = req->ns; + NvmeZone *zone; + uint32_t dw13 = le32_to_cpu(cmd->cdw13); + uint64_t slba = 0; + uint32_t zone_idx = 0; + uint16_t status; + uint8_t action; + bool all; + enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE; + + action = dw13 & 0xff; + all = dw13 & 0x100; + + req->status = NVME_SUCCESS; + + if (!all) { + status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx); + if (status) { + return status; + } + } + + zone = &ns->zone_array[zone_idx]; + if (slba != zone->d.zslba) { + trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba); + return NVME_INVALID_FIELD | NVME_DNR; + } + + switch (action) { + + case NVME_ZONE_ACTION_OPEN: + if (all) { + proc_mask = NVME_PROC_CLOSED_ZONES; + } + trace_pci_nvme_open_zone(slba, zone_idx, all); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone); + break; + + case NVME_ZONE_ACTION_CLOSE: + if (all) { + proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES; + } + 
trace_pci_nvme_close_zone(slba, zone_idx, all); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone); + break; + + case NVME_ZONE_ACTION_FINISH: + if (all) { + proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES | + NVME_PROC_CLOSED_ZONES; + } + trace_pci_nvme_finish_zone(slba, zone_idx, all); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone); + break; + + case NVME_ZONE_ACTION_RESET: + if (all) { + proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES | + NVME_PROC_CLOSED_ZONES | NVME_PROC_FULL_ZONES; + } + trace_pci_nvme_reset_zone(slba, zone_idx, all); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone); + break; + + case NVME_ZONE_ACTION_OFFLINE: + if (all) { + proc_mask = NVME_PROC_READ_ONLY_ZONES; + } + trace_pci_nvme_offline_zone(slba, zone_idx, all); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone); + break; + + case NVME_ZONE_ACTION_SET_ZD_EXT: + trace_pci_nvme_set_descriptor_extension(slba, zone_idx); + return NVME_INVALID_FIELD | NVME_DNR; + break; + + default: + trace_pci_nvme_err_invalid_mgmt_action(action); + status = NVME_INVALID_FIELD; + } + + if (status == NVME_ZONE_INVAL_TRANSITION) { + trace_pci_nvme_err_invalid_zone_state_transition(action, slba, + zone->d.za); + } + if (status) { + status |= NVME_DNR; + } + + return status; +} + +static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl) +{ + enum NvmeZoneState zs = nvme_get_zone_state(zl); + + switch (zafs) { + case NVME_ZONE_REPORT_ALL: + return true; + case NVME_ZONE_REPORT_EMPTY: + return zs == NVME_ZONE_STATE_EMPTY; + case NVME_ZONE_REPORT_IMPLICITLY_OPEN: + return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN; + case NVME_ZONE_REPORT_EXPLICITLY_OPEN: + return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN; + case NVME_ZONE_REPORT_CLOSED: + return zs == NVME_ZONE_STATE_CLOSED; + case NVME_ZONE_REPORT_FULL: + return zs == NVME_ZONE_STATE_FULL; + case NVME_ZONE_REPORT_READ_ONLY: + return zs == NVME_ZONE_STATE_READ_ONLY; + case NVME_ZONE_REPORT_OFFLINE: + return zs == NVME_ZONE_STATE_OFFLINE; + default: + return false; + } +} + +static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) +{ + NvmeCmd *cmd = (NvmeCmd *)&req->cmd; + NvmeNamespace *ns = req->ns; + /* cdw12 is zero-based number of dwords to return. 
Convert to bytes */ + uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2; + uint32_t dw13 = le32_to_cpu(cmd->cdw13); + uint32_t zone_idx, zra, zrasf, partial; + uint64_t max_zones, nr_zones = 0; + uint16_t status; + uint64_t slba, capacity = nvme_ns_nlbas(ns); + NvmeZoneDescr *z; + NvmeZone *zone; + NvmeZoneReportHeader *header; + void *buf, *buf_p; + size_t zone_entry_sz; + + req->status = NVME_SUCCESS; + + status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx); + if (status) { + return status; + } + + zra = dw13 & 0xff; + if (zra != NVME_ZONE_REPORT) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + zrasf = (dw13 >> 8) & 0xff; + if (zrasf > NVME_ZONE_REPORT_OFFLINE) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + if (data_size < sizeof(NvmeZoneReportHeader)) { + return NVME_INVALID_FIELD | NVME_DNR; + } + + status = nvme_check_mdts(n, data_size); + if (status) { + trace_pci_nvme_err_mdts(nvme_cid(req), data_size); + return status; + } + + partial = (dw13 >> 16) & 0x01; + + zone_entry_sz = sizeof(NvmeZoneDescr); + + max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz; + buf = g_malloc0(data_size); + + zone = &ns->zone_array[zone_idx]; + for (; slba < capacity; slba += ns->zone_size) { + if (partial && nr_zones >= max_zones) { + break; + } + if (nvme_zone_matches_filter(zrasf, zone++)) { + nr_zones++; + } + } + header = (NvmeZoneReportHeader *)buf; + header->nr_zones = cpu_to_le64(nr_zones); + + buf_p = buf + sizeof(NvmeZoneReportHeader); + for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) { + zone = &ns->zone_array[zone_idx]; + if (nvme_zone_matches_filter(zrasf, zone)) { + z = (NvmeZoneDescr *)buf_p; + buf_p += sizeof(NvmeZoneDescr); + + z->zt = zone->d.zt; + z->zs = zone->d.zs; + z->zcap = cpu_to_le64(zone->d.zcap); + z->zslba = cpu_to_le64(zone->d.zslba); + z->za = zone->d.za; + + if (nvme_wp_is_valid(zone)) { + z->wp = cpu_to_le64(zone->d.wp); + } else { + z->wp = cpu_to_le64(~0ULL); + } + + max_zones--; + } + } + + status = nvme_dma(n, (uint8_t *)buf, data_size, + DMA_DIRECTION_FROM_DEVICE, req); + + g_free(buf); + + return status; } static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) @@ -1349,6 +2066,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return nvme_flush(n, req); case NVME_CMD_WRITE_ZEROES: return nvme_write_zeroes(n, req); + case NVME_CMD_ZONE_APPEND: + return nvme_zone_append(n, req); case NVME_CMD_WRITE: return nvme_write(n, req); case NVME_CMD_READ: @@ -1357,6 +2076,10 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req) return nvme_compare(n, req); case NVME_CMD_DSM: return nvme_dsm(n, req); + case NVME_CMD_ZONE_MGMT_SEND: + return nvme_zone_mgmt_send(n, req); + case NVME_CMD_ZONE_MGMT_RECV: + return nvme_zone_mgmt_recv(n, req); default: assert(false); } @@ -1619,6 +2342,9 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len, case NVME_CSI_NVM: src_iocs = nvme_cse_iocs_nvm; break; + case NVME_CSI_ZONED: + src_iocs = nvme_cse_iocs_zoned; + break; } } @@ -1799,6 +2525,16 @@ static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req) return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req); } +static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns) +{ + switch (ns->csi) { + case NVME_CSI_NVM: + case NVME_CSI_ZONED: + return true; + } + return false; +} + static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req) { trace_pci_nvme_identify_ctrl(); @@ -1810,11 +2546,18 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest 
*req) static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req) { NvmeIdentify *c = (NvmeIdentify *)&req->cmd; + NvmeIdCtrlZoned id = {}; trace_pci_nvme_identify_ctrl_csi(c->csi); if (c->csi == NVME_CSI_NVM) { return nvme_rpt_empty_id_struct(n, req); + } else if (c->csi == NVME_CSI_ZONED) { + if (n->params.zasl_bs) { + id.zasl = n->zasl; + } + return nvme_dma(n, (uint8_t *)&id, sizeof(id), + DMA_DIRECTION_FROM_DEVICE, req); } return NVME_INVALID_FIELD | NVME_DNR; @@ -1837,8 +2580,12 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req) return nvme_rpt_empty_id_struct(n, req); } - return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), - DMA_DIRECTION_FROM_DEVICE, req); + if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { + return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), + DMA_DIRECTION_FROM_DEVICE, req); + } + + return NVME_INVALID_CMD_SET | NVME_DNR; } static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req) @@ -1858,8 +2605,11 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req) return nvme_rpt_empty_id_struct(n, req); } - if (c->csi == NVME_CSI_NVM) { + if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) { return nvme_rpt_empty_id_struct(n, req); + } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) { + return nvme_dma(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned), + DMA_DIRECTION_FROM_DEVICE, req); } return NVME_INVALID_FIELD | NVME_DNR; @@ -1923,7 +2673,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req) return NVME_INVALID_NSID | NVME_DNR; } - if (c->csi != NVME_CSI_NVM) { + if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) { return NVME_INVALID_FIELD | NVME_DNR; } @@ -1932,7 +2682,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req) if (!ns) { continue; } - if (ns->params.nsid <= min_nsid) { + if (ns->params.nsid <= min_nsid || c->csi != ns->csi) { continue; } list_ptr[j++] = cpu_to_le32(ns->params.nsid); @@ -1999,6 +2749,8 @@ static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req) trace_pci_nvme_identify_cmd_set(); NVME_SET_CSI(*list, NVME_CSI_NVM); + NVME_SET_CSI(*list, NVME_CSI_ZONED); + return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req); } @@ -2565,6 +3317,13 @@ static void nvme_select_ns_iocs(NvmeCtrl *n) ns->iocs = nvme_cse_iocs_nvm; } break; + case NVME_CSI_ZONED: + if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) { + ns->iocs = nvme_cse_iocs_zoned; + } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) { + ns->iocs = nvme_cse_iocs_nvm; + } + break; } } } @@ -2663,6 +3422,17 @@ static int nvme_start_ctrl(NvmeCtrl *n) nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0, NVME_AQA_ASQS(n->bar.aqa) + 1); + if (!n->params.zasl_bs) { + n->zasl = n->params.mdts; + } else { + if (n->params.zasl_bs < n->page_size) { + trace_pci_nvme_err_startfail_zasl_too_small(n->params.zasl_bs, + n->page_size); + return -1; + } + n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size); + } + nvme_set_timestamp(n, 0ULL); QTAILQ_INIT(&n->aer_queue); @@ -3087,6 +3857,13 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp) host_memory_backend_set_mapped(n->pmrdev, true); } + + if (n->params.zasl_bs) { + if (!is_power_of_2(n->params.zasl_bs)) { + error_setg(errp, "zone append size limit has to be a power of 2"); + return; + } + } } static void nvme_init_state(NvmeCtrl *n) @@ -3351,8 +4128,20 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) static void nvme_exit(PCIDevice *pci_dev) { NvmeCtrl *n = 
NVME(pci_dev); + NvmeNamespace *ns; + int i; nvme_ctrl_shutdown(n); + + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + + nvme_ns_cleanup(ns); + } + g_free(n->cq); g_free(n->sq); g_free(n->aer_reqs); @@ -3380,6 +4169,8 @@ static Property nvme_props[] = { DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64), DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7), DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false), + DEFINE_PROP_SIZE32("zoned.append_size_limit", NvmeCtrl, params.zasl_bs, + NVME_DEFAULT_MAX_ZA_SIZE), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 574333caa3..b7fbcca39d 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -6,6 +6,9 @@ #define NVME_MAX_NAMESPACES 256 +#define NVME_DEFAULT_ZONE_SIZE (128 * MiB) +#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB) + typedef struct NvmeParams { char *serial; uint32_t num_queues; /* deprecated since 5.1 */ @@ -16,6 +19,7 @@ typedef struct NvmeParams { uint32_t aer_max_queued; uint8_t mdts; bool use_intel_id; + uint32_t zasl_bs; } NvmeParams; typedef struct NvmeAsyncEvent { @@ -149,6 +153,8 @@ typedef struct NvmeCtrl { QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue; int aer_queued; + uint8_t zasl; + NvmeNamespace namespace; NvmeNamespace *namespaces[NVME_MAX_NAMESPACES]; NvmeSQueue **sq; diff --git a/hw/block/trace-events b/hw/block/trace-events index e56e2b66b0..64366d8936 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -95,6 +95,14 @@ pci_nvme_mmio_start_success(void) "setting controller enable bit succeeded" pci_nvme_mmio_stopped(void) "cleared controller enable bit" pci_nvme_mmio_shutdown_set(void) "shutdown bit set" pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared" +pci_nvme_open_zone(uint64_t slba, uint32_t zone_idx, int all) "open zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_close_zone(uint64_t slba, uint32_t zone_idx, int all) "close zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_reset_zone(uint64_t slba, uint32_t zone_idx, int all) "reset zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_offline_zone(uint64_t slba, uint32_t zone_idx, int all) "offline zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_set_descriptor_extension(uint64_t slba, uint32_t zone_idx) "set zone descriptor extension, slba=%"PRIu64", idx=%"PRIu32"" +pci_nvme_clear_ns_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Closed state" +pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Empty state" # nvme traces for error conditions pci_nvme_err_mdts(uint16_t cid, size_t len) "cid %"PRIu16" len %zu" @@ -113,6 +121,13 @@ pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "must be <= %"PRIu64", got %"PRIu64"" +pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t slba, uint64_t zslba) "unaligned zone op 0x%"PRIx32", got slba=%"PRIu64", zslba=%"PRIu64"" +pci_nvme_err_invalid_zone_state_transition(uint8_t action, uint64_t slba, uint8_t attrs) 
"action=0x%"PRIx8", slba=%"PRIu64", attrs=0x%"PRIx32"" +pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) "writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64"" +pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at slba=%"PRIu64", but zone=%"PRIu64"" +pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" +pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" +pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8"" pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination index %"PRIu32"" pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16"" pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16"" @@ -146,7 +161,9 @@ pci_nvme_err_startfail_sqent_too_large(uint8_t log2ps, uint8_t maxlog2ps) "nvme_ pci_nvme_err_startfail_css(uint8_t css) "nvme_start_ctrl failed because invalid command set selected:%u" pci_nvme_err_startfail_asqent_sz_zero(void) "nvme_start_ctrl failed because the admin submission queue size is zero" pci_nvme_err_startfail_acqent_sz_zero(void) "nvme_start_ctrl failed because the admin completion queue size is zero" +pci_nvme_err_startfail_zasl_too_small(uint32_t zasl, uint32_t pagesz) "nvme_start_ctrl failed because zone append size limit %"PRIu32" is too small, needs to be >= %"PRIu32"" pci_nvme_err_startfail(void) "setting controller enable bit failed" +pci_nvme_err_invalid_mgmt_action(uint8_t action) "action=0x%"PRIx8"" # Traces for undefined behavior pci_nvme_ub_mmiowr_misaligned32(uint64_t offset) "MMIO write not 32-bit aligned, offset=0x%"PRIx64"" From 8d18ddcd229753933a20a20dc54e52379ce43d27 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Wed, 9 Dec 2020 05:04:07 +0900 Subject: [PATCH 17/56] hw/block/nvme: Introduce max active and open zone limits Add two module properties, "zoned.max_active" and "zoned.max_open" to control the maximum number of zones that can be active or open. Once these variables are set to non-default values, these limits are checked during I/O and Too Many Active or Too Many Open command status is returned if they are exceeded. 
Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Reviewed-by: Niklas Cassel Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 31 ++++++++++++++- hw/block/nvme-ns.h | 41 +++++++++++++++++++ hw/block/nvme.c | 92 +++++++++++++++++++++++++++++++++++++++++++ hw/block/trace-events | 2 + 4 files changed, 164 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index d79452c627..c55afc1920 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -135,6 +135,21 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp) ns->zone_size = zone_size / lbasz; ns->zone_capacity = zone_cap / lbasz; ns->num_zones = ns->size / lbasz / ns->zone_size; + + /* Do a few more sanity checks of ZNS properties */ + if (ns->params.max_open_zones > ns->num_zones) { + error_setg(errp, + "max_open_zones value %u exceeds the number of zones %u", + ns->params.max_open_zones, ns->num_zones); + return -1; + } + if (ns->params.max_active_zones > ns->num_zones) { + error_setg(errp, + "max_active_zones value %u exceeds the number of zones %u", + ns->params.max_active_zones, ns->num_zones); + return -1; + } + return 0; } @@ -182,8 +197,8 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index) id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned)); /* MAR/MOR are zeroes-based, 0xffffffff means no limit */ - id_ns_z->mar = 0xffffffff; - id_ns_z->mor = 0xffffffff; + id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1); + id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1); id_ns_z->zoc = 0; id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00; @@ -209,6 +224,7 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone) trace_pci_nvme_clear_ns_close(state, zone->d.zslba); nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED); } + nvme_aor_inc_active(ns); QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry); } else { trace_pci_nvme_clear_ns_reset(state, zone->d.zslba); @@ -225,16 +241,23 @@ static void nvme_zoned_ns_shutdown(NvmeNamespace *ns) QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { QTAILQ_REMOVE(&ns->closed_zones, zone, entry); + nvme_aor_dec_active(ns); nvme_clear_zone(ns, zone); } QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); + nvme_aor_dec_open(ns); + nvme_aor_dec_active(ns); nvme_clear_zone(ns, zone); } QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry); + nvme_aor_dec_open(ns); + nvme_aor_dec_active(ns); nvme_clear_zone(ns, zone); } + + assert(ns->nr_open_zones == 0); } static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp) @@ -320,6 +343,10 @@ static Property nvme_ns_props[] = { 0), DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace, params.cross_zone_read, false), + DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace, + params.max_active_zones, 0), + DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace, + params.max_open_zones, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 388381dda0..7e1fd26909 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -33,6 +33,8 @@ typedef struct NvmeNamespaceParams { bool cross_zone_read; uint64_t zone_size_bs; uint64_t zone_cap_bs; + uint32_t max_active_zones; + uint32_t max_open_zones; } NvmeNamespaceParams; typedef struct NvmeNamespace { @@ -54,6 +56,8 @@ typedef struct NvmeNamespace { uint64_t zone_size; uint64_t zone_capacity; uint32_t zone_size_log2; + 
int32_t nr_open_zones; + int32_t nr_active_zones; NvmeNamespaceParams params; @@ -125,6 +129,43 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone) st != NVME_ZONE_STATE_OFFLINE; } +static inline void nvme_aor_inc_open(NvmeNamespace *ns) +{ + assert(ns->nr_open_zones >= 0); + if (ns->params.max_open_zones) { + ns->nr_open_zones++; + assert(ns->nr_open_zones <= ns->params.max_open_zones); + } +} + +static inline void nvme_aor_dec_open(NvmeNamespace *ns) +{ + if (ns->params.max_open_zones) { + assert(ns->nr_open_zones > 0); + ns->nr_open_zones--; + } + assert(ns->nr_open_zones >= 0); +} + +static inline void nvme_aor_inc_active(NvmeNamespace *ns) +{ + assert(ns->nr_active_zones >= 0); + if (ns->params.max_active_zones) { + ns->nr_active_zones++; + assert(ns->nr_active_zones <= ns->params.max_active_zones); + } +} + +static inline void nvme_aor_dec_active(NvmeNamespace *ns) +{ + if (ns->params.max_active_zones) { + assert(ns->nr_active_zones > 0); + ns->nr_active_zones--; + assert(ns->nr_active_zones >= ns->nr_open_zones); + } + assert(ns->nr_active_zones >= 0); +} + int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp); void nvme_ns_drain(NvmeNamespace *ns); void nvme_ns_shutdown(NvmeNamespace *ns); diff --git a/hw/block/nvme.c b/hw/block/nvme.c index be83be3fb5..c07dbcd2a8 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -206,6 +206,26 @@ static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone, } } +/* + * Check if we can open a zone without exceeding open/active limits. + * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5). + */ +static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn) +{ + if (ns->params.max_active_zones != 0 && + ns->nr_active_zones + act > ns->params.max_active_zones) { + trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones); + return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR; + } + if (ns->params.max_open_zones != 0 && + ns->nr_open_zones + opn > ns->params.max_open_zones) { + trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones); + return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR; + } + + return NVME_SUCCESS; +} + static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) { hwaddr low = n->ctrl_mem.addr; @@ -1168,6 +1188,40 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba, return status; } +static void nvme_auto_transition_zone(NvmeNamespace *ns) +{ + NvmeZone *zone; + + if (ns->params.max_open_zones && + ns->nr_open_zones == ns->params.max_open_zones) { + zone = QTAILQ_FIRST(&ns->imp_open_zones); + if (zone) { + /* + * Automatically close this implicitly open zone. 
+ */ + QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); + nvme_aor_dec_open(ns); + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); + } + } +} + +static uint16_t nvme_auto_open_zone(NvmeNamespace *ns, NvmeZone *zone) +{ + uint16_t status = NVME_SUCCESS; + uint8_t zs = nvme_get_zone_state(zone); + + if (zs == NVME_ZONE_STATE_EMPTY) { + nvme_auto_transition_zone(ns); + status = nvme_aor_check(ns, 1, 1); + } else if (zs == NVME_ZONE_STATE_CLOSED) { + nvme_auto_transition_zone(ns); + status = nvme_aor_check(ns, 0, 1); + } + + return status; +} + static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req, bool failed) { @@ -1188,7 +1242,11 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req, switch (nvme_get_zone_state(zone)) { case NVME_ZONE_STATE_IMPLICITLY_OPEN: case NVME_ZONE_STATE_EXPLICITLY_OPEN: + nvme_aor_dec_open(ns); + /* fall through */ case NVME_ZONE_STATE_CLOSED: + nvme_aor_dec_active(ns); + /* fall through */ case NVME_ZONE_STATE_EMPTY: nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL); /* fall through */ @@ -1215,7 +1273,10 @@ static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone, zs = nvme_get_zone_state(zone); switch (zs) { case NVME_ZONE_STATE_EMPTY: + nvme_aor_inc_active(ns); + /* fall through */ case NVME_ZONE_STATE_CLOSED: + nvme_aor_inc_open(ns); nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN); } } @@ -1556,6 +1617,11 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append, goto invalid; } + status = nvme_auto_open_zone(ns, zone); + if (status != NVME_SUCCESS) { + goto invalid; + } + if (append) { slba = zone->w_ptr; } @@ -1651,9 +1717,26 @@ enum NvmeZoneProcessingMask { static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, enum NvmeZoneState state) { + uint16_t status; + switch (state) { case NVME_ZONE_STATE_EMPTY: + status = nvme_aor_check(ns, 1, 0); + if (status != NVME_SUCCESS) { + return status; + } + nvme_aor_inc_active(ns); + /* fall through */ case NVME_ZONE_STATE_CLOSED: + status = nvme_aor_check(ns, 0, 1); + if (status != NVME_SUCCESS) { + if (state == NVME_ZONE_STATE_EMPTY) { + nvme_aor_dec_active(ns); + } + return status; + } + nvme_aor_inc_open(ns); + /* fall through */ case NVME_ZONE_STATE_IMPLICITLY_OPEN: nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN); /* fall through */ @@ -1670,6 +1753,7 @@ static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, switch (state) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_aor_dec_open(ns); nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); /* fall through */ case NVME_ZONE_STATE_CLOSED: @@ -1685,7 +1769,11 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, switch (state) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_aor_dec_open(ns); + /* fall through */ case NVME_ZONE_STATE_CLOSED: + nvme_aor_dec_active(ns); + /* fall through */ case NVME_ZONE_STATE_EMPTY: zone->w_ptr = nvme_zone_wr_boundary(zone); zone->d.wp = zone->w_ptr; @@ -1704,7 +1792,11 @@ static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, switch (state) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_aor_dec_open(ns); + /* fall through */ case NVME_ZONE_STATE_CLOSED: + nvme_aor_dec_active(ns); + /* fall through */ case NVME_ZONE_STATE_FULL: zone->w_ptr = zone->d.zslba; zone->d.wp = zone->w_ptr; diff --git a/hw/block/trace-events b/hw/block/trace-events index 
64366d8936..bbc10bc737 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -128,6 +128,8 @@ pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at slb pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8"" +pci_nvme_err_insuff_active_res(uint32_t max_active) "max_active=%"PRIu32" zone limit exceeded" +pci_nvme_err_insuff_open_res(uint32_t max_open) "max_open=%"PRIu32" zone limit exceeded" pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination index %"PRIu32"" pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16"" pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16"" From 1a9290ade33a8993ec0ceca8da36e504ec356ed0 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Wed, 9 Dec 2020 05:04:08 +0900 Subject: [PATCH 18/56] hw/block/nvme: Support Zone Descriptor Extensions Zone Descriptor Extension is a label that can be assigned to a zone. It can be set to an Empty zone and it stays assigned until the zone is reset. This commit adds a new optional module property, "zoned.descr_ext_size". Its value must be a multiple of 64 bytes. If this value is non-zero, it becomes possible to assign extensions of that size to any Empty zones. The default value for this property is 0, therefore setting extensions is disabled by default. Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Reviewed-by: Niklas Cassel Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 25 ++++++++++++++++++-- hw/block/nvme-ns.h | 8 +++++++ hw/block/nvme.c | 53 +++++++++++++++++++++++++++++++++++++++++-- hw/block/trace-events | 2 ++ 4 files changed, 84 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index c55afc1920..838b15c064 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -150,6 +150,18 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp) return -1; } + if (ns->params.zd_extension_size) { + if (ns->params.zd_extension_size & 0x3f) { + error_setg(errp, + "zone descriptor extension size must be a multiple of 64B"); + return -1; + } + if ((ns->params.zd_extension_size >> 6) > 0xff) { + error_setg(errp, "zone descriptor extension size is too large"); + return -1; + } + } + return 0; } @@ -161,6 +173,10 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns) int i; ns->zone_array = g_new0(NvmeZone, ns->num_zones); + if (ns->params.zd_extension_size) { + ns->zd_extensions = g_malloc0(ns->params.zd_extension_size * + ns->num_zones); + } QTAILQ_INIT(&ns->exp_open_zones); QTAILQ_INIT(&ns->imp_open_zones); @@ -203,7 +219,8 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index) id_ns_z->ozcs = ns->params.cross_zone_read ? 
0x01 : 0x00; id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size); - id_ns_z->lbafe[lba_index].zdes = 0; + id_ns_z->lbafe[lba_index].zdes = + ns->params.zd_extension_size >> 6; /* Units of 64B */ ns->csi = NVME_CSI_ZONED; ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size); @@ -219,7 +236,8 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone) zone->w_ptr = zone->d.wp; state = nvme_get_zone_state(zone); - if (zone->d.wp != zone->d.zslba) { + if (zone->d.wp != zone->d.zslba || + (zone->d.za & NVME_ZA_ZD_EXT_VALID)) { if (state != NVME_ZONE_STATE_CLOSED) { trace_pci_nvme_clear_ns_close(state, zone->d.zslba); nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED); @@ -315,6 +333,7 @@ void nvme_ns_cleanup(NvmeNamespace *ns) if (ns->params.zoned) { g_free(ns->id_ns_zoned); g_free(ns->zone_array); + g_free(ns->zd_extensions); } } @@ -347,6 +366,8 @@ static Property nvme_ns_props[] = { params.max_active_zones, 0), DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace, params.max_open_zones, 0), + DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace, + params.zd_extension_size, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index 7e1fd26909..f8f3c28c36 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -35,6 +35,7 @@ typedef struct NvmeNamespaceParams { uint64_t zone_cap_bs; uint32_t max_active_zones; uint32_t max_open_zones; + uint32_t zd_extension_size; } NvmeNamespaceParams; typedef struct NvmeNamespace { @@ -56,6 +57,7 @@ typedef struct NvmeNamespace { uint64_t zone_size; uint64_t zone_capacity; uint32_t zone_size_log2; + uint8_t *zd_extensions; int32_t nr_open_zones; int32_t nr_active_zones; @@ -129,6 +131,12 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone) st != NVME_ZONE_STATE_OFFLINE; } +static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns, + uint32_t zone_idx) +{ + return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size]; +} + static inline void nvme_aor_inc_open(NvmeNamespace *ns) { assert(ns->nr_open_zones >= 0); diff --git a/hw/block/nvme.c b/hw/block/nvme.c index c07dbcd2a8..4bcc766073 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1823,6 +1823,25 @@ static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone, } } +static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone) +{ + uint16_t status; + uint8_t state = nvme_get_zone_state(zone); + + if (state == NVME_ZONE_STATE_EMPTY) { + status = nvme_aor_check(ns, 1, 0); + if (status != NVME_SUCCESS) { + return status; + } + nvme_aor_inc_active(ns); + zone->d.za |= NVME_ZA_ZD_EXT_VALID; + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED); + return NVME_SUCCESS; + } + + return NVME_ZONE_INVAL_TRANSITION; +} + static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, enum NvmeZoneProcessingMask proc_mask, op_handler_t op_hndlr) @@ -1921,6 +1940,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) NvmeCmd *cmd = (NvmeCmd *)&req->cmd; NvmeNamespace *ns = req->ns; NvmeZone *zone; + uint8_t *zd_ext; uint32_t dw13 = le32_to_cpu(cmd->cdw13); uint64_t slba = 0; uint32_t zone_idx = 0; @@ -1993,7 +2013,22 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) case NVME_ZONE_ACTION_SET_ZD_EXT: trace_pci_nvme_set_descriptor_extension(slba, zone_idx); - return NVME_INVALID_FIELD | NVME_DNR; + if (all || !ns->params.zd_extension_size) { + return NVME_INVALID_FIELD | NVME_DNR; + } + zd_ext = nvme_get_zd_extension(ns, zone_idx); + status = nvme_dma(n, zd_ext, 
ns->params.zd_extension_size, + DMA_DIRECTION_TO_DEVICE, req); + if (status) { + trace_pci_nvme_err_zd_extension_map_error(zone_idx); + return status; + } + + status = nvme_set_zd_ext(ns, zone); + if (status == NVME_SUCCESS) { + trace_pci_nvme_zd_extension_set(zone_idx); + return status; + } break; default: @@ -2063,7 +2098,10 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) } zra = dw13 & 0xff; - if (zra != NVME_ZONE_REPORT) { + if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) { + return NVME_INVALID_FIELD | NVME_DNR; + } + if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) { return NVME_INVALID_FIELD | NVME_DNR; } @@ -2085,6 +2123,9 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) partial = (dw13 >> 16) & 0x01; zone_entry_sz = sizeof(NvmeZoneDescr); + if (zra == NVME_ZONE_REPORT_EXTENDED) { + zone_entry_sz += ns->params.zd_extension_size; + } max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz; buf = g_malloc0(data_size); @@ -2120,6 +2161,14 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req) z->wp = cpu_to_le64(~0ULL); } + if (zra == NVME_ZONE_REPORT_EXTENDED) { + if (zone->d.za & NVME_ZA_ZD_EXT_VALID) { + memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx), + ns->params.zd_extension_size); + } + buf_p += ns->params.zd_extension_size; + } + max_zones--; } } diff --git a/hw/block/trace-events b/hw/block/trace-events index bbc10bc737..a455979e59 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -101,6 +101,7 @@ pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, sl pci_nvme_reset_zone(uint64_t slba, uint32_t zone_idx, int all) "reset zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" pci_nvme_offline_zone(uint64_t slba, uint32_t zone_idx, int all) "offline zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" pci_nvme_set_descriptor_extension(uint64_t slba, uint32_t zone_idx) "set zone descriptor extension, slba=%"PRIu64", idx=%"PRIu32"" +pci_nvme_zd_extension_set(uint32_t zone_idx) "set descriptor extension for zone_idx=%"PRIu32"" pci_nvme_clear_ns_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Closed state" pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Empty state" @@ -130,6 +131,7 @@ pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slb pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8"" pci_nvme_err_insuff_active_res(uint32_t max_active) "max_active=%"PRIu32" zone limit exceeded" pci_nvme_err_insuff_open_res(uint32_t max_open) "max_open=%"PRIu32" zone limit exceeded" +pci_nvme_err_zd_extension_map_error(uint32_t zone_idx) "can't map descriptor extension for zone_idx=%"PRIu32"" pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination index %"PRIu32"" pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16"" pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16"" From 00dd640dff7d8a294be8ab6afcff2355c1d664a7 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Wed, 9 Dec 2020 05:04:10 +0900 Subject: [PATCH 19/56] hw/block/nvme: Document zoned parameters in usage text Added brief descriptions of the new device properties that are now available to users to configure features of Zoned Namespace Command Set in the emulator. 
This patch is for documentation only, no functionality change.

Signed-off-by: Dmitry Fomichev
Reviewed-by: Niklas Cassel
Reviewed-by: Keith Busch
Signed-off-by: Klaus Jensen
---
 hw/block/nvme.c | 43 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 4bcc766073..f4f1487afe 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -9,7 +9,7 @@
  */

 /**
- * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
+ * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
 *
 *  https://nvmexpress.org/developers/nvme-specification/
 */
@@ -22,8 +22,9 @@
 *              [pmrdev=<mem_backend_file_id>,] \
 *              max_ioqpairs=<N[optional]>, \
 *              aerl=<N[optional]>, aer_max_queued=<N[optional]>, \
- *              mdts=<N[optional]>
- *      -device nvme-ns,drive=<drive_id>,bus=bus_name,nsid=<nsid>
+ *              mdts=<N[optional]>,zoned.append_size_limit=<N[optional]> \
+ *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
+ *              zoned=<true|false[optional]>
 *
 * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -41,14 +42,46 @@
 * ~~~~~~~~~~~~~~~~~~~~~~
 * - `aerl`
 *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
- *   of concurrently outstanding Asynchronous Event Request commands suppoert
+ *   of concurrently outstanding Asynchronous Event Request commands supported
 *   by the controller. This is a 0's based value.
 *
 * - `aer_max_queued`
 *   This is the maximum number of events that the device will enqueue for
- *   completion when there are no oustanding AERs. When the maximum number of
+ *   completion when there are no outstanding AERs. When the maximum number of
 *   enqueued events are reached, subsequent events will be dropped.
 *
+ * - `zoned.append_size_limit`
+ *   The maximum I/O size in bytes that is allowed in Zone Append command.
+ *   The default is 128KiB. Since internally this value is maintained as
+ *   ZASL = log2(<maximum append size> / <page size>), some values assigned
+ *   to this property may be rounded down and result in a lower maximum ZA
+ *   data size being in effect. By setting this property to 0, users can make
+ *   ZASL equal to MDTS. This property only affects zoned namespaces.
+ *
+ * Setting `zoned` to true selects Zoned Command Set at the namespace.
+ * In this case, the following namespace properties are available to configure
+ * zoned operation:
+ *     zoned.zone_size=<zone size in bytes, default: 128MiB>
+ *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
+ *
+ *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
+ *         The value 0 (default) forces zone capacity to be the same as zone
+ *         size. The value of this property may not exceed zone size.
+ *
+ *     zoned.descr_ext_size=<zone descriptor extension size, default: 0>
+ *         This value needs to be specified in 64B units. If it is zero,
+ *         namespace(s) will not support zone descriptor extensions.
+ *
+ *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
+ *         The default value means there is no limit to the number of
+ *         concurrently active zones.
+ *
+ *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
+ *         The default value means there is no limit to the number of
+ *         concurrently open zones.
+ *
+ *     zoned.cross_zone_read=<true|false, default: false>
+ *         Setting this property to true enables Read Across Zone Boundaries.
 */

#include "qemu/osdep.h"

From 55886345d01d72a03cbfaf1c4d89f967174dedd9 Mon Sep 17 00:00:00 2001
From: Klaus Jensen
Date: Tue, 12 Jan 2021 13:30:26 +0100
Subject: [PATCH 20/56] hw/block/nvme: fix for non-msix machines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 1c0c2163aa08 ("hw/block/nvme: verify msix_init_exclusive_bar() return value") had the unintended effect of breaking support on several platforms not supporting MSI-X.
Still check for errors, but only report that MSI-X is unsupported instead of bailing out. Fixes: 1c0c2163aa08 ("hw/block/nvme: verify msix_init_exclusive_bar() return value") Fixes: fbf2e5375e33 ("hw/block/nvme: Verify msix_vector_use() returned value") Reported-by: Guenter Roeck Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index f4f1487afe..b0b7abf331 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2590,7 +2590,9 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) { n->cq[cq->cqid] = NULL; timer_free(cq->timer); - msix_vector_unuse(&n->parent_obj, cq->vector); + if (msix_enabled(&n->parent_obj)) { + msix_vector_unuse(&n->parent_obj, cq->vector); + } if (cq->cqid) { g_free(cq); } @@ -2624,8 +2626,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, { int ret; - ret = msix_vector_use(&n->parent_obj, vector); - assert(ret == 0); + if (msix_enabled(&n->parent_obj)) { + ret = msix_vector_use(&n->parent_obj, vector); + assert(ret == 0); + } cq->ctrl = n; cq->cqid = cqid; cq->size = size; @@ -4161,9 +4165,12 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr); } -static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) +static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) { uint8_t *pci_conf = pci_dev->config; + int ret; + + Error *err = NULL; pci_conf[PCI_INTERRUPT_PIN] = 1; pci_config_set_prog_interface(pci_conf, 0x2); @@ -4183,8 +4190,14 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) n->reg_size); pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem); - if (msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp)) { - return; + ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, &err); + if (ret < 0) { + if (ret == -ENOTSUP) { + warn_report_err(err); + } else { + error_propagate(errp, err); + return ret; + } } if (n->params.cmb_size_mb) { @@ -4192,6 +4205,8 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) } else if (n->pmrdev) { nvme_init_pmr(n, pci_dev); } + + return 0; } static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) @@ -4280,9 +4295,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) &pci_dev->qdev, n->parent_obj.qdev.id); nvme_init_state(n); - nvme_init_pci(n, pci_dev, &local_err); - if (local_err) { - error_propagate(errp, local_err); + if (nvme_init_pci(n, pci_dev, errp)) { return; } From 1b5804a80d8bd6ecb8910e864afb89049279df17 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 11 Jan 2021 13:52:40 +0100 Subject: [PATCH 21/56] hw/block/nvme: conditionally enable DULBE for zoned namespaces The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated" status of logical blocks. Since the zoned namespaces command set specification defines that logical blocks SHALL be marked as deallocated when the zone is in the Empty or Offline states, DULBE can only be supported if the zone size is a multiple of the calculated deallocation granularity (reported in NPDG) which depends on the underlying block device cluster size (if applicable) or the configured discard_granularity. 
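As a worked example of that rule (the numbers here are hypothetical, not
taken from the patch): with 4KiB logical blocks and a 64KiB deallocation
granularity, NPDG + 1 is 16 blocks; a 128MiB zone is 32768 blocks, which
divides evenly, so DULBE stays enabled. A minimal sketch of the check,
assuming the zeroes-based NPDG field from the Identify Namespace data:

    #include <stdbool.h>
    #include <stdint.h>

    /* Granularity in logical blocks is NPDG + 1 (the field is zeroes-based). */
    static bool dulbe_supported(uint64_t zone_size_blks, uint16_t npdg)
    {
        return zone_size_blks % (uint64_t)(npdg + 1) == 0;
    }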
Reviewed-by: Keith Busch
Signed-off-by: Klaus Jensen
---
 hw/block/nvme-ns.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 838b15c064..9be170abb7 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -16,6 +16,7 @@
 #include "qemu/units.h"
 #include "qemu/cutils.h"
 #include "qemu/log.h"
+#include "qemu/error-report.h"
 #include "hw/block/block.h"
 #include "hw/pci/pci.h"
 #include "sysemu/sysemu.h"
@@ -227,6 +228,22 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index)
     ns->id_ns.ncap = ns->id_ns.nsze;
     ns->id_ns.nuse = ns->id_ns.ncap;

+    /*
+     * The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated"
+     * status of logical blocks. Since the spec defines that logical blocks
+     * SHALL be deallocated when the zone is in the Empty or Offline states,
+     * we can only support DULBE if the zone size is a multiple of the
+     * calculated NPDG.
+     */
+    if (ns->zone_size % (ns->id_ns.npdg + 1)) {
+        warn_report("the zone size (%"PRIu64" blocks) is not a multiple of "
+                    "the calculated deallocation granularity (%d blocks); "
+                    "DULBE support disabled",
+                    ns->zone_size, ns->id_ns.npdg + 1);
+
+        ns->id_ns.nsfeat &= ~0x4;
+    }
+
     ns->id_ns_zoned = id_ns_z;
 }

From 165f134f3d8f62cb74ab2f53e3b72158be0cfb31 Mon Sep 17 00:00:00 2001
From: Klaus Jensen
Date: Wed, 9 Dec 2020 13:10:45 +0100
Subject: [PATCH 22/56] hw/block/nvme: fix shutdown/reset logic

A shutdown is only about flushing stuff. It is the host that should delete any queues, so do not perform a reset here.

Also, on shutdown, make sure that the PMR is flushed if in use.

Fixes: 368f4e752cf9 ("hw/block/nvme: Process controller reset and shutdown differently")
Signed-off-by: Klaus Jensen
Reviewed-by: Keith Busch
Tested-by: Dmitry Fomichev
Reviewed-by: Dmitry Fomichev
---
 hw/block/nvme.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b0b7abf331..551878338e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3419,7 +3419,7 @@ static void nvme_process_sq(void *opaque)
     }
 }

-static void nvme_clear_ctrl(NvmeCtrl *n)
+static void nvme_ctrl_reset(NvmeCtrl *n)
 {
     NvmeNamespace *ns;
     int i;
@@ -3453,11 +3453,7 @@
     n->aer_queued = 0;
     n->outstanding_aers = 0;
     n->qs_created = false;
-}

-static void nvme_ctrl_reset(NvmeCtrl *n)
-{
-    nvme_clear_ctrl(n);
     n->bar.cc = 0;
 }

@@ -3466,7 +3462,9 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n)
 {
     NvmeNamespace *ns;
     int i;

-    nvme_clear_ctrl(n);
+    if (n->pmrdev) {
+        memory_region_msync(&n->pmrdev->mr, 0, n->pmrdev->size);
+    }

     for (i = 1; i <= n->num_namespaces; i++) {
         ns = nvme_ns(n, i);
@@ -4318,7 +4316,7 @@ static void nvme_exit(PCIDevice *pci_dev)
     NvmeNamespace *ns;
     int i;

-    nvme_ctrl_shutdown(n);
+    nvme_ctrl_reset(n);

     for (i = 1; i <= n->num_namespaces; i++) {
         ns = nvme_ns(n, i);

From 57206696058e3a8b1ee6ec8a129722baa6e7ec9f Mon Sep 17 00:00:00 2001
From: Klaus Jensen
Date: Wed, 9 Dec 2020 23:00:20 +0100
Subject: [PATCH 23/56] hw/block/nvme: merge implicitly/explicitly opened processing masks

Implicitly and explicitly opened zones are always bulk processed together, so merge the two processing masks.
Signed-off-by: Klaus Jensen Tested-by: Dmitry Fomichev Reviewed-by: Dmitry Fomichev --- hw/block/nvme.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 551878338e..a7245a7e05 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1740,11 +1740,10 @@ typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, enum NvmeZoneProcessingMask { NVME_PROC_CURRENT_ZONE = 0, - NVME_PROC_IMP_OPEN_ZONES = 1 << 0, - NVME_PROC_EXP_OPEN_ZONES = 1 << 1, - NVME_PROC_CLOSED_ZONES = 1 << 2, - NVME_PROC_READ_ONLY_ZONES = 1 << 3, - NVME_PROC_FULL_ZONES = 1 << 4, + NVME_PROC_OPENED_ZONES = 1 << 0, + NVME_PROC_CLOSED_ZONES = 1 << 1, + NVME_PROC_READ_ONLY_ZONES = 1 << 2, + NVME_PROC_FULL_ZONES = 1 << 3, }; static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, @@ -1885,10 +1884,8 @@ static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, switch (zs) { case NVME_ZONE_STATE_IMPLICITLY_OPEN: - proc_zone = proc_mask & NVME_PROC_IMP_OPEN_ZONES; - break; case NVME_ZONE_STATE_EXPLICITLY_OPEN: - proc_zone = proc_mask & NVME_PROC_EXP_OPEN_ZONES; + proc_zone = proc_mask & NVME_PROC_OPENED_ZONES; break; case NVME_ZONE_STATE_CLOSED: proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES; @@ -1929,15 +1926,14 @@ static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone, } } } - if (proc_mask & NVME_PROC_IMP_OPEN_ZONES) { + if (proc_mask & NVME_PROC_OPENED_ZONES) { QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); if (status != NVME_SUCCESS) { goto out; } } - } - if (proc_mask & NVME_PROC_EXP_OPEN_ZONES) { + QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); if (status != NVME_SUCCESS) { @@ -2012,7 +2008,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) case NVME_ZONE_ACTION_CLOSE: if (all) { - proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES; + proc_mask = NVME_PROC_OPENED_ZONES; } trace_pci_nvme_close_zone(slba, zone_idx, all); status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone); @@ -2020,8 +2016,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) case NVME_ZONE_ACTION_FINISH: if (all) { - proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES | - NVME_PROC_CLOSED_ZONES; + proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES; } trace_pci_nvme_finish_zone(slba, zone_idx, all); status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone); @@ -2029,8 +2024,8 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) case NVME_ZONE_ACTION_RESET: if (all) { - proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES | - NVME_PROC_CLOSED_ZONES | NVME_PROC_FULL_ZONES; + proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES | + NVME_PROC_FULL_ZONES; } trace_pci_nvme_reset_zone(slba, zone_idx, all); status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone); From b05fde2881118e7e6e66b352e3a7ea1b8f52e1ad Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 9 Dec 2020 23:12:49 +0100 Subject: [PATCH 24/56] hw/block/nvme: enum style fix Align with existing style and use a typedef for header-file enums. 
Signed-off-by: Klaus Jensen Tested-by: Dmitry Fomichev Reviewed-by: Dmitry Fomichev --- hw/block/nvme-ns.h | 4 ++-- hw/block/nvme.c | 19 +++++++++---------- include/block/nvme.h | 4 ++-- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index f8f3c28c36..a0baa5f6d4 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -102,12 +102,12 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba) typedef struct NvmeCtrl NvmeCtrl; -static inline enum NvmeZoneState nvme_get_zone_state(NvmeZone *zone) +static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone) { return zone->d.zs >> 4; } -static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState state) +static inline void nvme_set_zone_state(NvmeZone *zone, NvmeZoneState state) { zone->d.zs = state << 4; } diff --git a/hw/block/nvme.c b/hw/block/nvme.c index a7245a7e05..a5cf798bbb 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -198,7 +198,7 @@ static uint16_t nvme_sqid(NvmeRequest *req) } static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone, - enum NvmeZoneState state) + NvmeZoneState state) { if (QTAILQ_IN_USE(zone, entry)) { switch (nvme_get_zone_state(zone)) { @@ -1735,8 +1735,7 @@ static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c, return NVME_SUCCESS; } -typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, - enum NvmeZoneState); +typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState); enum NvmeZoneProcessingMask { NVME_PROC_CURRENT_ZONE = 0, @@ -1747,7 +1746,7 @@ enum NvmeZoneProcessingMask { }; static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, - enum NvmeZoneState state) + NvmeZoneState state) { uint16_t status; @@ -1780,7 +1779,7 @@ static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, } static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, - enum NvmeZoneState state) + NvmeZoneState state) { switch (state) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: @@ -1796,7 +1795,7 @@ static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, } static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, - enum NvmeZoneState state) + NvmeZoneState state) { switch (state) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: @@ -1819,7 +1818,7 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, } static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, - enum NvmeZoneState state) + NvmeZoneState state) { switch (state) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: @@ -1842,7 +1841,7 @@ static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, } static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone, - enum NvmeZoneState state) + NvmeZoneState state) { switch (state) { case NVME_ZONE_STATE_READ_ONLY: @@ -1879,7 +1878,7 @@ static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, op_handler_t op_hndlr) { uint16_t status = NVME_SUCCESS; - enum NvmeZoneState zs = nvme_get_zone_state(zone); + NvmeZoneState zs = nvme_get_zone_state(zone); bool proc_zone; switch (zs) { @@ -2077,7 +2076,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl) { - enum NvmeZoneState zs = nvme_get_zone_state(zl); + NvmeZoneState zs = nvme_get_zone_state(zl); switch (zafs) { case NVME_ZONE_REPORT_ALL: diff --git a/include/block/nvme.h b/include/block/nvme.h index 9494246f1f..45b2678db1 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h 
@@ -1212,7 +1212,7 @@ typedef struct QEMU_PACKED NvmeZoneDescr { uint8_t rsvd32[32]; } NvmeZoneDescr; -enum NvmeZoneState { +typedef enum NvmeZoneState { NVME_ZONE_STATE_RESERVED = 0x00, NVME_ZONE_STATE_EMPTY = 0x01, NVME_ZONE_STATE_IMPLICITLY_OPEN = 0x02, @@ -1221,7 +1221,7 @@ enum NvmeZoneState { NVME_ZONE_STATE_READ_ONLY = 0x0D, NVME_ZONE_STATE_FULL = 0x0E, NVME_ZONE_STATE_OFFLINE = 0x0F, -}; +} NvmeZoneState; static inline void _nvme_check_size(void) { From 5f5dc4c6a9426d6a1fe69ea1b539721f5eab7176 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 9 Dec 2020 23:43:15 +0100 Subject: [PATCH 25/56] hw/block/nvme: zero out zones on reset The zoned command set specification states that "All logical blocks in a zone *shall* be marked as deallocated when [the zone is reset]". Since the device guarantees 0x00 to be read from deallocated blocks we have to issue a pwrite_zeroes since we cannot be sure that a discard will do anything. But typically, this will be achieved with an efficient unmap/discard operation. Signed-off-by: Klaus Jensen Tested-by: Dmitry Fomichev Reviewed-by: Dmitry Fomichev --- hw/block/nvme.c | 150 +++++++++++++++++++++++++++++++----------- hw/block/trace-events | 1 + 2 files changed, 113 insertions(+), 38 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index a5cf798bbb..7222eff755 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1371,6 +1371,53 @@ static void nvme_aio_discard_cb(void *opaque, int ret) nvme_enqueue_req_completion(nvme_cq(req), req); } +struct nvme_zone_reset_ctx { + NvmeRequest *req; + NvmeZone *zone; +}; + +static void nvme_aio_zone_reset_cb(void *opaque, int ret) +{ + struct nvme_zone_reset_ctx *ctx = opaque; + NvmeRequest *req = ctx->req; + NvmeNamespace *ns = req->ns; + NvmeZone *zone = ctx->zone; + uintptr_t *resets = (uintptr_t *)&req->opaque; + + g_free(ctx); + + trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba); + + if (!ret) { + switch (nvme_get_zone_state(zone)) { + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + nvme_aor_dec_open(ns); + /* fall through */ + case NVME_ZONE_STATE_CLOSED: + nvme_aor_dec_active(ns); + /* fall through */ + case NVME_ZONE_STATE_FULL: + zone->w_ptr = zone->d.zslba; + zone->d.wp = zone->w_ptr; + nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY); + /* fall through */ + default: + break; + } + } else { + nvme_aio_err(req, ret); + } + + (*resets)--; + + if (*resets) { + return; + } + + nvme_enqueue_req_completion(nvme_cq(req), req); +} + struct nvme_compare_ctx { QEMUIOVector iov; uint8_t *bounce; @@ -1735,7 +1782,8 @@ static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c, return NVME_SUCCESS; } -typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState); +typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState, + NvmeRequest *); enum NvmeZoneProcessingMask { NVME_PROC_CURRENT_ZONE = 0, @@ -1746,7 +1794,7 @@ enum NvmeZoneProcessingMask { }; static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, - NvmeZoneState state) + NvmeZoneState state, NvmeRequest *req) { uint16_t status; @@ -1779,7 +1827,7 @@ static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone, } static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, - NvmeZoneState state) + NvmeZoneState state, NvmeRequest *req) { switch (state) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: @@ -1795,7 +1843,7 @@ static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone, } static uint16_t nvme_finish_zone(NvmeNamespace 
*ns, NvmeZone *zone, - NvmeZoneState state) + NvmeZoneState state, NvmeRequest *req) { switch (state) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: @@ -1818,30 +1866,42 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone, } static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone, - NvmeZoneState state) + NvmeZoneState state, NvmeRequest *req) { + uintptr_t *resets = (uintptr_t *)&req->opaque; + struct nvme_zone_reset_ctx *ctx; + switch (state) { - case NVME_ZONE_STATE_EXPLICITLY_OPEN: - case NVME_ZONE_STATE_IMPLICITLY_OPEN: - nvme_aor_dec_open(ns); - /* fall through */ - case NVME_ZONE_STATE_CLOSED: - nvme_aor_dec_active(ns); - /* fall through */ - case NVME_ZONE_STATE_FULL: - zone->w_ptr = zone->d.zslba; - zone->d.wp = zone->w_ptr; - nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY); - /* fall through */ case NVME_ZONE_STATE_EMPTY: return NVME_SUCCESS; + case NVME_ZONE_STATE_EXPLICITLY_OPEN: + case NVME_ZONE_STATE_IMPLICITLY_OPEN: + case NVME_ZONE_STATE_CLOSED: + case NVME_ZONE_STATE_FULL: + break; default: return NVME_ZONE_INVAL_TRANSITION; } + + /* + * The zone reset aio callback needs to know the zone that is being reset + * in order to transition the zone on completion. + */ + ctx = g_new(struct nvme_zone_reset_ctx, 1); + ctx->req = req; + ctx->zone = zone; + + (*resets)++; + + blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba), + nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP, + nvme_aio_zone_reset_cb, ctx); + + return NVME_NO_COMPLETE; } static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone, - NvmeZoneState state) + NvmeZoneState state, NvmeRequest *req) { switch (state) { case NVME_ZONE_STATE_READ_ONLY: @@ -1875,7 +1935,7 @@ static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone) static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, enum NvmeZoneProcessingMask proc_mask, - op_handler_t op_hndlr) + op_handler_t op_hndlr, NvmeRequest *req) { uint16_t status = NVME_SUCCESS; NvmeZoneState zs = nvme_get_zone_state(zone); @@ -1900,7 +1960,7 @@ static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, } if (proc_zone) { - status = op_hndlr(ns, zone, zs); + status = op_hndlr(ns, zone, zs, req); } return status; @@ -1908,42 +1968,46 @@ static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone, static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone, enum NvmeZoneProcessingMask proc_mask, - op_handler_t op_hndlr) + op_handler_t op_hndlr, NvmeRequest *req) { NvmeZone *next; uint16_t status = NVME_SUCCESS; int i; if (!proc_mask) { - status = op_hndlr(ns, zone, nvme_get_zone_state(zone)); + status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req); } else { if (proc_mask & NVME_PROC_CLOSED_ZONES) { QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { - status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); - if (status != NVME_SUCCESS) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, + req); + if (status && status != NVME_NO_COMPLETE) { goto out; } } } if (proc_mask & NVME_PROC_OPENED_ZONES) { QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { - status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); - if (status != NVME_SUCCESS) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, + req); + if (status && status != NVME_NO_COMPLETE) { goto out; } } QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { - status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); - if (status != NVME_SUCCESS) { + status = 
nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, + req); + if (status && status != NVME_NO_COMPLETE) { goto out; } } } if (proc_mask & NVME_PROC_FULL_ZONES) { QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) { - status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); - if (status != NVME_SUCCESS) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, + req); + if (status && status != NVME_NO_COMPLETE) { goto out; } } @@ -1951,8 +2015,9 @@ static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone, if (proc_mask & NVME_PROC_READ_ONLY_ZONES) { for (i = 0; i < ns->num_zones; i++, zone++) { - status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr); - if (status != NVME_SUCCESS) { + status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr, + req); + if (status && status != NVME_NO_COMPLETE) { goto out; } } @@ -1968,6 +2033,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) NvmeCmd *cmd = (NvmeCmd *)&req->cmd; NvmeNamespace *ns = req->ns; NvmeZone *zone; + uintptr_t *resets; uint8_t *zd_ext; uint32_t dw13 = le32_to_cpu(cmd->cdw13); uint64_t slba = 0; @@ -2002,7 +2068,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) proc_mask = NVME_PROC_CLOSED_ZONES; } trace_pci_nvme_open_zone(slba, zone_idx, all); - status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req); break; case NVME_ZONE_ACTION_CLOSE: @@ -2010,7 +2076,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) proc_mask = NVME_PROC_OPENED_ZONES; } trace_pci_nvme_close_zone(slba, zone_idx, all); - status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req); break; case NVME_ZONE_ACTION_FINISH: @@ -2018,24 +2084,32 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req) proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES; } trace_pci_nvme_finish_zone(slba, zone_idx, all); - status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req); break; case NVME_ZONE_ACTION_RESET: + resets = (uintptr_t *)&req->opaque; + if (all) { proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES | NVME_PROC_FULL_ZONES; } trace_pci_nvme_reset_zone(slba, zone_idx, all); - status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone); - break; + + *resets = 1; + + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req); + + (*resets)--; + + return *resets ? 
NVME_NO_COMPLETE : req->status; case NVME_ZONE_ACTION_OFFLINE: if (all) { proc_mask = NVME_PROC_READ_ONLY_ZONES; } trace_pci_nvme_offline_zone(slba, zone_idx, all); - status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone); + status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req); break; case NVME_ZONE_ACTION_SET_ZD_EXT: diff --git a/hw/block/trace-events b/hw/block/trace-events index a455979e59..6d1686e6dc 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -49,6 +49,7 @@ pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32"" pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16"" pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16"" +pci_nvme_aio_zone_reset_cb(uint16_t cid, uint64_t zslba) "cid %"PRIu16" zslba 0x%"PRIx64"" pci_nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16"" pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d" pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" From cd42771a334434c5ca8a4a48a62e48570dbe8d9e Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 9 Dec 2020 23:55:06 +0100 Subject: [PATCH 26/56] hw/block/nvme: add missing string representations for commands Add missing string representations for a couple of new commands. Signed-off-by: Klaus Jensen Tested-by: Dmitry Fomichev Reviewed-by: Dmitry Fomichev --- hw/block/nvme.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index b7fbcca39d..65540b650e 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -64,8 +64,12 @@ static inline const char *nvme_io_opc_str(uint8_t opc) case NVME_CMD_FLUSH: return "NVME_NVM_CMD_FLUSH"; case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE"; case NVME_CMD_READ: return "NVME_NVM_CMD_READ"; + case NVME_CMD_COMPARE: return "NVME_NVM_CMD_COMPARE"; case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES"; case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM"; + case NVME_CMD_ZONE_MGMT_SEND: return "NVME_ZONED_CMD_MGMT_SEND"; + case NVME_CMD_ZONE_MGMT_RECV: return "NVME_ZONED_CMD_MGMT_RECV"; + case NVME_CMD_ZONE_APPEND: return "NVME_ZONED_CMD_ZONE_APPEND"; default: return "NVME_NVM_CMD_UNKNOWN"; } } From 521ea778b264116310182db9c4d329c5aa867c59 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Thu, 10 Dec 2020 08:11:32 +0100 Subject: [PATCH 27/56] hw/block/nvme: remove unnecessary check for append nvme_io_cmd already checks if the namespace supports the Zone Append command, so the removed check is dead code. 
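For context, the reason the branch is dead: nvme_io_cmd consults the
per-namespace commands-supported table (ns->iocs, installed by
nvme_select_ns_iocs) and rejects unsupported opcodes before any handler
runs. A simplified standalone model of that gate (not an exact quote of
the driver code):

    #include <stdbool.h>
    #include <stdint.h>

    #define NVME_CMD_EFF_CSUPP 0x1   /* "command supported" effects bit */

    /*
     * Zone Append only reaches nvme_do_write(..., append=true) if the
     * namespace's iocs table marks the opcode as supported, so a second
     * check inside the handler can never fire.
     */
    static bool opcode_reaches_handler(const uint32_t iocs[256], uint8_t opc)
    {
        return (iocs[opc] & NVME_CMD_EFF_CSUPP) != 0;
    }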
Signed-off-by: Klaus Jensen
Tested-by: Dmitry Fomichev
Reviewed-by: Dmitry Fomichev
---
 hw/block/nvme.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 7222eff755..c73afdf805 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1707,10 +1707,6 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
         }

         res->slba = nvme_advance_zone_wp(ns, zone, nlb);
-    } else if (append) {
-        trace_pci_nvme_err_invalid_opc(rw->opcode);
-        status = NVME_INVALID_OPCODE;
-        goto invalid;
     }

     data_offset = nvme_l2b(ns, slba);

From add961300c8e29167465fe8206539c4e6bffde28 Mon Sep 17 00:00:00 2001
From: Dmitry Fomichev
Date: Mon, 18 Jan 2021 12:39:17 +0900
Subject: [PATCH 28/56] hw/block/nvme: Correct error status for unaligned ZA

TP 4053 says (in section 2.3.1.1) -
... if a Zone Append command specifies a ZSLBA that is not the lowest
logical block address in that zone, then the controller shall abort that
command with a status code of Invalid Field In Command.

In the code, Zone Invalid Write is returned instead; fix this.

Signed-off-by: Dmitry Fomichev
Signed-off-by: Klaus Jensen
---
 hw/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c73afdf805..35f39ecd95 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1150,7 +1150,7 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
     if (append) {
         if (unlikely(slba != zone->d.zslba)) {
             trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
-            status = NVME_ZONE_INVALID_WRITE;
+            status = NVME_INVALID_FIELD;
         }
         if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
             trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);

From 1490be5a8a278c17bceffa0be1dbd21dcb2f9bee Mon Sep 17 00:00:00 2001
From: Minwoo Im
Date: Sun, 17 Jan 2021 23:53:31 +0900
Subject: [PATCH 29/56] hw/block/nvme: remove unused argument in nvme_ns_init_zoned

nvme_ns_init_zoned() has no use for the given NvmeCtrl object; remove the argument.

Signed-off-by: Minwoo Im
Signed-off-by: Klaus Jensen
---
 hw/block/nvme-ns.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 9be170abb7..d35c2925ec 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -205,7 +205,7 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
     }
 }

-static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index)
+static void nvme_ns_init_zoned(NvmeNamespace *ns, int lba_index)
 {
     NvmeIdNsZoned *id_ns_z;

@@ -322,7 +322,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) {
             return -1;
         }
-        nvme_ns_init_zoned(n, ns, 0);
+        nvme_ns_init_zoned(ns, 0);
     }

     if (nvme_register_namespace(n, ns, errp)) {

From aa5e55e3b07ede87a8fd7aa3e67583dfc464dd52 Mon Sep 17 00:00:00 2001
From: Minwoo Im
Date: Sun, 17 Jan 2021 23:53:32 +0900
Subject: [PATCH 30/56] hw/block/nvme: open code for volatile write cache

The Volatile Write Cache (VWC) feature was previously set up in nvme_ns_setup() at initialization time. The feature is tied to the backing block device, but it is controlled at the controller level via the Set/Get Features commands.

This patch removes the dependency between nvme and nvme-ns for managing the VWC flag value. It also open codes the Get Features handling for VWC to check all namespaces attached to the controller, reporting the cache as enabled if it is enabled on any of them.
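Conceptually, the open-coded loop computes the predicate below (sketch
only; nvme_vwc_enabled is a hypothetical name, and the QEMU-internal
types and helpers are assumed from the surrounding code):

    /* True if write caching is enabled on any attached namespace. */
    static bool nvme_vwc_enabled(NvmeCtrl *n)
    {
        for (int i = 1; i <= n->num_namespaces; i++) {
            NvmeNamespace *ns = nvme_ns(n, i);

            if (ns && blk_enable_write_cache(ns->blkconf.blk)) {
                return true;
            }
        }

        return false;
    }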
Signed-off-by: Minwoo Im [k.jensen: report write cache preset if present on ANY namespace] Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 4 ---- hw/block/nvme.c | 15 ++++++++++++--- hw/block/nvme.h | 1 - 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index d35c2925ec..7a5a779837 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -90,10 +90,6 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) return -1; } - if (blk_enable_write_cache(ns->blkconf.blk)) { - n->features.vwc = 0x1; - } - return 0; } diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 35f39ecd95..0b002cb2be 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3097,6 +3097,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10); uint16_t iv; NvmeNamespace *ns; + int i; static const uint32_t nvme_feature_default[NVME_FID_MAX] = { [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT, @@ -3172,7 +3173,17 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req) result = ns->features.err_rec; goto out; case NVME_VOLATILE_WRITE_CACHE: - result = n->features.vwc; + for (i = 1; i <= n->num_namespaces; i++) { + ns = nvme_ns(n, i); + if (!ns) { + continue; + } + + result = blk_enable_write_cache(ns->blkconf.blk); + if (result) { + break; + } + } trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled"); goto out; case NVME_ASYNCHRONOUS_EVENT_CONF: @@ -3335,8 +3346,6 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req) ns->features.err_rec = dw11; break; case NVME_VOLATILE_WRITE_CACHE: - n->features.vwc = dw11 & 0x1; - for (i = 1; i <= n->num_namespaces; i++) { ns = nvme_ns(n, i); if (!ns) { diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 65540b650e..347c149e79 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -121,7 +121,6 @@ typedef struct NvmeFeatureVal { uint16_t temp_thresh_low; }; uint32_t async_config; - uint32_t vwc; } NvmeFeatureVal; typedef struct NvmeCtrl { From 337ccd7650cfab379bfba9eed54ad790e88ec62e Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 17 Jan 2021 23:53:33 +0900 Subject: [PATCH 31/56] hw/block/nvme: remove unused argument in nvme_ns_init_blk Removed no longer used aregument NvmeCtrl object in nvme_ns_init_blk(). Signed-off-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 7a5a779837..17e876e6bc 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -66,7 +66,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp) return 0; } -static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) +static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp) { bool read_only; @@ -307,7 +307,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) return -1; } - if (nvme_ns_init_blk(n, ns, errp)) { + if (nvme_ns_init_blk(ns, errp)) { return -1; } From 15d024d4aa9b73ff05604f6f6d998788d3ea26e2 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 17 Jan 2021 23:53:34 +0900 Subject: [PATCH 32/56] hw/block/nvme: split setup and register for namespace In NVMe, namespace is being attached to process I/O. We register NVMe namespace to a controller via nvme_register_namespace() during nvme_ns_setup(). This is main reason of receiving NvmeCtrl object instance to this function to map the namespace to a controller. 
To make namespace instance more independent, it should be split into two parts: setup and register. This patch split them into two differnt parts, and finally nvme_ns_setup() does not have nothing to do with NvmeCtrl instance at all. This patch is a former patch to introduce NVMe subsystem scheme to the existing design especially for multi-path. In that case, it should be split into two to make namespace independent from a controller. Signed-off-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index 17e876e6bc..ce79ad4a53 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -321,10 +321,6 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) nvme_ns_init_zoned(ns, 0); } - if (nvme_register_namespace(n, ns, errp)) { - return -1; - } - return 0; } @@ -362,6 +358,13 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) "could not setup namespace: "); return; } + + if (nvme_register_namespace(n, ns, errp)) { + error_propagate_prepend(errp, local_err, + "could not register namespace: "); + return; + } + } static Property nvme_ns_props[] = { From 24ec776a5ac70954e95a33249d8fe3378c01f3a0 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 17 Jan 2021 23:53:35 +0900 Subject: [PATCH 33/56] hw/block/nvme: remove unused argument in nvme_ns_setup nvme_ns_setup() finally does not have nothing to do with NvmeCtrl instance. Signed-off-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/block/nvme-ns.c | 4 ++-- hw/block/nvme-ns.h | 2 +- hw/block/nvme.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c index ce79ad4a53..3f52acb89c 100644 --- a/hw/block/nvme-ns.c +++ b/hw/block/nvme-ns.c @@ -301,7 +301,7 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp) return 0; } -int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) +int nvme_ns_setup(NvmeNamespace *ns, Error **errp) { if (nvme_ns_check_constraints(ns, errp)) { return -1; @@ -353,7 +353,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp) NvmeCtrl *n = NVME(s->parent); Error *local_err = NULL; - if (nvme_ns_setup(n, ns, &local_err)) { + if (nvme_ns_setup(ns, &local_err)) { error_propagate_prepend(errp, local_err, "could not setup namespace: "); return; diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h index a0baa5f6d4..293ac990e3 100644 --- a/hw/block/nvme-ns.h +++ b/hw/block/nvme-ns.h @@ -174,7 +174,7 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns) assert(ns->nr_active_zones >= 0); } -int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp); +int nvme_ns_setup(NvmeNamespace *ns, Error **errp); void nvme_ns_drain(NvmeNamespace *ns); void nvme_ns_shutdown(NvmeNamespace *ns); void nvme_ns_cleanup(NvmeNamespace *ns); diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 0b002cb2be..30bd70fd5b 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4377,7 +4377,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp) ns = &n->namespace; ns->params.nsid = 1; - if (nvme_ns_setup(n, ns, errp)) { + if (nvme_ns_setup(ns, errp)) { return; } } From 635b23ad43e37910eb7607cfee6887e89ae9e69a Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 12 Jan 2021 10:32:37 +0100 Subject: [PATCH 34/56] hw/block/nvme: fix zone write finalize The zone write pointer is unconditionally advanced, even for write faults. 
Make sure that the zone is always transitioned to Full if the write pointer reaches zone capacity. Cc: Dmitry Fomichev Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 30bd70fd5b..4d73398798 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1268,10 +1268,13 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req, nlb = le16_to_cpu(rw->nlb) + 1; zone = nvme_get_zone_by_slba(ns, slba); + zone->d.wp += nlb; + if (failed) { res->slba = 0; - zone->d.wp += nlb; - } else if (zone->w_ptr == nvme_zone_wr_boundary(zone)) { + } + + if (zone->d.wp == nvme_zone_wr_boundary(zone)) { switch (nvme_get_zone_state(zone)) { case NVME_ZONE_STATE_IMPLICITLY_OPEN: case NVME_ZONE_STATE_EXPLICITLY_OPEN: @@ -1288,9 +1291,6 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req, default: assert(false); } - zone->d.wp = zone->w_ptr; - } else { - zone->d.wp += nlb; } } From c6d1b5c13bf937d3314f4bad596ced52f32106c5 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Fri, 15 Jan 2021 11:27:00 +0800 Subject: [PATCH 35/56] nvme: introduce bit 5 for critical warning According to NVM Express v1.4, Section 5.14.1.2 ("SMART / Health Information"), introduce bit 5 for "Persistent Memory Region has become read-only or unreliable". Signed-off-by: zhenwei pi [k.jensen: minor brush ups in commit message] Signed-off-by: Klaus Jensen --- include/block/nvme.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/block/nvme.h b/include/block/nvme.h index 45b2678db1..41614c5e12 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -789,6 +789,7 @@ enum NvmeSmartWarn { NVME_SMART_RELIABILITY = 1 << 2, NVME_SMART_MEDIA_READ_ONLY = 1 << 3, NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4, + NVME_SMART_PMR_UNRELIABLE = 1 << 5, }; typedef struct NvmeEffectsLog { From 4714791b66ae39bea6b6e3c3e84aaed8dcb005c7 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Fri, 15 Jan 2021 11:27:01 +0800 Subject: [PATCH 36/56] hw/block/nvme: add smart_critical_warning property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a very low probability that hitting physical NVMe disk hardware critical warning case, it's hard to write & test a monitor agent service. For debugging purposes, add a new 'smart_critical_warning' property to emulate this situation. The orignal version of this change is implemented by adding a fixed property which could be initialized by QEMU command line. Suggested by Philippe & Klaus, rework like current version. Test with this patch: 1, change smart_critical_warning property for a running VM: #virsh qemu-monitor-command nvme-upstream '{ "execute": "qom-set", "arguments": { "path": "/machine/peripheral-anon/device[0]", "property": "smart_critical_warning", "value":16 } }' 2, run smartctl in guest #smartctl -H -l error /dev/nvme0n1 === START OF SMART DATA SECTION === SMART overall-health self-assessment test result: FAILED! 
- volatile memory backup device has failed Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: zhenwei pi Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 45 +++++++++++++++++++++++++++++++++++++++++--- hw/block/nvme.h | 1 + include/block/nvme.h | 1 + 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 4d73398798..f0cb7acd74 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2490,6 +2490,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len, } trans_len = MIN(sizeof(smart) - off, buf_len); + smart.critical_warning = n->smart_critical_warning; smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read, 1000)); @@ -4432,6 +4433,40 @@ static Property nvme_props[] = { DEFINE_PROP_END_OF_LIST(), }; +static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + NvmeCtrl *n = NVME(obj); + uint8_t value = n->smart_critical_warning; + + visit_type_uint8(v, name, &value, errp); +} + +static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + NvmeCtrl *n = NVME(obj); + uint8_t value, cap = 0; + + if (!visit_type_uint8(v, name, &value, errp)) { + return; + } + + cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY + | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA; + if (NVME_CAP_PMR(n->bar.cap)) { + cap |= NVME_SMART_PMR_UNRELIABLE; + } + + if ((value & cap) != value) { + error_setg(errp, "unsupported smart critical warning bits: 0x%x", + value & ~cap); + return; + } + + n->smart_critical_warning = value; +} + static const VMStateDescription nvme_vmstate = { .name = "nvme", .unmigratable = 1, @@ -4455,13 +4490,17 @@ static void nvme_class_init(ObjectClass *oc, void *data) static void nvme_instance_init(Object *obj) { - NvmeCtrl *s = NVME(obj); + NvmeCtrl *n = NVME(obj); - if (s->namespace.blkconf.blk) { - device_add_bootindex_property(obj, &s->namespace.blkconf.bootindex, + if (n->namespace.blkconf.blk) { + device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex, "bootindex", "/namespace@1,0", DEVICE(obj)); } + + object_property_add(obj, "smart_critical_warning", "uint8", + nvme_get_smart_warning, + nvme_set_smart_warning, NULL, NULL); } static const TypeInfo nvme_info = { diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 347c149e79..b0d5b6409d 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -148,6 +148,7 @@ typedef struct NvmeCtrl { uint64_t timestamp_set_qemu_clock_ms; /* QEMU clock time */ uint64_t starttime_ms; uint16_t temperature; + uint8_t smart_critical_warning; HostMemoryBackend *pmrdev; diff --git a/include/block/nvme.h b/include/block/nvme.h index 41614c5e12..88af3b4234 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -60,6 +60,7 @@ enum NvmeCapMask { #define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK) #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK) #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK) +#define NVME_CAP_PMR(cap) (((cap) >> CAP_PMR_SHIFT) & CAP_PMR_MASK) #define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \ << CAP_MQES_SHIFT) From c62720f137dffcefa7f03ac1a51b4aadba2a38be Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Fri, 15 Jan 2021 11:27:02 +0800 Subject: [PATCH 37/56] hw/block/nvme: trigger async event during injecting smart warning During smart critical warning injection by setting property from QMP command, also try to 
trigger an asynchronous event. As suggested by Keith, if an event has
already been raised, there is no need to enqueue the duplicate event
any more.

Signed-off-by: zhenwei pi
[k.jensen: fix typo in commit message]
Signed-off-by: Klaus Jensen
---
 hw/block/nvme.c      | 48 +++++++++++++++++++++++++++++++++++++-------
 include/block/nvme.h |  1 +
 2 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f0cb7acd74..09eb1f06e8 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -980,6 +980,35 @@ static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
     nvme_process_aers(n);
 }

+static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
+{
+    uint8_t aer_info;
+
+    /* Ref SPEC */
+    if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
+        return;
+    }
+
+    switch (event) {
+    case NVME_SMART_SPARE:
+        aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
+        break;
+    case NVME_SMART_TEMPERATURE:
+        aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
+        break;
+    case NVME_SMART_RELIABILITY:
+    case NVME_SMART_MEDIA_READ_ONLY:
+    case NVME_SMART_FAILED_VOLATILE_MEDIA:
+    case NVME_SMART_PMR_UNRELIABLE:
+        aer_info = NVME_AER_INFO_SMART_RELIABILITY;
+        break;
+    default:
+        return;
+    }
+
+    nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
+}
+
 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
 {
     n->aer_mask &= ~(1 << event_type);
@@ -3317,12 +3346,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
             return NVME_INVALID_FIELD | NVME_DNR;
         }

-        if (((n->temperature >= n->features.temp_thresh_hi) ||
-             (n->temperature <= n->features.temp_thresh_low)) &&
-            NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) {
-            nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
-                               NVME_AER_INFO_SMART_TEMP_THRESH,
-                               NVME_LOG_SMART_INFO);
+        if ((n->temperature >= n->features.temp_thresh_hi) ||
+            (n->temperature <= n->features.temp_thresh_low)) {
+            nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
         }

         break;
@@ -4446,7 +4472,7 @@ static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
                                    void *opaque, Error **errp)
 {
     NvmeCtrl *n = NVME(obj);
-    uint8_t value, cap = 0;
+    uint8_t value, old_value, cap = 0, index, event;

     if (!visit_type_uint8(v, name, &value, errp)) {
         return;
@@ -4464,7 +4490,15 @@ static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
         return;
     }

+    old_value = n->smart_critical_warning;
     n->smart_critical_warning = value;
+
+    /* only inject new bits of smart critical warning */
+    for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
+        event = 1 << index;
+        if (value & ~old_value & event)
+            nvme_smart_event(n, event);
+    }
 }

 static const VMStateDescription nvme_vmstate = {
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 88af3b4234..854fb2abb6 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -784,6 +784,7 @@ typedef struct QEMU_PACKED NvmeSmartLog {
     uint8_t     reserved2[320];
 } NvmeSmartLog;

+#define NVME_SMART_WARN_MAX     6
 enum NvmeSmartWarn {
     NVME_SMART_SPARE                  = 1 << 0,
     NVME_SMART_TEMPERATURE            = 1 << 1,

From ffacaf090893c5d6a5a6ea51b93087411af859c4 Mon Sep 17 00:00:00 2001
From: Klaus Jensen
Date: Mon, 18 Jan 2021 07:30:50 +0100
Subject: [PATCH 38/56] hw/block/nvme: add size to mmio read/write trace events

Add the size of the mmio read/write to the trace event.
Reviewed-by: Minwoo Im Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 4 ++-- hw/block/trace-events | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 09eb1f06e8..2407b6578a 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3878,7 +3878,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) uint8_t *ptr = (uint8_t *)&n->bar; uint64_t val = 0; - trace_pci_nvme_mmio_read(addr); + trace_pci_nvme_mmio_read(addr, size); if (unlikely(addr & (sizeof(uint32_t) - 1))) { NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32, @@ -4042,7 +4042,7 @@ static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data, { NvmeCtrl *n = (NvmeCtrl *)opaque; - trace_pci_nvme_mmio_write(addr, data); + trace_pci_nvme_mmio_write(addr, data, size); if (addr < sizeof(n->bar)) { nvme_write_bar(n, addr, data, size); diff --git a/hw/block/trace-events b/hw/block/trace-events index 6d1686e6dc..3772502033 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -80,8 +80,8 @@ pci_nvme_enqueue_event_noqueue(int queued) "queued %d" pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8"" pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs" pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16"" -pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64"" -pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64"" +pci_nvme_mmio_read(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d" +pci_nvme_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" data 0x%"PRIx64" size %d" pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16"" pci_nvme_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "sqid %"PRIu16" new_tail %"PRIu16"" pci_nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64"" From 0d3d5da2ccc8823c7c904b790b8d0fdf569790f0 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Mon, 18 Jan 2021 07:31:45 +0100 Subject: [PATCH 39/56] hw/block/nvme: fix 64 bit register hi/lo split writes 64 bit registers like ASQ and ACQ should be writable by both a hi/lo 32 bit write combination as well as a plain 64 bit write. The spec does not define ordering on the hi/lo split, but the code currently assumes that the low order bits are written first. Additionally, the code does not consider that another address might already have been written into the register, causing the OR'ing to result in a bad address. Fix this by explicitly overwriting only the low or high order bits for 32 bit writes. Signed-off-by: Klaus Jensen Reviewed-by: Keith Busch --- hw/block/nvme.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 2407b6578a..2785127037 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3819,19 +3819,21 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, trace_pci_nvme_mmio_aqattr(data & 0xffffffff); break; case 0x28: /* ASQ */ - n->bar.asq = data; + n->bar.asq = size == 8 ? 
data :
+                     (n->bar.asq & ~0xffffffffULL) | (data & 0xffffffff);
         trace_pci_nvme_mmio_asqaddr(data);
         break;
     case 0x2c:  /* ASQ hi */
-        n->bar.asq |= data << 32;
+        n->bar.asq = (n->bar.asq & 0xffffffff) | (data << 32);
         trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq);
         break;
     case 0x30:  /* ACQ */
         trace_pci_nvme_mmio_acqaddr(data);
-        n->bar.acq = data;
+        n->bar.acq = size == 8 ? data :
+                     (n->bar.acq & ~0xffffffffULL) | (data & 0xffffffff);
         break;
     case 0x34:  /* ACQ hi */
-        n->bar.acq |= data << 32;
+        n->bar.acq = (n->bar.acq & 0xffffffff) | (data << 32);
         trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq);
         break;
     case 0x38:  /* CMBLOC */

From c7050631297f07917c23c7f4cdec8a6cca0eed12 Mon Sep 17 00:00:00 2001
From: Andrzej Jakowski
Date: Fri, 13 Nov 2020 08:00:47 +0100
Subject: [PATCH 40/56] hw/block/nvme: indicate CMB support through controller
 capabilities register

This patch sets the CMBS bit in the controller capabilities register
when the user configures the device with CMB support, so the
capabilities are correctly reported to the guest OS.

Signed-off-by: Andrzej Jakowski
Reviewed-by: Maxim Levitsky
Reviewed-by: Minwoo Im
Reviewed-by: Keith Busch
Signed-off-by: Klaus Jensen
---
 hw/block/nvme.c      |  1 +
 include/block/nvme.h | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 2785127037..5f12ac1200 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4374,6 +4374,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP);
     NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
     NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
+    NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);

     n->bar.vs = NVME_SPEC_VER;
     n->bar.intmc = n->bar.intms = 0;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 854fb2abb6..151921da21 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -36,6 +36,7 @@ enum NvmeCapShift {
     CAP_MPSMIN_SHIFT   = 48,
     CAP_MPSMAX_SHIFT   = 52,
     CAP_PMR_SHIFT      = 56,
+    CAP_CMB_SHIFT      = 57,
 };

 enum NvmeCapMask {
@@ -49,6 +50,7 @@ enum NvmeCapMask {
     CAP_MPSMIN_MASK    = 0xf,
     CAP_MPSMAX_MASK    = 0xf,
     CAP_PMR_MASK       = 0x1,
+    CAP_CMB_MASK       = 0x1,
 };

 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -79,9 +81,11 @@ enum NvmeCapMask {
 #define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\
                                                            << CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
-                                                           << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)\
-                                                           << CAP_PMR_SHIFT)
+                                                           << CAP_MPSMAX_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK) \
+                                                           << CAP_PMR_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK) \
+                                                           << CAP_CMB_SHIFT)

 enum NvmeCapCss {
     NVME_CAP_CSS_NVM = 1 << 0,

From 1901b4967c3fdd47e59d9023aea2285d94f3998a Mon Sep 17 00:00:00 2001
From: Klaus Jensen
Date: Fri, 13 Nov 2020 09:50:33 +0100
Subject: [PATCH 41/56] hw/block/nvme: move msix table and pba to BAR 0

In the interest of supporting both the CMB and the PMR being enabled on
the same device, move the MSI-X table and pending bit array out of
BAR 4 and into BAR 0.

This is a simplified version of the patch contributed by Andrzej
Jakowski (see [1]). Leaving the CMB at offset 0 removes the need for
changes to the CMB address mapping code.
[1]: https://lore.kernel.org/qemu-devel/20200729220107.37758-3-andrzej.jakowski@linux.intel.com/ Reviewed-by: Minwoo Im Tested-by: Minwoo Im Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 23 +++++++++++++++++++++-- hw/block/nvme.h | 1 + 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 5f12ac1200..85d3c43c4f 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4268,6 +4268,8 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) { uint8_t *pci_conf = pci_dev->config; + uint64_t bar_size, msix_table_size, msix_pba_size; + unsigned msix_table_offset, msix_pba_offset; int ret; Error *err = NULL; @@ -4286,11 +4288,28 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS); pcie_endpoint_cap_init(pci_dev, 0x80); + bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB); + msix_table_offset = bar_size; + msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize; + + bar_size += msix_table_size; + bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB); + msix_pba_offset = bar_size; + msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8; + + bar_size += msix_pba_size; + bar_size = pow2ceil(bar_size); + + memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size); memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme", n->reg_size); + memory_region_add_subregion(&n->bar0, 0, &n->iomem); + pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY | - PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem); - ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, &err); + PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0); + ret = msix_init(pci_dev, n->params.msix_qsize, + &n->bar0, 0, msix_table_offset, + &n->bar0, 0, msix_pba_offset, 0, &err); if (ret < 0) { if (ret == -ENOTSUP) { warn_report_err(err); diff --git a/hw/block/nvme.h b/hw/block/nvme.h index b0d5b6409d..1cdb360bc5 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -125,6 +125,7 @@ typedef struct NvmeFeatureVal { typedef struct NvmeCtrl { PCIDevice parent_obj; + MemoryRegion bar0; MemoryRegion iomem; MemoryRegion ctrl_mem; NvmeBar bar; From 709cc8fc68f7456f4db604db7c24c8a96eebb652 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 13 Nov 2020 09:57:13 +0100 Subject: [PATCH 42/56] hw/block/nvme: allow cmb and pmr to coexist With BAR 4 now free to use, allow PMR and CMB to be enabled simultaneously. Reviewed-by: Minwoo Im Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 85d3c43c4f..4ce75642f1 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -29,14 +29,13 @@ * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. * - * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation - * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when - * both provided. * Enabling pmr emulation can be achieved by pointing to memory-backend-file. * For example: * -object memory-backend-file,id=,share=on,mem-path=, \ * size= .... -device nvme,...,pmrdev= * + * The PMR will use BAR 4/5 exclusively. 
+ * * * nvme device parameters * ~~~~~~~~~~~~~~~~~~~~~~ @@ -109,7 +108,7 @@ #define NVME_DB_SIZE 4 #define NVME_SPEC_VER 0x00010300 #define NVME_CMB_BIR 2 -#define NVME_PMR_BIR 2 +#define NVME_PMR_BIR 4 #define NVME_TEMPERATURE 0x143 #define NVME_TEMPERATURE_WARNING 0x157 #define NVME_TEMPERATURE_CRITICAL 0x175 @@ -4121,7 +4120,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp) return; } - if (!n->params.cmb_size_mb && n->pmrdev) { + if (n->pmrdev) { if (host_memory_backend_is_mapped(n->pmrdev)) { error_setg(errp, "can't use already busy memdev: %s", object_get_canonical_path_component(OBJECT(n->pmrdev))); @@ -4218,9 +4217,6 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev) static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) { - /* Controller Capabilities register */ - NVME_CAP_SET_PMRS(n->bar.cap, 1); - /* PMR Capabities register */ n->bar.pmrcap = 0; NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0); @@ -4321,7 +4317,9 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) if (n->params.cmb_size_mb) { nvme_init_cmb(n, pci_dev); - } else if (n->pmrdev) { + } + + if (n->pmrdev) { nvme_init_pmr(n, pci_dev); } @@ -4394,6 +4392,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY); NVME_CAP_SET_MPSMAX(n->bar.cap, 4); NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0); + NVME_CAP_SET_PMRS(n->bar.cap, n->pmrdev ? 1 : 0); n->bar.vs = NVME_SPEC_VER; n->bar.intmc = n->bar.intms = 0; From 8e9e8b48216b134bb9165c1f023a60a4ec480b42 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 18 Dec 2020 12:54:45 +0100 Subject: [PATCH 43/56] hw/block/nvme: rename PMR/CMB shift/mask fields Use the correct field names. Reviewed-by: Minwoo Im Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 2 +- include/block/nvme.h | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 4ce75642f1..0057a02402 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4501,7 +4501,7 @@ static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name, cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA; - if (NVME_CAP_PMR(n->bar.cap)) { + if (NVME_CAP_PMRS(n->bar.cap)) { cap |= NVME_SMART_PMR_UNRELIABLE; } diff --git a/include/block/nvme.h b/include/block/nvme.h index 151921da21..008108bd1a 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -35,8 +35,8 @@ enum NvmeCapShift { CAP_CSS_SHIFT = 37, CAP_MPSMIN_SHIFT = 48, CAP_MPSMAX_SHIFT = 52, - CAP_PMR_SHIFT = 56, - CAP_CMB_SHIFT = 57, + CAP_PMRS_SHIFT = 56, + CAP_CMBS_SHIFT = 57, }; enum NvmeCapMask { @@ -49,8 +49,8 @@ enum NvmeCapMask { CAP_CSS_MASK = 0xff, CAP_MPSMIN_MASK = 0xf, CAP_MPSMAX_MASK = 0xf, - CAP_PMR_MASK = 0x1, - CAP_CMB_MASK = 0x1, + CAP_PMRS_MASK = 0x1, + CAP_CMBS_MASK = 0x1, }; #define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK) @@ -62,7 +62,7 @@ enum NvmeCapMask { #define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK) #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK) #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK) -#define NVME_CAP_PMR(cap) (((cap) >> CAP_PMR_SHIFT) & CAP_PMR_MASK) +#define NVME_CAP_PMRS(cap) (((cap) >> CAP_PMRS_SHIFT) & CAP_PMRS_MASK) #define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \ << CAP_MQES_SHIFT) @@ -82,10 
+82,10 @@ enum NvmeCapMask { << CAP_MPSMIN_SHIFT) #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\ << CAP_MPSMAX_SHIFT) -#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK) \ - << CAP_PMR_SHIFT) -#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMB_MASK) \ - << CAP_CMB_SHIFT) +#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMRS_MASK) \ + << CAP_PMRS_SHIFT) +#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMBS_MASK) \ + << CAP_CMBS_SHIFT) enum NvmeCapCss { NVME_CAP_CSS_NVM = 1 << 0, From b78b9bb0ee1f07b27b649065158dcac571986f30 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 18 Dec 2020 13:03:09 +0100 Subject: [PATCH 44/56] hw/block/nvme: remove redundant zeroing of PMR registers The controller registers are initially zero. Remove the redundant zeroing. Reviewed-by: Keith Busch Reviewed-by: Minwoo Im Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 0057a02402..f8dd771925 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -4217,43 +4217,9 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev) static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) { - /* PMR Capabities register */ - n->bar.pmrcap = 0; - NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0); - NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 0); NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR); - NVME_PMRCAP_SET_PMRTU(n->bar.pmrcap, 0); /* Turn on bit 1 support */ NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02); - NVME_PMRCAP_SET_PMRTO(n->bar.pmrcap, 0); - NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 0); - - /* PMR Control register */ - n->bar.pmrctl = 0; - NVME_PMRCTL_SET_EN(n->bar.pmrctl, 0); - - /* PMR Status register */ - n->bar.pmrsts = 0; - NVME_PMRSTS_SET_ERR(n->bar.pmrsts, 0); - NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 0); - NVME_PMRSTS_SET_HSTS(n->bar.pmrsts, 0); - NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 0); - - /* PMR Elasticity Buffer Size register */ - n->bar.pmrebs = 0; - NVME_PMREBS_SET_PMRSZU(n->bar.pmrebs, 0); - NVME_PMREBS_SET_RBB(n->bar.pmrebs, 0); - NVME_PMREBS_SET_PMRWBZ(n->bar.pmrebs, 0); - - /* PMR Sustained Write Throughput register */ - n->bar.pmrswtp = 0; - NVME_PMRSWTP_SET_PMRSWTU(n->bar.pmrswtp, 0); - NVME_PMRSWTP_SET_PMRSWTV(n->bar.pmrswtp, 0); - - /* PMR Memory Space Control register */ - n->bar.pmrmsc = 0; - NVME_PMRMSC_SET_CMSE(n->bar.pmrmsc, 0); - NVME_PMRMSC_SET_CBA(n->bar.pmrmsc, 0); pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap), PCI_BASE_ADDRESS_SPACE_MEMORY | From 75c3c9de961db9a76842452c973aea921eb4fa1f Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 18 Dec 2020 13:04:19 +0100 Subject: [PATCH 45/56] hw/block/nvme: disable PMR at boot up The PMR should not be enabled at boot up. Disable the PMR MemoryRegion initially and implement MMIO for PMRCTL, allowing the host to enable the PMR explicitly. 
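
For reference, after this change a guest has to perform the enable
handshake itself before touching the PMR. A minimal guest-side sketch
(purely illustrative, not part of this patch; 'regs' is assumed to be
the guest's mapping of the controller registers, and the 0xe04/0xe08
offsets and bit positions follow the register layout used above):

    volatile uint32_t *pmrctl = (volatile uint32_t *)(regs + 0xe04);
    volatile uint32_t *pmrsts = (volatile uint32_t *)(regs + 0xe08);

    *pmrctl |= 0x1;                /* set PMRCTL.EN */
    while (*pmrsts & (1 << 8)) {   /* wait for PMRSTS.NRDY to clear */
        /* spin */
    }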
Reviewed-by: Keith Busch Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index f8dd771925..d773796051 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -3848,8 +3848,16 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly, "invalid write to PMRCAP register, ignored"); return; - case 0xE04: /* TODO PMRCTL */ - break; + case 0xE04: /* PMRCTL */ + n->bar.pmrctl = data; + if (NVME_PMRCTL_EN(data)) { + memory_region_set_enabled(&n->pmrdev->mr, true); + n->bar.pmrsts = 0; + } else { + memory_region_set_enabled(&n->pmrdev->mr, false); + NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1); + } + return; case 0xE08: /* PMRSTS */ NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly, "invalid write to PMRSTS register, ignored"); @@ -4225,6 +4233,8 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 | PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr); + + memory_region_set_enabled(&n->pmrdev->mr, false); } static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) From 7ec9f2eef979e51891d99667e01d6c0e789a52b2 Mon Sep 17 00:00:00 2001 From: Naveen Nagar Date: Fri, 13 Nov 2020 11:00:05 +0530 Subject: [PATCH 46/56] hw/block/nvme: add PMR RDS/WDS support Add support for the PMRMSCL and PMRMSCU MMIO registers. This allows adding RDS/WDS support for PMR as well. Reviewed-by: Keith Busch Signed-off-by: Naveen Nagar Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 122 +++++++++++++++++++++++++++++++++++++++--------- hw/block/nvme.h | 6 ++- 2 files changed, 106 insertions(+), 22 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index d773796051..7f1c8dd775 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -273,6 +273,24 @@ static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr) return &n->cmbuf[addr - n->ctrl_mem.addr]; } +static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr) +{ + hwaddr hi; + + if (!n->pmr.cmse) { + return false; + } + + hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size); + + return addr >= n->pmr.cba && addr < hi; +} + +static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr) +{ + return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba); +} + static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) { hwaddr hi = addr + size - 1; @@ -285,6 +303,11 @@ static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size) return 0; } + if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) { + memcpy(buf, nvme_addr_to_pmr(n, addr), size); + return 0; + } + return pci_dma_read(&n->parent_obj, addr, buf, size); } @@ -406,9 +429,27 @@ static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, return NVME_SUCCESS; } +static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr, + size_t len) +{ + if (!len) { + return NVME_SUCCESS; + } + + if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) { + return NVME_DATA_TRAS_ERROR; + } + + qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len); + + return NVME_SUCCESS; +} + static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, hwaddr addr, size_t len) { + bool cmb = false, pmr = false; + if (!len) { return NVME_SUCCESS; } @@ -416,6 +457,12 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, trace_pci_nvme_map_addr(addr, len); if 
(nvme_addr_is_cmb(n, addr)) { + cmb = true; + } else if (nvme_addr_is_pmr(n, addr)) { + pmr = true; + } + + if (cmb || pmr) { if (qsg && qsg->sg) { return NVME_INVALID_USE_OF_CMB | NVME_DNR; } @@ -426,7 +473,11 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov, qemu_iovec_init(iov, 1); } - return nvme_map_addr_cmb(n, iov, addr, len); + if (cmb) { + return nvme_map_addr_cmb(n, iov, addr, len); + } else { + return nvme_map_addr_pmr(n, iov, addr, len); + } } if (iov && iov->iov) { @@ -459,7 +510,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2, trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps); - if (nvme_addr_is_cmb(n, prp1)) { + if (nvme_addr_is_cmb(n, prp1) || (nvme_addr_is_pmr(n, prp1))) { qemu_iovec_init(iov, num_prps); } else { pci_dma_sglist_init(qsg, &n->parent_obj, num_prps); @@ -3561,8 +3612,8 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n) NvmeNamespace *ns; int i; - if (n->pmrdev) { - memory_region_msync(&n->pmrdev->mr, 0, n->pmrdev->size); + if (n->pmr.dev) { + memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); } for (i = 1; i <= n->num_namespaces; i++) { @@ -3851,11 +3902,12 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, case 0xE04: /* PMRCTL */ n->bar.pmrctl = data; if (NVME_PMRCTL_EN(data)) { - memory_region_set_enabled(&n->pmrdev->mr, true); + memory_region_set_enabled(&n->pmr.dev->mr, true); n->bar.pmrsts = 0; } else { - memory_region_set_enabled(&n->pmrdev->mr, false); + memory_region_set_enabled(&n->pmr.dev->mr, false); NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1); + n->pmr.cmse = false; } return; case 0xE08: /* PMRSTS */ @@ -3870,8 +3922,33 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly, "invalid write to PMRSWTP register, ignored"); return; - case 0xE14: /* TODO PMRMSC */ - break; + case 0xE14: /* PMRMSCL */ + if (!NVME_CAP_PMRS(n->bar.cap)) { + return; + } + + n->bar.pmrmsc = (n->bar.pmrmsc & ~0xffffffff) | (data & 0xffffffff); + n->pmr.cmse = false; + + if (NVME_PMRMSC_CMSE(n->bar.pmrmsc)) { + hwaddr cba = NVME_PMRMSC_CBA(n->bar.pmrmsc) << PMRMSC_CBA_SHIFT; + if (cba + int128_get64(n->pmr.dev->mr.size) < cba) { + NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 1); + return; + } + + n->pmr.cmse = true; + n->pmr.cba = cba; + } + + return; + case 0xE18: /* PMRMSCU */ + if (!NVME_CAP_PMRS(n->bar.cap)) { + return; + } + + n->bar.pmrmsc = (n->bar.pmrmsc & 0xffffffff) | (data << 32); + return; default: NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid, "invalid MMIO write," @@ -3909,7 +3986,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size) */ if (addr == 0xE08 && (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) { - memory_region_msync(&n->pmrdev->mr, 0, n->pmrdev->size); + memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size); } memcpy(&val, ptr + addr, size); } else { @@ -4128,19 +4205,19 @@ static void nvme_check_constraints(NvmeCtrl *n, Error **errp) return; } - if (n->pmrdev) { - if (host_memory_backend_is_mapped(n->pmrdev)) { + if (n->pmr.dev) { + if (host_memory_backend_is_mapped(n->pmr.dev)) { error_setg(errp, "can't use already busy memdev: %s", - object_get_canonical_path_component(OBJECT(n->pmrdev))); + object_get_canonical_path_component(OBJECT(n->pmr.dev))); return; } - if (!is_power_of_2(n->pmrdev->size)) { + if (!is_power_of_2(n->pmr.dev->size)) { error_setg(errp, "pmr backend size needs to be power of 2 in size"); return; } - host_memory_backend_set_mapped(n->pmrdev, true); + 
host_memory_backend_set_mapped(n->pmr.dev, true); } if (n->params.zasl_bs) { @@ -4225,16 +4302,19 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev) static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) { + NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 1); + NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 1); NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR); /* Turn on bit 1 support */ NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02); + NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 1); pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap), PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 | - PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr); + PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr); - memory_region_set_enabled(&n->pmrdev->mr, false); + memory_region_set_enabled(&n->pmr.dev->mr, false); } static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) @@ -4295,7 +4375,7 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp) nvme_init_cmb(n, pci_dev); } - if (n->pmrdev) { + if (n->pmr.dev) { nvme_init_pmr(n, pci_dev); } @@ -4368,7 +4448,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev) NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY); NVME_CAP_SET_MPSMAX(n->bar.cap, 4); NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0); - NVME_CAP_SET_PMRS(n->bar.cap, n->pmrdev ? 1 : 0); + NVME_CAP_SET_PMRS(n->bar.cap, n->pmr.dev ? 1 : 0); n->bar.vs = NVME_SPEC_VER; n->bar.intmc = n->bar.intms = 0; @@ -4432,15 +4512,15 @@ static void nvme_exit(PCIDevice *pci_dev) g_free(n->cmbuf); } - if (n->pmrdev) { - host_memory_backend_set_mapped(n->pmrdev, false); + if (n->pmr.dev) { + host_memory_backend_set_mapped(n->pmr.dev, false); } msix_uninit_exclusive_bar(pci_dev); } static Property nvme_props[] = { DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf), - DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmrdev, TYPE_MEMORY_BACKEND, + DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND, HostMemoryBackend *), DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial), DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0), diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 1cdb360bc5..b7702e937e 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -151,7 +151,11 @@ typedef struct NvmeCtrl { uint16_t temperature; uint8_t smart_critical_warning; - HostMemoryBackend *pmrdev; + struct { + HostMemoryBackend *dev; + bool cmse; + hwaddr cba; + } pmr; uint8_t aer_mask; NvmeRequest **aer_reqs; From f4319477b4fd692b568c22ed97dde7f542a48ac9 Mon Sep 17 00:00:00 2001 From: Padmakar Kalghatgi Date: Fri, 18 Dec 2020 00:32:16 +0100 Subject: [PATCH 47/56] hw/block/nvme: move cmb logic to v1.4 Implement v1.4 logic for configuring the Controller Memory Buffer. By default, the v1.4 scheme will be used (CMB must be explicitly enabled by the host), so drivers that only support v1.3 will not be able to use the CMB anymore. To retain the v1.3 behavior, set the boolean 'legacy-cmb' nvme device parameter. Reviewed-by: Keith Busch Reviewed-by: Minwoo Im Signed-off-by: Padmakar Kalghatgi Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 101 +++++++++++++++++++++++++++++---------- hw/block/nvme.h | 10 +++- hw/block/trace-events | 2 + include/block/nvme.h | 107 +++++++++++++++++++++++++++++++++++++----- 4 files changed, 182 insertions(+), 38 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 7f1c8dd775..1e13d25b08 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -27,7 +27,9 @@ * zoned= * * Note cmb_size_mb denotes size of CMB in MB. 
CMB is assumed to be at - * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. + * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the + * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to + * always enable the CMBLOC and CMBSZ registers (v1.3 behavior). * * Enabling pmr emulation can be achieved by pointing to memory-backend-file. * For example: @@ -260,17 +262,22 @@ static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn) static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) { - hwaddr low = n->ctrl_mem.addr; - hwaddr hi = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size); + hwaddr hi, lo; - return addr >= low && addr < hi; + if (!n->cmb.cmse) { + return false; + } + + lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba; + hi = lo + int128_get64(n->cmb.mem.size); + + return addr >= lo && addr < hi; } static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr) { - assert(nvme_addr_is_cmb(n, addr)); - - return &n->cmbuf[addr - n->ctrl_mem.addr]; + hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba; + return &n->cmb.buf[addr - base]; } static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr) @@ -3768,6 +3775,19 @@ static int nvme_start_ctrl(NvmeCtrl *n) return 0; } +static void nvme_cmb_enable_regs(NvmeCtrl *n) +{ + NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR); + + NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1); + NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0); + NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1); + NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1); + NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1); + NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */ + NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb); +} + static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, unsigned size) { @@ -3895,6 +3915,38 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data, NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly, "invalid write to read only CMBSZ, ignored"); return; + case 0x50: /* CMBMSC */ + if (!NVME_CAP_CMBS(n->bar.cap)) { + return; + } + + n->bar.cmbmsc = size == 8 ? 
data : + (n->bar.cmbmsc & ~0xffffffff) | (data & 0xffffffff); + n->cmb.cmse = false; + + if (NVME_CMBMSC_CRE(data)) { + nvme_cmb_enable_regs(n); + + if (NVME_CMBMSC_CMSE(data)) { + hwaddr cba = NVME_CMBMSC_CBA(data) << CMBMSC_CBA_SHIFT; + if (cba + int128_get64(n->cmb.mem.size) < cba) { + NVME_CMBSTS_SET_CBAI(n->bar.cmbsts, 1); + return; + } + + n->cmb.cba = cba; + n->cmb.cmse = true; + } + } else { + n->bar.cmbsz = 0; + n->bar.cmbloc = 0; + } + + return; + case 0x54: /* CMBMSC hi */ + n->bar.cmbmsc = (n->bar.cmbmsc & 0xffffffff) | (data << 32); + return; + case 0xE00: /* PMRCAP */ NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly, "invalid write to PMRCAP register, ignored"); @@ -4151,13 +4203,13 @@ static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { NvmeCtrl *n = (NvmeCtrl *)opaque; - stn_le_p(&n->cmbuf[addr], size, data); + stn_le_p(&n->cmb.buf[addr], size, data); } static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size) { NvmeCtrl *n = (NvmeCtrl *)opaque; - return ldn_le_p(&n->cmbuf[addr], size); + return ldn_le_p(&n->cmb.buf[addr], size); } static const MemoryRegionOps nvme_cmb_ops = { @@ -4280,24 +4332,22 @@ int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev) { - NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR); - NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0); + uint64_t cmb_size = n->params.cmb_size_mb * MiB; - NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1); - NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0); - NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1); - NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1); - NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1); - NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */ - NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb); - - n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz)); - memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n, - "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz)); - pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc), + n->cmb.buf = g_malloc0(cmb_size); + memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n, + "nvme-cmb", cmb_size); + pci_register_bar(pci_dev, NVME_CMB_BIR, PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 | - PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem); + PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem); + + NVME_CAP_SET_CMBS(n->bar.cap, 1); + + if (n->params.legacy_cmb) { + nvme_cmb_enable_regs(n); + n->cmb.cmse = true; + } } static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev) @@ -4509,7 +4559,7 @@ static void nvme_exit(PCIDevice *pci_dev) g_free(n->aer_reqs); if (n->params.cmb_size_mb) { - g_free(n->cmbuf); + g_free(n->cmb.buf); } if (n->pmr.dev) { @@ -4531,6 +4581,7 @@ static Property nvme_props[] = { DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64), DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7), DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false), + DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false), DEFINE_PROP_SIZE32("zoned.append_size_limit", NvmeCtrl, params.zasl_bs, NVME_DEFAULT_MAX_ZA_SIZE), DEFINE_PROP_END_OF_LIST(), diff --git a/hw/block/nvme.h b/hw/block/nvme.h index b7702e937e..dee6092bd4 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -20,6 +20,7 @@ typedef struct NvmeParams { uint8_t mdts; bool use_intel_id; uint32_t zasl_bs; + bool legacy_cmb; } NvmeParams; typedef struct NvmeAsyncEvent { @@ -127,7 +128,6 @@ typedef struct NvmeCtrl { PCIDevice parent_obj; MemoryRegion bar0; MemoryRegion iomem; - MemoryRegion 
ctrl_mem; NvmeBar bar; NvmeParams params; NvmeBus bus; @@ -143,7 +143,6 @@ typedef struct NvmeCtrl { uint32_t num_namespaces; uint32_t max_q_ents; uint8_t outstanding_aers; - uint8_t *cmbuf; uint32_t irq_status; uint64_t host_timestamp; /* Timestamp sent by the host */ uint64_t timestamp_set_qemu_clock_ms; /* QEMU clock time */ @@ -151,6 +150,13 @@ typedef struct NvmeCtrl { uint16_t temperature; uint8_t smart_critical_warning; + struct { + MemoryRegion mem; + uint8_t *buf; + bool cmse; + hwaddr cba; + } cmb; + struct { HostMemoryBackend *dev; bool cmse; diff --git a/hw/block/trace-events b/hw/block/trace-events index 3772502033..87ab6c5090 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -123,6 +123,8 @@ pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" pci_nvme_err_invalid_log_page_offset(uint64_t ofs, uint64_t size) "must be <= %"PRIu64", got %"PRIu64"" +pci_nvme_err_cmb_invalid_cba(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64"" +pci_nvme_err_cmb_not_enabled(uint64_t cmbmsc) "cmbmsc 0x%"PRIx64"" pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t slba, uint64_t zslba) "unaligned zone op 0x%"PRIx32", got slba=%"PRIu64", zslba=%"PRIu64"" pci_nvme_err_invalid_zone_state_transition(uint8_t action, uint64_t slba, uint8_t attrs) "action=0x%"PRIx8", slba=%"PRIu64", attrs=0x%"PRIx32"" pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) "writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64"" diff --git a/include/block/nvme.h b/include/block/nvme.h index 008108bd1a..2e85b97a6c 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -15,14 +15,19 @@ typedef struct QEMU_PACKED NvmeBar { uint64_t acq; uint32_t cmbloc; uint32_t cmbsz; - uint8_t padding[3520]; /* not used by QEMU */ + uint32_t bpinfo; + uint32_t bprsel; + uint64_t bpmbl; + uint64_t cmbmsc; + uint32_t cmbsts; + uint8_t rsvd92[3492]; uint32_t pmrcap; uint32_t pmrctl; uint32_t pmrsts; uint32_t pmrebs; uint32_t pmrswtp; uint64_t pmrmsc; - uint8_t reserved[484]; + uint8_t css[484]; } NvmeBar; enum NvmeCapShift { @@ -63,6 +68,7 @@ enum NvmeCapMask { #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK) #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK) #define NVME_CAP_PMRS(cap) (((cap) >> CAP_PMRS_SHIFT) & CAP_PMRS_MASK) +#define NVME_CAP_CMBS(cap) (((cap) >> CAP_CMBS_SHIFT) & CAP_CMBS_MASK) #define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \ << CAP_MQES_SHIFT) @@ -184,25 +190,64 @@ enum NvmeAqaMask { #define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK) enum NvmeCmblocShift { - CMBLOC_BIR_SHIFT = 0, - CMBLOC_OFST_SHIFT = 12, + CMBLOC_BIR_SHIFT = 0, + CMBLOC_CQMMS_SHIFT = 3, + CMBLOC_CQPDS_SHIFT = 4, + CMBLOC_CDPMLS_SHIFT = 5, + CMBLOC_CDPCILS_SHIFT = 6, + CMBLOC_CDMMMS_SHIFT = 7, + CMBLOC_CQDA_SHIFT = 8, + CMBLOC_OFST_SHIFT = 12, }; enum NvmeCmblocMask { - CMBLOC_BIR_MASK = 0x7, - CMBLOC_OFST_MASK = 0xfffff, + CMBLOC_BIR_MASK = 0x7, + CMBLOC_CQMMS_MASK = 0x1, + CMBLOC_CQPDS_MASK = 0x1, + CMBLOC_CDPMLS_MASK = 0x1, + CMBLOC_CDPCILS_MASK = 0x1, + CMBLOC_CDMMMS_MASK = 0x1, + CMBLOC_CQDA_MASK = 0x1, + CMBLOC_OFST_MASK = 0xfffff, }; -#define NVME_CMBLOC_BIR(cmbloc) ((cmbloc >> CMBLOC_BIR_SHIFT) & \ - CMBLOC_BIR_MASK) -#define NVME_CMBLOC_OFST(cmbloc)((cmbloc >> CMBLOC_OFST_SHIFT) & \ - 
CMBLOC_OFST_MASK) +#define NVME_CMBLOC_BIR(cmbloc) \ + ((cmbloc >> CMBLOC_BIR_SHIFT) & CMBLOC_BIR_MASK) +#define NVME_CMBLOC_CQMMS(cmbloc) \ + ((cmbloc >> CMBLOC_CQMMS_SHIFT) & CMBLOC_CQMMS_MASK) +#define NVME_CMBLOC_CQPDS(cmbloc) \ + ((cmbloc >> CMBLOC_CQPDS_SHIFT) & CMBLOC_CQPDS_MASK) +#define NVME_CMBLOC_CDPMLS(cmbloc) \ + ((cmbloc >> CMBLOC_CDPMLS_SHIFT) & CMBLOC_CDPMLS_MASK) +#define NVME_CMBLOC_CDPCILS(cmbloc) \ + ((cmbloc >> CMBLOC_CDPCILS_SHIFT) & CMBLOC_CDPCILS_MASK) +#define NVME_CMBLOC_CDMMMS(cmbloc) \ + ((cmbloc >> CMBLOC_CDMMMS_SHIFT) & CMBLOC_CDMMMS_MASK) +#define NVME_CMBLOC_CQDA(cmbloc) \ + ((cmbloc >> CMBLOC_CQDA_SHIFT) & CMBLOC_CQDA_MASK) +#define NVME_CMBLOC_OFST(cmbloc) \ + ((cmbloc >> CMBLOC_OFST_SHIFT) & CMBLOC_OFST_MASK) -#define NVME_CMBLOC_SET_BIR(cmbloc, val) \ +#define NVME_CMBLOC_SET_BIR(cmbloc, val) \ (cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT) +#define NVME_CMBLOC_SET_CQMMS(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_CQMMS_MASK) << CMBLOC_CQMMS_SHIFT) +#define NVME_CMBLOC_SET_CQPDS(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_CQPDS_MASK) << CMBLOC_CQPDS_SHIFT) +#define NVME_CMBLOC_SET_CDPMLS(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_CDPMLS_MASK) << CMBLOC_CDPMLS_SHIFT) +#define NVME_CMBLOC_SET_CDPCILS(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_CDPCILS_MASK) << CMBLOC_CDPCILS_SHIFT) +#define NVME_CMBLOC_SET_CDMMMS(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_CDMMMS_MASK) << CMBLOC_CDMMMS_SHIFT) +#define NVME_CMBLOC_SET_CQDA(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_CQDA_MASK) << CMBLOC_CQDA_SHIFT) #define NVME_CMBLOC_SET_OFST(cmbloc, val) \ (cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT) +#define NVME_CMBMSMC_SET_CRE (cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBMSC_CRE_SHIFT) + enum NvmeCmbszShift { CMBSZ_SQS_SHIFT = 0, CMBSZ_CQS_SHIFT = 1, @@ -249,6 +294,46 @@ enum NvmeCmbszMask { #define NVME_CMBSZ_GETSIZE(cmbsz) \ (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz)))) +enum NvmeCmbmscShift { + CMBMSC_CRE_SHIFT = 0, + CMBMSC_CMSE_SHIFT = 1, + CMBMSC_CBA_SHIFT = 12, +}; + +enum NvmeCmbmscMask { + CMBMSC_CRE_MASK = 0x1, + CMBMSC_CMSE_MASK = 0x1, + CMBMSC_CBA_MASK = ((1ULL << 52) - 1), +}; + +#define NVME_CMBMSC_CRE(cmbmsc) \ + ((cmbmsc >> CMBMSC_CRE_SHIFT) & CMBMSC_CRE_MASK) +#define NVME_CMBMSC_CMSE(cmbmsc) \ + ((cmbmsc >> CMBMSC_CMSE_SHIFT) & CMBMSC_CMSE_MASK) +#define NVME_CMBMSC_CBA(cmbmsc) \ + ((cmbmsc >> CMBMSC_CBA_SHIFT) & CMBMSC_CBA_MASK) + + +#define NVME_CMBMSC_SET_CRE(cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBMSC_CRE_MASK) << CMBMSC_CRE_SHIFT) +#define NVME_CMBMSC_SET_CMSE(cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBMSC_CMSE_MASK) << CMBMSC_CMSE_SHIFT) +#define NVME_CMBMSC_SET_CBA(cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBMSC_CBA_MASK) << CMBMSC_CBA_SHIFT) + +enum NvmeCmbstsShift { + CMBSTS_CBAI_SHIFT = 0, +}; +enum NvmeCmbstsMask { + CMBSTS_CBAI_MASK = 0x1, +}; + +#define NVME_CMBSTS_CBAI(cmbsts) \ + ((cmbsts >> CMBSTS_CBAI_SHIFT) & CMBSTS_CBAI_MASK) + +#define NVME_CMBSTS_SET_CBAI(cmbsts, val) \ + (cmbsts |= (uint64_t)(val & CMBSTS_CBAI_MASK) << CMBSTS_CBAI_SHIFT) + enum NvmePmrcapShift { PMRCAP_RDS_SHIFT = 3, PMRCAP_WDS_SHIFT = 4, From c2a3640de8e97bc0398976a7fc0fe9f6a088e777 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Wed, 13 Jan 2021 10:19:44 +0100 Subject: [PATCH 48/56] hw/block/nvme: bump to v1.4 With the new CMB logic in place, bump the implemented specification version to v1.4 by default. 
This requires setting the CNTRLTYPE field and modifying the VWC field,
since 0x00 is no longer a valid value for bits 2:1.

Reviewed-by: Keith Busch
Signed-off-by: Klaus Jensen
---
 hw/block/nvme.c      | 5 +++--
 include/block/nvme.h | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1e13d25b08..c4c968f595 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -108,7 +108,7 @@
 #define NVME_MAX_IOQPAIRS 0xffff
 #define NVME_DB_SIZE  4
-#define NVME_SPEC_VER 0x00010300
+#define NVME_SPEC_VER 0x00010400
 #define NVME_CMB_BIR 2
 #define NVME_PMR_BIR 4
 #define NVME_TEMPERATURE 0x143
@@ -4450,6 +4450,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->mdts = n->params.mdts;
     id->ver = cpu_to_le32(NVME_SPEC_VER);
     id->oacs = cpu_to_le16(0);
+    id->cntrltype = 0x1;

     /*
      * Because the controller always completes the Abort command immediately,
@@ -4478,7 +4479,7 @@
                            NVME_ONCS_FEATURES | NVME_ONCS_DSM |
                            NVME_ONCS_COMPARE);

-    id->vwc = 0x1;
+    id->vwc = (0x2 << 1) | 0x1;

     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
                            NVME_CTRL_SGLS_BITBUCKET);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 2e85b97a6c..07cfc92936 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -951,7 +951,8 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
     uint32_t    rtd3e;
     uint32_t    oaes;
     uint32_t    ctratt;
-    uint8_t     rsvd100[12];
+    uint8_t     rsvd100[11];
+    uint8_t     cntrltype;
     uint8_t     fguid[16];
     uint8_t     rsvd128[128];
     uint16_t    oacs;

From 38001f73403808dde35c523695ee895587bcc6ba Mon Sep 17 00:00:00 2001
From: Klaus Jensen
Date: Fri, 18 Dec 2020 00:32:57 +0100
Subject: [PATCH 49/56] hw/block/nvme: lift cmb restrictions

The controller now implements v1.4 and we can lift the restrictions on
CMB Data Pointer and Command Independent Locations Support (CDPCILS)
and CMB Data Pointer Mixed Locations Support (CDPMLS) since the device
really does not care about mixed host/cmb pointers in those cases.
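
Concretely, a PRP list may now itself reside in the CMB while its
entries point into host memory (and vice versa), and an SGL segment in
the CMB no longer requires the submission queue to reside there as
well. A hypothetical layout (addresses illustrative only) that the
removed checks used to fail with Invalid Use of Controller Memory
Buffer:

    cmd.dptr.prp2 = cmb_addr;     /* PRP list placed in the CMB ...  */
    prp_list[0]   = host_page;    /* ... with entries in host memory */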
Reviewed-by: Keith Busch 
Reviewed-by: Minwoo Im 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 33 ++-------------------------------
 1 file changed, 2 insertions(+), 31 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c4c968f595..40784bd908 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -509,7 +509,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
     trans_len = MIN(len, trans_len);
     int num_prps = (len >> n->page_bits) + 1;
     uint16_t status;
-    bool prp_list_in_cmb = false;
     int ret;
 
     QEMUSGList *qsg = &req->qsg;
@@ -535,10 +534,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
         uint32_t nents, prp_trans;
         int i = 0;
 
-        if (nvme_addr_is_cmb(n, prp2)) {
-            prp_list_in_cmb = true;
-        }
-
         nents = (len + n->page_size - 1) >> n->page_bits;
         prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
         ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
@@ -555,10 +550,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, uint64_t prp2,
                     return NVME_INVALID_PRP_OFFSET | NVME_DNR;
                 }
 
-                if (prp_list_in_cmb != nvme_addr_is_cmb(n, prp_ent)) {
-                    return NVME_INVALID_USE_OF_CMB | NVME_DNR;
-                }
-
                 i = 0;
                 nents = (len + n->page_size - 1) >> n->page_bits;
                 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
@@ -692,7 +683,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
     uint64_t nsgld;
     uint32_t seg_len;
     uint16_t status;
-    bool sgl_in_cmb = false;
     hwaddr addr;
     int ret;
 
@@ -714,18 +704,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
         goto out;
     }
 
-    /*
-     * If the segment is located in the CMB, the submission queue of the
-     * request must also reside there.
-     */
-    if (nvme_addr_is_cmb(n, addr)) {
-        if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
-            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
-        }
-
-        sgl_in_cmb = true;
-    }
-
     for (;;) {
         switch (NVME_SGL_TYPE(sgld->type)) {
         case NVME_SGL_DESCR_TYPE_SEGMENT:
@@ -814,15 +792,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
         if (status) {
             goto unmap;
         }
-
-        /*
-         * If the next segment is in the CMB, make sure that the sgl was
-         * already located there.
-         */
-        if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) {
-            status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
-            goto unmap;
-        }
     }
 
 out:
@@ -3777,6 +3746,8 @@ static int nvme_start_ctrl(NvmeCtrl *n)
 
 static void nvme_cmb_enable_regs(NvmeCtrl *n)
 {
+    NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1);
+    NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1);
     NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
 
     NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);

From 044f1876b0b00a970e65b99d9be21925cdd7dc6b Mon Sep 17 00:00:00 2001
From: Minwoo Im 
Date: Fri, 15 Jan 2021 21:19:20 +0900
Subject: [PATCH 50/56] hw/block/nvme: error if drive less than a zone size

If a user assigns a backing device with less capacity than the size of
a single zone, the namespace capacity will be reported as zero and the
kernel will silently fail to allocate the namespace.

This patch errors out if the backing device cannot accommodate at least
a single zone.
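As a worked example (with made-up numbers): a 4 MiB backing file with
4096-byte LBAs and a zone size of 4096 LBAs yields
4 MiB / 4096 B / 4096 LBAs = 0 zones, which the check below now rejects
instead of silently creating a zero-capacity namespace. A standalone
sketch of the guarded calculation:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of the geometry calculation this patch guards. */
    int main(void)
    {
        uint64_t size      = 4 * 1024 * 1024; /* backing file, bytes */
        uint64_t lbasz     = 4096;            /* logical block size  */
        uint64_t zone_size = 4096;            /* zone size, in LBAs  */

        uint64_t num_zones = size / lbasz / zone_size;

        if (!num_zones) {
            fprintf(stderr, "insufficient drive capacity for one zone\n");
            return 1;
        }

        printf("num_zones=%" PRIu64 "\n", num_zones);
        return 0;
    }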
Signed-off-by: Minwoo Im 
[k.jensen: small fixup in the error and commit message]
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 3f52acb89c..dfed71a950 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -134,6 +134,13 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
     ns->num_zones = ns->size / lbasz / ns->zone_size;
 
     /* Do a few more sanity checks of ZNS properties */
+    if (!ns->num_zones) {
+        error_setg(errp,
+                   "insufficient drive capacity, must be at least the size "
+                   "of one zone (%"PRIu64"B)", zone_size);
+        return -1;
+    }
+
     if (ns->params.max_open_zones > ns->num_zones) {
         error_setg(errp,
                    "max_open_zones value %u exceeds the number of zones %u",

From 56990c777a635ded6e2f191c470ca6410cf5c11a Mon Sep 17 00:00:00 2001
From: Gollu Appalanaidu 
Date: Sun, 24 Jan 2021 21:24:40 +0530
Subject: [PATCH 51/56] hw/block/nvme: fix set feature for error recovery

Only enable DULBE if the namespace supports it.

Signed-off-by: Gollu Appalanaidu 
Reviewed-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 40784bd908..b3d072c8b2 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3396,7 +3396,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
         }
 
         assert(ns);
-        ns->features.err_rec = dw11;
+        if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
+            ns->features.err_rec = dw11;
+        }
         break;
     case NVME_VOLATILE_WRITE_CACHE:
         for (i = 1; i <= n->num_namespaces; i++) {

From 0065f42ef1206527188a44e9c456c9b6d10de5ec Mon Sep 17 00:00:00 2001
From: Gollu Appalanaidu 
Date: Sun, 24 Jan 2021 21:05:32 +0530
Subject: [PATCH 52/56] hw/block/nvme: fix set feature save field check

Currently, no features are saveable, so the existing check is not
wrong, but add a check against the feature capabilities to make sure
this does not regress if saveable features are added later.

Signed-off-by: Gollu Appalanaidu 
Reviewed-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b3d072c8b2..c99a3fbf34 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3324,7 +3324,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
 
     trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
 
-    if (save) {
+    if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
         return NVME_FID_NOT_SAVEABLE | NVME_DNR;
     }
 
From 74eb89219e6683d62e0a547281bd890f9db0916a Mon Sep 17 00:00:00 2001
From: Gollu Appalanaidu 
Date: Sun, 24 Jan 2021 22:00:24 +0530
Subject: [PATCH 53/56] hw/block/nvme: align with existing style

Change status checks to align with the existing style and remove the
explicit check against NVME_SUCCESS.
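The convention is safe because NVME_SUCCESS is the zero status code, so
a bare truth test is equivalent to the explicit comparison. A minimal
standalone sketch (the status values here are stand-ins for the real
definitions):

    #include <stdint.h>
    #include <stdio.h>

    enum { NVME_SUCCESS = 0x0, NVME_ZONE_FULL = 0x1b9 }; /* stand-ins */

    static uint16_t check(void) { return NVME_ZONE_FULL; }

    int main(void)
    {
        uint16_t status = check();

        if (status) { /* equivalent to: status != NVME_SUCCESS */
            printf("failed with status 0x%x\n", status);
            return 1;
        }

        return 0;
    }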
Cc: Dmitry Fomichev 
Signed-off-by: Gollu Appalanaidu 
Reviewed-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Reviewed-by: Dmitry Fomichev 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c99a3fbf34..2335739bdb 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1198,7 +1198,7 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
         status = nvme_check_zone_state_for_write(zone);
     }
 
-    if (status != NVME_SUCCESS) {
+    if (status) {
         trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
     } else {
         assert(nvme_wp_is_valid(zone));
@@ -1253,7 +1253,7 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
     uint16_t status;
 
     status = nvme_check_zone_state_for_read(zone);
-    if (status != NVME_SUCCESS) {
+    if (status) {
         ;
     } else if (unlikely(end > bndry)) {
         if (!ns->params.cross_zone_read) {
@@ -1266,7 +1266,7 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
             do {
                 zone++;
                 status = nvme_check_zone_state_for_read(zone);
-                if (status != NVME_SUCCESS) {
+                if (status) {
                     break;
                 }
             } while (end > nvme_zone_rd_boundary(ns, zone));
@@ -1677,7 +1677,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
 
     if (ns->params.zoned) {
         status = nvme_check_zone_read(ns, slba, nlb);
-        if (status != NVME_SUCCESS) {
+        if (status) {
             trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
             goto invalid;
         }
@@ -1748,12 +1748,12 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
         zone = nvme_get_zone_by_slba(ns, slba);
 
         status = nvme_check_zone_write(n, ns, zone, slba, nlb, append);
-        if (status != NVME_SUCCESS) {
+        if (status) {
             goto invalid;
         }
 
         status = nvme_auto_open_zone(ns, zone);
-        if (status != NVME_SUCCESS) {
+        if (status) {
             goto invalid;
         }
 
@@ -1852,14 +1852,14 @@ static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
     switch (state) {
     case NVME_ZONE_STATE_EMPTY:
         status = nvme_aor_check(ns, 1, 0);
-        if (status != NVME_SUCCESS) {
+        if (status) {
             return status;
         }
         nvme_aor_inc_active(ns);
         /* fall through */
     case NVME_ZONE_STATE_CLOSED:
         status = nvme_aor_check(ns, 0, 1);
-        if (status != NVME_SUCCESS) {
+        if (status) {
             if (state == NVME_ZONE_STATE_EMPTY) {
                 nvme_aor_dec_active(ns);
             }
@@ -1972,7 +1972,7 @@ static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
 
     if (state == NVME_ZONE_STATE_EMPTY) {
         status = nvme_aor_check(ns, 1, 0);
-        if (status != NVME_SUCCESS) {
+        if (status) {
             return status;
         }
         nvme_aor_inc_active(ns);
@@ -3301,7 +3301,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
 
     ret = nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
                    DMA_DIRECTION_TO_DEVICE, req);
-    if (ret != NVME_SUCCESS) {
+    if (ret) {
         return ret;
     }
 
From 74cbbf3031e92c44bb138f16d3f0dc46ca2bc84c Mon Sep 17 00:00:00 2001
From: Minwoo Im 
Date: Tue, 26 Jan 2021 09:19:24 +0900
Subject: [PATCH 54/56] hw/block/nvme: fix wrong parameter name 'cross_read'

The actual parameter name is 'cross_read' rather than 'cross_zone_read'.

Signed-off-by: Minwoo Im 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 2335739bdb..e562d7467b 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -81,7 +81,7 @@
  *   The default value means there is no limit to the number of
  *   concurrently open zones.
  *
- * zoned.cross_zone_read=
+ * zoned.cross_read=
  *   Setting this property to true enables Read Across Zone Boundaries.
 */

From a679dc3efd580de67f30dcb8cb1a52a4bb559899 Mon Sep 17 00:00:00 2001
From: Klaus Jensen 
Date: Tue, 19 Jan 2021 12:42:58 +0100
Subject: [PATCH 55/56] hw/block/nvme: fix zone boundary check for append

When a zone append is processed the controller checks the validity of
the write before assigning the LBA to the append command. This causes
the boundary check to be wrong.

Fix this by checking the write *after* assigning the LBA. Remove the
append special case from nvme_check_zone_write and open-code it in
nvme_do_write, assigning the slba when basic sanity checks have been
performed. Then check the validity of the resulting write like any
other write command.

In the process, also fix a missing endianness conversion for the zone
append ALBA.

Reported-by: Niklas Cassel 
Cc: Dmitry Fomichev 
Tested-by: Niklas Cassel 
Tested-by: Dmitry Fomichev 
Reviewed-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e562d7467b..cedb4ad9ff 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1188,7 +1188,7 @@ static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
 
 static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
                                       NvmeZone *zone, uint64_t slba,
-                                      uint32_t nlb, bool append)
+                                      uint32_t nlb)
 {
     uint16_t status;
 
@@ -1202,16 +1202,8 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
         trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
     } else {
         assert(nvme_wp_is_valid(zone));
-        if (append) {
-            if (unlikely(slba != zone->d.zslba)) {
-                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
-                status = NVME_INVALID_FIELD;
-            }
-            if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
-                trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
-                status = NVME_INVALID_FIELD;
-            }
-        } else if (unlikely(slba != zone->w_ptr)) {
+
+        if (unlikely(slba != zone->w_ptr)) {
             trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
                                                zone->w_ptr);
             status = NVME_ZONE_INVALID_WRITE;
@@ -1349,10 +1341,9 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
     }
 }
 
-static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
-                                     uint32_t nlb)
+static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
+                                 uint32_t nlb)
 {
-    uint64_t result = zone->w_ptr;
     uint8_t zs;
 
     zone->w_ptr += nlb;
@@ -1368,8 +1359,6 @@ static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
             nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
         }
     }
-
-    return result;
 }
 
 static inline bool nvme_is_write(NvmeRequest *req)
@@ -1747,7 +1736,24 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
     if (ns->params.zoned) {
         zone = nvme_get_zone_by_slba(ns, slba);
 
-        status = nvme_check_zone_write(n, ns, zone, slba, nlb, append);
+        if (append) {
+            if (unlikely(slba != zone->d.zslba)) {
+                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
+                status = NVME_INVALID_FIELD;
+                goto invalid;
+            }
+
+            if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
+                trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
+                status = NVME_INVALID_FIELD;
+                goto invalid;
+            }
+
+            slba = zone->w_ptr;
+            res->slba = cpu_to_le64(slba);
+        }
+
+        status = nvme_check_zone_write(n, ns, zone, slba, nlb);
         if (status) {
             goto invalid;
         }
@@ -1757,11 +1763,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
             goto invalid;
         }
 
-        if (append) {
-            slba = zone->w_ptr;
-        }
-
-        res->slba = nvme_advance_zone_wp(ns, zone, nlb);
+        nvme_advance_zone_wp(ns, zone, nlb);
     }
 
     data_offset = nvme_l2b(ns, slba);

From 3e22762edc74be3e1ecafc361351a9640d114978 Mon Sep 17 00:00:00 2001
From: Klaus Jensen 
Date: Tue, 19 Jan 2021 14:21:50 +0100
Subject: [PATCH 56/56] hw/block/nvme: refactor the logic for zone write
 checks

Refactor the zone write check logic such that the most "meaningful"
error is returned first. That is, first, if the zone is not writable,
return an appropriate status code for that. Then, make sure we are
actually writing at the write pointer and finally check that we do not
cross the zone write boundary. This aligns with the "priority" of
status codes for zone read checks.

Also add a couple of additional descriptive trace events and remove an
always-true assert.

Cc: Dmitry Fomichev 
Tested-by: Niklas Cassel 
Tested-by: Dmitry Fomichev 
Reviewed-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c       | 49 ++++++++++++++++++++-----------------------
 hw/block/trace-events |  5 +++++
 2 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index cedb4ad9ff..5ce21b7100 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1161,56 +1161,53 @@ static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
 
 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
 {
-    uint16_t status;
+    uint64_t zslba = zone->d.zslba;
 
     switch (nvme_get_zone_state(zone)) {
     case NVME_ZONE_STATE_EMPTY:
     case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
     case NVME_ZONE_STATE_CLOSED:
-        status = NVME_SUCCESS;
-        break;
+        return NVME_SUCCESS;
     case NVME_ZONE_STATE_FULL:
-        status = NVME_ZONE_FULL;
-        break;
+        trace_pci_nvme_err_zone_is_full(zslba);
+        return NVME_ZONE_FULL;
     case NVME_ZONE_STATE_OFFLINE:
-        status = NVME_ZONE_OFFLINE;
-        break;
+        trace_pci_nvme_err_zone_is_offline(zslba);
+        return NVME_ZONE_OFFLINE;
     case NVME_ZONE_STATE_READ_ONLY:
-        status = NVME_ZONE_READ_ONLY;
-        break;
+        trace_pci_nvme_err_zone_is_read_only(zslba);
+        return NVME_ZONE_READ_ONLY;
     default:
         assert(false);
     }
 
-    return status;
+    return NVME_INTERNAL_DEV_ERROR;
 }
 
 static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
                                       NvmeZone *zone, uint64_t slba,
                                       uint32_t nlb)
 {
+    uint64_t zcap = nvme_zone_wr_boundary(zone);
     uint16_t status;
 
-    if (unlikely((slba + nlb) > nvme_zone_wr_boundary(zone))) {
-        status = NVME_ZONE_BOUNDARY_ERROR;
-    } else {
-        status = nvme_check_zone_state_for_write(zone);
-    }
-
+    status = nvme_check_zone_state_for_write(zone);
     if (status) {
-        trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
-    } else {
-        assert(nvme_wp_is_valid(zone));
-
-        if (unlikely(slba != zone->w_ptr)) {
-            trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
-                                               zone->w_ptr);
-            status = NVME_ZONE_INVALID_WRITE;
-        }
+        return status;
     }
 
-    return status;
+    if (unlikely(slba != zone->w_ptr)) {
+        trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
+        return NVME_ZONE_INVALID_WRITE;
+    }
+
+    if (unlikely((slba + nlb) > zcap)) {
+        trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
+        return NVME_ZONE_BOUNDARY_ERROR;
+    }
+
+    return NVME_SUCCESS;
 }
 
 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 87ab6c5090..d32475c398 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -129,6 +129,11 @@ pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t slba, uint64_t zslba) "
pci_nvme_err_invalid_zone_state_transition(uint8_t action, uint64_t slba, uint8_t attrs) "action=0x%"PRIx8", slba=%"PRIu64", attrs=0x%"PRIx32"" pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) "writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64"" pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at slba=%"PRIu64", but zone=%"PRIu64"" +pci_nvme_err_zone_is_full(uint64_t zslba) "zslba 0x%"PRIx64"" +pci_nvme_err_zone_is_read_only(uint64_t zslba) "zslba 0x%"PRIx64"" +pci_nvme_err_zone_is_offline(uint64_t zslba) "zslba 0x%"PRIx64"" +pci_nvme_err_zone_boundary(uint64_t slba, uint32_t nlb, uint64_t zcap) "lba 0x%"PRIx64" nlb %"PRIu32" zcap 0x%"PRIx64"" +pci_nvme_err_zone_invalid_write(uint64_t slba, uint64_t wp) "lba 0x%"PRIx64" wp 0x%"PRIx64"" pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) "slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8""
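Taken together, the resulting order of zone write checks can be
summarized by the following condensed, standalone sketch. The struct
and helper names are stand-ins for the QEMU ones, and the status values
are the low bytes of the ZNS status codes (B8h-BCh), used here only for
illustration:

    #include <stdbool.h>
    #include <stdint.h>

    /* Stand-in status codes (low bytes of the ZNS spec values). */
    enum {
        OK                  = 0x00,
        ZONE_BOUNDARY_ERROR = 0xb8,
        ZONE_FULL           = 0xb9,
        ZONE_INVALID_WRITE  = 0xbc,
    };

    struct zone {
        bool     writable; /* empty/open/closed, not full/offline/read-only */
        uint64_t wp;       /* current write pointer (absolute LBA)          */
        uint64_t wr_bndry; /* zslba + zone capacity                         */
    };

    /* Condensed check order: zone state first, then write pointer, then
     * boundary -- the most "meaningful" error is reported first. */
    static uint16_t check_zone_write(const struct zone *z, uint64_t slba,
                                     uint32_t nlb)
    {
        if (!z->writable) {
            return ZONE_FULL; /* or OFFLINE/READ_ONLY, depending on state */
        }

        if (slba != z->wp) {
            return ZONE_INVALID_WRITE;
        }

        if (slba + nlb > z->wr_bndry) {
            return ZONE_BOUNDARY_ERROR;
        }

        return OK;
    }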