From 461bba04bff0b3712a02fe49812b497c758e78da Mon Sep 17 00:00:00 2001
From: Maxim Levitsky <mlevitsk@redhat.com>
Date: Tue, 16 Jul 2019 19:30:18 +0300
Subject: [PATCH 1/5] block/nvme: fix doorbell stride

Fix the doorbell index math for a non-standard doorbell stride: the
stride must scale the whole doorbell index (idx * 2 + 1), not just
idx * 2.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190716163020.13383-2-mlevitsk@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index 9896b7f7c6..82fdefccd6 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -217,7 +217,7 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
         error_propagate(errp, local_err);
         goto fail;
     }
-    q->cq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale + 1];
+    q->cq.doorbell = &s->regs->doorbells[(idx * 2 + 1) * s->doorbell_scale];
 
     return q;
 fail:

From 118d1b6a81c7c22023ab1c3aad46d37184d1d838 Mon Sep 17 00:00:00 2001
From: Maxim Levitsky <mlevitsk@redhat.com>
Date: Tue, 16 Jul 2019 19:30:19 +0300
Subject: [PATCH 2/5] block/nvme: support devices with sectors larger than 512 bytes

Currently the driver hardcodes the sector size to 512,
and doesn't check the underlying device. Fix that.

Also fail if the underlying NVMe device is formatted with metadata,
as this needs special support.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Message-id: 20190716163020.13383-3-mlevitsk@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/nvme.c | 45 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 82fdefccd6..35ce10dc79 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -102,8 +102,11 @@ typedef struct {
     size_t doorbell_scale;
     bool write_cache_supported;
     EventNotifier irq_notifier;
+
     uint64_t nsze; /* Namespace size reported by identify command */
     int nsid;      /* The namespace id to read/write data. */
+    size_t blkshift;
+
     uint64_t max_transfer;
     bool plugged;
 
@@ -418,8 +421,9 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     BDRVNVMeState *s = bs->opaque;
     NvmeIdCtrl *idctrl;
     NvmeIdNs *idns;
+    NvmeLBAF *lbaf;
     uint8_t *resp;
-    int r;
+    int r, hwsect_size;
     uint64_t iova;
     NvmeCmd cmd = {
         .opcode = NVME_ADM_CMD_IDENTIFY,
@@ -466,7 +470,22 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
     }
 
     s->nsze = le64_to_cpu(idns->nsze);
+    lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
 
+    if (lbaf->ms) {
+        error_setg(errp, "Namespaces with metadata are not yet supported");
+        goto out;
+    }
+
+    hwsect_size = 1 << lbaf->ds;
+
+    if (hwsect_size < BDRV_SECTOR_SIZE || hwsect_size > s->page_size) {
+        error_setg(errp, "Namespace has unsupported block size (%d)",
+                hwsect_size);
+        goto out;
+    }
+
+    s->blkshift = lbaf->ds;
 out:
     qemu_vfio_dma_unmap(s->vfio, resp);
     qemu_vfree(resp);
@@ -785,8 +804,22 @@ fail:
 static int64_t nvme_getlength(BlockDriverState *bs)
 {
     BDRVNVMeState *s = bs->opaque;
+    return s->nsze << s->blkshift;
+}
 
-    return s->nsze << BDRV_SECTOR_BITS;
+static int64_t nvme_get_blocksize(BlockDriverState *bs)
+{
+    BDRVNVMeState *s = bs->opaque;
+    assert(s->blkshift >= BDRV_SECTOR_BITS);
+    return 1 << s->blkshift;
+}
+
+static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
+{
+    int64_t blocksize = nvme_get_blocksize(bs);
+    bsz->phys = blocksize;
+    bsz->log = blocksize;
+    return 0;
 }
 
 /* Called with s->dma_map_lock */
@@ -917,13 +950,14 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
     BDRVNVMeState *s = bs->opaque;
     NVMeQueuePair *ioq = s->queues[1];
     NVMeRequest *req;
-    uint32_t cdw12 = (((bytes >> BDRV_SECTOR_BITS) - 1) & 0xFFFF) |
+
+    uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
                        (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
     NvmeCmd cmd = {
         .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
         .nsid = cpu_to_le32(s->nsid),
-        .cdw10 = cpu_to_le32((offset >> BDRV_SECTOR_BITS) & 0xFFFFFFFF),
-        .cdw11 = cpu_to_le32(((offset >> BDRV_SECTOR_BITS) >> 32) & 0xFFFFFFFF),
+        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
+        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
         .cdw12 = cpu_to_le32(cdw12),
     };
     NVMeCoData data = {
@@ -1154,6 +1188,7 @@ static BlockDriver bdrv_nvme = {
     .bdrv_file_open           = nvme_file_open,
     .bdrv_close               = nvme_close,
     .bdrv_getlength           = nvme_getlength,
+    .bdrv_probe_blocksizes    = nvme_probe_blocksizes,
 
     .bdrv_co_preadv           = nvme_co_preadv,
     .bdrv_co_pwritev          = nvme_co_pwritev,

From 258867d1dc32c300690cc32bfcf3e648ae12c4c9 Mon Sep 17 00:00:00 2001
From: Maxim Levitsky <mlevitsk@redhat.com>
Date: Tue, 16 Jul 2019 19:30:20 +0300
Subject: [PATCH 3/5] block/nvme: don't touch the completion entries

Completion entries are meant to be written only by the device and only read
by the host. The driver is supposed to scan the completions starting from
where it last left off, until it sees a completion whose phase bit has not
flipped.

Signed-off-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190716163020.13383-4-mlevitsk@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/nvme.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 35ce10dc79..c28755cc31 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -318,7 +318,7 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
     while (q->inflight) {
         int16_t cid;
         c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
-        if (!c->cid || (le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
+        if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
             break;
         }
         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
@@ -342,10 +342,7 @@ static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
         qemu_mutex_unlock(&q->lock);
         req.cb(req.opaque, nvme_translate_error(c));
         qemu_mutex_lock(&q->lock);
-        c->cid = cpu_to_le16(0);
         q->inflight--;
-        /* Flip Phase Tag bit. */
-        c->status = cpu_to_le16(le16_to_cpu(c->status) ^ 0x1);
         progress = true;
     }
     if (progress) {

From 65181d63817b33b10ecb1c418eb96c99e7cf8842 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Mon, 22 Jul 2019 15:30:53 +0200
Subject: [PATCH 4/5] block: Dec. drained_end_counter before bdrv_wakeup

Decrementing drained_end_counter after bdrv_dec_in_flight() (which in
turn invokes bdrv_wakeup() and thus aio_wait_kick()) is not very clever.
We should decrement it beforehand, so that any waiting aio_poll() that
is woken by bdrv_dec_in_flight() sees the decremented
drained_end_counter.

Because the time window between decrementing drained_end_counter and
aio_wait_kick() is very small, I cannot supply a reliable regression
test.  However, running e.g. the /bdrv-drain/blockjob/iothread/drain_all
test in test-bdrv-drain has a small chance of hanging without this
patch (about 1/200 or so; it gets to nearly 100 % if you add e.g. an
fputc(' ', stderr); after the bdrv_dec_in_flight()).

Fixes: e037c09c78520cbdb6da7cfc6ad0256d5870b814
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190722133054.21781-2-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/io.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/io.c b/block/io.c
index b89e155d21..06305c6ea6 100644
--- a/block/io.c
+++ b/block/io.c
@@ -217,13 +217,12 @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
         bs->drv->bdrv_co_drain_end(bs);
     }
 
-    /* Set data->done before reading bs->wakeup.  */
+    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
     atomic_mb_set(&data->done, true);
-    bdrv_dec_in_flight(bs);
-
     if (!data->begin) {
         atomic_dec(data->drained_end_counter);
     }
+    bdrv_dec_in_flight(bs);
 
     g_free(data);
 }

From 43eaaaef0e18817bf78d8f135993f8579cad2cc6 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Mon, 22 Jul 2019 15:30:54 +0200
Subject: [PATCH 5/5] block: Only the main loop can change AioContexts

bdrv_set_aio_context_ignore() can only work in the main loop:
bdrv_drained_begin() only works in the main loop and the node's (old)
AioContext; and bdrv_drained_end() really only works in the main loop
and the node's (new) AioContext (contrary to its current comment, which
is just wrong).

Consequently, bdrv_set_aio_context_ignore() must be called from the
main loop.  Luckily, assuming that we can make block graph changes only
from the main loop as well, all its callers do that already.

Note that changing a node's context in a sense is an operation that
changes the block graph, so it actually makes sense to require this
function to be called from the main loop.

Also, fix bdrv_drained_end()'s description.  You can only use it from
the main loop or the node's AioContext, and in the latter case, the
whole subtree must be in the same context.

Fixes: e037c09c78520cbdb6da7cfc6ad0256d5870b814
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20190722133054.21781-3-mreitz@redhat.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block.c               | 13 ++++++++-----
 include/block/block.h |  8 +++-----
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/block.c b/block.c
index 9c94f7f28a..cbd8da5f3b 100644
--- a/block.c
+++ b/block.c
@@ -5914,6 +5914,8 @@ static void bdrv_attach_aio_context(BlockDriverState *bs,
  * Changes the AioContext used for fd handlers, timers, and BHs by this
  * BlockDriverState and all its children and parents.
  *
+ * Must be called from the main AioContext.
+ *
  * The caller must own the AioContext lock for the old AioContext of bs, but it
  * must not own the AioContext lock for new_context (unless new_context is the
  * same as the current context of bs).
@@ -5925,9 +5927,10 @@ void bdrv_set_aio_context_ignore(BlockDriverState *bs,
                                  AioContext *new_context, GSList **ignore)
 {
     AioContext *old_context = bdrv_get_aio_context(bs);
-    AioContext *current_context = qemu_get_current_aio_context();
     BdrvChild *child;
 
+    g_assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+
     if (old_context == new_context) {
         return;
     }
@@ -5953,7 +5956,7 @@ void bdrv_set_aio_context_ignore(BlockDriverState *bs,
     bdrv_detach_aio_context(bs);
 
     /* Acquire the new context, if necessary */
-    if (current_context != new_context) {
+    if (qemu_get_aio_context() != new_context) {
         aio_context_acquire(new_context);
     }
 
@@ -5965,16 +5968,16 @@ void bdrv_set_aio_context_ignore(BlockDriverState *bs,
      * subtree that have not yet been moved to the new AioContext.
      * Release the old one so bdrv_drained_end() can poll them.
      */
-    if (current_context != old_context) {
+    if (qemu_get_aio_context() != old_context) {
         aio_context_release(old_context);
     }
 
     bdrv_drained_end(bs);
 
-    if (current_context != old_context) {
+    if (qemu_get_aio_context() != old_context) {
         aio_context_acquire(old_context);
     }
-    if (current_context != new_context) {
+    if (qemu_get_aio_context() != new_context) {
         aio_context_release(new_context);
     }
 }
diff --git a/include/block/block.h b/include/block/block.h
index 60f00479e0..50a07c1c33 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -667,11 +667,9 @@ void bdrv_subtree_drained_begin(BlockDriverState *bs);
  *
  * This polls @bs's AioContext until all scheduled sub-drained_ends
  * have settled.  On one hand, that may result in graph changes.  On
- * the other, this requires that all involved nodes (@bs and all of
- * its parents) are in the same AioContext, and that the caller has
- * acquired it.
- * If there are any nodes that are in different contexts from @bs,
- * these contexts must not be acquired.
+ * the other, this requires that the caller either runs in the main
+ * loop; or that all involved nodes (@bs and all of its parents) are
+ * in the caller's AioContext.
  */
 void bdrv_drained_end(BlockDriverState *bs);