From 055c6f912c8d3cd9a901972ae432c47e5872f71a Mon Sep 17 00:00:00 2001
From: Peter Lieven <pl@kamp.de>
Date: Thu, 20 Aug 2015 12:46:47 +0200
Subject: [PATCH 1/7] block/nfs: fix calculation of allocated file size

st.st_blocks is always counted in 512-byte units. Do not
use st.st_blksize as the multiplier, as it may be larger.

Cc: qemu-stable@nongnu.org
Signed-off-by: Peter Lieven <pl@kamp.de>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Message-id: 1440067607-14547-1-git-send-email-pl@kamp.de
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/nfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nfs.c b/block/nfs.c
index c026ff6883..02eb4e4643 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -475,7 +475,7 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
         aio_poll(client->aio_context, true);
     }
 
-    return (task.ret < 0 ? task.ret : st.st_blocks * st.st_blksize);
+    return (task.ret < 0 ? task.ret : st.st_blocks * 512);
 }
 
 static int nfs_file_truncate(BlockDriverState *bs, int64_t offset)

From 18a8056e0bc744e5dd2bb5cb998423b607d99f19 Mon Sep 17 00:00:00 2001
From: Peter Lieven <pl@kamp.de>
Date: Thu, 27 Aug 2015 12:30:41 +0200
Subject: [PATCH 2/7] block/nfs: cache allocated filesize for read-only files

If the file is read-only it is not expected to grow, so
skip the blocking call to nfs_fstat_async and use
the value saved at connection time. Just as important,
the monitor (and thus the main loop) will not hang
if block device info is queried while the NFS share
is unresponsive.

Signed-off-by: Peter Lieven <pl@kamp.de>
Reviewed-by: Jeff Cody <jcody@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 1440671441-7978-1-git-send-email-pl@kamp.de
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/nfs.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/block/nfs.c b/block/nfs.c
index 02eb4e4643..887a98e3fc 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -43,6 +43,7 @@ typedef struct NFSClient {
     int events;
     bool has_zero_init;
     AioContext *aio_context;
+    blkcnt_t st_blocks;
 } NFSClient;
 
 typedef struct NFSRPC {
@@ -374,6 +375,7 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename,
     }
 
     ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE);
+    client->st_blocks = st.st_blocks;
     client->has_zero_init = S_ISREG(st.st_mode);
     goto out;
 fail:
@@ -464,6 +466,11 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
     NFSRPC task = {0};
     struct stat st;
 
+    if (bdrv_is_read_only(bs) &&
+        !(bs->open_flags & BDRV_O_NOCACHE)) {
+        return client->st_blocks * 512;
+    }
+
     task.st = &st;
     if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb,
                         &task) != 0) {
@@ -484,6 +491,34 @@ static int nfs_file_truncate(BlockDriverState *bs, int64_t offset)
     return nfs_ftruncate(client->context, client->fh, offset);
 }
 
+/* Does not re-establish the NFS connection: it only rejects read-only ->
+ * read-write upgrades and refreshes the cached st_blocks on ro reopens.  */
+static int nfs_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue, Error **errp)
+{
+    NFSClient *client = state->bs->opaque;
+    struct stat st;
+    int ret = 0;
+
+    if (state->flags & BDRV_O_RDWR && bdrv_is_read_only(state->bs)) {
+        error_setg(errp, "Cannot open a read-only mount as read-write");
+        return -EACCES;
+    }
+
+    /* Update cache for read-only reopens */
+    if (!(state->flags & BDRV_O_RDWR)) {
+        ret = nfs_fstat(client->context, client->fh, &st);
+        if (ret < 0) {
+            error_setg(errp, "Failed to fstat file: %s",
+                       nfs_get_error(client->context));
+            return ret;
+        }
+        client->st_blocks = st.st_blocks;
+    }
+
+    return 0;
+}
+
 static BlockDriver bdrv_nfs = {
     .format_name                    = "nfs",
     .protocol_name                  = "nfs",
@@ -499,6 +534,7 @@ static BlockDriver bdrv_nfs = {
     .bdrv_file_open                 = nfs_file_open,
     .bdrv_close                     = nfs_file_close,
     .bdrv_create                    = nfs_file_create,
+    .bdrv_reopen_prepare            = nfs_reopen_prepare,
 
     .bdrv_co_readv                  = nfs_co_readv,
     .bdrv_co_writev                 = nfs_co_writev,

From 4da65c80921139f3e0ff63f5ea20c5d9c778364f Mon Sep 17 00:00:00 2001
From: Liu Yuan <liuyuan@cmss.chinamobile.com>
Date: Fri, 28 Aug 2015 10:53:58 +0800
Subject: [PATCH 3/7] sheepdog: add reopen support

With reopen supported, block-commit (and offline commit) is now supported for
image files whose base image uses the Sheepdog protocol driver.

Cc: qemu-devel@nongnu.org
Cc: Jeff Cody <jcody@redhat.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Liu Yuan <liuyuan@cmss.chinamobile.com>
Message-id: 1440730438-24676-1-git-send-email-namei.unix@gmail.com
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/sheepdog.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 67ca788d5c..255372eea9 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -377,6 +377,11 @@ typedef struct BDRVSheepdogState {
     QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
 } BDRVSheepdogState;
 
+typedef struct BDRVSheepdogReopenState {
+    int fd;             /* connection from get_sheep_fd() in prepare */
+    int cache_flags;    /* SD_FLAG_CMD_CACHE or SD_FLAG_CMD_DIRECT */
+} BDRVSheepdogReopenState;
+
 static const char * sd_strerror(int err)
 {
     int i;
@@ -1486,6 +1491,68 @@ out:
     return ret;
 }
 
+/* Open the new sheep connection; sd_reopen_commit() swaps it in. */
+static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue,
+                             Error **errp)
+{
+    BDRVSheepdogState *s = state->bs->opaque;
+    BDRVSheepdogReopenState *re_s;
+    int ret = 0;
+
+    re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1);
+    re_s->cache_flags = (state->flags & BDRV_O_NOCACHE) ?
+                        SD_FLAG_CMD_DIRECT : SD_FLAG_CMD_CACHE;
+
+    re_s->fd = get_sheep_fd(s, errp);
+    if (re_s->fd < 0) {
+        /* Failed: free the state here so it cannot be leaked. */
+        ret = re_s->fd;
+        g_free(re_s);
+        state->opaque = NULL;
+    }
+
+    return ret;
+}
+
+/* Switch to the connection opened by sd_reopen_prepare(), closing the old. */
+static void sd_reopen_commit(BDRVReopenState *state)
+{
+    BDRVSheepdogReopenState *re_s = state->opaque;
+    BDRVSheepdogState *s = state->bs->opaque;
+
+    /* Failure is a negative fd (see prepare), so 0 is a valid socket. */
+    if (s->fd >= 0) {
+        aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL);
+        closesocket(s->fd);
+    }
+
+    s->fd = re_s->fd;
+    s->cache_flags = re_s->cache_flags;
+
+    g_free(state->opaque);
+    state->opaque = NULL;
+}
+
+/* Undo sd_reopen_prepare(): drop the spare connection, free reopen state. */
+static void sd_reopen_abort(BDRVReopenState *state)
+{
+    BDRVSheepdogReopenState *re_s = state->opaque;
+    BDRVSheepdogState *s = state->bs->opaque;
+
+    if (re_s == NULL) {
+        return;
+    }
+
+    /* get_sheep_fd() reports failure as a negative fd; 0 is a valid fd. */
+    if (re_s->fd >= 0) {
+        aio_set_fd_handler(s->aio_context, re_s->fd, NULL, NULL, NULL);
+        closesocket(re_s->fd);
+    }
+
+    g_free(state->opaque);
+    state->opaque = NULL;
+}
+
 static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
                         Error **errp)
 {
@@ -2702,6 +2769,9 @@ static BlockDriver bdrv_sheepdog = {
     .instance_size  = sizeof(BDRVSheepdogState),
     .bdrv_needs_filename = true,
     .bdrv_file_open = sd_open,
+    .bdrv_reopen_prepare    = sd_reopen_prepare,
+    .bdrv_reopen_commit     = sd_reopen_commit,
+    .bdrv_reopen_abort      = sd_reopen_abort,
     .bdrv_close     = sd_close,
     .bdrv_create    = sd_create,
     .bdrv_has_zero_init = bdrv_has_zero_init_1,
@@ -2735,6 +2805,9 @@ static BlockDriver bdrv_sheepdog_tcp = {
     .instance_size  = sizeof(BDRVSheepdogState),
     .bdrv_needs_filename = true,
     .bdrv_file_open = sd_open,
+    .bdrv_reopen_prepare    = sd_reopen_prepare,
+    .bdrv_reopen_commit     = sd_reopen_commit,
+    .bdrv_reopen_abort      = sd_reopen_abort,
     .bdrv_close     = sd_close,
     .bdrv_create    = sd_create,
     .bdrv_has_zero_init = bdrv_has_zero_init_1,
@@ -2768,6 +2841,9 @@ static BlockDriver bdrv_sheepdog_unix = {
     .instance_size  = sizeof(BDRVSheepdogState),
     .bdrv_needs_filename = true,
     .bdrv_file_open = sd_open,
+    .bdrv_reopen_prepare    = sd_reopen_prepare,
+    .bdrv_reopen_commit     = sd_reopen_commit,
+    .bdrv_reopen_abort      = sd_reopen_abort,
     .bdrv_close     = sd_close,
     .bdrv_create    = sd_create,
     .bdrv_has_zero_init = bdrv_has_zero_init_1,

From 9568b511c9f91c3d21ea3e83426d4ee7168c98bb Mon Sep 17 00:00:00 2001
From: Wen Congyang <wency@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 11:28:32 +0800
Subject: [PATCH 4/7] block: Introduce a new API bdrv_co_no_copy_on_readv()

In some cases, we need to disable copy-on-read, and just
read the data.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Message-id: 1441682913-14320-2-git-send-email-wency@cn.fujitsu.com
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/io.c            | 12 +++++++++++-
 include/block/block.h |  9 ++++++---
 trace-events          |  1 +
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/block/io.c b/block/io.c
index d4bc83b33b..94e18e6a9d 100644
--- a/block/io.c
+++ b/block/io.c
@@ -932,7 +932,8 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
         return ret;
     }
 
-    if (bs->copy_on_read) {
+    /* Don't do copy-on-read if we read data before write operation */
+    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_COPY_ON_READ)) {
         flags |= BDRV_REQ_COPY_ON_READ;
     }
 
@@ -1001,6 +1002,15 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
 }
 
+int coroutine_fn bdrv_co_no_copy_on_readv(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+    trace_bdrv_co_no_copy_on_readv(bs, sector_num, nb_sectors);
+    /* flag makes bdrv_co_do_preadv() skip COR even if bs->copy_on_read */
+    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
+                            BDRV_REQ_NO_COPY_ON_READ);
+}
+
 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
diff --git a/include/block/block.h b/include/block/block.h
index ef67353108..2dd66300ed 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -51,15 +51,16 @@ typedef struct BlockFragInfo {
 } BlockFragInfo;
 
 typedef enum {
-    BDRV_REQ_COPY_ON_READ = 0x1,
-    BDRV_REQ_ZERO_WRITE   = 0x2,
+    BDRV_REQ_COPY_ON_READ       = 0x1,
+    BDRV_REQ_ZERO_WRITE         = 0x2,
     /* The BDRV_REQ_MAY_UNMAP flag is used to indicate that the block driver
      * is allowed to optimize a write zeroes request by unmapping (discarding)
      * blocks if it is guaranteed that the result will read back as
      * zeroes. The flag is only passed to the driver if the block device is
      * opened with BDRV_O_UNMAP.
      */
-    BDRV_REQ_MAY_UNMAP    = 0x4,
+    BDRV_REQ_MAY_UNMAP          = 0x4,
+    BDRV_REQ_NO_COPY_ON_READ    = 0x8,
 } BdrvRequestFlags;
 
 typedef struct BlockSizes {
@@ -252,6 +253,8 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+int coroutine_fn bdrv_co_no_copy_on_readv(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
 /*
diff --git a/trace-events b/trace-events
index 25c53e0c7c..a70ea9c3ae 100644
--- a/trace-events
+++ b/trace-events
@@ -69,6 +69,7 @@ bdrv_aio_write_zeroes(void *bs, int64_t sector_num, int nb_sectors, int flags, v
 bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
 bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
+bdrv_co_no_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector, int flags) "bs %p sector_num %"PRId64" nb_sectors %d flags %#x"
 bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"

From 06c3916b35a1cf6db548450a0cfb96983c33c82f Mon Sep 17 00:00:00 2001
From: Wen Congyang <wency@cn.fujitsu.com>
Date: Tue, 8 Sep 2015 11:28:33 +0800
Subject: [PATCH 5/7] Backup: don't do copy-on-read in before_write_notifier

We will copy data in before_write_notifier to do backup.
It is a nested I/O request, so we cannot do copy-on-read.

The steps to reproduce it:
1. -drive copy-on-read=on,...  // qemu option
2. drive_backup -f disk0 /path_to_backup.img // monitor command

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Tested-by: Jeff Cody <jcody@redhat.com>
Message-id: 1441682913-14320-3-git-send-email-wency@cn.fujitsu.com
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/backup.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 965654d521..5696431711 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -89,7 +89,8 @@ static void cow_request_end(CowRequest *req)
 
 static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                                       int64_t sector_num, int nb_sectors,
-                                      bool *error_is_read)
+                                      bool *error_is_read,
+                                      bool is_write_notifier)
 {
     BackupBlockJob *job = (BackupBlockJob *)bs->job;
     CowRequest cow_request;
@@ -129,8 +130,14 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
         iov.iov_len = n * BDRV_SECTOR_SIZE;
         qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 
-        ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
-                            &bounce_qiov);
+        if (is_write_notifier) {
+            ret = bdrv_co_no_copy_on_readv(bs,
+                                           start * BACKUP_SECTORS_PER_CLUSTER,
+                                           n, &bounce_qiov);
+        } else {
+            ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
+                                &bounce_qiov);
+        }
         if (ret < 0) {
             trace_backup_do_cow_read_fail(job, start, ret);
             if (error_is_read) {
@@ -190,7 +197,7 @@ static int coroutine_fn backup_before_write_notify(
     assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
     assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 
-    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL);
+    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true);
 }
 
 static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -303,7 +310,8 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
                     return ret;
                 }
                 ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
-                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
+                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read,
+                                    false);
                 if ((ret < 0) &&
                     backup_error_action(job, error_is_read, -ret) ==
                     BLOCK_ERROR_ACTION_REPORT) {
@@ -408,7 +416,7 @@ static void coroutine_fn backup_run(void *opaque)
             }
             /* FULL sync mode we copy the whole drive. */
             ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
-                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
+                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read, false);
             if (ret < 0) {
                 /* Depending on error action, fail now or retry cluster */
                 BlockErrorAction action =

From 498f21405a286f718a0767c791b7d2db19f4e5bd Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Date: Tue, 1 Sep 2015 12:03:09 +0900
Subject: [PATCH 6/7] sheepdog: use per AIOCB dirty indexes for non overlapping
 requests

In commit 96b14ff85acf, requests for overlapping areas are
serialized. However, that does not cover the case of non-overlapping
requests. In such a case, min_dirty_data_idx and max_dirty_data_idx
can be overwritten by concurrent requests and an invalid inode update
can happen, e.g. when create(1, 2) and create(3, 4) are issued in
parallel.

This patch lets SheepdogAIOCB have dirty data indexes instead of
BDRVSheepdogState for avoiding the above situation.

This patch also does trivial renaming for better description:
overwrapping -> overlapping

Cc: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
Cc: Vasiliy Tolstov <v.tolstov@selfip.ru>
Cc: Jeff Cody <jcody@redhat.com>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Tested-by: Vasiliy Tolstov <v.tolstov@selfip.ru>
Message-id: 1441076590-8015-2-git-send-email-mitake.hitoshi@lab.ntt.co.jp
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/sheepdog.c | 63 +++++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 255372eea9..08a09e9683 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -318,7 +318,7 @@ enum AIOCBState {
     AIOCB_DISCARD_OBJ,
 };
 
-#define AIOCBOverwrapping(x, y)                                 \
+#define AIOCBOverlapping(x, y)                                 \
     (!(x->max_affect_data_idx < y->min_affect_data_idx          \
        || y->max_affect_data_idx < x->min_affect_data_idx))
 
@@ -342,6 +342,15 @@ struct SheepdogAIOCB {
     uint32_t min_affect_data_idx;
     uint32_t max_affect_data_idx;
 
+    /*
+     * The difference between affect_data_idx and dirty_data_idx:
+     * affect_data_idx represents range of index of all request types.
+     * dirty_data_idx represents range of index updated by COW requests.
+     * dirty_data_idx is used for updating an inode object.
+     */
+    uint32_t min_dirty_data_idx;
+    uint32_t max_dirty_data_idx;
+
     QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings;
 };
 
@@ -351,9 +360,6 @@ typedef struct BDRVSheepdogState {
 
     SheepdogInode inode;
 
-    uint32_t min_dirty_data_idx;
-    uint32_t max_dirty_data_idx;
-
     char name[SD_MAX_VDI_LEN];
     bool is_snapshot;
     uint32_t cache_flags;
@@ -373,7 +379,7 @@ typedef struct BDRVSheepdogState {
     QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head;
     QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head;
 
-    CoQueue overwrapping_queue;
+    CoQueue overlapping_queue;
     QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head;
 } BDRVSheepdogState;
 
@@ -561,6 +567,9 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
     acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE +
                               acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size;
 
+    acb->min_dirty_data_idx = UINT32_MAX;
+    acb->max_dirty_data_idx = 0;
+
     return acb;
 }
 
@@ -824,8 +833,8 @@ static void coroutine_fn aio_read_response(void *opaque)
              */
             if (rsp.result == SD_RES_SUCCESS) {
                 s->inode.data_vdi_id[idx] = s->inode.vdi_id;
-                s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
-                s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
+                acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx);
+                acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx);
             }
         }
         break;
@@ -1471,13 +1480,11 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     memcpy(&s->inode, buf, sizeof(s->inode));
-    s->min_dirty_data_idx = UINT32_MAX;
-    s->max_dirty_data_idx = 0;
 
     bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
     pstrcpy(s->name, sizeof(s->name), vdi);
     qemu_co_mutex_init(&s->lock);
-    qemu_co_queue_init(&s->overwrapping_queue);
+    qemu_co_queue_init(&s->overlapping_queue);
     qemu_opts_del(opts);
     g_free(buf);
     return 0;
@@ -1989,16 +1996,16 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
     AIOReq *aio_req;
     uint32_t offset, data_len, mn, mx;
 
-    mn = s->min_dirty_data_idx;
-    mx = s->max_dirty_data_idx;
+    mn = acb->min_dirty_data_idx;
+    mx = acb->max_dirty_data_idx;
     if (mn <= mx) {
         /* we need to update the vdi object. */
         offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
             mn * sizeof(s->inode.data_vdi_id[0]);
         data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
 
-        s->min_dirty_data_idx = UINT32_MAX;
-        s->max_dirty_data_idx = 0;
+        acb->min_dirty_data_idx = UINT32_MAX;
+        acb->max_dirty_data_idx = 0;
 
         iov.iov_base = &s->inode;
         iov.iov_len = sizeof(s->inode);
@@ -2224,12 +2231,12 @@ out:
     return 1;
 }
 
-static bool check_overwrapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
+static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb)
 {
     SheepdogAIOCB *cb;
 
     QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
-        if (AIOCBOverwrapping(aiocb, cb)) {
+        if (AIOCBOverlapping(aiocb, cb)) {
             return true;
         }
     }
@@ -2258,15 +2265,15 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
     acb->aiocb_type = AIOCB_WRITE_UDATA;
 
 retry:
-    if (check_overwrapping_aiocb(s, acb)) {
-        qemu_co_queue_wait(&s->overwrapping_queue);
+    if (check_overlapping_aiocb(s, acb)) {
+        qemu_co_queue_wait(&s->overlapping_queue);
         goto retry;
     }
 
     ret = sd_co_rw_vector(acb);
     if (ret <= 0) {
         QLIST_REMOVE(acb, aiocb_siblings);
-        qemu_co_queue_restart_all(&s->overwrapping_queue);
+        qemu_co_queue_restart_all(&s->overlapping_queue);
         qemu_aio_unref(acb);
         return ret;
     }
@@ -2274,7 +2281,7 @@ retry:
     qemu_coroutine_yield();
 
     QLIST_REMOVE(acb, aiocb_siblings);
-    qemu_co_queue_restart_all(&s->overwrapping_queue);
+    qemu_co_queue_restart_all(&s->overlapping_queue);
 
     return acb->ret;
 }
@@ -2291,15 +2298,15 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
     acb->aio_done_func = sd_finish_aiocb;
 
 retry:
-    if (check_overwrapping_aiocb(s, acb)) {
-        qemu_co_queue_wait(&s->overwrapping_queue);
+    if (check_overlapping_aiocb(s, acb)) {
+        qemu_co_queue_wait(&s->overlapping_queue);
         goto retry;
     }
 
     ret = sd_co_rw_vector(acb);
     if (ret <= 0) {
         QLIST_REMOVE(acb, aiocb_siblings);
-        qemu_co_queue_restart_all(&s->overwrapping_queue);
+        qemu_co_queue_restart_all(&s->overlapping_queue);
         qemu_aio_unref(acb);
         return ret;
     }
@@ -2307,7 +2314,7 @@ retry:
     qemu_coroutine_yield();
 
     QLIST_REMOVE(acb, aiocb_siblings);
-    qemu_co_queue_restart_all(&s->overwrapping_queue);
+    qemu_co_queue_restart_all(&s->overlapping_queue);
     return acb->ret;
 }
 
@@ -2656,15 +2663,15 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
     acb->aio_done_func = sd_finish_aiocb;
 
 retry:
-    if (check_overwrapping_aiocb(s, acb)) {
-        qemu_co_queue_wait(&s->overwrapping_queue);
+    if (check_overlapping_aiocb(s, acb)) {
+        qemu_co_queue_wait(&s->overlapping_queue);
         goto retry;
     }
 
     ret = sd_co_rw_vector(acb);
     if (ret <= 0) {
         QLIST_REMOVE(acb, aiocb_siblings);
-        qemu_co_queue_restart_all(&s->overwrapping_queue);
+        qemu_co_queue_restart_all(&s->overlapping_queue);
         qemu_aio_unref(acb);
         return ret;
     }
@@ -2672,7 +2679,7 @@ retry:
     qemu_coroutine_yield();
 
     QLIST_REMOVE(acb, aiocb_siblings);
-    qemu_co_queue_restart_all(&s->overwrapping_queue);
+    qemu_co_queue_restart_all(&s->overlapping_queue);
 
     return acb->ret;
 }

From e6fd57ea297ec3aad32b24090c5d3757a99df3fe Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Date: Tue, 1 Sep 2015 12:03:10 +0900
Subject: [PATCH 7/7] sheepdog: refine discard support

This patch refines discard support of the sheepdog driver. The
existing discard mechanism was implemented on SD_OP_DISCARD_OBJ, which
was introduced before fine grained reference counting on newer
sheepdog. It doesn't care about relations of snapshots and clones and
discards objects unconditionally.

With this patch, the driver just updates the inode object to update the
reference. Removing the object is done on the sheep process side.

Cc: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
Cc: Vasiliy Tolstov <v.tolstov@selfip.ru>
Cc: Jeff Cody <jcody@redhat.com>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Tested-by: Vasiliy Tolstov <v.tolstov@selfip.ru>
Message-id: 1441076590-8015-3-git-send-email-mitake.hitoshi@lab.ntt.co.jp
Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 block/sheepdog.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 08a09e9683..e7e58b782c 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -28,7 +28,6 @@
 #define SD_OP_READ_OBJ       0x02
 #define SD_OP_WRITE_OBJ      0x03
 /* 0x04 is used internally by Sheepdog */
-#define SD_OP_DISCARD_OBJ    0x05
 
 #define SD_OP_NEW_VDI        0x11
 #define SD_OP_LOCK_VDI       0x12
@@ -861,10 +860,6 @@ static void coroutine_fn aio_read_response(void *opaque)
             rsp.result = SD_RES_SUCCESS;
             s->discard_supported = false;
             break;
-        case SD_RES_SUCCESS:
-            idx = data_oid_to_idx(aio_req->oid);
-            s->inode.data_vdi_id[idx] = 0;
-            break;
         default:
             break;
         }
@@ -1179,7 +1174,13 @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
         hdr.flags = SD_FLAG_CMD_WRITE | flags;
         break;
     case AIOCB_DISCARD_OBJ:
-        hdr.opcode = SD_OP_DISCARD_OBJ;
+        hdr.opcode = SD_OP_WRITE_OBJ;
+        hdr.flags = SD_FLAG_CMD_WRITE | flags;
+        s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0;
+        offset = offsetof(SheepdogInode,
+                          data_vdi_id[data_oid_to_idx(oid)]);
+        oid = vid_to_vdi_oid(s->inode.vdi_id);
+        wlen = datalen = sizeof(uint32_t);
         break;
     }
 
@@ -2214,7 +2215,9 @@ static int coroutine_fn sd_co_rw_vector(void *p)
         }
 
         aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create,
-                                old_oid, done);
+                                old_oid,
+                                acb->aiocb_type == AIOCB_DISCARD_OBJ ?
+                                0 : done);
         QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
 
         add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov,
@@ -2650,15 +2653,23 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
                                       int nb_sectors)
 {
     SheepdogAIOCB *acb;
-    QEMUIOVector dummy;
     BDRVSheepdogState *s = bs->opaque;
     int ret;
+    QEMUIOVector discard_iov;
+    struct iovec iov;
+    uint32_t zero = 0;
 
     if (!s->discard_supported) {
             return 0;
     }
 
-    acb = sd_aio_setup(bs, &dummy, sector_num, nb_sectors);
+    memset(&discard_iov, 0, sizeof(discard_iov));
+    memset(&iov, 0, sizeof(iov));
+    iov.iov_base = &zero;
+    iov.iov_len = sizeof(zero);
+    discard_iov.iov = &iov;
+    discard_iov.niov = 1;
+    acb = sd_aio_setup(bs, &discard_iov, sector_num, nb_sectors);
     acb->aiocb_type = AIOCB_DISCARD_OBJ;
     acb->aio_done_func = sd_finish_aiocb;