From 28fa7133b89555355418aeee4fca01cae2081aeb Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Fri, 15 Aug 2014 13:32:37 +0200
Subject: [PATCH 01/35] ide: Fix bootindex for bus_id > 9

We identify devices by their Open Firmware device paths.  The encoding
of bus numbers is incorrect: idebus_get_fw_dev_path() formats them in
decimal, while SeaBIOS uses hexadecimal.  With bus number > 9, SeaBIOS
will miss the bootindex (lucky case), or apply it to another device
(unlucky case).

Bug can't bite right now: ich9-ahci has six ports, and the sysbus-ahci
created by Calxeda Highbank has just one.

Fix it anyway, by changing %d to %x.

I couldn't find an Open Firmware spec covering this.  For what it's
worth, OVMF agrees with SeaBIOS.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/ide/qdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/ide/qdev.c b/hw/ide/qdev.c
index b4a467116e..efab95b320 100644
--- a/hw/ide/qdev.c
+++ b/hw/ide/qdev.c
@@ -59,7 +59,7 @@ static char *idebus_get_fw_dev_path(DeviceState *dev)
 {
     char path[30];
 
-    snprintf(path, sizeof(path), "%s@%d", qdev_fw_name(dev),
+    snprintf(path, sizeof(path), "%s@%x", qdev_fw_name(dev),
              ((IDEBus*)dev->parent_bus)->bus_id);
 
     return g_strdup(path);

From 212aefaa53d142baa9a22f5aadd2e72eb916c0c0 Mon Sep 17 00:00:00 2001
From: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
Date: Wed, 13 Aug 2014 12:44:27 -0300
Subject: [PATCH 02/35] block.curl: adding 'timeout' option

The curl hardcoded timeout (5 seconds) sometimes is not long
enough depending on the remote server configuration and network
traffic. The user should be able to set how much long he is
willing to wait for the connection.

Adding a new option to set this timeout gives the user this
flexibility. The previous default timeout of 5 seconds will be
used if this option is not present.

Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Daniel Henrique Barboza <danielhb@linux.vnet.ibm.com>
Reviewed-by: Benoit Canet <benoit.canet@nodalink.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/curl.c    | 13 ++++++++++++-
 qemu-options.hx | 10 ++++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index d4b85d20a5..2698ae31e8 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -63,6 +63,7 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
 #define CURL_NUM_ACB    8
 #define SECTOR_SIZE     512
 #define READ_AHEAD_DEFAULT (256 * 1024)
+#define CURL_TIMEOUT_DEFAULT 5
 
 #define FIND_RET_NONE   0
 #define FIND_RET_OK     1
@@ -71,6 +72,7 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
 #define CURL_BLOCK_OPT_URL       "url"
 #define CURL_BLOCK_OPT_READAHEAD "readahead"
 #define CURL_BLOCK_OPT_SSLVERIFY "sslverify"
+#define CURL_BLOCK_OPT_TIMEOUT "timeout"
 
 struct BDRVCURLState;
 
@@ -109,6 +111,7 @@ typedef struct BDRVCURLState {
     char *url;
     size_t readahead_size;
     bool sslverify;
+    int timeout;
     bool accept_range;
     AioContext *aio_context;
 } BDRVCURLState;
@@ -382,7 +385,7 @@ static CURLState *curl_init_state(BDRVCURLState *s)
         curl_easy_setopt(state->curl, CURLOPT_URL, s->url);
         curl_easy_setopt(state->curl, CURLOPT_SSL_VERIFYPEER,
                          (long) s->sslverify);
-        curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, 5);
+        curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, s->timeout);
         curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION,
                          (void *)curl_read_cb);
         curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state);
@@ -489,6 +492,11 @@ static QemuOptsList runtime_opts = {
             .type = QEMU_OPT_BOOL,
             .help = "Verify SSL certificate"
         },
+        {
+            .name = CURL_BLOCK_OPT_TIMEOUT,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Curl timeout"
+        },
         { /* end of list */ }
     },
 };
@@ -525,6 +533,9 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
         goto out_noclean;
     }
 
+    s->timeout = qemu_opt_get_number(opts, CURL_BLOCK_OPT_TIMEOUT,
+                                     CURL_TIMEOUT_DEFAULT);
+
     s->sslverify = qemu_opt_get_bool(opts, CURL_BLOCK_OPT_SSLVERIFY, true);
 
     file = qemu_opt_get(opts, CURL_BLOCK_OPT_URL);
diff --git a/qemu-options.hx b/qemu-options.hx
index c573dd8893..52d56f4a1c 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2351,6 +2351,11 @@ multiple of 512 bytes. It defaults to 256k.
 @item sslverify
 Whether to verify the remote server's certificate when connecting over SSL. It
 can have the value 'on' or 'off'. It defaults to 'on'.
+
+@item timeout
+Set the timeout in seconds of the CURL connection. This timeout is the time
+that CURL waits for a response from the remote server to get the size of the
+image to be downloaded. If not set, the default timeout of 5 seconds is used.
 @end table
 
 Note that when passing options to qemu explicitly, @option{driver} is the value
@@ -2372,9 +2377,10 @@ qemu-system-x86_64 -drive file=/tmp/Fedora-x86_64-20-20131211.1-sda.qcow2,copy-o
 @end example
 
 Example: boot from an image stored on a VMware vSphere server with a self-signed
-certificate using a local overlay for writes and a readahead of 64k
+certificate using a local overlay for writes, a readahead of 64k and a timeout
+of 10 seconds.
 @example
-qemu-img create -f qcow2 -o backing_file='json:@{"file.driver":"https",, "file.url":"https://user:password@@vsphere.example.com/folder/test/test-flat.vmdk?dcPath=Datacenter&dsName=datastore1",, "file.sslverify":"off",, "file.readahead":"64k"@}' /tmp/test.qcow2
+qemu-img create -f qcow2 -o backing_file='json:@{"file.driver":"https",, "file.url":"https://user:password@@vsphere.example.com/folder/test/test-flat.vmdk?dcPath=Datacenter&dsName=datastore1",, "file.sslverify":"off",, "file.readahead":"64k",, "file.timeout":10@}' /tmp/test.qcow2
 
 qemu-system-x86_64 -drive file=/tmp/test.qcow2
 @end example

From a3981eb978b76e75aca3d04f6c6e26de8322001b Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 26 Aug 2014 19:17:54 +0100
Subject: [PATCH 03/35] qemu-img: fix img_commit() error return value

The img_commit() return value is a process exit code.  Use 1 for failure
instead of -1.  The other failure paths in this function already use 1.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 qemu-img.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qemu-img.c b/qemu-img.c
index 2052b14b84..863798e54b 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -752,7 +752,7 @@ static int img_commit(int argc, char **argv)
     ret = bdrv_parse_cache_flags(cache, &flags);
     if (ret < 0) {
         error_report("Invalid cache option: %s", cache);
-        return -1;
+        return 1;
     }
 
     bs = bdrv_new_open("image", filename, fmt, flags, true, quiet);

From cbda016d94017fad3be1c657f0ad98f88395c12a Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 26 Aug 2014 19:17:55 +0100
Subject: [PATCH 04/35] qemu-img: fix img_compare() flags error path

If img_compare() fails to parse the cache flags the goto out3 code path
will call qemu_progress_end().  Make sure we actually call
qemu_progress_init() first.

Reported-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 qemu-img.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 863798e54b..a761c36e7b 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -999,6 +999,9 @@ static int img_compare(int argc, char **argv)
     filename1 = argv[optind++];
     filename2 = argv[optind++];
 
+    /* Initialize before goto out */
+    qemu_progress_init(progress, 2.0);
+
     flags = BDRV_O_FLAGS;
     ret = bdrv_parse_cache_flags(cache, &flags);
     if (ret < 0) {
@@ -1007,9 +1010,6 @@ static int img_compare(int argc, char **argv)
         goto out3;
     }
 
-    /* Initialize before goto out */
-    qemu_progress_init(progress, 2.0);
-
     bs1 = bdrv_new_open("image 1", filename1, fmt1, flags, true, quiet);
     if (!bs1) {
         error_report("Can't open file %s", filename1);

From 40ed35a3c4f7d26247cbbb01a2b3ff544fb50819 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 26 Aug 2014 19:17:56 +0100
Subject: [PATCH 05/35] qemu-img: always goto out in img_snapshot() error paths

The out label has the qemu_progress_end() and other cleanup calls.
Always goto out in error paths so the cleanup happens.  These error
paths now return 1 instead of -1.

Note that bdrv_unref(NULL) is safe.  We just need to initialize bs to
NULL at the top of the function.

We can now remove the obsolete bs_old_backing = NULL and bs_new_backing
= NULL for safe mode.  Originally it was necessary in commit 3e85c6fd
("qemu-img rebase") but became useless in commit c2abcce ("qemu-img:
avoid calling exit(1) to release resources properly") because the
variables are already initialized during declaration.

Reported-by: John Snow <jsnow@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 qemu-img.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index a761c36e7b..ff29ed1c67 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -2304,7 +2304,7 @@ static int img_snapshot(int argc, char **argv)
 
 static int img_rebase(int argc, char **argv)
 {
-    BlockDriverState *bs, *bs_old_backing = NULL, *bs_new_backing = NULL;
+    BlockDriverState *bs = NULL, *bs_old_backing = NULL, *bs_new_backing = NULL;
     BlockDriver *old_backing_drv, *new_backing_drv;
     char *filename;
     const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg;
@@ -2376,14 +2376,14 @@ static int img_rebase(int argc, char **argv)
     ret = bdrv_parse_cache_flags(cache, &flags);
     if (ret < 0) {
         error_report("Invalid cache option: %s", cache);
-        return -1;
+        goto out;
     }
 
     src_flags = BDRV_O_FLAGS;
     ret = bdrv_parse_cache_flags(src_cache, &src_flags);
     if (ret < 0) {
         error_report("Invalid source cache option: %s", src_cache);
-        return -1;
+        goto out;
     }
 
     /*
@@ -2394,7 +2394,8 @@ static int img_rebase(int argc, char **argv)
      */
     bs = bdrv_new_open("image", filename, fmt, flags, true, quiet);
     if (!bs) {
-        return 1;
+        ret = -1;
+        goto out;
     }
 
     /* Find the right drivers for the backing files */
@@ -2420,11 +2421,7 @@ static int img_rebase(int argc, char **argv)
     }
 
     /* For safe rebasing we need to compare old and new backing file */
-    if (unsafe) {
-        /* Make the compiler happy */
-        bs_old_backing = NULL;
-        bs_new_backing = NULL;
-    } else {
+    if (!unsafe) {
         char backing_name[1024];
 
         bs_old_backing = bdrv_new("old_backing", &error_abort);

From 1dbfafed7f0478fde29de7b69692c4d58b9e6c25 Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Date: Mon, 11 Aug 2014 14:43:45 +0900
Subject: [PATCH 06/35] sheepdog: adopting protocol update for VDI locking

The update is required for supporting iSCSI multipath. It doesn't
affect behavior of QEMU driver but adding a new field to vdi request
struct is required.

Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: Liu Yuan <namei.unix@gmail.com>
Cc: MORITA Kazutaka <morita.kazutaka@gmail.com>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/sheepdog.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 12cbd9dcb4..93c0266fea 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -103,6 +103,9 @@
 #define SD_INODE_SIZE (sizeof(SheepdogInode))
 #define CURRENT_VDI_ID 0
 
+#define LOCK_TYPE_NORMAL 0
+#define LOCK_TYPE_SHARED 1      /* for iSCSI multipath */
+
 typedef struct SheepdogReq {
     uint8_t proto_ver;
     uint8_t opcode;
@@ -166,7 +169,8 @@ typedef struct SheepdogVdiReq {
     uint8_t copy_policy;
     uint8_t reserved[2];
     uint32_t snapid;
-    uint32_t pad[3];
+    uint32_t type;
+    uint32_t pad[2];
 } SheepdogVdiReq;
 
 typedef struct SheepdogVdiRsp {
@@ -1090,6 +1094,7 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
     memset(&hdr, 0, sizeof(hdr));
     if (lock) {
         hdr.opcode = SD_OP_LOCK_VDI;
+        hdr.type = LOCK_TYPE_NORMAL;
     } else {
         hdr.opcode = SD_OP_GET_VDI_INFO;
     }
@@ -1793,6 +1798,7 @@ static void sd_close(BlockDriverState *bs)
     memset(&hdr, 0, sizeof(hdr));
 
     hdr.opcode = SD_OP_RELEASE_VDI;
+    hdr.type = LOCK_TYPE_NORMAL;
     hdr.base_vdi_id = s->inode.vdi_id;
     wlen = strlen(s->name) + 1;
     hdr.data_length = wlen;

From 38890b246d7c21d29ac50831c0792994cf289a2c Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Date: Mon, 11 Aug 2014 14:43:46 +0900
Subject: [PATCH 07/35] sheepdog: improve error handling for a case of failed
 lock

Recently, sheepdog revived its VDI locking functionality. This patch
updates sheepdog driver of QEMU for this feature. It changes an error
code for a case of failed locking. -EBUSY is a suitable one.

Reported-by: Valerio Pachera <sirio81@gmail.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: Liu Yuan <namei.unix@gmail.com>
Cc: MORITA Kazutaka <morita.kazutaka@gmail.com>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/sheepdog.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 93c0266fea..49a9a9e5eb 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1115,6 +1115,8 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename,
                    sd_strerror(rsp->result), filename, snapid, tag);
         if (rsp->result == SD_RES_NO_VDI) {
             ret = -ENOENT;
+        } else if (rsp->result == SD_RES_VDI_LOCKED) {
+            ret = -EBUSY;
         } else {
             ret = -EIO;
         }

From 62c6031f96997d864edfc49304e9f50d9496907a Mon Sep 17 00:00:00 2001
From: Liu Yuan <namei.unix@gmail.com>
Date: Mon, 18 Aug 2014 17:41:04 +0800
Subject: [PATCH 08/35] qapi: add read-pattern enum for quorum
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: Eric Blake <eblake@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Benoît Canet <benoit.canet@nodalink.com>
Signed-off-by: Liu Yuan <namei.unix@gmail.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 qapi/block-core.json | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index fb74c56e32..a685d02728 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1418,6 +1418,19 @@
   'data': { 'test': 'BlockdevRef',
             'raw': 'BlockdevRef' } }
 
+##
+# @QuorumReadPattern
+#
+# An enumeration of quorum read patterns.
+#
+# @quorum: read all the children and do a quorum vote on reads
+#
+# @fifo: read only from the first child that has not failed
+#
+# Since: 2.2
+##
+{ 'enum': 'QuorumReadPattern', 'data': [ 'quorum', 'fifo' ] }
+
 ##
 # @BlockdevOptionsQuorum
 #
@@ -1433,12 +1446,17 @@
 # @rewrite-corrupted: #optional rewrite corrupted data when quorum is reached
 #                     (Since 2.1)
 #
+# @read-pattern: #optional choose read pattern and set to quorum by default
+#                (Since 2.2)
+#
 # Since: 2.0
 ##
 { 'type': 'BlockdevOptionsQuorum',
   'data': { '*blkverify': 'bool',
             'children': [ 'BlockdevRef' ],
-            'vote-threshold': 'int', '*rewrite-corrupted': 'bool' } }
+            'vote-threshold': 'int',
+            '*rewrite-corrupted': 'bool',
+            '*read-pattern': 'QuorumReadPattern' } }
 
 ##
 # @BlockdevOptions

From a9db86b223030bd40bdd81b160788196bc95fe6f Mon Sep 17 00:00:00 2001
From: Liu Yuan <namei.unix@gmail.com>
Date: Mon, 18 Aug 2014 17:41:05 +0800
Subject: [PATCH 09/35] block/quorum: add simple read pattern support

This patch adds single read pattern to quorum driver and quorum vote is default
pattern.

For now we do a quorum vote on all the reads, it is designed for unreliable
underlying storage such as non-redundant NFS to make sure data integrity at the
cost of the read performance.

For some use cases as following:

        VM
  --------------
  |            |
  v            v
  A            B

Both A and B has hardware raid storage to justify the data integrity on its own.
So it would help performance if we do a single read instead of on all the nodes.
Further, if we run VM on either of the storage node, we can make a local read
request for better performance.

This patch generalize the above 2 nodes case in the N nodes. That is,

vm -> write to all the N nodes, read just one of them. If single read fails, we
try to read next node in FIFO order specified by the startup command.

The 2 nodes case is very similar to DRBD[1] though lack of auto-sync
functionality in the single device/node failure for now. But compared with DRBD
we still have some advantages over it:

- Suppose we have 20 VMs running on one(assume A) of 2 nodes' DRBD backed
storage. And if A crashes, we need to restart all the VMs on node B. But for
practice case, we can't because B might not have enough resources to setup 20 VMs
at once. So if we run our 20 VMs with quorum driver, and scatter the replicated
images over the data center, we can very likely restart 20 VMs without any
resource problem.

After all, I think we can build a more powerful replicated image functionality
on quorum and block jobs(block mirror) to meet various High Availibility needs.

E.g, Enable single read pattern on 2 children,

-drive driver=quorum,children.0.file.filename=0.qcow2,\
children.1.file.filename=1.qcow2,read-pattern=fifo,vote-threshold=1

[1] http://en.wikipedia.org/wiki/Distributed_Replicated_Block_Device

[Dropped \n from an error_setg() error message
--Stefan]

Cc: Benoit Canet <benoit@irqsave.net>
Cc: Eric Blake <eblake@redhat.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Liu Yuan <namei.unix@gmail.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/quorum.c | 179 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 130 insertions(+), 49 deletions(-)

diff --git a/block/quorum.c b/block/quorum.c
index 0de07bb036..0160fe3ae9 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -29,6 +29,7 @@
 #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold"
 #define QUORUM_OPT_BLKVERIFY      "blkverify"
 #define QUORUM_OPT_REWRITE        "rewrite-corrupted"
+#define QUORUM_OPT_READ_PATTERN   "read-pattern"
 
 /* This union holds a vote hash value */
 typedef union QuorumVoteValue {
@@ -79,6 +80,8 @@ typedef struct BDRVQuorumState {
     bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted
                             * block if Quorum is reached.
                             */
+
+    QuorumReadPattern read_pattern;
 } BDRVQuorumState;
 
 typedef struct QuorumAIOCB QuorumAIOCB;
@@ -122,6 +125,7 @@ struct QuorumAIOCB {
 
     bool is_read;
     int vote_ret;
+    int child_iter;             /* which child to read in fifo pattern */
 };
 
 static bool quorum_vote(QuorumAIOCB *acb);
@@ -148,7 +152,6 @@ static AIOCBInfo quorum_aiocb_info = {
 
 static void quorum_aio_finalize(QuorumAIOCB *acb)
 {
-    BDRVQuorumState *s = acb->common.bs->opaque;
     int i, ret = 0;
 
     if (acb->vote_ret) {
@@ -158,7 +161,8 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
     acb->common.cb(acb->common.opaque, ret);
 
     if (acb->is_read) {
-        for (i = 0; i < s->num_children; i++) {
+        /* on the quorum case acb->child_iter == s->num_children - 1 */
+        for (i = 0; i <= acb->child_iter; i++) {
             qemu_vfree(acb->qcrs[i].buf);
             qemu_iovec_destroy(&acb->qcrs[i].qiov);
         }
@@ -261,6 +265,21 @@ static void quorum_rewrite_aio_cb(void *opaque, int ret)
     quorum_aio_finalize(acb);
 }
 
+static BlockDriverAIOCB *read_fifo_child(QuorumAIOCB *acb);
+
+static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
+{
+    int i;
+    assert(dest->niov == source->niov);
+    assert(dest->size == source->size);
+    for (i = 0; i < source->niov; i++) {
+        assert(dest->iov[i].iov_len == source->iov[i].iov_len);
+        memcpy(dest->iov[i].iov_base,
+               source->iov[i].iov_base,
+               source->iov[i].iov_len);
+    }
+}
+
 static void quorum_aio_cb(void *opaque, int ret)
 {
     QuorumChildRequest *sacb = opaque;
@@ -268,6 +287,21 @@ static void quorum_aio_cb(void *opaque, int ret)
     BDRVQuorumState *s = acb->common.bs->opaque;
     bool rewrite = false;
 
+    if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
+        /* We try to read next child in FIFO order if we fail to read */
+        if (ret < 0 && ++acb->child_iter < s->num_children) {
+            read_fifo_child(acb);
+            return;
+        }
+
+        if (ret == 0) {
+            quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov);
+        }
+        acb->vote_ret = ret;
+        quorum_aio_finalize(acb);
+        return;
+    }
+
     sacb->ret = ret;
     acb->count++;
     if (ret == 0) {
@@ -348,19 +382,6 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb,
     return count;
 }
 
-static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
-{
-    int i;
-    assert(dest->niov == source->niov);
-    assert(dest->size == source->size);
-    for (i = 0; i < source->niov; i++) {
-        assert(dest->iov[i].iov_len == source->iov[i].iov_len);
-        memcpy(dest->iov[i].iov_base,
-               source->iov[i].iov_base,
-               source->iov[i].iov_len);
-    }
-}
-
 static void quorum_count_vote(QuorumVotes *votes,
                               QuorumVoteValue *value,
                               int index)
@@ -620,32 +641,60 @@ free_exit:
     return rewrite;
 }
 
+static BlockDriverAIOCB *read_quorum_children(QuorumAIOCB *acb)
+{
+    BDRVQuorumState *s = acb->common.bs->opaque;
+    int i;
+
+    for (i = 0; i < s->num_children; i++) {
+        acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size);
+        qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
+        qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf);
+    }
+
+    for (i = 0; i < s->num_children; i++) {
+        bdrv_aio_readv(s->bs[i], acb->sector_num, &acb->qcrs[i].qiov,
+                       acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]);
+    }
+
+    return &acb->common;
+}
+
+static BlockDriverAIOCB *read_fifo_child(QuorumAIOCB *acb)
+{
+    BDRVQuorumState *s = acb->common.bs->opaque;
+
+    acb->qcrs[acb->child_iter].buf = qemu_blockalign(s->bs[acb->child_iter],
+                                                     acb->qiov->size);
+    qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov);
+    qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov,
+                     acb->qcrs[acb->child_iter].buf);
+    bdrv_aio_readv(s->bs[acb->child_iter], acb->sector_num,
+                   &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors,
+                   quorum_aio_cb, &acb->qcrs[acb->child_iter]);
+
+    return &acb->common;
+}
+
 static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
-                                         int64_t sector_num,
-                                         QEMUIOVector *qiov,
-                                         int nb_sectors,
-                                         BlockDriverCompletionFunc *cb,
-                                         void *opaque)
+                                          int64_t sector_num,
+                                          QEMUIOVector *qiov,
+                                          int nb_sectors,
+                                          BlockDriverCompletionFunc *cb,
+                                          void *opaque)
 {
     BDRVQuorumState *s = bs->opaque;
     QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
                                       nb_sectors, cb, opaque);
-    int i;
-
     acb->is_read = true;
 
-    for (i = 0; i < s->num_children; i++) {
-        acb->qcrs[i].buf = qemu_blockalign(s->bs[i], qiov->size);
-        qemu_iovec_init(&acb->qcrs[i].qiov, qiov->niov);
-        qemu_iovec_clone(&acb->qcrs[i].qiov, qiov, acb->qcrs[i].buf);
+    if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
+        acb->child_iter = s->num_children - 1;
+        return read_quorum_children(acb);
     }
 
-    for (i = 0; i < s->num_children; i++) {
-        bdrv_aio_readv(s->bs[i], sector_num, &acb->qcrs[i].qiov, nb_sectors,
-                       quorum_aio_cb, &acb->qcrs[i]);
-    }
-
-    return &acb->common;
+    acb->child_iter = 0;
+    return read_fifo_child(acb);
 }
 
 static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs,
@@ -787,10 +836,33 @@ static QemuOptsList quorum_runtime_opts = {
             .type = QEMU_OPT_BOOL,
             .help = "Rewrite corrupted block on read quorum",
         },
+        {
+            .name = QUORUM_OPT_READ_PATTERN,
+            .type = QEMU_OPT_STRING,
+            .help = "Allowed pattern: quorum, fifo. Quorum is default",
+        },
         { /* end of list */ }
     },
 };
 
+static int parse_read_pattern(const char *opt)
+{
+    int i;
+
+    if (!opt) {
+        /* Set quorum as default */
+        return QUORUM_READ_PATTERN_QUORUM;
+    }
+
+    for (i = 0; i < QUORUM_READ_PATTERN_MAX; i++) {
+        if (!strcmp(opt, QuorumReadPattern_lookup[i])) {
+            return i;
+        }
+    }
+
+    return -EINVAL;
+}
+
 static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
                        Error **errp)
 {
@@ -832,28 +904,37 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
-
-    /* and validate it against s->num_children */
-    ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
+    ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN));
     if (ret < 0) {
+        error_setg(&local_err, "Please set read-pattern as fifo or quorum");
         goto exit;
     }
+    s->read_pattern = ret;
 
-    /* is the driver in blkverify mode */
-    if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
-        s->num_children == 2 && s->threshold == 2) {
-        s->is_blkverify = true;
-    } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) {
-        fprintf(stderr, "blkverify mode is set by setting blkverify=on "
-                "and using two files with vote_threshold=2\n");
-    }
+    if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
+        /* and validate it against s->num_children */
+        ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
+        if (ret < 0) {
+            goto exit;
+        }
 
-    s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false);
-    if (s->rewrite_corrupted && s->is_blkverify) {
-        error_setg(&local_err,
-                   "rewrite-corrupted=on cannot be used with blkverify=on");
-        ret = -EINVAL;
-        goto exit;
+        /* is the driver in blkverify mode */
+        if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
+            s->num_children == 2 && s->threshold == 2) {
+            s->is_blkverify = true;
+        } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) {
+            fprintf(stderr, "blkverify mode is set by setting blkverify=on "
+                    "and using two files with vote_threshold=2\n");
+        }
+
+        s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE,
+                                                 false);
+        if (s->rewrite_corrupted && s->is_blkverify) {
+            error_setg(&local_err,
+                       "rewrite-corrupted=on cannot be used with blkverify=on");
+            ret = -EINVAL;
+            goto exit;
+        }
     }
 
     /* allocate the children BlockDriverState array */

From 0b9caf9b3166c8deb3c4f3a774c2384b069dc29c Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Tue, 26 Aug 2014 15:15:43 +0800
Subject: [PATCH 10/35] coroutine: Drop co_sleep_ns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

block_job_sleep_ns is the only user. Since we are moving towards
AioContext aware code, it's better to use the explicit version and drop
the old one.

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Benoît Canet <benoit.canet@nodalink.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 blockjob.c                |  2 +-
 include/block/coroutine.h |  8 --------
 qemu-coroutine-sleep.c    | 12 ------------
 3 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/blockjob.c b/blockjob.c
index ca0b4e25d0..0689fdd2b5 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -205,7 +205,7 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns)
     if (block_job_is_paused(job)) {
         qemu_coroutine_yield();
     } else {
-        co_sleep_ns(type, ns);
+        co_aio_sleep_ns(bdrv_get_aio_context(job->bs), type, ns);
     }
     job->busy = true;
 }
diff --git a/include/block/coroutine.h b/include/block/coroutine.h
index b9b7f488c9..793df0ef8b 100644
--- a/include/block/coroutine.h
+++ b/include/block/coroutine.h
@@ -200,14 +200,6 @@ void qemu_co_rwlock_wrlock(CoRwlock *lock);
  */
 void qemu_co_rwlock_unlock(CoRwlock *lock);
 
-/**
- * Yield the coroutine for a given duration
- *
- * Note this function uses timers and hence only works when a main loop is in
- * use.  See main-loop.h and do not use from qemu-tool programs.
- */
-void coroutine_fn co_sleep_ns(QEMUClockType type, int64_t ns);
-
 /**
  * Yield the coroutine for a given duration
  *
diff --git a/qemu-coroutine-sleep.c b/qemu-coroutine-sleep.c
index ad78fbaa2a..9abb7fdf31 100644
--- a/qemu-coroutine-sleep.c
+++ b/qemu-coroutine-sleep.c
@@ -27,18 +27,6 @@ static void co_sleep_cb(void *opaque)
     qemu_coroutine_enter(sleep_cb->co, NULL);
 }
 
-void coroutine_fn co_sleep_ns(QEMUClockType type, int64_t ns)
-{
-    CoSleepCB sleep_cb = {
-        .co = qemu_coroutine_self(),
-    };
-    sleep_cb.ts = timer_new(type, SCALE_NS, co_sleep_cb, &sleep_cb);
-    timer_mod(sleep_cb.ts, qemu_clock_get_ns(type) + ns);
-    qemu_coroutine_yield();
-    timer_del(sleep_cb.ts);
-    timer_free(sleep_cb.ts);
-}
-
 void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
                                   int64_t ns)
 {

From 3cbbe9fd1feaf3264f745fccb0bf5f62c583078f Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Wed, 27 Aug 2014 14:29:59 +0100
Subject: [PATCH 11/35] blockdev: fix drive-mirror 'granularity' error message
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Name the 'granularity' parameter and give its expected value range.
Previously the device name was mistakenly reported as the parameter
name.

Note that the error class is unchanged from ERROR_CLASS_GENERIC_ERROR.

Reported-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Benoît Canet <benoit.canet@nodalink.com>
---
 blockdev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 6a204c662d..eeb414efc0 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2179,11 +2179,12 @@ void qmp_drive_mirror(const char *device, const char *target,
     }
 
     if (granularity != 0 && (granularity < 512 || granularity > 1048576 * 64)) {
-        error_set(errp, QERR_INVALID_PARAMETER, device);
+        error_set(errp, QERR_INVALID_PARAMETER_VALUE, "granularity",
+                  "a value in range [512B, 64MB]");
         return;
     }
     if (granularity & (granularity - 1)) {
-        error_set(errp, QERR_INVALID_PARAMETER, device);
+        error_set(errp, QERR_INVALID_PARAMETER_VALUE, "granularity", "power of 2");
         return;
     }
 

From 845ca10dd089b4e48f0a79bad005fb30eb77584e Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 9 Jul 2014 11:53:01 +0200
Subject: [PATCH 12/35] AioContext: take bottom halves into account when
 computing aio_poll timeout

Right now, QEMU invokes aio_bh_poll before the "poll" phase
of aio_poll.  It is simpler to do it afterwards and skip the
"poll" phase altogether when the OS-dependent parts of AioContext
are invoked from GSource.  This way, AioContext behaves more
similarly when used as a GSource vs. when used as stand-alone.

As a start, take bottom halves into account when computing the
poll timeout.  If a bottom half is ready, do a non-blocking
poll.  As a side effect, this makes idle bottom halves work
with aio_poll; an improvement, but not really an important
one since they are deprecated.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 aio-posix.c         |  2 +-
 aio-win32.c         |  4 ++--
 async.c             | 32 ++++++++++++++++++--------------
 include/block/aio.h |  8 ++++++++
 4 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/aio-posix.c b/aio-posix.c
index 2eada2e049..55706f8205 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -249,7 +249,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
     /* wait until next event */
     ret = qemu_poll_ns((GPollFD *)ctx->pollfds->data,
                          ctx->pollfds->len,
-                         blocking ? timerlistgroup_deadline_ns(&ctx->tlg) : 0);
+                         blocking ? aio_compute_timeout(ctx) : 0);
 
     /* if we have any readable fds, dispatch event */
     if (ret > 0) {
diff --git a/aio-win32.c b/aio-win32.c
index c12f61e97d..fe7ee5bb22 100644
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -165,8 +165,8 @@ bool aio_poll(AioContext *ctx, bool blocking)
     while (count > 0) {
         int ret;
 
-        timeout = blocking ?
-            qemu_timeout_ns_to_ms(timerlistgroup_deadline_ns(&ctx->tlg)) : 0;
+        timeout = blocking
+            ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
         ret = WaitForMultipleObjects(count, events, FALSE, timeout);
 
         /* if we have any signaled events, dispatch event */
diff --git a/async.c b/async.c
index 34af0b25ca..09e09c6526 100644
--- a/async.c
+++ b/async.c
@@ -152,39 +152,43 @@ void qemu_bh_delete(QEMUBH *bh)
     bh->deleted = 1;
 }
 
-static gboolean
-aio_ctx_prepare(GSource *source, gint    *timeout)
+int64_t
+aio_compute_timeout(AioContext *ctx)
 {
-    AioContext *ctx = (AioContext *) source;
+    int64_t deadline;
+    int timeout = -1;
     QEMUBH *bh;
-    int deadline;
 
-    /* We assume there is no timeout already supplied */
-    *timeout = -1;
     for (bh = ctx->first_bh; bh; bh = bh->next) {
         if (!bh->deleted && bh->scheduled) {
             if (bh->idle) {
                 /* idle bottom halves will be polled at least
                  * every 10ms */
-                *timeout = 10;
+                timeout = 10000000;
             } else {
                 /* non-idle bottom halves will be executed
                  * immediately */
-                *timeout = 0;
-                return true;
+                return 0;
             }
         }
     }
 
-    deadline = qemu_timeout_ns_to_ms(timerlistgroup_deadline_ns(&ctx->tlg));
+    deadline = timerlistgroup_deadline_ns(&ctx->tlg);
     if (deadline == 0) {
-        *timeout = 0;
-        return true;
+        return 0;
     } else {
-        *timeout = qemu_soonest_timeout(*timeout, deadline);
+        return qemu_soonest_timeout(timeout, deadline);
     }
+}
 
-    return false;
+static gboolean
+aio_ctx_prepare(GSource *source, gint    *timeout)
+{
+    AioContext *ctx = (AioContext *) source;
+
+    /* We assume there is no timeout already supplied */
+    *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
+    return *timeout == 0;
 }
 
 static gboolean
diff --git a/include/block/aio.h b/include/block/aio.h
index c23de3cd1f..05b531ca25 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -303,4 +303,12 @@ static inline void aio_timer_init(AioContext *ctx,
     timer_init(ts, ctx->tlg.tl[type], scale, cb, opaque);
 }
 
+/**
+ * aio_compute_timeout:
+ * @ctx: the aio context
+ *
+ * Compute the timeout that a blocking aio_poll should use.
+ */
+int64_t aio_compute_timeout(AioContext *ctx);
+
 #endif

From d397ec99bb1c03a06afbeab7bfa65a43138eafb6 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 9 Jul 2014 11:53:02 +0200
Subject: [PATCH 13/35] aio-win32: Evaluate timers after handles

This is similar to what aio_poll does in the stand-alone case.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 aio-win32.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/aio-win32.c b/aio-win32.c
index fe7ee5bb22..7b284119e0 100644
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -109,9 +109,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
         progress = true;
     }
 
-    /* Run timers */
-    progress |= timerlistgroup_run_timers(&ctx->tlg);
-
     /*
      * Then dispatch any pending callbacks from the GSource.
      *
@@ -145,6 +142,9 @@ bool aio_poll(AioContext *ctx, bool blocking)
         }
     }
 
+    /* Run timers */
+    progress |= timerlistgroup_run_timers(&ctx->tlg);
+
     if (progress && !blocking) {
         return true;
     }

From a398dea34c62b238e714bb4c3a968b4ca11e256b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 9 Jul 2014 11:53:03 +0200
Subject: [PATCH 14/35] aio-win32: Factor out duplicate code into
 aio_dispatch_handlers

Later, the call to aio_dispatch will move int the GSource wrapper, while the
standalone case will still be call the component functions aio_bh_poll,
aio_dispatch_handlers and timerlistgroup_run_timers.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 aio-win32.c | 89 +++++++++++++++++++++++------------------------------
 1 file changed, 39 insertions(+), 50 deletions(-)

diff --git a/aio-win32.c b/aio-win32.c
index 7b284119e0..5e37b42530 100644
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -89,29 +89,12 @@ bool aio_pending(AioContext *ctx)
     return false;
 }
 
-bool aio_poll(AioContext *ctx, bool blocking)
+static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
 {
     AioHandler *node;
-    HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
-    bool progress;
-    int count;
-    int timeout;
-
-    progress = false;
+    bool progress = false;
 
     /*
-     * If there are callbacks left that have been queued, we need to call then.
-     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for aio_poll loops).
-     */
-    if (aio_bh_poll(ctx)) {
-        blocking = false;
-        progress = true;
-    }
-
-    /*
-     * Then dispatch any pending callbacks from the GSource.
-     *
      * We have to walk very carefully in case aio_set_fd_handler is
      * called while we're walking.
      */
@@ -121,7 +104,9 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
         ctx->walking_handlers++;
 
-        if (node->pfd.revents && node->io_notify) {
+        if (!node->deleted &&
+            (node->pfd.revents || event_notifier_get_handle(node->e) == event) &&
+            node->io_notify) {
             node->pfd.revents = 0;
             node->io_notify(node->e);
 
@@ -142,8 +127,40 @@ bool aio_poll(AioContext *ctx, bool blocking)
         }
     }
 
-    /* Run timers */
+    return progress;
+}
+
+static bool aio_dispatch(AioContext *ctx)
+{
+    bool progress;
+
+    progress = aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
+    return progress;
+}
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+    AioHandler *node;
+    HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
+    bool progress;
+    int count;
+    int timeout;
+
+    progress = false;
+
+    /*
+     * If there are callbacks left that have been queued, we need to call then.
+     * Do not call select in this case, because it is possible that the caller
+     * does not need a complete flush (as is the case for aio_poll loops).
+     */
+    if (aio_bh_poll(ctx)) {
+        blocking = false;
+        progress = true;
+    }
+
+    /* Dispatch any pending callbacks from the GSource.  */
+    progress |= aio_dispatch(ctx);
 
     if (progress && !blocking) {
         return true;
@@ -176,35 +193,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
         blocking = false;
 
-        /* we have to walk very carefully in case
-         * aio_set_fd_handler is called while we're walking */
-        node = QLIST_FIRST(&ctx->aio_handlers);
-        while (node) {
-            AioHandler *tmp;
-
-            ctx->walking_handlers++;
-
-            if (!node->deleted &&
-                event_notifier_get_handle(node->e) == events[ret - WAIT_OBJECT_0] &&
-                node->io_notify) {
-                node->io_notify(node->e);
-
-                /* aio_notify() does not count as progress */
-                if (node->e != &ctx->notifier) {
-                    progress = true;
-                }
-            }
-
-            tmp = node;
-            node = QLIST_NEXT(node, node);
-
-            ctx->walking_handlers--;
-
-            if (!ctx->walking_handlers && tmp->deleted) {
-                QLIST_REMOVE(tmp, node);
-                g_free(tmp);
-            }
-        }
+        progress |= aio_dispatch_handlers(ctx, events[ret - WAIT_OBJECT_0]);
 
         /* Try again, but only call each handler once.  */
         events[ret - WAIT_OBJECT_0] = events[--count];

From 3672fa50837c1700deb1f86f0068c22c7e49aa22 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 9 Jul 2014 11:53:04 +0200
Subject: [PATCH 15/35] AioContext: run bottom halves after polling

Make the dispatching phase the same before blocking and afterwards.
The next patch will make aio_dispatch public and use it directly
for the GSource case, instead of aio_poll.  aio_poll can then be
simplified heavily.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 aio-posix.c | 4 ++++
 aio-win32.c | 8 +++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/aio-posix.c b/aio-posix.c
index 55706f8205..798a3ff532 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -264,6 +264,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     /* Run dispatch even if there were no readable fds to run timers */
     aio_set_dispatching(ctx, true);
+    if (aio_bh_poll(ctx)) {
+        progress = true;
+    }
+
     if (aio_dispatch(ctx)) {
         progress = true;
     }
diff --git a/aio-win32.c b/aio-win32.c
index 5e37b42530..2ac38a897c 100644
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -143,7 +143,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
 {
     AioHandler *node;
     HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
-    bool progress;
+    bool progress, first;
     int count;
     int timeout;
 
@@ -177,6 +177,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
     }
 
     ctx->walking_handlers--;
+    first = true;
 
     /* wait until next event */
     while (count > 0) {
@@ -186,6 +187,11 @@ bool aio_poll(AioContext *ctx, bool blocking)
             ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
         ret = WaitForMultipleObjects(count, events, FALSE, timeout);
 
+        if (first && aio_bh_poll(ctx)) {
+            progress = true;
+        }
+        first = false;
+
         /* if we have any signaled events, dispatch event */
         if ((DWORD) (ret - WAIT_OBJECT_0) >= count) {
             break;

From e4c7e2d12d7b1c4501ab3397218a206d4953e633 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 9 Jul 2014 11:53:05 +0200
Subject: [PATCH 16/35] AioContext: export and use aio_dispatch

So far, aio_poll's scheme was dispatch/poll/dispatch, where
the first dispatch phase was used only in the GSource case in
order to avoid a blocking poll.  Earlier patches changed it to
dispatch/prepare/poll/dispatch, where prepare is aio_compute_timeout.

By making aio_dispatch public, we can remove the first dispatch
phase altogether, so that both aio_poll and the GSource use the same
prepare/poll/dispatch scheme.

This patch breaks the invariant that aio_poll(..., true) will not block
the first time it returns false.  This used to be fundamental for
qemu_aio_flush's implementation as "while (qemu_aio_wait()) {}" but
no code in QEMU relies on this invariant anymore.  The return value
of aio_poll() is now comparable with that of g_main_context_iteration.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 aio-posix.c         | 55 +++++++++++----------------------------------
 aio-win32.c         | 31 ++++---------------------
 async.c             |  2 +-
 include/block/aio.h |  6 +++++
 4 files changed, 24 insertions(+), 70 deletions(-)

diff --git a/aio-posix.c b/aio-posix.c
index 798a3ff532..0936b4ff9d 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -119,11 +119,20 @@ bool aio_pending(AioContext *ctx)
     return false;
 }
 
-static bool aio_dispatch(AioContext *ctx)
+bool aio_dispatch(AioContext *ctx)
 {
     AioHandler *node;
     bool progress = false;
 
+    /*
+     * If there are callbacks left that have been queued, we need to call them.
+     * Do not call select in this case, because it is possible that the caller
+     * does not need a complete flush (as is the case for aio_poll loops).
+     */
+    if (aio_bh_poll(ctx)) {
+        progress = true;
+    }
+
     /*
      * We have to walk very carefully in case aio_set_fd_handler is
      * called while we're walking.
@@ -184,22 +193,9 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     /* aio_notify can avoid the expensive event_notifier_set if
      * everything (file descriptors, bottom halves, timers) will
-     * be re-evaluated before the next blocking poll().  This happens
-     * in two cases:
-     *
-     * 1) when aio_poll is called with blocking == false
-     *
-     * 2) when we are called after poll().  If we are called before
-     *    poll(), bottom halves will not be re-evaluated and we need
-     *    aio_notify() if blocking == true.
-     *
-     * The first aio_dispatch() only does something when AioContext is
-     * running as a GSource, and in that case aio_poll is used only
-     * with blocking == false, so this optimization is already quite
-     * effective.  However, the code is ugly and should be restructured
-     * to have a single aio_dispatch() call.  To do this, we need to
-     * reorganize aio_poll into a prepare/poll/dispatch model like
-     * glib's.
+     * be re-evaluated before the next blocking poll().  This is
+     * already true when aio_poll is called with blocking == false;
+     * if blocking == true, it is only true after poll() returns.
      *
      * If we're in a nested event loop, ctx->dispatching might be true.
      * In that case we can restore it just before returning, but we
@@ -207,26 +203,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
      */
     aio_set_dispatching(ctx, !blocking);
 
-    /*
-     * If there are callbacks left that have been queued, we need to call them.
-     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for aio_poll loops).
-     */
-    if (aio_bh_poll(ctx)) {
-        blocking = false;
-        progress = true;
-    }
-
-    /* Re-evaluate condition (1) above.  */
-    aio_set_dispatching(ctx, !blocking);
-    if (aio_dispatch(ctx)) {
-        progress = true;
-    }
-
-    if (progress && !blocking) {
-        goto out;
-    }
-
     ctx->walking_handlers++;
 
     g_array_set_size(ctx->pollfds, 0);
@@ -264,15 +240,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     /* Run dispatch even if there were no readable fds to run timers */
     aio_set_dispatching(ctx, true);
-    if (aio_bh_poll(ctx)) {
-        progress = true;
-    }
-
     if (aio_dispatch(ctx)) {
         progress = true;
     }
 
-out:
     aio_set_dispatching(ctx, was_dispatching);
     return progress;
 }
diff --git a/aio-win32.c b/aio-win32.c
index 2ac38a897c..1ec434a83c 100644
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -130,11 +130,12 @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
     return progress;
 }
 
-static bool aio_dispatch(AioContext *ctx)
+bool aio_dispatch(AioContext *ctx)
 {
     bool progress;
 
-    progress = aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
+    progress = aio_bh_poll(ctx);
+    progress |= aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
     progress |= timerlistgroup_run_timers(&ctx->tlg);
     return progress;
 }
@@ -149,23 +150,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     progress = false;
 
-    /*
-     * If there are callbacks left that have been queued, we need to call then.
-     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for aio_poll loops).
-     */
-    if (aio_bh_poll(ctx)) {
-        blocking = false;
-        progress = true;
-    }
-
-    /* Dispatch any pending callbacks from the GSource.  */
-    progress |= aio_dispatch(ctx);
-
-    if (progress && !blocking) {
-        return true;
-    }
-
     ctx->walking_handlers++;
 
     /* fill fd sets */
@@ -205,14 +189,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
         events[ret - WAIT_OBJECT_0] = events[--count];
     }
 
-    if (blocking) {
-        /* Run the timers a second time. We do this because otherwise aio_wait
-         * will not note progress - and will stop a drain early - if we have
-         * a timer that was not ready to run entering g_poll but is ready
-         * after g_poll. This will only do anything if a timer has expired.
-         */
-        progress |= timerlistgroup_run_timers(&ctx->tlg);
-    }
+    progress |= timerlistgroup_run_timers(&ctx->tlg);
 
     return progress;
 }
diff --git a/async.c b/async.c
index 09e09c6526..293a52a0a7 100644
--- a/async.c
+++ b/async.c
@@ -213,7 +213,7 @@ aio_ctx_dispatch(GSource     *source,
     AioContext *ctx = (AioContext *) source;
 
     assert(callback == NULL);
-    aio_poll(ctx, false);
+    aio_dispatch(ctx);
     return true;
 }
 
diff --git a/include/block/aio.h b/include/block/aio.h
index 05b531ca25..7ba3e9675b 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -211,6 +211,12 @@ void qemu_bh_delete(QEMUBH *bh);
  */
 bool aio_pending(AioContext *ctx);
 
+/* Dispatch any pending callbacks from the GSource attached to the AioContext.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_dispatch(AioContext *ctx);
+
 /* Progress in completing AIO work to occur.  This can issue new pending
  * aio as a result of executing I/O completion or bh callbacks.
  *

From 363285d4b39cbd80427b4fb0a71266fef17290bf Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 9 Jul 2014 11:53:06 +0200
Subject: [PATCH 17/35] test-aio: test timers on Windows too

Use EventNotifier instead of a pipe, which makes it trivial to test
timers on Windows.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/test-aio.c | 48 +++++++++++-------------------------------------
 1 file changed, 11 insertions(+), 37 deletions(-)

diff --git a/tests/test-aio.c b/tests/test-aio.c
index f12b6e0ae8..c6a8713e83 100644
--- a/tests/test-aio.c
+++ b/tests/test-aio.c
@@ -57,8 +57,6 @@ static void bh_test_cb(void *opaque)
     }
 }
 
-#if !defined(_WIN32)
-
 static void timer_test_cb(void *opaque)
 {
     TimerTestData *data = opaque;
@@ -68,12 +66,10 @@ static void timer_test_cb(void *opaque)
     }
 }
 
-static void dummy_io_handler_read(void *opaque)
+static void dummy_io_handler_read(EventNotifier *e)
 {
 }
 
-#endif /* !_WIN32 */
-
 static void bh_delete_cb(void *opaque)
 {
     BHTestData *data = opaque;
@@ -428,24 +424,18 @@ static void test_wait_event_notifier_noflush(void)
     event_notifier_cleanup(&data.e);
 }
 
-#if !defined(_WIN32)
-
 static void test_timer_schedule(void)
 {
     TimerTestData data = { .n = 0, .ctx = ctx, .ns = SCALE_MS * 750LL,
                            .max = 2,
                            .clock_type = QEMU_CLOCK_VIRTUAL };
-    int pipefd[2];
+    EventNotifier e;
 
     /* aio_poll will not block to wait for timers to complete unless it has
      * an fd to wait on. Fixing this breaks other tests. So create a dummy one.
      */
-    g_assert(!qemu_pipe(pipefd));
-    qemu_set_nonblock(pipefd[0]);
-    qemu_set_nonblock(pipefd[1]);
-
-    aio_set_fd_handler(ctx, pipefd[0],
-                       dummy_io_handler_read, NULL, NULL);
+    event_notifier_init(&e, false);
+    aio_set_event_notifier(ctx, &e, dummy_io_handler_read);
     aio_poll(ctx, false);
 
     aio_timer_init(ctx, &data.timer, data.clock_type,
@@ -484,15 +474,12 @@ static void test_timer_schedule(void)
     g_assert(!aio_poll(ctx, false));
     g_assert_cmpint(data.n, ==, 2);
 
-    aio_set_fd_handler(ctx, pipefd[0], NULL, NULL, NULL);
-    close(pipefd[0]);
-    close(pipefd[1]);
+    aio_set_event_notifier(ctx, &e, NULL);
+    event_notifier_cleanup(&e);
 
     timer_del(&data.timer);
 }
 
-#endif /* !_WIN32 */
-
 /* Now the same tests, using the context as a GSource.  They are
  * very similar to the ones above, with g_main_context_iteration
  * replacing aio_poll.  However:
@@ -775,25 +762,19 @@ static void test_source_wait_event_notifier_noflush(void)
     event_notifier_cleanup(&data.e);
 }
 
-#if !defined(_WIN32)
-
 static void test_source_timer_schedule(void)
 {
     TimerTestData data = { .n = 0, .ctx = ctx, .ns = SCALE_MS * 750LL,
                            .max = 2,
                            .clock_type = QEMU_CLOCK_VIRTUAL };
-    int pipefd[2];
+    EventNotifier e;
     int64_t expiry;
 
     /* aio_poll will not block to wait for timers to complete unless it has
      * an fd to wait on. Fixing this breaks other tests. So create a dummy one.
      */
-    g_assert(!qemu_pipe(pipefd));
-    qemu_set_nonblock(pipefd[0]);
-    qemu_set_nonblock(pipefd[1]);
-
-    aio_set_fd_handler(ctx, pipefd[0],
-                       dummy_io_handler_read, NULL, NULL);
+    event_notifier_init(&e, false);
+    aio_set_event_notifier(ctx, &e, dummy_io_handler_read);
     do {} while (g_main_context_iteration(NULL, false));
 
     aio_timer_init(ctx, &data.timer, data.clock_type,
@@ -818,15 +799,12 @@ static void test_source_timer_schedule(void)
     g_assert_cmpint(data.n, ==, 2);
     g_assert(qemu_clock_get_ns(data.clock_type) > expiry);
 
-    aio_set_fd_handler(ctx, pipefd[0], NULL, NULL, NULL);
-    close(pipefd[0]);
-    close(pipefd[1]);
+    aio_set_event_notifier(ctx, &e, NULL);
+    event_notifier_cleanup(&e);
 
     timer_del(&data.timer);
 }
 
-#endif /* !_WIN32 */
-
 
 /* End of tests.  */
 
@@ -857,9 +835,7 @@ int main(int argc, char **argv)
     g_test_add_func("/aio/event/wait",              test_wait_event_notifier);
     g_test_add_func("/aio/event/wait/no-flush-cb",  test_wait_event_notifier_noflush);
     g_test_add_func("/aio/event/flush",             test_flush_event_notifier);
-#if !defined(_WIN32)
     g_test_add_func("/aio/timer/schedule",          test_timer_schedule);
-#endif
 
     g_test_add_func("/aio-gsource/notify",                  test_source_notify);
     g_test_add_func("/aio-gsource/flush",                   test_source_flush);
@@ -874,8 +850,6 @@ int main(int argc, char **argv)
     g_test_add_func("/aio-gsource/event/wait",              test_source_wait_event_notifier);
     g_test_add_func("/aio-gsource/event/wait/no-flush-cb",  test_source_wait_event_notifier_noflush);
     g_test_add_func("/aio-gsource/event/flush",             test_source_flush_event_notifier);
-#if !defined(_WIN32)
     g_test_add_func("/aio-gsource/timer/schedule",          test_source_timer_schedule);
-#endif
     return g_test_run();
 }

From 0a9dd1664a0509f7c3c0c7a5550446258ee70f4b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 9 Jul 2014 11:53:07 +0200
Subject: [PATCH 18/35] aio-win32: add aio_set_dispatching optimization

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 aio-win32.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/aio-win32.c b/aio-win32.c
index 1ec434a83c..fd526864be 100644
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -144,12 +144,25 @@ bool aio_poll(AioContext *ctx, bool blocking)
 {
     AioHandler *node;
     HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
-    bool progress, first;
+    bool was_dispatching, progress, first;
     int count;
     int timeout;
 
+    was_dispatching = ctx->dispatching;
     progress = false;
 
+    /* aio_notify can avoid the expensive event_notifier_set if
+     * everything (file descriptors, bottom halves, timers) will
+     * be re-evaluated before the next blocking poll().  This is
+     * already true when aio_poll is called with blocking == false;
+     * if blocking == true, it is only true after poll() returns.
+     *
+     * If we're in a nested event loop, ctx->dispatching might be true.
+     * In that case we can restore it just before returning, but we
+     * have to clear it now.
+     */
+    aio_set_dispatching(ctx, !blocking);
+
     ctx->walking_handlers++;
 
     /* fill fd sets */
@@ -170,6 +183,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
         timeout = blocking
             ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
         ret = WaitForMultipleObjects(count, events, FALSE, timeout);
+        aio_set_dispatching(ctx, true);
 
         if (first && aio_bh_poll(ctx)) {
             progress = true;
@@ -191,5 +205,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     progress |= timerlistgroup_run_timers(&ctx->tlg);
 
+    aio_set_dispatching(ctx, was_dispatching);
     return progress;
 }

From a3462c656128e7b900ccc5d436f9e858d07de264 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 9 Jul 2014 11:53:08 +0200
Subject: [PATCH 19/35] AioContext: introduce aio_prepare

This will be used to implement socket polling on Windows.
On Windows, select() and g_poll() are completely different;
sockets are polled with select() before calling g_poll,
and the g_poll must be nonblocking if select() says a
socket is ready.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 aio-posix.c         | 5 +++++
 aio-win32.c         | 5 +++++
 async.c             | 5 +++++
 include/block/aio.h | 9 ++++++++-
 4 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/aio-posix.c b/aio-posix.c
index 0936b4ff9d..d3ac06e238 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -100,6 +100,11 @@ void aio_set_event_notifier(AioContext *ctx,
                        (IOHandler *)io_read, NULL, notifier);
 }
 
+bool aio_prepare(AioContext *ctx)
+{
+    return false;
+}
+
 bool aio_pending(AioContext *ctx)
 {
     AioHandler *node;
diff --git a/aio-win32.c b/aio-win32.c
index fd526864be..45422704a4 100644
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -76,6 +76,11 @@ void aio_set_event_notifier(AioContext *ctx,
     aio_notify(ctx);
 }
 
+bool aio_prepare(AioContext *ctx)
+{
+    return false;
+}
+
 bool aio_pending(AioContext *ctx)
 {
     AioHandler *node;
diff --git a/async.c b/async.c
index 293a52a0a7..a99e7f639a 100644
--- a/async.c
+++ b/async.c
@@ -188,6 +188,11 @@ aio_ctx_prepare(GSource *source, gint    *timeout)
 
     /* We assume there is no timeout already supplied */
     *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
+
+    if (aio_prepare(ctx)) {
+        *timeout = 0;
+    }
+
     return *timeout == 0;
 }
 
diff --git a/include/block/aio.h b/include/block/aio.h
index 7ba3e9675b..ef4197b861 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -205,7 +205,14 @@ void qemu_bh_cancel(QEMUBH *bh);
 void qemu_bh_delete(QEMUBH *bh);
 
 /* Return whether there are any pending callbacks from the GSource
- * attached to the AioContext.
+ * attached to the AioContext, before g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_prepare(AioContext *ctx);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, after g_poll is invoked.
  *
  * This is used internally in the implementation of the GSource.
  */

From 79d9b6566b90efac072720f37a1b57d73f539264 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 9 Jul 2014 11:53:09 +0200
Subject: [PATCH 20/35] qemu-coroutine-io: fix for Win32

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 nbd.c               | 2 +-
 qemu-coroutine-io.c | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/nbd.c b/nbd.c
index e7d1ceec43..5c28f71118 100644
--- a/nbd.c
+++ b/nbd.c
@@ -156,7 +156,7 @@ ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
             err = socket_error();
 
             /* recoverable error */
-            if (err == EINTR || (offset > 0 && err == EAGAIN)) {
+            if (err == EINTR || (offset > 0 && (err == EAGAIN || err == EWOULDBLOCK))) {
                 continue;
             }
 
diff --git a/qemu-coroutine-io.c b/qemu-coroutine-io.c
index 054ca70627..d4049260da 100644
--- a/qemu-coroutine-io.c
+++ b/qemu-coroutine-io.c
@@ -34,13 +34,15 @@ qemu_co_sendv_recvv(int sockfd, struct iovec *iov, unsigned iov_cnt,
 {
     size_t done = 0;
     ssize_t ret;
+    int err;
     while (done < bytes) {
         ret = iov_send_recv(sockfd, iov, iov_cnt,
                             offset + done, bytes - done, do_send);
         if (ret > 0) {
             done += ret;
         } else if (ret < 0) {
-            if (errno == EAGAIN) {
+            err = socket_error();
+            if (err == EAGAIN || err == EWOULDBLOCK) {
                 qemu_coroutine_yield();
             } else if (done == 0) {
                 return -1;

From b493317d344357f7ac56606246d09b5604e54ab6 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 9 Jul 2014 11:53:10 +0200
Subject: [PATCH 21/35] aio-win32: add support for sockets

Uses the same select/WSAEventSelect scheme as main-loop.c.
WSAEventSelect() is edge-triggered, so it cannot be used
directly, but it is still used as a way to exit from a
blocking g_poll().

Before g_poll() is called, we poll sockets with a non-blocking
select() to achieve the level-triggered semantics we require:
if a socket is ready, the g_poll() is made non-blocking too.

Based on a patch from Or Goshen.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 aio-win32.c         | 150 ++++++++++++++++++++++++++++++++++++++++++--
 block/Makefile.objs |   2 -
 include/block/aio.h |   2 -
 3 files changed, 145 insertions(+), 9 deletions(-)

diff --git a/aio-win32.c b/aio-win32.c
index 45422704a4..61e3d2ddfe 100644
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -22,12 +22,80 @@
 
 struct AioHandler {
     EventNotifier *e;
+    IOHandler *io_read;
+    IOHandler *io_write;
     EventNotifierHandler *io_notify;
     GPollFD pfd;
     int deleted;
+    void *opaque;
     QLIST_ENTRY(AioHandler) node;
 };
 
+void aio_set_fd_handler(AioContext *ctx,
+                        int fd,
+                        IOHandler *io_read,
+                        IOHandler *io_write,
+                        void *opaque)
+{
+    /* fd is a SOCKET in our case */
+    AioHandler *node;
+
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        if (node->pfd.fd == fd && !node->deleted) {
+            break;
+        }
+    }
+
+    /* Are we deleting the fd handler? */
+    if (!io_read && !io_write) {
+        if (node) {
+            /* If the lock is held, just mark the node as deleted */
+            if (ctx->walking_handlers) {
+                node->deleted = 1;
+                node->pfd.revents = 0;
+            } else {
+                /* Otherwise, delete it for real.  We can't just mark it as
+                 * deleted because deleted nodes are only cleaned up after
+                 * releasing the walking_handlers lock.
+                 */
+                QLIST_REMOVE(node, node);
+                g_free(node);
+            }
+        }
+    } else {
+        HANDLE event;
+
+        if (node == NULL) {
+            /* Alloc and insert if it's not already there */
+            node = g_malloc0(sizeof(AioHandler));
+            node->pfd.fd = fd;
+            QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
+        }
+
+        node->pfd.events = 0;
+        if (node->io_read) {
+            node->pfd.events |= G_IO_IN;
+        }
+        if (node->io_write) {
+            node->pfd.events |= G_IO_OUT;
+        }
+
+        node->e = &ctx->notifier;
+
+        /* Update handler with latest information */
+        node->opaque = opaque;
+        node->io_read = io_read;
+        node->io_write = io_write;
+
+        event = event_notifier_get_handle(&ctx->notifier);
+        WSAEventSelect(node->pfd.fd, event,
+                       FD_READ | FD_ACCEPT | FD_CLOSE |
+                       FD_CONNECT | FD_WRITE | FD_OOB);
+    }
+
+    aio_notify(ctx);
+}
+
 void aio_set_event_notifier(AioContext *ctx,
                             EventNotifier *e,
                             EventNotifierHandler *io_notify)
@@ -78,7 +146,39 @@ void aio_set_event_notifier(AioContext *ctx,
 
 bool aio_prepare(AioContext *ctx)
 {
-    return false;
+    static struct timeval tv0;
+    AioHandler *node;
+    bool have_select_revents = false;
+    fd_set rfds, wfds;
+
+    /* fill fd sets */
+    FD_ZERO(&rfds);
+    FD_ZERO(&wfds);
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        if (node->io_read) {
+            FD_SET ((SOCKET)node->pfd.fd, &rfds);
+        }
+        if (node->io_write) {
+            FD_SET ((SOCKET)node->pfd.fd, &wfds);
+        }
+    }
+
+    if (select(0, &rfds, &wfds, NULL, &tv0) > 0) {
+        QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+            node->pfd.revents = 0;
+            if (FD_ISSET(node->pfd.fd, &rfds)) {
+                node->pfd.revents |= G_IO_IN;
+                have_select_revents = true;
+            }
+
+            if (FD_ISSET(node->pfd.fd, &wfds)) {
+                node->pfd.revents |= G_IO_OUT;
+                have_select_revents = true;
+            }
+        }
+    }
+
+    return have_select_revents;
 }
 
 bool aio_pending(AioContext *ctx)
@@ -89,6 +189,13 @@ bool aio_pending(AioContext *ctx)
         if (node->pfd.revents && node->io_notify) {
             return true;
         }
+
+        if ((node->pfd.revents & G_IO_IN) && node->io_read) {
+            return true;
+        }
+        if ((node->pfd.revents & G_IO_OUT) && node->io_write) {
+            return true;
+        }
     }
 
     return false;
@@ -106,11 +213,12 @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
     node = QLIST_FIRST(&ctx->aio_handlers);
     while (node) {
         AioHandler *tmp;
+        int revents = node->pfd.revents;
 
         ctx->walking_handlers++;
 
         if (!node->deleted &&
-            (node->pfd.revents || event_notifier_get_handle(node->e) == event) &&
+            (revents || event_notifier_get_handle(node->e) == event) &&
             node->io_notify) {
             node->pfd.revents = 0;
             node->io_notify(node->e);
@@ -121,6 +229,28 @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
             }
         }
 
+        if (!node->deleted &&
+            (node->io_read || node->io_write)) {
+            node->pfd.revents = 0;
+            if ((revents & G_IO_IN) && node->io_read) {
+                node->io_read(node->opaque);
+                progress = true;
+            }
+            if ((revents & G_IO_OUT) && node->io_write) {
+                node->io_write(node->opaque);
+                progress = true;
+            }
+
+            /* if the next select() will return an event, we have progressed */
+            if (event == event_notifier_get_handle(&ctx->notifier)) {
+                WSANETWORKEVENTS ev;
+                WSAEnumNetworkEvents(node->pfd.fd, event, &ev);
+                if (ev.lNetworkEvents) {
+                    progress = true;
+                }
+            }
+        }
+
         tmp = node;
         node = QLIST_NEXT(node, node);
 
@@ -149,10 +279,15 @@ bool aio_poll(AioContext *ctx, bool blocking)
 {
     AioHandler *node;
     HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
-    bool was_dispatching, progress, first;
+    bool was_dispatching, progress, have_select_revents, first;
     int count;
     int timeout;
 
+    if (aio_prepare(ctx)) {
+        blocking = false;
+        have_select_revents = true;
+    }
+
     was_dispatching = ctx->dispatching;
     progress = false;
 
@@ -183,6 +318,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     /* wait until next event */
     while (count > 0) {
+        HANDLE event;
         int ret;
 
         timeout = blocking
@@ -196,13 +332,17 @@ bool aio_poll(AioContext *ctx, bool blocking)
         first = false;
 
         /* if we have any signaled events, dispatch event */
-        if ((DWORD) (ret - WAIT_OBJECT_0) >= count) {
+        event = NULL;
+        if ((DWORD) (ret - WAIT_OBJECT_0) < count) {
+            event = events[ret - WAIT_OBJECT_0];
+        } else if (!have_select_revents) {
             break;
         }
 
+        have_select_revents = false;
         blocking = false;
 
-        progress |= aio_dispatch_handlers(ctx, events[ret - WAIT_OBJECT_0]);
+        progress |= aio_dispatch_handlers(ctx, event);
 
         /* Try again, but only call each handler once.  */
         events[ret - WAIT_OBJECT_0] = events[--count];
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 858d2b387b..f45f9399aa 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -10,7 +10,6 @@ block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
-ifeq ($(CONFIG_POSIX),y)
 block-obj-y += nbd.o nbd-client.o sheepdog.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_LIBNFS) += nfs.o
@@ -19,7 +18,6 @@ block-obj-$(CONFIG_RBD) += rbd.o
 block-obj-$(CONFIG_GLUSTERFS) += gluster.o
 block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
-endif
 
 common-obj-y += stream.o
 common-obj-y += commit.o
diff --git a/include/block/aio.h b/include/block/aio.h
index ef4197b861..4603c0f066 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -239,7 +239,6 @@ bool aio_dispatch(AioContext *ctx);
  */
 bool aio_poll(AioContext *ctx, bool blocking);
 
-#ifdef CONFIG_POSIX
 /* Register a file descriptor and associated callbacks.  Behaves very similarly
  * to qemu_set_fd_handler2.  Unlike qemu_set_fd_handler2, these callbacks will
  * be invoked when using aio_poll().
@@ -252,7 +251,6 @@ void aio_set_fd_handler(AioContext *ctx,
                         IOHandler *io_read,
                         IOHandler *io_write,
                         void *opaque);
-#endif
 
 /* Register an event notifier and associated callbacks.  Behaves very similarly
  * to event_notifier_set_handler.  Unlike event_notifier_set_handler, these callbacks

From a780dea0454d2820e31407c33f167acf00fe447d Mon Sep 17 00:00:00 2001
From: Liu Yuan <namei.unix@gmail.com>
Date: Thu, 28 Aug 2014 18:27:55 +0800
Subject: [PATCH 22/35] sheepdog: fix a core dump while do auto-reconnecting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We should reinit local_err as NULL inside the while loop or g_free() will report
corrupption and abort the QEMU when sheepdog driver tries reconnecting.

This was broken in commit 356b4ca.

qemu-system-x86_64: failed to get the header, Resource temporarily unavailable
qemu-system-x86_64: Failed to connect to socket: Connection refused
qemu-system-x86_64: (null)
[xcb] Unknown sequence number while awaiting reply
[xcb] Most likely this is a multi-threaded client and XInitThreads has not been called
[xcb] Aborting, sorry about that.
qemu-system-x86_64: ../../src/xcb_io.c:298: poll_for_response: Assertion `!xcb_xlib_threads_sequence_lost' failed.
Aborted (core dumped)

Cc: qemu-devel@nongnu.org
Cc: Markus Armbruster <armbru@redhat.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Liu Yuan <namei.unix@gmail.com>
Reviewed-by: Benoît Canet <benoit.canet@nodalink.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/sheepdog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 49a9a9e5eb..f91afc3a5b 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -716,7 +716,6 @@ static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid)
 
 static coroutine_fn void reconnect_to_sdog(void *opaque)
 {
-    Error *local_err = NULL;
     BDRVSheepdogState *s = opaque;
     AIOReq *aio_req, *next;
 
@@ -731,6 +730,7 @@ static coroutine_fn void reconnect_to_sdog(void *opaque)
 
     /* Try to reconnect the sheepdog server every one second. */
     while (s->fd < 0) {
+        Error *local_err = NULL;
         s->fd = get_sheep_fd(s, &local_err);
         if (s->fd < 0) {
             DPRINTF("Wait for connection to be established\n");

From 958c717df97ea9ca47a2253b8371130fe5f22980 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 20 Jun 2014 21:57:32 +0200
Subject: [PATCH 23/35] nbd: Drop nbd_can_read()

There is no variant of aio_set_fd_handler() like qemu_set_fd_handler2(),
so we cannot give a can_read() callback function. Instead, unregister
the nbd_read() function whenever we cannot read and re-register it as
soon as we can read again.

All this is hidden behind the functions nbd_set_handlers() (which
registers all handlers for the AIO context and file descriptor belonging
to the given client), nbd_unset_handlers() (which unregisters them) and
nbd_update_can_read() (which checks whether NBD can read for the given
client and acts accordingly).

Signed-off-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 nbd.c | 72 +++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 19 deletions(-)

diff --git a/nbd.c b/nbd.c
index 5c28f71118..763c84f1be 100644
--- a/nbd.c
+++ b/nbd.c
@@ -18,6 +18,7 @@
 
 #include "block/nbd.h"
 #include "block/block.h"
+#include "block/block_int.h"
 
 #include "block/coroutine.h"
 
@@ -107,6 +108,8 @@ struct NBDExport {
     uint32_t nbdflags;
     QTAILQ_HEAD(, NBDClient) clients;
     QTAILQ_ENTRY(NBDExport) next;
+
+    AioContext *ctx;
 };
 
 static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
@@ -123,6 +126,8 @@ struct NBDClient {
     CoMutex send_lock;
     Coroutine *send_coroutine;
 
+    bool can_read;
+
     QTAILQ_ENTRY(NBDClient) next;
     int nb_requests;
     bool closing;
@@ -130,6 +135,10 @@ struct NBDClient {
 
 /* That's all folks */
 
+static void nbd_set_handlers(NBDClient *client);
+static void nbd_unset_handlers(NBDClient *client);
+static void nbd_update_can_read(NBDClient *client);
+
 ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
 {
     size_t offset = 0;
@@ -862,7 +871,7 @@ void nbd_client_put(NBDClient *client)
          */
         assert(client->closing);
 
-        qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
+        nbd_unset_handlers(client);
         close(client->sock);
         client->sock = -1;
         if (client->exp) {
@@ -898,6 +907,7 @@ static NBDRequest *nbd_request_get(NBDClient *client)
 
     assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
     client->nb_requests++;
+    nbd_update_can_read(client);
 
     req = g_slice_new0(NBDRequest);
     nbd_client_get(client);
@@ -914,9 +924,8 @@ static void nbd_request_put(NBDRequest *req)
     }
     g_slice_free(NBDRequest, req);
 
-    if (client->nb_requests-- == MAX_NBD_REQUESTS) {
-        qemu_notify_event();
-    }
+    client->nb_requests--;
+    nbd_update_can_read(client);
     nbd_client_put(client);
 }
 
@@ -932,6 +941,7 @@ NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
     exp->nbdflags = nbdflags;
     exp->size = size == -1 ? bdrv_getlength(bs) : size;
     exp->close = close;
+    exp->ctx = bdrv_get_aio_context(bs);
     bdrv_ref(bs);
     return exp;
 }
@@ -1023,10 +1033,6 @@ void nbd_export_close_all(void)
     }
 }
 
-static int nbd_can_read(void *opaque);
-static void nbd_read(void *opaque);
-static void nbd_restart_write(void *opaque);
-
 static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
                                  int len)
 {
@@ -1035,9 +1041,8 @@ static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
     ssize_t rc, ret;
 
     qemu_co_mutex_lock(&client->send_lock);
-    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
-                         nbd_restart_write, client);
     client->send_coroutine = qemu_coroutine_self();
+    nbd_set_handlers(client);
 
     if (!len) {
         rc = nbd_send_reply(csock, reply);
@@ -1054,7 +1059,7 @@ static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
     }
 
     client->send_coroutine = NULL;
-    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
+    nbd_set_handlers(client);
     qemu_co_mutex_unlock(&client->send_lock);
     return rc;
 }
@@ -1067,6 +1072,8 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *reque
     ssize_t rc;
 
     client->recv_coroutine = qemu_coroutine_self();
+    nbd_update_can_read(client);
+
     rc = nbd_receive_request(csock, request);
     if (rc < 0) {
         if (rc != -EAGAIN) {
@@ -1108,6 +1115,8 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *reque
 
 out:
     client->recv_coroutine = NULL;
+    nbd_update_can_read(client);
+
     return rc;
 }
 
@@ -1259,13 +1268,6 @@ out:
     nbd_client_close(client);
 }
 
-static int nbd_can_read(void *opaque)
-{
-    NBDClient *client = opaque;
-
-    return client->recv_coroutine || client->nb_requests < MAX_NBD_REQUESTS;
-}
-
 static void nbd_read(void *opaque)
 {
     NBDClient *client = opaque;
@@ -1284,6 +1286,37 @@ static void nbd_restart_write(void *opaque)
     qemu_coroutine_enter(client->send_coroutine, NULL);
 }
 
+static void nbd_set_handlers(NBDClient *client)
+{
+    if (client->exp && client->exp->ctx) {
+        aio_set_fd_handler(client->exp->ctx, client->sock,
+                           client->can_read ? nbd_read : NULL,
+                           client->send_coroutine ? nbd_restart_write : NULL,
+                           client);
+    }
+}
+
+static void nbd_unset_handlers(NBDClient *client)
+{
+    if (client->exp && client->exp->ctx) {
+        aio_set_fd_handler(client->exp->ctx, client->sock, NULL, NULL, NULL);
+    }
+}
+
+static void nbd_update_can_read(NBDClient *client)
+{
+    bool can_read = client->recv_coroutine ||
+                    client->nb_requests < MAX_NBD_REQUESTS;
+
+    if (can_read != client->can_read) {
+        client->can_read = can_read;
+        nbd_set_handlers(client);
+
+        /* There is no need to invoke aio_notify(), since aio_set_fd_handler()
+         * in nbd_set_handlers() will have taken care of that */
+    }
+}
+
 NBDClient *nbd_client_new(NBDExport *exp, int csock,
                           void (*close)(NBDClient *))
 {
@@ -1292,13 +1325,14 @@ NBDClient *nbd_client_new(NBDExport *exp, int csock,
     client->refcount = 1;
     client->exp = exp;
     client->sock = csock;
+    client->can_read = true;
     if (nbd_send_negotiate(client)) {
         g_free(client);
         return NULL;
     }
     client->close = close;
     qemu_co_mutex_init(&client->send_lock);
-    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
+    nbd_set_handlers(client);
 
     if (exp) {
         QTAILQ_INSERT_TAIL(&exp->clients, client, next);

From 33384421b3abe74555baeaf788b17204cd8c6080 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 20 Jun 2014 21:57:33 +0200
Subject: [PATCH 24/35] block: Add AIO context notifiers

If a long-running operation on a BDS wants to always remain in the same
AIO context, it somehow needs to keep track of the BDS changing its
context. This adds a function for registering callbacks on a BDS which
are called whenever the BDS is attached or detached from an AIO context.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block.c                   | 56 +++++++++++++++++++++++++++++++++++++++
 include/block/block_int.h | 41 ++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)

diff --git a/block.c b/block.c
index 1df13ac1c7..9c5566b55b 100644
--- a/block.c
+++ b/block.c
@@ -1819,6 +1819,8 @@ void bdrv_reopen_abort(BDRVReopenState *reopen_state)
 
 void bdrv_close(BlockDriverState *bs)
 {
+    BdrvAioNotifier *ban, *ban_next;
+
     if (bs->job) {
         block_job_cancel_sync(bs->job);
     }
@@ -1863,6 +1865,11 @@ void bdrv_close(BlockDriverState *bs)
     if (bs->io_limits_enabled) {
         bdrv_io_limits_disable(bs);
     }
+
+    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
+        g_free(ban);
+    }
+    QLIST_INIT(&bs->aio_notifiers);
 }
 
 void bdrv_close_all(void)
@@ -5729,10 +5736,16 @@ AioContext *bdrv_get_aio_context(BlockDriverState *bs)
 
 void bdrv_detach_aio_context(BlockDriverState *bs)
 {
+    BdrvAioNotifier *baf;
+
     if (!bs->drv) {
         return;
     }
 
+    QLIST_FOREACH(baf, &bs->aio_notifiers, list) {
+        baf->detach_aio_context(baf->opaque);
+    }
+
     if (bs->io_limits_enabled) {
         throttle_detach_aio_context(&bs->throttle_state);
     }
@@ -5752,6 +5765,8 @@ void bdrv_detach_aio_context(BlockDriverState *bs)
 void bdrv_attach_aio_context(BlockDriverState *bs,
                              AioContext *new_context)
 {
+    BdrvAioNotifier *ban;
+
     if (!bs->drv) {
         return;
     }
@@ -5770,6 +5785,10 @@ void bdrv_attach_aio_context(BlockDriverState *bs,
     if (bs->io_limits_enabled) {
         throttle_attach_aio_context(&bs->throttle_state, new_context);
     }
+
+    QLIST_FOREACH(ban, &bs->aio_notifiers, list) {
+        ban->attached_aio_context(new_context, ban->opaque);
+    }
 }
 
 void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
@@ -5786,6 +5805,43 @@ void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context)
     aio_context_release(new_context);
 }
 
+void bdrv_add_aio_context_notifier(BlockDriverState *bs,
+        void (*attached_aio_context)(AioContext *new_context, void *opaque),
+        void (*detach_aio_context)(void *opaque), void *opaque)
+{
+    BdrvAioNotifier *ban = g_new(BdrvAioNotifier, 1);
+    *ban = (BdrvAioNotifier){
+        .attached_aio_context = attached_aio_context,
+        .detach_aio_context   = detach_aio_context,
+        .opaque               = opaque
+    };
+
+    QLIST_INSERT_HEAD(&bs->aio_notifiers, ban, list);
+}
+
+void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
+                                      void (*attached_aio_context)(AioContext *,
+                                                                   void *),
+                                      void (*detach_aio_context)(void *),
+                                      void *opaque)
+{
+    BdrvAioNotifier *ban, *ban_next;
+
+    QLIST_FOREACH_SAFE(ban, &bs->aio_notifiers, list, ban_next) {
+        if (ban->attached_aio_context == attached_aio_context &&
+            ban->detach_aio_context   == detach_aio_context   &&
+            ban->opaque               == opaque)
+        {
+            QLIST_REMOVE(ban, list);
+            g_free(ban);
+
+            return;
+        }
+    }
+
+    abort();
+}
+
 void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                     NotifierWithReturn *notifier)
 {
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 233489547e..8a61215ac0 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -294,6 +294,15 @@ typedef struct BlockLimits {
 
 typedef struct BdrvOpBlocker BdrvOpBlocker;
 
+typedef struct BdrvAioNotifier {
+    void (*attached_aio_context)(AioContext *new_context, void *opaque);
+    void (*detach_aio_context)(void *opaque);
+
+    void *opaque;
+
+    QLIST_ENTRY(BdrvAioNotifier) list;
+} BdrvAioNotifier;
+
 /*
  * Note: the function bdrv_append() copies and swaps contents of
  * BlockDriverStates, so if you add new fields to this struct, please
@@ -320,6 +329,10 @@ struct BlockDriverState {
     void *dev_opaque;
 
     AioContext *aio_context; /* event loop used for fd handlers, timers, etc */
+    /* long-running tasks intended to always use the same AioContext as this
+     * BDS may register themselves in this list to be notified of changes
+     * regarding this BDS's context */
+    QLIST_HEAD(, BdrvAioNotifier) aio_notifiers;
 
     char filename[1024];
     char backing_file[1024]; /* if non zero, the image is a diff of
@@ -437,6 +450,34 @@ void bdrv_detach_aio_context(BlockDriverState *bs);
 void bdrv_attach_aio_context(BlockDriverState *bs,
                              AioContext *new_context);
 
+/**
+ * bdrv_add_aio_context_notifier:
+ *
+ * If a long-running job intends to be always run in the same AioContext as a
+ * certain BDS, it may use this function to be notified of changes regarding the
+ * association of the BDS to an AioContext.
+ *
+ * attached_aio_context() is called after the target BDS has been attached to a
+ * new AioContext; detach_aio_context() is called before the target BDS is being
+ * detached from its old AioContext.
+ */
+void bdrv_add_aio_context_notifier(BlockDriverState *bs,
+        void (*attached_aio_context)(AioContext *new_context, void *opaque),
+        void (*detach_aio_context)(void *opaque), void *opaque);
+
+/**
+ * bdrv_remove_aio_context_notifier:
+ *
+ * Unsubscribe of change notifications regarding the BDS's AioContext. The
+ * parameters given here have to be the same as those given to
+ * bdrv_add_aio_context_notifier().
+ */
+void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
+                                      void (*aio_context_attached)(AioContext *,
+                                                                   void *),
+                                      void (*aio_context_detached)(void *),
+                                      void *opaque);
+
 #ifdef _WIN32
 int is_windows_drive(const char *filename);
 #endif

From f21492817bc426a3bc0b98fa852df95be9dea1e8 Mon Sep 17 00:00:00 2001
From: Max Reitz <mreitz@redhat.com>
Date: Fri, 20 Jun 2014 21:57:34 +0200
Subject: [PATCH 25/35] nbd: Follow the BDS' AIO context

Keep the NBD server always in the same AIO context as the exported BDS
by calling bdrv_add_aio_context_notifier() and implementing the required
callbacks.

Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 nbd.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/nbd.c b/nbd.c
index 763c84f1be..e9b539be58 100644
--- a/nbd.c
+++ b/nbd.c
@@ -929,6 +929,34 @@ static void nbd_request_put(NBDRequest *req)
     nbd_client_put(client);
 }
 
+static void bs_aio_attached(AioContext *ctx, void *opaque)
+{
+    NBDExport *exp = opaque;
+    NBDClient *client;
+
+    TRACE("Export %s: Attaching clients to AIO context %p\n", exp->name, ctx);
+
+    exp->ctx = ctx;
+
+    QTAILQ_FOREACH(client, &exp->clients, next) {
+        nbd_set_handlers(client);
+    }
+}
+
+static void bs_aio_detach(void *opaque)
+{
+    NBDExport *exp = opaque;
+    NBDClient *client;
+
+    TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);
+
+    QTAILQ_FOREACH(client, &exp->clients, next) {
+        nbd_unset_handlers(client);
+    }
+
+    exp->ctx = NULL;
+}
+
 NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
                           off_t size, uint32_t nbdflags,
                           void (*close)(NBDExport *))
@@ -943,6 +971,7 @@ NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
     exp->close = close;
     exp->ctx = bdrv_get_aio_context(bs);
     bdrv_ref(bs);
+    bdrv_add_aio_context_notifier(bs, bs_aio_attached, bs_aio_detach, exp);
     return exp;
 }
 
@@ -990,6 +1019,8 @@ void nbd_export_close(NBDExport *exp)
     nbd_export_set_name(exp, NULL);
     nbd_export_put(exp);
     if (exp->bs) {
+        bdrv_remove_aio_context_notifier(exp->bs, bs_aio_attached,
+                                         bs_aio_detach, exp);
         bdrv_unref(exp->bs);
         exp->bs = NULL;
     }

From 391827eb106d2d02062b2582d1545a7c221631c6 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Wed, 30 Jul 2014 09:53:30 +0100
Subject: [PATCH 26/35] block: fix overlapping multiwrite requests

When request A is a strict superset of request B:

  AAAAAAAA
    BBBB

multiwrite_merge() merges them as follows:

  AABBBB

The tail of request A should have been included:

  AABBBBAA

This patch fixes data loss but this code path is probably rare.  Since
guests cannot assume ordering between in-flight requests, few
applications submit overlapping write requests.

Reported-by: Slava Pestov <sviatoslav.pestov@gmail.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Benoit Canet <benoit@irqsave.net>
---
 block.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/block.c b/block.c
index 9c5566b55b..cb670fd54d 100644
--- a/block.c
+++ b/block.c
@@ -4553,6 +4553,12 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
             // Add the second request
             qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size);
 
+            // Add tail of first request, if necessary
+            if (qiov->size < reqs[outidx].qiov->size) {
+                qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size,
+                                  reqs[outidx].qiov->size - qiov->size);
+            }
+
             reqs[outidx].nb_sectors = qiov->size >> 9;
             reqs[outidx].qiov = qiov;
 

From 12ade7609004bb1b09a845c144b36ea1850854c7 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Wed, 30 Jul 2014 09:53:31 +0100
Subject: [PATCH 27/35] qemu-iotests: add multiwrite test cases

This test case covers the basic bdrv_aio_multiwrite() scenarios:
1. Single request
2. Sequential requests (AABB)
3. Superset overlapping requests (AABBAA)
4. Subset overlapping requests (BBAABB)
5. Head overlapping requests (AABB)
6. Tail overlapping requests (BBAA)
7. Disjoint requests (AA BB)

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Benoit Canet <benoit@irqsave.net>
---
 tests/qemu-iotests/100     | 134 +++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/100.out |  89 ++++++++++++++++++++++++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 224 insertions(+)
 create mode 100755 tests/qemu-iotests/100
 create mode 100644 tests/qemu-iotests/100.out

diff --git a/tests/qemu-iotests/100 b/tests/qemu-iotests/100
new file mode 100755
index 0000000000..9124aba760
--- /dev/null
+++ b/tests/qemu-iotests/100
@@ -0,0 +1,134 @@
+#!/bin/bash
+#
+# Test simple read/write using plain bdrv_read/bdrv_write
+#
+# Copyright (C) 2014 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=stefanha@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt generic
+_supported_proto generic
+_supported_os Linux
+
+
+size=128M
+
+echo
+echo "== Single request =="
+_make_test_img $size
+$QEMU_IO -c "multiwrite 0 4k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== verify pattern =="
+$QEMU_IO -c "read -P 0xcd 0 4k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== Sequential requests =="
+_make_test_img $size
+$QEMU_IO -c "multiwrite 0 4k ; 4k 4k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== verify pattern =="
+$QEMU_IO -c "read -P 0xcd 0 4k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0xce 4k 4k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 8k 4k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== Superset overlapping requests =="
+_make_test_img $size
+$QEMU_IO -c "multiwrite 0 4k ; 1k 2k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== verify pattern =="
+# Order of overlapping in-flight requests is not guaranteed so we cannot verify
+# [1k, 3k) since it could have either pattern 0xcd or 0xce.
+$QEMU_IO -c "read -P 0xcd 0 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0xcd 3k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== Subset overlapping requests =="
+_make_test_img $size
+$QEMU_IO -c "multiwrite 1k 2k ; 0k 4k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== verify pattern =="
+# Order of overlapping in-flight requests is not guaranteed so we cannot verify
+# [1k, 3k) since it could have either pattern 0xcd or 0xce.
+$QEMU_IO -c "read -P 0xce 0 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0xce 3k 1k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== Head overlapping requests =="
+_make_test_img $size
+$QEMU_IO -c "multiwrite 0k 2k ; 0k 4k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== verify pattern =="
+# Order of overlapping in-flight requests is not guaranteed so we cannot verify
+# [0k, 2k) since it could have either pattern 0xcd or 0xce.
+$QEMU_IO -c "read -P 0xce 2k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== Tail overlapping requests =="
+_make_test_img $size
+$QEMU_IO -c "multiwrite 2k 2k ; 0k 4k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== verify pattern =="
+# Order of overlapping in-flight requests is not guaranteed so we cannot verify
+# [2k, 4k) since it could have either pattern 0xcd or 0xce.
+$QEMU_IO -c "read -P 0xce 0k 2k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== Disjoint requests =="
+_make_test_img $size
+$QEMU_IO -c "multiwrite 0 4k ; 64k 4k" "$TEST_IMG" | _filter_qemu_io
+
+echo
+echo "== verify pattern =="
+$QEMU_IO -c "read -P 0xcd 0 4k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 4k 60k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0xce 64k 4k" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "read -P 0 68k 4k" "$TEST_IMG" | _filter_qemu_io
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/100.out b/tests/qemu-iotests/100.out
new file mode 100644
index 0000000000..2d6e9f0a7d
--- /dev/null
+++ b/tests/qemu-iotests/100.out
@@ -0,0 +1,89 @@
+QA output created by 100
+
+== Single request ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
+wrote 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== verify pattern ==
+read 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 4096
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== Sequential requests ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
+wrote 8192/8192 bytes at offset 0
+8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== verify pattern ==
+read 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 4096
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 8192
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== Superset overlapping requests ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
+wrote 6144/6144 bytes at offset 0
+6 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== verify pattern ==
+read 1024/1024 bytes at offset 0
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 3072
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 4096
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== Subset overlapping requests ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
+wrote 6144/6144 bytes at offset 1024
+6 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== verify pattern ==
+read 1024/1024 bytes at offset 0
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 1024/1024 bytes at offset 3072
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 4096
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== Head overlapping requests ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
+wrote 6144/6144 bytes at offset 0
+6 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== verify pattern ==
+read 2048/2048 bytes at offset 2048
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 4096
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== Tail overlapping requests ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
+wrote 6144/6144 bytes at offset 2048
+6 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== verify pattern ==
+read 2048/2048 bytes at offset 0
+2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 4096
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== Disjoint requests ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
+wrote 8192/8192 bytes at offset 0
+8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== verify pattern ==
+read 4096/4096 bytes at offset 0
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 61440/61440 bytes at offset 4096
+60 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 65536
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 4096/4096 bytes at offset 69632
+4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 2803d68d62..0920b28db4 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -101,5 +101,6 @@
 092 rw auto quick
 095 rw auto quick
 099 rw auto quick
+100 rw auto quick
 101 rw auto quick
 103 rw auto quick

From 2cdff7f620ebd3b5246cf0c0d1f6fa0eededa4ca Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 4 Aug 2014 16:56:33 +0100
Subject: [PATCH 28/35] linux-aio: avoid deadlock in nested aio_poll() calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If two Linux AIO request completions are fetched in the same
io_getevents() call, QEMU will deadlock if request A's callback waits
for request B to complete using an aio_poll() loop.  This was reported
to happen with the mirror blockjob.

This patch moves completion processing into a BH and makes it resumable.
Nested event loops can resume completion processing so that request B
will complete and the deadlock will not occur.

Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Ming Lei <ming.lei@canonical.com>
Cc: Marcin Gibuła <m.gibula@beyond.pl>
Reported-by: Marcin Gibuła <m.gibula@beyond.pl>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Tested-by: Marcin Gibuła <m.gibula@beyond.pl>
---
 block/linux-aio.c | 73 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 56 insertions(+), 17 deletions(-)

diff --git a/block/linux-aio.c b/block/linux-aio.c
index 7ac7e8c99c..9aca758b10 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -51,6 +51,12 @@ struct qemu_laio_state {
 
     /* io queue for submit at batch */
     LaioQueue io_q;
+
+    /* I/O completion processing */
+    QEMUBH *completion_bh;
+    struct io_event events[MAX_EVENTS];
+    int event_idx;
+    int event_max;
 };
 
 static inline ssize_t io_event_ret(struct io_event *ev)
@@ -86,27 +92,58 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
     qemu_aio_release(laiocb);
 }
 
+/* The completion BH fetches completed I/O requests and invokes their
+ * callbacks.
+ *
+ * The function is somewhat tricky because it supports nested event loops, for
+ * example when a request callback invokes aio_poll().  In order to do this,
+ * the completion events array and index are kept in qemu_laio_state.  The BH
+ * reschedules itself as long as there are completions pending so it will
+ * either be called again in a nested event loop or will be called after all
+ * events have been completed.  When there are no events left to complete, the
+ * BH returns without rescheduling.
+ */
+static void qemu_laio_completion_bh(void *opaque)
+{
+    struct qemu_laio_state *s = opaque;
+
+    /* Fetch more completion events when empty */
+    if (s->event_idx == s->event_max) {
+        do {
+            struct timespec ts = { 0 };
+            s->event_max = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS,
+                                        s->events, &ts);
+        } while (s->event_max == -EINTR);
+
+        s->event_idx = 0;
+        if (s->event_max <= 0) {
+            s->event_max = 0;
+            return; /* no more events */
+        }
+    }
+
+    /* Reschedule so nested event loops see currently pending completions */
+    qemu_bh_schedule(s->completion_bh);
+
+    /* Process completion events */
+    while (s->event_idx < s->event_max) {
+        struct iocb *iocb = s->events[s->event_idx].obj;
+        struct qemu_laiocb *laiocb =
+                container_of(iocb, struct qemu_laiocb, iocb);
+
+        laiocb->ret = io_event_ret(&s->events[s->event_idx]);
+        s->event_idx++;
+
+        qemu_laio_process_completion(s, laiocb);
+    }
+}
+
 static void qemu_laio_completion_cb(EventNotifier *e)
 {
     struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e);
 
-    while (event_notifier_test_and_clear(&s->e)) {
-        struct io_event events[MAX_EVENTS];
-        struct timespec ts = { 0 };
-        int nevents, i;
-
-        do {
-            nevents = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS, events, &ts);
-        } while (nevents == -EINTR);
-
-        for (i = 0; i < nevents; i++) {
-            struct iocb *iocb = events[i].obj;
-            struct qemu_laiocb *laiocb =
-                    container_of(iocb, struct qemu_laiocb, iocb);
-
-            laiocb->ret = io_event_ret(&events[i]);
-            qemu_laio_process_completion(s, laiocb);
-        }
+    if (event_notifier_test_and_clear(&s->e)) {
+        qemu_bh_schedule(s->completion_bh);
     }
 }
 
@@ -272,12 +309,14 @@ void laio_detach_aio_context(void *s_, AioContext *old_context)
     struct qemu_laio_state *s = s_;
 
     aio_set_event_notifier(old_context, &s->e, NULL);
+    qemu_bh_delete(s->completion_bh);
 }
 
 void laio_attach_aio_context(void *s_, AioContext *new_context)
 {
     struct qemu_laio_state *s = s_;
 
+    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
     aio_set_event_notifier(new_context, &s->e, qemu_laio_completion_cb);
 }
 

From 8ad4202bf61bc1d124ff26016cfe17cb261cc392 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 18 Aug 2014 16:07:12 +0100
Subject: [PATCH 29/35] block: acquire AioContext in do_drive_del()

Make drive_del safe for dataplane where another thread may be running
the BlockDriverState's AioContext.

Note the assumption that AioContext's lifetime exceeds DriveInfo and
BlockDriverState.  We release AioContext after DriveInfo and
BlockDriverState are potentially freed.

This is clearly safe with the global AioContext but also with -object
iothread and implicit iothreads created by -device
virtio-blk-pci,x-data-plane=on (their lifetime is tied to DeviceState,
not BlockDriverState).

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 blockdev.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/blockdev.c b/blockdev.c
index eeb414efc0..e37b068e9e 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1757,6 +1757,7 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
 {
     const char *id = qdict_get_str(qdict, "id");
     BlockDriverState *bs;
+    AioContext *aio_context;
     Error *local_err = NULL;
 
     bs = bdrv_find(id);
@@ -1764,9 +1765,14 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
         error_report("Device '%s' not found", id);
         return -1;
     }
+
+    aio_context = bdrv_get_aio_context(bs);
+    aio_context_acquire(aio_context);
+
     if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_DRIVE_DEL, &local_err)) {
         error_report("%s", error_get_pretty(local_err));
         error_free(local_err);
+        aio_context_release(aio_context);
         return -1;
     }
 
@@ -1790,6 +1796,7 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
         drive_del(drive_get_by_blockdev(bs));
     }
 
+    aio_context_release(aio_context);
     return 0;
 }
 

From 3255d1c21f364774390061fbc3b8e25c027cc862 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Mon, 18 Aug 2014 16:07:13 +0100
Subject: [PATCH 30/35] virtio-blk: allow drive_del with dataplane

Now that drive_del acquires the AioContext we can safely allow deleting
the drive.  As with non-dataplane mode, all I/Os submitted by the guest
after drive_del will return EIO.

This patch makes hot unplug work with virtio-blk dataplane.  Previously
drive_del reported an error because the device was busy.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/block/dataplane/virtio-blk.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index c07adc6e4f..b55188cb82 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -194,6 +194,7 @@ void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
     error_setg(&s->blocker, "block device is in use by data plane");
     bdrv_op_block_all(blk->conf.bs, s->blocker);
     bdrv_op_unblock(blk->conf.bs, BLOCK_OP_TYPE_RESIZE, s->blocker);
+    bdrv_op_unblock(blk->conf.bs, BLOCK_OP_TYPE_DRIVE_DEL, s->blocker);
 
     *dataplane = s;
 }

From a94f83d94fdf907680f068f1be7ad13d1f697067 Mon Sep 17 00:00:00 2001
From: "Richard W.M. Jones" <rjones@redhat.com>
Date: Fri, 29 Aug 2014 16:03:12 +0100
Subject: [PATCH 31/35] curl: Allow a cookie or cookies to be sent with
 http/https requests.

In order to access VMware ESX efficiently, we need to send a session
cookie.  This patch is very simple and just allows you to send that
session cookie.  It punts on the question of how you get the session
cookie in the first place, but in practice you can just run a `curl'
command against the server and extract the cookie that way.

To use it, add file.cookie to the curl URL.  For example:

$ qemu-img info 'json: {
    "file.driver":"https",
    "file.url":"https://vcenter/folder/Windows%202003/Windows%202003-flat.vmdk?dcPath=Datacenter&dsName=datastore1",
    "file.sslverify":"off",
    "file.cookie":"vmware_soap_session=\"52a01262-bf93-ccce-d379-8dabb3e55560\""}'
image: [...]
file format: raw
virtual size: 8.0G (8589934592 bytes)
disk size: unavailable

Signed-off-by: Richard W.M. Jones <rjones@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/curl.c    | 16 ++++++++++++++++
 qemu-options.hx |  5 +++++
 2 files changed, 21 insertions(+)

diff --git a/block/curl.c b/block/curl.c
index 2698ae31e8..9051bc05c1 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -73,6 +73,7 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle,
 #define CURL_BLOCK_OPT_READAHEAD "readahead"
 #define CURL_BLOCK_OPT_SSLVERIFY "sslverify"
 #define CURL_BLOCK_OPT_TIMEOUT "timeout"
+#define CURL_BLOCK_OPT_COOKIE    "cookie"
 
 struct BDRVCURLState;
 
@@ -112,6 +113,7 @@ typedef struct BDRVCURLState {
     size_t readahead_size;
     bool sslverify;
     int timeout;
+    char *cookie;
     bool accept_range;
     AioContext *aio_context;
 } BDRVCURLState;
@@ -385,6 +387,9 @@ static CURLState *curl_init_state(BDRVCURLState *s)
         curl_easy_setopt(state->curl, CURLOPT_URL, s->url);
         curl_easy_setopt(state->curl, CURLOPT_SSL_VERIFYPEER,
                          (long) s->sslverify);
+        if (s->cookie) {
+            curl_easy_setopt(state->curl, CURLOPT_COOKIE, s->cookie);
+        }
         curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, s->timeout);
         curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION,
                          (void *)curl_read_cb);
@@ -497,6 +502,11 @@ static QemuOptsList runtime_opts = {
             .type = QEMU_OPT_NUMBER,
             .help = "Curl timeout"
         },
+        {
+            .name = CURL_BLOCK_OPT_COOKIE,
+            .type = QEMU_OPT_STRING,
+            .help = "Pass the cookie or list of cookies with each request"
+        },
         { /* end of list */ }
     },
 };
@@ -509,6 +519,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
     QemuOpts *opts;
     Error *local_err = NULL;
     const char *file;
+    const char *cookie;
     double d;
 
     static int inited = 0;
@@ -538,6 +549,9 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
 
     s->sslverify = qemu_opt_get_bool(opts, CURL_BLOCK_OPT_SSLVERIFY, true);
 
+    cookie = qemu_opt_get(opts, CURL_BLOCK_OPT_COOKIE);
+    s->cookie = g_strdup(cookie);
+
     file = qemu_opt_get(opts, CURL_BLOCK_OPT_URL);
     if (file == NULL) {
         error_setg(errp, "curl block driver requires an 'url' option");
@@ -593,6 +607,7 @@ out:
     curl_easy_cleanup(state->curl);
     state->curl = NULL;
 out_noclean:
+    g_free(s->cookie);
     g_free(s->url);
     qemu_opts_del(opts);
     return -EINVAL;
@@ -695,6 +710,7 @@ static void curl_close(BlockDriverState *bs)
     DPRINTF("CURL: Close\n");
     curl_detach_aio_context(bs);
 
+    g_free(s->cookie);
     g_free(s->url);
 }
 
diff --git a/qemu-options.hx b/qemu-options.hx
index 52d56f4a1c..5479cf54f4 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2352,6 +2352,11 @@ multiple of 512 bytes. It defaults to 256k.
 Whether to verify the remote server's certificate when connecting over SSL. It
 can have the value 'on' or 'off'. It defaults to 'on'.
 
+@item cookie
+Send this cookie (it can also be a list of cookies separated by ';') with
+each outgoing request.  Only supported when using protocols such as HTTP
+which support cookies, otherwise ignored.
+
 @item timeout
 Set the timeout in seconds of the CURL connection. This timeout is the time
 that CURL waits for a response from the remote server to get the size of the

From a2f468e48f8b6559ec9123e94948bc373b788941 Mon Sep 17 00:00:00 2001
From: "Richard W.M. Jones" <rjones@redhat.com>
Date: Thu, 28 Aug 2014 09:04:21 +0100
Subject: [PATCH 32/35] curl: Don't deref NULL pointer in call to aio_poll.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In commit 63f0f45f2e89b60ff8245fec81328ddfde42a303 the following
mechanical change was made:

         if (!state) {
-            qemu_aio_wait();
+            aio_poll(state->s->aio_context, true);
         }

The new code now checks if state is NULL and then dereferences it
('state->s') which is obviously incorrect.

This commit replaces state->s->aio_context with
bdrv_get_aio_context(bs), fixing this problem.  The two other hunks
are concerned with getting the BlockDriverState pointer bs to where it
is needed.

The original bug causes a segfault when using libguestfs to access a
VMware vCenter Server and doing any kind of complex read-heavy
operations.  With this commit the segfault goes away.

Signed-off-by: Richard W.M. Jones <rjones@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Benoît Canet <benoit.canet@nodalink.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/curl.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index 9051bc05c1..025833994c 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -357,7 +357,7 @@ static void curl_multi_timeout_do(void *arg)
 #endif
 }
 
-static CURLState *curl_init_state(BDRVCURLState *s)
+static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s)
 {
     CURLState *state = NULL;
     int i, j;
@@ -375,7 +375,7 @@ static CURLState *curl_init_state(BDRVCURLState *s)
             break;
         }
         if (!state) {
-            aio_poll(state->s->aio_context, true);
+            aio_poll(bdrv_get_aio_context(bs), true);
         }
     } while(!state);
 
@@ -566,7 +566,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags,
     DPRINTF("CURL: Opening %s\n", file);
     s->aio_context = bdrv_get_aio_context(bs);
     s->url = g_strdup(file);
-    state = curl_init_state(s);
+    state = curl_init_state(bs, s);
     if (!state)
         goto out_noclean;
 
@@ -651,7 +651,7 @@ static void curl_readv_bh_cb(void *p)
     }
 
     // No cache found, so let's start a new request
-    state = curl_init_state(s);
+    state = curl_init_state(acb->common.bs, s);
     if (!state) {
         acb->common.cb(acb->common.opaque, -EIO);
         qemu_aio_release(acb);

From 810f4f86b7ebfd0a89fb65bff24aae006483cd58 Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Thu, 28 Aug 2014 13:56:10 +0800
Subject: [PATCH 33/35] nfs: Fix leak of opts in nfs_file_open
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Benoît Canet <benoit.canet@nodalink.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nfs.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/block/nfs.c b/block/nfs.c
index 93d87f3256..194f301501 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -393,16 +393,20 @@ static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags,
     qemu_opts_absorb_qdict(opts, options, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
-        return -EINVAL;
+        ret = -EINVAL;
+        goto out;
     }
     ret = nfs_client_open(client, qemu_opt_get(opts, "filename"),
                           (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY,
                           errp);
     if (ret < 0) {
-        return ret;
+        goto out;
     }
     bs->total_sectors = ret;
-    return 0;
+    ret = 0;
+out:
+    qemu_opts_del(opts);
+    return ret;
 }
 
 static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)

From 315859312628e581322fe44742f3a05d1549539a Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Thu, 28 Aug 2014 13:56:11 +0800
Subject: [PATCH 34/35] blkverify: Fix leak of opts in blkverify_open
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Benoît Canet <benoit.canet@nodalink.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/blkverify.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blkverify.c b/block/blkverify.c
index 7c78ca41a5..163064cf6b 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -158,6 +158,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags,
 
     ret = 0;
 fail:
+    qemu_opts_del(opts);
     return ret;
 }
 

From 8df3abfceef557551f00adac1618ddd6fe46f85c Mon Sep 17 00:00:00 2001
From: Fam Zheng <famz@redhat.com>
Date: Thu, 28 Aug 2014 13:56:12 +0800
Subject: [PATCH 35/35] quorum: Fix leak of opts in quorum_open
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Benoît Canet <benoit.canet@nodalink.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/quorum.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/quorum.c b/block/quorum.c
index 0160fe3ae9..093382e8f5 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -868,7 +868,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
 {
     BDRVQuorumState *s = bs->opaque;
     Error *local_err = NULL;
-    QemuOpts *opts;
+    QemuOpts *opts = NULL;
     bool *opened;
     QDict *sub = NULL;
     QList *list = NULL;
@@ -989,6 +989,7 @@ close_exit:
     g_free(s->bs);
     g_free(opened);
 exit:
+    qemu_opts_del(opts);
     /* propagate error */
     if (local_err) {
         error_propagate(errp, local_err);