diff --git a/QMP/qmp-events.txt b/QMP/qmp-events.txt
index 39b6016460..4b24ec900d 100644
--- a/QMP/qmp-events.txt
+++ b/QMP/qmp-events.txt
@@ -18,6 +18,28 @@ Example:
     "data": { "actual": 944766976 },
     "timestamp": { "seconds": 1267020223, "microseconds": 435656 } }
 
+BLOCK_IMAGE_CORRUPTED
+---------------------
+
+Emitted when a disk image is being marked corrupt.
+
+Data:
+
+- "device": Device name (json-string)
+- "msg":    Informative message (e.g., reason for the corruption) (json-string)
+- "offset": If the corruption resulted from an image access, this is the access
+            offset into the image (json-int)
+- "size":   If the corruption resulted from an image access, this is the access
+            size (json-int)
+
+Example:
+
+{ "event": "BLOCK_IMAGE_CORRUPTED",
+    "data": { "device": "ide0-hd0",
+        "msg": "Prevented active L1 table overwrite", "offset": 196608,
+        "size": 65536 },
+    "timestamp": { "seconds": 1378126126, "microseconds": 966463 } }
+
 BLOCK_IO_ERROR
 --------------
 
diff --git a/block-migration.c b/block-migration.c
index f803f2006f..daf9ec1eab 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -336,8 +336,8 @@ static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
         bmds->completed_sectors = 0;
         bmds->shared_base = block_mig_state.shared_base;
         alloc_aio_bitmap(bmds);
-        drive_get_ref(drive_get_by_blockdev(bs));
         bdrv_set_in_use(bs, 1);
+        bdrv_ref(bs);
 
         block_mig_state.total_sector_sum += sectors;
 
@@ -575,7 +575,7 @@ static void blk_mig_cleanup(void)
     while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
         bdrv_set_in_use(bmds->bs, 0);
-        drive_put_ref(drive_get_by_blockdev(bmds->bs));
+        bdrv_unref(bmds->bs);
         g_free(bmds->aio_bitmap);
         g_free(bmds);
     }
diff --git a/block.c b/block.c
index 26639e8b70..a325efcb21 100644
--- a/block.c
+++ b/block.c
@@ -86,13 +86,6 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque);
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors);
 
-static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
-        bool is_write, double elapsed_time, uint64_t *wait);
-static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
-        double elapsed_time, uint64_t *wait);
-static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
-        bool is_write, int64_t *wait);
-
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
@@ -123,69 +116,101 @@ int is_windows_drive(const char *filename)
 #endif
 
 /* throttling disk I/O limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+                        ThrottleConfig *cfg)
+{
+    int i;
+
+    throttle_config(&bs->throttle_state, cfg);
+
+    for (i = 0; i < 2; i++) {
+        qemu_co_enter_next(&bs->throttled_reqs[i]);
+    }
+}
+
+/* this function drain all the throttled IOs */
+static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
+{
+    bool drained = false;
+    bool enabled = bs->io_limits_enabled;
+    int i;
+
+    bs->io_limits_enabled = false;
+
+    for (i = 0; i < 2; i++) {
+        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
+            drained = true;
+        }
+    }
+
+    bs->io_limits_enabled = enabled;
+
+    return drained;
+}
+
 void bdrv_io_limits_disable(BlockDriverState *bs)
 {
     bs->io_limits_enabled = false;
 
-    do {} while (qemu_co_enter_next(&bs->throttled_reqs));
+    bdrv_start_throttled_reqs(bs);
 
-    if (bs->block_timer) {
-        timer_del(bs->block_timer);
-        timer_free(bs->block_timer);
-        bs->block_timer = NULL;
-    }
-
-    bs->slice_start = 0;
-    bs->slice_end   = 0;
+    throttle_destroy(&bs->throttle_state);
 }
 
-static void bdrv_block_timer(void *opaque)
+static void bdrv_throttle_read_timer_cb(void *opaque)
 {
     BlockDriverState *bs = opaque;
-
-    qemu_co_enter_next(&bs->throttled_reqs);
+    qemu_co_enter_next(&bs->throttled_reqs[0]);
 }
 
+static void bdrv_throttle_write_timer_cb(void *opaque)
+{
+    BlockDriverState *bs = opaque;
+    qemu_co_enter_next(&bs->throttled_reqs[1]);
+}
+
+/* should be called before bdrv_set_io_limits if a limit is set */
 void bdrv_io_limits_enable(BlockDriverState *bs)
 {
-    bs->block_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, bdrv_block_timer, bs);
+    assert(!bs->io_limits_enabled);
+    throttle_init(&bs->throttle_state,
+                  QEMU_CLOCK_VIRTUAL,
+                  bdrv_throttle_read_timer_cb,
+                  bdrv_throttle_write_timer_cb,
+                  bs);
     bs->io_limits_enabled = true;
 }
 
-bool bdrv_io_limits_enabled(BlockDriverState *bs)
-{
-    BlockIOLimit *io_limits = &bs->io_limits;
-    return io_limits->bps[BLOCK_IO_LIMIT_READ]
-         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
-         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
-         || io_limits->iops[BLOCK_IO_LIMIT_READ]
-         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
-         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
-}
-
+/* This function makes an IO wait if needed
+ *
+ * @nb_sectors: the number of sectors of the IO
+ * @is_write:   is the IO a write
+ */
 static void bdrv_io_limits_intercept(BlockDriverState *bs,
-                                     bool is_write, int nb_sectors)
+                                     int nb_sectors,
+                                     bool is_write)
 {
-    int64_t wait_time = -1;
+    /* does this io must wait */
+    bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write);
 
-    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
-        qemu_co_queue_wait(&bs->throttled_reqs);
+    /* if must wait or any request of this type throttled queue the IO */
+    if (must_wait ||
+        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
+        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
     }
 
-    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
-     * throttled requests will not be dequeued until the current request is
-     * allowed to be serviced. So if the current request still exceeds the
-     * limits, it will be inserted to the head. All requests followed it will
-     * be still in throttled_reqs queue.
-     */
+    /* the IO will be executed, do the accounting */
+    throttle_account(&bs->throttle_state,
+                     is_write,
+                     nb_sectors * BDRV_SECTOR_SIZE);
 
-    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
-        timer_mod(bs->block_timer,
-                       wait_time + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL));
-        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
+    /* if the next request must wait -> do nothing */
+    if (throttle_schedule_timer(&bs->throttle_state, is_write)) {
+        return;
     }
 
-    qemu_co_queue_next(&bs->throttled_reqs);
+    /* else queue next request for execution */
+    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
 }
 
 /* check if the path starts with "<protocol>:" */
@@ -305,7 +330,9 @@ BlockDriverState *bdrv_new(const char *device_name)
     bdrv_iostatus_disable(bs);
     notifier_list_init(&bs->close_notifiers);
     notifier_with_return_list_init(&bs->before_write_notifiers);
-    qemu_co_queue_init(&bs->throttled_reqs);
+    qemu_co_queue_init(&bs->throttled_reqs[0]);
+    qemu_co_queue_init(&bs->throttled_reqs[1]);
+    bs->refcnt = 1;
 
     return bs;
 }
@@ -876,7 +903,7 @@ fail:
     if (!bs->drv) {
         QDECREF(bs->options);
     }
-    bdrv_delete(bs);
+    bdrv_unref(bs);
     return ret;
 }
 
@@ -927,7 +954,7 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options)
                     *backing_filename ? backing_filename : NULL, options,
                     back_flags, back_drv);
     if (ret < 0) {
-        bdrv_delete(bs->backing_hd);
+        bdrv_unref(bs->backing_hd);
         bs->backing_hd = NULL;
         bs->open_flags |= BDRV_O_NO_BACKING;
         return ret;
@@ -1002,12 +1029,12 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
         bs1 = bdrv_new("");
         ret = bdrv_open(bs1, filename, NULL, 0, drv);
         if (ret < 0) {
-            bdrv_delete(bs1);
+            bdrv_unref(bs1);
             goto fail;
         }
         total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;
 
-        bdrv_delete(bs1);
+        bdrv_unref(bs1);
 
         ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename));
         if (ret < 0) {
@@ -1081,7 +1108,7 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
     }
 
     if (bs->file != file) {
-        bdrv_delete(file);
+        bdrv_unref(file);
         file = NULL;
     }
 
@@ -1112,16 +1139,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
         bdrv_dev_change_media_cb(bs, true);
     }
 
-    /* throttling disk I/O limits */
-    if (bs->io_limits_enabled) {
-        bdrv_io_limits_enable(bs);
-    }
-
     return 0;
 
 unlink_and_fail:
     if (file != NULL) {
-        bdrv_delete(file);
+        bdrv_unref(file);
     }
     if (bs->is_temporary) {
         unlink(filename);
@@ -1382,7 +1404,7 @@ void bdrv_close(BlockDriverState *bs)
 
     if (bs->drv) {
         if (bs->backing_hd) {
-            bdrv_delete(bs->backing_hd);
+            bdrv_unref(bs->backing_hd);
             bs->backing_hd = NULL;
         }
         bs->drv->bdrv_close(bs);
@@ -1407,7 +1429,7 @@ void bdrv_close(BlockDriverState *bs)
         bs->options = NULL;
 
         if (bs->file != NULL) {
-            bdrv_delete(bs->file);
+            bdrv_unref(bs->file);
             bs->file = NULL;
         }
     }
@@ -1435,7 +1457,10 @@ static bool bdrv_requests_pending(BlockDriverState *bs)
     if (!QLIST_EMPTY(&bs->tracked_requests)) {
         return true;
     }
-    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
+    if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) {
+        return true;
+    }
+    if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) {
         return true;
     }
     if (bs->file && bdrv_requests_pending(bs->file)) {
@@ -1481,7 +1506,7 @@ void bdrv_drain_all(void)
          * a busy wait.
          */
         QTAILQ_FOREACH(bs, &bdrv_states, list) {
-            while (qemu_co_enter_next(&bs->throttled_reqs)) {
+            if (bdrv_start_throttled_reqs(bs)) {
                 busy = true;
             }
         }
@@ -1523,13 +1548,12 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
 
     bs_dest->enable_write_cache = bs_src->enable_write_cache;
 
-    /* i/o timing parameters */
-    bs_dest->slice_start        = bs_src->slice_start;
-    bs_dest->slice_end          = bs_src->slice_end;
-    bs_dest->slice_submitted    = bs_src->slice_submitted;
-    bs_dest->io_limits          = bs_src->io_limits;
-    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
-    bs_dest->block_timer        = bs_src->block_timer;
+    /* i/o throttled req */
+    memcpy(&bs_dest->throttle_state,
+           &bs_src->throttle_state,
+           sizeof(ThrottleState));
+    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
+    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
 
     /* r/w error */
@@ -1543,6 +1567,9 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
     /* dirty bitmap */
     bs_dest->dirty_bitmap       = bs_src->dirty_bitmap;
 
+    /* reference count */
+    bs_dest->refcnt             = bs_src->refcnt;
+
     /* job */
     bs_dest->in_use             = bs_src->in_use;
     bs_dest->job                = bs_src->job;
@@ -1576,7 +1603,7 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
     assert(bs_new->dev == NULL);
     assert(bs_new->in_use == 0);
     assert(bs_new->io_limits_enabled == false);
-    assert(bs_new->block_timer == NULL);
+    assert(!throttle_have_timer(&bs_new->throttle_state));
 
     tmp = *bs_new;
     *bs_new = *bs_old;
@@ -1595,7 +1622,7 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
     assert(bs_new->job == NULL);
     assert(bs_new->in_use == 0);
     assert(bs_new->io_limits_enabled == false);
-    assert(bs_new->block_timer == NULL);
+    assert(!throttle_have_timer(&bs_new->throttle_state));
 
     bdrv_rebind(bs_new);
     bdrv_rebind(bs_old);
@@ -1626,11 +1653,12 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
             bs_new->drv ? bs_new->drv->format_name : "");
 }
 
-void bdrv_delete(BlockDriverState *bs)
+static void bdrv_delete(BlockDriverState *bs)
 {
     assert(!bs->dev);
     assert(!bs->job);
     assert(!bs->in_use);
+    assert(!bs->refcnt);
 
     bdrv_close(bs);
 
@@ -1829,8 +1857,11 @@ int bdrv_commit(BlockDriverState *bs)
     buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
 
     for (sector = 0; sector < total_sectors; sector += n) {
-        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {
-
+        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
+        if (ret < 0) {
+            goto ro_cleanup;
+        }
+        if (ret) {
             if (bdrv_read(bs, sector, buf, n) != 0) {
                 ret = -EIO;
                 goto ro_cleanup;
@@ -2146,7 +2177,7 @@ int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
     QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
         /* so that bdrv_close() does not recursively close the chain */
         intermediate_state->bs->backing_hd = NULL;
-        bdrv_delete(intermediate_state->bs);
+        bdrv_unref(intermediate_state->bs);
     }
     ret = 0;
 
@@ -2538,11 +2569,6 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
         return -EIO;
     }
 
-    /* throttling disk read I/O */
-    if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, false, nb_sectors);
-    }
-
     if (bs->copy_on_read) {
         flags |= BDRV_REQ_COPY_ON_READ;
     }
@@ -2554,12 +2580,17 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
     }
 
+    /* throttling disk I/O */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_intercept(bs, nb_sectors, false);
+    }
+
     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
 
     if (flags & BDRV_REQ_COPY_ON_READ) {
         int pnum;
 
-        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
+        ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum);
         if (ret < 0) {
             goto out;
         }
@@ -2679,15 +2710,15 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         return -EIO;
     }
 
-    /* throttling disk write I/O */
-    if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, true, nb_sectors);
-    }
-
     if (bs->copy_on_read_in_flight) {
         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
     }
 
+    /* throttling disk I/O */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_intercept(bs, nb_sectors, true);
+    }
+
     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
 
     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
@@ -2711,6 +2742,9 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
     if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
         bs->wr_highest_sector = sector_num + nb_sectors - 1;
     }
+    if (bs->growable && ret >= 0) {
+        bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors);
+    }
 
     tracked_request_end(&req);
 
@@ -2785,7 +2819,7 @@ int64_t bdrv_getlength(BlockDriverState *bs)
     if (!drv)
         return -ENOMEDIUM;
 
-    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
+    if (bdrv_dev_has_removable_media(bs)) {
         if (drv->bdrv_getlength) {
             return drv->bdrv_getlength(bs);
         }
@@ -2805,14 +2839,6 @@ void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
     *nb_sectors_ptr = length;
 }
 
-/* throttling disk io limits */
-void bdrv_set_io_limits(BlockDriverState *bs,
-                        BlockIOLimit *io_limits)
-{
-    bs->io_limits = *io_limits;
-    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
-}
-
 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                        BlockdevOnError on_write_error)
 {
@@ -3005,6 +3031,11 @@ int bdrv_has_zero_init(BlockDriverState *bs)
 {
     assert(bs->drv);
 
+    /* If BS is a copy on write image, it is initialized to
+       the contents of the base image, which may not be zeroes.  */
+    if (bs->backing_hd) {
+        return 0;
+    }
     if (bs->drv->bdrv_has_zero_init) {
         return bs->drv->bdrv_has_zero_init(bs);
     }
@@ -3013,15 +3044,15 @@ int bdrv_has_zero_init(BlockDriverState *bs)
     return 0;
 }
 
-typedef struct BdrvCoIsAllocatedData {
+typedef struct BdrvCoGetBlockStatusData {
     BlockDriverState *bs;
     BlockDriverState *base;
     int64_t sector_num;
     int nb_sectors;
     int *pnum;
-    int ret;
+    int64_t ret;
     bool done;
-} BdrvCoIsAllocatedData;
+} BdrvCoGetBlockStatusData;
 
 /*
  * Returns true iff the specified sector is present in the disk image. Drivers
@@ -3038,12 +3069,20 @@ typedef struct BdrvCoIsAllocatedData {
  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
  * beyond the end of the disk image it will be clamped.
  */
-int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
-                                      int nb_sectors, int *pnum)
+static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
+                                                     int64_t sector_num,
+                                                     int nb_sectors, int *pnum)
 {
+    int64_t length;
     int64_t n;
+    int64_t ret, ret2;
 
-    if (sector_num >= bs->total_sectors) {
+    length = bdrv_getlength(bs);
+    if (length < 0) {
+        return length;
+    }
+
+    if (sector_num >= (length >> BDRV_SECTOR_BITS)) {
         *pnum = 0;
         return 0;
     }
@@ -3053,35 +3092,69 @@ int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
         nb_sectors = n;
     }
 
-    if (!bs->drv->bdrv_co_is_allocated) {
+    if (!bs->drv->bdrv_co_get_block_status) {
         *pnum = nb_sectors;
-        return 1;
+        ret = BDRV_BLOCK_DATA;
+        if (bs->drv->protocol_name) {
+            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
+        }
+        return ret;
     }
 
-    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
+    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (!(ret & BDRV_BLOCK_DATA)) {
+        if (bdrv_has_zero_init(bs)) {
+            ret |= BDRV_BLOCK_ZERO;
+        } else {
+            BlockDriverState *bs2 = bs->backing_hd;
+            int64_t length2 = bdrv_getlength(bs2);
+            if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) {
+                ret |= BDRV_BLOCK_ZERO;
+            }
+        }
+    }
+
+    if (bs->file &&
+        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
+        (ret & BDRV_BLOCK_OFFSET_VALID)) {
+        ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS,
+                                        *pnum, pnum);
+        if (ret2 >= 0) {
+            /* Ignore errors.  This is just providing extra information, it
+             * is useful but not necessary.
+             */
+            ret |= (ret2 & BDRV_BLOCK_ZERO);
+        }
+    }
+
+    return ret;
 }
 
-/* Coroutine wrapper for bdrv_is_allocated() */
-static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
+/* Coroutine wrapper for bdrv_get_block_status() */
+static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque)
 {
-    BdrvCoIsAllocatedData *data = opaque;
+    BdrvCoGetBlockStatusData *data = opaque;
     BlockDriverState *bs = data->bs;
 
-    data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
-                                     data->pnum);
+    data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors,
+                                         data->pnum);
     data->done = true;
 }
 
 /*
- * Synchronous wrapper around bdrv_co_is_allocated().
+ * Synchronous wrapper around bdrv_co_get_block_status().
  *
- * See bdrv_co_is_allocated() for details.
+ * See bdrv_co_get_block_status() for details.
  */
-int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-                      int *pnum)
+int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
+                              int nb_sectors, int *pnum)
 {
     Coroutine *co;
-    BdrvCoIsAllocatedData data = {
+    BdrvCoGetBlockStatusData data = {
         .bs = bs,
         .sector_num = sector_num,
         .nb_sectors = nb_sectors,
@@ -3089,14 +3162,31 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
         .done = false,
     };
 
-    co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
-    qemu_coroutine_enter(co, &data);
-    while (!data.done) {
-        qemu_aio_wait();
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        bdrv_get_block_status_co_entry(&data);
+    } else {
+        co = qemu_coroutine_create(bdrv_get_block_status_co_entry);
+        qemu_coroutine_enter(co, &data);
+        while (!data.done) {
+            qemu_aio_wait();
+        }
     }
     return data.ret;
 }
 
+int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                                   int nb_sectors, int *pnum)
+{
+    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum);
+    if (ret < 0) {
+        return ret;
+    }
+    return
+        (ret & BDRV_BLOCK_DATA) ||
+        ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs));
+}
+
 /*
  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
  *
@@ -3109,10 +3199,10 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
  *  allocated/unallocated state.
  *
  */
-int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
-                                            BlockDriverState *base,
-                                            int64_t sector_num,
-                                            int nb_sectors, int *pnum)
+int bdrv_is_allocated_above(BlockDriverState *top,
+                            BlockDriverState *base,
+                            int64_t sector_num,
+                            int nb_sectors, int *pnum)
 {
     BlockDriverState *intermediate;
     int ret, n = nb_sectors;
@@ -3120,8 +3210,8 @@ int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
     intermediate = top;
     while (intermediate && intermediate != base) {
         int pnum_inter;
-        ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors,
-                                   &pnum_inter);
+        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
+                                &pnum_inter);
         if (ret < 0) {
             return ret;
         } else if (ret) {
@@ -3148,44 +3238,6 @@ int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
     return 0;
 }
 
-/* Coroutine wrapper for bdrv_is_allocated_above() */
-static void coroutine_fn bdrv_is_allocated_above_co_entry(void *opaque)
-{
-    BdrvCoIsAllocatedData *data = opaque;
-    BlockDriverState *top = data->bs;
-    BlockDriverState *base = data->base;
-
-    data->ret = bdrv_co_is_allocated_above(top, base, data->sector_num,
-                                           data->nb_sectors, data->pnum);
-    data->done = true;
-}
-
-/*
- * Synchronous wrapper around bdrv_co_is_allocated_above().
- *
- * See bdrv_co_is_allocated_above() for details.
- */
-int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
-                            int64_t sector_num, int nb_sectors, int *pnum)
-{
-    Coroutine *co;
-    BdrvCoIsAllocatedData data = {
-        .bs = top,
-        .base = base,
-        .sector_num = sector_num,
-        .nb_sectors = nb_sectors,
-        .pnum = pnum,
-        .done = false,
-    };
-
-    co = qemu_coroutine_create(bdrv_is_allocated_above_co_entry);
-    qemu_coroutine_enter(co, &data);
-    while (!data.done) {
-        qemu_aio_wait();
-    }
-    return data.ret;
-}
-
 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
 {
     if (bs->backing_hd && bs->backing_hd->encrypted)
@@ -3622,169 +3674,6 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
     acb->aiocb_info->cancel(acb);
 }
 
-/* block I/O throttling */
-static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
-                 bool is_write, double elapsed_time, uint64_t *wait)
-{
-    uint64_t bps_limit = 0;
-    uint64_t extension;
-    double   bytes_limit, bytes_base, bytes_res;
-    double   slice_time, wait_time;
-
-    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
-        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
-    } else if (bs->io_limits.bps[is_write]) {
-        bps_limit = bs->io_limits.bps[is_write];
-    } else {
-        if (wait) {
-            *wait = 0;
-        }
-
-        return false;
-    }
-
-    slice_time = bs->slice_end - bs->slice_start;
-    slice_time /= (NANOSECONDS_PER_SECOND);
-    bytes_limit = bps_limit * slice_time;
-    bytes_base  = bs->slice_submitted.bytes[is_write];
-    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
-        bytes_base += bs->slice_submitted.bytes[!is_write];
-    }
-
-    /* bytes_base: the bytes of data which have been read/written; and
-     *             it is obtained from the history statistic info.
-     * bytes_res: the remaining bytes of data which need to be read/written.
-     * (bytes_base + bytes_res) / bps_limit: used to calcuate
-     *             the total time for completing reading/writting all data.
-     */
-    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
-
-    if (bytes_base + bytes_res <= bytes_limit) {
-        if (wait) {
-            *wait = 0;
-        }
-
-        return false;
-    }
-
-    /* Calc approx time to dispatch */
-    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
-
-    /* When the I/O rate at runtime exceeds the limits,
-     * bs->slice_end need to be extended in order that the current statistic
-     * info can be kept until the timer fire, so it is increased and tuned
-     * based on the result of experiment.
-     */
-    extension = wait_time * NANOSECONDS_PER_SECOND;
-    extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) *
-                BLOCK_IO_SLICE_TIME;
-    bs->slice_end += extension;
-    if (wait) {
-        *wait = wait_time * NANOSECONDS_PER_SECOND;
-    }
-
-    return true;
-}
-
-static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
-                             double elapsed_time, uint64_t *wait)
-{
-    uint64_t iops_limit = 0;
-    double   ios_limit, ios_base;
-    double   slice_time, wait_time;
-
-    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
-        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
-    } else if (bs->io_limits.iops[is_write]) {
-        iops_limit = bs->io_limits.iops[is_write];
-    } else {
-        if (wait) {
-            *wait = 0;
-        }
-
-        return false;
-    }
-
-    slice_time = bs->slice_end - bs->slice_start;
-    slice_time /= (NANOSECONDS_PER_SECOND);
-    ios_limit  = iops_limit * slice_time;
-    ios_base   = bs->slice_submitted.ios[is_write];
-    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
-        ios_base += bs->slice_submitted.ios[!is_write];
-    }
-
-    if (ios_base + 1 <= ios_limit) {
-        if (wait) {
-            *wait = 0;
-        }
-
-        return false;
-    }
-
-    /* Calc approx time to dispatch, in seconds */
-    wait_time = (ios_base + 1) / iops_limit;
-    if (wait_time > elapsed_time) {
-        wait_time = wait_time - elapsed_time;
-    } else {
-        wait_time = 0;
-    }
-
-    /* Exceeded current slice, extend it by another slice time */
-    bs->slice_end += BLOCK_IO_SLICE_TIME;
-    if (wait) {
-        *wait = wait_time * NANOSECONDS_PER_SECOND;
-    }
-
-    return true;
-}
-
-static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
-                           bool is_write, int64_t *wait)
-{
-    int64_t  now, max_wait;
-    uint64_t bps_wait = 0, iops_wait = 0;
-    double   elapsed_time;
-    int      bps_ret, iops_ret;
-
-    now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
-    if (now > bs->slice_end) {
-        bs->slice_start = now;
-        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
-        memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
-    }
-
-    elapsed_time  = now - bs->slice_start;
-    elapsed_time  /= (NANOSECONDS_PER_SECOND);
-
-    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
-                                      is_write, elapsed_time, &bps_wait);
-    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
-                                      elapsed_time, &iops_wait);
-    if (bps_ret || iops_ret) {
-        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
-        if (wait) {
-            *wait = max_wait;
-        }
-
-        now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
-        if (bs->slice_end < now + max_wait) {
-            bs->slice_end = now + max_wait;
-        }
-
-        return true;
-    }
-
-    if (wait) {
-        *wait = 0;
-    }
-
-    bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors *
-                                           BDRV_SECTOR_SIZE;
-    bs->slice_submitted.ios[is_write]++;
-
-    return false;
-}
-
 /**************************************************************/
 /* async block device emulation */
 
@@ -4445,6 +4334,23 @@ int64_t bdrv_get_dirty_count(BlockDriverState *bs)
     }
 }
 
+/* Get a reference to bs */
+void bdrv_ref(BlockDriverState *bs)
+{
+    bs->refcnt++;
+}
+
+/* Release a previously grabbed reference to bs.
+ * If after releasing, reference count is zero, the BlockDriverState is
+ * deleted. */
+void bdrv_unref(BlockDriverState *bs)
+{
+    assert(bs->refcnt > 0);
+    if (--bs->refcnt == 0) {
+        bdrv_delete(bs);
+    }
+}
+
 void bdrv_set_in_use(BlockDriverState *bs, int in_use)
 {
     assert(bs->in_use != in_use);
@@ -4658,7 +4564,7 @@ out:
     free_option_parameters(param);
 
     if (bs) {
-        bdrv_delete(bs);
+        bdrv_unref(bs);
     }
 }
 
diff --git a/block/backup.c b/block/backup.c
index 23c7264488..04c4b5c263 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -289,14 +289,14 @@ static void coroutine_fn backup_run(void *opaque)
                  * backing file. */
 
                 for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) {
-                    /* bdrv_co_is_allocated() only returns true/false based
+                    /* bdrv_is_allocated() only returns true/false based
                      * on the first set of sectors it comes across that
                      * are are all in the same state.
                      * For that reason we must verify each sector in the
                      * backup cluster length.  We end up copying more than
                      * needed but at some point that is always the case. */
                     alloced =
-                        bdrv_co_is_allocated(bs,
+                        bdrv_is_allocated(bs,
                                 start * BACKUP_SECTORS_PER_CLUSTER + i,
                                 BACKUP_SECTORS_PER_CLUSTER - i, &n);
                     i += n;
@@ -338,7 +338,7 @@ static void coroutine_fn backup_run(void *opaque)
     hbitmap_free(job->bitmap);
 
     bdrv_iostatus_disable(target);
-    bdrv_delete(target);
+    bdrv_unref(target);
 
     block_job_completed(&job->common, ret);
 }
diff --git a/block/blkverify.c b/block/blkverify.c
index 1d58cc3932..c4e961eeb1 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -155,7 +155,7 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags)
     s->test_file = bdrv_new("");
     ret = bdrv_open(s->test_file, filename, NULL, flags, NULL);
     if (ret < 0) {
-        bdrv_delete(s->test_file);
+        bdrv_unref(s->test_file);
         s->test_file = NULL;
         goto fail;
     }
@@ -169,7 +169,7 @@ static void blkverify_close(BlockDriverState *bs)
 {
     BDRVBlkverifyState *s = bs->opaque;
 
-    bdrv_delete(s->test_file);
+    bdrv_unref(s->test_file);
     s->test_file = NULL;
 }
 
diff --git a/block/commit.c b/block/commit.c
index 51a1ab3678..ac4b7ccbc9 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -108,9 +108,9 @@ wait:
             break;
         }
         /* Copy if allocated above the base */
-        ret = bdrv_co_is_allocated_above(top, base, sector_num,
-                                         COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
-                                         &n);
+        ret = bdrv_is_allocated_above(top, base, sector_num,
+                                      COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
+                                      &n);
         copy = (ret == 1);
         trace_commit_one_iteration(s, sector_num, n, ret);
         if (copy) {
diff --git a/block/cow.c b/block/cow.c
index 1cc2e89c7c..764b93fae0 100644
--- a/block/cow.c
+++ b/block/cow.c
@@ -106,7 +106,7 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags)
  * XXX(hch): right now these functions are extremely inefficient.
  * We should just read the whole bitmap we'll need in one go instead.
  */
-static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum)
+static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum, bool *first)
 {
     uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8;
     uint8_t bitmap;
@@ -117,27 +117,52 @@ static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum)
        return ret;
     }
 
+    if (bitmap & (1 << (bitnum % 8))) {
+        return 0;
+    }
+
+    if (*first) {
+        ret = bdrv_flush(bs->file);
+        if (ret < 0) {
+            return ret;
+        }
+        *first = false;
+    }
+
     bitmap |= (1 << (bitnum % 8));
 
-    ret = bdrv_pwrite_sync(bs->file, offset, &bitmap, sizeof(bitmap));
+    ret = bdrv_pwrite(bs->file, offset, &bitmap, sizeof(bitmap));
     if (ret < 0) {
        return ret;
     }
     return 0;
 }
 
-static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
+#define BITS_PER_BITMAP_SECTOR (512 * 8)
+
+/* Cannot use bitmap.c on big-endian machines.  */
+static int cow_test_bit(int64_t bitnum, const uint8_t *bitmap)
 {
-    uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8;
-    uint8_t bitmap;
-    int ret;
+    return (bitmap[bitnum / 8] & (1 << (bitnum & 7))) != 0;
+}
 
-    ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
-    if (ret < 0) {
-       return ret;
+static int cow_find_streak(const uint8_t *bitmap, int value, int start, int nb_sectors)
+{
+    int streak_value = value ? 0xFF : 0;
+    int last = MIN(start + nb_sectors, BITS_PER_BITMAP_SECTOR);
+    int bitnum = start;
+    while (bitnum < last) {
+        if ((bitnum & 7) == 0 && bitmap[bitnum / 8] == streak_value) {
+            bitnum += 8;
+            continue;
+        }
+        if (cow_test_bit(bitnum, bitmap) == value) {
+            bitnum++;
+            continue;
+        }
+        break;
     }
-
-    return !!(bitmap & (1 << (bitnum % 8)));
+    return MIN(bitnum, last) - start;
 }
 
 /* Return true if first block has been changed (ie. current version is
@@ -146,34 +171,44 @@ static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum)
 static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *num_same)
 {
+    int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8;
+    uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE;
+    uint8_t bitmap[BDRV_SECTOR_SIZE];
+    int ret;
     int changed;
 
-    if (nb_sectors == 0) {
-	*num_same = nb_sectors;
-	return 0;
-    }
-
-    changed = is_bit_set(bs, sector_num);
-    if (changed < 0) {
-        return 0; /* XXX: how to return I/O errors? */
-    }
-
-    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
-	if (is_bit_set(bs, sector_num + *num_same) != changed)
-	    break;
+    ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap));
+    if (ret < 0) {
+        return ret;
     }
 
+    bitnum &= BITS_PER_BITMAP_SECTOR - 1;
+    changed = cow_test_bit(bitnum, bitmap);
+    *num_same = cow_find_streak(bitmap, changed, bitnum, nb_sectors);
     return changed;
 }
 
+static int64_t coroutine_fn cow_co_get_block_status(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int *num_same)
+{
+    BDRVCowState *s = bs->opaque;
+    int ret = cow_co_is_allocated(bs, sector_num, nb_sectors, num_same);
+    int64_t offset = s->cow_sectors_offset + (sector_num << BDRV_SECTOR_BITS);
+    if (ret < 0) {
+        return ret;
+    }
+    return (ret ? BDRV_BLOCK_DATA : 0) | offset | BDRV_BLOCK_OFFSET_VALID;
+}
+
 static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num,
         int nb_sectors)
 {
     int error = 0;
     int i;
+    bool first = true;
 
     for (i = 0; i < nb_sectors; i++) {
-        error = cow_set_bit(bs, sector_num + i);
+        error = cow_set_bit(bs, sector_num + i, &first);
         if (error) {
             break;
         }
@@ -189,7 +224,11 @@ static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num,
     int ret, n;
 
     while (nb_sectors > 0) {
-        if (bdrv_co_is_allocated(bs, sector_num, nb_sectors, &n)) {
+        ret = cow_co_is_allocated(bs, sector_num, nb_sectors, &n);
+        if (ret < 0) {
+            return ret;
+        }
+        if (ret) {
             ret = bdrv_pread(bs->file,
                         s->cow_sectors_offset + sector_num * 512,
                         buf, n * 512);
@@ -314,7 +353,7 @@ static int cow_create(const char *filename, QEMUOptionParameter *options)
     }
 
 exit:
-    bdrv_delete(cow_bs);
+    bdrv_unref(cow_bs);
     return ret;
 }
 
@@ -344,7 +383,7 @@ static BlockDriver bdrv_cow = {
 
     .bdrv_read              = cow_co_read,
     .bdrv_write             = cow_co_write,
-    .bdrv_co_is_allocated   = cow_co_is_allocated,
+    .bdrv_co_get_block_status   = cow_co_get_block_status,
 
     .create_options = cow_create_options,
 };
diff --git a/block/iscsi.c b/block/iscsi.c
index 2bbee1f6e5..813abd8fef 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1241,11 +1241,11 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options)
 {
     int ret = 0;
     int64_t total_size = 0;
-    BlockDriverState bs;
+    BlockDriverState *bs;
     IscsiLun *iscsilun = NULL;
     QDict *bs_options;
 
-    memset(&bs, 0, sizeof(BlockDriverState));
+    bs = bdrv_new("");
 
     /* Read out options */
     while (options && options->name) {
@@ -1255,12 +1255,12 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options)
         options++;
     }
 
-    bs.opaque = g_malloc0(sizeof(struct IscsiLun));
-    iscsilun = bs.opaque;
+    bs->opaque = g_malloc0(sizeof(struct IscsiLun));
+    iscsilun = bs->opaque;
 
     bs_options = qdict_new();
     qdict_put(bs_options, "filename", qstring_from_str(filename));
-    ret = iscsi_open(&bs, bs_options, 0);
+    ret = iscsi_open(bs, bs_options, 0);
     QDECREF(bs_options);
 
     if (ret != 0) {
@@ -1274,7 +1274,7 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options)
         ret = -ENODEV;
         goto out;
     }
-    if (bs.total_sectors < total_size) {
+    if (bs->total_sectors < total_size) {
         ret = -ENOSPC;
         goto out;
     }
@@ -1284,7 +1284,9 @@ out:
     if (iscsilun->iscsi != NULL) {
         iscsi_destroy_context(iscsilun->iscsi);
     }
-    g_free(bs.opaque);
+    g_free(bs->opaque);
+    bs->opaque = NULL;
+    bdrv_unref(bs);
     return ret;
 }
 
diff --git a/block/mirror.c b/block/mirror.c
index 86de4582b4..f61a7799de 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -338,8 +338,8 @@ static void coroutine_fn mirror_run(void *opaque)
         base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
         for (sector_num = 0; sector_num < end; ) {
             int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1;
-            ret = bdrv_co_is_allocated_above(bs, base,
-                                             sector_num, next - sector_num, &n);
+            ret = bdrv_is_allocated_above(bs, base,
+                                          sector_num, next - sector_num, &n);
 
             if (ret < 0) {
                 goto immediate_exit;
@@ -480,7 +480,7 @@ immediate_exit:
         bdrv_swap(s->target, s->common.bs);
     }
     bdrv_close(s->target);
-    bdrv_delete(s->target);
+    bdrv_unref(s->target);
     block_job_completed(&s->common, ret);
 }
 
diff --git a/block/qapi.c b/block/qapi.c
index a4bc4113b7..782051c65d 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -223,18 +223,44 @@ void bdrv_query_info(BlockDriverState *bs,
         info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);
 
         if (bs->io_limits_enabled) {
-            info->inserted->bps =
-                           bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
-            info->inserted->bps_rd =
-                           bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
-            info->inserted->bps_wr =
-                           bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
-            info->inserted->iops =
-                           bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
-            info->inserted->iops_rd =
-                           bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
-            info->inserted->iops_wr =
-                           bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
+            ThrottleConfig cfg;
+            throttle_get_config(&bs->throttle_state, &cfg);
+            info->inserted->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
+            info->inserted->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].avg;
+            info->inserted->bps_wr  = cfg.buckets[THROTTLE_BPS_WRITE].avg;
+
+            info->inserted->iops    = cfg.buckets[THROTTLE_OPS_TOTAL].avg;
+            info->inserted->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg;
+            info->inserted->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg;
+
+            info->inserted->has_bps_max     =
+                cfg.buckets[THROTTLE_BPS_TOTAL].max;
+            info->inserted->bps_max         =
+                cfg.buckets[THROTTLE_BPS_TOTAL].max;
+            info->inserted->has_bps_rd_max  =
+                cfg.buckets[THROTTLE_BPS_READ].max;
+            info->inserted->bps_rd_max      =
+                cfg.buckets[THROTTLE_BPS_READ].max;
+            info->inserted->has_bps_wr_max  =
+                cfg.buckets[THROTTLE_BPS_WRITE].max;
+            info->inserted->bps_wr_max      =
+                cfg.buckets[THROTTLE_BPS_WRITE].max;
+
+            info->inserted->has_iops_max    =
+                cfg.buckets[THROTTLE_OPS_TOTAL].max;
+            info->inserted->iops_max        =
+                cfg.buckets[THROTTLE_OPS_TOTAL].max;
+            info->inserted->has_iops_rd_max =
+                cfg.buckets[THROTTLE_OPS_READ].max;
+            info->inserted->iops_rd_max     =
+                cfg.buckets[THROTTLE_OPS_READ].max;
+            info->inserted->has_iops_wr_max =
+                cfg.buckets[THROTTLE_OPS_WRITE].max;
+            info->inserted->iops_wr_max     =
+                cfg.buckets[THROTTLE_OPS_WRITE].max;
+
+            info->inserted->has_iops_size = cfg.op_size;
+            info->inserted->iops_size = cfg.op_size;
         }
 
         bs0 = bs;
diff --git a/block/qcow.c b/block/qcow.c
index 5239bd68f1..93a993bb44 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -395,7 +395,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
     return cluster_offset;
 }
 
-static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs,
+static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum)
 {
     BDRVQcowState *s = bs->opaque;
@@ -410,7 +410,14 @@ static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs,
     if (n > nb_sectors)
         n = nb_sectors;
     *pnum = n;
-    return (cluster_offset != 0);
+    if (!cluster_offset) {
+        return 0;
+    }
+    if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypt_method) {
+        return BDRV_BLOCK_DATA;
+    }
+    cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
+    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset;
 }
 
 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
@@ -751,7 +758,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options)
     g_free(tmp);
     ret = 0;
 exit:
-    bdrv_delete(qcow_bs);
+    bdrv_unref(qcow_bs);
     return ret;
 }
 
@@ -896,7 +903,7 @@ static BlockDriver bdrv_qcow = {
 
     .bdrv_co_readv          = qcow_co_readv,
     .bdrv_co_writev         = qcow_co_writev,
-    .bdrv_co_is_allocated   = qcow_co_is_allocated,
+    .bdrv_co_get_block_status   = qcow_co_get_block_status,
 
     .bdrv_set_key           = qcow_set_key,
     .bdrv_make_empty        = qcow_make_empty,
diff --git a/block/qcow2.c b/block/qcow2.c
index 4bc679a155..578792f0a3 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -688,24 +688,34 @@ static int qcow2_reopen_prepare(BDRVReopenState *state,
     return 0;
 }
 
-static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
+static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum)
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t cluster_offset;
-    int ret;
+    int index_in_cluster, ret;
+    int64_t status = 0;
 
     *pnum = nb_sectors;
-    /* FIXME We can get errors here, but the bdrv_co_is_allocated interface
-     * can't pass them on today */
     qemu_co_mutex_lock(&s->lock);
     ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset);
     qemu_co_mutex_unlock(&s->lock);
     if (ret < 0) {
-        *pnum = 0;
+        return ret;
     }
 
-    return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO);
+    if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
+        !s->crypt_method) {
+        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+        cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
+        status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset;
+    }
+    if (ret == QCOW2_CLUSTER_ZERO) {
+        status |= BDRV_BLOCK_ZERO;
+    } else if (ret != QCOW2_CLUSTER_UNALLOCATED) {
+        status |= BDRV_BLOCK_DATA;
+    }
+    return status;
 }
 
 /* handle reading after the end of the backing file */
@@ -1452,7 +1462,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
 
     ret = 0;
 out:
-    bdrv_delete(bs);
+    bdrv_unref(bs);
     return ret;
 }
 
@@ -1868,7 +1878,7 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_reopen_prepare  = qcow2_reopen_prepare,
     .bdrv_create        = qcow2_create,
     .bdrv_has_zero_init = bdrv_has_zero_init_1,
-    .bdrv_co_is_allocated = qcow2_co_is_allocated,
+    .bdrv_co_get_block_status = qcow2_co_get_block_status,
     .bdrv_set_key       = qcow2_set_key,
     .bdrv_make_empty    = qcow2_make_empty,
 
diff --git a/block/qed.c b/block/qed.c
index cc904c4834..49b3a37ed5 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -599,7 +599,7 @@ static int qed_create(const char *filename, uint32_t cluster_size,
     ret = 0; /* success */
 out:
     g_free(l1_table);
-    bdrv_delete(bs);
+    bdrv_unref(bs);
     return ret;
 }
 
@@ -652,45 +652,66 @@ static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options)
 }
 
 typedef struct {
+    BlockDriverState *bs;
     Coroutine *co;
-    int is_allocated;
+    uint64_t pos;
+    int64_t status;
     int *pnum;
 } QEDIsAllocatedCB;
 
 static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
 {
     QEDIsAllocatedCB *cb = opaque;
+    BDRVQEDState *s = cb->bs->opaque;
     *cb->pnum = len / BDRV_SECTOR_SIZE;
-    cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO);
+    switch (ret) {
+    case QED_CLUSTER_FOUND:
+        offset |= qed_offset_into_cluster(s, cb->pos);
+        cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
+        break;
+    case QED_CLUSTER_ZERO:
+        cb->status = BDRV_BLOCK_ZERO;
+        break;
+    case QED_CLUSTER_L2:
+    case QED_CLUSTER_L1:
+        cb->status = 0;
+        break;
+    default:
+        assert(ret < 0);
+        cb->status = ret;
+        break;
+    }
+
     if (cb->co) {
         qemu_coroutine_enter(cb->co, NULL);
     }
 }
 
-static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs,
+static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
                                                  int64_t sector_num,
                                                  int nb_sectors, int *pnum)
 {
     BDRVQEDState *s = bs->opaque;
-    uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
     size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
     QEDIsAllocatedCB cb = {
-        .is_allocated = -1,
+        .bs = bs,
+        .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE,
+        .status = BDRV_BLOCK_OFFSET_MASK,
         .pnum = pnum,
     };
     QEDRequest request = { .l2_table = NULL };
 
-    qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);
+    qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb);
 
     /* Now sleep if the callback wasn't invoked immediately */
-    while (cb.is_allocated == -1) {
+    while (cb.status == BDRV_BLOCK_OFFSET_MASK) {
         cb.co = qemu_coroutine_self();
         qemu_coroutine_yield();
     }
 
     qed_unref_l2_cache_entry(request.l2_table);
 
-    return cb.is_allocated;
+    return cb.status;
 }
 
 static int bdrv_qed_make_empty(BlockDriverState *bs)
@@ -1575,7 +1596,7 @@ static BlockDriver bdrv_qed = {
     .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
     .bdrv_create              = bdrv_qed_create,
     .bdrv_has_zero_init       = bdrv_has_zero_init_1,
-    .bdrv_co_is_allocated     = bdrv_qed_co_is_allocated,
+    .bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
     .bdrv_make_empty          = bdrv_qed_make_empty,
     .bdrv_aio_readv           = bdrv_qed_aio_readv,
     .bdrv_aio_writev          = bdrv_qed_aio_writev,
diff --git a/block/raw-posix.c b/block/raw-posix.c
index ba721d3f5b..1b41ea3356 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1084,12 +1084,12 @@ static int raw_create(const char *filename, QEMUOptionParameter *options)
  * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
  * beyond the end of the disk image it will be clamped.
  */
-static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
+static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
                                             int64_t sector_num,
                                             int nb_sectors, int *pnum)
 {
     off_t start, data, hole;
-    int ret;
+    int64_t ret;
 
     ret = fd_open(bs);
     if (ret < 0) {
@@ -1097,6 +1097,7 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
     }
 
     start = sector_num * BDRV_SECTOR_SIZE;
+    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
 
 #ifdef CONFIG_FIEMAP
 
@@ -1114,7 +1115,7 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
     if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
         /* Assume everything is allocated.  */
         *pnum = nb_sectors;
-        return 1;
+        return ret;
     }
 
     if (f.fm.fm_mapped_extents == 0) {
@@ -1127,6 +1128,9 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
     } else {
         data = f.fe.fe_logical;
         hole = f.fe.fe_logical + f.fe.fe_length;
+        if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) {
+            ret |= BDRV_BLOCK_ZERO;
+        }
     }
 
 #elif defined SEEK_HOLE && defined SEEK_DATA
@@ -1141,7 +1145,7 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
 
         /* Most likely EINVAL.  Assume everything is allocated.  */
         *pnum = nb_sectors;
-        return 1;
+        return ret;
     }
 
     if (hole > start) {
@@ -1154,19 +1158,21 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
         }
     }
 #else
-    *pnum = nb_sectors;
-    return 1;
+    data = 0;
+    hole = start + nb_sectors * BDRV_SECTOR_SIZE;
 #endif
 
     if (data <= start) {
         /* On a data extent, compute sectors to the end of the extent.  */
         *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE);
-        return 1;
     } else {
         /* On a hole, compute sectors to the beginning of the next extent.  */
         *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
-        return 0;
+        ret &= ~BDRV_BLOCK_DATA;
+        ret |= BDRV_BLOCK_ZERO;
     }
+
+    return ret;
 }
 
 static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs,
@@ -1200,7 +1206,7 @@ static BlockDriver bdrv_file = {
     .bdrv_close = raw_close,
     .bdrv_create = raw_create,
     .bdrv_has_zero_init = bdrv_has_zero_init_1,
-    .bdrv_co_is_allocated = raw_co_is_allocated,
+    .bdrv_co_get_block_status = raw_co_get_block_status,
 
     .bdrv_aio_readv = raw_aio_readv,
     .bdrv_aio_writev = raw_aio_writev,
diff --git a/block/raw-win32.c b/block/raw-win32.c
index 9b5b2af4e8..d2d2d9f4d4 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -535,13 +535,29 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags)
 {
     BDRVRawState *s = bs->opaque;
     int access_flags, create_flags;
+    int ret = 0;
     DWORD overlapped;
     char device_name[64];
-    const char *filename = qdict_get_str(options, "filename");
+
+    Error *local_err = NULL;
+    const char *filename;
+
+    QemuOpts *opts = qemu_opts_create_nofail(&raw_runtime_opts);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (error_is_set(&local_err)) {
+        qerror_report_err(local_err);
+        error_free(local_err);
+        ret = -EINVAL;
+        goto done;
+    }
+
+    filename = qemu_opt_get(opts, "filename");
 
     if (strstart(filename, "/dev/cdrom", NULL)) {
-        if (find_cdrom(device_name, sizeof(device_name)) < 0)
-            return -ENOENT;
+        if (find_cdrom(device_name, sizeof(device_name)) < 0) {
+            ret = -ENOENT;
+            goto done;
+        }
         filename = device_name;
     } else {
         /* transform drive letters into device name */
@@ -564,11 +580,17 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags)
     if (s->hfile == INVALID_HANDLE_VALUE) {
         int err = GetLastError();
 
-        if (err == ERROR_ACCESS_DENIED)
-            return -EACCES;
-        return -1;
+        if (err == ERROR_ACCESS_DENIED) {
+            ret = -EACCES;
+        } else {
+            ret = -1;
+        }
+        goto done;
     }
-    return 0;
+
+done:
+    qemu_opts_del(opts);
+    return ret;
 }
 
 static BlockDriver bdrv_host_device = {
diff --git a/block/raw_bsd.c b/block/raw_bsd.c
index ab2b0fd7d2..a9060caec4 100644
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -58,11 +58,11 @@ static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num,
     return bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov);
 }
 
-static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs,
-                                            int64_t sector_num, int nb_sectors,
-                                            int *pnum)
+static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
+                                            int64_t sector_num,
+                                            int nb_sectors, int *pnum)
 {
-    return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum);
+    return bdrv_get_block_status(bs->file, sector_num, nb_sectors, pnum);
 }
 
 static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs,
@@ -164,7 +164,7 @@ static BlockDriver bdrv_raw = {
     .bdrv_co_writev       = &raw_co_writev,
     .bdrv_co_write_zeroes = &raw_co_write_zeroes,
     .bdrv_co_discard      = &raw_co_discard,
-    .bdrv_co_is_allocated = &raw_co_is_allocated,
+    .bdrv_co_get_block_status = &raw_co_get_block_status,
     .bdrv_truncate        = &raw_truncate,
     .bdrv_getlength       = &raw_getlength,
     .bdrv_get_info        = &raw_get_info,
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 1ad4d070e7..f9988d35ba 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -1430,7 +1430,7 @@ static int sd_prealloc(const char *filename)
     }
 out:
     if (bs) {
-        bdrv_delete(bs);
+        bdrv_unref(bs);
     }
     g_free(buf);
 
@@ -1509,13 +1509,13 @@ static int sd_create(const char *filename, QEMUOptionParameter *options)
 
         if (!is_snapshot(&s->inode)) {
             error_report("cannot clone from a non snapshot vdi");
-            bdrv_delete(bs);
+            bdrv_unref(bs);
             ret = -EINVAL;
             goto out;
         }
 
         base_vid = s->inode.vdi_id;
-        bdrv_delete(bs);
+        bdrv_unref(bs);
     }
 
     ret = do_sd_create(s, vdi, vdi_size, base_vid, &vid, 0);
@@ -2270,9 +2270,9 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
     return acb->ret;
 }
 
-static coroutine_fn int
-sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-                   int *pnum)
+static coroutine_fn int64_t
+sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
+                       int *pnum)
 {
     BDRVSheepdogState *s = bs->opaque;
     SheepdogInode *inode = &s->inode;
@@ -2280,7 +2280,7 @@ sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                   end = DIV_ROUND_UP((sector_num + nb_sectors) *
                                      BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE);
     unsigned long idx;
-    int ret = 1;
+    int64_t ret = BDRV_BLOCK_DATA;
 
     for (idx = start; idx < end; idx++) {
         if (inode->data_vdi_id[idx] == 0) {
@@ -2338,7 +2338,7 @@ static BlockDriver bdrv_sheepdog = {
     .bdrv_co_writev = sd_co_writev,
     .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
     .bdrv_co_discard = sd_co_discard,
-    .bdrv_co_is_allocated = sd_co_is_allocated,
+    .bdrv_co_get_block_status = sd_co_get_block_status,
 
     .bdrv_snapshot_create   = sd_snapshot_create,
     .bdrv_snapshot_goto     = sd_snapshot_goto,
@@ -2366,7 +2366,7 @@ static BlockDriver bdrv_sheepdog_tcp = {
     .bdrv_co_writev = sd_co_writev,
     .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
     .bdrv_co_discard = sd_co_discard,
-    .bdrv_co_is_allocated = sd_co_is_allocated,
+    .bdrv_co_get_block_status = sd_co_get_block_status,
 
     .bdrv_snapshot_create   = sd_snapshot_create,
     .bdrv_snapshot_goto     = sd_snapshot_goto,
@@ -2394,7 +2394,7 @@ static BlockDriver bdrv_sheepdog_unix = {
     .bdrv_co_writev = sd_co_writev,
     .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
     .bdrv_co_discard = sd_co_discard,
-    .bdrv_co_is_allocated = sd_co_is_allocated,
+    .bdrv_co_get_block_status = sd_co_get_block_status,
 
     .bdrv_snapshot_create   = sd_snapshot_create,
     .bdrv_snapshot_goto     = sd_snapshot_goto,
diff --git a/block/snapshot.c b/block/snapshot.c
index 6c6d9deea1..8f61cc0745 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -99,7 +99,7 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
         ret = bdrv_snapshot_goto(bs->file, snapshot_id);
         open_ret = drv->bdrv_open(bs, NULL, bs->open_flags);
         if (open_ret < 0) {
-            bdrv_delete(bs->file);
+            bdrv_unref(bs->file);
             bs->drv = NULL;
             return open_ret;
         }
diff --git a/block/stream.c b/block/stream.c
index 99821252b1..078ce4aa6a 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -73,7 +73,7 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base,
         unused = intermediate;
         intermediate = intermediate->backing_hd;
         unused->backing_hd = NULL;
-        bdrv_delete(unused);
+        bdrv_unref(unused);
     }
 }
 
@@ -119,16 +119,16 @@ wait:
             break;
         }
 
-        ret = bdrv_co_is_allocated(bs, sector_num,
-                                   STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
+        ret = bdrv_is_allocated(bs, sector_num,
+                                STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
         if (ret == 1) {
             /* Allocated in the top, no need to copy.  */
             copy = false;
-        } else {
+        } else if (ret >= 0) {
             /* Copy if allocated in the intermediate images.  Limit to the
              * known-unallocated area [sector_num, sector_num+n).  */
-            ret = bdrv_co_is_allocated_above(bs->backing_hd, base,
-                                             sector_num, n, &n);
+            ret = bdrv_is_allocated_above(bs->backing_hd, base,
+                                          sector_num, n, &n);
 
             /* Finish early if end of backing file has been reached */
             if (ret == 0 && n == 0) {
diff --git a/block/vdi.c b/block/vdi.c
index 8a915257e8..1bf7dc575a 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -470,7 +470,7 @@ static int vdi_reopen_prepare(BDRVReopenState *state,
     return 0;
 }
 
-static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs,
+static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum)
 {
     /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */
@@ -479,12 +479,23 @@ static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs,
     size_t sector_in_block = sector_num % s->block_sectors;
     int n_sectors = s->block_sectors - sector_in_block;
     uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]);
+    uint64_t offset;
+    int result;
+
     logout("%p, %" PRId64 ", %d, %p\n", bs, sector_num, nb_sectors, pnum);
     if (n_sectors > nb_sectors) {
         n_sectors = nb_sectors;
     }
     *pnum = n_sectors;
-    return VDI_IS_ALLOCATED(bmap_entry);
+    result = VDI_IS_ALLOCATED(bmap_entry);
+    if (!result) {
+        return 0;
+    }
+
+    offset = s->header.offset_data +
+                              (uint64_t)bmap_entry * s->block_size +
+                              sector_in_block * SECTOR_SIZE;
+    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
 }
 
 static int vdi_co_read(BlockDriverState *bs,
@@ -780,7 +791,7 @@ static BlockDriver bdrv_vdi = {
     .bdrv_reopen_prepare = vdi_reopen_prepare,
     .bdrv_create = vdi_create,
     .bdrv_has_zero_init = bdrv_has_zero_init_1,
-    .bdrv_co_is_allocated = vdi_co_is_allocated,
+    .bdrv_co_get_block_status = vdi_co_get_block_status,
     .bdrv_make_empty = vdi_make_empty,
 
     .bdrv_read = vdi_co_read,
diff --git a/block/vmdk.c b/block/vmdk.c
index 63b489d29e..fb5b5297ce 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -216,7 +216,7 @@ static void vmdk_free_extents(BlockDriverState *bs)
         g_free(e->l2_cache);
         g_free(e->l1_backup_table);
         if (e->file != bs->file) {
-            bdrv_delete(e->file);
+            bdrv_unref(e->file);
         }
     }
     g_free(s->extents);
@@ -746,7 +746,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
             /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/
             ret = vmdk_open_sparse(bs, extent_file, bs->open_flags);
             if (ret) {
-                bdrv_delete(extent_file);
+                bdrv_unref(extent_file);
                 return ret;
             }
         } else {
@@ -1042,7 +1042,7 @@ static VmdkExtent *find_extent(BDRVVmdkState *s,
     return NULL;
 }
 
-static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs,
+static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum)
 {
     BDRVVmdkState *s = bs->opaque;
@@ -1059,7 +1059,24 @@ static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs,
                             sector_num * 512, 0, &offset);
     qemu_co_mutex_unlock(&s->lock);
 
-    ret = (ret == VMDK_OK || ret == VMDK_ZEROED);
+    switch (ret) {
+    case VMDK_ERROR:
+        ret = -EIO;
+        break;
+    case VMDK_UNALLOC:
+        ret = 0;
+        break;
+    case VMDK_ZEROED:
+        ret = BDRV_BLOCK_ZERO;
+        break;
+    case VMDK_OK:
+        ret = BDRV_BLOCK_DATA;
+        if (extent->file == bs->file) {
+            ret |= BDRV_BLOCK_OFFSET_VALID | offset;
+        }
+
+        break;
+    }
 
     index_in_cluster = sector_num % extent->cluster_sectors;
     n = extent->cluster_sectors - index_in_cluster;
@@ -1636,15 +1653,15 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
         BlockDriverState *bs = bdrv_new("");
         ret = bdrv_open(bs, backing_file, NULL, 0, NULL);
         if (ret != 0) {
-            bdrv_delete(bs);
+            bdrv_unref(bs);
             return ret;
         }
         if (strcmp(bs->drv->format_name, "vmdk")) {
-            bdrv_delete(bs);
+            bdrv_unref(bs);
             return -EINVAL;
         }
         parent_cid = vmdk_read_cid(bs, 0);
-        bdrv_delete(bs);
+        bdrv_unref(bs);
         snprintf(parent_desc_line, sizeof(parent_desc_line),
                 "parentFileNameHint=\"%s\"", backing_file);
     }
@@ -1837,7 +1854,7 @@ static BlockDriver bdrv_vmdk = {
     .bdrv_close                   = vmdk_close,
     .bdrv_create                  = vmdk_create,
     .bdrv_co_flush_to_disk        = vmdk_co_flush,
-    .bdrv_co_is_allocated         = vmdk_co_is_allocated,
+    .bdrv_co_get_block_status     = vmdk_co_get_block_status,
     .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
     .bdrv_has_zero_init           = vmdk_has_zero_init,
 
diff --git a/block/vvfat.c b/block/vvfat.c
index cd3b8edd9f..0129195e29 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2874,16 +2874,17 @@ static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num,
     return ret;
 }
 
-static int coroutine_fn vvfat_co_is_allocated(BlockDriverState *bs,
+static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs,
 	int64_t sector_num, int nb_sectors, int* n)
 {
     BDRVVVFATState* s = bs->opaque;
     *n = s->sector_count - sector_num;
-    if (*n > nb_sectors)
-	*n = nb_sectors;
-    else if (*n < 0)
-	return 0;
-    return 1;
+    if (*n > nb_sectors) {
+        *n = nb_sectors;
+    } else if (*n < 0) {
+        return 0;
+    }
+    return BDRV_BLOCK_DATA;
 }
 
 static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
@@ -2894,7 +2895,7 @@ static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
 
 static void write_target_close(BlockDriverState *bs) {
     BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
-    bdrv_delete(s->qcow);
+    bdrv_unref(s->qcow);
     g_free(s->qcow_filename);
 }
 
@@ -2935,7 +2936,7 @@ static int enable_write_target(BDRVVVFATState *s)
     ret = bdrv_open(s->qcow, s->qcow_filename, NULL,
             BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow);
     if (ret < 0) {
-        bdrv_delete(s->qcow);
+        bdrv_unref(s->qcow);
         goto err;
     }
 
@@ -2943,7 +2944,7 @@ static int enable_write_target(BDRVVVFATState *s)
     unlink(s->qcow_filename);
 #endif
 
-    s->bs->backing_hd = calloc(sizeof(BlockDriverState), 1);
+    s->bs->backing_hd = bdrv_new("");
     s->bs->backing_hd->drv = &vvfat_write_target;
     s->bs->backing_hd->opaque = g_malloc(sizeof(void*));
     *(void**)s->bs->backing_hd->opaque = s;
@@ -2984,7 +2985,7 @@ static BlockDriver bdrv_vvfat = {
 
     .bdrv_read              = vvfat_co_read,
     .bdrv_write             = vvfat_co_write,
-    .bdrv_co_is_allocated   = vvfat_co_is_allocated,
+    .bdrv_co_get_block_status = vvfat_co_get_block_status,
 };
 
 static void bdrv_vvfat_init(void)
diff --git a/blockdev-nbd.c b/blockdev-nbd.c
index 95f10c81e3..922cf5657b 100644
--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -69,12 +69,6 @@ static void nbd_close_notifier(Notifier *n, void *data)
     g_free(cn);
 }
 
-static void nbd_server_put_ref(NBDExport *exp)
-{
-    BlockDriverState *bs = nbd_export_get_blockdev(exp);
-    drive_put_ref(drive_get_by_blockdev(bs));
-}
-
 void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,
                         Error **errp)
 {
@@ -105,11 +99,9 @@ void qmp_nbd_server_add(const char *device, bool has_writable, bool writable,
         writable = false;
     }
 
-    exp = nbd_export_new(bs, 0, -1, writable ? 0 : NBD_FLAG_READ_ONLY,
-                         nbd_server_put_ref);
+    exp = nbd_export_new(bs, 0, -1, writable ? 0 : NBD_FLAG_READ_ONLY, NULL);
 
     nbd_export_set_name(exp, device);
-    drive_get_ref(drive_get_by_blockdev(bs));
 
     n = g_malloc0(sizeof(NBDCloseNotifier));
     n->n.notify = nbd_close_notifier;
diff --git a/blockdev.c b/blockdev.c
index e70e16e4de..07dac05a2c 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -212,7 +212,7 @@ static void bdrv_format_print(void *opaque, const char *name)
 static void drive_uninit(DriveInfo *dinfo)
 {
     qemu_opts_del(dinfo->opts);
-    bdrv_delete(dinfo->bdrv);
+    bdrv_unref(dinfo->bdrv);
     g_free(dinfo->id);
     QTAILQ_REMOVE(&drives, dinfo, next);
     g_free(dinfo->serial);
@@ -234,32 +234,32 @@ void drive_get_ref(DriveInfo *dinfo)
 
 typedef struct {
     QEMUBH *bh;
-    DriveInfo *dinfo;
-} DrivePutRefBH;
+    BlockDriverState *bs;
+} BDRVPutRefBH;
 
-static void drive_put_ref_bh(void *opaque)
+static void bdrv_put_ref_bh(void *opaque)
 {
-    DrivePutRefBH *s = opaque;
+    BDRVPutRefBH *s = opaque;
 
-    drive_put_ref(s->dinfo);
+    bdrv_unref(s->bs);
     qemu_bh_delete(s->bh);
     g_free(s);
 }
 
 /*
- * Release a drive reference in a BH
+ * Release a BDS reference in a BH
  *
- * It is not possible to use drive_put_ref() from a callback function when the
- * callers still need the drive.  In such cases we schedule a BH to release the
- * reference.
+ * It is not safe to use bdrv_unref() from a callback function when the callers
+ * still need the BlockDriverState.  In such cases we schedule a BH to release
+ * the reference.
  */
-static void drive_put_ref_bh_schedule(DriveInfo *dinfo)
+static void bdrv_put_ref_bh_schedule(BlockDriverState *bs)
 {
-    DrivePutRefBH *s;
+    BDRVPutRefBH *s;
 
-    s = g_new(DrivePutRefBH, 1);
-    s->bh = qemu_bh_new(drive_put_ref_bh, s);
-    s->dinfo = dinfo;
+    s = g_new(BDRVPutRefBH, 1);
+    s->bh = qemu_bh_new(bdrv_put_ref_bh, s);
+    s->bs = bs;
     qemu_bh_schedule(s->bh);
 }
 
@@ -280,32 +280,16 @@ static int parse_block_error_action(const char *buf, bool is_read)
     }
 }
 
-static bool do_check_io_limits(BlockIOLimit *io_limits, Error **errp)
+static bool check_throttle_config(ThrottleConfig *cfg, Error **errp)
 {
-    bool bps_flag;
-    bool iops_flag;
-
-    assert(io_limits);
-
-    bps_flag  = (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] != 0)
-                 && ((io_limits->bps[BLOCK_IO_LIMIT_READ] != 0)
-                 || (io_limits->bps[BLOCK_IO_LIMIT_WRITE] != 0));
-    iops_flag = (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] != 0)
-                 && ((io_limits->iops[BLOCK_IO_LIMIT_READ] != 0)
-                 || (io_limits->iops[BLOCK_IO_LIMIT_WRITE] != 0));
-    if (bps_flag || iops_flag) {
-        error_setg(errp, "bps(iops) and bps_rd/bps_wr(iops_rd/iops_wr) "
-                         "cannot be used at the same time");
+    if (throttle_conflicting(cfg)) {
+        error_setg(errp, "bps/iops/max total values and read/write values"
+                         " cannot be used at the same time");
         return false;
     }
 
-    if (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] < 0 ||
-        io_limits->bps[BLOCK_IO_LIMIT_WRITE] < 0 ||
-        io_limits->bps[BLOCK_IO_LIMIT_READ] < 0 ||
-        io_limits->iops[BLOCK_IO_LIMIT_TOTAL] < 0 ||
-        io_limits->iops[BLOCK_IO_LIMIT_WRITE] < 0 ||
-        io_limits->iops[BLOCK_IO_LIMIT_READ] < 0) {
-        error_setg(errp, "bps and iops values must be 0 or greater");
+    if (!throttle_is_valid(cfg)) {
+        error_setg(errp, "bps/iops/maxs values must be 0 or greater");
         return false;
     }
 
@@ -330,7 +314,7 @@ static DriveInfo *blockdev_init(QemuOpts *all_opts,
     int on_read_error, on_write_error;
     const char *devaddr;
     DriveInfo *dinfo;
-    BlockIOLimit io_limits;
+    ThrottleConfig cfg;
     int snapshot = 0;
     bool copy_on_read;
     int ret;
@@ -496,20 +480,36 @@ static DriveInfo *blockdev_init(QemuOpts *all_opts,
     }
 
     /* disk I/O throttling */
-    io_limits.bps[BLOCK_IO_LIMIT_TOTAL]  =
+    memset(&cfg, 0, sizeof(cfg));
+    cfg.buckets[THROTTLE_BPS_TOTAL].avg =
         qemu_opt_get_number(opts, "throttling.bps-total", 0);
-    io_limits.bps[BLOCK_IO_LIMIT_READ]   =
+    cfg.buckets[THROTTLE_BPS_READ].avg  =
         qemu_opt_get_number(opts, "throttling.bps-read", 0);
-    io_limits.bps[BLOCK_IO_LIMIT_WRITE]  =
+    cfg.buckets[THROTTLE_BPS_WRITE].avg =
         qemu_opt_get_number(opts, "throttling.bps-write", 0);
-    io_limits.iops[BLOCK_IO_LIMIT_TOTAL] =
+    cfg.buckets[THROTTLE_OPS_TOTAL].avg =
         qemu_opt_get_number(opts, "throttling.iops-total", 0);
-    io_limits.iops[BLOCK_IO_LIMIT_READ]  =
+    cfg.buckets[THROTTLE_OPS_READ].avg =
         qemu_opt_get_number(opts, "throttling.iops-read", 0);
-    io_limits.iops[BLOCK_IO_LIMIT_WRITE] =
+    cfg.buckets[THROTTLE_OPS_WRITE].avg =
         qemu_opt_get_number(opts, "throttling.iops-write", 0);
 
-    if (!do_check_io_limits(&io_limits, &error)) {
+    cfg.buckets[THROTTLE_BPS_TOTAL].max =
+        qemu_opt_get_number(opts, "throttling.bps-total-max", 0);
+    cfg.buckets[THROTTLE_BPS_READ].max  =
+        qemu_opt_get_number(opts, "throttling.bps-read-max", 0);
+    cfg.buckets[THROTTLE_BPS_WRITE].max =
+        qemu_opt_get_number(opts, "throttling.bps-write-max", 0);
+    cfg.buckets[THROTTLE_OPS_TOTAL].max =
+        qemu_opt_get_number(opts, "throttling.iops-total-max", 0);
+    cfg.buckets[THROTTLE_OPS_READ].max =
+        qemu_opt_get_number(opts, "throttling.iops-read-max", 0);
+    cfg.buckets[THROTTLE_OPS_WRITE].max =
+        qemu_opt_get_number(opts, "throttling.iops-write-max", 0);
+
+    cfg.op_size = qemu_opt_get_number(opts, "throttling.iops-size", 0);
+
+    if (!check_throttle_config(&cfg, &error)) {
         error_report("%s", error_get_pretty(error));
         error_free(error);
         return NULL;
@@ -636,7 +636,10 @@ static DriveInfo *blockdev_init(QemuOpts *all_opts,
     bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
 
     /* disk I/O throttling */
-    bdrv_set_io_limits(dinfo->bdrv, &io_limits);
+    if (throttle_enabled(&cfg)) {
+        bdrv_io_limits_enable(dinfo->bdrv);
+        bdrv_set_io_limits(dinfo->bdrv, &cfg);
+    }
 
     switch(type) {
     case IF_IDE:
@@ -732,7 +735,7 @@ static DriveInfo *blockdev_init(QemuOpts *all_opts,
 err:
     qemu_opts_del(opts);
     QDECREF(bs_opts);
-    bdrv_delete(dinfo->bdrv);
+    bdrv_unref(dinfo->bdrv);
     g_free(dinfo->id);
     QTAILQ_REMOVE(&drives, dinfo, next);
     g_free(dinfo);
@@ -763,6 +766,17 @@ DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type)
     qemu_opt_rename(all_opts, "bps_rd", "throttling.bps-read");
     qemu_opt_rename(all_opts, "bps_wr", "throttling.bps-write");
 
+    qemu_opt_rename(all_opts, "iops_max", "throttling.iops-total-max");
+    qemu_opt_rename(all_opts, "iops_rd_max", "throttling.iops-read-max");
+    qemu_opt_rename(all_opts, "iops_wr_max", "throttling.iops-write-max");
+
+    qemu_opt_rename(all_opts, "bps_max", "throttling.bps-total-max");
+    qemu_opt_rename(all_opts, "bps_rd_max", "throttling.bps-read-max");
+    qemu_opt_rename(all_opts, "bps_wr_max", "throttling.bps-write-max");
+
+    qemu_opt_rename(all_opts,
+                    "iops_size", "throttling.iops-size");
+
     qemu_opt_rename(all_opts, "readonly", "read-only");
 
     value = qemu_opt_get(all_opts, "cache");
@@ -982,7 +996,7 @@ static void external_snapshot_abort(BlkTransactionState *common)
     ExternalSnapshotState *state =
                              DO_UPCAST(ExternalSnapshotState, common, common);
     if (state->new_bs) {
-        bdrv_delete(state->new_bs);
+        bdrv_unref(state->new_bs);
     }
 }
 
@@ -1247,10 +1261,26 @@ void qmp_change_blockdev(const char *device, const char *filename,
 
 /* throttling disk I/O limits */
 void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
-                               int64_t bps_wr, int64_t iops, int64_t iops_rd,
-                               int64_t iops_wr, Error **errp)
+                               int64_t bps_wr,
+                               int64_t iops,
+                               int64_t iops_rd,
+                               int64_t iops_wr,
+                               bool has_bps_max,
+                               int64_t bps_max,
+                               bool has_bps_rd_max,
+                               int64_t bps_rd_max,
+                               bool has_bps_wr_max,
+                               int64_t bps_wr_max,
+                               bool has_iops_max,
+                               int64_t iops_max,
+                               bool has_iops_rd_max,
+                               int64_t iops_rd_max,
+                               bool has_iops_wr_max,
+                               int64_t iops_wr_max,
+                               bool has_iops_size,
+                               int64_t iops_size, Error **errp)
 {
-    BlockIOLimit io_limits;
+    ThrottleConfig cfg;
     BlockDriverState *bs;
 
     bs = bdrv_find(device);
@@ -1259,27 +1289,50 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
         return;
     }
 
-    io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = bps;
-    io_limits.bps[BLOCK_IO_LIMIT_READ]  = bps_rd;
-    io_limits.bps[BLOCK_IO_LIMIT_WRITE] = bps_wr;
-    io_limits.iops[BLOCK_IO_LIMIT_TOTAL]= iops;
-    io_limits.iops[BLOCK_IO_LIMIT_READ] = iops_rd;
-    io_limits.iops[BLOCK_IO_LIMIT_WRITE]= iops_wr;
+    memset(&cfg, 0, sizeof(cfg));
+    cfg.buckets[THROTTLE_BPS_TOTAL].avg = bps;
+    cfg.buckets[THROTTLE_BPS_READ].avg  = bps_rd;
+    cfg.buckets[THROTTLE_BPS_WRITE].avg = bps_wr;
 
-    if (!do_check_io_limits(&io_limits, errp)) {
+    cfg.buckets[THROTTLE_OPS_TOTAL].avg = iops;
+    cfg.buckets[THROTTLE_OPS_READ].avg  = iops_rd;
+    cfg.buckets[THROTTLE_OPS_WRITE].avg = iops_wr;
+
+    if (has_bps_max) {
+        cfg.buckets[THROTTLE_BPS_TOTAL].max = bps_max;
+    }
+    if (has_bps_rd_max) {
+        cfg.buckets[THROTTLE_BPS_READ].max = bps_rd_max;
+    }
+    if (has_bps_wr_max) {
+        cfg.buckets[THROTTLE_BPS_WRITE].max = bps_wr_max;
+    }
+    if (has_iops_max) {
+        cfg.buckets[THROTTLE_OPS_TOTAL].max = iops_max;
+    }
+    if (has_iops_rd_max) {
+        cfg.buckets[THROTTLE_OPS_READ].max = iops_rd_max;
+    }
+    if (has_iops_wr_max) {
+        cfg.buckets[THROTTLE_OPS_WRITE].max = iops_wr_max;
+    }
+
+    if (has_iops_size) {
+        cfg.op_size = iops_size;
+    }
+
+    if (!check_throttle_config(&cfg, errp)) {
         return;
     }
 
-    bs->io_limits = io_limits;
-
-    if (!bs->io_limits_enabled && bdrv_io_limits_enabled(bs)) {
+    if (!bs->io_limits_enabled && throttle_enabled(&cfg)) {
         bdrv_io_limits_enable(bs);
-    } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) {
+    } else if (bs->io_limits_enabled && !throttle_enabled(&cfg)) {
         bdrv_io_limits_disable(bs);
-    } else {
-        if (bs->block_timer) {
-            timer_mod(bs->block_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL));
-        }
+    }
+
+    if (bs->io_limits_enabled) {
+        bdrv_set_io_limits(bs, &cfg);
     }
 }
 
@@ -1383,7 +1436,7 @@ static void block_job_cb(void *opaque, int ret)
     }
     qobject_decref(obj);
 
-    drive_put_ref_bh_schedule(drive_get_by_blockdev(bs));
+    bdrv_put_ref_bh_schedule(bs);
 }
 
 void qmp_block_stream(const char *device, bool has_base,
@@ -1420,11 +1473,6 @@ void qmp_block_stream(const char *device, bool has_base,
         return;
     }
 
-    /* Grab a reference so hotplug does not delete the BlockDriverState from
-     * underneath us.
-     */
-    drive_get_ref(drive_get_by_blockdev(bs));
-
     trace_qmp_block_stream(bs, bs->job);
 }
 
@@ -1481,10 +1529,6 @@ void qmp_block_commit(const char *device,
         error_propagate(errp, local_err);
         return;
     }
-    /* Grab a reference so hotplug does not delete the BlockDriverState from
-     * underneath us.
-     */
-    drive_get_ref(drive_get_by_blockdev(bs));
 }
 
 void qmp_drive_backup(const char *device, const char *target,
@@ -1585,7 +1629,7 @@ void qmp_drive_backup(const char *device, const char *target,
     target_bs = bdrv_new("");
     ret = bdrv_open(target_bs, target, NULL, flags, drv);
     if (ret < 0) {
-        bdrv_delete(target_bs);
+        bdrv_unref(target_bs);
         error_setg_file_open(errp, -ret, target);
         return;
     }
@@ -1593,15 +1637,10 @@ void qmp_drive_backup(const char *device, const char *target,
     backup_start(bs, target_bs, speed, sync, on_source_error, on_target_error,
                  block_job_cb, bs, &local_err);
     if (local_err != NULL) {
-        bdrv_delete(target_bs);
+        bdrv_unref(target_bs);
         error_propagate(errp, local_err);
         return;
     }
-
-    /* Grab a reference so hotplug does not delete the BlockDriverState from
-     * underneath us.
-     */
-    drive_get_ref(drive_get_by_blockdev(bs));
 }
 
 #define DEFAULT_MIRROR_BUF_SIZE   (10 << 20)
@@ -1725,7 +1764,7 @@ void qmp_drive_mirror(const char *device, const char *target,
     target_bs = bdrv_new("");
     ret = bdrv_open(target_bs, target, NULL, flags | BDRV_O_NO_BACKING, drv);
     if (ret < 0) {
-        bdrv_delete(target_bs);
+        bdrv_unref(target_bs);
         error_setg_file_open(errp, -ret, target);
         return;
     }
@@ -1734,15 +1773,10 @@ void qmp_drive_mirror(const char *device, const char *target,
                  on_source_error, on_target_error,
                  block_job_cb, bs, &local_err);
     if (local_err != NULL) {
-        bdrv_delete(target_bs);
+        bdrv_unref(target_bs);
         error_propagate(errp, local_err);
         return;
     }
-
-    /* Grab a reference so hotplug does not delete the BlockDriverState from
-     * underneath us.
-     */
-    drive_get_ref(drive_get_by_blockdev(bs));
 }
 
 static BlockJob *find_block_job(const char *device)
@@ -1967,6 +2001,34 @@ QemuOptsList qemu_common_drive_opts = {
             .name = "throttling.bps-write",
             .type = QEMU_OPT_NUMBER,
             .help = "limit write bytes per second",
+        },{
+            .name = "throttling.iops-total-max",
+            .type = QEMU_OPT_NUMBER,
+            .help = "I/O operations burst",
+        },{
+            .name = "throttling.iops-read-max",
+            .type = QEMU_OPT_NUMBER,
+            .help = "I/O operations read burst",
+        },{
+            .name = "throttling.iops-write-max",
+            .type = QEMU_OPT_NUMBER,
+            .help = "I/O operations write burst",
+        },{
+            .name = "throttling.bps-total-max",
+            .type = QEMU_OPT_NUMBER,
+            .help = "total bytes burst",
+        },{
+            .name = "throttling.bps-read-max",
+            .type = QEMU_OPT_NUMBER,
+            .help = "total bytes read burst",
+        },{
+            .name = "throttling.bps-write-max",
+            .type = QEMU_OPT_NUMBER,
+            .help = "total bytes write burst",
+        },{
+            .name = "throttling.iops-size",
+            .type = QEMU_OPT_NUMBER,
+            .help = "when limiting by iops max size of an I/O in bytes",
         },{
             .name = "copy-on-read",
             .type = QEMU_OPT_BOOL,
diff --git a/blockjob.c b/blockjob.c
index 7edc945119..e7d49b7169 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -45,6 +45,7 @@ void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
         error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs));
         return NULL;
     }
+    bdrv_ref(bs);
     bdrv_set_in_use(bs, 1);
 
     job = g_malloc0(job_type->instance_size);
diff --git a/hmp.c b/hmp.c
index fcca6aea8f..2bd31d1c69 100644
--- a/hmp.c
+++ b/hmp.c
@@ -344,14 +344,30 @@ void hmp_info_block(Monitor *mon, const QDict *qdict)
         {
             monitor_printf(mon, "    I/O throttling:   bps=%" PRId64
                             " bps_rd=%" PRId64  " bps_wr=%" PRId64
+                            " bps_max=%" PRId64
+                            " bps_rd_max=%" PRId64
+                            " bps_wr_max=%" PRId64
                             " iops=%" PRId64 " iops_rd=%" PRId64
-                            " iops_wr=%" PRId64 "\n",
+                            " iops_wr=%" PRId64
+                            " iops_max=%" PRId64
+                            " iops_rd_max=%" PRId64
+                            " iops_wr_max=%" PRId64
+                            " iops_size=%" PRId64 "\n",
                             info->value->inserted->bps,
                             info->value->inserted->bps_rd,
                             info->value->inserted->bps_wr,
+                            info->value->inserted->bps_max,
+                            info->value->inserted->bps_rd_max,
+                            info->value->inserted->bps_wr_max,
                             info->value->inserted->iops,
                             info->value->inserted->iops_rd,
-                            info->value->inserted->iops_wr);
+                            info->value->inserted->iops_wr,
+                            info->value->inserted->iops_max,
+                            info->value->inserted->iops_rd_max,
+                            info->value->inserted->iops_wr_max,
+                            info->value->inserted->iops_size);
+        } else {
+            monitor_printf(mon, " [not inserted]");
         }
 
         if (verbose) {
@@ -1098,7 +1114,21 @@ void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict)
                               qdict_get_int(qdict, "bps_wr"),
                               qdict_get_int(qdict, "iops"),
                               qdict_get_int(qdict, "iops_rd"),
-                              qdict_get_int(qdict, "iops_wr"), &err);
+                              qdict_get_int(qdict, "iops_wr"),
+                              false, /* no burst max via HMP */
+                              0,
+                              false,
+                              0,
+                              false,
+                              0,
+                              false,
+                              0,
+                              false,
+                              0,
+                              false,
+                              0,
+                              false, /* No default I/O size */
+                              0, &err);
     hmp_handle_error(mon, &err);
 }
 
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 5a96ccd416..f2d7350a50 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -42,6 +42,7 @@ typedef struct {
 
 struct VirtIOBlockDataPlane {
     bool started;
+    bool starting;
     bool stopping;
     QEMUBH *start_bh;
     QemuThread thread;
@@ -451,8 +452,15 @@ void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
         return;
     }
 
+    if (s->starting) {
+        return;
+    }
+
+    s->starting = true;
+
     vq = virtio_get_queue(s->vdev, 0);
     if (!vring_setup(&s->vring, s->vdev, 0)) {
+        s->starting = false;
         return;
     }
 
@@ -482,6 +490,7 @@ void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
     s->io_notifier = *ioq_get_notifier(&s->ioqueue);
     aio_set_event_notifier(s->ctx, &s->io_notifier, handle_io);
 
+    s->starting = false;
     s->started = true;
     trace_virtio_blk_data_plane_start(s);
 
diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 727f4331c0..668cc069ff 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -813,7 +813,7 @@ static int blk_connect(struct XenDevice *xendev)
                                                            readonly);
             if (bdrv_open(blkdev->bs,
                           blkdev->filename, NULL, qflags, drv) != 0) {
-                bdrv_delete(blkdev->bs);
+                bdrv_unref(blkdev->bs);
                 blkdev->bs = NULL;
             }
         }
@@ -824,6 +824,9 @@ static int blk_connect(struct XenDevice *xendev)
         /* setup via qemu cmdline -> already setup for us */
         xen_be_printf(&blkdev->xendev, 2, "get configured bdrv (cmdline setup)\n");
         blkdev->bs = blkdev->dinfo->bdrv;
+        /* blkdev->bs is not create by us, we get a reference
+         * so we can bdrv_unref() unconditionally */
+        bdrv_ref(blkdev->bs);
     }
     bdrv_attach_dev_nofail(blkdev->bs, blkdev);
     blkdev->file_size = bdrv_getlength(blkdev->bs);
@@ -922,12 +925,8 @@ static void blk_disconnect(struct XenDevice *xendev)
     struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
 
     if (blkdev->bs) {
-        if (!blkdev->dinfo) {
-            /* close/delete only if we created it ourself */
-            bdrv_close(blkdev->bs);
-            bdrv_detach_dev(blkdev->bs, blkdev);
-            bdrv_delete(blkdev->bs);
-        }
+        bdrv_detach_dev(blkdev->bs, blkdev);
+        bdrv_unref(blkdev->bs);
         blkdev->bs = NULL;
     }
     xen_be_unbind_evtchn(&blkdev->xendev);
diff --git a/include/block/block.h b/include/block/block.h
index e6b391ce88..728ec1aebf 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -81,6 +81,32 @@ typedef struct BlockDevOps {
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
 #define BDRV_SECTOR_MASK   ~(BDRV_SECTOR_SIZE - 1)
 
+/* BDRV_BLOCK_DATA: data is read from bs->file or another file
+ * BDRV_BLOCK_ZERO: sectors read as zero
+ * BDRV_BLOCK_OFFSET_VALID: sector stored in bs->file as raw data
+ *
+ * If BDRV_BLOCK_OFFSET_VALID is set, bits 9-62 represent the offset in
+ * bs->file where sector data can be read from as raw data.
+ *
+ * DATA == 0 && ZERO == 0 means that data is read from backing_hd if present.
+ *
+ * DATA ZERO OFFSET_VALID
+ *  t    t        t       sectors read as zero, bs->file is zero at offset
+ *  t    f        t       sectors read as valid from bs->file at offset
+ *  f    t        t       sectors preallocated, read as zero, bs->file not
+ *                        necessarily zero at offset
+ *  f    f        t       sectors preallocated but read from backing_hd,
+ *                        bs->file contains garbage at offset
+ *  t    t        f       sectors preallocated, read as zero, unknown offset
+ *  t    f        f       sectors read from unknown file or offset
+ *  f    t        f       not allocated or unknown offset, read as zero
+ *  f    f        f       not allocated or unknown offset, read from backing_hd
+ */
+#define BDRV_BLOCK_DATA         1
+#define BDRV_BLOCK_ZERO         2
+#define BDRV_BLOCK_OFFSET_VALID 4
+#define BDRV_BLOCK_OFFSET_MASK  BDRV_SECTOR_MASK
+
 typedef enum {
     BDRV_ACTION_REPORT, BDRV_ACTION_IGNORE, BDRV_ACTION_STOP
 } BlockErrorAction;
@@ -107,7 +133,6 @@ void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 /* disk I/O throttling */
 void bdrv_io_limits_enable(BlockDriverState *bs);
 void bdrv_io_limits_disable(BlockDriverState *bs);
-bool bdrv_io_limits_enabled(BlockDriverState *bs);
 
 void bdrv_init(void);
 void bdrv_init_with_whitelist(void);
@@ -123,7 +148,6 @@ BlockDriverState *bdrv_new(const char *device_name);
 void bdrv_make_anon(BlockDriverState *bs);
 void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old);
 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top);
-void bdrv_delete(BlockDriverState *bs);
 int bdrv_parse_cache_flags(const char *mode, int *flags);
 int bdrv_parse_discard_flags(const char *mode, int *flags);
 int bdrv_file_open(BlockDriverState **pbs, const char *filename,
@@ -181,12 +205,6 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
  */
 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors);
-int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
-    int nb_sectors, int *pnum);
-int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top,
-                                            BlockDriverState *base,
-                                            int64_t sector_num,
-                                            int nb_sectors, int *pnum);
 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
     const char *backing_file);
 int bdrv_get_backing_file_depth(BlockDriverState *bs);
@@ -277,6 +295,8 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
 int bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
 int bdrv_has_zero_init_1(BlockDriverState *bs);
 int bdrv_has_zero_init(BlockDriverState *bs);
+int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
+                              int nb_sectors, int *pnum);
 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                       int *pnum);
 int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
@@ -356,6 +376,8 @@ int64_t bdrv_get_dirty_count(BlockDriverState *bs);
 void bdrv_enable_copy_on_read(BlockDriverState *bs);
 void bdrv_disable_copy_on_read(BlockDriverState *bs);
 
+void bdrv_ref(BlockDriverState *bs);
+void bdrv_unref(BlockDriverState *bs);
 void bdrv_set_in_use(BlockDriverState *bs, int in_use);
 int bdrv_in_use(BlockDriverState *bs);
 
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 8012e253c9..7c35198ad7 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -35,18 +35,12 @@
 #include "qemu/hbitmap.h"
 #include "block/snapshot.h"
 #include "qemu/main-loop.h"
+#include "qemu/throttle.h"
 
 #define BLOCK_FLAG_ENCRYPT          1
 #define BLOCK_FLAG_COMPAT6          4
 #define BLOCK_FLAG_LAZY_REFCOUNTS   8
 
-#define BLOCK_IO_LIMIT_READ     0
-#define BLOCK_IO_LIMIT_WRITE    1
-#define BLOCK_IO_LIMIT_TOTAL    2
-
-#define BLOCK_IO_SLICE_TIME     100000000
-#define NANOSECONDS_PER_SECOND  1000000000.0
-
 #define BLOCK_OPT_SIZE              "size"
 #define BLOCK_OPT_ENCRYPT           "encryption"
 #define BLOCK_OPT_COMPAT6           "compat6"
@@ -70,17 +64,6 @@ typedef struct BdrvTrackedRequest {
     CoQueue wait_queue; /* coroutines blocked on this request */
 } BdrvTrackedRequest;
 
-
-typedef struct BlockIOLimit {
-    int64_t bps[3];
-    int64_t iops[3];
-} BlockIOLimit;
-
-typedef struct BlockIOBaseValue {
-    uint64_t bytes[2];
-    uint64_t ios[2];
-} BlockIOBaseValue;
-
 struct BlockDriver {
     const char *format_name;
     int instance_size;
@@ -135,7 +118,7 @@ struct BlockDriver {
         int64_t sector_num, int nb_sectors);
     int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors);
-    int coroutine_fn (*bdrv_co_is_allocated)(BlockDriverState *bs,
+    int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum);
 
     /*
@@ -264,13 +247,9 @@ struct BlockDriverState {
     /* number of in-flight copy-on-read requests */
     unsigned int copy_on_read_in_flight;
 
-    /* the time for latest disk I/O */
-    int64_t slice_start;
-    int64_t slice_end;
-    BlockIOLimit io_limits;
-    BlockIOBaseValue slice_submitted;
-    CoQueue      throttled_reqs;
-    QEMUTimer    *block_timer;
+    /* I/O throttling */
+    ThrottleState throttle_state;
+    CoQueue      throttled_reqs[2];
     bool         io_limits_enabled;
 
     /* I/O stats (display with "info blockstats"). */
@@ -298,6 +277,7 @@ struct BlockDriverState {
     BlockDeviceIoStatus iostatus;
     char device_name[32];
     HBitmap *dirty_bitmap;
+    int refcnt;
     int in_use; /* users other than guest access, eg. block migration */
     QTAILQ_ENTRY(BlockDriverState) list;
 
@@ -312,7 +292,8 @@ struct BlockDriverState {
 int get_tmp_filename(char *filename, int size);
 
 void bdrv_set_io_limits(BlockDriverState *bs,
-                        BlockIOLimit *io_limits);
+                        ThrottleConfig *cfg);
+
 
 /**
  * bdrv_add_before_write_notifier:
diff --git a/include/qemu/throttle.h b/include/qemu/throttle.h
new file mode 100644
index 0000000000..ab29b0b918
--- /dev/null
+++ b/include/qemu/throttle.h
@@ -0,0 +1,110 @@
+/*
+ * QEMU throttling infrastructure
+ *
+ * Copyright (C) Nodalink, SARL. 2013
+ *
+ * Author:
+ *   Benoît Canet <benoit.canet@irqsave.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef THROTTLE_H
+#define THROTTLE_H
+
+#include <stdint.h>
+#include "qemu-common.h"
+#include "qemu/timer.h"
+
+#define NANOSECONDS_PER_SECOND  1000000000.0
+
+typedef enum {
+    THROTTLE_BPS_TOTAL,
+    THROTTLE_BPS_READ,
+    THROTTLE_BPS_WRITE,
+    THROTTLE_OPS_TOTAL,
+    THROTTLE_OPS_READ,
+    THROTTLE_OPS_WRITE,
+    BUCKETS_COUNT,
+} BucketType;
+
+/*
+ * The max parameter of the leaky bucket throttling algorithm can be used to
+ * allow the guest to do bursts.
+ * The max value is a pool of I/O that the guest can use without being throttled
+ * at all. Throttling is triggered once this pool is empty.
+ */
+
+typedef struct LeakyBucket {
+    double  avg;              /* average goal in units per second */
+    double  max;              /* leaky bucket max burst in units */
+    double  level;            /* bucket level in units */
+} LeakyBucket;
+
+/* The following structure is used to configure a ThrottleState
+ * It contains a bit of state: the bucket field of the LeakyBucket structure.
+ * However it allows to keep the code clean and the bucket field is reset to
+ * zero at the right time.
+ */
+typedef struct ThrottleConfig {
+    LeakyBucket buckets[BUCKETS_COUNT]; /* leaky buckets */
+    uint64_t op_size;         /* size of an operation in bytes */
+} ThrottleConfig;
+
+typedef struct ThrottleState {
+    ThrottleConfig cfg;       /* configuration */
+    int64_t previous_leak;    /* timestamp of the last leak done */
+    QEMUTimer * timers[2];    /* timers used to do the throttling */
+    QEMUClockType clock_type; /* the clock used */
+} ThrottleState;
+
+/* operations on single leaky buckets */
+void throttle_leak_bucket(LeakyBucket *bkt, int64_t delta);
+
+int64_t throttle_compute_wait(LeakyBucket *bkt);
+
+/* expose timer computation function for unit tests */
+bool throttle_compute_timer(ThrottleState *ts,
+                            bool is_write,
+                            int64_t now,
+                            int64_t *next_timestamp);
+
+/* init/destroy cycle */
+void throttle_init(ThrottleState *ts,
+                   QEMUClockType clock_type,
+                   void (read_timer)(void *),
+                   void (write_timer)(void *),
+                   void *timer_opaque);
+
+void throttle_destroy(ThrottleState *ts);
+
+bool throttle_have_timer(ThrottleState *ts);
+
+/* configuration */
+bool throttle_enabled(ThrottleConfig *cfg);
+
+bool throttle_conflicting(ThrottleConfig *cfg);
+
+bool throttle_is_valid(ThrottleConfig *cfg);
+
+void throttle_config(ThrottleState *ts, ThrottleConfig *cfg);
+
+void throttle_get_config(ThrottleState *ts, ThrottleConfig *cfg);
+
+/* usage */
+bool throttle_schedule_timer(ThrottleState *ts, bool is_write);
+
+void throttle_account(ThrottleState *ts, bool is_write, uint64_t size);
+
+#endif
diff --git a/nbd.c b/nbd.c
index 0fd05836ca..f847940f3e 100644
--- a/nbd.c
+++ b/nbd.c
@@ -882,6 +882,7 @@ NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
     exp->nbdflags = nbdflags;
     exp->size = size == -1 ? bdrv_getlength(bs) : size;
     exp->close = close;
+    bdrv_ref(bs);
     return exp;
 }
 
@@ -928,6 +929,10 @@ void nbd_export_close(NBDExport *exp)
     }
     nbd_export_set_name(exp, NULL);
     nbd_export_put(exp);
+    if (exp->bs) {
+        bdrv_unref(exp->bs);
+        exp->bs = NULL;
+    }
 }
 
 void nbd_export_get(NBDExport *exp)
diff --git a/qapi-schema.json b/qapi-schema.json
index a51f7d2d6e..2b2c8bce07 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -785,6 +785,20 @@
 #
 # @image: the info of image used (since: 1.6)
 #
+# @bps_max: #optional total max in bytes (Since 1.7)
+#
+# @bps_rd_max: #optional read max in bytes (Since 1.7)
+#
+# @bps_wr_max: #optional write max in bytes (Since 1.7)
+#
+# @iops_max: #optional total I/O operations max (Since 1.7)
+#
+# @iops_rd_max: #optional read I/O operations max (Since 1.7)
+#
+# @iops_wr_max: #optional write I/O operations max (Since 1.7)
+#
+# @iops_size: #optional an I/O size in bytes (Since 1.7)
+#
 # Since: 0.14.0
 #
 # Notes: This interface is only found in @BlockInfo.
@@ -795,7 +809,11 @@
             'encrypted': 'bool', 'encryption_key_missing': 'bool',
             'bps': 'int', 'bps_rd': 'int', 'bps_wr': 'int',
             'iops': 'int', 'iops_rd': 'int', 'iops_wr': 'int',
-            'image': 'ImageInfo' } }
+            'image': 'ImageInfo',
+            '*bps_max': 'int', '*bps_rd_max': 'int',
+            '*bps_wr_max': 'int', '*iops_max': 'int',
+            '*iops_rd_max': 'int', '*iops_wr_max': 'int',
+            '*iops_size': 'int' } }
 
 ##
 # @BlockDeviceIoStatus:
@@ -812,6 +830,35 @@
 ##
 { 'enum': 'BlockDeviceIoStatus', 'data': [ 'ok', 'failed', 'nospace' ] }
 
+##
+# @BlockDeviceMapEntry:
+#
+# Entry in the metadata map of the device (returned by "qemu-img map")
+#
+# @start: Offset in the image of the first byte described by this entry
+#         (in bytes)
+#
+# @length: Length of the range described by this entry (in bytes)
+#
+# @depth: Number of layers (0 = top image, 1 = top image's backing file, etc.)
+#         before reaching one for which the range is allocated.  The value is
+#         in the range 0 to the depth of the image chain - 1.
+#
+# @zero: the sectors in this range read as zeros
+#
+# @data: reading the image will actually read data from a file (in particular,
+#        if @offset is present this means that the sectors are not simply
+#        preallocated, but contain actual data in raw format)
+#
+# @offset: if present, the image file stores the data for this range in
+#          raw format at the given offset.
+#
+# Since 1.7
+##
+{ 'type': 'BlockDeviceMapEntry',
+  'data': { 'start': 'int', 'length': 'int', 'depth': 'int', 'zero': 'bool',
+            'data': 'bool', '*offset': 'int' } }
+
 ##
 # @BlockDirtyInfo:
 #
@@ -2174,6 +2221,20 @@
 #
 # @iops_wr: write I/O operations per second
 #
+# @bps_max: #optional total max in bytes (Since 1.7)
+#
+# @bps_rd_max: #optional read max in bytes (Since 1.7)
+#
+# @bps_wr_max: #optional write max in bytes (Since 1.7)
+#
+# @iops_max: #optional total I/O operations max (Since 1.7)
+#
+# @iops_rd_max: #optional read I/O operations max (Since 1.7)
+#
+# @iops_wr_max: #optional write I/O operations max (Since 1.7)
+#
+# @iops_size: #optional an I/O size in bytes (Since 1.7)
+#
 # Returns: Nothing on success
 #          If @device is not a valid block device, DeviceNotFound
 #
@@ -2181,7 +2242,11 @@
 ##
 { 'command': 'block_set_io_throttle',
   'data': { 'device': 'str', 'bps': 'int', 'bps_rd': 'int', 'bps_wr': 'int',
-            'iops': 'int', 'iops_rd': 'int', 'iops_wr': 'int' } }
+            'iops': 'int', 'iops_rd': 'int', 'iops_wr': 'int',
+            '*bps_max': 'int', '*bps_rd_max': 'int',
+            '*bps_wr_max': 'int', '*iops_max': 'int',
+            '*iops_rd_max': 'int', '*iops_wr_max': 'int',
+            '*iops_size': 'int' } }
 
 ##
 # @block-stream:
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 4ca7e95655..0c36e5968f 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -34,9 +34,9 @@ STEXI
 ETEXI
 
 DEF("convert", img_convert,
-    "convert [-c] [-p] [-q] [-f fmt] [-t cache] [-O output_fmt] [-o options] [-s snapshot_name] [-S sparse_size] filename [filename2 [...]] output_filename")
+    "convert [-c] [-p] [-q] [-n] [-f fmt] [-t cache] [-O output_fmt] [-o options] [-s snapshot_name] [-S sparse_size] filename [filename2 [...]] output_filename")
 STEXI
-@item convert [-c] [-p] [-q] [-f @var{fmt}] [-t @var{cache}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_name}] [-S @var{sparse_size}] @var{filename} [@var{filename2} [...]] @var{output_filename}
+@item convert [-c] [-p] [-q] [-n] [-f @var{fmt}] [-t @var{cache}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_name}] [-S @var{sparse_size}] @var{filename} [@var{filename2} [...]] @var{output_filename}
 ETEXI
 
 DEF("info", img_info,
@@ -45,6 +45,12 @@ STEXI
 @item info [-f @var{fmt}] [--output=@var{ofmt}] [--backing-chain] @var{filename}
 ETEXI
 
+DEF("map", img_map,
+    "map [-f fmt] [--output=ofmt] filename")
+STEXI
+@item map [-f @var{fmt}] [--output=@var{ofmt}] @var{filename}
+ETEXI
+
 DEF("snapshot", img_snapshot,
     "snapshot [-q] [-l | -a snapshot | -c snapshot | -d snapshot] filename")
 STEXI
diff --git a/qemu-img.c b/qemu-img.c
index b9a848db74..3e5e388d1c 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -103,6 +103,8 @@ static void help(void)
            "  '-S' indicates the consecutive number of bytes that must contain only zeros\n"
            "       for qemu-img to create a sparse image during conversion\n"
            "  '--output' takes the format in which the output must be done (human or json)\n"
+           "  '-n' skips the target volume creation (useful if the volume is created\n"
+           "       prior to running qemu-img)\n"
            "\n"
            "Parameters to check subcommand:\n"
            "  '-r' tries to repair any inconsistencies that are found during the check.\n"
@@ -298,7 +300,7 @@ static BlockDriverState *bdrv_new_open(const char *filename,
     return bs;
 fail:
     if (bs) {
-        bdrv_delete(bs);
+        bdrv_unref(bs);
     }
     return NULL;
 }
@@ -652,7 +654,7 @@ static int img_check(int argc, char **argv)
 
 fail:
     qapi_free_ImageCheck(check);
-    bdrv_delete(bs);
+    bdrv_unref(bs);
 
     return ret;
 }
@@ -722,7 +724,7 @@ static int img_commit(int argc, char **argv)
         break;
     }
 
-    bdrv_delete(bs);
+    bdrv_unref(bs);
     if (ret) {
         return 1;
     }
@@ -1104,11 +1106,11 @@ static int img_compare(int argc, char **argv)
     ret = 0;
 
 out:
-    bdrv_delete(bs2);
+    bdrv_unref(bs2);
     qemu_vfree(buf1);
     qemu_vfree(buf2);
 out2:
-    bdrv_delete(bs1);
+    bdrv_unref(bs1);
 out3:
     qemu_progress_end();
     return ret;
@@ -1116,7 +1118,8 @@ out3:
 
 static int img_convert(int argc, char **argv)
 {
-    int c, ret = 0, n, n1, bs_n, bs_i, compress, cluster_size, cluster_sectors;
+    int c, ret = 0, n, n1, bs_n, bs_i, compress, cluster_size,
+        cluster_sectors, skip_create;
     int progress = 0, flags;
     const char *fmt, *out_fmt, *cache, *out_baseimg, *out_filename;
     BlockDriver *drv, *proto_drv;
@@ -1139,8 +1142,9 @@ static int img_convert(int argc, char **argv)
     cache = "unsafe";
     out_baseimg = NULL;
     compress = 0;
+    skip_create = 0;
     for(;;) {
-        c = getopt(argc, argv, "f:O:B:s:hce6o:pS:t:q");
+        c = getopt(argc, argv, "f:O:B:s:hce6o:pS:t:qn");
         if (c == -1) {
             break;
         }
@@ -1197,6 +1201,9 @@ static int img_convert(int argc, char **argv)
         case 'q':
             quiet = true;
             break;
+        case 'n':
+            skip_create = 1;
+            break;
         }
     }
 
@@ -1329,20 +1336,22 @@ static int img_convert(int argc, char **argv)
         }
     }
 
-    /* Create the new image */
-    ret = bdrv_create(drv, out_filename, param);
-    if (ret < 0) {
-        if (ret == -ENOTSUP) {
-            error_report("Formatting not supported for file format '%s'",
-                         out_fmt);
-        } else if (ret == -EFBIG) {
-            error_report("The image size is too large for file format '%s'",
-                         out_fmt);
-        } else {
-            error_report("%s: error while converting %s: %s",
-                         out_filename, out_fmt, strerror(-ret));
+    if (!skip_create) {
+        /* Create the new image */
+        ret = bdrv_create(drv, out_filename, param);
+        if (ret < 0) {
+            if (ret == -ENOTSUP) {
+                error_report("Formatting not supported for file format '%s'",
+                             out_fmt);
+            } else if (ret == -EFBIG) {
+                error_report("The image size is too large for file format '%s'",
+                             out_fmt);
+            } else {
+                error_report("%s: error while converting %s: %s",
+                             out_filename, out_fmt, strerror(-ret));
+            }
+            goto out;
         }
-        goto out;
     }
 
     flags = BDRV_O_RDWR;
@@ -1363,6 +1372,20 @@ static int img_convert(int argc, char **argv)
     bdrv_get_geometry(bs[0], &bs_sectors);
     buf = qemu_blockalign(out_bs, IO_BUF_SIZE);
 
+    if (skip_create) {
+        int64_t output_length = bdrv_getlength(out_bs);
+        if (output_length < 0) {
+            error_report("unable to get output image length: %s\n",
+                         strerror(-output_length));
+            ret = -1;
+            goto out;
+        } else if (output_length < total_sectors << BDRV_SECTOR_BITS) {
+            error_report("output file is smaller than input file");
+            ret = -1;
+            goto out;
+        }
+    }
+
     if (compress) {
         ret = bdrv_get_info(out_bs, &bdi);
         if (ret < 0) {
@@ -1479,21 +1502,26 @@ static int img_convert(int argc, char **argv)
                 n = bs_offset + bs_sectors - sector_num;
             }
 
-            if (has_zero_init) {
-                /* If the output image is being created as a copy on write image,
-                   assume that sectors which are unallocated in the input image
-                   are present in both the output's and input's base images (no
-                   need to copy them). */
-                if (out_baseimg) {
-                    if (!bdrv_is_allocated(bs[bs_i], sector_num - bs_offset,
-                                           n, &n1)) {
-                        sector_num += n1;
-                        continue;
-                    }
-                    /* The next 'n1' sectors are allocated in the input image. Copy
-                       only those as they may be followed by unallocated sectors. */
-                    n = n1;
+            /* If the output image is being created as a copy on write image,
+               assume that sectors which are unallocated in the input image
+               are present in both the output's and input's base images (no
+               need to copy them). */
+            if (out_baseimg) {
+                ret = bdrv_is_allocated(bs[bs_i], sector_num - bs_offset,
+                                        n, &n1);
+                if (ret < 0) {
+                    error_report("error while reading metadata for sector "
+                                 "%" PRId64 ": %s",
+                                 sector_num - bs_offset, strerror(-ret));
+                    goto out;
                 }
+                if (!ret) {
+                    sector_num += n1;
+                    continue;
+                }
+                /* The next 'n1' sectors are allocated in the input image. Copy
+                   only those as they may be followed by unallocated sectors. */
+                n = n1;
             } else {
                 n1 = n;
             }
@@ -1509,14 +1537,7 @@ static int img_convert(int argc, char **argv)
                should add a specific call to have the info to go faster */
             buf1 = buf;
             while (n > 0) {
-                /* If the output image is being created as a copy on write image,
-                   copy all sectors even the ones containing only NUL bytes,
-                   because they may differ from the sectors in the base image.
-
-                   If the output is to a host device, we also write out
-                   sectors that are entirely 0, since whatever data was
-                   already there is garbage, not 0s. */
-                if (!has_zero_init || out_baseimg ||
+                if (!has_zero_init ||
                     is_allocated_sectors_min(buf1, n, &n1, min_sparse)) {
                     ret = bdrv_write(out_bs, sector_num, buf1, n1);
                     if (ret < 0) {
@@ -1538,12 +1559,12 @@ out:
     free_option_parameters(param);
     qemu_vfree(buf);
     if (out_bs) {
-        bdrv_delete(out_bs);
+        bdrv_unref(out_bs);
     }
     if (bs) {
         for (bs_i = 0; bs_i < bs_n; bs_i++) {
             if (bs[bs_i]) {
-                bdrv_delete(bs[bs_i]);
+                bdrv_unref(bs[bs_i]);
             }
         }
         g_free(bs);
@@ -1681,7 +1702,7 @@ static ImageInfoList *collect_image_info_list(const char *filename,
         *last = elem;
         last = &elem->next;
 
-        bdrv_delete(bs);
+        bdrv_unref(bs);
 
         filename = fmt = NULL;
         if (chain) {
@@ -1780,6 +1801,197 @@ static int img_info(int argc, char **argv)
     return 0;
 }
 
+
+typedef struct MapEntry {
+    int flags;
+    int depth;
+    int64_t start;
+    int64_t length;
+    int64_t offset;
+    BlockDriverState *bs;
+} MapEntry;
+
+static void dump_map_entry(OutputFormat output_format, MapEntry *e,
+                           MapEntry *next)
+{
+    switch (output_format) {
+    case OFORMAT_HUMAN:
+        if ((e->flags & BDRV_BLOCK_DATA) &&
+            !(e->flags & BDRV_BLOCK_OFFSET_VALID)) {
+            error_report("File contains external, encrypted or compressed clusters.");
+            exit(1);
+        }
+        if ((e->flags & (BDRV_BLOCK_DATA|BDRV_BLOCK_ZERO)) == BDRV_BLOCK_DATA) {
+            printf("%#-16"PRIx64"%#-16"PRIx64"%#-16"PRIx64"%s\n",
+                   e->start, e->length, e->offset, e->bs->filename);
+        }
+        /* This format ignores the distinction between 0, ZERO and ZERO|DATA.
+         * Modify the flags here to allow more coalescing.
+         */
+        if (next &&
+            (next->flags & (BDRV_BLOCK_DATA|BDRV_BLOCK_ZERO)) != BDRV_BLOCK_DATA) {
+            next->flags &= ~BDRV_BLOCK_DATA;
+            next->flags |= BDRV_BLOCK_ZERO;
+        }
+        break;
+    case OFORMAT_JSON:
+        printf("%s{ \"start\": %"PRId64", \"length\": %"PRId64", \"depth\": %d,"
+               " \"zero\": %s, \"data\": %s",
+               (e->start == 0 ? "[" : ",\n"),
+               e->start, e->length, e->depth,
+               (e->flags & BDRV_BLOCK_ZERO) ? "true" : "false",
+               (e->flags & BDRV_BLOCK_DATA) ? "true" : "false");
+        if (e->flags & BDRV_BLOCK_OFFSET_VALID) {
+            printf(", 'offset': %"PRId64"", e->offset);
+        }
+        putchar('}');
+
+        if (!next) {
+            printf("]\n");
+        }
+        break;
+    }
+}
+
+static int get_block_status(BlockDriverState *bs, int64_t sector_num,
+                            int nb_sectors, MapEntry *e)
+{
+    int64_t ret;
+    int depth;
+
+    /* As an optimization, we could cache the current range of unallocated
+     * clusters in each file of the chain, and avoid querying the same
+     * range repeatedly.
+     */
+
+    depth = 0;
+    for (;;) {
+        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &nb_sectors);
+        if (ret < 0) {
+            return ret;
+        }
+        assert(nb_sectors);
+        if (ret & (BDRV_BLOCK_ZERO|BDRV_BLOCK_DATA)) {
+            break;
+        }
+        bs = bs->backing_hd;
+        if (bs == NULL) {
+            ret = 0;
+            break;
+        }
+
+        depth++;
+    }
+
+    e->start = sector_num * BDRV_SECTOR_SIZE;
+    e->length = nb_sectors * BDRV_SECTOR_SIZE;
+    e->flags = ret & ~BDRV_BLOCK_OFFSET_MASK;
+    e->offset = ret & BDRV_BLOCK_OFFSET_MASK;
+    e->depth = depth;
+    e->bs = bs;
+    return 0;
+}
+
+static int img_map(int argc, char **argv)
+{
+    int c;
+    OutputFormat output_format = OFORMAT_HUMAN;
+    BlockDriverState *bs;
+    const char *filename, *fmt, *output;
+    int64_t length;
+    MapEntry curr = { .length = 0 }, next;
+    int ret = 0;
+
+    fmt = NULL;
+    output = NULL;
+    for (;;) {
+        int option_index = 0;
+        static const struct option long_options[] = {
+            {"help", no_argument, 0, 'h'},
+            {"format", required_argument, 0, 'f'},
+            {"output", required_argument, 0, OPTION_OUTPUT},
+            {0, 0, 0, 0}
+        };
+        c = getopt_long(argc, argv, "f:h",
+                        long_options, &option_index);
+        if (c == -1) {
+            break;
+        }
+        switch (c) {
+        case '?':
+        case 'h':
+            help();
+            break;
+        case 'f':
+            fmt = optarg;
+            break;
+        case OPTION_OUTPUT:
+            output = optarg;
+            break;
+        }
+    }
+    if (optind >= argc) {
+        help();
+    }
+    filename = argv[optind++];
+
+    if (output && !strcmp(output, "json")) {
+        output_format = OFORMAT_JSON;
+    } else if (output && !strcmp(output, "human")) {
+        output_format = OFORMAT_HUMAN;
+    } else if (output) {
+        error_report("--output must be used with human or json as argument.");
+        return 1;
+    }
+
+    bs = bdrv_new_open(filename, fmt, BDRV_O_FLAGS, true, false);
+    if (!bs) {
+        return 1;
+    }
+
+    if (output_format == OFORMAT_HUMAN) {
+        printf("%-16s%-16s%-16s%s\n", "Offset", "Length", "Mapped to", "File");
+    }
+
+    length = bdrv_getlength(bs);
+    while (curr.start + curr.length < length) {
+        int64_t nsectors_left;
+        int64_t sector_num;
+        int n;
+
+        sector_num = (curr.start + curr.length) >> BDRV_SECTOR_BITS;
+
+        /* Probe up to 1 GiB at a time.  */
+        nsectors_left = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE) - sector_num;
+        n = MIN(1 << (30 - BDRV_SECTOR_BITS), nsectors_left);
+        ret = get_block_status(bs, sector_num, n, &next);
+
+        if (ret < 0) {
+            error_report("Could not read file metadata: %s", strerror(-ret));
+            goto out;
+        }
+
+        if (curr.length != 0 && curr.flags == next.flags &&
+            curr.depth == next.depth &&
+            ((curr.flags & BDRV_BLOCK_OFFSET_VALID) == 0 ||
+             curr.offset + curr.length == next.offset)) {
+            curr.length += next.length;
+            continue;
+        }
+
+        if (curr.length > 0) {
+            dump_map_entry(output_format, &curr, &next);
+        }
+        curr = next;
+    }
+
+    dump_map_entry(output_format, &curr, NULL);
+
+out:
+    bdrv_unref(bs);
+    return ret < 0;
+}
+
 #define SNAPSHOT_LIST   1
 #define SNAPSHOT_CREATE 2
 #define SNAPSHOT_APPLY  3
@@ -1895,7 +2107,7 @@ static int img_snapshot(int argc, char **argv)
     }
 
     /* Cleanup */
-    bdrv_delete(bs);
+    bdrv_unref(bs);
     if (ret) {
         return 1;
     }
@@ -2076,6 +2288,11 @@ static int img_rebase(int argc, char **argv)
 
             /* If the cluster is allocated, we don't need to take action */
             ret = bdrv_is_allocated(bs, sector, n, &n);
+            if (ret < 0) {
+                error_report("error while reading image metadata: %s",
+                             strerror(-ret));
+                goto out;
+            }
             if (ret) {
                 continue;
             }
@@ -2170,14 +2387,14 @@ out:
     /* Cleanup */
     if (!unsafe) {
         if (bs_old_backing != NULL) {
-            bdrv_delete(bs_old_backing);
+            bdrv_unref(bs_old_backing);
         }
         if (bs_new_backing != NULL) {
-            bdrv_delete(bs_new_backing);
+            bdrv_unref(bs_new_backing);
         }
     }
 
-    bdrv_delete(bs);
+    bdrv_unref(bs);
     if (ret) {
         return 1;
     }
@@ -2300,7 +2517,7 @@ static int img_resize(int argc, char **argv)
     }
 out:
     if (bs) {
-        bdrv_delete(bs);
+        bdrv_unref(bs);
     }
     if (ret) {
         return 1;
diff --git a/qemu-img.texi b/qemu-img.texi
index 69f1bda6ae..43ee4eb5c4 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -96,6 +96,14 @@ Second image format
 Strict mode - fail on on different image size or sector allocation
 @end table
 
+Parameters to convert subcommand:
+
+@table @option
+
+@item -n
+Skip the creation of the target volume
+@end table
+
 Command description:
 
 @table @option
@@ -171,7 +179,7 @@ Error on reading data
 
 @end table
 
-@item convert [-c] [-p] [-f @var{fmt}] [-t @var{cache}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_name}] [-S @var{sparse_size}] @var{filename} [@var{filename2} [...]] @var{output_filename}
+@item convert [-c] [-p] [-n] [-f @var{fmt}] [-t @var{cache}] [-O @var{output_fmt}] [-o @var{options}] [-s @var{snapshot_name}] [-S @var{sparse_size}] @var{filename} [@var{filename2} [...]] @var{output_filename}
 
 Convert the disk image @var{filename} or a snapshot @var{snapshot_name} to disk image @var{output_filename}
 using format @var{output_fmt}. It can be optionally compressed (@code{-c}
@@ -190,6 +198,11 @@ created as a copy on write image of the specified base image; the
 @var{backing_file} should have the same content as the input's base image,
 however the path, image format, etc may differ.
 
+If the @code{-n} option is specified, the target volume creation will be
+skipped. This is useful for formats such as @code{rbd} if the target
+volume has already been created with site specific options that cannot
+be supplied through qemu-img.
+
 @item info [-f @var{fmt}] [--output=@var{ofmt}] [--backing-chain] @var{filename}
 
 Give information about the disk image @var{filename}. Use it in
@@ -213,6 +226,61 @@ To enumerate information about each disk image in the above chain, starting from
 qemu-img info --backing-chain snap2.qcow2
 @end example
 
+@item map [-f @var{fmt}] [--output=@var{ofmt}] @var{filename}
+
+Dump the metadata of image @var{filename} and its backing file chain.
+In particular, this commands dumps the allocation state of every sector
+of @var{filename}, together with the topmost file that allocates it in
+the backing file chain.
+
+Two option formats are possible.  The default format (@code{human})
+only dumps known-nonzero areas of the file.  Known-zero parts of the
+file are omitted altogether, and likewise for parts that are not allocated
+throughout the chain.  @command{qemu-img} output will identify a file
+from where the data can be read, and the offset in the file.  Each line
+will include four fields, the first three of which are hexadecimal
+numbers.  For example the first line of:
+@example
+Offset          Length          Mapped to       File
+0               0x20000         0x50000         /tmp/overlay.qcow2
+0x100000        0x10000         0x95380000      /tmp/backing.qcow2
+@end example
+@noindent
+means that 0x20000 (131072) bytes starting at offset 0 in the image are
+available in /tmp/overlay.qcow2 (opened in @code{raw} format) starting
+at offset 0x50000 (327680).  Data that is compressed, encrypted, or
+otherwise not available in raw format will cause an error if @code{human}
+format is in use.  Note that file names can include newlines, thus it is
+not safe to parse this output format in scripts.
+
+The alternative format @code{json} will return an array of dictionaries
+in JSON format.  It will include similar information in
+the @code{start}, @code{length}, @code{offset} fields;
+it will also include other more specific information:
+@itemize @minus
+@item
+whether the sectors contain actual data or not (boolean field @code{data};
+if false, the sectors are either unallocated or stored as optimized
+all-zero clusters);
+
+@item
+whether the data is known to read as zero (boolean field @code{zero});
+
+@item
+in order to make the output shorter, the target file is expressed as
+a @code{depth}; for example, a depth of 2 refers to the backing file
+of the backing file of @var{filename}.
+@end itemize
+
+In JSON format, the @code{offset} field is optional; it is absent in
+cases where @code{human} format would omit the entry or exit with an error.
+If @code{data} is false and the @code{offset} field is present, the
+corresponding sectors in the file are not yet in use, but they are
+preallocated.
+
+For more information, consult @file{include/block/block.h} in QEMU's
+source code.
+
 @item snapshot [-l | -a @var{snapshot} | -c @var{snapshot} | -d @var{snapshot} ] @var{filename}
 
 List, apply, create or delete snapshots in image @var{filename}.
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index f91b6c4f02..8565d49336 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1830,6 +1830,10 @@ static int alloc_f(BlockDriverState *bs, int argc, char **argv)
     sector_num = offset >> 9;
     while (remaining) {
         ret = bdrv_is_allocated(bs, sector_num, remaining, &num);
+        if (ret < 0) {
+            printf("is_allocated failed: %s\n", strerror(-ret));
+            return 0;
+        }
         sector_num += num;
         remaining -= num;
         if (ret) {
diff --git a/qemu-io.c b/qemu-io.c
index d54dc86921..71f4ff1302 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -32,7 +32,7 @@ static char **cmdline;
 
 static int close_f(BlockDriverState *bs, int argc, char **argv)
 {
-    bdrv_delete(bs);
+    bdrv_unref(bs);
     qemuio_bs = NULL;
     return 0;
 }
@@ -61,7 +61,7 @@ static int openfile(char *name, int flags, int growable)
 
         if (bdrv_open(qemuio_bs, name, NULL, flags, NULL) < 0) {
             fprintf(stderr, "%s: can't open device %s\n", progname, name);
-            bdrv_delete(qemuio_bs);
+            bdrv_unref(qemuio_bs);
             qemuio_bs = NULL;
             return 1;
         }
@@ -422,7 +422,7 @@ int main(int argc, char **argv)
     bdrv_drain_all();
 
     if (qemuio_bs) {
-        bdrv_delete(qemuio_bs);
+        bdrv_unref(qemuio_bs);
     }
     return 0;
 }
diff --git a/qemu-options.hx b/qemu-options.hx
index d15338e879..5dc8b75cdb 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -409,7 +409,11 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "       [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n"
     "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
     "       [,readonly=on|off][,copy-on-read=on|off]\n"
-    "       [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n"
+    "       [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]]\n"
+    "       [[,iops=i]|[[,iops_rd=r][,iops_wr=w]]]\n"
+    "       [[,bps_max=bm]|[[,bps_rd_max=rm][,bps_wr_max=wm]]]\n"
+    "       [[,iops_max=im]|[[,iops_rd_max=irm][,iops_wr_max=iwm]]]\n"
+    "       [[,iops_size=is]]\n"
     "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
 STEXI
 @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 8a8f342eab..008cad95a2 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -1389,7 +1389,7 @@ EQMP
 
     {
         .name       = "block_set_io_throttle",
-        .args_type  = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l",
+        .args_type  = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l,bps_max:l?,bps_rd_max:l?,bps_wr_max:l?,iops_max:l?,iops_rd_max:l?,iops_wr_max:l?,iops_size:l?",
         .mhandler.cmd_new = qmp_marshal_input_block_set_io_throttle,
     },
 
@@ -1408,6 +1408,13 @@ Arguments:
 - "iops": total I/O operations per second (json-int)
 - "iops_rd": read I/O operations per second (json-int)
 - "iops_wr": write I/O operations per second (json-int)
+- "bps_max":  total max in bytes (json-int)
+- "bps_rd_max":  read max in bytes (json-int)
+- "bps_wr_max":  write max in bytes (json-int)
+- "iops_max":  total I/O operations max (json-int)
+- "iops_rd_max":  read I/O operations max (json-int)
+- "iops_wr_max":  write I/O operations max (json-int)
+- "iops_size":  I/O size in bytes when limiting (json-int)
 
 Example:
 
@@ -1417,7 +1424,14 @@ Example:
                                                "bps_wr": 0,
                                                "iops": 0,
                                                "iops_rd": 0,
-                                               "iops_wr": 0 } }
+                                               "iops_wr": 0,
+                                               "bps_max": 8000000,
+                                               "bps_rd_max": 0,
+                                               "bps_wr_max": 0,
+                                               "iops_max": 0,
+                                               "iops_rd_max": 0,
+                                               "iops_wr_max": 0,
+                                               "iops_size": 0 } }
 <- { "return": {} }
 
 EQMP
@@ -1758,6 +1772,13 @@ Each json-object contain the following:
          - "iops": limit total I/O operations per second (json-int)
          - "iops_rd": limit read operations per second (json-int)
          - "iops_wr": limit write operations per second (json-int)
+         - "bps_max":  total max in bytes (json-int)
+         - "bps_rd_max":  read max in bytes (json-int)
+         - "bps_wr_max":  write max in bytes (json-int)
+         - "iops_max":  total I/O operations max (json-int)
+         - "iops_rd_max":  read I/O operations max (json-int)
+         - "iops_wr_max":  write I/O operations max (json-int)
+         - "iops_size": I/O size when limiting by iops (json-int)
          - "image": the detail of the image, it is a json-object containing
             the following:
              - "filename": image file name (json-string)
@@ -1827,6 +1848,13 @@ Example:
                "iops":1000000,
                "iops_rd":0,
                "iops_wr":0,
+               "bps_max": 8000000,
+               "bps_rd_max": 0,
+               "bps_wr_max": 0,
+               "iops_max": 0,
+               "iops_rd_max": 0,
+               "iops_wr_max": 0,
+               "iops_size": 0,
                "image":{
                   "filename":"disks/test.qcow2",
                   "format":"qcow2",
diff --git a/tests/Makefile b/tests/Makefile
index baba9e95ad..c13fefc314 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -31,6 +31,7 @@ check-unit-y += tests/test-visitor-serialization$(EXESUF)
 check-unit-y += tests/test-iov$(EXESUF)
 gcov-files-test-iov-y = util/iov.c
 check-unit-y += tests/test-aio$(EXESUF)
+check-unit-y += tests/test-throttle$(EXESUF)
 gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
 gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
 check-unit-y += tests/test-thread-pool$(EXESUF)
@@ -120,6 +121,7 @@ tests/check-qfloat$(EXESUF): tests/check-qfloat.o libqemuutil.a
 tests/check-qjson$(EXESUF): tests/check-qjson.o libqemuutil.a libqemustub.a
 tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(block-obj-y) libqemuutil.a libqemustub.a
 tests/test-aio$(EXESUF): tests/test-aio.o $(block-obj-y) libqemuutil.a libqemustub.a
+tests/test-throttle$(EXESUF): tests/test-throttle.o $(block-obj-y) libqemuutil.a libqemustub.a
 tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(block-obj-y) libqemuutil.a libqemustub.a
 tests/test-iov$(EXESUF): tests/test-iov.o libqemuutil.a
 tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o libqemuutil.a libqemustub.a
diff --git a/tests/qemu-iotests/026.out b/tests/qemu-iotests/026.out
index fb4f20e7cd..0764389f8e 100644
--- a/tests/qemu-iotests/026.out
+++ b/tests/qemu-iotests/026.out
@@ -126,62 +126,64 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 Event: l2_update; errno: 5; imm: off; once: on; write 
 write failed: Input/output error
 
-128 leaked clusters were found on the image.
+127 leaked clusters were found on the image.
 This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_update; errno: 5; imm: off; once: on; write -b
 write failed: Input/output error
 
-128 leaked clusters were found on the image.
+127 leaked clusters were found on the image.
 This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_update; errno: 5; imm: off; once: off; write 
 write failed: Input/output error
 
-128 leaked clusters were found on the image.
+127 leaked clusters were found on the image.
 This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_update; errno: 5; imm: off; once: off; write -b
 write failed: Input/output error
 
-128 leaked clusters were found on the image.
+127 leaked clusters were found on the image.
 This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_update; errno: 28; imm: off; once: on; write 
 write failed: No space left on device
 
-128 leaked clusters were found on the image.
+127 leaked clusters were found on the image.
 This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_update; errno: 28; imm: off; once: on; write -b
 write failed: No space left on device
 
-128 leaked clusters were found on the image.
+127 leaked clusters were found on the image.
 This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_update; errno: 28; imm: off; once: off; write 
 write failed: No space left on device
 
-128 leaked clusters were found on the image.
+127 leaked clusters were found on the image.
 This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_update; errno: 28; imm: off; once: off; write -b
 write failed: No space left on device
 
-128 leaked clusters were found on the image.
+127 leaked clusters were found on the image.
 This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_alloc.write; errno: 5; imm: off; once: on; write 
 write failed: Input/output error
-No errors were found on the image.
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_alloc.write; errno: 5; imm: off; once: on; write -b
@@ -205,7 +207,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 
 Event: l2_alloc.write; errno: 28; imm: off; once: on; write 
 write failed: No space left on device
-No errors were found on the image.
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l2_alloc.write; errno: 28; imm: off; once: on; write -b
@@ -575,7 +579,6 @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l1_grow.write_table; errno: 5; imm: off; once: off
-qcow2_free_clusters failed: Input/output error
 write failed: Input/output error
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -586,7 +589,6 @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l1_grow.write_table; errno: 28; imm: off; once: off
-qcow2_free_clusters failed: No space left on device
 write failed: No space left on device
 No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
@@ -597,7 +599,6 @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l1_grow.activate_table; errno: 5; imm: off; once: off
-qcow2_free_clusters failed: Input/output error
 write failed: Input/output error
 
 96 leaked clusters were found on the image.
@@ -610,7 +611,6 @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
 
 Event: l1_grow.activate_table; errno: 28; imm: off; once: off
-qcow2_free_clusters failed: No space left on device
 write failed: No space left on device
 
 96 leaked clusters were found on the image.
diff --git a/tests/qemu-iotests/026.out.nocache b/tests/qemu-iotests/026.out.nocache
new file mode 100644
index 0000000000..33bad0d6ae
--- /dev/null
+++ b/tests/qemu-iotests/026.out.nocache
@@ -0,0 +1,626 @@
+QA output created by 026
+Errors while writing 128 kB
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_update; errno: 5; imm: off; once: on; write 
+write failed: Input/output error
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_update; errno: 5; imm: off; once: on; write -b
+write failed: Input/output error
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_update; errno: 5; imm: off; once: off; write 
+write failed: Input/output error
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_update; errno: 5; imm: off; once: off; write -b
+write failed: Input/output error
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_update; errno: 28; imm: off; once: on; write 
+write failed: No space left on device
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_update; errno: 28; imm: off; once: on; write -b
+write failed: No space left on device
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_update; errno: 28; imm: off; once: off; write 
+write failed: No space left on device
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_update; errno: 28; imm: off; once: off; write -b
+write failed: No space left on device
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_load; errno: 5; imm: off; once: on; write 
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+write failed: Input/output error
+read failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_load; errno: 5; imm: off; once: on; write -b
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+write failed: Input/output error
+read failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_load; errno: 5; imm: off; once: off; write 
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+write failed: Input/output error
+read failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_load; errno: 5; imm: off; once: off; write -b
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+write failed: Input/output error
+read failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_load; errno: 28; imm: off; once: on; write 
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+write failed: No space left on device
+read failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_load; errno: 28; imm: off; once: on; write -b
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+write failed: No space left on device
+read failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_load; errno: 28; imm: off; once: off; write 
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+write failed: No space left on device
+read failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_load; errno: 28; imm: off; once: off; write -b
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+write failed: No space left on device
+read failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_update; errno: 5; imm: off; once: on; write 
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+127 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_update; errno: 5; imm: off; once: on; write -b
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+127 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_update; errno: 5; imm: off; once: off; write 
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+127 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_update; errno: 5; imm: off; once: off; write -b
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+127 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_update; errno: 28; imm: off; once: on; write 
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+127 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_update; errno: 28; imm: off; once: on; write -b
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+127 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_update; errno: 28; imm: off; once: off; write 
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+127 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_update; errno: 28; imm: off; once: off; write -b
+wrote 131072/131072 bytes at offset 0
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+127 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_alloc.write; errno: 5; imm: off; once: on; write 
+write failed: Input/output error
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_alloc.write; errno: 5; imm: off; once: on; write -b
+write failed: Input/output error
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_alloc.write; errno: 5; imm: off; once: off; write 
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_alloc.write; errno: 5; imm: off; once: off; write -b
+write failed: Input/output error
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_alloc.write; errno: 28; imm: off; once: on; write 
+write failed: No space left on device
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_alloc.write; errno: 28; imm: off; once: on; write -b
+write failed: No space left on device
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_alloc.write; errno: 28; imm: off; once: off; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l2_alloc.write; errno: 28; imm: off; once: off; write -b
+write failed: No space left on device
+
+1 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: write_aio; errno: 5; imm: off; once: on; write 
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: write_aio; errno: 5; imm: off; once: on; write -b
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: write_aio; errno: 5; imm: off; once: off; write 
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: write_aio; errno: 5; imm: off; once: off; write -b
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: write_aio; errno: 28; imm: off; once: on; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: write_aio; errno: 28; imm: off; once: on; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: write_aio; errno: 28; imm: off; once: off; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: write_aio; errno: 28; imm: off; once: off; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_load; errno: 5; imm: off; once: on; write 
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_load; errno: 5; imm: off; once: on; write -b
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_load; errno: 5; imm: off; once: off; write 
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_load; errno: 5; imm: off; once: off; write -b
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_load; errno: 28; imm: off; once: on; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_load; errno: 28; imm: off; once: on; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_load; errno: 28; imm: off; once: off; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_load; errno: 28; imm: off; once: off; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_update_part; errno: 5; imm: off; once: on; write 
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_update_part; errno: 5; imm: off; once: on; write -b
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_update_part; errno: 5; imm: off; once: off; write 
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_update_part; errno: 5; imm: off; once: off; write -b
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_update_part; errno: 28; imm: off; once: on; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_update_part; errno: 28; imm: off; once: on; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_update_part; errno: 28; imm: off; once: off; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_update_part; errno: 28; imm: off; once: off; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc; errno: 5; imm: off; once: on; write 
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc; errno: 5; imm: off; once: on; write -b
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc; errno: 5; imm: off; once: off; write 
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc; errno: 5; imm: off; once: off; write -b
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc; errno: 28; imm: off; once: on; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc; errno: 28; imm: off; once: on; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc; errno: 28; imm: off; once: off; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc; errno: 28; imm: off; once: off; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: cluster_alloc; errno: 5; imm: off; once: on; write 
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: cluster_alloc; errno: 5; imm: off; once: on; write -b
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: cluster_alloc; errno: 5; imm: off; once: off; write 
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: cluster_alloc; errno: 5; imm: off; once: off; write -b
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: cluster_alloc; errno: 28; imm: off; once: on; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: cluster_alloc; errno: 28; imm: off; once: on; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: cluster_alloc; errno: 28; imm: off; once: off; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: cluster_alloc; errno: 28; imm: off; once: off; write -b
+write failed: No space left on device
+No errors were found on the image.
+
+=== Refcout table growth tests ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.hookup; errno: 28; imm: off; once: on; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.hookup; errno: 28; imm: off; once: on; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.hookup; errno: 28; imm: off; once: off; write 
+write failed: No space left on device
+
+55 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.hookup; errno: 28; imm: off; once: off; write -b
+write failed: No space left on device
+
+251 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.write; errno: 28; imm: off; once: on; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.write; errno: 28; imm: off; once: on; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.write; errno: 28; imm: off; once: off; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.write; errno: 28; imm: off; once: off; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.write_blocks; errno: 28; imm: off; once: on; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.write_blocks; errno: 28; imm: off; once: on; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.write_blocks; errno: 28; imm: off; once: off; write 
+write failed: No space left on device
+
+10 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.write_blocks; errno: 28; imm: off; once: off; write -b
+write failed: No space left on device
+
+23 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.write_table; errno: 28; imm: off; once: on; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.write_table; errno: 28; imm: off; once: on; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.write_table; errno: 28; imm: off; once: off; write 
+write failed: No space left on device
+
+10 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.write_table; errno: 28; imm: off; once: off; write -b
+write failed: No space left on device
+
+23 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.switch_table; errno: 28; imm: off; once: on; write 
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.switch_table; errno: 28; imm: off; once: on; write -b
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.switch_table; errno: 28; imm: off; once: off; write 
+write failed: No space left on device
+
+10 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: refblock_alloc.switch_table; errno: 28; imm: off; once: off; write -b
+write failed: No space left on device
+
+23 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+
+=== L1 growth tests ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_grow.alloc_table; errno: 5; imm: off; once: on
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_grow.alloc_table; errno: 5; imm: off; once: off
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_grow.alloc_table; errno: 28; imm: off; once: on
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_grow.alloc_table; errno: 28; imm: off; once: off
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_grow.write_table; errno: 5; imm: off; once: on
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_grow.write_table; errno: 5; imm: off; once: off
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_grow.write_table; errno: 28; imm: off; once: on
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_grow.write_table; errno: 28; imm: off; once: off
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_grow.activate_table; errno: 5; imm: off; once: on
+write failed: Input/output error
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_grow.activate_table; errno: 5; imm: off; once: off
+write failed: Input/output error
+
+96 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_grow.activate_table; errno: 28; imm: off; once: on
+write failed: No space left on device
+No errors were found on the image.
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 
+
+Event: l1_grow.activate_table; errno: 28; imm: off; once: off
+write failed: No space left on device
+
+96 leaked clusters were found on the image.
+This means waste of disk space, but no harm to data.
+*** done
diff --git a/tests/qemu-iotests/039.out b/tests/qemu-iotests/039.out
index cb510d6716..077fa64cbf 100644
--- a/tests/qemu-iotests/039.out
+++ b/tests/qemu-iotests/039.out
@@ -12,8 +12,8 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 incompatible_features     0x1
-ERROR OFLAG_COPIED: offset=8000000000050000 refcount=0
 ERROR cluster 5 refcount=0 reference=1
+ERROR OFLAG_COPIED data cluster: l2_entry=8000000000050000 refcount=0
 
 2 errors were found on the image.
 Data may be corrupted, or further writes to the image may corrupt it.
@@ -24,7 +24,6 @@ read 512/512 bytes at offset 0
 incompatible_features     0x1
 
 == Repairing the image file must succeed ==
-ERROR OFLAG_COPIED: offset=8000000000050000 refcount=0
 Repairing cluster 5 refcount=0 reference=1
 The following inconsistencies were found and repaired:
 
@@ -44,7 +43,6 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 incompatible_features     0x1
-ERROR OFLAG_COPIED: offset=8000000000050000 refcount=0
 Repairing cluster 5 refcount=0 reference=1
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
diff --git a/tests/qemu-iotests/063 b/tests/qemu-iotests/063
new file mode 100755
index 0000000000..de0cbbd8bb
--- /dev/null
+++ b/tests/qemu-iotests/063
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+# test of qemu-img convert -n - convert without creation
+#
+# Copyright (C) 2009 Red Hat, Inc.
+# Copyright (C) 2013 Alex Bligh (alex@alex.org.uk)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=alex@alex.org.uk
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+	rm -f $TEST_IMG.orig $TEST_IMG.raw $TEST_IMG.raw2
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.pattern
+
+_supported_fmt qcow qcow2 vmdk qed raw
+_supported_proto generic
+_supported_os Linux
+
+_make_test_img 4M
+
+echo "== Testing conversion with -n fails with no target file =="
+# check .orig file does not exist
+rm -f $TEST_IMG.orig
+if $QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n $TEST_IMG $TEST_IMG.orig >/dev/null 2>&1; then
+    exit 1
+fi
+
+echo "== Testing conversion with -n succeeds with a target file =="
+rm -f $TEST_IMG.orig
+cp $TEST_IMG $TEST_IMG.orig
+if ! $QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n $TEST_IMG $TEST_IMG.orig ; then
+    exit 1
+fi
+
+echo "== Testing conversion to raw is the same after conversion with -n =="
+# compare the raw files
+if ! $QEMU_IMG convert -f $IMGFMT -O raw $TEST_IMG $TEST_IMG.raw1 ; then
+    exit 1
+fi
+
+if ! $QEMU_IMG convert -f $IMGFMT -O raw $TEST_IMG.orig $TEST_IMG.raw2 ; then
+    exit 1
+fi
+
+if ! cmp $TEST_IMG.raw1 $TEST_IMG.raw2 ; then
+    exit 1
+fi
+
+echo "== Testing conversion back to original format =="
+if ! $QEMU_IMG convert -f raw -O $IMGFMT -n $TEST_IMG.raw2 $TEST_IMG ; then
+    exit 1
+fi
+_check_test_img
+
+echo "== Testing conversion to a smaller file fails =="
+rm -f $TEST_IMG.orig
+mv $TEST_IMG $TEST_IMG.orig
+_make_test_img 2M
+if $QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n $TEST_IMG.orig $TEST_IMG >/dev/null 2>&1; then
+    exit 1
+fi
+
+rm -f $TEST_IMG.orig $TEST_IMG.raw $TEST_IMG.raw2
+
+echo "*** done"
+rm -f $seq.full
+status=0
+exit 0
diff --git a/tests/qemu-iotests/063.out b/tests/qemu-iotests/063.out
new file mode 100644
index 0000000000..de1c99afd8
--- /dev/null
+++ b/tests/qemu-iotests/063.out
@@ -0,0 +1,10 @@
+QA output created by 063
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4194304
+== Testing conversion with -n fails with no target file ==
+== Testing conversion with -n succeeds with a target file ==
+== Testing conversion to raw is the same after conversion with -n ==
+== Testing conversion back to original format ==
+No errors were found on the image.
+== Testing conversion to a smaller file fails ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=2097152
+*** done
diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check
index 74628ae637..4ecf497d8e 100755
--- a/tests/qemu-iotests/check
+++ b/tests/qemu-iotests/check
@@ -78,50 +78,50 @@ _wrapup()
 
     if $showme
     then
-	:
+        :
     elif $needwrap
     then
-	if [ -f check.time -a -f $tmp.time ]
-	then
-	    cat check.time $tmp.time \
-	    | $AWK_PROG '
-	{ t[$1] = $2 }
-END	{ if (NR > 0) {
-	    for (i in t) print i " " t[i]
-	  }
-	}' \
-	    | sort -n >$tmp.out
-	    mv $tmp.out check.time
-	fi
+        if [ -f check.time -a -f $tmp.time ]
+        then
+            cat check.time $tmp.time \
+            | $AWK_PROG '
+        { t[$1] = $2 }
+END        { if (NR > 0) {
+            for (i in t) print i " " t[i]
+          }
+        }' \
+            | sort -n >$tmp.out
+            mv $tmp.out check.time
+        fi
 
-	if [ -f $tmp.expunged ]
-	then
-	    notrun=`wc -l <$tmp.expunged | sed -e 's/  *//g'`
-	    try=`expr $try - $notrun`
-	    list=`echo "$list" | sed -f $tmp.expunged`
-	fi
+        if [ -f $tmp.expunged ]
+        then
+            notrun=`wc -l <$tmp.expunged | sed -e 's/  *//g'`
+            try=`expr $try - $notrun`
+            list=`echo "$list" | sed -f $tmp.expunged`
+        fi
 
-	echo "" >>check.log
-	date >>check.log
-	echo $list | fmt | sed -e 's/^/    /' >>check.log
-	$interrupt && echo "Interrupted!" >>check.log
-        
-	if [ ! -z "$notrun" ]
-	then
-	    echo "Not run:$notrun"
-	    echo "Not run:$notrun" >>check.log
-	fi
+        echo "" >>check.log
+        date >>check.log
+        echo $list | fmt | sed -e 's/^/    /' >>check.log
+        $interrupt && echo "Interrupted!" >>check.log
+
+        if [ ! -z "$notrun" ]
+        then
+            echo "Not run:$notrun"
+            echo "Not run:$notrun" >>check.log
+        fi
         if [ ! -z "$n_bad" -a $n_bad != 0 ]
-	then
-	    echo "Failures:$bad"
-	    echo "Failed $n_bad of $try tests"
-	    echo "Failures:$bad" | fmt >>check.log
-	    echo "Failed $n_bad of $try tests" >>check.log
-	else
-	    echo "Passed all $try tests"
-	    echo "Passed all $try tests" >>check.log
-	fi
-	needwrap=false
+        then
+            echo "Failures:$bad"
+            echo "Failed $n_bad of $try tests"
+            echo "Failures:$bad" | fmt >>check.log
+            echo "Failed $n_bad of $try tests" >>check.log
+        else
+            echo "Passed all $try tests"
+            echo "Passed all $try tests" >>check.log
+        fi
+        needwrap=false
     fi
 
     rm -f /tmp/*.out /tmp/*.err /tmp/*.time
@@ -185,82 +185,88 @@ do
 
     if $showme
     then
-	echo
-	continue
-    elif [ -f expunged ] && $expunge && egrep "^$seq([ 	]|\$)" expunged >/dev/null
+        echo
+        continue
+    elif [ -f expunged ] && $expunge && egrep "^$seq([         ]|\$)" expunged >/dev/null
     then
-	echo " - expunged"
-	rm -f $seq.out.bad
-	echo "/^$seq\$/d" >>$tmp.expunged
+        echo " - expunged"
+        rm -f $seq.out.bad
+        echo "/^$seq\$/d" >>$tmp.expunged
     elif [ ! -f $seq ]
     then
-	echo " - no such test?"
-	echo "/^$seq\$/d" >>$tmp.expunged
+        echo " - no such test?"
+        echo "/^$seq\$/d" >>$tmp.expunged
     else
-	# really going to try and run this one
-	#
-	rm -f $seq.out.bad
-	lasttime=`sed -n -e "/^$seq /s/.* //p" <check.time`
-	if [ "X$lasttime" != X ]; then
-		echo -n " ${lasttime}s ..."
-	else
-		echo -n "	"	# prettier output with timestamps.
-	fi
-	rm -f core $seq.notrun
+        # really going to try and run this one
+        #
+        rm -f $seq.out.bad
+        lasttime=`sed -n -e "/^$seq /s/.* //p" <check.time`
+        if [ "X$lasttime" != X ]; then
+                echo -n " ${lasttime}s ..."
+        else
+                echo -n "        "        # prettier output with timestamps.
+        fi
+        rm -f core $seq.notrun
 
-	# for hangcheck ...
-	echo "$seq" >/tmp/check.sts
+        # for hangcheck ...
+        echo "$seq" >/tmp/check.sts
 
-	start=`_wallclock`
-	$timestamp && echo -n "	["`date "+%T"`"]"
-	[ ! -x $seq ] && chmod u+x $seq # ensure we can run it
-	MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(($RANDOM % 255 + 1))} \
-		./$seq >$tmp.out 2>&1
-	sts=$?
-	$timestamp && _timestamp
-	stop=`_wallclock`
+        start=`_wallclock`
+        $timestamp && echo -n "        ["`date "+%T"`"]"
+        [ ! -x $seq ] && chmod u+x $seq # ensure we can run it
+        MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(($RANDOM % 255 + 1))} \
+                ./$seq >$tmp.out 2>&1
+        sts=$?
+        $timestamp && _timestamp
+        stop=`_wallclock`
 
-	if [ -f core ]
-	then
-	    echo -n " [dumped core]"
-	    mv core $seq.core
-	    err=true
-	fi
+        if [ -f core ]
+        then
+            echo -n " [dumped core]"
+            mv core $seq.core
+            err=true
+        fi
 
-	if [ -f $seq.notrun ]
-	then
-	    $timestamp || echo -n " [not run] "
-	    $timestamp && echo " [not run]" && echo -n "	$seq -- "
-	    cat $seq.notrun
-	    notrun="$notrun $seq"
-	else
-	    if [ $sts -ne 0 ]
-	    then
-		echo -n " [failed, exit status $sts]"
-		err=true
-	    fi
-	    if [ ! -f $seq.out ]
-	    then
-		echo " - no qualified output"
-		err=true
-	    else
-		if diff -w $seq.out $tmp.out >/dev/null 2>&1
-		then
-		    echo ""
-		    if $err
-		    then
-			:
-		    else
-			echo "$seq `expr $stop - $start`" >>$tmp.time
-		    fi
-		else
-		    echo " - output mismatch (see $seq.out.bad)"
-		    mv $tmp.out $seq.out.bad
-		    $diff -w $seq.out $seq.out.bad
-		    err=true
-		fi
-	    fi
-	fi
+        if [ -f $seq.notrun ]
+        then
+            $timestamp || echo -n " [not run] "
+            $timestamp && echo " [not run]" && echo -n "        $seq -- "
+            cat $seq.notrun
+            notrun="$notrun $seq"
+        else
+            if [ $sts -ne 0 ]
+            then
+                echo -n " [failed, exit status $sts]"
+                err=true
+            fi
+
+            reference=$seq.out
+            if (echo $QEMU_IO_OPTIONS | grep -s -- '--nocache' > /dev/null); then
+                [ -f $seq.out.nocache ] && reference=$seq.out.nocache
+            fi
+
+            if [ ! -f $reference ]
+            then
+                echo " - no qualified output"
+                err=true
+            else
+                if diff -w $reference $tmp.out >/dev/null 2>&1
+                then
+                    echo ""
+                    if $err
+                    then
+                        :
+                    else
+                        echo "$seq `expr $stop - $start`" >>$tmp.time
+                    fi
+                else
+                    echo " - output mismatch (see $seq.out.bad)"
+                    mv $tmp.out $seq.out.bad
+                    $diff -w $reference $seq.out.bad
+                    err=true
+                fi
+            fi
+        fi
 
     fi
 
@@ -268,12 +274,12 @@ do
     #
     if $err
     then
-	bad="$bad $seq"
-	n_bad=`expr $n_bad + 1`
-	quick=false
+        bad="$bad $seq"
+        n_bad=`expr $n_bad + 1`
+        quick=false
     fi
     [ -f $seq.notrun ] || try=`expr $try + 1`
-    
+
     seq="after_$seq"
 done
 
diff --git a/tests/qemu-iotests/common b/tests/qemu-iotests/common
index 6826ea72fe..fecaf85074 100644
--- a/tests/qemu-iotests/common
+++ b/tests/qemu-iotests/common
@@ -54,58 +54,58 @@ do
 
     if $group
     then
-	# arg after -g
-	group_list=`sed -n <group -e 's/$/ /' -e "/^[0-9][0-9][0-9].* $r /"'{
+        # arg after -g
+        group_list=`sed -n <group -e 's/$/ /' -e "/^[0-9][0-9][0-9].* $r /"'{
 s/ .*//p
 }'`
-	if [ -z "$group_list" ]
-	then
-	    echo "Group \"$r\" is empty or not defined?"
-	    exit 1
-	fi
-	[ ! -s $tmp.list ] && touch $tmp.list
-	for t in $group_list
-	do
-	    if grep -s "^$t\$" $tmp.list >/dev/null
-	    then
-		:
-	    else
-		echo "$t" >>$tmp.list
-	    fi
-	done
-	group=false
-	continue
+        if [ -z "$group_list" ]
+        then
+            echo "Group \"$r\" is empty or not defined?"
+            exit 1
+        fi
+        [ ! -s $tmp.list ] && touch $tmp.list
+        for t in $group_list
+        do
+            if grep -s "^$t\$" $tmp.list >/dev/null
+            then
+                :
+            else
+                echo "$t" >>$tmp.list
+            fi
+        done
+        group=false
+        continue
 
     elif $xgroup
     then
-	# arg after -x
-	[ ! -s $tmp.list ] && ls [0-9][0-9][0-9] [0-9][0-9][0-9][0-9] >$tmp.list 2>/dev/null
-	group_list=`sed -n <group -e 's/$/ /' -e "/^[0-9][0-9][0-9].* $r /"'{
+        # arg after -x
+        [ ! -s $tmp.list ] && ls [0-9][0-9][0-9] [0-9][0-9][0-9][0-9] >$tmp.list 2>/dev/null
+        group_list=`sed -n <group -e 's/$/ /' -e "/^[0-9][0-9][0-9].* $r /"'{
 s/ .*//p
 }'`
-	if [ -z "$group_list" ]
-	then
-	    echo "Group \"$r\" is empty or not defined?"
-	    exit 1
-	fi
-	numsed=0
-	rm -f $tmp.sed
-	for t in $group_list
-	do
-	    if [ $numsed -gt 100 ]
-	    then
-		sed -f $tmp.sed <$tmp.list >$tmp.tmp
-		mv $tmp.tmp $tmp.list
-		numsed=0
-		rm -f $tmp.sed
-	    fi
-	    echo "/^$t\$/d" >>$tmp.sed
-	    numsed=`expr $numsed + 1`
-	done
-	sed -f $tmp.sed <$tmp.list >$tmp.tmp
-	mv $tmp.tmp $tmp.list
-	xgroup=false
-	continue
+        if [ -z "$group_list" ]
+        then
+            echo "Group \"$r\" is empty or not defined?"
+            exit 1
+        fi
+        numsed=0
+        rm -f $tmp.sed
+        for t in $group_list
+        do
+            if [ $numsed -gt 100 ]
+            then
+                sed -f $tmp.sed <$tmp.list >$tmp.tmp
+                mv $tmp.tmp $tmp.list
+                numsed=0
+                rm -f $tmp.sed
+            fi
+            echo "/^$t\$/d" >>$tmp.sed
+            numsed=`expr $numsed + 1`
+        done
+        sed -f $tmp.sed <$tmp.list >$tmp.tmp
+        mv $tmp.tmp $tmp.list
+        xgroup=false
+        continue
 
     elif $imgopts
     then
@@ -119,11 +119,11 @@ s/ .*//p
     case "$r"
     in
 
-	-\? | -h | --help)	# usage
-	    echo "Usage: $0 [options] [testlist]"'
+        -\? | -h | --help)        # usage
+            echo "Usage: $0 [options] [testlist]"'
 
 common options
-    -v			verbose
+    -v                        verbose
 
 check options
     -raw                test raw (default)
@@ -138,162 +138,162 @@ check options
     -sheepdog           test sheepdog
     -nbd                test nbd
     -ssh                test ssh
-    -xdiff		graphical mode diff
-    -nocache		use O_DIRECT on backing file
-    -misalign		misalign memory allocations
-    -n			show me, do not run tests
+    -xdiff                graphical mode diff
+    -nocache                use O_DIRECT on backing file
+    -misalign                misalign memory allocations
+    -n                        show me, do not run tests
     -o options          -o options to pass to qemu-img create/convert
-    -T			output timestamps
-    -r 			randomize test order
-    
+    -T                        output timestamps
+    -r                         randomize test order
+
 testlist options
-    -g group[,group...]	include tests from these groups
-    -x group[,group...]	exclude tests from these groups
-    NNN			include test NNN
-    NNN-NNN		include test range (eg. 012-021)
+    -g group[,group...]        include tests from these groups
+    -x group[,group...]        exclude tests from these groups
+    NNN                        include test NNN
+    NNN-NNN                include test range (eg. 012-021)
 '
-	    exit 0
-	    ;;
+            exit 0
+            ;;
 
-	-raw)
-	    IMGFMT=raw
-	    xpand=false
-	    ;;
+        -raw)
+            IMGFMT=raw
+            xpand=false
+            ;;
 
-	-cow)
-	    IMGFMT=cow
-	    xpand=false
-	    ;;
+        -cow)
+            IMGFMT=cow
+            xpand=false
+            ;;
 
-	-qcow)
-	    IMGFMT=qcow
-	    xpand=false
-	    ;;
+        -qcow)
+            IMGFMT=qcow
+            xpand=false
+            ;;
 
-	-qcow2)
-	    IMGFMT=qcow2
-	    xpand=false
-	    ;;
+        -qcow2)
+            IMGFMT=qcow2
+            xpand=false
+            ;;
 
-	-qed)
-	    IMGFMT=qed
-	    xpand=false
-	    ;;
+        -qed)
+            IMGFMT=qed
+            xpand=false
+            ;;
 
-	-vdi)
-	    IMGFMT=vdi
-	    xpand=false
-	    ;;
+        -vdi)
+            IMGFMT=vdi
+            xpand=false
+            ;;
 
-	-vmdk)
-	    IMGFMT=vmdk
-	    xpand=false
-	    ;;
+        -vmdk)
+            IMGFMT=vmdk
+            xpand=false
+            ;;
 
-	-vpc)
-	    IMGFMT=vpc
-	    xpand=false
-	    ;;
+        -vpc)
+            IMGFMT=vpc
+            xpand=false
+            ;;
 
-	-rbd)
-	    IMGPROTO=rbd
-	    xpand=false
-	    ;;
-	-sheepdog)
-	    IMGPROTO=sheepdog
-	    xpand=false
-	    ;;
-	-nbd)
-	    IMGPROTO=nbd
-	    xpand=false
-	    ;;
+        -rbd)
+            IMGPROTO=rbd
+            xpand=false
+            ;;
+        -sheepdog)
+            IMGPROTO=sheepdog
+            xpand=false
+            ;;
+        -nbd)
+            IMGPROTO=nbd
+            xpand=false
+            ;;
         -ssh)
             IMGPROTO=ssh
             xpand=false
             ;;
-	-nocache)
-	    QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS --nocache"
-	    xpand=false
-	    ;;
+        -nocache)
+            QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS --nocache"
+            xpand=false
+            ;;
 
-	-misalign)
-	    QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS --misalign"
-	    xpand=false
-	    ;;
+        -misalign)
+            QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS --misalign"
+            xpand=false
+            ;;
 
     -valgrind)
         valgrind=true
-	    xpand=false
+            xpand=false
         ;;
 
-	-g)	# -g group ... pick from group file
-	    group=true
-	    xpand=false
-	    ;;
+        -g)        # -g group ... pick from group file
+            group=true
+            xpand=false
+            ;;
 
-	-xdiff)	# graphical diff mode
-	    xpand=false
+        -xdiff)        # graphical diff mode
+            xpand=false
 
-	    if [ ! -z "$DISPLAY" ]
-	    then
-		which xdiff >/dev/null 2>&1 && diff=xdiff
-		which gdiff >/dev/null 2>&1 && diff=gdiff
-		which tkdiff >/dev/null 2>&1 && diff=tkdiff
-		which xxdiff >/dev/null 2>&1 && diff=xxdiff
-	    fi
-	    ;;
+            if [ ! -z "$DISPLAY" ]
+            then
+                which xdiff >/dev/null 2>&1 && diff=xdiff
+                which gdiff >/dev/null 2>&1 && diff=gdiff
+                which tkdiff >/dev/null 2>&1 && diff=tkdiff
+                which xxdiff >/dev/null 2>&1 && diff=xxdiff
+            fi
+            ;;
 
-	-n)	# show me, don't do it
-	    showme=true
-	    xpand=false
-	    ;;
+        -n)        # show me, don't do it
+            showme=true
+            xpand=false
+            ;;
         -o)
             imgopts=true
             xpand=false
             ;;
-        -r)	# randomize test order
-	    randomize=true
-	    xpand=false
-	    ;;
+        -r)        # randomize test order
+            randomize=true
+            xpand=false
+            ;;
 
-	-T)	# turn on timestamp output
-	    timestamp=true
-	    xpand=false
-	    ;;
+        -T)        # turn on timestamp output
+            timestamp=true
+            xpand=false
+            ;;
 
-	-v)
-	    verbose=true
-	    xpand=false
-	    ;;
-	-x)	# -x group ... exclude from group file
-	    xgroup=true
-	    xpand=false
-	    ;;
-	'[0-9][0-9][0-9] [0-9][0-9][0-9][0-9]')
-	    echo "No tests?"
-	    status=1
-	    exit $status
-	    ;;
+        -v)
+            verbose=true
+            xpand=false
+            ;;
+        -x)        # -x group ... exclude from group file
+            xgroup=true
+            xpand=false
+            ;;
+        '[0-9][0-9][0-9] [0-9][0-9][0-9][0-9]')
+            echo "No tests?"
+            status=1
+            exit $status
+            ;;
 
-	[0-9]*-[0-9]*)
-	    eval `echo $r | sed -e 's/^/start=/' -e 's/-/ end=/'`
-	    ;;
+        [0-9]*-[0-9]*)
+            eval `echo $r | sed -e 's/^/start=/' -e 's/-/ end=/'`
+            ;;
 
-	[0-9]*-)
-	    eval `echo $r | sed -e 's/^/start=/' -e 's/-//'`
-	    end=`echo [0-9][0-9][0-9] [0-9][0-9][0-9][0-9] | sed -e 's/\[0-9]//g' -e 's/  *$//' -e 's/.* //'`
-	    if [ -z "$end" ]
-	    then
-		echo "No tests in range \"$r\"?"
-		status=1
-		exit $status
-	    fi
-	    ;;
+        [0-9]*-)
+            eval `echo $r | sed -e 's/^/start=/' -e 's/-//'`
+            end=`echo [0-9][0-9][0-9] [0-9][0-9][0-9][0-9] | sed -e 's/\[0-9]//g' -e 's/  *$//' -e 's/.* //'`
+            if [ -z "$end" ]
+            then
+                echo "No tests in range \"$r\"?"
+                status=1
+                exit $status
+            fi
+            ;;
 
-	*)
-	    start=$r
-	    end=$r
-	    ;;
+        *)
+            start=$r
+            end=$r
+            ;;
 
     esac
 
@@ -303,26 +303,26 @@ testlist options
 
     if $xpand
     then
-	have_test_arg=true
-	$AWK_PROG </dev/null '
-BEGIN	{ for (t='$start'; t<='$end'; t++) printf "%03d\n",t }' \
-	| while read id
-	do
-	    if grep -s "^$id " group >/dev/null
-	    then
-		# in group file ... OK
-		echo $id >>$tmp.list
-	    else
-		if [ -f expunged ] && $expunge && egrep "^$id([ 	]|\$)" expunged >/dev/null
-		then
-		    # expunged ... will be reported, but not run, later
-		    echo $id >>$tmp.list
-		else
-		    # oops
-		    echo "$id - unknown test, ignored"
-		fi
-	    fi
-	done
+        have_test_arg=true
+        $AWK_PROG </dev/null '
+BEGIN        { for (t='$start'; t<='$end'; t++) printf "%03d\n",t }' \
+        | while read id
+        do
+            if grep -s "^$id " group >/dev/null
+            then
+                # in group file ... OK
+                echo $id >>$tmp.list
+            else
+                if [ -f expunged ] && $expunge && egrep "^$id([         ]|\$)" expunged >/dev/null
+                then
+                    # expunged ... will be reported, but not run, later
+                    echo $id >>$tmp.list
+                else
+                    # oops
+                    echo "$id - unknown test, ignored"
+                fi
+            fi
+        done
     fi
 
 done
@@ -337,11 +337,11 @@ then
 else
     if $have_test_arg
     then
-	# had test numbers, but none in group file ... do nothing
-	touch $tmp.list
+        # had test numbers, but none in group file ... do nothing
+        touch $tmp.list
     else
-	# no test numbers, do everything from group file
-	sed -n -e '/^[0-9][0-9][0-9]*/s/[ 	].*//p' <group >$tmp.list
+        # no test numbers, do everything from group file
+        sed -n -e '/^[0-9][0-9][0-9]*/s/[         ].*//p' <group >$tmp.list
     fi
 fi
 
diff --git a/tests/qemu-iotests/common.config b/tests/qemu-iotests/common.config
index 08a3f100b8..d794e624e7 100644
--- a/tests/qemu-iotests/common.config
+++ b/tests/qemu-iotests/common.config
@@ -19,7 +19,7 @@
 # setup and check for config parameters, and in particular
 #
 # EMAIL -           email of the script runner.
-# TEST_DIR -        scratch test directory 
+# TEST_DIR -        scratch test directory
 #
 # - These can be added to $HOST_CONFIG_DIR (witch default to ./config)
 #   below or a separate local configuration file can be used (using
@@ -111,11 +111,11 @@ export QEMU_NBD=$QEMU_NBD_PROG
 [ -f /etc/qemu-iotest.config ]       && . /etc/qemu-iotest.config
 
 if [ -z "$TEST_DIR" ]; then
-	TEST_DIR=`pwd`/scratch
+        TEST_DIR=`pwd`/scratch
 fi
 
 if [ ! -e "$TEST_DIR" ]; then
-	mkdir "$TEST_DIR"
+        mkdir "$TEST_DIR"
 fi
 
 if [ ! -d "$TEST_DIR" ]; then
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index 97a31ff0b1..5dfda63e59 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -25,19 +25,19 @@
 # Outputs suitable message to stdout if it's not in range.
 #
 # A verbose option, -v, may be used as the LAST argument
-# 
-# e.g. 
+#
+# e.g.
 # foo: 0.0298 = 0.03 +/- 5%
-# _within_tolerance "foo" 0.0298 0.03 5%  
-# 
+# _within_tolerance "foo" 0.0298 0.03 5%
+#
 # foo: 0.0298 = 0.03 +/- 0.01
 # _within_tolerance "foo" 0.0298 0.03 0.01
 #
 # foo: 0.0298 = 0.03 -0.01 +0.002
 # _within_tolerance "foo" 0.0298 0.03 0.01 0.002
 #
-# foo: verbose output of 0.0298 = 0.03 +/- 5% 
-# _within_tolerance "foo" 0.0298 0.03 5% -v 
+# foo: verbose output of 0.0298 = 0.03 +/- 5%
+# _within_tolerance "foo" 0.0298 0.03 5% -v
 _within_tolerance()
 {
   _name=$1
@@ -51,10 +51,10 @@ _within_tolerance()
   # maxtol arg is optional
   # verbose arg is optional
   if [ $# -ge 5 ]
-  then 
+  then
      if [ "$5" = "-v" ]
      then
-	_verbose=1
+        _verbose=1
      else
         _maxtol=$5
      fi
@@ -65,18 +65,18 @@ _within_tolerance()
   fi
 
   # find min with or without %
-  _mintolerance=`echo $_mintol | sed -e 's/%//'` 
+  _mintolerance=`echo $_mintol | sed -e 's/%//'`
   if [ $_mintol = $_mintolerance ]
-  then 
+  then
       _min=`echo "scale=5; $_correct_val-$_mintolerance" | bc`
   else
       _min=`echo "scale=5; $_correct_val-$_mintolerance*0.01*$_correct_val" | bc`
   fi
 
   # find max with or without %
-  _maxtolerance=`echo $_maxtol | sed -e 's/%//'` 
+  _maxtolerance=`echo $_maxtol | sed -e 's/%//'`
   if [ $_maxtol = $_maxtolerance ]
-  then 
+  then
       _max=`echo "scale=5; $_correct_val+$_maxtolerance" | bc`
   else
       _max=`echo "scale=5; $_correct_val+$_maxtolerance*0.01*$_correct_val" | bc`
@@ -88,7 +88,7 @@ _within_tolerance()
   cat <<EOF >$tmp.bc.1
 scale=5;
 if ($_min <= $_given_val) 1;
-if ($_min > $_given_val) 0; 
+if ($_min > $_given_val) 0;
 EOF
 
   cat <<EOF >$tmp.bc.2
@@ -102,21 +102,21 @@ EOF
 
   rm -f $tmp.bc.[12]
 
-  _in_range=`expr $_above_min \& $_below_max` 
+  _in_range=`expr $_above_min \& $_below_max`
 
   # fix up min, max precision for output
   # can vary for 5.3, 6.2
   _min=`echo $_min | sed -e 's/0*$//'` # get rid of trailling zeroes
   _max=`echo $_max | sed -e 's/0*$//'` # get rid of trailling zeroes
 
-  if [ $_in_range -eq 1 ] 
+  if [ $_in_range -eq 1 ]
   then
-	[ $_verbose -eq 1 ] && echo $_name is in range
-	return 0
+        [ $_verbose -eq 1 ] && echo $_name is in range
+        return 0
   else
-	[ $_verbose -eq 1 ] && echo $_name has value of $_given_val
-	[ $_verbose -eq 1 ] && echo $_name is NOT in range $_min .. $_max	
-	return 1
+        [ $_verbose -eq 1 ] && echo $_name has value of $_given_val
+        [ $_verbose -eq 1 ] && echo $_name is NOT in range $_min .. $_max
+        return 1
   fi
 }
 
@@ -125,7 +125,7 @@ EOF
 _filter_date()
 {
     sed \
-	-e 's/[A-Z][a-z][a-z] [A-z][a-z][a-z]  *[0-9][0-9]* [0-9][0-9]:[0-9][0-9]:[0-9][0-9] [0-9][0-9][0-9][0-9]$/DATE/'
+        -e 's/[A-Z][a-z][a-z] [A-z][a-z][a-z]  *[0-9][0-9]* [0-9][0-9]:[0-9][0-9]:[0-9][0-9] [0-9][0-9][0-9][0-9]$/DATE/'
 }
 
 # replace occurrences of the actual TEST_DIR value with TEST_DIR
diff --git a/tests/qemu-iotests/common.pattern b/tests/qemu-iotests/common.pattern
index 85a40eecc0..00e0f605fd 100644
--- a/tests/qemu-iotests/common.pattern
+++ b/tests/qemu-iotests/common.pattern
@@ -106,8 +106,8 @@ function io_test2() {
     local num=$3
 
     # Pattern (repeat after 9 clusters):
-    #	used - used - free - used - compressed - compressed -
-    #	free - free - compressed
+    #        used - used - free - used - compressed - compressed -
+    #        free - free - compressed
 
     # Write the clusters to be compressed
     echo === Clusters to be compressed [1]
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 5e077c3573..88fecf7870 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -20,17 +20,17 @@
 dd()
 {
    if [ "$HOSTOS" == "Linux" ]
-   then	
-	command dd --help | grep noxfer > /dev/null 2>&1
-	
-	if [ "$?" -eq 0 ]
-	    then
-		command dd status=noxfer $@
-	    else
-		command dd $@
-    	fi
+   then
+        command dd --help | grep noxfer > /dev/null 2>&1
+
+        if [ "$?" -eq 0 ]
+            then
+                command dd status=noxfer $@
+            else
+                command dd $@
+            fi
    else
-	command dd $@
+        command dd $@
    fi
 }
 
@@ -193,8 +193,8 @@ _get_pids_by_name()
 {
     if [ $# -ne 1 ]
     then
-	echo "Usage: _get_pids_by_name process-name" 1>&2
-	exit 1
+        echo "Usage: _get_pids_by_name process-name" 1>&2
+        exit 1
     fi
 
     # Algorithm ... all ps(1) variants have a time of the form MM:SS or
@@ -206,12 +206,12 @@ _get_pids_by_name()
 
     ps $PS_ALL_FLAGS \
     | sed -n \
-	-e 's/$/ /' \
-	-e 's/[ 	][ 	]*/ /g' \
-	-e 's/^ //' \
-	-e 's/^[^ ]* //' \
-	-e "/[0-9]:[0-9][0-9]  *[^ ]*\/$1 /s/ .*//p" \
-	-e "/[0-9]:[0-9][0-9]  *$1 /s/ .*//p"
+        -e 's/$/ /' \
+        -e 's/[         ][         ]*/ /g' \
+        -e 's/^ //' \
+        -e 's/^[^ ]* //' \
+        -e "/[0-9]:[0-9][0-9]  *[^ ]*\/$1 /s/ .*//p" \
+        -e "/[0-9]:[0-9][0-9]  *$1 /s/ .*//p"
 }
 
 # fqdn for localhost
@@ -229,8 +229,8 @@ _need_to_be_root()
     id=`id | $SED_PROG -e 's/(.*//' -e 's/.*=//'`
     if [ "$id" -ne 0 ]
     then
-	echo "Arrgh ... you need to be root (not uid=$id) to run this test"
-	exit 1
+        echo "Arrgh ... you need to be root (not uid=$id) to run this test"
+        exit 1
     fi
 }
 
@@ -248,33 +248,33 @@ _need_to_be_root()
 _do()
 {
     if [ $# -eq 1 ]; then
-	_cmd=$1
+        _cmd=$1
     elif [ $# -eq 2 ]; then
-	_note=$1
-	_cmd=$2
-	echo -n "$_note... "
+        _note=$1
+        _cmd=$2
+        echo -n "$_note... "
     else
-	echo "Usage: _do [note] cmd" 1>&2
-	status=1; exit
+        echo "Usage: _do [note] cmd" 1>&2
+        status=1; exit
     fi
 
     (eval "echo '---' \"$_cmd\"") >>$here/$seq.full
     (eval "$_cmd") >$tmp._out 2>&1; ret=$?
     cat $tmp._out >>$here/$seq.full
     if [ $# -eq 2 ]; then
-	if [ $ret -eq 0 ]; then
-	    echo "done"
-	else
-	    echo "fail"
-	fi
+        if [ $ret -eq 0 ]; then
+            echo "done"
+        else
+            echo "fail"
+        fi
     fi
     if [ $ret -ne 0  ] \
-	&& [ "$_do_die_on_error" = "always" \
-	    -o \( $# -eq 2 -a "$_do_die_on_error" = "message_only" \) ]
+        && [ "$_do_die_on_error" = "always" \
+            -o \( $# -eq 2 -a "$_do_die_on_error" = "message_only" \) ]
     then
-	[ $# -ne 2 ] && echo
-	eval "echo \"$_cmd\" failed \(returned $ret\): see $seq.full"
-	status=1; exit
+        [ $# -ne 2 ] && echo
+        eval "echo \"$_cmd\" failed \(returned $ret\): see $seq.full"
+        status=1; exit
     fi
 
     return $ret
@@ -305,9 +305,9 @@ _fail()
 _supported_fmt()
 {
     for f; do
-	if [ "$f" = "$IMGFMT" -o "$f" = "generic" ]; then
-	    return
-	fi
+        if [ "$f" = "$IMGFMT" -o "$f" = "generic" ]; then
+            return
+        fi
     done
 
     _notrun "not suitable for this image format: $IMGFMT"
@@ -318,9 +318,9 @@ _supported_fmt()
 _supported_proto()
 {
     for f; do
-	if [ "$f" = "$IMGPROTO" -o "$f" = "generic" ]; then
-	    return
-	fi
+        if [ "$f" = "$IMGPROTO" -o "$f" = "generic" ]; then
+            return
+        fi
     done
 
     _notrun "not suitable for this image protocol: $IMGPROTO"
@@ -332,10 +332,10 @@ _supported_os()
 {
     for h
     do
-	if [ "$h" = "$HOSTOS" ]
-	then
-	    return
-	fi
+        if [ "$h" = "$HOSTOS" ]
+        then
+            return
+        fi
     done
 
     _notrun "not suitable for this OS: $HOSTOS"
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index b6962421fa..316b1dd75c 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -66,3 +66,4 @@
 059 rw auto
 060 rw auto
 062 rw auto
+063 rw auto
diff --git a/tests/test-aio.c b/tests/test-aio.c
index 532a1de3f9..c4fe0fc3b7 100644
--- a/tests/test-aio.c
+++ b/tests/test-aio.c
@@ -13,6 +13,7 @@
 #include <glib.h>
 #include "block/aio.h"
 #include "qemu/timer.h"
+#include "qemu/sockets.h"
 
 AioContext *ctx;
 
@@ -375,7 +376,10 @@ static void test_timer_schedule(void)
     /* aio_poll will not block to wait for timers to complete unless it has
      * an fd to wait on. Fixing this breaks other tests. So create a dummy one.
      */
-    g_assert(!pipe2(pipefd, O_NONBLOCK));
+    g_assert(!qemu_pipe(pipefd));
+    qemu_set_nonblock(pipefd[0]);
+    qemu_set_nonblock(pipefd[1]);
+
     aio_set_fd_handler(ctx, pipefd[0],
                        dummy_io_handler_read, NULL, NULL);
     aio_poll(ctx, false);
@@ -716,7 +720,10 @@ static void test_source_timer_schedule(void)
     /* aio_poll will not block to wait for timers to complete unless it has
      * an fd to wait on. Fixing this breaks other tests. So create a dummy one.
      */
-    g_assert(!pipe2(pipefd, O_NONBLOCK));
+    g_assert(!qemu_pipe(pipefd));
+    qemu_set_nonblock(pipefd[0]);
+    qemu_set_nonblock(pipefd[1]);
+
     aio_set_fd_handler(ctx, pipefd[0],
                        dummy_io_handler_read, NULL, NULL);
     do {} while (g_main_context_iteration(NULL, false));
diff --git a/tests/test-throttle.c b/tests/test-throttle.c
new file mode 100644
index 0000000000..760812645b
--- /dev/null
+++ b/tests/test-throttle.c
@@ -0,0 +1,481 @@
+/*
+ * Throttle infrastructure tests
+ *
+ * Copyright Nodalink, SARL. 2013
+ *
+ * Authors:
+ *  Benoît Canet     <benoit.canet@irqsave.net>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include <glib.h>
+#include <math.h>
+#include "qemu/throttle.h"
+
+LeakyBucket    bkt;
+ThrottleConfig cfg;
+ThrottleState  ts;
+
+/* usefull function */
+static bool double_cmp(double x, double y)
+{
+    return fabsl(x - y) < 1e-6;
+}
+
+/* tests for single bucket operations */
+static void test_leak_bucket(void)
+{
+    /* set initial value */
+    bkt.avg = 150;
+    bkt.max = 15;
+    bkt.level = 1.5;
+
+    /* leak an op work of time */
+    throttle_leak_bucket(&bkt, NANOSECONDS_PER_SECOND / 150);
+    g_assert(bkt.avg == 150);
+    g_assert(bkt.max == 15);
+    g_assert(double_cmp(bkt.level, 0.5));
+
+    /* leak again emptying the bucket */
+    throttle_leak_bucket(&bkt, NANOSECONDS_PER_SECOND / 150);
+    g_assert(bkt.avg == 150);
+    g_assert(bkt.max == 15);
+    g_assert(double_cmp(bkt.level, 0));
+
+    /* check that the bucket level won't go lower */
+    throttle_leak_bucket(&bkt, NANOSECONDS_PER_SECOND / 150);
+    g_assert(bkt.avg == 150);
+    g_assert(bkt.max == 15);
+    g_assert(double_cmp(bkt.level, 0));
+}
+
+static void test_compute_wait(void)
+{
+    int64_t wait;
+    int64_t result;
+
+    /* no operation limit set */
+    bkt.avg = 0;
+    bkt.max = 15;
+    bkt.level = 1.5;
+    wait = throttle_compute_wait(&bkt);
+    g_assert(!wait);
+
+    /* zero delta */
+    bkt.avg = 150;
+    bkt.max = 15;
+    bkt.level = 15;
+    wait = throttle_compute_wait(&bkt);
+    g_assert(!wait);
+
+    /* below zero delta */
+    bkt.avg = 150;
+    bkt.max = 15;
+    bkt.level = 9;
+    wait = throttle_compute_wait(&bkt);
+    g_assert(!wait);
+
+    /* half an operation above max */
+    bkt.avg = 150;
+    bkt.max = 15;
+    bkt.level = 15.5;
+    wait = throttle_compute_wait(&bkt);
+    /* time required to do half an operation */
+    result = (int64_t)  NANOSECONDS_PER_SECOND / 150 / 2;
+    g_assert(wait == result);
+}
+
+/* functions to test ThrottleState initialization/destroy methods */
+static void read_timer_cb(void *opaque)
+{
+}
+
+static void write_timer_cb(void *opaque)
+{
+}
+
+static void test_init(void)
+{
+    int i;
+
+    /* fill the structure with crap */
+    memset(&ts, 1, sizeof(ts));
+
+    /* init the structure */
+    throttle_init(&ts, QEMU_CLOCK_VIRTUAL, read_timer_cb, write_timer_cb, &ts);
+
+    /* check initialized fields */
+    g_assert(ts.clock_type == QEMU_CLOCK_VIRTUAL);
+    g_assert(ts.timers[0]);
+    g_assert(ts.timers[1]);
+
+    /* check other fields where cleared */
+    g_assert(!ts.previous_leak);
+    g_assert(!ts.cfg.op_size);
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        g_assert(!ts.cfg.buckets[i].avg);
+        g_assert(!ts.cfg.buckets[i].max);
+        g_assert(!ts.cfg.buckets[i].level);
+    }
+
+    throttle_destroy(&ts);
+}
+
+static void test_destroy(void)
+{
+    int i;
+    throttle_init(&ts, QEMU_CLOCK_VIRTUAL, read_timer_cb, write_timer_cb, &ts);
+    throttle_destroy(&ts);
+    for (i = 0; i < 2; i++) {
+        g_assert(!ts.timers[i]);
+    }
+}
+
+/* function to test throttle_config and throttle_get_config */
+static void test_config_functions(void)
+{
+    int i;
+    ThrottleConfig orig_cfg, final_cfg;
+
+    orig_cfg.buckets[THROTTLE_BPS_TOTAL].avg = 153;
+    orig_cfg.buckets[THROTTLE_BPS_READ].avg  = 56;
+    orig_cfg.buckets[THROTTLE_BPS_WRITE].avg = 1;
+
+    orig_cfg.buckets[THROTTLE_OPS_TOTAL].avg = 150;
+    orig_cfg.buckets[THROTTLE_OPS_READ].avg  = 69;
+    orig_cfg.buckets[THROTTLE_OPS_WRITE].avg = 23;
+
+    orig_cfg.buckets[THROTTLE_BPS_TOTAL].max = 0; /* should be corrected */
+    orig_cfg.buckets[THROTTLE_BPS_READ].max  = 1; /* should not be corrected */
+    orig_cfg.buckets[THROTTLE_BPS_WRITE].max = 120;
+
+    orig_cfg.buckets[THROTTLE_OPS_TOTAL].max = 150;
+    orig_cfg.buckets[THROTTLE_OPS_READ].max  = 400;
+    orig_cfg.buckets[THROTTLE_OPS_WRITE].max = 500;
+
+    orig_cfg.buckets[THROTTLE_BPS_TOTAL].level = 45;
+    orig_cfg.buckets[THROTTLE_BPS_READ].level  = 65;
+    orig_cfg.buckets[THROTTLE_BPS_WRITE].level = 23;
+
+    orig_cfg.buckets[THROTTLE_OPS_TOTAL].level = 1;
+    orig_cfg.buckets[THROTTLE_OPS_READ].level  = 90;
+    orig_cfg.buckets[THROTTLE_OPS_WRITE].level = 75;
+
+    orig_cfg.op_size = 1;
+
+    throttle_init(&ts, QEMU_CLOCK_VIRTUAL, read_timer_cb, write_timer_cb, &ts);
+    /* structure reset by throttle_init previous_leak should be null */
+    g_assert(!ts.previous_leak);
+    throttle_config(&ts, &orig_cfg);
+
+    /* has previous leak been initialized by throttle_config ? */
+    g_assert(ts.previous_leak);
+
+    /* get back the fixed configuration */
+    throttle_get_config(&ts, &final_cfg);
+
+    throttle_destroy(&ts);
+
+    g_assert(final_cfg.buckets[THROTTLE_BPS_TOTAL].avg == 153);
+    g_assert(final_cfg.buckets[THROTTLE_BPS_READ].avg  == 56);
+    g_assert(final_cfg.buckets[THROTTLE_BPS_WRITE].avg == 1);
+
+    g_assert(final_cfg.buckets[THROTTLE_OPS_TOTAL].avg == 150);
+    g_assert(final_cfg.buckets[THROTTLE_OPS_READ].avg  == 69);
+    g_assert(final_cfg.buckets[THROTTLE_OPS_WRITE].avg == 23);
+
+    g_assert(final_cfg.buckets[THROTTLE_BPS_TOTAL].max == 15.3);/* fixed */
+    g_assert(final_cfg.buckets[THROTTLE_BPS_READ].max  == 1);   /* not fixed */
+    g_assert(final_cfg.buckets[THROTTLE_BPS_WRITE].max == 120);
+
+    g_assert(final_cfg.buckets[THROTTLE_OPS_TOTAL].max == 150);
+    g_assert(final_cfg.buckets[THROTTLE_OPS_READ].max  == 400);
+    g_assert(final_cfg.buckets[THROTTLE_OPS_WRITE].max == 500);
+
+    g_assert(final_cfg.op_size == 1);
+
+    /* check bucket have been cleared */
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        g_assert(!final_cfg.buckets[i].level);
+    }
+}
+
+/* functions to test is throttle is enabled by a config */
+static void set_cfg_value(bool is_max, int index, int value)
+{
+    if (is_max) {
+        cfg.buckets[index].max = value;
+    } else {
+        cfg.buckets[index].avg = value;
+    }
+}
+
+static void test_enabled(void)
+{
+    int i;
+
+    memset(&cfg, 0, sizeof(cfg));
+    g_assert(!throttle_enabled(&cfg));
+
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        memset(&cfg, 0, sizeof(cfg));
+        set_cfg_value(false, i, 150);
+        g_assert(throttle_enabled(&cfg));
+    }
+
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        memset(&cfg, 0, sizeof(cfg));
+        set_cfg_value(false, i, -150);
+        g_assert(!throttle_enabled(&cfg));
+    }
+}
+
+/* tests functions for throttle_conflicting */
+
+static void test_conflicts_for_one_set(bool is_max,
+                                       int total,
+                                       int read,
+                                       int write)
+{
+    memset(&cfg, 0, sizeof(cfg));
+    g_assert(!throttle_conflicting(&cfg));
+
+    set_cfg_value(is_max, total, 1);
+    set_cfg_value(is_max, read,  1);
+    g_assert(throttle_conflicting(&cfg));
+
+    memset(&cfg, 0, sizeof(cfg));
+    set_cfg_value(is_max, total, 1);
+    set_cfg_value(is_max, write, 1);
+    g_assert(throttle_conflicting(&cfg));
+
+    memset(&cfg, 0, sizeof(cfg));
+    set_cfg_value(is_max, total, 1);
+    set_cfg_value(is_max, read,  1);
+    set_cfg_value(is_max, write, 1);
+    g_assert(throttle_conflicting(&cfg));
+
+    memset(&cfg, 0, sizeof(cfg));
+    set_cfg_value(is_max, total, 1);
+    g_assert(!throttle_conflicting(&cfg));
+
+    memset(&cfg, 0, sizeof(cfg));
+    set_cfg_value(is_max, read,  1);
+    set_cfg_value(is_max, write, 1);
+    g_assert(!throttle_conflicting(&cfg));
+}
+
+static void test_conflicting_config(void)
+{
+    /* bps average conflicts */
+    test_conflicts_for_one_set(false,
+                               THROTTLE_BPS_TOTAL,
+                               THROTTLE_BPS_READ,
+                               THROTTLE_BPS_WRITE);
+
+    /* ops average conflicts */
+    test_conflicts_for_one_set(false,
+                               THROTTLE_OPS_TOTAL,
+                               THROTTLE_OPS_READ,
+                               THROTTLE_OPS_WRITE);
+
+    /* bps average conflicts */
+    test_conflicts_for_one_set(true,
+                               THROTTLE_BPS_TOTAL,
+                               THROTTLE_BPS_READ,
+                               THROTTLE_BPS_WRITE);
+    /* ops average conflicts */
+    test_conflicts_for_one_set(true,
+                               THROTTLE_OPS_TOTAL,
+                               THROTTLE_OPS_READ,
+                               THROTTLE_OPS_WRITE);
+}
+/* functions to test the throttle_is_valid function */
+static void test_is_valid_for_value(int value, bool should_be_valid)
+{
+    int is_max, index;
+    for (is_max = 0; is_max < 2; is_max++) {
+        for (index = 0; index < BUCKETS_COUNT; index++) {
+            memset(&cfg, 0, sizeof(cfg));
+            set_cfg_value(is_max, index, value);
+            g_assert(throttle_is_valid(&cfg) == should_be_valid);
+        }
+    }
+}
+
+static void test_is_valid(void)
+{
+    /* negative number are invalid */
+    test_is_valid_for_value(-1, false);
+    /* zero are valids */
+    test_is_valid_for_value(0, true);
+    /* positives numers are valids */
+    test_is_valid_for_value(1, true);
+}
+
+static void test_have_timer(void)
+{
+    /* zero the structure */
+    memset(&ts, 0, sizeof(ts));
+
+    /* no timer set shoudl return false */
+    g_assert(!throttle_have_timer(&ts));
+
+    /* init the structure */
+    throttle_init(&ts, QEMU_CLOCK_VIRTUAL, read_timer_cb, write_timer_cb, &ts);
+
+    /* timer set by init should return true */
+    g_assert(throttle_have_timer(&ts));
+
+    throttle_destroy(&ts);
+}
+
+static bool do_test_accounting(bool is_ops, /* are we testing bps or ops */
+                int size,                   /* size of the operation to do */
+                double avg,                 /* io limit */
+                uint64_t op_size,           /* ideal size of an io */
+                double total_result,
+                double read_result,
+                double write_result)
+{
+    BucketType to_test[2][3] = { { THROTTLE_BPS_TOTAL,
+                                   THROTTLE_BPS_READ,
+                                   THROTTLE_BPS_WRITE, },
+                                 { THROTTLE_OPS_TOTAL,
+                                   THROTTLE_OPS_READ,
+                                   THROTTLE_OPS_WRITE, } };
+    ThrottleConfig cfg;
+    BucketType index;
+    int i;
+
+    for (i = 0; i < 3; i++) {
+        BucketType index = to_test[is_ops][i];
+        cfg.buckets[index].avg = avg;
+    }
+
+    cfg.op_size = op_size;
+
+    throttle_init(&ts, QEMU_CLOCK_VIRTUAL, read_timer_cb, write_timer_cb, &ts);
+    throttle_config(&ts, &cfg);
+
+    /* account a read */
+    throttle_account(&ts, false, size);
+    /* account a write */
+    throttle_account(&ts, true, size);
+
+    /* check total result */
+    index = to_test[is_ops][0];
+    if (!double_cmp(ts.cfg.buckets[index].level, total_result)) {
+        return false;
+    }
+
+    /* check read result */
+    index = to_test[is_ops][1];
+    if (!double_cmp(ts.cfg.buckets[index].level, read_result)) {
+        return false;
+    }
+
+    /* check write result */
+    index = to_test[is_ops][2];
+    if (!double_cmp(ts.cfg.buckets[index].level, write_result)) {
+        return false;
+    }
+
+    throttle_destroy(&ts);
+
+    return true;
+}
+
+static void test_accounting(void)
+{
+    /* tests for bps */
+
+    /* op of size 1 */
+    g_assert(do_test_accounting(false,
+                                1 * 512,
+                                150,
+                                0,
+                                1024,
+                                512,
+                                512));
+
+    /* op of size 2 */
+    g_assert(do_test_accounting(false,
+                                2 * 512,
+                                150,
+                                0,
+                                2048,
+                                1024,
+                                1024));
+
+    /* op of size 2 and orthogonal parameter change */
+    g_assert(do_test_accounting(false,
+                                2 * 512,
+                                150,
+                                17,
+                                2048,
+                                1024,
+                                1024));
+
+
+    /* tests for ops */
+
+    /* op of size 1 */
+    g_assert(do_test_accounting(true,
+                                1 * 512,
+                                150,
+                                0,
+                                2,
+                                1,
+                                1));
+
+    /* op of size 2 */
+    g_assert(do_test_accounting(true,
+                                2 *  512,
+                                150,
+                                0,
+                                2,
+                                1,
+                                1));
+
+    /* jumbo op accounting fragmentation : size 64 with op size of 13 units */
+    g_assert(do_test_accounting(true,
+                                64 * 512,
+                                150,
+                                13 * 512,
+                                (64.0 * 2) / 13,
+                                (64.0 / 13),
+                                (64.0 / 13)));
+
+    /* same with orthogonal parameters changes */
+    g_assert(do_test_accounting(true,
+                                64 * 512,
+                                300,
+                                13 * 512,
+                                (64.0 * 2) / 13,
+                                (64.0 / 13),
+                                (64.0 / 13)));
+}
+
+int main(int argc, char **argv)
+{
+    init_clocks();
+    do {} while (g_main_context_iteration(NULL, false));
+
+    /* tests in the same order as the header function declarations */
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/throttle/leak_bucket",        test_leak_bucket);
+    g_test_add_func("/throttle/compute_wait",       test_compute_wait);
+    g_test_add_func("/throttle/init",               test_init);
+    g_test_add_func("/throttle/destroy",            test_destroy);
+    g_test_add_func("/throttle/have_timer",         test_have_timer);
+    g_test_add_func("/throttle/config/enabled",     test_enabled);
+    g_test_add_func("/throttle/config/conflicting", test_conflicting_config);
+    g_test_add_func("/throttle/config/is_valid",    test_is_valid);
+    g_test_add_func("/throttle/config_functions",   test_config_functions);
+    g_test_add_func("/throttle/accounting",         test_accounting);
+    return g_test_run();
+}
+
diff --git a/util/Makefile.objs b/util/Makefile.objs
index dc72ab0721..2bb13a2a59 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -11,3 +11,4 @@ util-obj-y += iov.o aes.o qemu-config.o qemu-sockets.o uri.o notify.o
 util-obj-y += qemu-option.o qemu-progress.o
 util-obj-y += hexdump.o
 util-obj-y += crc32c.o
+util-obj-y += throttle.o
diff --git a/util/throttle.c b/util/throttle.c
new file mode 100644
index 0000000000..02e6f15587
--- /dev/null
+++ b/util/throttle.c
@@ -0,0 +1,396 @@
+/*
+ * QEMU throttling infrastructure
+ *
+ * Copyright (C) Nodalink, SARL. 2013
+ *
+ * Author:
+ *   Benoît Canet <benoit.canet@irqsave.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/throttle.h"
+#include "qemu/timer.h"
+
+/* This function make a bucket leak
+ *
+ * @bkt:   the bucket to make leak
+ * @delta_ns: the time delta
+ */
+void throttle_leak_bucket(LeakyBucket *bkt, int64_t delta_ns)
+{
+    double leak;
+
+    /* compute how much to leak */
+    leak = (bkt->avg * (double) delta_ns) / NANOSECONDS_PER_SECOND;
+
+    /* make the bucket leak */
+    bkt->level = MAX(bkt->level - leak, 0);
+}
+
+/* Calculate the time delta since last leak and make proportionals leaks
+ *
+ * @now:      the current timestamp in ns
+ */
+static void throttle_do_leak(ThrottleState *ts, int64_t now)
+{
+    /* compute the time elapsed since the last leak */
+    int64_t delta_ns = now - ts->previous_leak;
+    int i;
+
+    ts->previous_leak = now;
+
+    if (delta_ns <= 0) {
+        return;
+    }
+
+    /* make each bucket leak */
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        throttle_leak_bucket(&ts->cfg.buckets[i], delta_ns);
+    }
+}
+
+/* do the real job of computing the time to wait
+ *
+ * @limit: the throttling limit
+ * @extra: the number of operation to delay
+ * @ret:   the time to wait in ns
+ */
+static int64_t throttle_do_compute_wait(double limit, double extra)
+{
+    double wait = extra * NANOSECONDS_PER_SECOND;
+    wait /= limit;
+    return wait;
+}
+
+/* This function compute the wait time in ns that a leaky bucket should trigger
+ *
+ * @bkt: the leaky bucket we operate on
+ * @ret: the resulting wait time in ns or 0 if the operation can go through
+ */
+int64_t throttle_compute_wait(LeakyBucket *bkt)
+{
+    double extra; /* the number of extra units blocking the io */
+
+    if (!bkt->avg) {
+        return 0;
+    }
+
+    extra = bkt->level - bkt->max;
+
+    if (extra <= 0) {
+        return 0;
+    }
+
+    return throttle_do_compute_wait(bkt->avg, extra);
+}
+
+/* This function compute the time that must be waited while this IO
+ *
+ * @is_write:   true if the current IO is a write, false if it's a read
+ * @ret:        time to wait
+ */
+static int64_t throttle_compute_wait_for(ThrottleState *ts,
+                                         bool is_write)
+{
+    BucketType to_check[2][4] = { {THROTTLE_BPS_TOTAL,
+                                   THROTTLE_OPS_TOTAL,
+                                   THROTTLE_BPS_READ,
+                                   THROTTLE_OPS_READ},
+                                  {THROTTLE_BPS_TOTAL,
+                                   THROTTLE_OPS_TOTAL,
+                                   THROTTLE_BPS_WRITE,
+                                   THROTTLE_OPS_WRITE}, };
+    int64_t wait, max_wait = 0;
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        BucketType index = to_check[is_write][i];
+        wait = throttle_compute_wait(&ts->cfg.buckets[index]);
+        if (wait > max_wait) {
+            max_wait = wait;
+        }
+    }
+
+    return max_wait;
+}
+
+/* compute the timer for this type of operation
+ *
+ * @is_write:   the type of operation
+ * @now:        the current clock timestamp
+ * @next_timestamp: the resulting timer
+ * @ret:        true if a timer must be set
+ */
+bool throttle_compute_timer(ThrottleState *ts,
+                            bool is_write,
+                            int64_t now,
+                            int64_t *next_timestamp)
+{
+    int64_t wait;
+
+    /* leak proportionally to the time elapsed */
+    throttle_do_leak(ts, now);
+
+    /* compute the wait time if any */
+    wait = throttle_compute_wait_for(ts, is_write);
+
+    /* if the code must wait compute when the next timer should fire */
+    if (wait) {
+        *next_timestamp = now + wait;
+        return true;
+    }
+
+    /* else no need to wait at all */
+    *next_timestamp = now;
+    return false;
+}
+
+/* To be called first on the ThrottleState */
+void throttle_init(ThrottleState *ts,
+                   QEMUClockType clock_type,
+                   QEMUTimerCB *read_timer_cb,
+                   QEMUTimerCB *write_timer_cb,
+                   void *timer_opaque)
+{
+    memset(ts, 0, sizeof(ThrottleState));
+
+    ts->clock_type = clock_type;
+    ts->timers[0] = timer_new_ns(clock_type, read_timer_cb, timer_opaque);
+    ts->timers[1] = timer_new_ns(clock_type, write_timer_cb, timer_opaque);
+}
+
+/* destroy a timer */
+static void throttle_timer_destroy(QEMUTimer **timer)
+{
+    assert(*timer != NULL);
+
+    timer_del(*timer);
+    timer_free(*timer);
+    *timer = NULL;
+}
+
+/* To be called last on the ThrottleState */
+void throttle_destroy(ThrottleState *ts)
+{
+    int i;
+
+    for (i = 0; i < 2; i++) {
+        throttle_timer_destroy(&ts->timers[i]);
+    }
+}
+
+/* is any throttling timer configured */
+bool throttle_have_timer(ThrottleState *ts)
+{
+    if (ts->timers[0]) {
+        return true;
+    }
+
+    return false;
+}
+
+/* Does any throttling must be done
+ *
+ * @cfg: the throttling configuration to inspect
+ * @ret: true if throttling must be done else false
+ */
+bool throttle_enabled(ThrottleConfig *cfg)
+{
+    int i;
+
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        if (cfg->buckets[i].avg > 0) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/* return true if any two throttling parameters conflicts
+ *
+ * @cfg: the throttling configuration to inspect
+ * @ret: true if any conflict detected else false
+ */
+bool throttle_conflicting(ThrottleConfig *cfg)
+{
+    bool bps_flag, ops_flag;
+    bool bps_max_flag, ops_max_flag;
+
+    bps_flag = cfg->buckets[THROTTLE_BPS_TOTAL].avg &&
+               (cfg->buckets[THROTTLE_BPS_READ].avg ||
+                cfg->buckets[THROTTLE_BPS_WRITE].avg);
+
+    ops_flag = cfg->buckets[THROTTLE_OPS_TOTAL].avg &&
+               (cfg->buckets[THROTTLE_OPS_READ].avg ||
+                cfg->buckets[THROTTLE_OPS_WRITE].avg);
+
+    bps_max_flag = cfg->buckets[THROTTLE_BPS_TOTAL].max &&
+                  (cfg->buckets[THROTTLE_BPS_READ].max  ||
+                   cfg->buckets[THROTTLE_BPS_WRITE].max);
+
+    ops_max_flag = cfg->buckets[THROTTLE_OPS_TOTAL].max &&
+                   (cfg->buckets[THROTTLE_OPS_READ].max ||
+                   cfg->buckets[THROTTLE_OPS_WRITE].max);
+
+    return bps_flag || ops_flag || bps_max_flag || ops_max_flag;
+}
+
+/* check if a throttling configuration is valid
+ * @cfg: the throttling configuration to inspect
+ * @ret: true if valid else false
+ */
+bool throttle_is_valid(ThrottleConfig *cfg)
+{
+    bool invalid = false;
+    int i;
+
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        if (cfg->buckets[i].avg < 0) {
+            invalid = true;
+        }
+    }
+
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        if (cfg->buckets[i].max < 0) {
+            invalid = true;
+        }
+    }
+
+    return !invalid;
+}
+
+/* fix bucket parameters */
+static void throttle_fix_bucket(LeakyBucket *bkt)
+{
+    double min;
+
+    /* zero bucket level */
+    bkt->level = 0;
+
+    /* The following is done to cope with the Linux CFQ block scheduler
+     * which regroup reads and writes by block of 100ms in the guest.
+     * When they are two process one making reads and one making writes cfq
+     * make a pattern looking like the following:
+     * WWWWWWWWWWWRRRRRRRRRRRRRRWWWWWWWWWWWWWwRRRRRRRRRRRRRRRRR
+     * Having a max burst value of 100ms of the average will help smooth the
+     * throttling
+     */
+    min = bkt->avg / 10;
+    if (bkt->avg && !bkt->max) {
+        bkt->max = min;
+    }
+}
+
+/* take care of canceling a timer */
+static void throttle_cancel_timer(QEMUTimer *timer)
+{
+    assert(timer != NULL);
+
+    timer_del(timer);
+}
+
+/* Used to configure the throttle
+ *
+ * @ts: the throttle state we are working on
+ * @cfg: the config to set
+ */
+void throttle_config(ThrottleState *ts, ThrottleConfig *cfg)
+{
+    int i;
+
+    ts->cfg = *cfg;
+
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        throttle_fix_bucket(&ts->cfg.buckets[i]);
+    }
+
+    ts->previous_leak = qemu_clock_get_ns(ts->clock_type);
+
+    for (i = 0; i < 2; i++) {
+        throttle_cancel_timer(ts->timers[i]);
+    }
+}
+
+/* used to get config
+ *
+ * @ts:  the throttle state we are working on
+ * @cfg: the config to write
+ */
+void throttle_get_config(ThrottleState *ts, ThrottleConfig *cfg)
+{
+    *cfg = ts->cfg;
+}
+
+
+/* Schedule the read or write timer if needed
+ *
+ * NOTE: this function is not unit tested due to it's usage of timer_mod
+ *
+ * @is_write: the type of operation (read/write)
+ * @ret:      true if the timer has been scheduled else false
+ */
+bool throttle_schedule_timer(ThrottleState *ts, bool is_write)
+{
+    int64_t now = qemu_clock_get_ns(ts->clock_type);
+    int64_t next_timestamp;
+    bool must_wait;
+
+    must_wait = throttle_compute_timer(ts,
+                                       is_write,
+                                       now,
+                                       &next_timestamp);
+
+    /* request not throttled */
+    if (!must_wait) {
+        return false;
+    }
+
+    /* request throttled and timer pending -> do nothing */
+    if (timer_pending(ts->timers[is_write])) {
+        return true;
+    }
+
+    /* request throttled and timer not pending -> arm timer */
+    timer_mod(ts->timers[is_write], next_timestamp);
+    return true;
+}
+
+/* do the accounting for this operation
+ *
+ * @is_write: the type of operation (read/write)
+ * @size:     the size of the operation
+ */
+void throttle_account(ThrottleState *ts, bool is_write, uint64_t size)
+{
+    double units = 1.0;
+
+    /* if cfg.op_size is defined and smaller than size we compute unit count */
+    if (ts->cfg.op_size && size > ts->cfg.op_size) {
+        units = (double) size / ts->cfg.op_size;
+    }
+
+    ts->cfg.buckets[THROTTLE_BPS_TOTAL].level += size;
+    ts->cfg.buckets[THROTTLE_OPS_TOTAL].level += units;
+
+    if (is_write) {
+        ts->cfg.buckets[THROTTLE_BPS_WRITE].level += size;
+        ts->cfg.buckets[THROTTLE_OPS_WRITE].level += units;
+    } else {
+        ts->cfg.buckets[THROTTLE_BPS_READ].level += size;
+        ts->cfg.buckets[THROTTLE_OPS_READ].level += units;
+    }
+}
+