diff --git a/block/block-backend.c b/block/block-backend.c
index 9889e813b6..36ccc9e616 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -642,8 +642,9 @@ static void error_callback_bh(void *opaque)
     qemu_aio_unref(acb);
 }
 
-static BlockAIOCB *abort_aio_request(BlockBackend *blk, BlockCompletionFunc *cb,
-                                     void *opaque, int ret)
+BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
+                                  BlockCompletionFunc *cb,
+                                  void *opaque, int ret)
 {
     struct BlockBackendAIOCB *acb;
     QEMUBH *bh;
@@ -665,7 +666,7 @@ BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num,
 {
     int ret = blk_check_request(blk, sector_num, nb_sectors);
     if (ret < 0) {
-        return abort_aio_request(blk, cb, opaque, ret);
+        return blk_abort_aio_request(blk, cb, opaque, ret);
     }
 
     return bdrv_aio_write_zeroes(blk->bs, sector_num, nb_sectors, flags,
@@ -725,7 +726,7 @@ BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num,
 {
     int ret = blk_check_request(blk, sector_num, nb_sectors);
     if (ret < 0) {
-        return abort_aio_request(blk, cb, opaque, ret);
+        return blk_abort_aio_request(blk, cb, opaque, ret);
     }
 
     return bdrv_aio_readv(blk->bs, sector_num, iov, nb_sectors, cb, opaque);
@@ -737,7 +738,7 @@ BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num,
 {
     int ret = blk_check_request(blk, sector_num, nb_sectors);
     if (ret < 0) {
-        return abort_aio_request(blk, cb, opaque, ret);
+        return blk_abort_aio_request(blk, cb, opaque, ret);
     }
 
     return bdrv_aio_writev(blk->bs, sector_num, iov, nb_sectors, cb, opaque);
@@ -747,7 +748,7 @@ BlockAIOCB *blk_aio_flush(BlockBackend *blk,
                           BlockCompletionFunc *cb, void *opaque)
 {
     if (!blk_is_available(blk)) {
-        return abort_aio_request(blk, cb, opaque, -ENOMEDIUM);
+        return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM);
     }
 
     return bdrv_aio_flush(blk->bs, cb, opaque);
@@ -759,7 +760,7 @@ BlockAIOCB *blk_aio_discard(BlockBackend *blk,
 {
     int ret = blk_check_request(blk, sector_num, nb_sectors);
     if (ret < 0) {
-        return abort_aio_request(blk, cb, opaque, ret);
+        return blk_abort_aio_request(blk, cb, opaque, ret);
     }
 
     return bdrv_aio_discard(blk->bs, sector_num, nb_sectors, cb, opaque);
@@ -802,7 +803,7 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
                           BlockCompletionFunc *cb, void *opaque)
 {
     if (!blk_is_available(blk)) {
-        return abort_aio_request(blk, cb, opaque, -ENOMEDIUM);
+        return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM);
     }
 
     return bdrv_aio_ioctl(blk->bs, req, buf, cb, opaque);
diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index ed8bb2c80a..7b9f74c3b5 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -105,20 +105,27 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
     memset(buf, 0, 288);
 }
 
-static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)
+static int
+cd_read_sector_sync(IDEState *s)
 {
     int ret;
     block_acct_start(blk_get_stats(s->blk), &s->acct,
                      4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
 
-    switch(sector_size) {
+#ifdef DEBUG_IDE_ATAPI
+    printf("cd_read_sector_sync: lba=%d\n", s->lba);
+#endif
+
+    switch (s->cd_sector_size) {
     case 2048:
-        ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4);
+        ret = blk_read(s->blk, (int64_t)s->lba << 2,
+                       s->io_buffer, 4);
         break;
     case 2352:
-        ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4);
+        ret = blk_read(s->blk, (int64_t)s->lba << 2,
+                       s->io_buffer + 16, 4);
         if (ret >= 0) {
-            cd_data_to_raw(buf, lba);
+            cd_data_to_raw(s->io_buffer, s->lba);
         }
         break;
     default:
@@ -130,11 +137,65 @@ static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)
         block_acct_failed(blk_get_stats(s->blk), &s->acct);
     } else {
         block_acct_done(blk_get_stats(s->blk), &s->acct);
+        s->lba++;
+        s->io_buffer_index = 0;
     }
 
     return ret;
 }
 
+static void cd_read_sector_cb(void *opaque, int ret)
+{
+    IDEState *s = opaque;
+
+    block_acct_done(blk_get_stats(s->blk), &s->acct);
+
+#ifdef DEBUG_IDE_ATAPI
+    printf("cd_read_sector_cb: lba=%d ret=%d\n", s->lba, ret);
+#endif
+
+    if (ret < 0) {
+        ide_atapi_io_error(s, ret);
+        return;
+    }
+
+    if (s->cd_sector_size == 2352) {
+        cd_data_to_raw(s->io_buffer, s->lba);
+    }
+
+    s->lba++;
+    s->io_buffer_index = 0;
+    s->status &= ~BUSY_STAT;
+
+    ide_atapi_cmd_reply_end(s);
+}
+
+static int cd_read_sector(IDEState *s)
+{
+    if (s->cd_sector_size != 2048 && s->cd_sector_size != 2352) {
+        return -EINVAL;
+    }
+
+    s->iov.iov_base = (s->cd_sector_size == 2352) ?
+                      s->io_buffer + 16 : s->io_buffer;
+
+    s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
+    qemu_iovec_init_external(&s->qiov, &s->iov, 1);
+
+#ifdef DEBUG_IDE_ATAPI
+    printf("cd_read_sector: lba=%d\n", s->lba);
+#endif
+
+    block_acct_start(blk_get_stats(s->blk), &s->acct,
+                     4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
+
+    ide_buffered_readv(s, (int64_t)s->lba << 2, &s->qiov, 4,
+                       cd_read_sector_cb, s);
+
+    s->status |= BUSY_STAT;
+    return 0;
+}
+
 void ide_atapi_cmd_ok(IDEState *s)
 {
     s->error = 0;
@@ -196,18 +257,27 @@ void ide_atapi_cmd_reply_end(IDEState *s)
         ide_atapi_cmd_ok(s);
         ide_set_irq(s->bus);
 #ifdef DEBUG_IDE_ATAPI
-        printf("status=0x%x\n", s->status);
+        printf("end of transfer, status=0x%x\n", s->status);
 #endif
     } else {
         /* see if a new sector must be read */
         if (s->lba != -1 && s->io_buffer_index >= s->cd_sector_size) {
-            ret = cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size);
-            if (ret < 0) {
-                ide_atapi_io_error(s, ret);
+            if (!s->elementary_transfer_size) {
+                ret = cd_read_sector(s);
+                if (ret < 0) {
+                    ide_atapi_io_error(s, ret);
+                }
                 return;
+            } else {
+                /* rebuffering within an elementary transfer is
+                 * only possible with a sync request because we
+                 * end up with a race condition otherwise */
+                ret = cd_read_sector_sync(s);
+                if (ret < 0) {
+                    ide_atapi_io_error(s, ret);
+                    return;
+                }
             }
-            s->lba++;
-            s->io_buffer_index = 0;
         }
         if (s->elementary_transfer_size > 0) {
             /* there are some data left to transmit in this elementary
@@ -287,7 +357,6 @@ static void ide_atapi_cmd_read_pio(IDEState *s, int lba, int nb_sectors,
     s->io_buffer_index = sector_size;
     s->cd_sector_size = sector_size;
 
-    s->status = READY_STAT | SEEK_STAT;
     ide_atapi_cmd_reply_end(s);
 }
 
@@ -363,16 +432,16 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret)
     s->bus->dma->iov.iov_len = n * 4 * 512;
     qemu_iovec_init_external(&s->bus->dma->qiov, &s->bus->dma->iov, 1);
 
-    s->bus->dma->aiocb = blk_aio_readv(s->blk, (int64_t)s->lba << 2,
-                                       &s->bus->dma->qiov, n * 4,
-                                       ide_atapi_cmd_read_dma_cb, s);
+    s->bus->dma->aiocb = ide_buffered_readv(s, (int64_t)s->lba << 2,
+                                            &s->bus->dma->qiov, n * 4,
+                                            ide_atapi_cmd_read_dma_cb, s);
     return;
 
 eot:
     if (ret < 0) {
         block_acct_failed(blk_get_stats(s->blk), &s->acct);
     } else {
-        block_acct_done(blk_get_stats(s->blk), &s->acct);
+    block_acct_done(blk_get_stats(s->blk), &s->acct);
     }
     ide_set_inactive(s, false);
 }
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 2725dd3b81..da3baab1eb 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -561,6 +561,53 @@ static bool ide_sect_range_ok(IDEState *s,
     return true;
 }
 
+static void ide_buffered_readv_cb(void *opaque, int ret)
+{
+    IDEBufferedRequest *req = opaque;
+    if (!req->orphaned) {
+        if (!ret) {
+            qemu_iovec_from_buf(req->original_qiov, 0, req->iov.iov_base,
+                                req->original_qiov->size);
+        }
+        req->original_cb(req->original_opaque, ret);
+    }
+    QLIST_REMOVE(req, list);
+    qemu_vfree(req->iov.iov_base);
+    g_free(req);
+}
+
+#define MAX_BUFFERED_REQS 16
+
+BlockAIOCB *ide_buffered_readv(IDEState *s, int64_t sector_num,
+                               QEMUIOVector *iov, int nb_sectors,
+                               BlockCompletionFunc *cb, void *opaque)
+{
+    BlockAIOCB *aioreq;
+    IDEBufferedRequest *req;
+    int c = 0;
+
+    QLIST_FOREACH(req, &s->buffered_requests, list) {
+        c++;
+    }
+    if (c > MAX_BUFFERED_REQS) {
+        return blk_abort_aio_request(s->blk, cb, opaque, -EIO);
+    }
+
+    req = g_new0(IDEBufferedRequest, 1);
+    req->original_qiov = iov;
+    req->original_cb = cb;
+    req->original_opaque = opaque;
+    req->iov.iov_base = qemu_blockalign(blk_bs(s->blk), iov->size);
+    req->iov.iov_len = iov->size;
+    qemu_iovec_init_external(&req->qiov, &req->iov, 1);
+
+    aioreq = blk_aio_readv(s->blk, sector_num, &req->qiov, nb_sectors,
+                           ide_buffered_readv_cb, req);
+
+    QLIST_INSERT_HEAD(&s->buffered_requests, req, list);
+    return aioreq;
+}
+
 static void ide_sector_read(IDEState *s);
 
 static void ide_sector_read_cb(void *opaque, int ret)
@@ -632,8 +679,8 @@ static void ide_sector_read(IDEState *s)
 
     block_acct_start(blk_get_stats(s->blk), &s->acct,
                      n * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-    s->pio_aiocb = blk_aio_readv(s->blk, sector_num, &s->qiov, n,
-                                 ide_sector_read_cb, s);
+    s->pio_aiocb = ide_buffered_readv(s, sector_num, &s->qiov, n,
+                                      ide_sector_read_cb, s);
 }
 
 void dma_buf_commit(IDEState *s, uint32_t tx_bytes)
diff --git a/hw/ide/internal.h b/hw/ide/internal.h
index e4629b023a..2d1e2d2d2f 100644
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -343,6 +343,16 @@ enum ide_dma_cmd {
 #define ide_cmd_is_read(s) \
 	((s)->dma_cmd == IDE_DMA_READ)
 
+typedef struct IDEBufferedRequest {
+    QLIST_ENTRY(IDEBufferedRequest) list;
+    struct iovec iov;
+    QEMUIOVector qiov;
+    QEMUIOVector *original_qiov;
+    BlockCompletionFunc *original_cb;
+    void *original_opaque;
+    bool orphaned;
+} IDEBufferedRequest;
+
 /* NOTE: IDEState represents in fact one drive */
 struct IDEState {
     IDEBus *bus;
@@ -396,6 +406,7 @@ struct IDEState {
     BlockAIOCB *pio_aiocb;
     struct iovec iov;
     QEMUIOVector qiov;
+    QLIST_HEAD(, IDEBufferedRequest) buffered_requests;
     /* ATA DMA state */
     uint64_t io_buffer_offset;
     int32_t io_buffer_size;
@@ -572,6 +583,9 @@ void ide_set_inactive(IDEState *s, bool more);
 BlockAIOCB *ide_issue_trim(BlockBackend *blk,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
         BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *ide_buffered_readv(IDEState *s, int64_t sector_num,
+                               QEMUIOVector *iov, int nb_sectors,
+                               BlockCompletionFunc *cb, void *opaque);
 
 /* hw/ide/atapi.c */
 void ide_atapi_cmd(IDEState *s);
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index 9c54b378d6..37dbc291da 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -233,6 +233,22 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val)
     /* Ignore writes to SSBM if it keeps the old value */
     if ((val & BM_CMD_START) != (bm->cmd & BM_CMD_START)) {
         if (!(val & BM_CMD_START)) {
+            /* First invoke the callbacks of all buffered requests
+             * and flag those requests as orphaned. Ideally there
+             * are no unbuffered (Scatter Gather DMA Requests or
+             * write requests) pending and we can avoid to drain. */
+            IDEBufferedRequest *req;
+            IDEState *s = idebus_active_if(bm->bus);
+            QLIST_FOREACH(req, &s->buffered_requests, list) {
+                if (!req->orphaned) {
+#ifdef DEBUG_IDE
+                    printf("%s: invoking cb %p of buffered request %p with"
+                           " -ECANCELED\n", __func__, req->original_cb, req);
+#endif
+                    req->original_cb(req->original_opaque, -ECANCELED);
+                }
+                req->orphaned = true;
+            }
             /*
              * We can't cancel Scatter Gather DMA in the middle of the
              * operation or a partial (not full) DMA transfer would reach
@@ -246,6 +262,9 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val)
              * aio operation with preadv/pwritev.
              */
             if (bm->bus->dma->aiocb) {
+#ifdef DEBUG_IDE
+                printf("%s: draining all remaining requests", __func__);
+#endif
                 blk_drain_all();
                 assert(bm->bus->dma->aiocb == NULL);
             }
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index f4a68e291b..fb068ea47b 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -184,5 +184,8 @@ int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
 int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size);
 int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz);
 int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo);
+BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
+                                  BlockCompletionFunc *cb,
+                                  void *opaque, int ret);
 
 #endif