From 355ee2d0e8ca536a6278c9c763ddd2f136eace3f Mon Sep 17 00:00:00 2001
From: Alberto Garcia <berto@igalia.com>
Date: Tue, 4 Aug 2015 15:14:39 +0300
Subject: [PATCH 1/5] qcow2: mark the memory as no longer needed after
 qcow2_cache_empty()

After having emptied the cache, the data in the cache tables is no
longer useful, so we can tell the kernel that we are done with it. In
Linux this frees the resources associated with it.

The effect of this can be seen in the HMP commit operation: it moves
data from the top to the base image (and fills both caches), then it
empties the top image. At this point the data in that cache is no
longer needed so it's just wasting memory.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 08538b098e1faf6c92496477cf9b47a20e5aacea.1438690126.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cache.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index 53b8afc3d3..f63e7d8e70 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -22,8 +22,16 @@
  * THE SOFTWARE.
  */
 
+/* Needed for CONFIG_MADVISE */
+#include "config-host.h"
+
+#if defined(CONFIG_MADVISE) || defined(CONFIG_POSIX_MADVISE)
+#include <sys/mman.h>
+#endif
+
 #include "block/block_int.h"
 #include "qemu-common.h"
+#include "qemu/osdep.h"
 #include "qcow2.h"
 #include "trace.h"
 
@@ -60,6 +68,32 @@ static inline int qcow2_cache_get_table_idx(BlockDriverState *bs,
     return idx;
 }
 
+/*
+ * Tell the kernel that the memory of @num_tables consecutive cache tables,
+ * starting with table @i, is no longer needed.
+ *
+ * Only the pages that lie completely inside that range are released, so
+ * nothing is done if the range does not contain a whole page. The body is
+ * compiled out when QEMU_MADV_DONTNEED is not available.
+ */
+static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c,
+                                      int i, int num_tables)
+{
+#if QEMU_MADV_DONTNEED != QEMU_MADV_INVALID
+    BDRVQcowState *s = bs->opaque;
+    void *t = qcow2_cache_get_table_addr(bs, c, i);
+    int align = getpagesize();
+    size_t mem_size = (size_t) s->cluster_size * num_tables;
+    size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t;
+    size_t length = QEMU_ALIGN_DOWN(mem_size - offset, align);
+    /* mem_size - offset wraps around when the range ends before the next
+     * page boundary, so length is only meaningful if mem_size > offset */
+    if (mem_size > offset && length > 0) {
+        qemu_madvise((uint8_t *) t + offset, length, QEMU_MADV_DONTNEED);
+    }
+#endif
+}
+
 Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
 {
     BDRVQcowState *s = bs->opaque;
@@ -237,6 +261,8 @@ int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c)
         c->entries[i].lru_counter = 0;
     }
 
+    qcow2_cache_table_release(bs, c, 0, c->size);
+
     c->lru_counter = 0;
 
     return 0;

From 279621c046ce57de0af9e3c00663b48d3a7835ae Mon Sep 17 00:00:00 2001
From: Alberto Garcia <berto@igalia.com>
Date: Tue, 4 Aug 2015 15:14:40 +0300
Subject: [PATCH 2/5] qcow2: add option to clean unused cache entries after
 some time

This adds a new 'cache-clean-interval' option that cleans all qcow2
cache entries that haven't been used in a certain interval, given in
seconds.

This allows setting a large L2 cache size so it can handle scenarios
with lots of I/O and at the same time use little memory during periods
of inactivity.

This feature currently relies on MADV_DONTNEED to free that memory, so
it is not useful in systems that don't follow that behavior.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: a70d12da60433df9360ada648b3f34b8f6f354ce.1438690126.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cache.c  | 35 ++++++++++++++++++++++++
 block/qcow2.c        | 64 ++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.h        |  4 +++
 qapi/block-core.json |  7 ++++-
 4 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index f63e7d8e70..8457458418 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -49,6 +49,7 @@ struct Qcow2Cache {
     bool                    depends_on_flush;
     void                   *table_array;
     uint64_t                lru_counter;
+    uint64_t                cache_clean_lru_counter;
 };
 
 static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs,
@@ -84,6 +85,51 @@ static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c,
 #endif
 }
 
+/*
+ * Return true if cache entry @i can be cleaned: nobody holds a reference
+ * to it, it is not dirty, its offset is not 0 and its LRU counter is not
+ * newer than the value saved by the last qcow2_cache_clean_unused() call.
+ */
+static inline bool can_clean_entry(Qcow2Cache *c, int i)
+{
+    const Qcow2CachedTable *entry = &c->entries[i];
+    bool unused = entry->ref == 0 && !entry->dirty && entry->offset != 0;
+
+    return unused && entry->lru_counter <= c->cache_clean_lru_counter;
+}
+
+/*
+ * Invalidate all the entries of @c that can be cleaned and release the
+ * memory of their tables. Consecutive entries are released with a single
+ * qcow2_cache_table_release() call.
+ */
+void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c)
+{
+    int idx = 0;
+
+    while (idx < c->size) {
+        int start;
+
+        /* Advance to the next entry that can be cleaned, if any */
+        while (idx < c->size && !can_clean_entry(c, idx)) {
+            idx++;
+        }
+
+        /* Invalidate it and every cleanable entry that follows in a row */
+        for (start = idx; idx < c->size && can_clean_entry(c, idx); idx++) {
+            c->entries[idx].offset = 0;
+            c->entries[idx].lru_counter = 0;
+        }
+
+        if (idx > start) {
+            qcow2_cache_table_release(bs, c, start, idx - start);
+        }
+    }
+
+    /* Remember the current LRU counter as the threshold for the next call */
+    c->cache_clean_lru_counter = c->lru_counter;
+}
+
 Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
 {
     BDRVQcowState *s = bs->opaque;
diff --git a/block/qcow2.c b/block/qcow2.c
index 76c331b387..ea34ae2da5 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -467,6 +467,11 @@ static QemuOptsList qcow2_runtime_opts = {
             .type = QEMU_OPT_SIZE,
             .help = "Maximum refcount block cache size",
         },
+        {
+            .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL,
+            .type = QEMU_OPT_NUMBER,
+            .help = "Clean unused cache entries after this time (in seconds)",
+        },
         { /* end of list */ }
     },
 };
@@ -482,6 +487,74 @@ static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
     [QCOW2_OL_INACTIVE_L2_BITNR]    = QCOW2_OPT_OVERLAP_INACTIVE_L2,
 };
 
+/*
+ * Timer callback: clean the unused entries of the L2 table and refcount
+ * block caches, then arm the timer again for the next run.
+ */
+static void cache_clean_timer_cb(void *opaque)
+{
+    BlockDriverState *bs = opaque;
+    BDRVQcowState *state = bs->opaque;
+    int64_t next_run;
+
+    qcow2_cache_clean_unused(bs, state->l2_table_cache);
+    qcow2_cache_clean_unused(bs, state->refcount_block_cache);
+
+    /* cache_clean_interval is in seconds, the timer works in milliseconds */
+    next_run = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+               (int64_t) state->cache_clean_interval * 1000;
+    timer_mod(state->cache_clean_timer, next_run);
+}
+
+/*
+ * Create the cache cleaning timer in @context and arm it for the first
+ * time. Nothing is done if cache_clean_interval is 0.
+ */
+static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context)
+{
+    BDRVQcowState *state = bs->opaque;
+    int64_t first_run;
+
+    if (state->cache_clean_interval == 0) {
+        return;
+    }
+
+    state->cache_clean_timer = aio_timer_new(context, QEMU_CLOCK_VIRTUAL,
+                                             SCALE_MS, cache_clean_timer_cb,
+                                             bs);
+    first_run = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+                (int64_t) state->cache_clean_interval * 1000;
+    timer_mod(state->cache_clean_timer, first_run);
+}
+
+/* Stop and free the cache cleaning timer, if there is one */
+static void cache_clean_timer_del(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    if (!state->cache_clean_timer) {
+        return;
+    }
+
+    timer_del(state->cache_clean_timer);
+    timer_free(state->cache_clean_timer);
+    /* Cleared so that a later call finds no timer and does nothing */
+    state->cache_clean_timer = NULL;
+}
+
+/* Drop the cache cleaning timer when @bs is detached from its AioContext */
+static void qcow2_detach_aio_context(BlockDriverState *bs)
+{
+    cache_clean_timer_del(bs);
+}
+
+/* Set up the cache cleaning timer again in @new_context */
+static void qcow2_attach_aio_context(BlockDriverState *bs,
+                                     AioContext *new_context)
+{
+    cache_clean_timer_init(bs, new_context);
+}
+
 static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
                              uint64_t *l2_cache_size,
                              uint64_t *refcount_cache_size, Error **errp)
@@ -555,6 +603,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
     const char *opt_overlap_check, *opt_overlap_check_template;
     int overlap_check_template = 0;
     uint64_t l2_cache_size, refcount_cache_size;
+    uint64_t cache_clean_interval;
 
     ret = bdrv_pread(bs->file, 0, &header, sizeof(header));
     if (ret < 0) {
@@ -848,6 +897,16 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
         goto fail;
     }
 
+    cache_clean_interval =
+        qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL, 0);
+    if (cache_clean_interval > UINT_MAX) {
+        error_setg(errp, "Cache clean interval too big");
+        ret = -EINVAL;
+        goto fail;
+    }
+    s->cache_clean_interval = cache_clean_interval;
+    cache_clean_timer_init(bs, bdrv_get_aio_context(bs));
+
     s->cluster_cache = g_malloc(s->cluster_size);
     /* one more sector for decompressed data alignment */
     s->cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS
@@ -1013,6 +1072,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
     qemu_vfree(s->l1_table);
     /* else pre-write overlap checks in cache_destroy may crash */
     s->l1_table = NULL;
+    cache_clean_timer_del(bs);
     if (s->l2_table_cache) {
         qcow2_cache_destroy(bs, s->l2_table_cache);
     }
@@ -1471,6 +1531,7 @@ static void qcow2_close(BlockDriverState *bs)
         }
     }
 
+    cache_clean_timer_del(bs);
     qcow2_cache_destroy(bs, s->l2_table_cache);
     qcow2_cache_destroy(bs, s->refcount_block_cache);
 
@@ -2977,6 +3038,9 @@ BlockDriver bdrv_qcow2 = {
     .create_opts         = &qcow2_create_opts,
     .bdrv_check          = qcow2_check,
     .bdrv_amend_options  = qcow2_amend_options,
+
+    .bdrv_detach_aio_context  = qcow2_detach_aio_context,
+    .bdrv_attach_aio_context  = qcow2_attach_aio_context,
 };
 
 static void bdrv_qcow2_init(void)
diff --git a/block/qcow2.h b/block/qcow2.h
index 72e132838a..71dafd6dc9 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -96,6 +96,7 @@
 #define QCOW2_OPT_CACHE_SIZE "cache-size"
 #define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size"
 #define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size"
+#define QCOW2_OPT_CACHE_CLEAN_INTERVAL "cache-clean-interval"
 
 typedef struct QCowHeader {
     uint32_t magic;
@@ -239,6 +240,8 @@ typedef struct BDRVQcowState {
 
     Qcow2Cache* l2_table_cache;
     Qcow2Cache* refcount_block_cache;
+    QEMUTimer *cache_clean_timer;
+    unsigned cache_clean_interval;
 
     uint8_t *cluster_cache;
     uint8_t *cluster_data;
@@ -581,6 +584,7 @@ int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c,
     Qcow2Cache *dependency);
 void qcow2_cache_depends_on_flush(Qcow2Cache *c);
 
+void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c);
 int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c);
 
 int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 7b2efb8678..bb2189ef3a 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1592,6 +1592,10 @@
 # @refcount-cache-size:   #optional the maximum size of the refcount block cache
 #                         in bytes (since 2.2)
 #
+# @cache-clean-interval:  #optional clean unused entries in the L2 and refcount
+#                         caches. The interval is in seconds. The default value
+#                         is 0 and it disables this feature (since 2.5)
+#
 # Since: 1.7
 ##
 { 'struct': 'BlockdevOptionsQcow2',
@@ -1603,7 +1607,8 @@
             '*overlap-check': 'Qcow2OverlapChecks',
             '*cache-size': 'int',
             '*l2-cache-size': 'int',
-            '*refcount-cache-size': 'int' } }
+            '*refcount-cache-size': 'int',
+            '*cache-clean-interval': 'int' } }
 
 
 ##

From 7f65ce834accce0b7e4bc79313bacf229b957783 Mon Sep 17 00:00:00 2001
From: Alberto Garcia <berto@igalia.com>
Date: Tue, 4 Aug 2015 15:14:41 +0300
Subject: [PATCH 3/5] docs: document how to configure the qcow2 L2/refcount
 caches

QEMU has options to configure the size of the L2 and refcount
caches for the qcow2 format. However, choosing the right sizes for
a particular disk image is not a straightforward operation since
the ratio between the cache size and the allocated disk space is
not obvious and depends on the size of the cluster and the refcount
entries.

This document attempts to give an overview of both caches and how to
configure their sizes.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Message-id: 55de928e139b1ba3f3d40fe9c6c88f30b1f36410.1438690126.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 docs/qcow2-cache.txt | 164 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 docs/qcow2-cache.txt

diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt
new file mode 100644
index 0000000000..5bb06072d3
--- /dev/null
+++ b/docs/qcow2-cache.txt
@@ -0,0 +1,164 @@
+qcow2 L2/refcount cache configuration
+=====================================
+Copyright (C) 2015 Igalia, S.L.
+Author: Alberto Garcia <berto@igalia.com>
+
+This work is licensed under the terms of the GNU GPL, version 2 or
+later. See the COPYING file in the top-level directory.
+
+Introduction
+------------
+The QEMU qcow2 driver has two caches that can improve the I/O
+performance significantly. However, setting the right cache sizes is
+not a straightforward operation.
+
+This document attempts to give an overview of the L2 and refcount
+caches, and how to configure them.
+
+Please refer to the docs/specs/qcow2.txt file for an in-depth
+technical description of the qcow2 file format.
+
+
+Clusters
+--------
+A qcow2 file is organized in units of constant size called clusters.
+
+The cluster size is configurable, but it must be a power of two and
+at least 512 bytes. QEMU currently defaults to 64 KB clusters, and
+it does not support sizes larger than 2MB.
+
+The 'qemu-img create' command supports specifying the size using the
+cluster_size option:
+
+   qemu-img create -f qcow2 -o cluster_size=128K hd.qcow2 4G
+
+
+The L2 tables
+-------------
+The qcow2 format uses a two-level structure to map the virtual disk as
+seen by the guest to the disk image in the host. These structures are
+called the L1 and L2 tables.
+
+There is one single L1 table per disk image. The table is small and is
+always kept in memory.
+
+There can be many L2 tables, depending on how much space has been
+allocated in the image. Each table is one cluster in size. In order to
+read or write data from the virtual disk, QEMU needs to read its
+corresponding L2 table to find out where that data is located. Since
+reading the table for each I/O operation can be expensive, QEMU keeps
+an L2 cache in memory to speed up disk access.
+
+The size of the L2 cache can be configured, and setting the right
+value can improve the I/O performance significantly.
+
+
+The refcount blocks
+-------------------
+The qcow2 format also maintains a reference count for each cluster.
+Reference counts are used for cluster allocation and internal
+snapshots. The data is stored in a two-level structure similar to the
+L1/L2 tables described above.
+
+The second-level structures are called refcount blocks. They are also
+one cluster in size, and their number is also variable and depends on
+the amount of allocated space.
+
+Each block contains a number of refcount entries. Their size (in bits)
+is a power of two and must not be higher than 64. It defaults to 16
+bits, but a different value can be set using the refcount_bits option:
+
+   qemu-img create -f qcow2 -o refcount_bits=8 hd.qcow2 4G
+
+QEMU keeps a refcount cache to speed up I/O much like the
+aforementioned L2 cache, and its size can also be configured.
+
+
+Choosing the right cache sizes
+------------------------------
+In order to choose the cache sizes we need to know how they relate to
+the amount of allocated space.
+
+The amount of virtual disk that can be mapped by the L2 and refcount
+caches (in bytes) is:
+
+   disk_size = l2_cache_size * cluster_size / 8
+   disk_size = refcount_cache_size * cluster_size * 8 / refcount_bits
+
+With the default values for cluster_size (64KB) and refcount_bits
+(16), that is
+
+   disk_size = l2_cache_size * 8192
+   disk_size = refcount_cache_size * 32768
+
+So in order to cover disk_size_GB gigabytes of disk space with the
+default values we need:
+
+   l2_cache_size = disk_size_GB * 131072
+   refcount_cache_size = disk_size_GB * 32768
+
+QEMU has a default L2 cache of 1MB (1048576 bytes) and a refcount
+cache of 256KB (262144 bytes), so using the formulas we've just seen
+we have
+
+   1048576 / 131072 = 8 GB of virtual disk covered by that cache
+    262144 /  32768 = 8 GB
+
+
+How to configure the cache sizes
+--------------------------------
+Cache sizes can be configured using the -drive option in the
+command-line, or the 'blockdev-add' QMP command.
+
+There are three options available, and all of them take bytes:
+
+"l2-cache-size":         maximum size of the L2 table cache
+"refcount-cache-size":   maximum size of the refcount block cache
+"cache-size":            maximum size of both caches combined
+
+There are two things that need to be taken into account:
+
+ - Both caches must have a size that is a multiple of the cluster
+   size.
+
+ - If you only set one of the options above, QEMU will automatically
+   adjust the others so that the L2 cache is 4 times bigger than the
+   refcount cache.
+
+This means that these options are equivalent:
+
+   -drive file=hd.qcow2,l2-cache-size=2097152
+   -drive file=hd.qcow2,refcount-cache-size=524288
+   -drive file=hd.qcow2,cache-size=2621440
+
+The reason for this 1/4 ratio is to ensure that both caches cover the
+same amount of disk space. Note however that this is only valid with
+the default value of refcount_bits (16). If you are using a different
+value you might want to calculate both cache sizes yourself since QEMU
+will always use the same 1/4 ratio.
+
+It's also worth mentioning that there's no strict need for both caches
+to cover the same amount of disk space. The refcount cache is used
+much less often than the L2 cache, so it's perfectly reasonable to
+keep it small.
+
+
+Reducing the memory usage
+-------------------------
+It is possible to clean unused cache entries in order to reduce the
+memory usage during periods of low I/O activity.
+
+The parameter "cache-clean-interval" defines an interval (in seconds).
+All cache entries that haven't been accessed during that interval are
+removed from memory.
+
+This example removes all unused cache entries every 15 minutes:
+
+   -drive file=hd.qcow2,cache-clean-interval=900
+
+If unset, the default value for this parameter is 0 and it disables
+this feature.
+
+Note that this functionality currently relies on the MADV_DONTNEED
+argument for madvise() to actually free the memory, so it is not
+useful in systems that don't follow that behavior.

From 909c260c71d1bee7018e17034580ffd0743508db Mon Sep 17 00:00:00 2001
From: Alberto Garcia <berto@igalia.com>
Date: Tue, 4 Aug 2015 15:14:42 +0300
Subject: [PATCH 4/5] qcow2: reorder fields in Qcow2CachedTable to reduce
 padding

Changing the current ordering saves 8 bytes per cache entry in x86_64.

Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Message-id: 0bd55291211df3dfb514d0e7d2031dd5c4f9f807.1438690126.git.berto@igalia.com
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index 8457458418..046f5b8e48 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -37,9 +37,9 @@
 
 typedef struct Qcow2CachedTable {
     int64_t  offset;
-    bool     dirty;
     uint64_t lru_counter;
     int      ref;
+    bool     dirty;
 } Qcow2CachedTable;
 
 struct Qcow2Cache {

From 834cb2ada5db197a11c99142d50222945d196fc0 Mon Sep 17 00:00:00 2001
From: Wen Congyang <wency@cn.fujitsu.com>
Date: Fri, 3 Jul 2015 14:45:06 +0800
Subject: [PATCH 5/5] quorum: validate vote threshold against num_children even
 if read-pattern is fifo

We need to use the threshold to check whether too many write operations
have failed. If the threshold is larger than the number of children, we
always get a write error event even if all write operations succeed.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Message-id: 55962F72.3060003@cn.fujitsu.com
Reviewed-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/quorum.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/quorum.c b/block/quorum.c
index 2f6c45f760..8fe53b4272 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -889,6 +889,12 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
+    /* and validate it against s->num_children */
+    ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
+    if (ret < 0) {
+        goto exit;
+    }
+
     ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN));
     if (ret < 0) {
         error_setg(&local_err, "Please set read-pattern as fifo or quorum");
@@ -897,12 +903,6 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
     s->read_pattern = ret;
 
     if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
-        /* and validate it against s->num_children */
-        ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
-        if (ret < 0) {
-            goto exit;
-        }
-
         /* is the driver in blkverify mode */
         if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
             s->num_children == 2 && s->threshold == 2) {