From a66670c79c5c7d530d818430ffcdaa25cbf2c2ab Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 30 Aug 2013 18:10:38 +1000
Subject: [PATCH 01/17] memory: fix 128 arithmetic in info mtree

mtree_print_mr() calls int128_get64() in 3 places but only 2 places
handle 2^64 correctly.

This fixes the third call of int128_get64().

Cc: qemu-stable@nongnu.org
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 memory.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/memory.c b/memory.c
index 5a10fd0bde..7f1f2661a5 100644
--- a/memory.c
+++ b/memory.c
@@ -1809,7 +1809,9 @@ static void mtree_print_mr(fprintf_function mon_printf, void *f,
                    mr->alias->name,
                    mr->alias_offset,
                    mr->alias_offset
-                   + (hwaddr)int128_get64(mr->size) - 1);
+                   + (int128_nz(mr->size) ?
+                      (hwaddr)int128_get64(int128_sub(mr->size,
+                                                      int128_one())) : 0));
     } else {
         mon_printf(f,
                    TARGET_FMT_plx "-" TARGET_FMT_plx " (prio %d, %c%c): %s\n",

From 518420dfec2f082cfecbc6eec79fcc91388cf751 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Thu, 2 May 2013 10:21:18 +0200
Subject: [PATCH 02/17] compatfd: switch to QemuThread

qemu_thread_create already does signal blocking and detaching for us.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 util/compatfd.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/util/compatfd.c b/util/compatfd.c
index 9cf3f2834d..430a41c855 100644
--- a/util/compatfd.c
+++ b/util/compatfd.c
@@ -15,9 +15,9 @@
 
 #include "qemu-common.h"
 #include "qemu/compatfd.h"
+#include "qemu/thread.h"
 
 #include <sys/syscall.h>
-#include <pthread.h>
 
 struct sigfd_compat_info
 {
@@ -28,10 +28,6 @@ struct sigfd_compat_info
 static void *sigwait_compat(void *opaque)
 {
     struct sigfd_compat_info *info = opaque;
-    sigset_t all;
-
-    sigfillset(&all);
-    pthread_sigmask(SIG_BLOCK, &all, NULL);
 
     while (1) {
         int sig;
@@ -71,9 +67,8 @@ static void *sigwait_compat(void *opaque)
 
 static int qemu_signalfd_compat(const sigset_t *mask)
 {
-    pthread_attr_t attr;
-    pthread_t tid;
     struct sigfd_compat_info *info;
+    QemuThread thread;
     int fds[2];
 
     info = malloc(sizeof(*info));
@@ -93,12 +88,7 @@ static int qemu_signalfd_compat(const sigset_t *mask)
     memcpy(&info->mask, mask, sizeof(*mask));
     info->fd = fds[1];
 
-    pthread_attr_init(&attr);
-    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
-
-    pthread_create(&tid, &attr, sigwait_compat, info);
-
-    pthread_attr_destroy(&attr);
+    qemu_thread_create(&thread, sigwait_compat, info, QEMU_THREAD_DETACHED);
 
     return fds[0];
 }

From c76bc480e2d70762d02373678942c98fb193b9e5 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 2 Jul 2013 20:22:37 +0200
Subject: [PATCH 03/17] portio: Allow to mark portio lists as coalesced MMIO
 flushing

This will enable us to remove all remaining explicit calls of
qemu_flush_coalesced_mmio_buffer in IO handlers.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/exec/ioport.h | 2 ++
 ioport.c              | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/include/exec/ioport.h b/include/exec/ioport.h
index b3848be684..3bd6722627 100644
--- a/include/exec/ioport.h
+++ b/include/exec/ioport.h
@@ -64,11 +64,13 @@ typedef struct PortioList {
     struct MemoryRegion **regions;
     void *opaque;
     const char *name;
+    bool flush_coalesced_mmio;
 } PortioList;
 
 void portio_list_init(PortioList *piolist, Object *owner,
                       const struct MemoryRegionPortio *callbacks,
                       void *opaque, const char *name);
+void portio_list_set_flush_coalesced(PortioList *piolist);
 void portio_list_destroy(PortioList *piolist);
 void portio_list_add(PortioList *piolist,
                      struct MemoryRegion *address_space,
diff --git a/ioport.c b/ioport.c
index 707cce88ab..3d91e79edc 100644
--- a/ioport.c
+++ b/ioport.c
@@ -139,6 +139,12 @@ void portio_list_init(PortioList *piolist,
     piolist->opaque = opaque;
     piolist->owner = owner;
     piolist->name = name;
+    piolist->flush_coalesced_mmio = false;
+}
+
+void portio_list_set_flush_coalesced(PortioList *piolist)
+{
+    piolist->flush_coalesced_mmio = true;
 }
 
 void portio_list_destroy(PortioList *piolist)
@@ -231,6 +237,9 @@ static void portio_list_add_1(PortioList *piolist,
      */
     memory_region_init_io(&mrpio->mr, piolist->owner, &portio_ops, mrpio,
                           piolist->name, off_high - off_low);
+    if (piolist->flush_coalesced_mmio) {
+        memory_region_set_flush_coalesced(&mrpio->mr);
+    }
     memory_region_add_subregion(piolist->address_space,
                                 start + off_low, &mrpio->mr);
     piolist->regions[piolist->nr] = &mrpio->mr;

From eb25a1d9d4e88f4dd41702e35199ce4bbd7d1cee Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 2 Jul 2013 21:19:02 +0200
Subject: [PATCH 04/17] cirrus: Mark vga io region as coalesced MMIO flushing

This allows to remove the explicit qemu_flush_coalesced_mmio_buffer
calls - the memory core will invoke them now.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/display/cirrus_vga.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/display/cirrus_vga.c b/hw/display/cirrus_vga.c
index dbd1f4a47b..e4c345fa82 100644
--- a/hw/display/cirrus_vga.c
+++ b/hw/display/cirrus_vga.c
@@ -2447,7 +2447,6 @@ static uint64_t cirrus_vga_ioport_read(void *opaque, hwaddr addr,
     VGACommonState *s = &c->vga;
     int val, index;
 
-    qemu_flush_coalesced_mmio_buffer();
     addr += 0x3b0;
 
     if (vga_ioport_invalid(s, addr)) {
@@ -2544,7 +2543,6 @@ static void cirrus_vga_ioport_write(void *opaque, hwaddr addr, uint64_t val,
     VGACommonState *s = &c->vga;
     int index;
 
-    qemu_flush_coalesced_mmio_buffer();
     addr += 0x3b0;
 
     /* check port range access depending on color/monochrome mode */
@@ -2843,6 +2841,7 @@ static void cirrus_init_common(CirrusVGAState *s, Object *owner,
     /* Register ioport 0x3b0 - 0x3df */
     memory_region_init_io(&s->cirrus_vga_io, owner, &cirrus_vga_io_ops, s,
                           "cirrus-io", 0x30);
+    memory_region_set_flush_coalesced(&s->cirrus_vga_io);
     memory_region_add_subregion(system_io, 0x3b0, &s->cirrus_vga_io);
 
     memory_region_init(&s->low_mem_container, owner,

From c46860ea53854a96b11af0d6e23b623ce199e95e Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 2 Jul 2013 21:37:40 +0200
Subject: [PATCH 05/17] vga: Mark relevant portio lists regions as coalesced
 MMIO flushing

This allows to remove the explicit qemu_flush_coalesced_mmio_buffer
calls.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/display/qxl.c | 1 +
 hw/display/vga.c | 5 +----
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/hw/display/qxl.c b/hw/display/qxl.c
index ee2db0da1a..3051006765 100644
--- a/hw/display/qxl.c
+++ b/hw/display/qxl.c
@@ -2074,6 +2074,7 @@ static int qxl_init_primary(PCIDevice *dev)
              pci_address_space(dev), pci_address_space_io(dev), false);
     portio_list_init(qxl_vga_port_list, OBJECT(dev), qxl_vga_portio_list,
                      vga, "vga");
+    portio_list_set_flush_coalesced(qxl_vga_port_list);
     portio_list_add(qxl_vga_port_list, pci_address_space_io(dev), 0x3b0);
 
     vga->con = graphic_console_init(DEVICE(dev), &qxl_ops, qxl);
diff --git a/hw/display/vga.c b/hw/display/vga.c
index 7b91d9c54e..b5e22849ab 100644
--- a/hw/display/vga.c
+++ b/hw/display/vga.c
@@ -359,8 +359,6 @@ uint32_t vga_ioport_read(void *opaque, uint32_t addr)
     VGACommonState *s = opaque;
     int val, index;
 
-    qemu_flush_coalesced_mmio_buffer();
-
     if (vga_ioport_invalid(s, addr)) {
         val = 0xff;
     } else {
@@ -453,8 +451,6 @@ void vga_ioport_write(void *opaque, uint32_t addr, uint32_t val)
     VGACommonState *s = opaque;
     int index;
 
-    qemu_flush_coalesced_mmio_buffer();
-
     /* check port range access depending on color/monochrome mode */
     if (vga_ioport_invalid(s, addr)) {
         return;
@@ -2373,6 +2369,7 @@ void vga_init(VGACommonState *s, Object *obj, MemoryRegion *address_space,
     memory_region_set_coalescing(vga_io_memory);
     if (init_vga_ports) {
         portio_list_init(vga_port_list, obj, vga_ports, s, "vga");
+        portio_list_set_flush_coalesced(vga_port_list);
         portio_list_add(vga_port_list, address_space_io, 0x3b0);
     }
     if (vbe_ports) {

From ea753d81e8b085d679f13e4a6023e003e9854d51 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 25 Sep 2013 14:20:57 +0800
Subject: [PATCH 06/17] seqlock: introduce read-write seqlock

Seqlock implementation for QEMU. Usage idiom

reader:
    do {
        start = seqlock_read_begin(&sl);
        ...
    } while (seqlock_read_retry(&sl, start));

writer:
    seqlock_write_lock(&sl);
    ...
    seqlock_write_unlock(&sl);

initialization:
    seqlock_init(QemuSeqLock *sl, QemuMutex *mutex)

    mutex could be NULL if the caller will provide its own protection
    for concurrent write sides (typically using the BQL).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/qemu/seqlock.h | 72 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 include/qemu/seqlock.h

diff --git a/include/qemu/seqlock.h b/include/qemu/seqlock.h
new file mode 100644
index 0000000000..3ff118a1a1
--- /dev/null
+++ b/include/qemu/seqlock.h
@@ -0,0 +1,72 @@
+/*
+ * Seqlock implementation for QEMU
+ *
+ * Copyright Red Hat, Inc. 2013
+ *
+ * Author:
+ *  Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#ifndef QEMU_SEQLOCK_H
+#define QEMU_SEQLOCK_H 1
+
+#include <qemu/atomic.h>
+#include <qemu/thread.h>
+
+typedef struct QemuSeqLock QemuSeqLock;
+
+struct QemuSeqLock {
+    QemuMutex *mutex;
+    unsigned sequence;
+};
+
+static inline void seqlock_init(QemuSeqLock *sl, QemuMutex *mutex)
+{
+    sl->mutex = mutex;
+    sl->sequence = 0;
+}
+
+/* Lock out other writers and update the count.  */
+static inline void seqlock_write_lock(QemuSeqLock *sl)
+{
+    if (sl->mutex) {
+        qemu_mutex_lock(sl->mutex);
+    }
+    ++sl->sequence;
+
+    /* Write sequence before updating other fields.  */
+    smp_wmb();
+}
+
+static inline void seqlock_write_unlock(QemuSeqLock *sl)
+{
+    /* Write other fields before finalizing sequence.  */
+    smp_wmb();
+
+    ++sl->sequence;
+    if (sl->mutex) {
+        qemu_mutex_unlock(sl->mutex);
+    }
+}
+
+static inline unsigned seqlock_read_begin(QemuSeqLock *sl)
+{
+    /* Always fail if a write is in progress.  */
+    unsigned ret = sl->sequence & ~1;
+
+    /* Read sequence before reading other fields.  */
+    smp_rmb();
+    return ret;
+}
+
+static int seqlock_read_retry(const QemuSeqLock *sl, unsigned start)
+{
+    /* Read other fields before reading final sequence.  */
+    smp_rmb();
+    return unlikely(sl->sequence != start);
+}
+
+#endif

From cb365646a942ed58aae053064b2048a415337ba2 Mon Sep 17 00:00:00 2001
From: Liu Ping Fan <qemulist@gmail.com>
Date: Wed, 25 Sep 2013 14:20:58 +0800
Subject: [PATCH 07/17] timer: protect timers_state's clock with seqlock

QEMU_CLOCK_VIRTUAL may be read outside BQL. This will make its
foundation, i.e. cpu_clock_offset exposed to race condition.
Using private lock to protect it.

After this patch, reading QEMU_CLOCK_VIRTUAL is thread safe
unless use_icount is true, in which case the existing callers
still rely on the BQL.

Lock rule: private lock innermost, ie BQL->"this lock"

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 cpus.c               | 53 ++++++++++++++++++++++++++++++++++++--------
 include/qemu/timer.h |  2 ++
 2 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/cpus.c b/cpus.c
index e566297bd3..f07533530c 100644
--- a/cpus.c
+++ b/cpus.c
@@ -37,6 +37,7 @@
 #include "sysemu/qtest.h"
 #include "qemu/main-loop.h"
 #include "qemu/bitmap.h"
+#include "qemu/seqlock.h"
 
 #ifndef _WIN32
 #include "qemu/compatfd.h"
@@ -110,8 +111,14 @@ static int64_t vm_clock_warp_start;
 static int64_t qemu_icount;
 
 typedef struct TimersState {
+    /* Protected by BQL.  */
     int64_t cpu_ticks_prev;
     int64_t cpu_ticks_offset;
+
+    /* cpu_clock_offset can be read out of BQL, so protect it with
+     * this lock.
+     */
+    QemuSeqLock vm_clock_seqlock;
     int64_t cpu_clock_offset;
     int32_t cpu_ticks_enabled;
     int64_t dummy;
@@ -137,6 +144,7 @@ int64_t cpu_get_icount(void)
 }
 
 /* return the host CPU cycle counter and handle stop/restart */
+/* Caller must hold the BQL */
 int64_t cpu_get_ticks(void)
 {
     if (use_icount) {
@@ -157,37 +165,63 @@ int64_t cpu_get_ticks(void)
     }
 }
 
+static int64_t cpu_get_clock_locked(void)
+{
+    int64_t ti;
+
+    if (!timers_state.cpu_ticks_enabled) {
+        ti = timers_state.cpu_clock_offset;
+    } else {
+        ti = get_clock();
+        ti += timers_state.cpu_clock_offset;
+    }
+
+    return ti;
+}
+
 /* return the host CPU monotonic timer and handle stop/restart */
 int64_t cpu_get_clock(void)
 {
     int64_t ti;
-    if (!timers_state.cpu_ticks_enabled) {
-        return timers_state.cpu_clock_offset;
-    } else {
-        ti = get_clock();
-        return ti + timers_state.cpu_clock_offset;
-    }
+    unsigned start;
+
+    do {
+        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
+        ti = cpu_get_clock_locked();
+    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
+
+    return ti;
 }
 
-/* enable cpu_get_ticks() */
+/* enable cpu_get_ticks()
+ * Caller must hold BQL which server as mutex for vm_clock_seqlock.
+ */
 void cpu_enable_ticks(void)
 {
+    /* Here, the really thing protected by seqlock is cpu_clock_offset. */
+    seqlock_write_lock(&timers_state.vm_clock_seqlock);
     if (!timers_state.cpu_ticks_enabled) {
         timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
         timers_state.cpu_clock_offset -= get_clock();
         timers_state.cpu_ticks_enabled = 1;
     }
+    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 }
 
 /* disable cpu_get_ticks() : the clock is stopped. You must not call
-   cpu_get_ticks() after that.  */
+ * cpu_get_ticks() after that.
+ * Caller must hold BQL which server as mutex for vm_clock_seqlock.
+ */
 void cpu_disable_ticks(void)
 {
+    /* Here, the really thing protected by seqlock is cpu_clock_offset. */
+    seqlock_write_lock(&timers_state.vm_clock_seqlock);
     if (timers_state.cpu_ticks_enabled) {
         timers_state.cpu_ticks_offset = cpu_get_ticks();
-        timers_state.cpu_clock_offset = cpu_get_clock();
+        timers_state.cpu_clock_offset = cpu_get_clock_locked();
         timers_state.cpu_ticks_enabled = 0;
     }
+    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 }
 
 /* Correlation between real and virtual time is always going to be
@@ -371,6 +405,7 @@ static const VMStateDescription vmstate_timers = {
 
 void configure_icount(const char *option)
 {
+    seqlock_init(&timers_state.vm_clock_seqlock, NULL);
     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
     if (!option) {
         return;
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index b58903bef5..016e29ae95 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -653,7 +653,9 @@ static inline int64_t qemu_soonest_timeout(int64_t timeout1, int64_t timeout2)
 void init_clocks(void);
 
 int64_t cpu_get_ticks(void);
+/* Caller must hold BQL */
 void cpu_enable_ticks(void);
+/* Caller must hold BQL */
 void cpu_disable_ticks(void);
 
 static inline int64_t get_ticks_per_sec(void)

From c7c4d063f50f0de980d99f02e055722227d703bc Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 25 Sep 2013 14:20:59 +0800
Subject: [PATCH 08/17] qemu-thread: add QemuEvent

This emulates Win32 manual-reset events using futexes or conditional
variables.  Typical ways to use them are with multi-producer,
single-consumer data structures, to test for a complex condition whose
elements come from different threads:

    for (;;) {
        qemu_event_reset(ev);
        ... test complex condition ...
        if (condition is true) {
            break;
        }
        qemu_event_wait(ev);
    }

Or more efficiently (but with some duplication):

    ... evaluate condition ...
    while (!condition) {
        qemu_event_reset(ev);
        ... evaluate condition ...
        if (!condition) {
            qemu_event_wait(ev);
            ... evaluate condition ...
        }
    }

QemuEvent provides a very fast userspace path in the common case when
no other thread is waiting, or the event is not changing state.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/qemu/thread-posix.h |   8 +++
 include/qemu/thread-win32.h |   4 ++
 include/qemu/thread.h       |   7 +++
 util/qemu-thread-posix.c    | 116 ++++++++++++++++++++++++++++++++++++
 util/qemu-thread-win32.c    |  26 ++++++++
 5 files changed, 161 insertions(+)

diff --git a/include/qemu/thread-posix.h b/include/qemu/thread-posix.h
index 361566abc4..eb5c7a1da1 100644
--- a/include/qemu/thread-posix.h
+++ b/include/qemu/thread-posix.h
@@ -21,6 +21,14 @@ struct QemuSemaphore {
 #endif
 };
 
+struct QemuEvent {
+#ifndef __linux__
+    pthread_mutex_t lock;
+    pthread_cond_t cond;
+#endif
+    unsigned value;
+};
+
 struct QemuThread {
     pthread_t thread;
 };
diff --git a/include/qemu/thread-win32.h b/include/qemu/thread-win32.h
index 13adb958f0..3d58081bed 100644
--- a/include/qemu/thread-win32.h
+++ b/include/qemu/thread-win32.h
@@ -17,6 +17,10 @@ struct QemuSemaphore {
     HANDLE sema;
 };
 
+struct QemuEvent {
+    HANDLE event;
+};
+
 typedef struct QemuThreadData QemuThreadData;
 struct QemuThread {
     QemuThreadData *data;
diff --git a/include/qemu/thread.h b/include/qemu/thread.h
index c02404b9fb..3e32c6531c 100644
--- a/include/qemu/thread.h
+++ b/include/qemu/thread.h
@@ -7,6 +7,7 @@
 typedef struct QemuMutex QemuMutex;
 typedef struct QemuCond QemuCond;
 typedef struct QemuSemaphore QemuSemaphore;
+typedef struct QemuEvent QemuEvent;
 typedef struct QemuThread QemuThread;
 
 #ifdef _WIN32
@@ -45,6 +46,12 @@ void qemu_sem_wait(QemuSemaphore *sem);
 int qemu_sem_timedwait(QemuSemaphore *sem, int ms);
 void qemu_sem_destroy(QemuSemaphore *sem);
 
+void qemu_event_init(QemuEvent *ev, bool init);
+void qemu_event_set(QemuEvent *ev);
+void qemu_event_reset(QemuEvent *ev);
+void qemu_event_wait(QemuEvent *ev);
+void qemu_event_destroy(QemuEvent *ev);
+
 void qemu_thread_create(QemuThread *thread,
                         void *(*start_routine)(void *),
                         void *arg, int mode);
diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
index 4de133e7b2..37dd298631 100644
--- a/util/qemu-thread-posix.c
+++ b/util/qemu-thread-posix.c
@@ -20,7 +20,12 @@
 #include <limits.h>
 #include <unistd.h>
 #include <sys/time.h>
+#ifdef __linux__
+#include <sys/syscall.h>
+#include <linux/futex.h>
+#endif
 #include "qemu/thread.h"
+#include "qemu/atomic.h"
 
 static void error_exit(int err, const char *msg)
 {
@@ -272,6 +277,117 @@ void qemu_sem_wait(QemuSemaphore *sem)
 #endif
 }
 
+#ifdef __linux__
+#define futex(...)              syscall(__NR_futex, __VA_ARGS__)
+
+static inline void futex_wake(QemuEvent *ev, int n)
+{
+    futex(ev, FUTEX_WAKE, n, NULL, NULL, 0);
+}
+
+static inline void futex_wait(QemuEvent *ev, unsigned val)
+{
+    futex(ev, FUTEX_WAIT, (int) val, NULL, NULL, 0);
+}
+#else
+static inline void futex_wake(QemuEvent *ev, int n)
+{
+    if (n == 1) {
+        pthread_cond_signal(&ev->cond);
+    } else {
+        pthread_cond_broadcast(&ev->cond);
+    }
+}
+
+static inline void futex_wait(QemuEvent *ev, unsigned val)
+{
+    pthread_mutex_lock(&ev->lock);
+    if (ev->value == val) {
+        pthread_cond_wait(&ev->cond, &ev->lock);
+    }
+    pthread_mutex_unlock(&ev->lock);
+}
+#endif
+
+/* Valid transitions:
+ * - free->set, when setting the event
+ * - busy->set, when setting the event, followed by futex_wake
+ * - set->free, when resetting the event
+ * - free->busy, when waiting
+ *
+ * set->busy does not happen (it can be observed from the outside but
+ * it really is set->free->busy).
+ *
+ * busy->free provably cannot happen; to enforce it, the set->free transition
+ * is done with an OR, which becomes a no-op if the event has concurrently
+ * transitioned to free or busy.
+ */
+
+#define EV_SET         0
+#define EV_FREE        1
+#define EV_BUSY       -1
+
+void qemu_event_init(QemuEvent *ev, bool init)
+{
+#ifndef __linux__
+    pthread_mutex_init(&ev->lock, NULL);
+    pthread_cond_init(&ev->cond, NULL);
+#endif
+
+    ev->value = (init ? EV_SET : EV_FREE);
+}
+
+void qemu_event_destroy(QemuEvent *ev)
+{
+#ifndef __linux__
+    pthread_mutex_destroy(&ev->lock);
+    pthread_cond_destroy(&ev->cond);
+#endif
+}
+
+void qemu_event_set(QemuEvent *ev)
+{
+    if (atomic_mb_read(&ev->value) != EV_SET) {
+        if (atomic_xchg(&ev->value, EV_SET) == EV_BUSY) {
+            /* There were waiters, wake them up.  */
+            futex_wake(ev, INT_MAX);
+        }
+    }
+}
+
+void qemu_event_reset(QemuEvent *ev)
+{
+    if (atomic_mb_read(&ev->value) == EV_SET) {
+        /*
+         * If there was a concurrent reset (or even reset+wait),
+         * do nothing.  Otherwise change EV_SET->EV_FREE.
+         */
+        atomic_or(&ev->value, EV_FREE);
+    }
+}
+
+void qemu_event_wait(QemuEvent *ev)
+{
+    unsigned value;
+
+    value = atomic_mb_read(&ev->value);
+    if (value != EV_SET) {
+        if (value == EV_FREE) {
+            /*
+             * Leave the event reset and tell qemu_event_set that there
+             * are waiters.  No need to retry, because there cannot be
+             * a concurent busy->free transition.  After the CAS, the
+             * event will be either set or busy.
+             */
+            if (atomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) {
+                return;
+            }
+        }
+        futex_wait(ev, EV_BUSY);
+    }
+}
+
+
 void qemu_thread_create(QemuThread *thread,
                        void *(*start_routine)(void*),
                        void *arg, int mode)
diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c
index 517878dcc1..27a5217769 100644
--- a/util/qemu-thread-win32.c
+++ b/util/qemu-thread-win32.c
@@ -227,6 +227,32 @@ void qemu_sem_wait(QemuSemaphore *sem)
     }
 }
 
+void qemu_event_init(QemuEvent *ev, bool init)
+{
+    /* Manual reset.  */
+    ev->event = CreateEvent(NULL, TRUE, init, NULL);
+}
+
+void qemu_event_destroy(QemuEvent *ev)
+{
+    CloseHandle(ev->event);
+}
+
+void qemu_event_set(QemuEvent *ev)
+{
+    SetEvent(ev->event);
+}
+
+void qemu_event_reset(QemuEvent *ev)
+{
+    ResetEvent(ev->event);
+}
+
+void qemu_event_wait(QemuEvent *ev)
+{
+    WaitForSingleObject(ev->event, INFINITE);
+}
+
 struct QemuThreadData {
     /* Passed to win32_start_routine.  */
     void             *(*start_routine)(void *);

From 3c05341157f4d08dc3cc8ffa675a0aaa4818d028 Mon Sep 17 00:00:00 2001
From: Liu Ping Fan <qemulist@gmail.com>
Date: Wed, 25 Sep 2013 14:21:00 +0800
Subject: [PATCH 09/17] timer: make qemu_clock_enable sync between disable and
 timer's cb

After disabling the QemuClock, we should make sure that no QemuTimers
are still in flight. To implement that with light overhead, we resort
to QemuEvent. The caller of disabling will wait on QemuEvent of each
timerlist.

Note, qemu_clock_enable(foo,false) can _not_ be called from timer's cb.
Also, the callers of qemu_clock_enable() should be protected by the BQL.

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/qemu/timer.h |  6 ++++++
 qemu-timer.c         | 23 ++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 016e29ae95..1254ef729e 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -189,6 +189,12 @@ void qemu_clock_notify(QEMUClockType type);
  * @enabled: true to enable, false to disable
  *
  * Enable or disable a clock
+ * Disabling the clock will wait for related timerlists to stop
+ * executing qemu_run_timers.  Thus, this functions should not
+ * be used from the callback of a timer that is based on @clock.
+ * Doing so would cause a deadlock.
+ *
+ * Caller should hold BQL.
  */
 void qemu_clock_enable(QEMUClockType type, bool enabled);
 
diff --git a/qemu-timer.c b/qemu-timer.c
index 6b62e88669..2b533da2c7 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -45,6 +45,7 @@
 /* timers */
 
 typedef struct QEMUClock {
+    /* We rely on BQL to protect the timerlists */
     QLIST_HEAD(, QEMUTimerList) timerlists;
 
     NotifierList reset_notifiers;
@@ -71,6 +72,9 @@ struct QEMUTimerList {
     QLIST_ENTRY(QEMUTimerList) list;
     QEMUTimerListNotifyCB *notify_cb;
     void *notify_opaque;
+
+    /* lightweight method to mark the end of timerlist's running */
+    QemuEvent timers_done_ev;
 };
 
 /**
@@ -99,6 +103,7 @@ QEMUTimerList *timerlist_new(QEMUClockType type,
     QEMUClock *clock = qemu_clock_ptr(type);
 
     timer_list = g_malloc0(sizeof(QEMUTimerList));
+    qemu_event_init(&timer_list->timers_done_ev, false);
     timer_list->clock = clock;
     timer_list->notify_cb = cb;
     timer_list->notify_opaque = opaque;
@@ -143,13 +148,25 @@ void qemu_clock_notify(QEMUClockType type)
     }
 }
 
+/* Disabling the clock will wait for related timerlists to stop
+ * executing qemu_run_timers.  Thus, this functions should not
+ * be used from the callback of a timer that is based on @clock.
+ * Doing so would cause a deadlock.
+ *
+ * Caller should hold BQL.
+ */
 void qemu_clock_enable(QEMUClockType type, bool enabled)
 {
     QEMUClock *clock = qemu_clock_ptr(type);
+    QEMUTimerList *tl;
     bool old = clock->enabled;
     clock->enabled = enabled;
     if (enabled && !old) {
         qemu_clock_notify(type);
+    } else if (!enabled && old) {
+        QLIST_FOREACH(tl, &clock->timerlists, list) {
+            qemu_event_wait(&tl->timers_done_ev);
+        }
     }
 }
 
@@ -403,8 +420,9 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
     QEMUTimerCB *cb;
     void *opaque;
 
+    qemu_event_reset(&timer_list->timers_done_ev);
     if (!timer_list->clock->enabled) {
-        return progress;
+        goto out;
     }
 
     current_time = qemu_clock_get_ns(timer_list->clock->type);
@@ -428,6 +446,9 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
         cb(opaque);
         progress = true;
     }
+
+out:
+    qemu_event_set(&timer_list->timers_done_ev);
     return progress;
 }
 

From 0f809e5fbebb36788aea3523be7f93c04f2c7f8c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 3 Oct 2013 15:06:39 +0200
Subject: [PATCH 10/17] timer: extract timer_mod_ns_locked and timerlist_rearm

These will be reused in timer_mod_anticipate functions.

Reviewed-by: Alex Bligh <alex@alex.org.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 qemu-timer.c | 51 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/qemu-timer.c b/qemu-timer.c
index 2b533da2c7..0305ad5313 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -355,6 +355,34 @@ static void timer_del_locked(QEMUTimerList *timer_list, QEMUTimer *ts)
     }
 }
 
+static bool timer_mod_ns_locked(QEMUTimerList *timer_list,
+                                QEMUTimer *ts, int64_t expire_time)
+{
+    QEMUTimer **pt, *t;
+
+    /* add the timer in the sorted list */
+    pt = &timer_list->active_timers;
+    for (;;) {
+        t = *pt;
+        if (!timer_expired_ns(t, expire_time)) {
+            break;
+        }
+        pt = &t->next;
+    }
+    ts->expire_time = MAX(expire_time, 0);
+    ts->next = *pt;
+    *pt = ts;
+
+    return pt == &timer_list->active_timers;
+}
+
+static void timerlist_rearm(QEMUTimerList *timer_list)
+{
+    /* Interrupt execution to force deadline recalculation.  */
+    qemu_clock_warp(timer_list->clock->type);
+    timerlist_notify(timer_list);
+}
+
 /* stop a timer, but do not dealloc it */
 void timer_del(QEMUTimer *ts)
 {
@@ -370,30 +398,15 @@ void timer_del(QEMUTimer *ts)
 void timer_mod_ns(QEMUTimer *ts, int64_t expire_time)
 {
     QEMUTimerList *timer_list = ts->timer_list;
-    QEMUTimer **pt, *t;
+    bool rearm;
 
     qemu_mutex_lock(&timer_list->active_timers_lock);
     timer_del_locked(timer_list, ts);
-
-    /* add the timer in the sorted list */
-    pt = &timer_list->active_timers;
-    for(;;) {
-        t = *pt;
-        if (!timer_expired_ns(t, expire_time)) {
-            break;
-        }
-        pt = &t->next;
-    }
-    ts->expire_time = MAX(expire_time, 0);
-    ts->next = *pt;
-    *pt = ts;
+    rearm = timer_mod_ns_locked(timer_list, ts, expire_time);
     qemu_mutex_unlock(&timer_list->active_timers_lock);
 
-    /* Rearm if necessary  */
-    if (pt == &timer_list->active_timers) {
-        /* Interrupt execution to force deadline recalculation.  */
-        qemu_clock_warp(timer_list->clock->type);
-        timerlist_notify(timer_list);
+    if (rearm) {
+        timerlist_rearm(timer_list);
     }
 }
 

From add40e9777de139fb317ca6b1fb0dc142601cfcd Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 3 Oct 2013 15:11:43 +0200
Subject: [PATCH 11/17] timer: add timer_mod_anticipate and
 timer_mod_anticipate_ns

These let a user anticipate the deadline of a timer, atomically with
other sites that call the function.  This helps avoiding complicated
lock hierarchies.

Reviewed-by: Alex Bligh <alex@alex.org.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/qemu/timer.h | 26 ++++++++++++++++++++++++++
 qemu-timer.c         | 29 +++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 1254ef729e..5afcffc3f9 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -544,6 +544,19 @@ void timer_del(QEMUTimer *ts);
  */
 void timer_mod_ns(QEMUTimer *ts, int64_t expire_time);
 
+/**
+ * timer_mod_anticipate_ns:
+ * @ts: the timer
+ * @expire_time: the expiry time in nanoseconds
+ *
+ * Modify a timer to expire at @expire_time or the current time,
+ * whichever comes earlier.
+ *
+ * This function is thread-safe but the timer and its timer list must not be
+ * freed while this function is running.
+ */
+void timer_mod_anticipate_ns(QEMUTimer *ts, int64_t expire_time);
+
 /**
  * timer_mod:
  * @ts: the timer
@@ -557,6 +570,19 @@ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time);
  */
 void timer_mod(QEMUTimer *ts, int64_t expire_timer);
 
+/**
+ * timer_mod_anticipate:
+ * @ts: the timer
+ * @expire_time: the expiry time in nanoseconds
+ *
+ * Modify a timer to expire at @expire_time or the current time, whichever
+ * comes earlier, taking into account the scale associated with the timer.
+ *
+ * This function is thread-safe but the timer and its timer list must not be
+ * freed while this function is running.
+ */
+void timer_mod_anticipate(QEMUTimer *ts, int64_t expire_time);
+
 /**
  * timer_pending:
  * @ts: the timer
diff --git a/qemu-timer.c b/qemu-timer.c
index 0305ad5313..e15ce477cc 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -410,11 +410,40 @@ void timer_mod_ns(QEMUTimer *ts, int64_t expire_time)
     }
 }
 
+/* modify the current timer so that it will be fired when current_time
+   >= expire_time or the current deadline, whichever comes earlier.
+   The corresponding callback will be called. */
+void timer_mod_anticipate_ns(QEMUTimer *ts, int64_t expire_time)
+{
+    QEMUTimerList *timer_list = ts->timer_list;
+    bool rearm;
+
+    qemu_mutex_lock(&timer_list->active_timers_lock);
+    if (ts->expire_time == -1 || ts->expire_time > expire_time) {
+        if (ts->expire_time != -1) {
+            timer_del_locked(timer_list, ts);
+        }
+        rearm = timer_mod_ns_locked(timer_list, ts, expire_time);
+    } else {
+        rearm = false;
+    }
+    qemu_mutex_unlock(&timer_list->active_timers_lock);
+
+    if (rearm) {
+        timerlist_rearm(timer_list);
+    }
+}
+
 void timer_mod(QEMUTimer *ts, int64_t expire_time)
 {
     timer_mod_ns(ts, expire_time * ts->scale);
 }
 
+void timer_mod_anticipate(QEMUTimer *ts, int64_t expire_time)
+{
+    timer_mod_anticipate_ns(ts, expire_time * ts->scale);
+}
+
 bool timer_pending(QEMUTimer *ts)
 {
     return ts->expire_time >= 0;

From 468cc7cf3b85dd20a833773e6bde9f720f2df677 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 7 Oct 2013 17:21:51 +0200
Subject: [PATCH 12/17] icount: use cpu_get_icount() directly

This will help later when we will have to place these calls in
a critical section, and thus call a version of cpu_get_icount()
that does not take the lock.

Reviewed-by: Alex Bligh <alex@alex.org.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 cpus.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/cpus.c b/cpus.c
index f07533530c..a2d09f363f 100644
--- a/cpus.c
+++ b/cpus.c
@@ -236,12 +236,15 @@ static void icount_adjust(void)
     int64_t cur_icount;
     int64_t delta;
     static int64_t last_delta;
+
     /* If the VM is not running, then do nothing.  */
     if (!runstate_is_running()) {
         return;
     }
+
     cur_time = cpu_get_clock();
-    cur_icount = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+    cur_icount = cpu_get_icount();
+
     delta = cur_icount - cur_time;
     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
     if (delta > 0
@@ -297,7 +300,7 @@ static void icount_warp_rt(void *opaque)
              * far ahead of real time.
              */
             int64_t cur_time = cpu_get_clock();
-            int64_t cur_icount = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+            int64_t cur_icount = cpu_get_icount();
             int64_t delta = cur_time - cur_icount;
             qemu_icount_bias += MIN(warp_delta, delta);
         }

From 8ed961d95708ee6cadac22fba7762724d533a5b4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 7 Oct 2013 17:26:07 +0200
Subject: [PATCH 13/17] icount: reorganize icount_warp_rt

To prepare for future code changes, move the increment of qemu_icount_bias
outside the "if" statement.

Also, hoist outside the if the check for timers that expired due to the
"warping".  The check is redundant when !runstate_is_running(), but
doing it this way helps because the code that increments qemu_icount_bias
will be a critical section.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 cpus.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/cpus.c b/cpus.c
index a2d09f363f..bc365b7ed5 100644
--- a/cpus.c
+++ b/cpus.c
@@ -291,10 +291,10 @@ static void icount_warp_rt(void *opaque)
 
     if (runstate_is_running()) {
         int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-        int64_t warp_delta = clock - vm_clock_warp_start;
-        if (use_icount == 1) {
-            qemu_icount_bias += warp_delta;
-        } else {
+        int64_t warp_delta;
+
+        warp_delta = clock - vm_clock_warp_start;
+        if (use_icount == 2) {
             /*
              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
              * far ahead of real time.
@@ -302,13 +302,15 @@ static void icount_warp_rt(void *opaque)
             int64_t cur_time = cpu_get_clock();
             int64_t cur_icount = cpu_get_icount();
             int64_t delta = cur_time - cur_icount;
-            qemu_icount_bias += MIN(warp_delta, delta);
-        }
-        if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
-            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
+            warp_delta = MIN(warp_delta, delta);
         }
+        qemu_icount_bias += warp_delta;
     }
     vm_clock_warp_start = -1;
+
+    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
+        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
+    }
 }
 
 void qtest_clock_warp(int64_t dest)

From ce78d18ced118b03e821135e702ba1d513c8b2a7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 7 Oct 2013 17:30:02 +0200
Subject: [PATCH 14/17] icount: prepare the code for future races in calling
 qemu_clock_warp

Computing the deadline of all vm_clocks is somewhat expensive and calls
out to qemu-timer.c; two reasons not to do it in the seqlock's write-side
critical section.  This however opens the door for races in setting and
reading vm_clock_warp_start.

To plug them, we need to cover the case where a new deadline slips in
between the call to qemu_clock_deadline_ns_all and the actual modification
of the icount_warp_timer.  Restrict changes to vm_clock_warp_start and
the icount_warp_timer's expiration time, to only move them back (which
would simply cause an early wakeup).

If a vm_clock timer is cancelled while CPUs are idle, this might cause the
icount_warp_timer to fire unnecessarily.  This is not a problem, after it
fires the timer becomes inactive and the next call to timer_mod_anticipate
will be precise.

In addition to this, we must deactivate the icount_warp_timer _before_
checking whether CPUs are idle.  This way, if the "last" CPU becomes idle
during the call to timer_del we will still set up the icount_warp_timer.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 cpus.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/cpus.c b/cpus.c
index bc365b7ed5..34d5e043a9 100644
--- a/cpus.c
+++ b/cpus.c
@@ -329,6 +329,7 @@ void qtest_clock_warp(int64_t dest)
 
 void qemu_clock_warp(QEMUClockType type)
 {
+    int64_t clock;
     int64_t deadline;
 
     /*
@@ -348,8 +349,8 @@ void qemu_clock_warp(QEMUClockType type)
      * the earliest QEMU_CLOCK_VIRTUAL timer.
      */
     icount_warp_rt(NULL);
-    if (!all_cpu_threads_idle() || !qemu_clock_has_timers(QEMU_CLOCK_VIRTUAL)) {
-        timer_del(icount_warp_timer);
+    timer_del(icount_warp_timer);
+    if (!all_cpu_threads_idle()) {
         return;
     }
 
@@ -358,17 +359,11 @@ void qemu_clock_warp(QEMUClockType type)
 	return;
     }
 
-    vm_clock_warp_start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     /* We want to use the earliest deadline from ALL vm_clocks */
+    clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
-
-    /* Maintain prior (possibly buggy) behaviour where if no deadline
-     * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
-     * INT32_MAX nanoseconds ahead, we still use INT32_MAX
-     * nanoseconds.
-     */
-    if ((deadline < 0) || (deadline > INT32_MAX)) {
-        deadline = INT32_MAX;
+    if (deadline < 0) {
+        return;
     }
 
     if (deadline > 0) {
@@ -389,7 +384,10 @@ void qemu_clock_warp(QEMUClockType type)
          * you will not be sending network packets continuously instead of
          * every 100ms.
          */
-        timer_mod(icount_warp_timer, vm_clock_warp_start + deadline);
+        if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
+            vm_clock_warp_start = clock;
+        }
+        timer_mod_anticipate(icount_warp_timer, clock + deadline);
     } else if (deadline == 0) {
         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
     }

From a3270e19ccf05603dfaf09e1f18510f7c93095e0 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 7 Oct 2013 17:18:15 +0200
Subject: [PATCH 15/17] icount: document (future) locking rules for icount

Reviewed-by: Alex Bligh <alex@alex.org.uk>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 cpus.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/cpus.c b/cpus.c
index 34d5e043a9..6203d98432 100644
--- a/cpus.c
+++ b/cpus.c
@@ -98,17 +98,22 @@ static bool all_cpu_threads_idle(void)
 /***********************************************************/
 /* guest cycle counter */
 
+/* Protected by TimersState seqlock */
+
+/* Compensate for varying guest execution speed.  */
+static int64_t qemu_icount_bias;
+static int64_t vm_clock_warp_start;
 /* Conversion factor from emulated instructions to virtual clock ticks.  */
 static int icount_time_shift;
 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
 #define MAX_ICOUNT_SHIFT 10
-/* Compensate for varying guest execution speed.  */
-static int64_t qemu_icount_bias;
+
+/* Only written by TCG thread */
+static int64_t qemu_icount;
+
 static QEMUTimer *icount_rt_timer;
 static QEMUTimer *icount_vm_timer;
 static QEMUTimer *icount_warp_timer;
-static int64_t vm_clock_warp_start;
-static int64_t qemu_icount;
 
 typedef struct TimersState {
     /* Protected by BQL.  */
@@ -235,6 +240,8 @@ static void icount_adjust(void)
     int64_t cur_time;
     int64_t cur_icount;
     int64_t delta;
+
+    /* Protected by TimersState mutex.  */
     static int64_t last_delta;
 
     /* If the VM is not running, then do nothing.  */

From 17a15f1b768fe2aab8c5f360b05c0daddf0c438b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 3 Oct 2013 15:17:25 +0200
Subject: [PATCH 16/17] icount: make it thread-safe

This lets threads other than the I/O thread use vm_clock even in -icount mode.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 cpus.c | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/cpus.c b/cpus.c
index 6203d98432..398229ecbd 100644
--- a/cpus.c
+++ b/cpus.c
@@ -132,7 +132,7 @@ typedef struct TimersState {
 static TimersState timers_state;
 
 /* Return the virtual CPU time, based on the instruction counter.  */
-int64_t cpu_get_icount(void)
+static int64_t cpu_get_icount_locked(void)
 {
     int64_t icount;
     CPUState *cpu = current_cpu;
@@ -148,6 +148,19 @@ int64_t cpu_get_icount(void)
     return qemu_icount_bias + (icount << icount_time_shift);
 }
 
+int64_t cpu_get_icount(void)
+{
+    int64_t icount;
+    unsigned start;
+
+    do {
+        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
+        icount = cpu_get_icount_locked();
+    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
+
+    return icount;
+}
+
 /* return the host CPU cycle counter and handle stop/restart */
 /* Caller must hold the BQL */
 int64_t cpu_get_ticks(void)
@@ -249,8 +262,9 @@ static void icount_adjust(void)
         return;
     }
 
-    cur_time = cpu_get_clock();
-    cur_icount = cpu_get_icount();
+    seqlock_write_lock(&timers_state.vm_clock_seqlock);
+    cur_time = cpu_get_clock_locked();
+    cur_icount = cpu_get_icount_locked();
 
     delta = cur_icount - cur_time;
     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
@@ -268,6 +282,7 @@ static void icount_adjust(void)
     }
     last_delta = delta;
     qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
+    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 }
 
 static void icount_adjust_rt(void *opaque)
@@ -292,10 +307,14 @@ static int64_t qemu_icount_round(int64_t count)
 
 static void icount_warp_rt(void *opaque)
 {
-    if (vm_clock_warp_start == -1) {
+    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
+     * changes from -1 to another value, so the race here is okay.
+     */
+    if (atomic_read(&vm_clock_warp_start) == -1) {
         return;
     }
 
+    seqlock_write_lock(&timers_state.vm_clock_seqlock);
     if (runstate_is_running()) {
         int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
         int64_t warp_delta;
@@ -306,14 +325,15 @@ static void icount_warp_rt(void *opaque)
              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
              * far ahead of real time.
              */
-            int64_t cur_time = cpu_get_clock();
-            int64_t cur_icount = cpu_get_icount();
+            int64_t cur_time = cpu_get_clock_locked();
+            int64_t cur_icount = cpu_get_icount_locked();
             int64_t delta = cur_time - cur_icount;
             warp_delta = MIN(warp_delta, delta);
         }
         qemu_icount_bias += warp_delta;
     }
     vm_clock_warp_start = -1;
+    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 
     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
@@ -327,7 +347,10 @@ void qtest_clock_warp(int64_t dest)
     while (clock < dest) {
         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
         int64_t warp = MIN(dest - clock, deadline);
+        seqlock_write_lock(&timers_state.vm_clock_seqlock);
         qemu_icount_bias += warp;
+        seqlock_write_unlock(&timers_state.vm_clock_seqlock);
+
         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
     }
@@ -391,9 +414,11 @@ void qemu_clock_warp(QEMUClockType type)
          * you will not be sending network packets continuously instead of
          * every 100ms.
          */
+        seqlock_write_lock(&timers_state.vm_clock_seqlock);
         if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
             vm_clock_warp_start = clock;
         }
+        seqlock_write_unlock(&timers_state.vm_clock_seqlock);
         timer_mod_anticipate(icount_warp_timer, clock + deadline);
     } else if (deadline == 0) {
         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);

From 041603fe5d4537cd165941f96bd76a31f7f662fd Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 9 Sep 2013 17:49:45 +0200
Subject: [PATCH 17/17] exec: remove qemu_safe_ram_ptr

This is not needed since the RAM list is not modified anymore by
qemu_get_ram_ptr.  Replace it with qemu_get_ram_block.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 exec.c | 97 +++++++++++++++++-----------------------------------------
 1 file changed, 28 insertions(+), 69 deletions(-)

diff --git a/exec.c b/exec.c
index bea2cffd94..2e31ffcb2c 100644
--- a/exec.c
+++ b/exec.c
@@ -129,7 +129,6 @@ static PhysPageMap next_map;
 
 static void io_mem_init(void);
 static void memory_map_init(void);
-static void *qemu_safe_ram_ptr(ram_addr_t addr);
 
 static MemoryRegion io_mem_watch;
 #endif
@@ -626,22 +625,39 @@ void cpu_abort(CPUArchState *env, const char *fmt, ...)
 }
 
 #if !defined(CONFIG_USER_ONLY)
+static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
+{
+    RAMBlock *block;
+
+    /* The list is protected by the iothread lock here.  */
+    block = ram_list.mru_block;
+    if (block && addr - block->offset < block->length) {
+        goto found;
+    }
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+        if (addr - block->offset < block->length) {
+            goto found;
+        }
+    }
+
+    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
+    abort();
+
+found:
+    ram_list.mru_block = block;
+    return block;
+}
+
 static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t end,
                                       uintptr_t length)
 {
-    uintptr_t start1;
+    RAMBlock *block;
+    ram_addr_t start1;
 
-    /* we modify the TLB cache so that the dirty bit will be set again
-       when accessing the range */
-    start1 = (uintptr_t)qemu_safe_ram_ptr(start);
-    /* Check that we don't span multiple blocks - this breaks the
-       address comparisons below.  */
-    if ((uintptr_t)qemu_safe_ram_ptr(end - 1) - start1
-            != (end - 1) - start) {
-        abort();
-    }
+    block = qemu_get_ram_block(start);
+    assert(block == qemu_get_ram_block(end - 1));
+    start1 = (uintptr_t)block->host + (start - block->offset);
     cpu_tlb_reset_dirty_all(start1, length);
-
 }
 
 /* Note: start and end must be within the same ram block.  */
@@ -1269,29 +1285,6 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length)
 }
 #endif /* !_WIN32 */
 
-static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
-{
-    RAMBlock *block;
-
-    /* The list is protected by the iothread lock here.  */
-    block = ram_list.mru_block;
-    if (block && addr - block->offset < block->length) {
-        goto found;
-    }
-    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-        if (addr - block->offset < block->length) {
-            goto found;
-        }
-    }
-
-    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
-    abort();
-
-found:
-    ram_list.mru_block = block;
-    return block;
-}
-
 /* Return a host pointer to ram allocated with qemu_ram_alloc.
    With the exception of the softmmu code in this file, this should
    only be used for local memory (e.g. video ram) that the device owns,
@@ -1319,40 +1312,6 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
     return block->host + (addr - block->offset);
 }
 
-/* Return a host pointer to ram allocated with qemu_ram_alloc.  Same as
- * qemu_get_ram_ptr but do not touch ram_list.mru_block.
- *
- * ??? Is this still necessary?
- */
-static void *qemu_safe_ram_ptr(ram_addr_t addr)
-{
-    RAMBlock *block;
-
-    /* The list is protected by the iothread lock here.  */
-    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
-        if (addr - block->offset < block->length) {
-            if (xen_enabled()) {
-                /* We need to check if the requested address is in the RAM
-                 * because we don't want to map the entire memory in QEMU.
-                 * In that case just map until the end of the page.
-                 */
-                if (block->offset == 0) {
-                    return xen_map_cache(addr, 0, 0);
-                } else if (block->host == NULL) {
-                    block->host =
-                        xen_map_cache(block->offset, block->length, 1);
-                }
-            }
-            return block->host + (addr - block->offset);
-        }
-    }
-
-    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
-    abort();
-
-    return NULL;
-}
-
 /* Return a host pointer to guest's ram. Similar to qemu_get_ram_ptr
  * but takes a size argument */
 static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)