From d645e1328726b38b3c79525eb57842ce29c1df7c Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 3 Jun 2019 20:04:08 -0300
Subject: [PATCH 01/33] kvm: i386: halt poll control MSR support

Add support for halt poll control MSR: save/restore, migration
and new feature name.

The purpose of this MSR is to allow the guest to disable
host halt poll.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Message-Id: <20190603230408.GA7938@amt.cnet>
[Do not enable by default, as pointed out by Mark Kanda. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/standard-headers/asm-x86/kvm_para.h |  2 ++
 target/i386/cpu.c                           |  4 +++-
 target/i386/cpu.h                           |  1 +
 target/i386/kvm.c                           | 14 ++++++++++++++
 target/i386/machine.c                       | 20 ++++++++++++++++++++
 5 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/include/standard-headers/asm-x86/kvm_para.h b/include/standard-headers/asm-x86/kvm_para.h
index 35cd8d651f..e1715143fd 100644
--- a/include/standard-headers/asm-x86/kvm_para.h
+++ b/include/standard-headers/asm-x86/kvm_para.h
@@ -29,6 +29,7 @@
 #define KVM_FEATURE_PV_TLB_FLUSH	9
 #define KVM_FEATURE_ASYNC_PF_VMEXIT	10
 #define KVM_FEATURE_PV_SEND_IPI	11
+#define KVM_FEATURE_POLL_CONTROL	12
 
 #define KVM_HINTS_REALTIME      0
 
@@ -47,6 +48,7 @@
 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02
 #define MSR_KVM_STEAL_TIME  0x4b564d03
 #define MSR_KVM_PV_EOI_EN      0x4b564d04
+#define MSR_KVM_POLL_CONTROL	0x4b564d05
 
 struct kvm_steal_time {
 	uint64_t steal;
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index ff65e11008..69cef4076a 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -906,7 +906,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
             "kvmclock", "kvm-nopiodelay", "kvm-mmu", "kvmclock",
             "kvm-asyncpf", "kvm-steal-time", "kvm-pv-eoi", "kvm-pv-unhalt",
             NULL, "kvm-pv-tlb-flush", NULL, "kvm-pv-ipi",
-            NULL, NULL, NULL, NULL,
+            "kvm-poll-control", NULL, NULL, NULL,
             NULL, NULL, NULL, NULL,
             NULL, NULL, NULL, NULL,
             "kvmclock-stable-bit", NULL, NULL, NULL,
@@ -5660,6 +5660,8 @@ static void x86_cpu_initfn(Object *obj)
     object_property_add_alias(obj, "kvm_steal_time", obj, "kvm-steal-time", &error_abort);
     object_property_add_alias(obj, "kvm_pv_eoi", obj, "kvm-pv-eoi", &error_abort);
     object_property_add_alias(obj, "kvm_pv_unhalt", obj, "kvm-pv-unhalt", &error_abort);
+    object_property_add_alias(obj, "kvm_poll_control", obj, "kvm-poll-control",
+                              &error_abort);
     object_property_add_alias(obj, "svm_lock", obj, "svm-lock", &error_abort);
     object_property_add_alias(obj, "nrip_save", obj, "nrip-save", &error_abort);
     object_property_add_alias(obj, "tsc_scale", obj, "tsc-scale", &error_abort);
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index ecd0ec0899..6c95447005 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1260,6 +1260,7 @@ typedef struct CPUX86State {
     uint64_t steal_time_msr;
     uint64_t async_pf_en_msr;
     uint64_t pv_eoi_en_msr;
+    uint64_t poll_control_msr;
 
     /* Partition-wide HV MSRs, will be updated only on the first vcpu */
     uint64_t msr_hv_hypercall;
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 2abc881324..8dc7026b0d 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1785,6 +1785,8 @@ void kvm_arch_reset_vcpu(X86CPU *cpu)
 
         hyperv_x86_synic_reset(cpu);
     }
+    /* enabled by default */
+    env->poll_control_msr = 1;
 }
 
 void kvm_arch_do_init_vcpu(X86CPU *cpu)
@@ -2493,6 +2495,11 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
             kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
         }
+
+        if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
+            kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
+        }
+
         if (has_architectural_pmu_version > 0) {
             if (has_architectural_pmu_version > 1) {
                 /* Stop the counter.  */
@@ -2878,6 +2885,9 @@ static int kvm_get_msrs(X86CPU *cpu)
     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
         kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
     }
+    if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
+        kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
+    }
     if (has_architectural_pmu_version > 0) {
         if (has_architectural_pmu_version > 1) {
             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
@@ -3112,6 +3122,10 @@ static int kvm_get_msrs(X86CPU *cpu)
         case MSR_KVM_STEAL_TIME:
             env->steal_time_msr = msrs[i].data;
             break;
+        case MSR_KVM_POLL_CONTROL: {
+            env->poll_control_msr = msrs[i].data;
+            break;
+        }
         case MSR_CORE_PERF_FIXED_CTR_CTRL:
             env->msr_fixed_ctr_ctrl = msrs[i].data;
             break;
diff --git a/target/i386/machine.c b/target/i386/machine.c
index ce55755f0f..2767b3096d 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -437,6 +437,14 @@ static const VMStateDescription vmstate_exception_info = {
     }
 };
 
+/* Poll control MSR enabled by default */
+static bool poll_control_msr_needed(void *opaque)
+{
+    X86CPU *cpu = opaque;
+
+    return cpu->env.poll_control_msr != 1;
+}
+
 static const VMStateDescription vmstate_steal_time_msr = {
     .name = "cpu/steal_time_msr",
     .version_id = 1,
@@ -470,6 +478,17 @@ static const VMStateDescription vmstate_pv_eoi_msr = {
     }
 };
 
+static const VMStateDescription vmstate_poll_control_msr = {
+    .name = "cpu/poll_control_msr",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = poll_control_msr_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(env.poll_control_msr, X86CPU),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static bool fpop_ip_dp_needed(void *opaque)
 {
     X86CPU *cpu = opaque;
@@ -1354,6 +1373,7 @@ VMStateDescription vmstate_x86_cpu = {
         &vmstate_async_pf_msr,
         &vmstate_pv_eoi_msr,
         &vmstate_steal_time_msr,
+        &vmstate_poll_control_msr,
         &vmstate_fpop_ip_dp,
         &vmstate_msr_tsc_adjust,
         &vmstate_msr_tscdeadline,

From b896c4b50da107bf0d40e6215d4ee20daf64723b Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Wed, 10 Jul 2019 16:02:51 +0800
Subject: [PATCH 02/33] target-i386: adds PV_SCHED_YIELD CPUID feature bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds PV_SCHED_YIELD CPUID feature bit.

Cc: Eduardo Habkost <ehabkost@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Message-Id: <1562745771-8414-1-git-send-email-wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 69cef4076a..83872dda4d 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -906,7 +906,7 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
             "kvmclock", "kvm-nopiodelay", "kvm-mmu", "kvmclock",
             "kvm-asyncpf", "kvm-steal-time", "kvm-pv-eoi", "kvm-pv-unhalt",
             NULL, "kvm-pv-tlb-flush", NULL, "kvm-pv-ipi",
-            "kvm-poll-control", NULL, NULL, NULL,
+            "kvm-poll-control", "kvm-pv-sched-yield", NULL, NULL,
             NULL, NULL, NULL, NULL,
             NULL, NULL, NULL, NULL,
             "kvmclock-stable-bit", NULL, NULL, NULL,

From fef28891aa401e8f9d048c65f32067f51d695f4e Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Wed, 24 Jul 2019 16:31:03 +0200
Subject: [PATCH 03/33] loader: Handle memory-mapped ELFs

This patch allows handling an ELF memory-mapped, taking care
the reference count of the GMappedFile* passed through
rom_add_elf_program().
In this case, the 'data' pointer is not heap-allocated, so
we cannot free it.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-Id: <20190724143105.307042-2-sgarzare@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/core/loader.c     | 38 ++++++++++++++++++++++++++++++--------
 include/hw/elf_ops.h |  2 +-
 include/hw/loader.h  |  5 +++--
 3 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/hw/core/loader.c b/hw/core/loader.c
index 84e4f3efac..de00f56f34 100644
--- a/hw/core/loader.c
+++ b/hw/core/loader.c
@@ -838,6 +838,7 @@ struct Rom {
     int isrom;
     char *fw_dir;
     char *fw_file;
+    GMappedFile *mapped_file;
 
     bool committed;
 
@@ -848,10 +849,25 @@ struct Rom {
 static FWCfgState *fw_cfg;
 static QTAILQ_HEAD(, Rom) roms = QTAILQ_HEAD_INITIALIZER(roms);
 
-/* rom->data must be heap-allocated (do not use with rom_add_elf_program()) */
+/*
+ * rom->data can be heap-allocated or memory-mapped (e.g. when added with
+ * rom_add_elf_program())
+ */
+static void rom_free_data(Rom *rom)
+{
+    if (rom->mapped_file) {
+        g_mapped_file_unref(rom->mapped_file);
+        rom->mapped_file = NULL;
+    } else {
+        g_free(rom->data);
+    }
+
+    rom->data = NULL;
+}
+
 static void rom_free(Rom *rom)
 {
-    g_free(rom->data);
+    rom_free_data(rom);
     g_free(rom->path);
     g_free(rom->name);
     g_free(rom->fw_dir);
@@ -1058,11 +1074,12 @@ MemoryRegion *rom_add_blob(const char *name, const void *blob, size_t len,
 
 /* This function is specific for elf program because we don't need to allocate
  * all the rom. We just allocate the first part and the rest is just zeros. This
- * is why romsize and datasize are different. Also, this function seize the
- * memory ownership of "data", so we don't have to allocate and copy the buffer.
+ * is why romsize and datasize are different. Also, this function takes its own
+ * reference to "mapped_file", so we don't have to allocate and copy the buffer.
  */
-int rom_add_elf_program(const char *name, void *data, size_t datasize,
-                        size_t romsize, hwaddr addr, AddressSpace *as)
+int rom_add_elf_program(const char *name, GMappedFile *mapped_file, void *data,
+                        size_t datasize, size_t romsize, hwaddr addr,
+                        AddressSpace *as)
 {
     Rom *rom;
 
@@ -1073,6 +1090,12 @@ int rom_add_elf_program(const char *name, void *data, size_t datasize,
     rom->romsize  = romsize;
     rom->data     = data;
     rom->as       = as;
+
+    if (mapped_file && data) {
+        g_mapped_file_ref(mapped_file);
+        rom->mapped_file = mapped_file;
+    }
+
     rom_insert(rom);
     return 0;
 }
@@ -1107,8 +1130,7 @@ static void rom_reset(void *unused)
         }
         if (rom->isrom) {
             /* rom needs to be written only once */
-            g_free(rom->data);
-            rom->data = NULL;
+            rom_free_data(rom);
         }
         /*
          * The rom loader is really on the same level as firmware in the guest
diff --git a/include/hw/elf_ops.h b/include/hw/elf_ops.h
index 690f9238c8..fede37ee9c 100644
--- a/include/hw/elf_ops.h
+++ b/include/hw/elf_ops.h
@@ -525,7 +525,7 @@ static int glue(load_elf, SZ)(const char *name, int fd,
                     snprintf(label, sizeof(label), "phdr #%d: %s", i, name);
 
                     /* rom_add_elf_program() seize the ownership of 'data' */
-                    rom_add_elf_program(label, data, file_size, mem_size,
+                    rom_add_elf_program(label, NULL, data, file_size, mem_size,
                                         addr, as);
                 } else {
                     address_space_write(as ? as : &address_space_memory,
diff --git a/include/hw/loader.h b/include/hw/loader.h
index 3e1b3a4566..07fd9286e7 100644
--- a/include/hw/loader.h
+++ b/include/hw/loader.h
@@ -258,8 +258,9 @@ MemoryRegion *rom_add_blob(const char *name, const void *blob, size_t len,
                            FWCfgCallback fw_callback,
                            void *callback_opaque, AddressSpace *as,
                            bool read_only);
-int rom_add_elf_program(const char *name, void *data, size_t datasize,
-                        size_t romsize, hwaddr addr, AddressSpace *as);
+int rom_add_elf_program(const char *name, GMappedFile *mapped_file, void *data,
+                        size_t datasize, size_t romsize, hwaddr addr,
+                        AddressSpace *as);
 int rom_check_and_register_reset(void);
 void rom_set_fw(FWCfgState *f);
 void rom_set_order_override(int order);

From 816b9fe450220e19acb91a0ce4a8ade7000648d1 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Wed, 24 Jul 2019 16:31:04 +0200
Subject: [PATCH 04/33] elf-ops.h: Map into memory the ELF to load

In order to reduce the memory footprint we map into memory
the ELF to load using g_mapped_file_new_from_fd() instead of
reading each sections. In this way we can share the ELF pages
between multiple instances of QEMU.

Suggested-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-Id: <20190724143105.307042-3-sgarzare@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/hw/elf_ops.h | 71 ++++++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 29 deletions(-)

diff --git a/include/hw/elf_ops.h b/include/hw/elf_ops.h
index fede37ee9c..1496d7e753 100644
--- a/include/hw/elf_ops.h
+++ b/include/hw/elf_ops.h
@@ -323,8 +323,9 @@ static int glue(load_elf, SZ)(const char *name, int fd,
     struct elfhdr ehdr;
     struct elf_phdr *phdr = NULL, *ph;
     int size, i, total_size;
-    elf_word mem_size, file_size;
+    elf_word mem_size, file_size, data_offset;
     uint64_t addr, low = (uint64_t)-1, high = 0;
+    GMappedFile *mapped_file = NULL;
     uint8_t *data = NULL;
     char label[128];
     int ret = ELF_LOAD_FAILED;
@@ -409,20 +410,32 @@ static int glue(load_elf, SZ)(const char *name, int fd,
         }
     }
 
+    /*
+     * Since we want to be able to modify the mapped buffer, we set the
+     * 'writeble' parameter to 'true'. Modifications to the buffer are not
+     * written back to the file.
+     */
+    mapped_file = g_mapped_file_new_from_fd(fd, true, NULL);
+    if (!mapped_file) {
+        goto fail;
+    }
+
     total_size = 0;
     for(i = 0; i < ehdr.e_phnum; i++) {
         ph = &phdr[i];
         if (ph->p_type == PT_LOAD) {
             mem_size = ph->p_memsz; /* Size of the ROM */
             file_size = ph->p_filesz; /* Size of the allocated data */
-            data = g_malloc0(file_size);
-            if (ph->p_filesz > 0) {
-                if (lseek(fd, ph->p_offset, SEEK_SET) < 0) {
-                    goto fail;
-                }
-                if (read(fd, data, file_size) != file_size) {
+            data_offset = ph->p_offset; /* Offset where the data is located */
+
+            if (file_size > 0) {
+                if (g_mapped_file_get_length(mapped_file) <
+                    file_size + data_offset) {
                     goto fail;
                 }
+
+                data = (uint8_t *)g_mapped_file_get_contents(mapped_file);
+                data += data_offset;
             }
 
             /* The ELF spec is somewhat vague about the purpose of the
@@ -513,25 +526,25 @@ static int glue(load_elf, SZ)(const char *name, int fd,
                 *pentry = ehdr.e_entry - ph->p_vaddr + ph->p_paddr;
             }
 
-            if (mem_size == 0) {
-                /* Some ELF files really do have segments of zero size;
-                 * just ignore them rather than trying to create empty
-                 * ROM blobs, because the zero-length blob can falsely
-                 * trigger the overlapping-ROM-blobs check.
-                 */
-                g_free(data);
-            } else {
+            /* Some ELF files really do have segments of zero size;
+             * just ignore them rather than trying to create empty
+             * ROM blobs, because the zero-length blob can falsely
+             * trigger the overlapping-ROM-blobs check.
+             */
+            if (mem_size != 0) {
                 if (load_rom) {
                     snprintf(label, sizeof(label), "phdr #%d: %s", i, name);
 
-                    /* rom_add_elf_program() seize the ownership of 'data' */
-                    rom_add_elf_program(label, NULL, data, file_size, mem_size,
-                                        addr, as);
+                    /*
+                     * rom_add_elf_program() takes its own reference to
+                     * 'mapped_file'.
+                     */
+                    rom_add_elf_program(label, mapped_file, data, file_size,
+                                        mem_size, addr, as);
                 } else {
                     address_space_write(as ? as : &address_space_memory,
                                         addr, MEMTXATTRS_UNSPECIFIED,
                                         data, file_size);
-                    g_free(data);
                 }
             }
 
@@ -547,14 +560,16 @@ static int glue(load_elf, SZ)(const char *name, int fd,
             struct elf_note *nhdr = NULL;
 
             file_size = ph->p_filesz; /* Size of the range of ELF notes */
-            data = g_malloc0(file_size);
-            if (ph->p_filesz > 0) {
-                if (lseek(fd, ph->p_offset, SEEK_SET) < 0) {
-                    goto fail;
-                }
-                if (read(fd, data, file_size) != file_size) {
+            data_offset = ph->p_offset; /* Offset where the notes are located */
+
+            if (file_size > 0) {
+                if (g_mapped_file_get_length(mapped_file) <
+                    file_size + data_offset) {
                     goto fail;
                 }
+
+                data = (uint8_t *)g_mapped_file_get_contents(mapped_file);
+                data += data_offset;
             }
 
             /*
@@ -570,19 +585,17 @@ static int glue(load_elf, SZ)(const char *name, int fd,
                     sizeof(struct elf_note) == sizeof(struct elf64_note);
                 elf_note_fn((void *)nhdr, (void *)&ph->p_align, is64);
             }
-            g_free(data);
             data = NULL;
         }
     }
 
-    g_free(phdr);
     if (lowaddr)
         *lowaddr = (uint64_t)(elf_sword)low;
     if (highaddr)
         *highaddr = (uint64_t)(elf_sword)high;
-    return total_size;
+    ret = total_size;
  fail:
-    g_free(data);
+    g_mapped_file_unref(mapped_file);
     g_free(phdr);
     return ret;
 }

From e502fe96ac4343a3f4a3c13f28eea03ae7b11c3f Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Wed, 24 Jul 2019 16:31:05 +0200
Subject: [PATCH 05/33] hw/i386/pc: Map into memory the initrd

In order to reduce the memory footprint we map into memory
the initrd using g_mapped_file_new() instead of reading it.
In this way we can share the initrd pages between multiple
instances of QEMU.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Message-Id: <20190724143105.307042-4-sgarzare@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/i386/pc.c         | 17 +++++++++++++----
 include/hw/i386/pc.h |  1 +
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 3ab4bcb3ca..6b4f32edfe 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1244,17 +1244,21 @@ static void load_linux(PCMachineState *pcms,
 
             /* load initrd */
             if (initrd_filename) {
+                GMappedFile *mapped_file;
                 gsize initrd_size;
                 gchar *initrd_data;
                 GError *gerr = NULL;
 
-                if (!g_file_get_contents(initrd_filename, &initrd_data,
-                            &initrd_size, &gerr)) {
+                mapped_file = g_mapped_file_new(initrd_filename, false, &gerr);
+                if (!mapped_file) {
                     fprintf(stderr, "qemu: error reading initrd %s: %s\n",
                             initrd_filename, gerr->message);
                     exit(1);
                 }
+                pcms->initrd_mapped_file = mapped_file;
 
+                initrd_data = g_mapped_file_get_contents(mapped_file);
+                initrd_size = g_mapped_file_get_length(mapped_file);
                 initrd_max = pcms->below_4g_mem_size - pcmc->acpi_data_size - 1;
                 if (initrd_size >= initrd_max) {
                     fprintf(stderr, "qemu: initrd is too large, cannot support."
@@ -1381,6 +1385,7 @@ static void load_linux(PCMachineState *pcms,
 
     /* load initrd */
     if (initrd_filename) {
+        GMappedFile *mapped_file;
         gsize initrd_size;
         gchar *initrd_data;
         GError *gerr = NULL;
@@ -1390,12 +1395,16 @@ static void load_linux(PCMachineState *pcms,
             exit(1);
         }
 
-        if (!g_file_get_contents(initrd_filename, &initrd_data,
-                                 &initrd_size, &gerr)) {
+        mapped_file = g_mapped_file_new(initrd_filename, false, &gerr);
+        if (!mapped_file) {
             fprintf(stderr, "qemu: error reading initrd %s: %s\n",
                     initrd_filename, gerr->message);
             exit(1);
         }
+        pcms->initrd_mapped_file = mapped_file;
+
+        initrd_data = g_mapped_file_get_contents(mapped_file);
+        initrd_size = g_mapped_file_get_length(mapped_file);
         if (initrd_size >= initrd_max) {
             fprintf(stderr, "qemu: initrd is too large, cannot support."
                     "(max: %"PRIu32", need %"PRId64")\n",
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 4bb9e29114..2afe285009 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -41,6 +41,7 @@ struct PCMachineState {
     FWCfgState *fw_cfg;
     qemu_irq *gsi;
     PFlashCFI01 *flash[2];
+    GMappedFile *initrd_mapped_file;
 
     /* Configuration options: */
     uint64_t max_ram_below_4g;

From 03c7140c1a0336af3d4fca768de791b9c0e2b128 Mon Sep 17 00:00:00 2001
From: Yan Zhao <yan.y.zhao@intel.com>
Date: Tue, 25 Jun 2019 11:21:18 +0800
Subject: [PATCH 06/33] memory: assert on out of scope notification

It is wrong for an entry to have parts out of scope of notifier's range.
assert this condition.

Out of scope mapping/unmapping would cause problem, as in below case:

1. initially there are two notifiers with ranges
0-0xfedfffff, 0xfef00000-0xffffffffffffffff,
IOVAs from 0x3c000000 - 0x3c1fffff is in shadow page table.

2. in vfio, memory_region_register_iommu_notifier() is followed by
memory_region_iommu_replay(), which will first call address space
unmap,
and walk and add back all entries in vtd shadow page table. e.g.
(1) for notifier 0-0xfedfffff,
    IOVAs from 0 - 0xffffffff get unmapped,
    and IOVAs from 0x3c000000 - 0x3c1fffff get mapped
(2) for notifier 0xfef00000-0xffffffffffffffff
    IOVAs from 0 - 0x7fffffffff get unmapped,
    but IOVAs from 0x3c000000 - 0x3c1fffff cannot get mapped back.

Cc: Eric Auger <eric.auger@redhat.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>

Message-Id: <1561432878-13754-1-git-send-email-yan.y.zhao@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 memory.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/memory.c b/memory.c
index 8141486832..c90a2cf629 100644
--- a/memory.c
+++ b/memory.c
@@ -1942,16 +1942,18 @@ void memory_region_notify_one(IOMMUNotifier *notifier,
                               IOMMUTLBEntry *entry)
 {
     IOMMUNotifierFlag request_flags;
+    hwaddr entry_end = entry->iova + entry->addr_mask;
 
     /*
      * Skip the notification if the notification does not overlap
      * with registered range.
      */
-    if (notifier->start > entry->iova + entry->addr_mask ||
-        notifier->end < entry->iova) {
+    if (notifier->start > entry_end || notifier->end < entry->iova) {
         return;
     }
 
+    assert(entry->iova >= notifier->start && entry_end <= notifier->end);
+
     if (entry->perm & IOMMU_RW) {
         request_flags = IOMMU_NOTIFIER_MAP;
     } else {

From 52bf9771fdfce98e98cea36a17a18915be6f6b7f Mon Sep 17 00:00:00 2001
From: "tony.nguyen@bt.com" <tony.nguyen@bt.com>
Date: Thu, 18 Jul 2019 06:01:31 +0000
Subject: [PATCH 07/33] configure: Define target access alignment in configure

This patch moves the define of target access alignment earlier from
target/foo/cpu.h to configure.

Suggested in Richard Henderson's reply to "[PATCH 1/4] tcg: TCGMemOp is now
accelerator independent MemOp"

Signed-off-by: Tony Nguyen <tony.nguyen@bt.com>
Message-Id: <11e818d38ebc40e986cfa62dd7d0afdc@tpw09926dag18e.domain1.systemhost.net>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: tony.nguyen@bt.com <tony.nguyen@bt.com>
---
 configure             | 12 ++++++++++--
 include/exec/poison.h |  1 +
 include/qom/cpu.h     |  2 +-
 target/alpha/cpu.h    |  2 --
 target/hppa/cpu.h     |  1 -
 target/mips/cpu.h     |  2 --
 target/sh4/cpu.h      |  2 --
 target/sparc/cpu.h    |  2 --
 target/xtensa/cpu.h   |  2 --
 tcg/tcg.c             |  2 +-
 tcg/tcg.h             |  8 +++++---
 11 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/configure b/configure
index 714e7fb6a1..942a73b795 100755
--- a/configure
+++ b/configure
@@ -7431,11 +7431,16 @@ for target in $target_list; do
 target_dir="$target"
 config_target_mak=$target_dir/config-target.mak
 target_name=$(echo $target | cut -d '-' -f 1)
+target_aligned_only="no"
+case "$target_name" in
+  alpha|hppa|mips64el|mips64|mipsel|mips|mipsn32|mipsn32el|sh4|sh4eb|sparc|sparc64|sparc32plus|xtensa|xtensaeb)
+  target_aligned_only="yes"
+  ;;
+esac
 target_bigendian="no"
-
 case "$target_name" in
   armeb|aarch64_be|hppa|lm32|m68k|microblaze|mips|mipsn32|mips64|moxie|or1k|ppc|ppc64|ppc64abi32|s390x|sh4eb|sparc|sparc64|sparc32plus|xtensaeb)
-  target_bigendian=yes
+  target_bigendian="yes"
   ;;
 esac
 target_softmmu="no"
@@ -7717,6 +7722,9 @@ fi
 if supported_whpx_target $target; then
     echo "CONFIG_WHPX=y" >> $config_target_mak
 fi
+if test "$target_aligned_only" = "yes" ; then
+  echo "TARGET_ALIGNED_ONLY=y" >> $config_target_mak
+fi
 if test "$target_bigendian" = "yes" ; then
   echo "TARGET_WORDS_BIGENDIAN=y" >> $config_target_mak
 fi
diff --git a/include/exec/poison.h b/include/exec/poison.h
index b862320fa6..955eb863ab 100644
--- a/include/exec/poison.h
+++ b/include/exec/poison.h
@@ -35,6 +35,7 @@
 #pragma GCC poison TARGET_UNICORE32
 #pragma GCC poison TARGET_XTENSA
 
+#pragma GCC poison TARGET_ALIGNED_ONLY
 #pragma GCC poison TARGET_HAS_BFLT
 #pragma GCC poison TARGET_NAME
 #pragma GCC poison TARGET_SUPPORTS_MTTCG
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index ddb91bbaff..77fca95a40 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -89,7 +89,7 @@ struct TranslationBlock;
  * @do_unassigned_access: Callback for unassigned access handling.
  * (this is deprecated: new targets should use do_transaction_failed instead)
  * @do_unaligned_access: Callback for unaligned access handling, if
- * the target defines #ALIGNED_ONLY.
+ * the target defines #TARGET_ALIGNED_ONLY.
  * @do_transaction_failed: Callback for handling failed memory transactions
  * (ie bus faults or external aborts; not MMU faults)
  * @virtio_is_big_endian: Callback to return %true if a CPU which supports
diff --git a/target/alpha/cpu.h b/target/alpha/cpu.h
index 4619530660..a530249a5b 100644
--- a/target/alpha/cpu.h
+++ b/target/alpha/cpu.h
@@ -23,8 +23,6 @@
 #include "cpu-qom.h"
 #include "exec/cpu-defs.h"
 
-#define ALIGNED_ONLY
-
 /* Alpha processors have a weak memory model */
 #define TCG_GUEST_DEFAULT_MO      (0)
 
diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index 4b816cc13a..6713d04f11 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -30,7 +30,6 @@
    basis.  It's probably easier to fall back to a strong memory model.  */
 #define TCG_GUEST_DEFAULT_MO        TCG_MO_ALL
 
-#define ALIGNED_ONLY
 #define MMU_KERNEL_IDX   0
 #define MMU_USER_IDX     3
 #define MMU_PHYS_IDX     4
diff --git a/target/mips/cpu.h b/target/mips/cpu.h
index d235117dab..1fd4a180e1 100644
--- a/target/mips/cpu.h
+++ b/target/mips/cpu.h
@@ -1,8 +1,6 @@
 #ifndef MIPS_CPU_H
 #define MIPS_CPU_H
 
-#define ALIGNED_ONLY
-
 #include "cpu-qom.h"
 #include "exec/cpu-defs.h"
 #include "fpu/softfloat-types.h"
diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index aee733eaaa..ecaa7a18a9 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -23,8 +23,6 @@
 #include "cpu-qom.h"
 #include "exec/cpu-defs.h"
 
-#define ALIGNED_ONLY
-
 /* CPU Subtypes */
 #define SH_CPU_SH7750  (1 << 0)
 #define SH_CPU_SH7750S (1 << 1)
diff --git a/target/sparc/cpu.h b/target/sparc/cpu.h
index 0d5b01efe5..694d7139cf 100644
--- a/target/sparc/cpu.h
+++ b/target/sparc/cpu.h
@@ -5,8 +5,6 @@
 #include "cpu-qom.h"
 #include "exec/cpu-defs.h"
 
-#define ALIGNED_ONLY
-
 #if !defined(TARGET_SPARC64)
 #define TARGET_DPREGS 16
 #else
diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h
index 2c277134f1..0459243e6b 100644
--- a/target/xtensa/cpu.h
+++ b/target/xtensa/cpu.h
@@ -32,8 +32,6 @@
 #include "exec/cpu-defs.h"
 #include "xtensa-isa.h"
 
-#define ALIGNED_ONLY
-
 /* Xtensa processors have a weak memory model */
 #define TCG_GUEST_DEFAULT_MO      (0)
 
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 2db83ce2e8..0458eaec57 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1925,7 +1925,7 @@ static const char * const ldst_name[] =
 };
 
 static const char * const alignment_name[(MO_AMASK >> MO_ASHIFT) + 1] = {
-#ifdef ALIGNED_ONLY
+#ifdef TARGET_ALIGNED_ONLY
     [MO_UNALN >> MO_ASHIFT]    = "un+",
     [MO_ALIGN >> MO_ASHIFT]    = "",
 #else
diff --git a/tcg/tcg.h b/tcg/tcg.h
index b411e17a28..529acb2ed8 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -333,10 +333,12 @@ typedef enum TCGMemOp {
     MO_TE    = MO_LE,
 #endif
 
-    /* MO_UNALN accesses are never checked for alignment.
+    /*
+     * MO_UNALN accesses are never checked for alignment.
      * MO_ALIGN accesses will result in a call to the CPU's
      * do_unaligned_access hook if the guest address is not aligned.
-     * The default depends on whether the target CPU defines ALIGNED_ONLY.
+     * The default depends on whether the target CPU defines
+     * TARGET_ALIGNED_ONLY.
      *
      * Some architectures (e.g. ARMv8) need the address which is aligned
      * to a size more than the size of the memory access.
@@ -353,7 +355,7 @@ typedef enum TCGMemOp {
      */
     MO_ASHIFT = 4,
     MO_AMASK = 7 << MO_ASHIFT,
-#ifdef ALIGNED_ONLY
+#ifdef TARGET_ALIGNED_ONLY
     MO_ALIGN = 0,
     MO_UNALN = MO_AMASK,
 #else

From 98387d58024e3fb1b88d5a57912e4514d7f39c7c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 25 Jul 2019 12:03:30 +0200
Subject: [PATCH 08/33] 9p: simplify source file selection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Express the complex conditions in Kconfig rather than Makefiles, since Kconfig
is better suited at expressing dependencies and detecting contradictions.

Cc: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Kconfig.host        | 1 +
 fsdev/Makefile.objs | 2 +-
 hw/9pfs/Kconfig     | 5 +++++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/Kconfig.host b/Kconfig.host
index aec95365ff..bb6e116e2a 100644
--- a/Kconfig.host
+++ b/Kconfig.host
@@ -28,6 +28,7 @@ config VHOST_USER
 
 config XEN
     bool
+    select FSDEV_9P if VIRTFS
 
 config VIRTFS
     bool
diff --git a/fsdev/Makefile.objs b/fsdev/Makefile.objs
index 24bbb3e75c..42cd70c367 100644
--- a/fsdev/Makefile.objs
+++ b/fsdev/Makefile.objs
@@ -1,6 +1,6 @@
 # Lots of the fsdev/9pcode is pulled in by vl.c via qemu_fsdev_add.
 # only pull in the actual 9p backend if we also enabled virtio or xen.
-ifeq ($(call land,$(CONFIG_VIRTFS),$(call lor,$(CONFIG_VIRTIO_9P),$(CONFIG_XEN))),y)
+ifeq ($(CONFIG_FSDEV_9P),y)
 common-obj-y = qemu-fsdev.o 9p-marshal.o 9p-iov-marshal.o
 else
 common-obj-y = qemu-fsdev-dummy.o
diff --git a/hw/9pfs/Kconfig b/hw/9pfs/Kconfig
index 8c5032c575..3ae5749661 100644
--- a/hw/9pfs/Kconfig
+++ b/hw/9pfs/Kconfig
@@ -1,4 +1,9 @@
+config FSDEV_9P
+    bool
+    depends on VIRTFS
+
 config VIRTIO_9P
     bool
     default y
     depends on VIRTFS && VIRTIO
+    select FSDEV_9P

From de428cead63a958137ee63efcc3cceaf75f6c125 Mon Sep 17 00:00:00 2001
From: Li Qiang <liq3ea@163.com>
Date: Thu, 25 Jul 2019 08:16:39 -0700
Subject: [PATCH 09/33] target-i386: kvm: 'kvm_get_supported_msrs' cleanup

Function 'kvm_get_supported_msrs' is only called once
now, get rid of the static variable 'kvm_supported_msrs'.

Signed-off-by: Li Qiang <liq3ea@163.com>
Message-Id: <20190725151639.21693-1-liq3ea@163.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/kvm.c | 185 +++++++++++++++++++++++-----------------------
 1 file changed, 91 insertions(+), 94 deletions(-)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 8dc7026b0d..608bb44270 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1842,108 +1842,105 @@ static int kvm_get_supported_feature_msrs(KVMState *s)
 
 static int kvm_get_supported_msrs(KVMState *s)
 {
-    static int kvm_supported_msrs;
     int ret = 0;
+    struct kvm_msr_list msr_list, *kvm_msr_list;
 
-    /* first time */
-    if (kvm_supported_msrs == 0) {
-        struct kvm_msr_list msr_list, *kvm_msr_list;
+    /*
+     *  Obtain MSR list from KVM.  These are the MSRs that we must
+     *  save/restore.
+     */
+    msr_list.nmsrs = 0;
+    ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
+    if (ret < 0 && ret != -E2BIG) {
+        return ret;
+    }
+    /*
+     * Old kernel modules had a bug and could write beyond the provided
+     * memory. Allocate at least a safe amount of 1K.
+     */
+    kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
+                                          msr_list.nmsrs *
+                                          sizeof(msr_list.indices[0])));
 
-        kvm_supported_msrs = -1;
+    kvm_msr_list->nmsrs = msr_list.nmsrs;
+    ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
+    if (ret >= 0) {
+        int i;
 
-        /* Obtain MSR list from KVM.  These are the MSRs that we must
-         * save/restore */
-        msr_list.nmsrs = 0;
-        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
-        if (ret < 0 && ret != -E2BIG) {
-            return ret;
-        }
-        /* Old kernel modules had a bug and could write beyond the provided
-           memory. Allocate at least a safe amount of 1K. */
-        kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
-                                              msr_list.nmsrs *
-                                              sizeof(msr_list.indices[0])));
-
-        kvm_msr_list->nmsrs = msr_list.nmsrs;
-        ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
-        if (ret >= 0) {
-            int i;
-
-            for (i = 0; i < kvm_msr_list->nmsrs; i++) {
-                switch (kvm_msr_list->indices[i]) {
-                case MSR_STAR:
-                    has_msr_star = true;
-                    break;
-                case MSR_VM_HSAVE_PA:
-                    has_msr_hsave_pa = true;
-                    break;
-                case MSR_TSC_AUX:
-                    has_msr_tsc_aux = true;
-                    break;
-                case MSR_TSC_ADJUST:
-                    has_msr_tsc_adjust = true;
-                    break;
-                case MSR_IA32_TSCDEADLINE:
-                    has_msr_tsc_deadline = true;
-                    break;
-                case MSR_IA32_SMBASE:
-                    has_msr_smbase = true;
-                    break;
-                case MSR_SMI_COUNT:
-                    has_msr_smi_count = true;
-                    break;
-                case MSR_IA32_MISC_ENABLE:
-                    has_msr_misc_enable = true;
-                    break;
-                case MSR_IA32_BNDCFGS:
-                    has_msr_bndcfgs = true;
-                    break;
-                case MSR_IA32_XSS:
-                    has_msr_xss = true;
-                    break;
-                case HV_X64_MSR_CRASH_CTL:
-                    has_msr_hv_crash = true;
-                    break;
-                case HV_X64_MSR_RESET:
-                    has_msr_hv_reset = true;
-                    break;
-                case HV_X64_MSR_VP_INDEX:
-                    has_msr_hv_vpindex = true;
-                    break;
-                case HV_X64_MSR_VP_RUNTIME:
-                    has_msr_hv_runtime = true;
-                    break;
-                case HV_X64_MSR_SCONTROL:
-                    has_msr_hv_synic = true;
-                    break;
-                case HV_X64_MSR_STIMER0_CONFIG:
-                    has_msr_hv_stimer = true;
-                    break;
-                case HV_X64_MSR_TSC_FREQUENCY:
-                    has_msr_hv_frequencies = true;
-                    break;
-                case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
-                    has_msr_hv_reenlightenment = true;
-                    break;
-                case MSR_IA32_SPEC_CTRL:
-                    has_msr_spec_ctrl = true;
-                    break;
-                case MSR_VIRT_SSBD:
-                    has_msr_virt_ssbd = true;
-                    break;
-                case MSR_IA32_ARCH_CAPABILITIES:
-                    has_msr_arch_capabs = true;
-                    break;
-                case MSR_IA32_CORE_CAPABILITY:
-                    has_msr_core_capabs = true;
-                    break;
-                }
+        for (i = 0; i < kvm_msr_list->nmsrs; i++) {
+            switch (kvm_msr_list->indices[i]) {
+            case MSR_STAR:
+                has_msr_star = true;
+                break;
+            case MSR_VM_HSAVE_PA:
+                has_msr_hsave_pa = true;
+                break;
+            case MSR_TSC_AUX:
+                has_msr_tsc_aux = true;
+                break;
+            case MSR_TSC_ADJUST:
+                has_msr_tsc_adjust = true;
+                break;
+            case MSR_IA32_TSCDEADLINE:
+                has_msr_tsc_deadline = true;
+                break;
+            case MSR_IA32_SMBASE:
+                has_msr_smbase = true;
+                break;
+            case MSR_SMI_COUNT:
+                has_msr_smi_count = true;
+                break;
+            case MSR_IA32_MISC_ENABLE:
+                has_msr_misc_enable = true;
+                break;
+            case MSR_IA32_BNDCFGS:
+                has_msr_bndcfgs = true;
+                break;
+            case MSR_IA32_XSS:
+                has_msr_xss = true;
+                break;
+            case HV_X64_MSR_CRASH_CTL:
+                has_msr_hv_crash = true;
+                break;
+            case HV_X64_MSR_RESET:
+                has_msr_hv_reset = true;
+                break;
+            case HV_X64_MSR_VP_INDEX:
+                has_msr_hv_vpindex = true;
+                break;
+            case HV_X64_MSR_VP_RUNTIME:
+                has_msr_hv_runtime = true;
+                break;
+            case HV_X64_MSR_SCONTROL:
+                has_msr_hv_synic = true;
+                break;
+            case HV_X64_MSR_STIMER0_CONFIG:
+                has_msr_hv_stimer = true;
+                break;
+            case HV_X64_MSR_TSC_FREQUENCY:
+                has_msr_hv_frequencies = true;
+                break;
+            case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
+                has_msr_hv_reenlightenment = true;
+                break;
+            case MSR_IA32_SPEC_CTRL:
+                has_msr_spec_ctrl = true;
+                break;
+            case MSR_VIRT_SSBD:
+                has_msr_virt_ssbd = true;
+                break;
+            case MSR_IA32_ARCH_CAPABILITIES:
+                has_msr_arch_capabs = true;
+                break;
+            case MSR_IA32_CORE_CAPABILITY:
+                has_msr_core_capabs = true;
+                break;
             }
         }
-
-        g_free(kvm_msr_list);
     }
 
+    g_free(kvm_msr_list);
+
     return ret;
 }
 

From ef0aa6af47da16f5367f545deb599a14ea408bee Mon Sep 17 00:00:00 2001
From: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Date: Tue, 30 Jul 2019 19:01:36 +0300
Subject: [PATCH 10/33] test-throttle: Fix uninitialized use of burst_length

ThrottleState::cfg of the static variable 'ts' is reassigned with the
local one in the do_test_accounting() and then is passed to the
throttle_account() with uninitialized member LeakyBucket::burst_length.

Signed-off-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Message-Id: <1564502498-805893-2-git-send-email-andrey.shinkevich@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tests/test-throttle.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test-throttle.c b/tests/test-throttle.c
index 083e1f9ba8..7adb5e6652 100644
--- a/tests/test-throttle.c
+++ b/tests/test-throttle.c
@@ -558,6 +558,8 @@ static bool do_test_accounting(bool is_ops, /* are we testing bps or ops */
     BucketType index;
     int i;
 
+    throttle_config_init(&cfg);
+
     for (i = 0; i < 3; i++) {
         BucketType index = to_test[is_ops][i];
         cfg.buckets[index].avg = avg;

From f673174e3f62096f45b874fbe1cd469127d9c21c Mon Sep 17 00:00:00 2001
From: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Date: Tue, 30 Jul 2019 19:01:37 +0300
Subject: [PATCH 11/33] tests: Fix uninitialized byte in test_visitor_in_fuzz

One byte in the local buffer stays uninitialized, at least with the
first iteration, because of the double decrement in the
test_visitor_in_fuzz(). This is what Valgrind does not like and not
critical for the test itself. So, reduce the number of the memory
issues reports.

Signed-off-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Message-Id: <1564502498-805893-3-git-send-email-andrey.shinkevich@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tests/test-string-input-visitor.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/test-string-input-visitor.c b/tests/test-string-input-visitor.c
index 34b54dfc89..5418e085a4 100644
--- a/tests/test-string-input-visitor.c
+++ b/tests/test-string-input-visitor.c
@@ -444,16 +444,14 @@ static void test_visitor_in_fuzz(TestInputVisitorData *data,
     char buf[10000];
 
     for (i = 0; i < 100; i++) {
-        unsigned int j;
+        unsigned int j, k;
 
         j = g_test_rand_int_range(0, sizeof(buf) - 1);
 
         buf[j] = '\0';
 
-        if (j != 0) {
-            for (j--; j != 0; j--) {
-                buf[j - 1] = (char)g_test_rand_int_range(0, 256);
-            }
+        for (k = 0; k != j; k++) {
+            buf[k] = (char)g_test_rand_int_range(0, 256);
         }
 
         v = visitor_input_test_init(data, buf);

From 1f670a95b370651bcf3721aa0941d57e185478cf Mon Sep 17 00:00:00 2001
From: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Date: Tue, 30 Jul 2019 19:01:38 +0300
Subject: [PATCH 12/33] i386/kvm: initialize struct at full before ioctl call

Not the whole structure is initialized before passing it to the KVM.
Reduce the number of Valgrind reports.

Signed-off-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
Message-Id: <1564502498-805893-4-git-send-email-andrey.shinkevich@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/kvm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 608bb44270..615dfce909 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -193,6 +193,7 @@ static int kvm_get_tsc(CPUState *cs)
         return 0;
     }
 
+    memset(&msr_data, 0, sizeof(msr_data));
     msr_data.info.nmsrs = 1;
     msr_data.entries[0].index = MSR_IA32_TSC;
     env->tsc_valid = !runstate_is_running();
@@ -1709,6 +1710,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 
     if (has_xsave) {
         env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
+        memset(env->xsave_buf, 0, sizeof(struct kvm_xsave));
     }
 
     max_nested_state_len = kvm_max_nested_state_length();
@@ -3491,6 +3493,7 @@ static int kvm_put_debugregs(X86CPU *cpu)
         return 0;
     }
 
+    memset(&dbgregs, 0, sizeof(dbgregs));
     for (i = 0; i < 4; i++) {
         dbgregs.db[i] = env->dr[i];
     }

From 1e8a98b53867f61da9ca09f411288e2085d323c4 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 5 Aug 2019 19:03:32 +0100
Subject: [PATCH 13/33] target/i386: Return 'indefinite integer value' for
 invalid SSE fp->int conversions

The x86 architecture requires that all conversions from floating
point to integer which raise the 'invalid' exception (infinities of
both signs, NaN, and all values which don't fit in the destination
integer) return what the x86 spec calls the "indefinite integer
value", which is 0x8000_0000 for 32-bits or 0x8000_0000_0000_0000 for
64-bits.  The softfloat functions return the more usual behaviour of
positive overflows returning the maximum value that fits in the
destination integer format and negative overflows returning the
minimum value that fits.

Wrap the softfloat functions in x86-specific versions which
detect the 'invalid' condition and return the indefinite integer.

Note that we don't use these wrappers for the 3DNow! pf2id and pf2iw
instructions, which do return the minimum value that fits in
an int32 if the input float is a large negative number.

Fixes: https://bugs.launchpad.net/qemu/+bug/1815423
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Message-Id: <20190805180332.10185-1-peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/ops_sse.h | 88 +++++++++++++++++++++++++++++--------------
 1 file changed, 60 insertions(+), 28 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index ed05989768..ec1ec745d0 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -710,102 +710,134 @@ void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val)
 #endif
 
 /* float to integer */
+
+/*
+ * x86 mandates that we return the indefinite integer value for the result
+ * of any float-to-integer conversion that raises the 'invalid' exception.
+ * Wrap the softfloat functions to get this behaviour.
+ */
+#define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE)              \
+    static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s)        \
+    {                                                                   \
+        int oldflags, newflags;                                         \
+        RETTYPE r;                                                      \
+                                                                        \
+        oldflags = get_float_exception_flags(s);                        \
+        set_float_exception_flags(0, s);                                \
+        r = FN(a, s);                                                   \
+        newflags = get_float_exception_flags(s);                        \
+        if (newflags & float_flag_invalid) {                            \
+            r = INDEFVALUE;                                             \
+        }                                                               \
+        set_float_exception_flags(newflags | oldflags, s);              \
+        return r;                                                       \
+    }
+
+WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN)
+WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN)
+WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN)
+WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN)
+WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN)
+WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN)
+WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN)
+WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN)
+
 void helper_cvtps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 {
-    d->ZMM_L(0) = float32_to_int32(s->ZMM_S(0), &env->sse_status);
-    d->ZMM_L(1) = float32_to_int32(s->ZMM_S(1), &env->sse_status);
-    d->ZMM_L(2) = float32_to_int32(s->ZMM_S(2), &env->sse_status);
-    d->ZMM_L(3) = float32_to_int32(s->ZMM_S(3), &env->sse_status);
+    d->ZMM_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
+    d->ZMM_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status);
+    d->ZMM_L(2) = x86_float32_to_int32(s->ZMM_S(2), &env->sse_status);
+    d->ZMM_L(3) = x86_float32_to_int32(s->ZMM_S(3), &env->sse_status);
 }
 
 void helper_cvtpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 {
-    d->ZMM_L(0) = float64_to_int32(s->ZMM_D(0), &env->sse_status);
-    d->ZMM_L(1) = float64_to_int32(s->ZMM_D(1), &env->sse_status);
+    d->ZMM_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
+    d->ZMM_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status);
     d->ZMM_Q(1) = 0;
 }
 
 void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
 {
-    d->MMX_L(0) = float32_to_int32(s->ZMM_S(0), &env->sse_status);
-    d->MMX_L(1) = float32_to_int32(s->ZMM_S(1), &env->sse_status);
+    d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
+    d->MMX_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status);
 }
 
 void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
 {
-    d->MMX_L(0) = float64_to_int32(s->ZMM_D(0), &env->sse_status);
-    d->MMX_L(1) = float64_to_int32(s->ZMM_D(1), &env->sse_status);
+    d->MMX_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
+    d->MMX_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status);
 }
 
 int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s)
 {
-    return float32_to_int32(s->ZMM_S(0), &env->sse_status);
+    return x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
 }
 
 int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s)
 {
-    return float64_to_int32(s->ZMM_D(0), &env->sse_status);
+    return x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
 }
 
 #ifdef TARGET_X86_64
 int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s)
 {
-    return float32_to_int64(s->ZMM_S(0), &env->sse_status);
+    return x86_float32_to_int64(s->ZMM_S(0), &env->sse_status);
 }
 
 int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s)
 {
-    return float64_to_int64(s->ZMM_D(0), &env->sse_status);
+    return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status);
 }
 #endif
 
 /* float to integer truncated */
 void helper_cvttps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 {
-    d->ZMM_L(0) = float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
-    d->ZMM_L(1) = float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
-    d->ZMM_L(2) = float32_to_int32_round_to_zero(s->ZMM_S(2), &env->sse_status);
-    d->ZMM_L(3) = float32_to_int32_round_to_zero(s->ZMM_S(3), &env->sse_status);
+    d->ZMM_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
+    d->ZMM_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
+    d->ZMM_L(2) = x86_float32_to_int32_round_to_zero(s->ZMM_S(2), &env->sse_status);
+    d->ZMM_L(3) = x86_float32_to_int32_round_to_zero(s->ZMM_S(3), &env->sse_status);
 }
 
 void helper_cvttpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s)
 {
-    d->ZMM_L(0) = float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
-    d->ZMM_L(1) = float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
+    d->ZMM_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
+    d->ZMM_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
     d->ZMM_Q(1) = 0;
 }
 
 void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
 {
-    d->MMX_L(0) = float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
-    d->MMX_L(1) = float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
+    d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
+    d->MMX_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
 }
 
 void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
 {
-    d->MMX_L(0) = float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
-    d->MMX_L(1) = float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
+    d->MMX_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
+    d->MMX_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
 }
 
 int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s)
 {
-    return float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
+    return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
 }
 
 int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s)
 {
-    return float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
+    return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
 }
 
 #ifdef TARGET_X86_64
 int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s)
 {
-    return float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status);
+    return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status);
 }
 
 int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s)
 {
-    return float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status);
+    return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status);
 }
 #endif
 

From 9458a9a1df1a4c719e24512394d548c1fc7abd22 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 6 Feb 2018 18:37:39 +0100
Subject: [PATCH 14/33] memory: fix race between TCG and accesses to dirty
 bitmap

There is a race between TCG and accesses to the dirty log:

      vCPU thread                  reader thread
      -----------------------      -----------------------
      TLB check -> slow path
        notdirty_mem_write
          write to RAM
          set dirty flag
                                   clear dirty flag
      TLB check -> fast path
                                   read memory
        write to RAM

Fortunately, in order to fix it, no change is required to the
vCPU thread.  However, the reader thread must delay the read after
the vCPU thread has finished the write.  This can be approximated
conservatively by run_on_cpu, which waits for the end of the current
translation block.

A similar technique is used by KVM, which has to do a synchronous TLB
flush after doing a test-and-clear of the dirty-page flags.

Reported-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 exec.c                | 31 +++++++++++++++++++++++++++++++
 include/exec/memory.h | 12 ++++++++++++
 memory.c              | 10 +++++++++-
 migration/ram.c       |  1 +
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/exec.c b/exec.c
index 4aaa14b075..731b79d2e4 100644
--- a/exec.c
+++ b/exec.c
@@ -197,6 +197,7 @@ typedef struct subpage_t {
 
 static void io_mem_init(void);
 static void memory_map_init(void);
+static void tcg_log_global_after_sync(MemoryListener *listener);
 static void tcg_commit(MemoryListener *listener);
 
 static MemoryRegion io_mem_watch;
@@ -905,6 +906,7 @@ void cpu_address_space_init(CPUState *cpu, int asidx,
     newas->cpu = cpu;
     newas->as = as;
     if (tcg_enabled()) {
+        newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
         newas->tcg_as_listener.commit = tcg_commit;
         memory_listener_register(&newas->tcg_as_listener, as);
     }
@@ -3142,6 +3144,35 @@ void address_space_dispatch_free(AddressSpaceDispatch *d)
     g_free(d);
 }
 
+static void do_nothing(CPUState *cpu, run_on_cpu_data d)
+{
+}
+
+static void tcg_log_global_after_sync(MemoryListener *listener)
+{
+    CPUAddressSpace *cpuas;
+
+    /* Wait for the CPU to end the current TB.  This avoids the following
+     * incorrect race:
+     *
+     *      vCPU                         migration
+     *      ----------------------       -------------------------
+     *      TLB check -> slow path
+     *        notdirty_mem_write
+     *          write to RAM
+     *          mark dirty
+     *                                   clear dirty flag
+     *      TLB check -> fast path
+     *                                   read memory
+     *        write to RAM
+     *
+     * by pushing the migration thread's memory read after the vCPU thread has
+     * written the memory.
+     */
+    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
+    run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
+}
+
 static void tcg_commit(MemoryListener *listener)
 {
     CPUAddressSpace *cpuas;
diff --git a/include/exec/memory.h b/include/exec/memory.h
index d99eb25d2e..fddc2ff48a 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -425,6 +425,7 @@ struct MemoryListener {
     void (*log_clear)(MemoryListener *listener, MemoryRegionSection *section);
     void (*log_global_start)(MemoryListener *listener);
     void (*log_global_stop)(MemoryListener *listener);
+    void (*log_global_after_sync)(MemoryListener *listener);
     void (*eventfd_add)(MemoryListener *listener, MemoryRegionSection *section,
                         bool match_data, uint64_t data, EventNotifier *e);
     void (*eventfd_del)(MemoryListener *listener, MemoryRegionSection *section,
@@ -1687,6 +1688,17 @@ MemoryRegionSection memory_region_find(MemoryRegion *mr,
  */
 void memory_global_dirty_log_sync(void);
 
+/**
+ * memory_global_dirty_log_sync: synchronize the dirty log for all memory
+ *
+ * Synchronizes the vCPUs with a thread that is reading the dirty bitmap.
+ * This function must be called after the dirty log bitmap is cleared, and
+ * before dirty guest memory pages are read.  If you are using
+ * #DirtyBitmapSnapshot, memory_region_snapshot_and_clear_dirty() takes
+ * care of doing this.
+ */
+void memory_global_after_dirty_log_sync(void);
+
 /**
  * memory_region_transaction_begin: Start a transaction.
  *
diff --git a/memory.c b/memory.c
index c90a2cf629..4aa38eb5b1 100644
--- a/memory.c
+++ b/memory.c
@@ -2127,9 +2127,12 @@ DirtyBitmapSnapshot *memory_region_snapshot_and_clear_dirty(MemoryRegion *mr,
                                                             hwaddr size,
                                                             unsigned client)
 {
+    DirtyBitmapSnapshot *snapshot;
     assert(mr->ram_block);
     memory_region_sync_dirty_bitmap(mr);
-    return cpu_physical_memory_snapshot_and_clear_dirty(mr, addr, size, client);
+    snapshot = cpu_physical_memory_snapshot_and_clear_dirty(mr, addr, size, client);
+    memory_global_after_dirty_log_sync();
+    return snapshot;
 }
 
 bool memory_region_snapshot_get_dirty(MemoryRegion *mr, DirtyBitmapSnapshot *snap,
@@ -2620,6 +2623,11 @@ void memory_global_dirty_log_sync(void)
     memory_region_sync_dirty_bitmap(NULL);
 }
 
+void memory_global_after_dirty_log_sync(void)
+{
+    MEMORY_LISTENER_CALL_GLOBAL(log_global_after_sync, Forward);
+}
+
 static VMChangeStateEntry *vmstate_change;
 
 void memory_global_dirty_log_start(void)
diff --git a/migration/ram.c b/migration/ram.c
index 35552c090b..b01a37e7ca 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1857,6 +1857,7 @@ static void migration_bitmap_sync(RAMState *rs)
     rcu_read_unlock();
     qemu_mutex_unlock(&rs->bitmap_mutex);
 
+    memory_global_after_dirty_log_sync();
     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 
     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

From 8ff72af557dcae1e1d441983f9057f8c99cad26f Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Wed, 24 Jul 2019 12:58:20 +0100
Subject: [PATCH 15/33] mc146818rtc: Remove reset notifiers

The reset notifiers are unreliable and recalculating the offsets
after boot causes problems with migration in cases where explicit
base times are set on the destination.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20190724115823.4199-2-dgilbert@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/timer/mc146818rtc.c | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c
index 26618842c9..6cb378751b 100644
--- a/hw/timer/mc146818rtc.c
+++ b/hw/timer/mc146818rtc.c
@@ -96,7 +96,6 @@ typedef struct RTCState {
     uint32_t irq_coalesced;
     uint32_t period;
     QEMUTimer *coalesced_timer;
-    Notifier clock_reset_notifier;
     LostTickPolicy lost_tick_policy;
     Notifier suspend_notifier;
     QLIST_ENTRY(RTCState) link;
@@ -889,20 +888,6 @@ static const VMStateDescription vmstate_rtc = {
     }
 };
 
-static void rtc_notify_clock_reset(Notifier *notifier, void *data)
-{
-    RTCState *s = container_of(notifier, RTCState, clock_reset_notifier);
-    int64_t now = *(int64_t *)data;
-
-    rtc_set_date_from_host(ISA_DEVICE(s));
-    periodic_timer_update(s, now, 0);
-    check_update_timer(s);
-
-    if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) {
-        rtc_coalesced_timer_update(s);
-    }
-}
-
 /* set CMOS shutdown status register (index 0xF) as S3_resume(0xFE)
    BIOS will read it and start S3 resume at POST Entry */
 static void rtc_notify_suspend(Notifier *notifier, void *data)
@@ -988,10 +973,6 @@ static void rtc_realizefn(DeviceState *dev, Error **errp)
     s->update_timer = timer_new_ns(rtc_clock, rtc_update_timer, s);
     check_update_timer(s);
 
-    s->clock_reset_notifier.notify = rtc_notify_clock_reset;
-    qemu_clock_register_reset_notifier(rtc_clock,
-                                       &s->clock_reset_notifier);
-
     s->suspend_notifier.notify = rtc_notify_suspend;
     qemu_register_suspend_notifier(&s->suspend_notifier);
 

From 4ea9a0e3db39a3ef07f8a97c005733d127485891 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Wed, 24 Jul 2019 12:58:21 +0100
Subject: [PATCH 16/33] timer: Remove reset notifiers

Remove the reset notifer from the core qemu-timer code.
The only user was mc146818 and we've just remove it's use.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20190724115823.4199-3-dgilbert@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/qemu/timer.h | 22 ----------------------
 util/qemu-timer.c    | 21 +--------------------
 2 files changed, 1 insertion(+), 42 deletions(-)

diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 5d978e1634..6817c78ef4 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -227,28 +227,6 @@ void qemu_clock_enable(QEMUClockType type, bool enabled);
  */
 void qemu_start_warp_timer(void);
 
-/**
- * qemu_clock_register_reset_notifier:
- * @type: the clock type
- * @notifier: the notifier function
- *
- * Register a notifier function to call when the clock
- * concerned is reset.
- */
-void qemu_clock_register_reset_notifier(QEMUClockType type,
-                                        Notifier *notifier);
-
-/**
- * qemu_clock_unregister_reset_notifier:
- * @type: the clock type
- * @notifier: the notifier function
- *
- * Unregister a notifier function to call when the clock
- * concerned is reset.
- */
-void qemu_clock_unregister_reset_notifier(QEMUClockType type,
-                                          Notifier *notifier);
-
 /**
  * qemu_clock_run_timers:
  * @type: clock on which to operate
diff --git a/util/qemu-timer.c b/util/qemu-timer.c
index b0e40a9087..1ac831434d 100644
--- a/util/qemu-timer.c
+++ b/util/qemu-timer.c
@@ -47,7 +47,6 @@ typedef struct QEMUClock {
     /* We rely on BQL to protect the timerlists */
     QLIST_HEAD(, QEMUTimerList) timerlists;
 
-    NotifierList reset_notifiers;
     int64_t last;
 
     QEMUClockType type;
@@ -132,7 +131,6 @@ static void qemu_clock_init(QEMUClockType type, QEMUTimerListNotifyCB *notify_cb
     clock->enabled = (type == QEMU_CLOCK_VIRTUAL ? false : true);
     clock->last = INT64_MIN;
     QLIST_INIT(&clock->timerlists);
-    notifier_list_init(&clock->reset_notifiers);
     main_loop_tlg.tl[type] = timerlist_new(type, notify_cb, NULL);
 }
 
@@ -629,7 +627,7 @@ int64_t timerlistgroup_deadline_ns(QEMUTimerListGroup *tlg)
 
 int64_t qemu_clock_get_ns(QEMUClockType type)
 {
-    int64_t now, last;
+    int64_t now;
     QEMUClock *clock = qemu_clock_ptr(type);
 
     switch (type) {
@@ -644,11 +642,7 @@ int64_t qemu_clock_get_ns(QEMUClockType type)
         }
     case QEMU_CLOCK_HOST:
         now = REPLAY_CLOCK(REPLAY_CLOCK_HOST, get_clock_realtime());
-        last = clock->last;
         clock->last = now;
-        if (now < last || now > (last + get_max_clock_jump())) {
-            notifier_list_notify(&clock->reset_notifiers, &now);
-        }
         return now;
     case QEMU_CLOCK_VIRTUAL_RT:
         return REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT, cpu_get_clock());
@@ -667,19 +661,6 @@ void qemu_clock_set_last(QEMUClockType type, uint64_t last)
     clock->last = last;
 }
 
-void qemu_clock_register_reset_notifier(QEMUClockType type,
-                                        Notifier *notifier)
-{
-    QEMUClock *clock = qemu_clock_ptr(type);
-    notifier_list_add(&clock->reset_notifiers, notifier);
-}
-
-void qemu_clock_unregister_reset_notifier(QEMUClockType type,
-                                          Notifier *notifier)
-{
-    notifier_remove(notifier);
-}
-
 void init_clocks(QEMUTimerListNotifyCB *notify_cb)
 {
     QEMUClockType type;

From a02fe2ca70ca2e05345587b2b947d64a446bcbd3 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Wed, 24 Jul 2019 12:58:22 +0100
Subject: [PATCH 17/33] replay: Remove host_clock_last

Now we're not using the 'last' field in the timer, remove it from
replay.

Bump the version number of the replay structure since we've
removed the field.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20190724115823.4199-4-dgilbert@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 replay/replay-snapshot.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/replay/replay-snapshot.c b/replay/replay-snapshot.c
index 5dd8680480..e6a7d009ec 100644
--- a/replay/replay-snapshot.c
+++ b/replay/replay-snapshot.c
@@ -23,7 +23,6 @@ static int replay_pre_save(void *opaque)
 {
     ReplayState *state = opaque;
     state->file_offset = ftell(replay_file);
-    state->host_clock_last = qemu_clock_get_last(QEMU_CLOCK_HOST);
 
     return 0;
 }
@@ -33,7 +32,6 @@ static int replay_post_load(void *opaque, int version_id)
     ReplayState *state = opaque;
     if (replay_mode == REPLAY_MODE_PLAY) {
         fseek(replay_file, state->file_offset, SEEK_SET);
-        qemu_clock_set_last(QEMU_CLOCK_HOST, state->host_clock_last);
         /* If this was a vmstate, saved in recording mode,
            we need to initialize replay data fields. */
         replay_fetch_data_kind();
@@ -49,8 +47,8 @@ static int replay_post_load(void *opaque, int version_id)
 
 static const VMStateDescription vmstate_replay = {
     .name = "replay",
-    .version_id = 1,
-    .minimum_version_id = 1,
+    .version_id = 2,
+    .minimum_version_id = 2,
     .pre_save = replay_pre_save,
     .post_load = replay_post_load,
     .fields = (VMStateField[]) {
@@ -61,7 +59,6 @@ static const VMStateDescription vmstate_replay = {
         VMSTATE_UINT32(has_unread_data, ReplayState),
         VMSTATE_UINT64(file_offset, ReplayState),
         VMSTATE_UINT64(block_request_id, ReplayState),
-        VMSTATE_UINT64(host_clock_last, ReplayState),
         VMSTATE_INT32(read_event_kind, ReplayState),
         VMSTATE_UINT64(read_event_id, ReplayState),
         VMSTATE_INT32(read_event_checkpoint, ReplayState),

From 3c2d4c8aa6a98366c9fe2f36305f12199257a7d5 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Date: Wed, 24 Jul 2019 12:58:23 +0100
Subject: [PATCH 18/33] timer: last, remove last bits of last

The reset notifiers kept a 'last' counter to notice jumps;
now that we've remove the notifier we don't need to keep 'last'.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20190724115823.4199-5-dgilbert@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/qemu/timer.h | 13 -------------
 util/qemu-timer.c    | 22 +---------------------
 2 files changed, 1 insertion(+), 34 deletions(-)

diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 6817c78ef4..5bcab935f6 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -248,19 +248,6 @@ bool qemu_clock_run_timers(QEMUClockType type);
  */
 bool qemu_clock_run_all_timers(void);
 
-/**
- * qemu_clock_get_last:
- *
- * Returns last clock query time.
- */
-uint64_t qemu_clock_get_last(QEMUClockType type);
-/**
- * qemu_clock_set_last:
- *
- * Sets last clock query time.
- */
-void qemu_clock_set_last(QEMUClockType type, uint64_t last);
-
 
 /*
  * QEMUTimerList
diff --git a/util/qemu-timer.c b/util/qemu-timer.c
index 1ac831434d..37ecfe31e8 100644
--- a/util/qemu-timer.c
+++ b/util/qemu-timer.c
@@ -47,8 +47,6 @@ typedef struct QEMUClock {
     /* We rely on BQL to protect the timerlists */
     QLIST_HEAD(, QEMUTimerList) timerlists;
 
-    int64_t last;
-
     QEMUClockType type;
     bool enabled;
 } QEMUClock;
@@ -129,7 +127,6 @@ static void qemu_clock_init(QEMUClockType type, QEMUTimerListNotifyCB *notify_cb
 
     clock->type = type;
     clock->enabled = (type == QEMU_CLOCK_VIRTUAL ? false : true);
-    clock->last = INT64_MIN;
     QLIST_INIT(&clock->timerlists);
     main_loop_tlg.tl[type] = timerlist_new(type, notify_cb, NULL);
 }
@@ -627,9 +624,6 @@ int64_t timerlistgroup_deadline_ns(QEMUTimerListGroup *tlg)
 
 int64_t qemu_clock_get_ns(QEMUClockType type)
 {
-    int64_t now;
-    QEMUClock *clock = qemu_clock_ptr(type);
-
     switch (type) {
     case QEMU_CLOCK_REALTIME:
         return get_clock();
@@ -641,26 +635,12 @@ int64_t qemu_clock_get_ns(QEMUClockType type)
             return cpu_get_clock();
         }
     case QEMU_CLOCK_HOST:
-        now = REPLAY_CLOCK(REPLAY_CLOCK_HOST, get_clock_realtime());
-        clock->last = now;
-        return now;
+        return REPLAY_CLOCK(REPLAY_CLOCK_HOST, get_clock_realtime());
     case QEMU_CLOCK_VIRTUAL_RT:
         return REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT, cpu_get_clock());
     }
 }
 
-uint64_t qemu_clock_get_last(QEMUClockType type)
-{
-    QEMUClock *clock = qemu_clock_ptr(type);
-    return clock->last;
-}
-
-void qemu_clock_set_last(QEMUClockType type, uint64_t last)
-{
-    QEMUClock *clock = qemu_clock_ptr(type);
-    clock->last = last;
-}
-
 void init_clocks(QEMUTimerListNotifyCB *notify_cb)
 {
     QEMUClockType type;

From 245429e4a0e036471ba46e2393b6f33b78b9615e Mon Sep 17 00:00:00 2001
From: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
Date: Thu, 25 Jul 2019 11:44:15 +0300
Subject: [PATCH 19/33] replay: add missing fix for internal function

This is a fix which was missed by patch
74c0b816adfc6aa1b01b4426fdf385e32e35cbac, which added current_step
parameter to the replay_advance_current_step function.

Signed-off-by: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
Message-Id: <156404425561.18669.13015037579222450241.stgit@pasha-Precision-3630-Tower>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 replay/replay-internal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/replay/replay-internal.c b/replay/replay-internal.c
index c013b23820..7b42767059 100644
--- a/replay/replay-internal.c
+++ b/replay/replay-internal.c
@@ -229,7 +229,7 @@ void replay_mutex_unlock(void)
 
 void replay_advance_current_step(uint64_t current_step)
 {
-    int diff = (int)(replay_get_current_step() - replay_state.current_step);
+    int diff = (int)(current_step - replay_state.current_step);
 
     /* Time can only go forward */
     assert(diff >= 0);

From 978ae0e99c1760c228eef2d320386daab3bf5b10 Mon Sep 17 00:00:00 2001
From: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
Date: Thu, 25 Jul 2019 11:44:21 +0300
Subject: [PATCH 20/33] replay: document development rules

This patch introduces docs/devel/replay.txt which describes the rules
that should be followed to make virtual devices usable in record/replay mode.

Signed-off-by: Pavel Dovgalyuk <Pavel.Dovgauk@ispras.ru>

--

v9: fixed external virtual clock description (reported by Artem Pisarenko)
Message-Id: <156404426119.18669.6707258931552832854.stgit@pasha-Precision-3630-Tower>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
---
 docs/devel/replay.txt | 46 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 docs/devel/replay.txt

diff --git a/docs/devel/replay.txt b/docs/devel/replay.txt
new file mode 100644
index 0000000000..e641c35add
--- /dev/null
+++ b/docs/devel/replay.txt
@@ -0,0 +1,46 @@
+Record/replay mechanism, that could be enabled through icount mode, expects
+the virtual devices to satisfy the following requirements.
+
+The main idea behind this document is that everything that affects
+the guest state during execution in icount mode should be deterministic.
+
+Timers
+======
+
+All virtual devices should use virtual clock for timers that change the guest
+state. Virtual clock is deterministic, therefore such timers are deterministic
+too.
+
+Virtual devices can also use realtime clock for the events that do not change
+the guest state directly. When the clock ticking should depend on VM execution
+speed, use virtual clock with EXTERNAL attribute. It is not deterministic,
+but its speed depends on the guest execution. This clock is used by
+the virtual devices (e.g., slirp routing device) that lie outside the
+replayed guest.
+
+Bottom halves
+=============
+
+Bottom half callbacks, that affect the guest state, should be invoked through
+replay_bh_schedule_event or replay_bh_schedule_oneshot_event functions.
+Their invocations are saved in record mode and synchronized with the existing
+log in replay mode.
+
+Saving/restoring the VM state
+=============================
+
+All fields in the device state structure (including virtual timers)
+should be restored by loadvm to the same values they had before savevm.
+
+Avoid accessing other devices' state, because the order of saving/restoring
+is not defined. It means that you should not call functions like
+'update_irq' in post_load callback. Save everything explicitly to avoid
+the dependencies that may make restoring the VM state non-deterministic.
+
+Stopping the VM
+===============
+
+Stopping the guest should not interfere with its state (with the exception
+of the network connections, that could be broken by the remote timeouts).
+VM can be stopped at any moment of replay by the user. Restarting the VM
+after that stop should not break the replay by the unneeded guest state change.

From dcb1578069dd072f9aec74e3024cadb9ed0f3aae Mon Sep 17 00:00:00 2001
From: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
Date: Thu, 25 Jul 2019 11:44:26 +0300
Subject: [PATCH 21/33] util/qemu-timer: refactor deadline calculation for
 external timers

icount-based record/replay uses qemu_clock_deadline_ns_all to measure
the period until vCPU may be interrupted.
This function takes in account the virtual timers, because they belong
to the virtual devices that may generate interrupt request or affect
the virtual machine state.
However, there are a subset of virtual timers, that are marked with
'external' flag. These do not change the virtual machine state and
only based on virtual clock. Calculating the deadling using the external
timers breaks the determinism, because they do not belong to the replayed
part of the virtual machine.
This patch fixes the deadline calculation for this case by adding
new parameter for skipping the external timers when it is needed.

Signed-off-by: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>

--

v2 changes:
 - added new parameter for timer attribute mask
Message-Id: <156404426682.18669.17014100602930969222.stgit@pasha-Precision-3630-Tower>

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 cpus.c                    | 17 ++++++++++++-----
 include/qemu/timer.h      |  8 ++++++--
 qtest.c                   |  3 ++-
 tests/ptimer-test-stubs.c |  4 ++--
 tests/ptimer-test.c       |  6 ++++--
 util/qemu-timer.c         | 30 +++++++++++++++++++++++++++---
 6 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/cpus.c b/cpus.c
index f4ad855b77..85cd451a86 100644
--- a/cpus.c
+++ b/cpus.c
@@ -556,7 +556,8 @@ void qtest_clock_warp(int64_t dest)
     assert(qtest_enabled());
     aio_context = qemu_get_aio_context();
     while (clock < dest) {
-        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
+                                                      QEMU_TIMER_ATTR_ALL);
         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 
         seqlock_write_lock(&timers_state.vm_clock_seqlock,
@@ -616,7 +617,8 @@ void qemu_start_warp_timer(void)
 
     /* We want to use the earliest deadline from ALL vm_clocks */
     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
-    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
+                                          ~QEMU_TIMER_ATTR_EXTERNAL);
     if (deadline < 0) {
         static bool notified;
         if (!icount_sleep && !notified) {
@@ -1352,7 +1354,12 @@ static int64_t tcg_get_icount_limit(void)
     int64_t deadline;
 
     if (replay_mode != REPLAY_MODE_PLAY) {
-        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+        /*
+         * Include all the timers, because they may need an attention.
+         * Too long CPU execution may create unnecessary delay in UI.
+         */
+        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
+                                              QEMU_TIMER_ATTR_ALL);
 
         /* Maintain prior (possibly buggy) behaviour where if no deadline
          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
@@ -1373,8 +1380,8 @@ static void handle_icount_deadline(void)
 {
     assert(qemu_in_vcpu_thread());
     if (use_icount) {
-        int64_t deadline =
-            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
+                                                      QEMU_TIMER_ATTR_ALL);
 
         if (deadline == 0) {
             /* Wake up other AioContexts.  */
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index 5bcab935f6..85bc6eb00b 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -62,13 +62,15 @@ typedef enum {
  * The following attributes are available:
  *
  * QEMU_TIMER_ATTR_EXTERNAL: drives external subsystem
+ * QEMU_TIMER_ATTR_ALL: mask for all existing attributes
  *
  * Timers with this attribute do not recorded in rr mode, therefore it could be
  * used for the subsystems that operate outside the guest core. Applicable only
  * with virtual clock type.
  */
 
-#define QEMU_TIMER_ATTR_EXTERNAL BIT(0)
+#define QEMU_TIMER_ATTR_EXTERNAL ((int)BIT(0))
+#define QEMU_TIMER_ATTR_ALL      0xffffffff
 
 typedef struct QEMUTimerList QEMUTimerList;
 
@@ -177,6 +179,8 @@ bool qemu_clock_use_for_deadline(QEMUClockType type);
 /**
  * qemu_clock_deadline_ns_all:
  * @type: the clock type
+ * @attr_mask: mask for the timer attributes that are included
+ *             in deadline calculation
  *
  * Calculate the deadline across all timer lists associated
  * with a clock (as opposed to just the default one)
@@ -184,7 +188,7 @@ bool qemu_clock_use_for_deadline(QEMUClockType type);
  *
  * Returns: time until expiry in nanoseconds or -1
  */
-int64_t qemu_clock_deadline_ns_all(QEMUClockType type);
+int64_t qemu_clock_deadline_ns_all(QEMUClockType type, int attr_mask);
 
 /**
  * qemu_clock_get_main_loop_timerlist:
diff --git a/qtest.c b/qtest.c
index 59df1b6614..d8b4857c74 100644
--- a/qtest.c
+++ b/qtest.c
@@ -654,7 +654,8 @@ static void qtest_process_command(CharBackend *chr, gchar **words)
             int ret = qemu_strtoi64(words[1], NULL, 0, &ns);
             g_assert(ret == 0);
         } else {
-            ns = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+            ns = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
+                                            QEMU_TIMER_ATTR_ALL);
         }
         qtest_clock_warp(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + ns);
         qtest_send_prefix(chr);
diff --git a/tests/ptimer-test-stubs.c b/tests/ptimer-test-stubs.c
index 54b3fd26f6..ed393d9082 100644
--- a/tests/ptimer-test-stubs.c
+++ b/tests/ptimer-test-stubs.c
@@ -88,9 +88,9 @@ int64_t qemu_clock_get_ns(QEMUClockType type)
     return ptimer_test_time_ns;
 }
 
-int64_t qemu_clock_deadline_ns_all(QEMUClockType type)
+int64_t qemu_clock_deadline_ns_all(QEMUClockType type, int attr_mask)
 {
-    QEMUTimerList *timer_list = main_loop_tlg.tl[type];
+    QEMUTimerList *timer_list = main_loop_tlg.tl[QEMU_CLOCK_VIRTUAL];
     QEMUTimer *t = timer_list->active_timers.next;
     int64_t deadline = -1;
 
diff --git a/tests/ptimer-test.c b/tests/ptimer-test.c
index b30aad0737..5b20e91599 100644
--- a/tests/ptimer-test.c
+++ b/tests/ptimer-test.c
@@ -50,13 +50,15 @@ static void ptimer_test_set_qemu_time_ns(int64_t ns)
 
 static void qemu_clock_step(uint64_t ns)
 {
-    int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+    int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
+                                                  QEMU_TIMER_ATTR_ALL);
     int64_t advanced_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + ns;
 
     while (deadline != -1 && deadline <= advanced_time) {
         ptimer_test_set_qemu_time_ns(deadline);
         ptimer_test_expire_qemu_timers(deadline, QEMU_CLOCK_VIRTUAL);
-        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
+                                              QEMU_TIMER_ATTR_ALL);
     }
 
     ptimer_test_set_qemu_time_ns(advanced_time);
diff --git a/util/qemu-timer.c b/util/qemu-timer.c
index 37ecfe31e8..d428fec567 100644
--- a/util/qemu-timer.c
+++ b/util/qemu-timer.c
@@ -247,14 +247,38 @@ int64_t timerlist_deadline_ns(QEMUTimerList *timer_list)
  * ignore whether or not the clock should be used in deadline
  * calculations.
  */
-int64_t qemu_clock_deadline_ns_all(QEMUClockType type)
+int64_t qemu_clock_deadline_ns_all(QEMUClockType type, int attr_mask)
 {
     int64_t deadline = -1;
+    int64_t delta;
+    int64_t expire_time;
+    QEMUTimer *ts;
     QEMUTimerList *timer_list;
     QEMUClock *clock = qemu_clock_ptr(type);
+
+    if (!clock->enabled) {
+        return -1;
+    }
+
     QLIST_FOREACH(timer_list, &clock->timerlists, list) {
-        deadline = qemu_soonest_timeout(deadline,
-                                        timerlist_deadline_ns(timer_list));
+        qemu_mutex_lock(&timer_list->active_timers_lock);
+        ts = timer_list->active_timers;
+        /* Skip all external timers */
+        while (ts && (ts->attributes & ~attr_mask)) {
+            ts = ts->next;
+        }
+        if (!ts) {
+            qemu_mutex_unlock(&timer_list->active_timers_lock);
+            continue;
+        }
+        expire_time = ts->expire_time;
+        qemu_mutex_unlock(&timer_list->active_timers_lock);
+
+        delta = expire_time - qemu_clock_get_ns(type);
+        if (delta <= 0) {
+            delta = 0;
+        }
+        deadline = qemu_soonest_timeout(deadline, delta);
     }
     return deadline;
 }

From e957ad8aefa2a426c607683d4cac24bcee09ede4 Mon Sep 17 00:00:00 2001
From: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
Date: Thu, 25 Jul 2019 11:44:32 +0300
Subject: [PATCH 22/33] replay: fix replay shutdown

This patch fixes shutdown of the replay process, which is terminated with
the assert when shutdown event is read from the log.
replay_finish_event reads new data_kind and therefore the value of data_kind
should be preserved to be valid at qemu_system_shutdown_request call.

Signed-off-by: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
Message-Id: <156404427238.18669.12378772823692338069.stgit@pasha-Precision-3630-Tower>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 replay/replay.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/replay/replay.c b/replay/replay.c
index 0c4e9c1318..69f973cd21 100644
--- a/replay/replay.c
+++ b/replay/replay.c
@@ -45,14 +45,14 @@ bool replay_next_event_is(int event)
     }
 
     while (true) {
-        if (event == replay_state.data_kind) {
+        unsigned int data_kind = replay_state.data_kind;
+        if (event == data_kind) {
             res = true;
         }
-        switch (replay_state.data_kind) {
+        switch (data_kind) {
         case EVENT_SHUTDOWN ... EVENT_SHUTDOWN_LAST:
             replay_finish_event();
-            qemu_system_shutdown_request(replay_state.data_kind -
-                                         EVENT_SHUTDOWN);
+            qemu_system_shutdown_request(data_kind - EVENT_SHUTDOWN);
             break;
         default:
             /* clock, time_t, checkpoint and other events */

From 82f49156531b3345fe03a74ca449147db5828238 Mon Sep 17 00:00:00 2001
From: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
Date: Thu, 25 Jul 2019 11:44:38 +0300
Subject: [PATCH 23/33] replay: refine replay-time module

This patch removes refactoring artifacts from the replay/replay-time.c

Signed-off-by: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
Message-Id: <156404427799.18669.8072341590511911277.stgit@pasha-Precision-3630-Tower>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 replay/replay-time.c | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/replay/replay-time.c b/replay/replay-time.c
index 5154cb0ce9..49c9e5d8b2 100644
--- a/replay/replay-time.c
+++ b/replay/replay-time.c
@@ -14,18 +14,19 @@
 #include "replay-internal.h"
 #include "qemu/error-report.h"
 
-int64_t replay_save_clock(ReplayClockKind kind, int64_t clock, int64_t raw_icount)
+int64_t replay_save_clock(ReplayClockKind kind, int64_t clock,
+                          int64_t raw_icount)
 {
-    if (replay_file) {
-        g_assert(replay_mutex_locked());
+    g_assert(replay_file);
+    g_assert(replay_mutex_locked());
 
-        /* Due to the caller's locking requirements we get the icount from it
-         * instead of using replay_save_instructions().
-         */
-        replay_advance_current_step(raw_icount);
-        replay_put_event(EVENT_CLOCK + kind);
-        replay_put_qword(clock);
-    }
+    /*
+     * Due to the caller's locking requirements we get the icount from it
+     * instead of using replay_save_instructions().
+     */
+    replay_advance_current_step(raw_icount);
+    replay_put_event(EVENT_CLOCK + kind);
+    replay_put_qword(clock);
 
     return clock;
 }
@@ -47,20 +48,15 @@ void replay_read_next_clock(ReplayClockKind kind)
 /*! Reads next clock event from the input. */
 int64_t replay_read_clock(ReplayClockKind kind)
 {
+    int64_t ret;
     g_assert(replay_file && replay_mutex_locked());
 
     replay_account_executed_instructions();
 
-    if (replay_file) {
-        int64_t ret;
-        if (replay_next_event_is(EVENT_CLOCK + kind)) {
-            replay_read_next_clock(kind);
-        }
-        ret = replay_state.cached_clock[kind];
-
-        return ret;
+    if (replay_next_event_is(EVENT_CLOCK + kind)) {
+        replay_read_next_clock(kind);
     }
+    ret = replay_state.cached_clock[kind];
 
-    error_report("REPLAY INTERNAL ERROR %d", __LINE__);
-    exit(1);
+    return ret;
 }

From 13f267133f349f8a322b5385a58688f0dcdf9ed2 Mon Sep 17 00:00:00 2001
From: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
Date: Thu, 25 Jul 2019 11:44:43 +0300
Subject: [PATCH 24/33] replay: rename step-related variables and functions

This patch renames replay_get_current_step() and related variables
to make these names consistent with existing 'icount' command line
option and future record/replay hmp/qmp commands.

Signed-off-by: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
Message-Id: <156404428377.18669.15476429889039912070.stgit@pasha-Precision-3630-Tower>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/sysemu/replay.h  |  2 +-
 replay/replay-events.c   |  2 +-
 replay/replay-internal.c | 10 +++++-----
 replay/replay-internal.h | 10 +++++-----
 replay/replay-snapshot.c |  6 +++---
 replay/replay-time.c     |  2 +-
 replay/replay.c          | 22 +++++++++++-----------
 7 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h
index 2f2ccdbc98..d223edda11 100644
--- a/include/sysemu/replay.h
+++ b/include/sysemu/replay.h
@@ -75,7 +75,7 @@ void replay_add_blocker(Error *reason);
 /* Processing the instructions */
 
 /*! Returns number of executed instructions. */
-uint64_t replay_get_current_step(void);
+uint64_t replay_get_current_icount(void);
 /*! Returns number of instructions to execute in replay mode. */
 int replay_get_instructions(void);
 /*! Updates instructions counter in replay mode. */
diff --git a/replay/replay-events.c b/replay/replay-events.c
index 60d17f6edb..008e80f636 100644
--- a/replay/replay-events.c
+++ b/replay/replay-events.c
@@ -124,7 +124,7 @@ void replay_add_event(ReplayAsyncEventKind event_kind,
 void replay_bh_schedule_event(QEMUBH *bh)
 {
     if (events_enabled) {
-        uint64_t id = replay_get_current_step();
+        uint64_t id = replay_get_current_icount();
         replay_add_event(REPLAY_ASYNC_EVENT_BH, bh, NULL, id);
     } else {
         qemu_bh_schedule(bh);
diff --git a/replay/replay-internal.c b/replay/replay-internal.c
index 7b42767059..eba8246aae 100644
--- a/replay/replay-internal.c
+++ b/replay/replay-internal.c
@@ -173,7 +173,7 @@ void replay_fetch_data_kind(void)
         if (!replay_state.has_unread_data) {
             replay_state.data_kind = replay_get_byte();
             if (replay_state.data_kind == EVENT_INSTRUCTION) {
-                replay_state.instructions_count = replay_get_dword();
+                replay_state.instruction_count = replay_get_dword();
             }
             replay_check_error();
             replay_state.has_unread_data = 1;
@@ -227,9 +227,9 @@ void replay_mutex_unlock(void)
     }
 }
 
-void replay_advance_current_step(uint64_t current_step)
+void replay_advance_current_icount(uint64_t current_icount)
 {
-    int diff = (int)(current_step - replay_state.current_step);
+    int diff = (int)(current_icount - replay_state.current_icount);
 
     /* Time can only go forward */
     assert(diff >= 0);
@@ -237,7 +237,7 @@ void replay_advance_current_step(uint64_t current_step)
     if (diff > 0) {
         replay_put_event(EVENT_INSTRUCTION);
         replay_put_dword(diff);
-        replay_state.current_step += diff;
+        replay_state.current_icount += diff;
     }
 }
 
@@ -246,6 +246,6 @@ void replay_save_instructions(void)
 {
     if (replay_file && replay_mode == REPLAY_MODE_RECORD) {
         g_assert(replay_mutex_locked());
-        replay_advance_current_step(replay_get_current_step());
+        replay_advance_current_icount(replay_get_current_icount());
     }
 }
diff --git a/replay/replay-internal.h b/replay/replay-internal.h
index af6f4d55d4..afba9a3e0c 100644
--- a/replay/replay-internal.h
+++ b/replay/replay-internal.h
@@ -64,10 +64,10 @@ typedef enum ReplayAsyncEventKind ReplayAsyncEventKind;
 typedef struct ReplayState {
     /*! Cached clock values. */
     int64_t cached_clock[REPLAY_CLOCK_COUNT];
-    /*! Current step - number of processed instructions and timer events. */
-    uint64_t current_step;
+    /*! Current icount - number of processed instructions. */
+    uint64_t current_icount;
     /*! Number of instructions to be executed before other events happen. */
-    int instructions_count;
+    int instruction_count;
     /*! Type of the currently executed event. */
     unsigned int data_kind;
     /*! Flag which indicates that event is not processed yet. */
@@ -122,8 +122,8 @@ void replay_finish_event(void);
     data_kind variable. */
 void replay_fetch_data_kind(void);
 
-/*! Advance replay_state.current_step to the specified value. */
-void replay_advance_current_step(uint64_t current_step);
+/*! Advance replay_state.current_icount to the specified value. */
+void replay_advance_current_icount(uint64_t current_icount);
 /*! Saves queued events (like instructions and sound). */
 void replay_save_instructions(void);
 
diff --git a/replay/replay-snapshot.c b/replay/replay-snapshot.c
index e6a7d009ec..e26fa4c892 100644
--- a/replay/replay-snapshot.c
+++ b/replay/replay-snapshot.c
@@ -38,7 +38,7 @@ static int replay_post_load(void *opaque, int version_id)
     } else if (replay_mode == REPLAY_MODE_RECORD) {
         /* This is only useful for loading the initial state.
            Therefore reset all the counters. */
-        state->instructions_count = 0;
+        state->instruction_count = 0;
         state->block_request_id = 0;
     }
 
@@ -53,8 +53,8 @@ static const VMStateDescription vmstate_replay = {
     .post_load = replay_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_INT64_ARRAY(cached_clock, ReplayState, REPLAY_CLOCK_COUNT),
-        VMSTATE_UINT64(current_step, ReplayState),
-        VMSTATE_INT32(instructions_count, ReplayState),
+        VMSTATE_UINT64(current_icount, ReplayState),
+        VMSTATE_INT32(instruction_count, ReplayState),
         VMSTATE_UINT32(data_kind, ReplayState),
         VMSTATE_UINT32(has_unread_data, ReplayState),
         VMSTATE_UINT64(file_offset, ReplayState),
diff --git a/replay/replay-time.c b/replay/replay-time.c
index 49c9e5d8b2..43357c9f24 100644
--- a/replay/replay-time.c
+++ b/replay/replay-time.c
@@ -24,7 +24,7 @@ int64_t replay_save_clock(ReplayClockKind kind, int64_t clock,
      * Due to the caller's locking requirements we get the icount from it
      * instead of using replay_save_instructions().
      */
-    replay_advance_current_step(raw_icount);
+    replay_advance_current_icount(raw_icount);
     replay_put_event(EVENT_CLOCK + kind);
     replay_put_qword(clock);
 
diff --git a/replay/replay.c b/replay/replay.c
index 69f973cd21..e540d795a8 100644
--- a/replay/replay.c
+++ b/replay/replay.c
@@ -39,7 +39,7 @@ bool replay_next_event_is(int event)
     bool res = false;
 
     /* nothing to skip - not all instructions used */
-    if (replay_state.instructions_count != 0) {
+    if (replay_state.instruction_count != 0) {
         assert(replay_state.data_kind == EVENT_INSTRUCTION);
         return event == EVENT_INSTRUCTION;
     }
@@ -62,7 +62,7 @@ bool replay_next_event_is(int event)
     return res;
 }
 
-uint64_t replay_get_current_step(void)
+uint64_t replay_get_current_icount(void)
 {
     return cpu_get_icount_raw();
 }
@@ -72,7 +72,7 @@ int replay_get_instructions(void)
     int res = 0;
     replay_mutex_lock();
     if (replay_next_event_is(EVENT_INSTRUCTION)) {
-        res = replay_state.instructions_count;
+        res = replay_state.instruction_count;
     }
     replay_mutex_unlock();
     return res;
@@ -82,16 +82,16 @@ void replay_account_executed_instructions(void)
 {
     if (replay_mode == REPLAY_MODE_PLAY) {
         g_assert(replay_mutex_locked());
-        if (replay_state.instructions_count > 0) {
-            int count = (int)(replay_get_current_step()
-                              - replay_state.current_step);
+        if (replay_state.instruction_count > 0) {
+            int count = (int)(replay_get_current_icount()
+                              - replay_state.current_icount);
 
             /* Time can only go forward */
             assert(count >= 0);
 
-            replay_state.instructions_count -= count;
-            replay_state.current_step += count;
-            if (replay_state.instructions_count == 0) {
+            replay_state.instruction_count -= count;
+            replay_state.current_icount += count;
+            if (replay_state.instruction_count == 0) {
                 assert(replay_state.data_kind == EVENT_INSTRUCTION);
                 replay_finish_event();
                 /* Wake up iothread. This is required because
@@ -273,8 +273,8 @@ static void replay_enable(const char *fname, int mode)
     replay_mutex_init();
 
     replay_state.data_kind = -1;
-    replay_state.instructions_count = 0;
-    replay_state.current_step = 0;
+    replay_state.instruction_count = 0;
+    replay_state.current_icount = 0;
     replay_state.has_unread_data = 0;
 
     /* skip file header for RECORD and check it for PLAY */

From ba3e7926691ed33e1164fafbd4fb2e8e50e7c4cd Mon Sep 17 00:00:00 2001
From: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
Date: Thu, 25 Jul 2019 11:44:49 +0300
Subject: [PATCH 25/33] icount: clean up cpu_can_io at the entry to the block

Most of IO instructions can be executed only at the end of the block in
icount mode. Therefore translator can set cpu_can_io flag when translating
the last instruction.
But when the blocks are chained, then this flag is not reset and may
remain set at the beginning of the next block.
This patch resets the flag at the entry of any translation block,
making I/O operations impossible by default.

Signed-off-by: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>

--

v2 changes:
 - reset can_do_io at the start of every TB (suggested by Paolo Bonzini)
Message-Id: <156404428943.18669.15747009371169578935.stgit@pasha-Precision-3630-Tower>

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 accel/tcg/cpu-exec.c      |  1 -
 include/exec/gen-icount.h | 38 ++++++++++++++++++++------------------
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index 6c85c3ee1e..48272c781b 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -169,7 +169,6 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
     }
 #endif /* DEBUG_DISAS */
 
-    cpu->can_do_io = !use_icount;
     ret = tcg_qemu_tb_exec(env, tb_ptr);
     cpu->can_do_io = 1;
     last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index f7669b6841..4004e6c9d0 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -7,6 +7,24 @@
 
 static TCGOp *icount_start_insn;
 
+static inline void gen_io_start(void)
+{
+    TCGv_i32 tmp = tcg_const_i32(1);
+    tcg_gen_st_i32(tmp, cpu_env,
+                   offsetof(ArchCPU, parent_obj.can_do_io) -
+                   offsetof(ArchCPU, env));
+    tcg_temp_free_i32(tmp);
+}
+
+static inline void gen_io_end(void)
+{
+    TCGv_i32 tmp = tcg_const_i32(0);
+    tcg_gen_st_i32(tmp, cpu_env,
+                   offsetof(ArchCPU, parent_obj.can_do_io) -
+                   offsetof(ArchCPU, env));
+    tcg_temp_free_i32(tmp);
+}
+
 static inline void gen_tb_start(TranslationBlock *tb)
 {
     TCGv_i32 count, imm;
@@ -40,6 +58,8 @@ static inline void gen_tb_start(TranslationBlock *tb)
         tcg_gen_st16_i32(count, cpu_env,
                          offsetof(ArchCPU, neg.icount_decr.u16.low) -
                          offsetof(ArchCPU, env));
+        /* Disable I/O by default */
+        gen_io_end();
     }
 
     tcg_temp_free_i32(count);
@@ -57,22 +77,4 @@ static inline void gen_tb_end(TranslationBlock *tb, int num_insns)
     tcg_gen_exit_tb(tb, TB_EXIT_REQUESTED);
 }
 
-static inline void gen_io_start(void)
-{
-    TCGv_i32 tmp = tcg_const_i32(1);
-    tcg_gen_st_i32(tmp, cpu_env,
-                   offsetof(ArchCPU, parent_obj.can_do_io) -
-                   offsetof(ArchCPU, env));
-    tcg_temp_free_i32(tmp);
-}
-
-static inline void gen_io_end(void)
-{
-    TCGv_i32 tmp = tcg_const_i32(0);
-    tcg_gen_st_i32(tmp, cpu_env,
-                   offsetof(ArchCPU, parent_obj.can_do_io) -
-                   offsetof(ArchCPU, env));
-    tcg_temp_free_i32(tmp);
-}
-
 #endif

From 9e9b10c6491153b60ccfd021328f1f88e1669550 Mon Sep 17 00:00:00 2001
From: Pavel Dovgalyuk <pavel.dovgaluk@gmail.com>
Date: Thu, 25 Jul 2019 11:44:55 +0300
Subject: [PATCH 26/33] icount: remove unnecessary gen_io_end calls

Prior patch resets can_do_io flag at the TB entry. Therefore there is no
need in resetting this flag at the end of the block.
This patch removes redundant gen_io_end calls.

Signed-off-by: Pavel Dovgalyuk <Pavel.Dovgaluk@ispras.ru>
Message-Id: <156404429499.18669.13404064982854123855.stgit@pasha-Precision-3630-Tower>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Pavel Dovgalyuk <pavel.dovgaluk@gmail.com>
---
 accel/tcg/translator.c                  |  1 -
 include/exec/gen-icount.h               |  8 +++++++-
 target/alpha/translate.c                |  2 --
 target/arm/translate-a64.c              |  4 ----
 target/arm/translate.c                  |  7 -------
 target/cris/translate.c                 |  2 --
 target/hppa/translate.c                 |  1 -
 target/i386/translate.c                 | 10 ----------
 target/lm32/translate.c                 |  9 ---------
 target/microblaze/translate.c           |  2 --
 target/mips/translate.c                 | 11 -----------
 target/nios2/translate.c                |  4 ----
 target/ppc/translate.c                  | 13 -------------
 target/ppc/translate_init.inc.c         |  2 --
 target/riscv/insn_trans/trans_rvi.inc.c |  1 -
 target/sparc/translate.c                | 16 ----------------
 target/unicore32/translate.c            |  1 -
 target/xtensa/translate.c               | 15 ---------------
 18 files changed, 7 insertions(+), 102 deletions(-)

diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index 9226a348a3..70c66c538c 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -90,7 +90,6 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
             /* Accept I/O on the last instruction.  */
             gen_io_start();
             ops->translate_insn(db, cpu);
-            gen_io_end();
         } else {
             ops->translate_insn(db, cpu);
         }
diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index 4004e6c9d0..822c43cfd3 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -16,6 +16,13 @@ static inline void gen_io_start(void)
     tcg_temp_free_i32(tmp);
 }
 
+/*
+ * cpu->can_do_io is cleared automatically at the beginning of
+ * each translation block.  The cost is minimal and only paid
+ * for -icount, plus it would be very easy to forget doing it
+ * in the translator.  Therefore, backends only need to call
+ * gen_io_start.
+ */
 static inline void gen_io_end(void)
 {
     TCGv_i32 tmp = tcg_const_i32(0);
@@ -58,7 +65,6 @@ static inline void gen_tb_start(TranslationBlock *tb)
         tcg_gen_st16_i32(count, cpu_env,
                          offsetof(ArchCPU, neg.icount_decr.u16.low) -
                          offsetof(ArchCPU, env));
-        /* Disable I/O by default */
         gen_io_end();
     }
 
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index 2c9cccf6c1..1e29653aac 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -1332,7 +1332,6 @@ static DisasJumpType gen_mfpr(DisasContext *ctx, TCGv va, int regno)
         if (use_icount) {
             gen_io_start();
             helper(va);
-            gen_io_end();
             return DISAS_PC_STALE;
         } else {
             helper(va);
@@ -2398,7 +2397,6 @@ static DisasJumpType translate_one(DisasContext *ctx, uint32_t insn)
             if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
                 gen_io_start();
                 gen_helper_load_pcc(va, cpu_env);
-                gen_io_end();
                 ret = DISAS_PC_STALE;
             } else {
                 gen_helper_load_pcc(va, cpu_env);
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index fc3e5f5c38..6fd0b779d3 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -1775,7 +1775,6 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread,
 
     if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
         /* I/O operations must end the TB here (whether read or write) */
-        gen_io_end();
         s->base.is_jmp = DISAS_UPDATE;
     } else if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
         /* We default to ending the TB on a coprocessor register write,
@@ -2084,9 +2083,6 @@ static void disas_uncond_b_reg(DisasContext *s, uint32_t insn)
 
         gen_helper_exception_return(cpu_env, dst);
         tcg_temp_free_i64(dst);
-        if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_end();
-        }
         /* Must exit loop to check un-masked IRQs */
         s->base.is_jmp = DISAS_EXIT;
         return;
diff --git a/target/arm/translate.c b/target/arm/translate.c
index d948757131..cbe19b7a62 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -3213,9 +3213,6 @@ static void gen_rfe(DisasContext *s, TCGv_i32 pc, TCGv_i32 cpsr)
         gen_io_start();
     }
     gen_helper_cpsr_write_eret(cpu_env, cpsr);
-    if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-        gen_io_end();
-    }
     tcg_temp_free_i32(cpsr);
     /* Must exit loop to check un-masked IRQs */
     s->base.is_jmp = DISAS_EXIT;
@@ -7303,7 +7300,6 @@ static int disas_coproc_insn(DisasContext *s, uint32_t insn)
 
         if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) {
             /* I/O operations must end the TB here (whether read or write) */
-            gen_io_end();
             gen_lookup_tb(s);
         } else if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) {
             /* We default to ending the TB on a coprocessor register write,
@@ -9163,9 +9159,6 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
                         gen_io_start();
                     }
                     gen_helper_cpsr_write_eret(cpu_env, tmp);
-                    if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-                        gen_io_end();
-                    }
                     tcg_temp_free_i32(tmp);
                     /* Must exit loop to check un-masked IRQs */
                     s->base.is_jmp = DISAS_EXIT;
diff --git a/target/cris/translate.c b/target/cris/translate.c
index 3429a3b768..e752bd0609 100644
--- a/target/cris/translate.c
+++ b/target/cris/translate.c
@@ -3225,8 +3225,6 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns)
 
     npc = dc->pc;
 
-        if (tb_cflags(tb) & CF_LAST_IO)
-            gen_io_end();
     /* Force an update if the per-tb cpu state has changed.  */
     if (dc->is_jmp == DISAS_NEXT
         && (dc->cpustate_changed || !dc->flagx_known
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 188fe688cb..8c6189512c 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -2161,7 +2161,6 @@ static bool trans_mfctl(DisasContext *ctx, arg_mfctl *a)
         if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
             gen_io_start();
             gen_helper_read_interval_timer(tmp);
-            gen_io_end();
             ctx->base.is_jmp = DISAS_IAQ_N_STALE;
         } else {
             gen_helper_read_interval_timer(tmp);
diff --git a/target/i386/translate.c b/target/i386/translate.c
index 03150a86e2..5cd74ad639 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -5381,7 +5381,6 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu)
             gen_op_mov_reg_v(s, dflag, rm, s->T0);
             set_cc_op(s, CC_OP_EFLAGS);
             if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-                gen_io_end();
                 gen_jmp(s, s->pc - s->cs_base);
             }
             break;
@@ -6443,7 +6442,6 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu)
         gen_op_mov_reg_v(s, ot, R_EAX, s->T1);
         gen_bpt_io(s, s->tmp2_i32, ot);
         if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_end();
             gen_jmp(s, s->pc - s->cs_base);
         }
         break;
@@ -6464,7 +6462,6 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu)
         gen_helper_out_func(ot, s->tmp2_i32, s->tmp3_i32);
         gen_bpt_io(s, s->tmp2_i32, ot);
         if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_end();
             gen_jmp(s, s->pc - s->cs_base);
         }
         break;
@@ -6482,7 +6479,6 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu)
         gen_op_mov_reg_v(s, ot, R_EAX, s->T1);
         gen_bpt_io(s, s->tmp2_i32, ot);
         if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_end();
             gen_jmp(s, s->pc - s->cs_base);
         }
         break;
@@ -6502,7 +6498,6 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu)
         gen_helper_out_func(ot, s->tmp2_i32, s->tmp3_i32);
         gen_bpt_io(s, s->tmp2_i32, ot);
         if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_end();
             gen_jmp(s, s->pc - s->cs_base);
         }
         break;
@@ -7206,7 +7201,6 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu)
         }
         gen_helper_rdtsc(cpu_env);
         if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-            gen_io_end();
             gen_jmp(s, s->pc - s->cs_base);
         }
         break;
@@ -7666,7 +7660,6 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu)
             }
             gen_helper_rdtscp(cpu_env);
             if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-                gen_io_end();
                 gen_jmp(s, s->pc - s->cs_base);
             }
             break;
@@ -8036,9 +8029,6 @@ static target_ulong disas_insn(DisasContext *s, CPUState *cpu)
                     gen_op_mov_v_reg(s, ot, s->T0, rm);
                     gen_helper_write_crN(cpu_env, tcg_const_i32(reg),
                                          s->T0);
-                    if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
-                        gen_io_end();
-                    }
                     gen_jmp_im(s, s->pc - s->cs_base);
                     gen_eob(s);
                 } else {
diff --git a/target/lm32/translate.c b/target/lm32/translate.c
index b9f2f2c4a7..778cae1e81 100644
--- a/target/lm32/translate.c
+++ b/target/lm32/translate.c
@@ -885,9 +885,6 @@ static void dec_wcsr(DisasContext *dc)
         }
         gen_helper_wcsr_im(cpu_env, cpu_R[dc->r1]);
         tcg_gen_movi_tl(cpu_pc, dc->pc + 4);
-        if (tb_cflags(dc->tb) & CF_USE_ICOUNT) {
-            gen_io_end();
-        }
         dc->is_jmp = DISAS_UPDATE;
         break;
     case CSR_IP:
@@ -897,9 +894,6 @@ static void dec_wcsr(DisasContext *dc)
         }
         gen_helper_wcsr_ip(cpu_env, cpu_R[dc->r1]);
         tcg_gen_movi_tl(cpu_pc, dc->pc + 4);
-        if (tb_cflags(dc->tb) & CF_USE_ICOUNT) {
-            gen_io_end();
-        }
         dc->is_jmp = DISAS_UPDATE;
         break;
     case CSR_ICC:
@@ -1111,9 +1105,6 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns)
          && (dc->pc - page_start < TARGET_PAGE_SIZE)
          && num_insns < max_insns);
 
-    if (tb_cflags(tb) & CF_LAST_IO) {
-        gen_io_end();
-    }
 
     if (unlikely(cs->singlestep_enabled)) {
         if (dc->is_jmp == DISAS_NEXT) {
diff --git a/target/microblaze/translate.c b/target/microblaze/translate.c
index 9ce65f3bcf..95ff663292 100644
--- a/target/microblaze/translate.c
+++ b/target/microblaze/translate.c
@@ -1724,8 +1724,6 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns)
             npc = dc->jmp_pc;
     }
 
-    if (tb_cflags(tb) & CF_LAST_IO)
-        gen_io_end();
     /* Force an update if the per-tb cpu state has changed.  */
     if (dc->is_jmp == DISAS_NEXT
         && (dc->cpustate_changed || org_flags != dc->tb_flags)) {
diff --git a/target/mips/translate.c b/target/mips/translate.c
index 1c50e5a8c4..8ebde6ffee 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -7129,9 +7129,6 @@ static void gen_mfc0(DisasContext *ctx, TCGv arg, int reg, int sel)
                 gen_io_start();
             }
             gen_helper_mfc0_count(arg, cpu_env);
-            if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-                gen_io_end();
-            }
             /*
              * Break the TB to be able to take timer interrupts immediately
              * after reading count. DISAS_STOP isn't sufficient, we need to
@@ -8296,7 +8293,6 @@ static void gen_mtc0(DisasContext *ctx, TCGv arg, int reg, int sel)
 
     /* For simplicity assume that all writes can cause interrupts.  */
     if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_end();
         /*
          * DISAS_STOP isn't sufficient, we need to ensure we break out of
          * translated code to check for pending interrupts.
@@ -8607,9 +8603,6 @@ static void gen_dmfc0(DisasContext *ctx, TCGv arg, int reg, int sel)
                 gen_io_start();
             }
             gen_helper_mfc0_count(arg, cpu_env);
-            if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-                gen_io_end();
-            }
             /*
              * Break the TB to be able to take timer interrupts immediately
              * after reading count. DISAS_STOP isn't sufficient, we need to
@@ -9748,7 +9741,6 @@ static void gen_dmtc0(DisasContext *ctx, TCGv arg, int reg, int sel)
 
     /* For simplicity assume that all writes can cause interrupts.  */
     if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_end();
         /*
          * DISAS_STOP isn't sufficient, we need to ensure we break out of
          * translated code to check for pending interrupts.
@@ -12817,9 +12809,6 @@ static void gen_rdhwr(DisasContext *ctx, int rt, int rd, int sel)
             gen_io_start();
         }
         gen_helper_rdhwr_cc(t0, cpu_env);
-        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-            gen_io_end();
-        }
         gen_store_gpr(t0, rt);
         /*
          * Break the TB to be able to take timer interrupts immediately
diff --git a/target/nios2/translate.c b/target/nios2/translate.c
index 17d8f1877c..e17656e66f 100644
--- a/target/nios2/translate.c
+++ b/target/nios2/translate.c
@@ -862,10 +862,6 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns)
              !tcg_op_buf_full() &&
              num_insns < max_insns);
 
-    if (tb_cflags(tb) & CF_LAST_IO) {
-        gen_io_end();
-    }
-
     /* Indicate where the next block should start */
     switch (dc->is_jmp) {
     case DISAS_NEXT:
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 9f9553afb4..2a9d13f7e5 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -1861,7 +1861,6 @@ static void gen_darn(DisasContext *ctx)
             gen_helper_darn64(cpu_gpr[rD(ctx->opcode)]);
         }
         if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-            gen_io_end();
             gen_stop_exception(ctx);
         }
     }
@@ -3991,9 +3990,6 @@ static void gen_rfi(DisasContext *ctx)
     gen_update_cfar(ctx, ctx->base.pc_next - 4);
     gen_helper_rfi(cpu_env);
     gen_sync_exception(ctx);
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_end();
-    }
 #endif
 }
 
@@ -4011,9 +4007,6 @@ static void gen_rfid(DisasContext *ctx)
     gen_update_cfar(ctx, ctx->base.pc_next - 4);
     gen_helper_rfid(cpu_env);
     gen_sync_exception(ctx);
-    if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_end();
-    }
 #endif
 }
 
@@ -4389,9 +4382,6 @@ static void gen_mtmsrd(DisasContext *ctx)
         /* Must stop the translation as machine state (may have) changed */
         /* Note that mtmsr is not always defined as context-synchronizing */
         gen_stop_exception(ctx);
-        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-            gen_io_end();
-        }
     }
 #endif /* !defined(CONFIG_USER_ONLY) */
 }
@@ -4429,9 +4419,6 @@ static void gen_mtmsr(DisasContext *ctx)
         tcg_gen_mov_tl(msr, cpu_gpr[rS(ctx->opcode)]);
 #endif
         gen_helper_store_msr(cpu_env, msr);
-        if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-            gen_io_end();
-        }
         tcg_temp_free(msr);
         /* Must stop the translation as machine state (may have) changed */
         /* Note that mtmsr is not always defined as context-synchronizing */
diff --git a/target/ppc/translate_init.inc.c b/target/ppc/translate_init.inc.c
index 86fc8f2e31..66d9a732e0 100644
--- a/target/ppc/translate_init.inc.c
+++ b/target/ppc/translate_init.inc.c
@@ -189,7 +189,6 @@ static void spr_read_decr(DisasContext *ctx, int gprn, int sprn)
     }
     gen_helper_load_decr(cpu_gpr[gprn], cpu_env);
     if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_end();
         gen_stop_exception(ctx);
     }
 }
@@ -201,7 +200,6 @@ static void spr_write_decr(DisasContext *ctx, int sprn, int gprn)
     }
     gen_helper_store_decr(cpu_env, cpu_gpr[gprn]);
     if (tb_cflags(ctx->base.tb) & CF_USE_ICOUNT) {
-        gen_io_end();
         gen_stop_exception(ctx);
     }
 }
diff --git a/target/riscv/insn_trans/trans_rvi.inc.c b/target/riscv/insn_trans/trans_rvi.inc.c
index ea6473111c..1af795e05d 100644
--- a/target/riscv/insn_trans/trans_rvi.inc.c
+++ b/target/riscv/insn_trans/trans_rvi.inc.c
@@ -511,7 +511,6 @@ static bool trans_fence_i(DisasContext *ctx, arg_fence_i *a)
 } while (0)
 
 #define RISCV_OP_CSR_POST do {\
-    gen_io_end(); \
     gen_set_gpr(a->rd, dest); \
     tcg_gen_movi_tl(cpu_pc, ctx->pc_succ_insn); \
     exit_tb(ctx); \
diff --git a/target/sparc/translate.c b/target/sparc/translate.c
index 091bab53af..02c16128c8 100644
--- a/target/sparc/translate.c
+++ b/target/sparc/translate.c
@@ -4412,10 +4412,6 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     gen_helper_tick_set_limit(r_tickptr,
                                                               cpu_tick_cmpr);
                                     tcg_temp_free_ptr(r_tickptr);
-                                    if (tb_cflags(dc->base.tb) &
-                                           CF_USE_ICOUNT) {
-                                        gen_io_end();
-                                    }
                                     /* End TB to handle timer interrupt */
                                     dc->base.is_jmp = DISAS_EXIT;
                                 }
@@ -4440,10 +4436,6 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     gen_helper_tick_set_count(r_tickptr,
                                                               cpu_tmp0);
                                     tcg_temp_free_ptr(r_tickptr);
-                                    if (tb_cflags(dc->base.tb) &
-                                           CF_USE_ICOUNT) {
-                                        gen_io_end();
-                                    }
                                     /* End TB to handle timer interrupt */
                                     dc->base.is_jmp = DISAS_EXIT;
                                 }
@@ -4468,10 +4460,6 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     gen_helper_tick_set_limit(r_tickptr,
                                                               cpu_stick_cmpr);
                                     tcg_temp_free_ptr(r_tickptr);
-                                    if (tb_cflags(dc->base.tb) &
-                                           CF_USE_ICOUNT) {
-                                        gen_io_end();
-                                    }
                                     /* End TB to handle timer interrupt */
                                     dc->base.is_jmp = DISAS_EXIT;
                                 }
@@ -4588,10 +4576,6 @@ static void disas_sparc_insn(DisasContext * dc, unsigned int insn)
                                     gen_helper_tick_set_count(r_tickptr,
                                                               cpu_tmp0);
                                     tcg_temp_free_ptr(r_tickptr);
-                                    if (tb_cflags(dc->base.tb) &
-                                           CF_USE_ICOUNT) {
-                                        gen_io_end();
-                                    }
                                     /* End TB to handle timer interrupt */
                                     dc->base.is_jmp = DISAS_EXIT;
                                 }
diff --git a/target/unicore32/translate.c b/target/unicore32/translate.c
index d27451eed3..0e01f35856 100644
--- a/target/unicore32/translate.c
+++ b/target/unicore32/translate.c
@@ -1931,7 +1931,6 @@ void gen_intermediate_code(CPUState *cs, TranslationBlock *tb, int max_insns)
                code.  */
             cpu_abort(cs, "IO on conditional branch instruction");
         }
-        gen_io_end();
     }
 
     /* At this stage dc->condjmp will only be set when the skipped
diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index fa12a576b2..d20e60ce77 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -539,9 +539,6 @@ static void gen_waiti(DisasContext *dc, uint32_t imm4)
         gen_io_start();
     }
     gen_helper_waiti(cpu_env, pc, intlevel);
-    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-        gen_io_end();
-    }
     tcg_temp_free(pc);
     tcg_temp_free(intlevel);
 }
@@ -2215,9 +2212,6 @@ static void translate_rsr_ccount(DisasContext *dc, const OpcodeArg arg[],
     }
     gen_helper_update_ccount(cpu_env);
     tcg_gen_mov_i32(arg[0].out, cpu_SR[par[0]]);
-    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-        gen_io_end();
-    }
 #endif
 }
 
@@ -2607,9 +2601,6 @@ static void translate_wsr_ccompare(DisasContext *dc, const OpcodeArg arg[],
     tcg_gen_mov_i32(cpu_SR[par[0]], arg[0].in);
     gen_helper_update_ccompare(cpu_env, tmp);
     tcg_temp_free(tmp);
-    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-        gen_io_end();
-    }
 #endif
 }
 
@@ -2621,9 +2612,6 @@ static void translate_wsr_ccount(DisasContext *dc, const OpcodeArg arg[],
         gen_io_start();
     }
     gen_helper_wsr_ccount(cpu_env, arg[0].in);
-    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-        gen_io_end();
-    }
 #endif
 }
 
@@ -2830,9 +2818,6 @@ static void translate_xsr_ccount(DisasContext *dc, const OpcodeArg arg[],
     tcg_gen_mov_i32(arg[0].out, tmp);
     tcg_temp_free(tmp);
 
-    if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) {
-        gen_io_end();
-    }
 #endif
 }
 

From e533f45d7dd85d6514de3f7a433f7dc4313e8f62 Mon Sep 17 00:00:00 2001
From: Roman Kagan <rkagan@virtuozzo.com>
Date: Thu, 23 May 2019 10:54:48 +0000
Subject: [PATCH 27/33] cpus-common: nuke finish_safe_work

It was introduced in commit ab129972c8b41e15b0521895a46fd9c752b68a5e,
with the following motivation:

  Because start_exclusive uses CPU_FOREACH, merge exclusive_lock with
  qemu_cpu_list_lock: together with a call to exclusive_idle (via
  cpu_exec_start/end) in cpu_list_add, this protects exclusive work
  against concurrent CPU addition and removal.

However, it seems to be redundant, because the cpu-exclusive
infrastructure provides suffificent protection against the newly added
CPU starting execution while the cpu-exclusive work is running, and the
aforementioned traversing of the cpu list is protected by
qemu_cpu_list_lock.

Besides, this appears to be the only place where the cpu-exclusive
section is entered with the BQL taken, which has been found to trigger
AB-BA deadlock as follows:

    vCPU thread                             main thread
    -----------                             -----------
async_safe_run_on_cpu(self,
                      async_synic_update)
...                                         [cpu hot-add]
process_queued_cpu_work()
  qemu_mutex_unlock_iothread()
                                            [grab BQL]
  start_exclusive()                         cpu_list_add()
  async_synic_update()                        finish_safe_work()
    qemu_mutex_lock_iothread()                  cpu_exec_start()

So remove it.  This paves the way to establishing a strict nesting rule
of never entering the exclusive section with the BQL taken.

Signed-off-by: Roman Kagan <rkagan@virtuozzo.com>
Message-Id: <20190523105440.27045-2-rkagan@virtuozzo.com>
---
 cpus-common.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/cpus-common.c b/cpus-common.c
index 3ca58c64e8..023cfebfa3 100644
--- a/cpus-common.c
+++ b/cpus-common.c
@@ -69,12 +69,6 @@ static int cpu_get_free_index(void)
     return cpu_index;
 }
 
-static void finish_safe_work(CPUState *cpu)
-{
-    cpu_exec_start(cpu);
-    cpu_exec_end(cpu);
-}
-
 void cpu_list_add(CPUState *cpu)
 {
     qemu_mutex_lock(&qemu_cpu_list_lock);
@@ -86,8 +80,6 @@ void cpu_list_add(CPUState *cpu)
     }
     QTAILQ_INSERT_TAIL_RCU(&cpus, cpu, node);
     qemu_mutex_unlock(&qemu_cpu_list_lock);
-
-    finish_safe_work(cpu);
 }
 
 void cpu_list_remove(CPUState *cpu)

From fc7d2b451ea5d5fe1e787ed7e80c7082fe1cb83d Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 13 Aug 2019 08:29:33 +0200
Subject: [PATCH 28/33] kvm: vmxcap: Enhance with latest features

Based on SDM from May 2019.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 scripts/kvm/vmxcap | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap
index 99a8146aaa..d8c7d6dfb8 100755
--- a/scripts/kvm/vmxcap
+++ b/scripts/kvm/vmxcap
@@ -178,7 +178,11 @@ controls = [
             19: 'Conceal non-root operation from PT',
             20: 'Enable XSAVES/XRSTORS',
             22: 'Mode-based execute control (XS/XU)',
+            23: 'Sub-page write permissions',
+            24: 'GPA translation for PT',
             25: 'TSC scaling',
+            26: 'User wait and pause',
+            28: 'ENCLV exiting',
             },
         cap_msr = MSR_IA32_VMX_PROCBASED_CTLS2,
         ),
@@ -197,6 +201,7 @@ controls = [
             22: 'Save VMX-preemption timer value',
             23: 'Clear IA32_BNDCFGS',
             24: 'Conceal VM exits from PT',
+            25: 'Clear IA32_RTIT_CTL',
             },
         cap_msr = MSR_IA32_VMX_EXIT_CTLS,
         true_cap_msr = MSR_IA32_VMX_TRUE_EXIT_CTLS,
@@ -214,6 +219,7 @@ controls = [
             15: 'Load IA32_EFER',
             16: 'Load IA32_BNDCFGS',
             17: 'Conceal VM entries from PT',
+            18: 'Load IA32_RTIT_CTL',
             },
         cap_msr = MSR_IA32_VMX_ENTRY_CTLS,
         true_cap_msr = MSR_IA32_VMX_TRUE_ENTRY_CTLS,
@@ -227,6 +233,7 @@ controls = [
             6: 'HLT activity state',
             7: 'Shutdown activity state',
             8: 'Wait-for-SIPI activity state',
+            14: 'PT in VMX operation',
             15: 'IA32_SMBASE support',
             (16,24): 'Number of CR3-target values',
             (25,27): 'MSR-load/store count recommendation',
@@ -249,6 +256,7 @@ controls = [
             17: '1GB EPT pages',
             20: 'INVEPT supported',
             21: 'EPT accessed and dirty flags',
+            22: 'Advanced VM-exit information for EPT violations',
             25: 'Single-context INVEPT',
             26: 'All-context INVEPT',
             32: 'INVVPID supported',

From b87c8cdb3e838f282c878a8ddb4c598e0d0ec559 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 12 Aug 2019 20:46:30 -0300
Subject: [PATCH 29/33] HACKING: Document 'struct' keyword usage

Sometimes we use the 'struct' keyword in headers to help us
reduce dependencies between header files.  Document that
practice.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 HACKING | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/HACKING b/HACKING
index 0fc3e0fc04..097d482603 100644
--- a/HACKING
+++ b/HACKING
@@ -100,7 +100,19 @@ pointer, you're guaranteed that it is used to modify the storage
 it points to, or it is aliased to another pointer that is.
 
 2.3. Typedefs
-Typedefs are used to eliminate the redundant 'struct' keyword.
+
+Typedefs are used to eliminate the redundant 'struct' keyword, since type
+names have a different style than other identifiers ("CamelCase" versus
+"snake_case").  Each named struct type should have a CamelCase name and a
+corresponding typedef.
+
+Since certain C compilers choke on duplicated typedefs, you should avoid
+them and declare a typedef only in one header file.  For common types,
+you can use "include/qemu/typedefs.h" for example.  However, as a matter
+of convenience it is also perfectly fine to use forward struct
+definitions instead of typedefs in headers and function prototypes; this
+avoids problems with duplicated typedefs and reduces the need to include
+headers from other headers.
 
 2.4. Reserved namespaces in C and POSIX
 Underscore capital, double underscore, and underscore 't' suffixes should be

From 355477f8c73e9c6b60704c57472c71393ff39bca Mon Sep 17 00:00:00 2001
From: Catherine Ho <catherine.hecx@gmail.com>
Date: Mon, 8 Apr 2019 04:42:13 -0400
Subject: [PATCH 30/33] migration: do not rom_reset() during incoming migration

Commit 18269069c310 ("migration: Introduce ignore-shared capability")
addes ignore-shared capability to bypass the shared ramblock (e,g,
membackend + numa node). It does good to live migration.

As told by Yury,this commit expectes that QEMU doesn't write to guest RAM
until VM starts, but it does on aarch64 qemu:
Backtrace:
1  0x000055f4a296dd84 in address_space_write_rom_internal () at
exec.c:3458
2  0x000055f4a296de3a in address_space_write_rom () at exec.c:3479
3  0x000055f4a2d519ff in rom_reset () at hw/core/loader.c:1101
4  0x000055f4a2d475ec in qemu_devices_reset () at hw/core/reset.c:69
5  0x000055f4a2c90a28 in qemu_system_reset () at vl.c:1675
6  0x000055f4a2c9851d in main () at vl.c:4552

Actually, on arm64 virt marchine, ramblock "dtb" will be filled into ram
druing rom_reset. In ignore-shared incoming case, this rom filling
is not required since all the data has been stored in memory backend
file.

Further more, as suggested by Peter Xu, if we do rom_reset() now with
these ROMs then the RAM data should be re-filled again too with the
migration stream coming in.

Fixes: commit 18269069c310 ("migration: Introduce ignore-shared
capability")
Suggested-by: Yury Kotov <yury-kotov@yandex-team.ru>
Suggested-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Catherine Ho <catherine.hecx@gmail.com>
Acked-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/core/loader.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/hw/core/loader.c b/hw/core/loader.c
index de00f56f34..32f7cc7c33 100644
--- a/hw/core/loader.c
+++ b/hw/core/loader.c
@@ -58,6 +58,7 @@
 #include "exec/address-spaces.h"
 #include "hw/boards.h"
 #include "qemu/cutils.h"
+#include "sysemu/runstate.h"
 
 #include <zlib.h>
 
@@ -1114,6 +1115,15 @@ static void rom_reset(void *unused)
 {
     Rom *rom;
 
+    /*
+     * We don't need to fill in the RAM with ROM data because we'll fill
+     * the data in during the next incoming migration in all cases.  Note
+     * that some of those RAMs can actually be modified by the guest on ARM
+     * so this is probably the only right thing to do here.
+     */
+    if (runstate_check(RUN_STATE_INMIGRATE))
+        return;
+
     QTAILQ_FOREACH(rom, &roms, next) {
         if (rom->fw_file) {
             continue;

From a060297822ea6b4194bf36654e58c802448a3eea Mon Sep 17 00:00:00 2001
From: Wei Yang <richardw.yang@linux.intel.com>
Date: Wed, 14 Aug 2019 08:27:23 +0800
Subject: [PATCH 31/33] test-bitmap: test set 1 bit case for bitmap_set

All current bitmap_set test cases set range across word, while the
handle of a range within one word is different from that.

Add case to set 1 bit as a represent for set range within one word.

Signed-off-by: Wei Yang <richardw.yang@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tests/test-bitmap.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/test-bitmap.c b/tests/test-bitmap.c
index 18aa584591..087e02a26c 100644
--- a/tests/test-bitmap.c
+++ b/tests/test-bitmap.c
@@ -67,6 +67,18 @@ static void bitmap_set_case(bmap_set_func set_func)
 
     bmap = bitmap_new(BMAP_SIZE);
 
+    /* Set one bit at offset in second word */
+    for (offset = 0; offset <= BITS_PER_LONG; offset++) {
+        bitmap_clear(bmap, 0, BMAP_SIZE);
+        set_func(bmap, BITS_PER_LONG + offset, 1);
+        g_assert_cmpint(find_first_bit(bmap, 2 * BITS_PER_LONG),
+                        ==, BITS_PER_LONG + offset);
+        g_assert_cmpint(find_next_zero_bit(bmap,
+                                           3 * BITS_PER_LONG,
+                                           BITS_PER_LONG + offset),
+                        ==, BITS_PER_LONG + offset + 1);
+    }
+
     /* Both Aligned, set bits [BITS_PER_LONG, 3*BITS_PER_LONG] */
     set_func(bmap, BITS_PER_LONG, 2 * BITS_PER_LONG);
     g_assert_cmpuint(bmap[1], ==, -1ul);

From de594e47659029316bbf9391efb79da0a1a08e08 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 14 Aug 2019 17:35:21 +0530
Subject: [PATCH 32/33] scsi: lsi: exit infinite loop while executing script
 (CVE-2019-12068)

When executing script in lsi_execute_script(), the LSI scsi adapter
emulator advances 's->dsp' index to read next opcode. This can lead
to an infinite loop if the next opcode is empty. Move the existing
loop exit after 10k iterations so that it covers no-op opcodes as
well.

Reported-by: Bugs SysSec <bugs-syssec@rub.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi/lsi53c895a.c | 41 +++++++++++++++++++++++++++--------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
index 222a286d44..ec53b14f7f 100644
--- a/hw/scsi/lsi53c895a.c
+++ b/hw/scsi/lsi53c895a.c
@@ -186,6 +186,9 @@ static const char *names[] = {
 /* Flag set if this is a tagged command.  */
 #define LSI_TAG_VALID     (1 << 16)
 
+/* Maximum instructions to process. */
+#define LSI_MAX_INSN    10000
+
 typedef struct lsi_request {
     SCSIRequest *req;
     uint32_t tag;
@@ -1133,7 +1136,21 @@ static void lsi_execute_script(LSIState *s)
 
     s->istat1 |= LSI_ISTAT1_SRUN;
 again:
-    insn_processed++;
+    if (++insn_processed > LSI_MAX_INSN) {
+        /* Some windows drivers make the device spin waiting for a memory
+           location to change.  If we have been executed a lot of code then
+           assume this is the case and force an unexpected device disconnect.
+           This is apparently sufficient to beat the drivers into submission.
+         */
+        if (!(s->sien0 & LSI_SIST0_UDC)) {
+            qemu_log_mask(LOG_GUEST_ERROR,
+                          "lsi_scsi: inf. loop with UDC masked");
+        }
+        lsi_script_scsi_interrupt(s, LSI_SIST0_UDC, 0);
+        lsi_disconnect(s);
+        trace_lsi_execute_script_stop();
+        return;
+    }
     insn = read_dword(s, s->dsp);
     if (!insn) {
         /* If we receive an empty opcode increment the DSP by 4 bytes
@@ -1570,19 +1587,7 @@ again:
             }
         }
     }
-    if (insn_processed > 10000 && s->waiting == LSI_NOWAIT) {
-        /* Some windows drivers make the device spin waiting for a memory
-           location to change.  If we have been executed a lot of code then
-           assume this is the case and force an unexpected device disconnect.
-           This is apparently sufficient to beat the drivers into submission.
-         */
-        if (!(s->sien0 & LSI_SIST0_UDC)) {
-            qemu_log_mask(LOG_GUEST_ERROR,
-                          "lsi_scsi: inf. loop with UDC masked");
-        }
-        lsi_script_scsi_interrupt(s, LSI_SIST0_UDC, 0);
-        lsi_disconnect(s);
-    } else if (s->istat1 & LSI_ISTAT1_SRUN && s->waiting == LSI_NOWAIT) {
+    if (s->istat1 & LSI_ISTAT1_SRUN && s->waiting == LSI_NOWAIT) {
         if (s->dcntl & LSI_DCNTL_SSM) {
             lsi_script_dma_interrupt(s, LSI_DSTAT_SSI);
         } else {
@@ -1970,6 +1975,10 @@ static void lsi_reg_writeb(LSIState *s, int offset, uint8_t val)
     case 0x2f: /* DSP[24:31] */
         s->dsp &= 0x00ffffff;
         s->dsp |= val << 24;
+        /*
+         * FIXME: if s->waiting != LSI_NOWAIT, this will only execute one
+         * instruction.  Is this correct?
+         */
         if ((s->dmode & LSI_DMODE_MAN) == 0
             && (s->istat1 & LSI_ISTAT1_SRUN) == 0)
             lsi_execute_script(s);
@@ -1988,6 +1997,10 @@ static void lsi_reg_writeb(LSIState *s, int offset, uint8_t val)
         break;
     case 0x3b: /* DCNTL */
         s->dcntl = val & ~(LSI_DCNTL_PFF | LSI_DCNTL_STD);
+        /*
+         * FIXME: if s->waiting != LSI_NOWAIT, this will only execute one
+         * instruction.  Is this correct?
+         */
         if ((val & LSI_DCNTL_STD) && (s->istat1 & LSI_ISTAT1_SRUN) == 0)
             lsi_execute_script(s);
         break;

From 80db491da4ce8b199e0e8d1e23943b20aab82f69 Mon Sep 17 00:00:00 2001
From: Jing Liu <jing2.liu@linux.intel.com>
Date: Thu, 25 Jul 2019 14:14:16 +0800
Subject: [PATCH 33/33] x86: Intel AVX512_BF16 feature enabling

Intel CooperLake cpu adds AVX512_BF16 instruction, defining as
CPUID.(EAX=7,ECX=1):EAX[bit 05].

The patch adds a property for setting the subleaf of CPUID leaf 7 in
case that people would like to specify it.

The release spec link as follows,
https://software.intel.com/sites/default/files/managed/c5/15/\
architecture-instruction-set-extensions-programming-reference.pdf

Signed-off-by: Jing Liu <jing2.liu@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.c | 39 ++++++++++++++++++++++++++++++++++++++-
 target/i386/cpu.h |  7 +++++++
 target/i386/kvm.c |  3 ++-
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 83872dda4d..9e0bac31e8 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -770,6 +770,7 @@ static void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
           /* CPUID_7_0_ECX_OSPKE is dynamic */ \
           CPUID_7_0_ECX_LA57)
 #define TCG_7_0_EDX_FEATURES 0
+#define TCG_7_1_EAX_FEATURES 0
 #define TCG_APM_FEATURES 0
 #define TCG_6_EAX_FEATURES CPUID_6_EAX_ARAT
 #define TCG_XSAVE_FEATURES (CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XGETBV1)
@@ -1095,6 +1096,25 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
         },
         .tcg_features = TCG_7_0_EDX_FEATURES,
     },
+    [FEAT_7_1_EAX] = {
+        .type = CPUID_FEATURE_WORD,
+        .feat_names = {
+            NULL, NULL, NULL, NULL,
+            NULL, "avx512-bf16", NULL, NULL,
+            NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL,
+            NULL, NULL, NULL, NULL,
+        },
+        .cpuid = {
+            .eax = 7,
+            .needs_ecx = true, .ecx = 1,
+            .reg = R_EAX,
+        },
+        .tcg_features = TCG_7_1_EAX_FEATURES,
+    },
     [FEAT_8000_0007_EDX] = {
         .type = CPUID_FEATURE_WORD,
         .feat_names = {
@@ -4292,13 +4312,19 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
     case 7:
         /* Structured Extended Feature Flags Enumeration Leaf */
         if (count == 0) {
-            *eax = 0; /* Maximum ECX value for sub-leaves */
+            /* Maximum ECX value for sub-leaves */
+            *eax = env->cpuid_level_func7;
             *ebx = env->features[FEAT_7_0_EBX]; /* Feature flags */
             *ecx = env->features[FEAT_7_0_ECX]; /* Feature flags */
             if ((*ecx & CPUID_7_0_ECX_PKU) && env->cr[4] & CR4_PKE_MASK) {
                 *ecx |= CPUID_7_0_ECX_OSPKE;
             }
             *edx = env->features[FEAT_7_0_EDX]; /* Feature flags */
+        } else if (count == 1) {
+            *eax = env->features[FEAT_7_1_EAX];
+            *ebx = 0;
+            *ecx = 0;
+            *edx = 0;
         } else {
             *eax = 0;
             *ebx = 0;
@@ -4948,6 +4974,11 @@ static void x86_cpu_adjust_feat_level(X86CPU *cpu, FeatureWord w)
         x86_cpu_adjust_level(cpu, &env->cpuid_min_xlevel2, eax);
     break;
     }
+
+    if (eax == 7) {
+        x86_cpu_adjust_level(cpu, &env->cpuid_min_level_func7,
+                             fi->cpuid.ecx);
+    }
 }
 
 /* Calculate XSAVE components based on the configured CPU feature flags */
@@ -5066,6 +5097,7 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error **errp)
         x86_cpu_adjust_feat_level(cpu, FEAT_1_ECX);
         x86_cpu_adjust_feat_level(cpu, FEAT_6_EAX);
         x86_cpu_adjust_feat_level(cpu, FEAT_7_0_ECX);
+        x86_cpu_adjust_feat_level(cpu, FEAT_7_1_EAX);
         x86_cpu_adjust_feat_level(cpu, FEAT_8000_0001_EDX);
         x86_cpu_adjust_feat_level(cpu, FEAT_8000_0001_ECX);
         x86_cpu_adjust_feat_level(cpu, FEAT_8000_0007_EDX);
@@ -5097,6 +5129,9 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error **errp)
     }
 
     /* Set cpuid_*level* based on cpuid_min_*level, if not explicitly set */
+    if (env->cpuid_level_func7 == UINT32_MAX) {
+        env->cpuid_level_func7 = env->cpuid_min_level_func7;
+    }
     if (env->cpuid_level == UINT32_MAX) {
         env->cpuid_level = env->cpuid_min_level;
     }
@@ -5870,6 +5905,8 @@ static Property x86_cpu_properties[] = {
     DEFINE_PROP_BOOL("host-phys-bits", X86CPU, host_phys_bits, false),
     DEFINE_PROP_UINT8("host-phys-bits-limit", X86CPU, host_phys_bits_limit, 0),
     DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, true),
+    DEFINE_PROP_UINT32("level-func7", X86CPU, env.cpuid_level_func7,
+                       UINT32_MAX),
     DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, UINT32_MAX),
     DEFINE_PROP_UINT32("xlevel", X86CPU, env.cpuid_xlevel, UINT32_MAX),
     DEFINE_PROP_UINT32("xlevel2", X86CPU, env.cpuid_xlevel2, UINT32_MAX),
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 6c95447005..5f6e3a029a 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -479,6 +479,7 @@ typedef enum FeatureWord {
     FEAT_7_0_EBX,       /* CPUID[EAX=7,ECX=0].EBX */
     FEAT_7_0_ECX,       /* CPUID[EAX=7,ECX=0].ECX */
     FEAT_7_0_EDX,       /* CPUID[EAX=7,ECX=0].EDX */
+    FEAT_7_1_EAX,       /* CPUID[EAX=7,ECX=1].EAX */
     FEAT_8000_0001_EDX, /* CPUID[8000_0001].EDX */
     FEAT_8000_0001_ECX, /* CPUID[8000_0001].ECX */
     FEAT_8000_0007_EDX, /* CPUID[8000_0007].EDX */
@@ -692,6 +693,8 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define CPUID_7_0_EDX_CORE_CAPABILITY   (1U << 30)  /*Core Capability*/
 #define CPUID_7_0_EDX_SPEC_CTRL_SSBD  (1U << 31) /* Speculative Store Bypass Disable */
 
+#define CPUID_7_1_EAX_AVX512_BF16 (1U << 5) /* AVX512 BFloat16 Instruction */
+
 #define CPUID_8000_0008_EBX_WBNOINVD  (1U << 9)  /* Write back and
                                                                              do not invalidate cache */
 #define CPUID_8000_0008_EBX_IBPB    (1U << 12) /* Indirect Branch Prediction Barrier */
@@ -1323,6 +1326,10 @@ typedef struct CPUX86State {
     /* Fields after this point are preserved across CPU reset. */
 
     /* processor features (e.g. for CPUID insn) */
+    /* Minimum cpuid leaf 7 value */
+    uint32_t cpuid_level_func7;
+    /* Actual cpuid leaf 7 value */
+    uint32_t cpuid_min_level_func7;
     /* Minimum level/xlevel/xlevel2, based on CPU model + features */
     uint32_t cpuid_min_level, cpuid_min_xlevel, cpuid_min_xlevel2;
     /* Maximum level/xlevel/xlevel2 value for auto-assignment: */
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 615dfce909..8023c679ea 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1501,6 +1501,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
                 c = &cpuid_data.entries[cpuid_i++];
             }
             break;
+        case 0x7:
         case 0x14: {
             uint32_t times;
 
@@ -1513,7 +1514,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
             for (j = 1; j <= times; ++j) {
                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                     fprintf(stderr, "cpuid_data is full, no space for "
-                                "cpuid(eax:0x14,ecx:0x%x)\n", j);
+                                "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
                     abort();
                 }
                 c = &cpuid_data.entries[cpuid_i++];