From ae06cb46b2d3d27ccad92ad962afda68ad1286a9 Mon Sep 17 00:00:00 2001
From: "Emilio G. Cota" <cota@braap.org>
Date: Fri, 16 Jun 2017 14:56:36 -0400
Subject: [PATCH 1/3] gen-icount: add missing inline to gen_tb_end

Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1497639397-19453-2-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 include/exec/gen-icount.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index 62d462e494..547c979629 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -44,7 +44,7 @@ static inline void gen_tb_start(TranslationBlock *tb)
     tcg_temp_free_i32(count);
 }
 
-static void gen_tb_end(TranslationBlock *tb, int num_insns)
+static inline void gen_tb_end(TranslationBlock *tb, int num_insns)
 {
     if (tb->cflags & CF_USE_ICOUNT) {
         /* Update the num_insn immediate parameter now that we know

From 53f6672bcf57d82b794a2cc3a3469be7d35c8653 Mon Sep 17 00:00:00 2001
From: "Emilio G. Cota" <cota@braap.org>
Date: Fri, 16 Jun 2017 14:56:37 -0400
Subject: [PATCH 2/3] gen-icount: use tcg_ctx.tcg_env instead of cpu_env

gen-icount.h relies on cpu_env being defined as a global, yet most
targets (i.e. all but arm/a64) define it as a local variable.
Luckily they all use the same "cpu_env" name, but compilation really
shouldn't break if that local variable were renamed.

Fix it by using tcg_ctx.tcg_env, which all targets set in their
translate_init function. This change also helps pave the way
for the upcoming "translation loop common to all targets" work.
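
For illustration, this is roughly the pattern targets follow in their
translate_init (a sketch using a hypothetical target/foo; the exact
declarations vary per target):

    /* target/foo/translate.c (hypothetical target, for illustration) */
    static TCGv_ptr cpu_env;    /* local to this file on most targets */

    void foo_translate_init(void)
    {
        cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
        tcg_ctx.tcg_env = cpu_env; /* the name common code can rely on */
    }

Common code such as gen-icount.h should therefore refer to
tcg_ctx.tcg_env rather than to the target-local cpu_env.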

Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1497639397-19453-3-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 include/exec/gen-icount.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h
index 547c979629..9b3cb14dfa 100644
--- a/include/exec/gen-icount.h
+++ b/include/exec/gen-icount.h
@@ -19,7 +19,7 @@ static inline void gen_tb_start(TranslationBlock *tb)
         count = tcg_temp_new_i32();
     }
 
-    tcg_gen_ld_i32(count, cpu_env,
+    tcg_gen_ld_i32(count, tcg_ctx.tcg_env,
                    -ENV_OFFSET + offsetof(CPUState, icount_decr.u32));
 
     if (tb->cflags & CF_USE_ICOUNT) {
@@ -37,7 +37,7 @@ static inline void gen_tb_start(TranslationBlock *tb)
     tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, exitreq_label);
 
     if (tb->cflags & CF_USE_ICOUNT) {
-        tcg_gen_st16_i32(count, cpu_env,
+        tcg_gen_st16_i32(count, tcg_ctx.tcg_env,
                          -ENV_OFFSET + offsetof(CPUState, icount_decr.u16.low));
     }
 
@@ -62,14 +62,16 @@ static inline void gen_tb_end(TranslationBlock *tb, int num_insns)
 static inline void gen_io_start(void)
 {
     TCGv_i32 tmp = tcg_const_i32(1);
-    tcg_gen_st_i32(tmp, cpu_env, -ENV_OFFSET + offsetof(CPUState, can_do_io));
+    tcg_gen_st_i32(tmp, tcg_ctx.tcg_env,
+                   -ENV_OFFSET + offsetof(CPUState, can_do_io));
     tcg_temp_free_i32(tmp);
 }
 
 static inline void gen_io_end(void)
 {
     TCGv_i32 tmp = tcg_const_i32(0);
-    tcg_gen_st_i32(tmp, cpu_env, -ENV_OFFSET + offsetof(CPUState, can_do_io));
+    tcg_gen_st_i32(tmp, tcg_ctx.tcg_env,
+                   -ENV_OFFSET + offsetof(CPUState, can_do_io));
     tcg_temp_free_i32(tmp);
 }
 

From f3ced3c59287dabc253f83f0c70aa4934470c15e Mon Sep 17 00:00:00 2001
From: "Emilio G. Cota" <cota@braap.org>
Date: Wed, 14 Jun 2017 20:36:13 -0400
Subject: [PATCH 3/3] tcg: consistently access cpu->tb_jmp_cache atomically

Some code paths can lead to atomic accesses racing with memset()
on cpu->tb_jmp_cache, which can result in torn reads/writes
and is undefined behaviour in C11.

These torn accesses are unlikely to show up as bugs, but from code
inspection they seem possible. For example, tb_phys_invalidate does:
    /* remove the TB from the hash list */
    h = tb_jmp_cache_hash_func(tb->pc);
    CPU_FOREACH(cpu) {
        if (atomic_read(&cpu->tb_jmp_cache[h]) == tb) {
            atomic_set(&cpu->tb_jmp_cache[h], NULL);
        }
    }
Here atomic_set might race with a concurrent memset (such as the
ones scheduled via "unsafe" async work, e.g. tlb_flush_page) and
therefore we might end up with a torn pointer (or who knows what,
because we are under undefined behaviour).
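
The memset side of the race is the plain clear done by the "unsafe"
flush paths, for instance in tlb_flush_nocheck:
    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));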

This patch converts parallel accesses to cpu->tb_jmp_cache to use
atomic primitives, thereby bringing these accesses back to defined
behaviour. The price to pay is potentially executing more instructions
when clearing cpu->tb_jmp_cache, but given how infrequently these
clears happen and the small size of the cache, the performance impact
I have measured is within noise range when booting debian-arm.

Note that under "safe async" work (e.g. do_tb_flush) we could use memset
because no other vcpus are running. However, I'm keeping these accesses
atomic as well to keep things simple and to avoid confusing analysis
tools such as ThreadSanitizer.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1497486973-25845-1-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 accel/tcg/cputlb.c        |  4 ++--
 accel/tcg/translate-all.c | 26 ++++++++++++--------------
 include/qom/cpu.h         | 11 ++++++++++-
 qom/cpu.c                 |  5 +----
 4 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 1900936038..85635ae8ad 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -118,7 +118,7 @@ static void tlb_flush_nocheck(CPUState *cpu)
 
     memset(env->tlb_table, -1, sizeof(env->tlb_table));
     memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
-    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
+    cpu_tb_jmp_cache_clear(cpu);
 
     env->vtlb_index = 0;
     env->tlb_flush_addr = -1;
@@ -183,7 +183,7 @@ static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
         }
     }
 
-    memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
+    cpu_tb_jmp_cache_clear(cpu);
 
     tlb_debug("done\n");
 
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index f6ad46b613..93fb9230ba 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -928,11 +928,7 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count)
     }
 
     CPU_FOREACH(cpu) {
-        int i;
-
-        for (i = 0; i < TB_JMP_CACHE_SIZE; ++i) {
-            atomic_set(&cpu->tb_jmp_cache[i], NULL);
-        }
+        cpu_tb_jmp_cache_clear(cpu);
     }
 
     tcg_ctx.tb_ctx.nb_tbs = 0;
@@ -1813,19 +1809,21 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
     cpu_loop_exit_noexc(cpu);
 }
 
+static void tb_jmp_cache_clear_page(CPUState *cpu, target_ulong page_addr)
+{
+    unsigned int i, i0 = tb_jmp_cache_hash_page(page_addr);
+
+    for (i = 0; i < TB_JMP_PAGE_SIZE; i++) {
+        atomic_set(&cpu->tb_jmp_cache[i0 + i], NULL);
+    }
+}
+
 void tb_flush_jmp_cache(CPUState *cpu, target_ulong addr)
 {
-    unsigned int i;
-
     /* Discard jump cache entries for any tb which might potentially
        overlap the flushed page.  */
-    i = tb_jmp_cache_hash_page(addr - TARGET_PAGE_SIZE);
-    memset(&cpu->tb_jmp_cache[i], 0,
-           TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *));
-
-    i = tb_jmp_cache_hash_page(addr);
-    memset(&cpu->tb_jmp_cache[i], 0,
-           TB_JMP_PAGE_SIZE * sizeof(TranslationBlock *));
+    tb_jmp_cache_clear_page(cpu, addr - TARGET_PAGE_SIZE);
+    tb_jmp_cache_clear_page(cpu, addr);
 }
 
 static void print_qht_statistics(FILE *f, fprintf_function cpu_fprintf,
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 89ddb686fb..2fe7cff9fe 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -346,7 +346,7 @@ struct CPUState {
 
     void *env_ptr; /* CPUArchState */
 
-    /* Writes protected by tb_lock, reads not thread-safe  */
+    /* Accessed in parallel; all accesses must be atomic */
     struct TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE];
 
     struct GDBRegisterState *gdb_regs;
@@ -422,6 +422,15 @@ extern struct CPUTailQ cpus;
 
 extern __thread CPUState *current_cpu;
 
+static inline void cpu_tb_jmp_cache_clear(CPUState *cpu)
+{
+    unsigned int i;
+
+    for (i = 0; i < TB_JMP_CACHE_SIZE; i++) {
+        atomic_set(&cpu->tb_jmp_cache[i], NULL);
+    }
+}
+
 /**
  * qemu_tcg_mttcg_enabled:
  * Check whether we are running MultiThread TCG or not.
diff --git a/qom/cpu.c b/qom/cpu.c
index 50698767dd..585419b65c 100644
--- a/qom/cpu.c
+++ b/qom/cpu.c
@@ -274,7 +274,6 @@ void cpu_reset(CPUState *cpu)
 static void cpu_common_reset(CPUState *cpu)
 {
     CPUClass *cc = CPU_GET_CLASS(cpu);
-    int i;
 
     if (qemu_loglevel_mask(CPU_LOG_RESET)) {
         qemu_log("CPU Reset (CPU %d)\n", cpu->cpu_index);
@@ -292,9 +291,7 @@ static void cpu_common_reset(CPUState *cpu)
     cpu->crash_occurred = false;
 
     if (tcg_enabled()) {
-        for (i = 0; i < TB_JMP_CACHE_SIZE; ++i) {
-            atomic_set(&cpu->tb_jmp_cache[i], NULL);
-        }
+        cpu_tb_jmp_cache_clear(cpu);
 
 #ifdef CONFIG_SOFTMMU
         tlb_flush(cpu, 0);