From 34664507c7f038842f20a2c787915680b1fabba2 Mon Sep 17 00:00:00 2001 From: Artyom Tarasenko Date: Tue, 23 Jun 2015 14:30:17 +0200 Subject: [PATCH 01/19] qemu-common: add VEC_OR macro Intel C Compiler version 15.0.3.187 Build 20150407 doesn't support the '|' operator for non-floating-point SIMD operands. Define a VEC_OR macro which uses _mm_or_si128, which is supported by both icc and gcc on the x86 platform. Signed-off-by: Artyom Tarasenko Message-Id: <54c804cdb3b3a93e93ef98f085dc57c4092580b7.1435062067.git.atar4qemu@gmail.com> Signed-off-by: Paolo Bonzini --- include/qemu-common.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/qemu-common.h b/include/qemu-common.h index d52d09cfb8..5be3cddfa9 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -455,6 +455,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size); #define VECTYPE __vector unsigned char #define SPLAT(p) vec_splat(vec_ld(0, p), 0) #define ALL_EQ(v1, v2) vec_all_eq(v1, v2) +#define VEC_OR(v1, v2) ((v1) | (v2)) /* altivec.h may redefine the bool macro as vector type. * Reset it to POSIX semantics. */ #define bool _Bool @@ -463,10 +464,12 @@ void qemu_hexdump(const char *buf, FILE *fp, const char *prefix, size_t size); #define VECTYPE __m128i #define SPLAT(p) _mm_set1_epi8(*(p)) #define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF) +#define VEC_OR(v1, v2) (_mm_or_si128(v1, v2)) #else #define VECTYPE unsigned long #define SPLAT(p) (*(p) * (~0UL / 255)) #define ALL_EQ(v1, v2) ((v1) == (v2)) +#define VEC_OR(v1, v2) ((v1) | (v2)) #endif #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 From 27e7755bea57c66097000f7612271ceefcbeb4a4 Mon Sep 17 00:00:00 2001 From: Artyom Tarasenko Date: Tue, 23 Jun 2015 14:30:18 +0200 Subject: [PATCH 02/19] cutils: allow compilation with icc Use the VEC_OR macro for '|' operations on VECTYPE operands. Signed-off-by: Artyom Tarasenko Message-Id: <3f62d7a3a265f7dd99e50d016a0333a99a4a082a.1435062067.git.atar4qemu@gmail.com> Signed-off-by: Paolo Bonzini --- util/cutils.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/util/cutils.c b/util/cutils.c index 144b25c05a..5d1c9ebe05 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -207,13 +207,13 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len) for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i < len / sizeof(VECTYPE); i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { - VECTYPE tmp0 = p[i + 0] | p[i + 1]; - VECTYPE tmp1 = p[i + 2] | p[i + 3]; - VECTYPE tmp2 = p[i + 4] | p[i + 5]; - VECTYPE tmp3 = p[i + 6] | p[i + 7]; - VECTYPE tmp01 = tmp0 | tmp1; - VECTYPE tmp23 = tmp2 | tmp3; - if (!ALL_EQ(tmp01 | tmp23, zero)) { + VECTYPE tmp0 = VEC_OR(p[i + 0], p[i + 1]); + VECTYPE tmp1 = VEC_OR(p[i + 2], p[i + 3]); + VECTYPE tmp2 = VEC_OR(p[i + 4], p[i + 5]); + VECTYPE tmp3 = VEC_OR(p[i + 6], p[i + 7]); + VECTYPE tmp01 = VEC_OR(tmp0, tmp1); + VECTYPE tmp23 = VEC_OR(tmp2, tmp3); + if (!ALL_EQ(VEC_OR(tmp01, tmp23), zero)) { break; } } From 94beb661bd90bcb477eed6d3b07aced988c40163 Mon Sep 17 00:00:00 2001 From: Peter Crosthwaite Date: Sun, 7 Jun 2015 14:59:09 -0700 Subject: [PATCH 03/19] memory_mapping: Rework cpu related includes This makes memory_mapping.c more consistent with all other core code files, which either just rely on qemu-common.h inclusion or precede cpu.h with qemu-common.h. cpu-all.h should not be included in addition to cpu.h. Remove it.
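For illustration, the include convention this patch settles on for core files looks like this (a sketch of the resulting memory_mapping.c preamble, not a complete listing):

    #include "qemu-common.h"            /* arch-agnostic common definitions */
    #include "cpu.h"                    /* per-target CPU state; pulls in
                                           exec/cpu-all.h itself */
    #include "sysemu/memory_mapping.h"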
Signed-off-by: Peter Crosthwaite Message-Id: <1433714349-7262-1-git-send-email-crosthwaite.peter@gmail.com> Signed-off-by: Paolo Bonzini --- memory_mapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/memory_mapping.c b/memory_mapping.c index 7b69801cb8..36d6b26046 100644 --- a/memory_mapping.c +++ b/memory_mapping.c @@ -13,8 +13,8 @@ #include +#include "qemu-common.h" #include "cpu.h" -#include "exec/cpu-all.h" #include "sysemu/memory_mapping.h" #include "exec/memory.h" #include "exec/address-spaces.h" From 6e0b07306d1793e8402dd218d2e38a7377b5fc27 Mon Sep 17 00:00:00 2001 From: Peter Crosthwaite Date: Sat, 30 May 2015 23:11:34 -0700 Subject: [PATCH 04/19] cpu-defs: Move CPU_TEMP_BUF_NLONGS to tcg The usages of this define are pure TCG, and there is no architecture-specific variation of the value. Localise it to the TCG engine to remove another architecture-agnostic piece from cpu-defs.h. This follows on from a28177820a868eafda8fab007561cc19f41941f4, where temp_buf was moved out of CPU_COMMON, obsoleting the need for the super-early definition. Reviewed-by: Richard Henderson Signed-off-by: Peter Crosthwaite Message-Id: <498e8e5325c1a1aff79e5bcfc28cb760ef6b214e.1433052532.git.crosthwaite.peter@gmail.com> Signed-off-by: Paolo Bonzini --- include/exec/cpu-defs.h | 1 - tcg/tcg.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h index d5aecaf49e..817889bb04 100644 --- a/include/exec/cpu-defs.h +++ b/include/exec/cpu-defs.h @@ -161,7 +161,6 @@ typedef struct CPUIOTLBEntry { #endif -#define CPU_TEMP_BUF_NLONGS 128 #define CPU_COMMON \ /* soft mmu support */ \ CPU_COMMON_TLB \ diff --git a/tcg/tcg.h b/tcg/tcg.h index 41e486959d..231a781524 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ -29,6 +29,8 @@ #include "qemu/bitops.h" #include "tcg-target.h" +#define CPU_TEMP_BUF_NLONGS 128 + /* Default target word size to pointer size. */ #ifndef TCG_TARGET_REG_BITS # if UINTPTR_MAX == UINT32_MAX From 9e0dc48c9f05505b53cb28f860456a0648e56ddf Mon Sep 17 00:00:00 2001 From: Peter Crosthwaite Date: Sat, 30 May 2015 23:11:42 -0700 Subject: [PATCH 05/19] include/exec: Move standard exceptions to cpu-all.h These exception indices are generic and don't have any reliance on the per-arch cpu.h defs. Move them to cpu-all.h so they can be used by core code that does not have access to cpu-defs.h.
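The payoff is that generic code can test these return values without dragging in per-target headers. A minimal sketch of the kind of consumer this enables, modeled loosely on the vCPU thread loops in cpus.c (not a verbatim excerpt):

    int r = kvm_cpu_exec(cpu);       /* returns one of the EXCP_* indices */
    if (r == EXCP_DEBUG) {
        cpu_handle_guest_debug(cpu); /* stopped on a breakpoint/singlestep */
    }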
Reviewed-by: Richard Henderson Signed-off-by: Peter Crosthwaite Message-Id: Signed-off-by: Paolo Bonzini --- include/exec/cpu-all.h | 6 ++++++ include/exec/cpu-defs.h | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index ac06c6721c..8999634981 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -26,6 +26,12 @@ #include "qom/cpu.h" #include "qemu/rcu.h" +#define EXCP_INTERRUPT 0x10000 /* async interruption */ +#define EXCP_HLT 0x10001 /* hlt instruction reached */ +#define EXCP_DEBUG 0x10002 /* cpu stopped after a breakpoint or singlestep */ +#define EXCP_HALTED 0x10003 /* cpu is halted (waiting for external event) */ +#define EXCP_YIELD 0x10004 /* cpu wants to yield timeslice to another */ + /* some important defines: * * WORDS_ALIGNED : if defined, the host cpu can only make word aligned diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h index 817889bb04..247829c968 100644 --- a/include/exec/cpu-defs.h +++ b/include/exec/cpu-defs.h @@ -56,12 +56,6 @@ typedef uint64_t target_ulong; #error TARGET_LONG_SIZE undefined #endif -#define EXCP_INTERRUPT 0x10000 /* async interruption */ -#define EXCP_HLT 0x10001 /* hlt instruction reached */ -#define EXCP_DEBUG 0x10002 /* cpu stopped after a breakpoint or singlestep */ -#define EXCP_HALTED 0x10003 /* cpu is halted (waiting for external event) */ -#define EXCP_YIELD 0x10004 /* cpu wants to yield timeslice to another */ - /* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for addresses on the same page. The top bits are the same. This allows TLB invalidation to quickly clear a subset of the hash table. */ From e1b89321bafea9fb33d87852fc91fee579d17dfe Mon Sep 17 00:00:00 2001 From: Peter Crosthwaite Date: Sat, 30 May 2015 23:11:45 -0700 Subject: [PATCH 06/19] include/exec: Move tb hash functions out This is one of very few things in exec-all with a genuine CPU architecture dependency. Move these hashing helpers to a new header to trim exec-all.h down to a near architecture-agnostic header. The defs are only used by cpu-exec and translate-all, which are both arch objs, so the new tb-hash.h has no core-code usage. Reviewed-by: Richard Henderson Signed-off-by: Peter Crosthwaite Message-Id: <9d048b96f7cfa64a4d9c0b88e0dd2877fac51d41.1433052532.git.crosthwaite.peter@gmail.com> Signed-off-by: Paolo Bonzini --- cpu-exec.c | 1 + include/exec/exec-all.h | 20 ------------------- include/exec/tb-hash.h | 43 +++++++++++++++++++++++++++++++++++++++++ translate-all.c | 1 + 4 files changed, 45 insertions(+), 20 deletions(-) create mode 100644 include/exec/tb-hash.h diff --git a/cpu-exec.c b/cpu-exec.c index 2ffeb6e40d..b2724c18c1 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -27,6 +27,7 @@ #include "exec/address-spaces.h" #include "exec/memory-internal.h" #include "qemu/rcu.h" +#include "exec/tb-hash.h" /* -icount align implementation.
*/ diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 2573e8c36e..d678114cb2 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -195,26 +195,6 @@ struct TBContext { int tb_invalidated_flag; }; -static inline unsigned int tb_jmp_cache_hash_page(target_ulong pc) -{ - target_ulong tmp; - tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)); - return (tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK; -} - -static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc) -{ - target_ulong tmp; - tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)); - return (((tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK) - | (tmp & TB_JMP_ADDR_MASK)); -} - -static inline unsigned int tb_phys_hash_func(tb_page_addr_t pc) -{ - return (pc >> 2) & (CODE_GEN_PHYS_HASH_SIZE - 1); -} - void tb_free(TranslationBlock *tb); void tb_flush(CPUArchState *env); void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr); diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h new file mode 100644 index 0000000000..e0bd786f9f --- /dev/null +++ b/include/exec/tb-hash.h @@ -0,0 +1,43 @@ +/* + * internal execution defines for qemu + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . + */ + +#ifndef EXEC_TB_HASH +#define EXEC_TB_HASH + +static inline unsigned int tb_jmp_cache_hash_page(target_ulong pc) +{ + target_ulong tmp; + tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)); + return (tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK; +} + +static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc) +{ + target_ulong tmp; + tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)); + return (((tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK) + | (tmp & TB_JMP_ADDR_MASK)); +} + +static inline unsigned int tb_phys_hash_func(tb_page_addr_t pc) +{ + return (pc >> 2) & (CODE_GEN_PHYS_HASH_SIZE - 1); +} + +#endif diff --git a/translate-all.c b/translate-all.c index b6b0e1c098..412bc9005f 100644 --- a/translate-all.c +++ b/translate-all.c @@ -58,6 +58,7 @@ #endif #include "exec/cputlb.h" +#include "exec/tb-hash.h" #include "translate-all.h" #include "qemu/bitmap.h" #include "qemu/timer.h" From 41da4bd6420afd1209c408974920f63ff9c658e1 Mon Sep 17 00:00:00 2001 From: Peter Crosthwaite Date: Sat, 30 May 2015 23:11:46 -0700 Subject: [PATCH 07/19] cpu-defs: Move out TB_JMP defines These are not architecture-specific in any way, so move them out of cpu-defs.h. tb-hash.h is an appropriate place: it is a leading user, and the defines have a strong relationship to TB hashing and caching.
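To make the layout concrete (assuming the TB_JMP_CACHE_BITS value of 12 defined in qom/cpu.h at this point in history):

    TB_JMP_PAGE_BITS = 12 / 2    = 6
    TB_JMP_PAGE_SIZE = 1 << 6    = 64
    TB_JMP_ADDR_MASK = 64 - 1    = 0x03f  /* low bits: vary within a page */
    TB_JMP_PAGE_MASK = 4096 - 64 = 0xfc0  /* high bits: select the page bucket */

so all jump-cache entries for one guest page occupy 64 consecutive slots, which TLB invalidation can clear as a single contiguous run.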
Reviewed-by: Richard Henderson Signed-off-by: Peter Crosthwaite Message-Id: <43ceca65a3fa240efac49aa0bf604ad0442e1710.1433052532.git.crosthwaite.peter@gmail.com> Signed-off-by: Paolo Bonzini --- include/exec/cpu-defs.h | 8 -------- include/exec/tb-hash.h | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h index 247829c968..98b9cff310 100644 --- a/include/exec/cpu-defs.h +++ b/include/exec/cpu-defs.h @@ -56,14 +56,6 @@ typedef uint64_t target_ulong; #error TARGET_LONG_SIZE undefined #endif -/* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for - addresses on the same page. The top bits are the same. This allows - TLB invalidation to quickly clear a subset of the hash table. */ -#define TB_JMP_PAGE_BITS (TB_JMP_CACHE_BITS / 2) -#define TB_JMP_PAGE_SIZE (1 << TB_JMP_PAGE_BITS) -#define TB_JMP_ADDR_MASK (TB_JMP_PAGE_SIZE - 1) -#define TB_JMP_PAGE_MASK (TB_JMP_CACHE_SIZE - TB_JMP_PAGE_SIZE) - #if !defined(CONFIG_USER_ONLY) /* use a fully associative victim tlb of 8 entries */ #define CPU_VTLB_SIZE 8 diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h index e0bd786f9f..0f4e8a08af 100644 --- a/include/exec/tb-hash.h +++ b/include/exec/tb-hash.h @@ -20,6 +20,14 @@ #ifndef EXEC_TB_HASH #define EXEC_TB_HASH +/* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for + addresses on the same page. The top bits are the same. This allows + TLB invalidation to quickly clear a subset of the hash table. */ +#define TB_JMP_PAGE_BITS (TB_JMP_CACHE_BITS / 2) +#define TB_JMP_PAGE_SIZE (1 << TB_JMP_PAGE_BITS) +#define TB_JMP_ADDR_MASK (TB_JMP_PAGE_SIZE - 1) +#define TB_JMP_PAGE_MASK (TB_JMP_CACHE_SIZE - TB_JMP_PAGE_SIZE) + static inline unsigned int tb_jmp_cache_hash_page(target_ulong pc) { target_ulong tmp; From bdf026317daa3b9dfa281f29e96fbb6fd48394c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=A9=AC=E6=96=87=E9=9C=9C?= Date: Wed, 1 Jul 2015 15:41:41 +0200 Subject: [PATCH 08/19] Fix irq route entries exceeding KVM_MAX_IRQ_ROUTES Last month we experienced several guest crashes (6-8 cores); the QEMU logs displayed the following message: qemu-system-x86_64: /build/qemu-2.1.2/kvm-all.c:976: kvm_irqchip_commit_routes: Assertion `ret == 0' failed. After analysis and verification, we confirmed that the irqbalance daemon in the guest leads to the assertion failure. Start an 8-core guest with two disks; executing the following script reproduces the bug quickly: irq_affinity.sh ======================================================================== vda_irq_num=25 vdb_irq_num=27 while [ 1 ] do for irq in {1,2,4,8,10,20,40,80} do echo $irq > /proc/irq/$vda_irq_num/smp_affinity echo $irq > /proc/irq/$vdb_irq_num/smp_affinity dd if=/dev/vda of=/dev/zero bs=4K count=100 iflag=direct dd if=/dev/vdb of=/dev/zero bs=4K count=100 iflag=direct done done ======================================================================== QEMU sets up static irq route entries in kvm_pc_setup_irq_routing(); the PIC and IOAPIC share the first 15 GSI numbers, so together they take up 23 GSI numbers but 38 irq route entries. When changing an irq's smp_affinity in the guest, a dynamic route entry may be set up; the current logic is: if allocating a GSI number succeeds, a new route entry can be added. The number of available dynamic GSIs is 1021 (KVM_MAX_IRQ_ROUTES-23), but the number of available irq route entries is only 986 (KVM_MAX_IRQ_ROUTES-38), so there are more GSI numbers than route entries. 
irqbalance's behavior will eventually lead the total number of irq route entries to exceed KVM_MAX_IRQ_ROUTES; then ioctl(KVM_SET_GSI_ROUTING) fails and kvm_irqchip_commit_routes() triggers the assertion failure. This patch fixes the bug. Signed-off-by: Wenshuang Ma Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini --- kvm-all.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 53e01d468e..e98b08def2 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1099,9 +1099,17 @@ static int kvm_irqchip_get_virq(KVMState *s) uint32_t *word = s->used_gsi_bitmap; int max_words = ALIGN(s->gsi_count, 32) / 32; int i, zeroes; - bool retry = true; -again: + /* + * PIC and IOAPIC share the first 16 GSI numbers, thus the available + * GSI numbers are more than the number of IRQ routes. Allocating a GSI + * number can succeed even though a new route entry cannot be added. + * When this happens, flush dynamic MSI entries to free IRQ route entries. + */ + if (!s->direct_msi && s->irq_routes->nr == s->gsi_count) { + kvm_flush_dynamic_msi_routes(s); + } + /* Return the lowest unused GSI in the bitmap */ for (i = 0; i < max_words; i++) { zeroes = ctz32(~word[i]); @@ -1111,11 +1119,6 @@ again: return zeroes + i * 32; } - if (!s->direct_msi && retry) { - retry = false; - kvm_flush_dynamic_msi_routes(s); - goto again; - } return -ENOSPC; } From 2e7f7a3c86f884a77296a137b7c730a4d580c5c9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 18 Jun 2015 18:47:18 +0200 Subject: [PATCH 09/19] main-loop: use qemu_mutex_lock_iothread consistently The next patch will require the BQL to always be taken with qemu_mutex_lock_iothread(), while right now this isn't the case. Outside TCG mode this is not a problem. In TCG mode, we need to be careful and avoid the "prod out of compiled code" step if already in a VCPU thread. This is easily done with a check on current_cpu, i.e. qemu_in_vcpu_thread(). Hopefully, multithreaded TCG will get rid of the whole logic to kick VCPUs whenever an I/O event occurs! Cc: Frederic Konrad Message-Id: <1434646046-27150-2-git-send-email-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- cpus.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cpus.c b/cpus.c index 4f0e54d53c..c09fbef5de 100644 --- a/cpus.c +++ b/cpus.c @@ -954,7 +954,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg) CPUState *cpu = arg; int r; - qemu_mutex_lock(&qemu_global_mutex); + qemu_mutex_lock_iothread(); qemu_thread_get_self(cpu->thread); cpu->thread_id = qemu_get_thread_id(); cpu->can_do_io = 1; @@ -1034,10 +1034,10 @@ static void *qemu_tcg_cpu_thread_fn(void *arg) { CPUState *cpu = arg; + qemu_mutex_lock_iothread(); qemu_tcg_init_cpu_signals(); qemu_thread_get_self(cpu->thread); - qemu_mutex_lock(&qemu_global_mutex); CPU_FOREACH(cpu) { cpu->thread_id = qemu_get_thread_id(); cpu->created = true; @@ -1149,7 +1149,11 @@ bool qemu_in_vcpu_thread(void) void qemu_mutex_lock_iothread(void) { atomic_inc(&iothread_requesting_mutex); - if (!tcg_enabled() || !first_cpu || !first_cpu->thread) { + /* In the simple case there is no need to bump the VCPU thread out of * TCG code execution.
+ */ + if (!tcg_enabled() || qemu_in_vcpu_thread() || + !first_cpu || !first_cpu->thread) { qemu_mutex_lock(&qemu_global_mutex); atomic_dec(&iothread_requesting_mutex); } else { From afbe70535ff1a8a7a32910cc15ebecc0ba92e7da Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 18 Jun 2015 18:47:19 +0200 Subject: [PATCH 10/19] main-loop: introduce qemu_mutex_iothread_locked This function will be used to avoid recursive locking of the iothread lock whenever address_space_rw/ld*/st* are called with the BQL held, which is almost always the case. Tracking whether the iothread is owned is very cheap (just use a TLS variable) but requires some care because now the lock must always be taken with qemu_mutex_lock_iothread(). Previously this wasn't the case. Outside TCG mode this is not a problem. In TCG mode, we need to be careful and avoid the "prod out of compiled code" step if already in a VCPU thread. This is easily done with a check on current_cpu, i.e. qemu_in_vcpu_thread(). Hopefully, multithreaded TCG will get rid of the whole logic to kick VCPUs whenever an I/O event occurs! Cc: Frederic Konrad Message-Id: <1434646046-27150-3-git-send-email-pbonzini@redhat.com> Reviewed-by: Fam Zheng Signed-off-by: Paolo Bonzini --- cpus.c | 9 +++++++++ include/qemu/main-loop.h | 10 ++++++++++ stubs/iothread-lock.c | 5 +++++ 3 files changed, 24 insertions(+) diff --git a/cpus.c b/cpus.c index c09fbef5de..f547aebeaf 100644 --- a/cpus.c +++ b/cpus.c @@ -1146,6 +1146,13 @@ bool qemu_in_vcpu_thread(void) return current_cpu && qemu_cpu_is_self(current_cpu); } +static __thread bool iothread_locked = false; + +bool qemu_mutex_iothread_locked(void) +{ + return iothread_locked; +} + void qemu_mutex_lock_iothread(void) { atomic_inc(&iothread_requesting_mutex); @@ -1164,10 +1171,12 @@ void qemu_mutex_lock_iothread(void) atomic_dec(&iothread_requesting_mutex); qemu_cond_broadcast(&qemu_io_proceeded_cond); } + iothread_locked = true; } void qemu_mutex_unlock_iothread(void) { + iothread_locked = false; qemu_mutex_unlock(&qemu_global_mutex); } diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h index 0f4a0fd4b2..bc18ca30e4 100644 --- a/include/qemu/main-loop.h +++ b/include/qemu/main-loop.h @@ -222,6 +222,16 @@ void qemu_set_fd_handler(int fd, int qemu_add_child_watch(pid_t pid); #endif +/** + * qemu_mutex_iothread_locked: Return lock status of the main loop mutex. + * + * The main loop mutex is the coarsest lock in QEMU, and as such it + * must always be taken outside other locks. This function helps + * functions take different paths depending on whether the current + * thread is running within the main loop mutex. + */ +bool qemu_mutex_iothread_locked(void); + /** * qemu_mutex_lock_iothread: Lock the main loop mutex. * diff --git a/stubs/iothread-lock.c b/stubs/iothread-lock.c index 5d8aca1b37..dda6f6b58d 100644 --- a/stubs/iothread-lock.c +++ b/stubs/iothread-lock.c @@ -1,6 +1,11 @@ #include "qemu-common.h" #include "qemu/main-loop.h" +bool qemu_mutex_iothread_locked(void) +{ + return true; +} + void qemu_mutex_lock_iothread(void) { } From 196ea13104f802c508e57180b2a0d2b3418989a3 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 18 Jun 2015 18:47:20 +0200 Subject: [PATCH 11/19] memory: Add global-locking property to memory regions This introduces the memory region property "global_locking". It is true by default. By setting it to false, a device model can request BQL-free dispatching of region accesses to its r/w handlers. The actual BQL break-up will be provided in a separate patch. 
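To make the intended use concrete, a device model that does its own fine-grained locking would opt out roughly as follows (hypothetical device state s and mydev_ops; patch 16 below does exactly this for the ACPI PM timer region):

    memory_region_init_io(&s->iomem, OBJECT(s), &mydev_ops, s,
                          "mydev-regs", 0x100);
    memory_region_clear_global_locking(&s->iomem);
    /* the mydev_ops read/write callbacks may now run without the BQL and
     * must take the device's own lock instead */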
Signed-off-by: Jan Kiszka Cc: Frederic Konrad Signed-off-by: Paolo Bonzini Message-Id: <1434646046-27150-4-git-send-email-pbonzini@redhat.com> --- include/exec/memory.h | 26 ++++++++++++++++++++++++++ memory.c | 11 +++++++++++ 2 files changed, 37 insertions(+) diff --git a/include/exec/memory.h b/include/exec/memory.h index 8ae004eb06..0ebdc55506 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -180,6 +180,7 @@ struct MemoryRegion { bool rom_device; bool warning_printed; /* For reservations */ bool flush_coalesced_mmio; + bool global_locking; MemoryRegion *alias; hwaddr alias_offset; int32_t priority; @@ -824,6 +825,31 @@ void memory_region_set_flush_coalesced(MemoryRegion *mr); */ void memory_region_clear_flush_coalesced(MemoryRegion *mr); +/** + * memory_region_set_global_locking: Declares that access processing requires + * QEMU's global lock. + * + * When this is invoked, accesses to the memory region will be processed while + * holding the global lock of QEMU. This is the default behavior of memory + * regions. + * + * @mr: the memory region to be updated. + */ +void memory_region_set_global_locking(MemoryRegion *mr); + +/** + * memory_region_clear_global_locking: Declares that access processing does + * not depend on the QEMU global lock. + * + * By clearing this property, accesses to the memory region will be processed + * outside of QEMU's global lock (unless the lock is already held when issuing + * the access request). In this case, the device model implementing the access + * handlers is responsible for synchronizing concurrent accesses. + * + * @mr: the memory region to be updated. + */ +void memory_region_clear_global_locking(MemoryRegion *mr); + /** * memory_region_add_eventfd: Request an eventfd to be triggered when a word * is written to a location. diff --git a/memory.c b/memory.c index 3ac0bd20d2..b0b8860aaa 100644 --- a/memory.c +++ b/memory.c @@ -1012,6 +1012,7 @@ static void memory_region_initfn(Object *obj) mr->ram_addr = RAM_ADDR_INVALID; mr->enabled = true; mr->romd_mode = true; + mr->global_locking = true; mr->destructor = memory_region_destructor_none; QTAILQ_INIT(&mr->subregions); QTAILQ_INIT(&mr->coalesced); @@ -1646,6 +1647,16 @@ void memory_region_clear_flush_coalesced(MemoryRegion *mr) } } +void memory_region_set_global_locking(MemoryRegion *mr) +{ + mr->global_locking = true; +} + +void memory_region_clear_global_locking(MemoryRegion *mr) +{ + mr->global_locking = false; +} + void memory_region_add_eventfd(MemoryRegion *mr, hwaddr addr, unsigned size, From 125b3806668106667dd2ae049593852859e12b63 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 18 Jun 2015 18:47:21 +0200 Subject: [PATCH 12/19] exec: pull qemu_flush_coalesced_mmio_buffer() into address_space_rw/ld*/st* As memory_region_read/write_accessor will now also run without the BQL held, we need to move coalesced MMIO flushing earlier in the dispatch process.
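For context: coalescing is opted into per region with memory_region_set_coalescing(), after which guest writes may sit batched in a KVM ring until the next flush. Moving the flush into the dispatch path keeps the ordering guarantee (one flush before any MMIO access) instead of repeating it in all four accessors. A hedged sketch of the device side (hypothetical framebuffer region s->vram):

    memory_region_set_coalescing(&s->vram);  /* writes may be batched by KVM */
    /* any later MMIO dispatch first drains the batch, preserving ordering */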
Cc: Frederic Konrad Message-Id: <1434646046-27150-5-git-send-email-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- exec.c | 21 +++++++++++++++++++++ memory.c | 12 ------------ 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/exec.c b/exec.c index f7883d2246..f2e66032a4 100644 --- a/exec.c +++ b/exec.c @@ -2316,6 +2316,13 @@ static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr) return l; } +static void prepare_mmio_access(MemoryRegion *mr) +{ + if (mr->flush_coalesced_mmio) { + qemu_flush_coalesced_mmio_buffer(); + } +} + MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, uint8_t *buf, int len, bool is_write) { @@ -2333,6 +2340,7 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, if (is_write) { if (!memory_access_is_direct(mr, is_write)) { + prepare_mmio_access(mr); l = memory_access_size(mr, l, addr1); /* XXX: could force current_cpu to NULL to avoid potential bugs */ @@ -2374,6 +2382,7 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, } else { if (!memory_access_is_direct(mr, is_write)) { /* I/O case */ + prepare_mmio_access(mr); l = memory_access_size(mr, l, addr1); switch (l) { case 8: @@ -2739,6 +2748,8 @@ static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr, rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, false); if (l < 4 || !memory_access_is_direct(mr, false)) { + prepare_mmio_access(mr); + /* I/O case */ r = memory_region_dispatch_read(mr, addr1, &val, 4, attrs); #if defined(TARGET_WORDS_BIGENDIAN) @@ -2828,6 +2839,8 @@ static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr, mr = address_space_translate(as, addr, &addr1, &l, false); if (l < 8 || !memory_access_is_direct(mr, false)) { + prepare_mmio_access(mr); + /* I/O case */ r = memory_region_dispatch_read(mr, addr1, &val, 8, attrs); #if defined(TARGET_WORDS_BIGENDIAN) @@ -2937,6 +2950,8 @@ static inline uint32_t address_space_lduw_internal(AddressSpace *as, mr = address_space_translate(as, addr, &addr1, &l, false); if (l < 2 || !memory_access_is_direct(mr, false)) { + prepare_mmio_access(mr); + /* I/O case */ r = memory_region_dispatch_read(mr, addr1, &val, 2, attrs); #if defined(TARGET_WORDS_BIGENDIAN) @@ -3026,6 +3041,8 @@ void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val, mr = address_space_translate(as, addr, &addr1, &l, true); if (l < 4 || !memory_access_is_direct(mr, true)) { + prepare_mmio_access(mr); + r = memory_region_dispatch_write(mr, addr1, val, 4, attrs); } else { addr1 += memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK; @@ -3065,6 +3082,8 @@ static inline void address_space_stl_internal(AddressSpace *as, mr = address_space_translate(as, addr, &addr1, &l, true); if (l < 4 || !memory_access_is_direct(mr, true)) { + prepare_mmio_access(mr); + #if defined(TARGET_WORDS_BIGENDIAN) if (endian == DEVICE_LITTLE_ENDIAN) { val = bswap32(val); @@ -3169,6 +3188,8 @@ static inline void address_space_stw_internal(AddressSpace *as, rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, true); if (l < 2 || !memory_access_is_direct(mr, true)) { + prepare_mmio_access(mr); + #if defined(TARGET_WORDS_BIGENDIAN) if (endian == DEVICE_LITTLE_ENDIAN) { val = bswap16(val); diff --git a/memory.c b/memory.c index b0b8860aaa..5a0cc66982 100644 --- a/memory.c +++ b/memory.c @@ -396,9 +396,6 @@ static MemTxResult memory_region_read_accessor(MemoryRegion *mr, { uint64_t tmp; - if (mr->flush_coalesced_mmio) { 
- qemu_flush_coalesced_mmio_buffer(); - } tmp = mr->ops->read(mr->opaque, addr, size); trace_memory_region_ops_read(mr, addr, tmp, size); *value |= (tmp & mask) << shift; @@ -416,9 +413,6 @@ static MemTxResult memory_region_read_with_attrs_accessor(MemoryRegion *mr, uint64_t tmp = 0; MemTxResult r; - if (mr->flush_coalesced_mmio) { - qemu_flush_coalesced_mmio_buffer(); - } r = mr->ops->read_with_attrs(mr->opaque, addr, &tmp, size, attrs); trace_memory_region_ops_read(mr, addr, tmp, size); *value |= (tmp & mask) << shift; @@ -451,9 +445,6 @@ static MemTxResult memory_region_write_accessor(MemoryRegion *mr, { uint64_t tmp; - if (mr->flush_coalesced_mmio) { - qemu_flush_coalesced_mmio_buffer(); - } tmp = (*value >> shift) & mask; trace_memory_region_ops_write(mr, addr, tmp, size); mr->ops->write(mr->opaque, addr, tmp, size); @@ -470,9 +461,6 @@ static MemTxResult memory_region_write_with_attrs_accessor(MemoryRegion *mr, { uint64_t tmp; - if (mr->flush_coalesced_mmio) { - qemu_flush_coalesced_mmio_buffer(); - } tmp = (*value >> shift) & mask; trace_memory_region_ops_write(mr, addr, tmp, size); return mr->ops->write_with_attrs(mr->opaque, addr, tmp, size, attrs); From 4840f10eff37eebc609fcc933ab985dc66df95c6 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 18 Jun 2015 18:47:22 +0200 Subject: [PATCH 13/19] memory: let address_space_rw/ld*/st* run outside the BQL The MMIO case is further broken up into two cases: if the caller does not hold the BQL on invocation, the unlocked one takes or avoids the BQL depending on the locking strategy of the target memory region and its coalesced MMIO handling. In this case, the caller should not hold _any_ lock (a friendly suggestion which is disregarded by virtio-scsi-dataplane). Signed-off-by: Jan Kiszka Cc: Frederic Konrad Message-Id: <1434646046-27150-6-git-send-email-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- exec.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/exec.c b/exec.c index f2e66032a4..3457f7e828 100644 --- a/exec.c +++ b/exec.c @@ -48,6 +48,7 @@ #endif #include "exec/cpu-all.h" #include "qemu/rcu_queue.h" +#include "qemu/main-loop.h" #include "exec/cputlb.h" #include "translate-all.h" @@ -2316,11 +2317,27 @@ static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr) return l; } -static void prepare_mmio_access(MemoryRegion *mr) +static bool prepare_mmio_access(MemoryRegion *mr) { - if (mr->flush_coalesced_mmio) { - qemu_flush_coalesced_mmio_buffer(); + bool unlocked = !qemu_mutex_iothread_locked(); + bool release_lock = false; + + if (unlocked && mr->global_locking) { + qemu_mutex_lock_iothread(); + unlocked = false; + release_lock = true; } + if (mr->flush_coalesced_mmio) { + if (unlocked) { + qemu_mutex_lock_iothread(); + } + qemu_flush_coalesced_mmio_buffer(); + if (unlocked) { + qemu_mutex_unlock_iothread(); + } + } + + return release_lock; } MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, @@ -2332,6 +2349,7 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, hwaddr addr1; MemoryRegion *mr; MemTxResult result = MEMTX_OK; + bool release_lock = false; rcu_read_lock(); while (len > 0) { @@ -2340,7 +2358,7 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, if (is_write) { if (!memory_access_is_direct(mr, is_write)) { - prepare_mmio_access(mr); + release_lock |= prepare_mmio_access(mr); l = memory_access_size(mr, l, addr1); /* XXX: could force current_cpu 
to NULL to avoid potential bugs */ @@ -2382,7 +2400,7 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, } else { if (!memory_access_is_direct(mr, is_write)) { /* I/O case */ - prepare_mmio_access(mr); + release_lock |= prepare_mmio_access(mr); l = memory_access_size(mr, l, addr1); switch (l) { case 8: @@ -2418,6 +2436,12 @@ MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, memcpy(buf, ptr, l); } } + + if (release_lock) { + qemu_mutex_unlock_iothread(); + release_lock = false; + } + len -= l; buf += l; addr += l; @@ -2744,11 +2768,12 @@ static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr, hwaddr l = 4; hwaddr addr1; MemTxResult r; + bool release_lock = false; rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, false); if (l < 4 || !memory_access_is_direct(mr, false)) { - prepare_mmio_access(mr); + release_lock |= prepare_mmio_access(mr); /* I/O case */ r = memory_region_dispatch_read(mr, addr1, &val, 4, attrs); @@ -2782,6 +2807,9 @@ static inline uint32_t address_space_ldl_internal(AddressSpace *as, hwaddr addr, if (result) { *result = r; } + if (release_lock) { + qemu_mutex_unlock_iothread(); + } rcu_read_unlock(); return val; } @@ -2834,12 +2862,13 @@ static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr, hwaddr l = 8; hwaddr addr1; MemTxResult r; + bool release_lock = false; rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, false); if (l < 8 || !memory_access_is_direct(mr, false)) { - prepare_mmio_access(mr); + release_lock |= prepare_mmio_access(mr); /* I/O case */ r = memory_region_dispatch_read(mr, addr1, &val, 8, attrs); @@ -2873,6 +2902,9 @@ static inline uint64_t address_space_ldq_internal(AddressSpace *as, hwaddr addr, if (result) { *result = r; } + if (release_lock) { + qemu_mutex_unlock_iothread(); + } rcu_read_unlock(); return val; } @@ -2945,12 +2977,13 @@ static inline uint32_t address_space_lduw_internal(AddressSpace *as, hwaddr l = 2; hwaddr addr1; MemTxResult r; + bool release_lock = false; rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, false); if (l < 2 || !memory_access_is_direct(mr, false)) { - prepare_mmio_access(mr); + release_lock |= prepare_mmio_access(mr); /* I/O case */ r = memory_region_dispatch_read(mr, addr1, &val, 2, attrs); @@ -2984,6 +3017,9 @@ static inline uint32_t address_space_lduw_internal(AddressSpace *as, if (result) { *result = r; } + if (release_lock) { + qemu_mutex_unlock_iothread(); + } rcu_read_unlock(); return val; } @@ -3036,12 +3072,13 @@ void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val, hwaddr addr1; MemTxResult r; uint8_t dirty_log_mask; + bool release_lock = false; rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, true); if (l < 4 || !memory_access_is_direct(mr, true)) { - prepare_mmio_access(mr); + release_lock |= prepare_mmio_access(mr); r = memory_region_dispatch_write(mr, addr1, val, 4, attrs); } else { @@ -3057,6 +3094,9 @@ void address_space_stl_notdirty(AddressSpace *as, hwaddr addr, uint32_t val, if (result) { *result = r; } + if (release_lock) { + qemu_mutex_unlock_iothread(); + } rcu_read_unlock(); } @@ -3077,12 +3117,13 @@ static inline void address_space_stl_internal(AddressSpace *as, hwaddr l = 4; hwaddr addr1; MemTxResult r; + bool release_lock = false; rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, true); if (l < 4 || !memory_access_is_direct(mr, true)) { - prepare_mmio_access(mr); + release_lock 
|= prepare_mmio_access(mr); #if defined(TARGET_WORDS_BIGENDIAN) if (endian == DEVICE_LITTLE_ENDIAN) { val = bswap32(val); @@ -3115,6 +3156,9 @@ static inline void address_space_stl_internal(AddressSpace *as, if (result) { *result = r; } + if (release_lock) { + qemu_mutex_unlock_iothread(); + } rcu_read_unlock(); } @@ -3184,11 +3228,12 @@ static inline void address_space_stw_internal(AddressSpace *as, hwaddr l = 2; hwaddr addr1; MemTxResult r; + bool release_lock = false; rcu_read_lock(); mr = address_space_translate(as, addr, &addr1, &l, true); if (l < 2 || !memory_access_is_direct(mr, true)) { - prepare_mmio_access(mr); + release_lock |= prepare_mmio_access(mr); #if defined(TARGET_WORDS_BIGENDIAN) if (endian == DEVICE_LITTLE_ENDIAN) { val = bswap16(val); @@ -3221,6 +3266,9 @@ static inline void address_space_stw_internal(AddressSpace *as, if (result) { *result = r; } + if (release_lock) { + qemu_mutex_unlock_iothread(); + } rcu_read_unlock(); } From 4b8523ee896750c37b4fa224a40d34703cbdf4c6 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 18 Jun 2015 18:47:23 +0200 Subject: [PATCH 14/19] kvm: First step to push iothread lock out of inner run loop This opens the path to get rid of the iothread lock on vmexits in KVM mode. On x86, the in-kernel irqchip has to be used because we otherwise need to synchronize APIC and other per-cpu state accesses that could be changed concurrently. Regarding pre/post-run callbacks, s390x and ARM should be fine without specific locking as the callbacks are empty. MIPS and POWER require locking for the pre-run callback. For the handle_exit callback, it is non-empty in x86, POWER and s390. Some POWER cases could do without the locking, but it is left in place for now. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini Message-Id: <1434646046-27150-7-git-send-email-pbonzini@redhat.com> --- kvm-all.c | 10 ++++++++-- target-i386/kvm.c | 24 ++++++++++++++++++++++++ target-mips/kvm.c | 4 ++++ target-ppc/kvm.c | 7 +++++++ target-s390x/kvm.c | 3 +++ 5 files changed, 46 insertions(+), 2 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index e98b08def2..ca428ca298 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1755,6 +1755,8 @@ int kvm_cpu_exec(CPUState *cpu) return EXCP_HLT; } + qemu_mutex_unlock_iothread(); + do { MemTxAttrs attrs; @@ -1773,11 +1775,9 @@ int kvm_cpu_exec(CPUState *cpu) */ qemu_cpu_kick_self(); } - qemu_mutex_unlock_iothread(); run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); - qemu_mutex_lock_iothread(); attrs = kvm_arch_post_run(cpu, run); if (run_ret < 0) { @@ -1804,20 +1804,24 @@ int kvm_cpu_exec(CPUState *cpu) switch (run->exit_reason) { case KVM_EXIT_IO: DPRINTF("handle_io\n"); + qemu_mutex_lock_iothread(); kvm_handle_io(run->io.port, attrs, (uint8_t *)run + run->io.data_offset, run->io.direction, run->io.size, run->io.count); + qemu_mutex_unlock_iothread(); ret = 0; break; case KVM_EXIT_MMIO: DPRINTF("handle_mmio\n"); + qemu_mutex_lock_iothread(); address_space_rw(&address_space_memory, run->mmio.phys_addr, attrs, run->mmio.data, run->mmio.len, run->mmio.is_write); + qemu_mutex_unlock_iothread(); ret = 0; break; case KVM_EXIT_IRQ_WINDOW_OPEN: @@ -1860,6 +1864,8 @@ int kvm_cpu_exec(CPUState *cpu) } } while (ret == 0); + qemu_mutex_lock_iothread(); + if (ret < 0) { cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE); vm_stop(RUN_STATE_INTERNAL_ERROR); diff --git a/target-i386/kvm.c b/target-i386/kvm.c index daced5cb94..6426600c63 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -2191,7 +2191,10 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) /* Inject NMI */ if 
(cpu->interrupt_request & CPU_INTERRUPT_NMI) { + qemu_mutex_lock_iothread(); cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; + qemu_mutex_unlock_iothread(); + DPRINTF("injected NMI\n"); ret = kvm_vcpu_ioctl(cpu, KVM_NMI); if (ret < 0) { @@ -2200,6 +2203,10 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) } } + if (!kvm_irqchip_in_kernel()) { + qemu_mutex_lock_iothread(); + } + /* Force the VCPU out of its inner loop to process any INIT requests * or (for userspace APIC, but it is cheap to combine the checks here) * pending TPR access reports. @@ -2243,6 +2250,8 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) DPRINTF("setting tpr\n"); run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state); + + qemu_mutex_unlock_iothread(); } } @@ -2256,8 +2265,17 @@ MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run) } else { env->eflags &= ~IF_MASK; } + + /* We need to protect the apic state against concurrent accesses from + * different threads in case the userspace irqchip is used. */ + if (!kvm_irqchip_in_kernel()) { + qemu_mutex_lock_iothread(); + } cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8); cpu_set_apic_base(x86_cpu->apic_state, run->apic_base); + if (!kvm_irqchip_in_kernel()) { + qemu_mutex_unlock_iothread(); + } return cpu_get_mem_attrs(env); } @@ -2550,13 +2568,17 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) switch (run->exit_reason) { case KVM_EXIT_HLT: DPRINTF("handle_hlt\n"); + qemu_mutex_lock_iothread(); ret = kvm_handle_halt(cpu); + qemu_mutex_unlock_iothread(); break; case KVM_EXIT_SET_TPR: ret = 0; break; case KVM_EXIT_TPR_ACCESS: + qemu_mutex_lock_iothread(); ret = kvm_handle_tpr_access(cpu); + qemu_mutex_unlock_iothread(); break; case KVM_EXIT_FAIL_ENTRY: code = run->fail_entry.hardware_entry_failure_reason; @@ -2582,7 +2604,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) break; case KVM_EXIT_DEBUG: DPRINTF("kvm_exit_debug\n"); + qemu_mutex_lock_iothread(); ret = kvm_handle_debug(cpu, &run->debug.arch); + qemu_mutex_unlock_iothread(); break; default: fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); diff --git a/target-mips/kvm.c b/target-mips/kvm.c index 948619fbab..7d2293d934 100644 --- a/target-mips/kvm.c +++ b/target-mips/kvm.c @@ -99,6 +99,8 @@ void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) int r; struct kvm_mips_interrupt intr; + qemu_mutex_lock_iothread(); + if ((cs->interrupt_request & CPU_INTERRUPT_HARD) && cpu_mips_io_interrupts_pending(cpu)) { intr.cpu = -1; @@ -109,6 +111,8 @@ void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) __func__, cs->cpu_index, intr.irq); } } + + qemu_mutex_unlock_iothread(); } MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run) diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index afb4696b8a..ddf469fe09 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -1242,6 +1242,8 @@ void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) int r; unsigned irq; + qemu_mutex_lock_iothread(); + /* PowerPC QEMU tracks the various core input pins (interrupt, critical * interrupt, reset, etc) in PPC-specific env->irq_input_state. */ if (!cap_interrupt_level && @@ -1269,6 +1271,8 @@ void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run) /* We don't know if there are more interrupts pending after this. However, * the guest will return to userspace in the course of handling this one * anyways, so we will get a chance to deliver the rest. 
*/ + + qemu_mutex_unlock_iothread(); } MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run) @@ -1570,6 +1574,8 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) CPUPPCState *env = &cpu->env; int ret; + qemu_mutex_lock_iothread(); + switch (run->exit_reason) { case KVM_EXIT_DCR: if (run->dcr.is_write) { @@ -1620,6 +1626,7 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) break; } + qemu_mutex_unlock_iothread(); return ret; } diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c index 135111a2c4..ae3a0affec 100644 --- a/target-s390x/kvm.c +++ b/target-s390x/kvm.c @@ -2007,6 +2007,8 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) S390CPU *cpu = S390_CPU(cs); int ret = 0; + qemu_mutex_lock_iothread(); + switch (run->exit_reason) { case KVM_EXIT_S390_SIEIC: ret = handle_intercept(cpu); @@ -2027,6 +2029,7 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) fprintf(stderr, "Unknown KVM exit: %d\n", run->exit_reason); break; } + qemu_mutex_unlock_iothread(); if (ret == 0) { ret = EXCP_INTERRUPT; From 80b7d2efb63c225797345c152cdd3392b9fe7b72 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 18 Jun 2015 18:47:24 +0200 Subject: [PATCH 15/19] kvm: Switch to unlocked PIO Do not take the BQL before dispatching PIO requests of KVM VCPUs. Instead, address_space_rw will do it if necessary. This enables completely BQL-free PIO handling in KVM mode for upcoming devices with fine-grained locking. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini Message-Id: <1434646046-27150-8-git-send-email-pbonzini@redhat.com> --- kvm-all.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index ca428ca298..ad5ac5e3df 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1804,13 +1804,12 @@ int kvm_cpu_exec(CPUState *cpu) switch (run->exit_reason) { case KVM_EXIT_IO: DPRINTF("handle_io\n"); - qemu_mutex_lock_iothread(); + /* Called outside BQL */ kvm_handle_io(run->io.port, attrs, (uint8_t *)run + run->io.data_offset, run->io.direction, run->io.size, run->io.count); - qemu_mutex_unlock_iothread(); ret = 0; break; case KVM_EXIT_MMIO: From 7070e085d490c396f9237c8f10bf8b6e69cd0066 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 18 Jun 2015 18:47:25 +0200 Subject: [PATCH 16/19] acpi: mark PMTIMER as unlocked Accessing QEMU_CLOCK_VIRTUAL is thread-safe. Signed-off-by: Paolo Bonzini Message-Id: <1434646046-27150-9-git-send-email-pbonzini@redhat.com> --- hw/acpi/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/acpi/core.c b/hw/acpi/core.c index 0f201d8c6d..fe6215af4a 100644 --- a/hw/acpi/core.c +++ b/hw/acpi/core.c @@ -528,6 +528,7 @@ void acpi_pm_tmr_init(ACPIREGS *ar, acpi_update_sci_fn update_sci, ar->tmr.timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, acpi_pm_tmr_timer, ar); memory_region_init_io(&ar->tmr.io, memory_region_owner(parent), &acpi_pm_tmr_ops, ar, "acpi-tmr", 4); + memory_region_clear_global_locking(&ar->tmr.io); memory_region_add_subregion(parent, 8, &ar->tmr.io); } From de7ea885c5394c1fba7443cbf33bd2745d32e6c2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 18 Jun 2015 18:47:26 +0200 Subject: [PATCH 17/19] kvm: Switch to unlocked MMIO Do not take the BQL before dispatching MMIO requests of KVM VCPUs. Instead, address_space_rw will do it if necessary. This enables completely BQL-free MMIO handling in KVM mode for upcoming devices with fine-grained locking. 
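Combined with patches 11-13, the resulting fast path looks roughly like this (paraphrased, not a verbatim excerpt from kvm-all.c/exec.c):

    /* in kvm_cpu_exec(): the BQL is no longer held around the exit handler */
    address_space_rw(&address_space_memory, run->mmio.phys_addr, attrs,
                     run->mmio.data, run->mmio.len, run->mmio.is_write);

    /* ...which, per MMIO access, does: */
    release_lock |= prepare_mmio_access(mr); /* takes the BQL only if
                                                mr->global_locking is set */
    /* ... dispatch to the region's handlers ... */
    if (release_lock) {
        qemu_mutex_unlock_iothread();
    }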
Signed-off-by: Paolo Bonzini Message-Id: <1434646046-27150-10-git-send-email-pbonzini@redhat.com> --- kvm-all.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index ad5ac5e3df..df57da0bf2 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1814,13 +1814,12 @@ int kvm_cpu_exec(CPUState *cpu) break; case KVM_EXIT_MMIO: DPRINTF("handle_mmio\n"); - qemu_mutex_lock_iothread(); + /* Called outside BQL */ address_space_rw(&address_space_memory, run->mmio.phys_addr, attrs, run->mmio.data, run->mmio.len, run->mmio.is_write); - qemu_mutex_unlock_iothread(); ret = 0; break; case KVM_EXIT_IRQ_WINDOW_OPEN: From fba0a593b2809ecdda68650952cf3d3332ac1990 Mon Sep 17 00:00:00 2001 From: Peter Maydell Date: Fri, 3 Jul 2015 15:18:24 +0100 Subject: [PATCH 18/19] Stop including qemu-common.h in memory.h Including qemu-common.h from other header files is generally a bad idea, because it means it's very easy to end up with a circular dependency. For instance, if we wanted to include memory.h from qom/cpu.h we'd end up with this loop: memory.h -> qemu-common.h -> cpu.h -> cpu-qom.h -> qom/cpu.h -> memory.h Remove the include from memory.h. This requires us to fix up a few other files which were inadvertently getting declarations indirectly through memory.h. The biggest change is splitting the fprintf_function typedef out into its own header so other headers can get at it without having to include qemu-common.h. Signed-off-by: Peter Maydell Message-Id: <1435933104-15216-1-git-send-email-peter.maydell@linaro.org> Signed-off-by: Paolo Bonzini --- include/exec/cpu-common.h | 2 ++ include/exec/memory.h | 1 - include/hw/arm/arm.h | 1 + include/qemu-common.h | 4 +--- include/qemu/fprintf-fn.h | 17 +++++++++++++++++ target-s390x/mmu_helper.c | 2 +- 6 files changed, 22 insertions(+), 5 deletions(-) create mode 100644 include/qemu/fprintf-fn.h diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index de8a7200a9..9fb1d541d4 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -13,6 +13,8 @@ #include "qemu/bswap.h" #include "qemu/queue.h" +#include "qemu/fprintf-fn.h" +#include "qemu/typedefs.h" /** * CPUListState: diff --git a/include/exec/memory.h b/include/exec/memory.h index 0ebdc55506..139471500f 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -23,7 +23,6 @@ #include #include -#include "qemu-common.h" #include "exec/cpu-common.h" #ifndef CONFIG_USER_ONLY #include "exec/hwaddr.h" diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h index 760804cc46..4dcd4f9b63 100644 --- a/include/hw/arm/arm.h +++ b/include/hw/arm/arm.h @@ -14,6 +14,7 @@ #include "exec/memory.h" #include "hw/irq.h" #include "qemu/notify.h" +#include "cpu.h" /* armv7m.c */ qemu_irq *armv7m_init(MemoryRegion *system_memory, int mem_size, int num_irq, diff --git a/include/qemu-common.h b/include/qemu-common.h index 5be3cddfa9..237d6547b3 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -15,6 +15,7 @@ #include "qemu/compiler.h" #include "config-host.h" #include "qemu/typedefs.h" +#include "qemu/fprintf-fn.h" #if defined(__arm__) || defined(__sparc__) || defined(__mips__) || defined(__hppa__) || defined(__ia64__) #define WORDS_ALIGNED @@ -85,9 +86,6 @@ # error Unknown pointer size #endif -typedef int (*fprintf_function)(FILE *f, const char *fmt, ...) 
- GCC_FMT_ATTR(2, 3); - #ifdef _WIN32 #define fsync _commit #if !defined(lseek) diff --git a/include/qemu/fprintf-fn.h b/include/qemu/fprintf-fn.h new file mode 100644 index 0000000000..9ddc90f1c5 --- /dev/null +++ b/include/qemu/fprintf-fn.h @@ -0,0 +1,17 @@ +/* + * Typedef for fprintf-alike function pointers. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_FPRINTF_FN_H +#define QEMU_FPRINTF_FN_H 1 + +#include "qemu/compiler.h" +#include + +typedef int (*fprintf_function)(FILE *f, const char *fmt, ...) + GCC_FMT_ATTR(2, 3); + +#endif diff --git a/target-s390x/mmu_helper.c b/target-s390x/mmu_helper.c index 815ff42dde..1ea6d812c2 100644 --- a/target-s390x/mmu_helper.c +++ b/target-s390x/mmu_helper.c @@ -17,8 +17,8 @@ #include "qemu/error-report.h" #include "exec/address-spaces.h" -#include "sysemu/kvm.h" #include "cpu.h" +#include "sysemu/kvm.h" /* #define DEBUG_S390 */ /* #define DEBUG_S390_PTE */ From b242e0e0e2969c044a318e56f7988bbd84de1f63 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 4 Jul 2015 00:24:51 +0200 Subject: [PATCH 19/19] exec: skip MMIO regions correctly in cpu_physical_memory_write_rom_internal Loading the BIOS in the mac99 machine is interesting, because there is a PROM in the middle of the BIOS region (from 16K to 32K). Before memory region accesses were clamped, when QEMU was asked to load a BIOS from 0xfff00000 to 0xffffffff it would put even those 16K from the BIOS file into the region. This is weird because those 16K were not actually visible between 0xfff04000 and 0xfff07fff. However, it worked. After clamping was added, this also worked. In this case, the cpu_physical_memory_write_rom_internal function split the write in three parts: the first 16K were copied, the PROM area (second 16K) were ignored, then the rest was copied. Problems then started with commit 965eb2f (exec: do not clamp accesses to MMIO regions, 2015-06-17). Clamping accesses is not done for MMIO regions because they can overlap wildly, and MMIO registers can be expected to perform full-width accesses based only on their address (with no respect for adjacent registers that could decode to completely different MemoryRegions). However, this lack of clamping also applied to the PROM area! cpu_physical_memory_write_rom_internal thus failed to copy the third range above, i.e. only copied the first 16K of the BIOS. In effect, address_space_translate is expecting _something else_ to do the clamping for MMIO regions if the incoming length is large. This "something else" is memory_access_size in the case of address_space_rw, so use the same logic in cpu_physical_memory_write_rom_internal. Reported-by: Alexander Graf Reviewed-by: Laurent Vivier Tested-by: Laurent Vivier Fixes: 965eb2f Signed-off-by: Paolo Bonzini --- exec.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/exec.c b/exec.c index 3457f7e828..251dc79e10 100644 --- a/exec.c +++ b/exec.c @@ -353,6 +353,18 @@ address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *x *xlat = addr + section->offset_within_region; mr = section->mr; + + /* MMIO registers can be expected to perform full-width accesses based only + * on their address, without considering adjacent registers that could + * decode to completely different MemoryRegions. When such registers + * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO + * regions overlap wildly. 
For this reason we cannot clamp the accesses + * here. + * + * If the length is small (as is the case for address_space_ldl/stl), + * everything works fine. If the incoming length is large, however, + * the caller really has to do the clamping through memory_access_size. + */ if (memory_region_is_ram(mr)) { diff = int128_sub(section->size, int128_make64(addr)); *plen = int128_get64(int128_min(diff, int128_make64(*plen))); @@ -2491,7 +2503,7 @@ static inline void cpu_physical_memory_write_rom_internal(AddressSpace *as, if (!(memory_region_is_ram(mr) || memory_region_is_romd(mr))) { - /* do nothing */ + l = memory_access_size(mr, l, addr1); } else { addr1 += memory_region_get_ram_addr(mr); /* ROM/RAM case */
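To see what the one-line change buys, here is a worked trace with the mac99 numbers from the commit message (assuming memory_access_size() limits accesses to the PROM region to its maximum access width):

    ROM load 0xfff00000..0xffffffff:
      0xfff00000..0xfff03fff  RAM/ROM    copied (first 16K)
      0xfff04000..0xfff07fff  PROM MMIO  l clamped by memory_access_size(),
                                         stepped over in small increments
      0xfff08000..0xffffffff  RAM/ROM    copied again; previously lost, because
                                         with "do nothing" l kept the whole
                                         remaining length and the loop skipped
                                         straight past the end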