From 1aab16c28a0232d898d6f56f5a56019472296ee7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 27 Jan 2017 11:25:33 +0100 Subject: [PATCH 01/21] cpu-exec: unify icount_decr and tcg_exit_req The icount interrupt flag and tcg_exit_req serve almost the same purpose, let's make them completely the same. The former TB_EXIT_REQUESTED and TB_EXIT_ICOUNT_EXPIRED cases are unified, since we can distinguish them from the value of the interrupt flag. Signed-off-by: Paolo Bonzini --- cpu-exec.c | 70 ++++++++++++++++++--------------------- include/exec/gen-icount.h | 53 ++++++++++++++--------------- include/qom/cpu.h | 15 ++++----- qom/cpu.c | 2 +- tcg/tcg.h | 1 - translate-all.c | 2 +- translate-common.c | 13 +++----- 7 files changed, 71 insertions(+), 85 deletions(-) diff --git a/cpu-exec.c b/cpu-exec.c index 142a5862fc..8f094c8e15 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -185,12 +185,6 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb) cc->set_pc(cpu, last_tb->pc); } } - if (tb_exit == TB_EXIT_REQUESTED) { - /* We were asked to stop executing TBs (probably a pending - * interrupt. We've now stopped, so clear the flag. - */ - atomic_set(&cpu->tcg_exit_req, 0); - } return ret; } @@ -537,6 +531,7 @@ static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb, SyncClocks *sc) { uintptr_t ret; + int32_t insns_left; if (unlikely(atomic_read(&cpu->exit_request))) { return; @@ -546,8 +541,15 @@ static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb, ret = cpu_tb_exec(cpu, tb); tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK); *tb_exit = ret & TB_EXIT_MASK; - switch (*tb_exit) { - case TB_EXIT_REQUESTED: + if (*tb_exit != TB_EXIT_REQUESTED) { + *last_tb = tb; + return; + } + + *last_tb = NULL; + insns_left = atomic_read(&cpu->icount_decr.u32); + atomic_set(&cpu->icount_decr.u16.high, 0); + if (insns_left < 0) { /* Something asked us to stop executing * chained TBs; just continue round the main * loop. Whatever requested the exit will also @@ -559,38 +561,30 @@ static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb, * or cpu->interrupt_request. */ smp_mb(); - *last_tb = NULL; - break; - case TB_EXIT_ICOUNT_EXPIRED: - { - /* Instruction counter expired. */ -#ifdef CONFIG_USER_ONLY - abort(); -#else - int insns_left = cpu->icount_decr.u32; - *last_tb = NULL; - if (cpu->icount_extra && insns_left >= 0) { - /* Refill decrementer and continue execution. */ - cpu->icount_extra += insns_left; - insns_left = MIN(0xffff, cpu->icount_extra); - cpu->icount_extra -= insns_left; - cpu->icount_decr.u16.low = insns_left; - } else { - if (insns_left > 0) { - /* Execute remaining instructions. */ - cpu_exec_nocache(cpu, insns_left, tb, false); - align_clocks(sc, cpu); - } - cpu->exception_index = EXCP_INTERRUPT; - cpu_loop_exit(cpu); + return; + } + + /* Instruction counter expired. */ + assert(use_icount); +#ifndef CONFIG_USER_ONLY + if (cpu->icount_extra) { + /* Refill decrementer and continue execution. */ + cpu->icount_extra += insns_left; + insns_left = MIN(0xffff, cpu->icount_extra); + cpu->icount_extra -= insns_left; + cpu->icount_decr.u16.low = insns_left; + } else { + /* Execute any remaining instructions, then let the main loop + * handle the next event. + */ + if (insns_left > 0) { + cpu_exec_nocache(cpu, insns_left, tb, false); + align_clocks(sc, cpu); } - break; + cpu->exception_index = EXCP_INTERRUPT; + cpu_loop_exit(cpu); + } #endif - } - default: - *last_tb = tb; - break; - } } /* main execution loop */ diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h index 050de59b38..62d462e494 100644 --- a/include/exec/gen-icount.h +++ b/include/exec/gen-icount.h @@ -6,58 +6,55 @@ /* Helpers for instruction counting code generation. */ static int icount_start_insn_idx; -static TCGLabel *icount_label; static TCGLabel *exitreq_label; static inline void gen_tb_start(TranslationBlock *tb) { - TCGv_i32 count, flag, imm; + TCGv_i32 count, imm; exitreq_label = gen_new_label(); - flag = tcg_temp_new_i32(); - tcg_gen_ld_i32(flag, cpu_env, - offsetof(CPUState, tcg_exit_req) - ENV_OFFSET); - tcg_gen_brcondi_i32(TCG_COND_NE, flag, 0, exitreq_label); - tcg_temp_free_i32(flag); - - if (!(tb->cflags & CF_USE_ICOUNT)) { - return; + if (tb->cflags & CF_USE_ICOUNT) { + count = tcg_temp_local_new_i32(); + } else { + count = tcg_temp_new_i32(); } - icount_label = gen_new_label(); - count = tcg_temp_local_new_i32(); tcg_gen_ld_i32(count, cpu_env, -ENV_OFFSET + offsetof(CPUState, icount_decr.u32)); - imm = tcg_temp_new_i32(); - /* We emit a movi with a dummy immediate argument. Keep the insn index - * of the movi so that we later (when we know the actual insn count) - * can update the immediate argument with the actual insn count. */ - icount_start_insn_idx = tcg_op_buf_count(); - tcg_gen_movi_i32(imm, 0xdeadbeef); + if (tb->cflags & CF_USE_ICOUNT) { + imm = tcg_temp_new_i32(); + /* We emit a movi with a dummy immediate argument. Keep the insn index + * of the movi so that we later (when we know the actual insn count) + * can update the immediate argument with the actual insn count. */ + icount_start_insn_idx = tcg_op_buf_count(); + tcg_gen_movi_i32(imm, 0xdeadbeef); - tcg_gen_sub_i32(count, count, imm); - tcg_temp_free_i32(imm); + tcg_gen_sub_i32(count, count, imm); + tcg_temp_free_i32(imm); + } + + tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, exitreq_label); + + if (tb->cflags & CF_USE_ICOUNT) { + tcg_gen_st16_i32(count, cpu_env, + -ENV_OFFSET + offsetof(CPUState, icount_decr.u16.low)); + } - tcg_gen_brcondi_i32(TCG_COND_LT, count, 0, icount_label); - tcg_gen_st16_i32(count, cpu_env, - -ENV_OFFSET + offsetof(CPUState, icount_decr.u16.low)); tcg_temp_free_i32(count); } static void gen_tb_end(TranslationBlock *tb, int num_insns) { - gen_set_label(exitreq_label); - tcg_gen_exit_tb((uintptr_t)tb + TB_EXIT_REQUESTED); - if (tb->cflags & CF_USE_ICOUNT) { /* Update the num_insn immediate parameter now that we know * the actual insn count. */ tcg_set_insn_param(icount_start_insn_idx, 1, num_insns); - gen_set_label(icount_label); - tcg_gen_exit_tb((uintptr_t)tb + TB_EXIT_ICOUNT_EXPIRED); } + gen_set_label(exitreq_label); + tcg_gen_exit_tb((uintptr_t)tb + TB_EXIT_REQUESTED); + /* Terminate the linked list. */ tcg_ctx.gen_op_buf[tcg_ctx.gen_op_buf[0].prev].next = 0; } diff --git a/include/qom/cpu.h b/include/qom/cpu.h index f69b2407ea..1bc3ad230a 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -275,11 +275,11 @@ struct qemu_work_item; * @stopped: Indicates the CPU has been artificially stopped. * @unplug: Indicates a pending CPU unplug request. * @crash_occurred: Indicates the OS reported a crash (panic) for this CPU - * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this - * CPU and return to its top level loop. * @singlestep_enabled: Flags for single-stepping. * @icount_extra: Instructions until next timer event. - * @icount_decr: Number of cycles left, with interrupt flag in high bit. + * @icount_decr: Low 16 bits: number of cycles left, only used in icount mode. + * High 16 bits: Set to -1 to force TCG to stop executing linked TBs for this + * CPU and return to its top level loop (even in non-icount mode). * This allows a single read-compare-cbranch-write sequence to test * for both decrementer underflow and exceptions. * @can_do_io: Nonzero if memory-mapped IO is safe. Deterministic execution @@ -381,10 +381,6 @@ struct CPUState { /* TODO Move common fields from CPUArchState here. */ int cpu_index; /* used by alpha TCG */ uint32_t halted; /* used by alpha, cris, ppc TCG */ - union { - uint32_t u32; - icount_decr_u16 u16; - } icount_decr; uint32_t can_do_io; int32_t exception_index; /* used by m68k TCG */ @@ -397,7 +393,10 @@ struct CPUState { offset from AREG0. Leave this field at the end so as to make the (absolute value) offset as small as possible. This reduces code size, especially for hosts without large memory offsets. */ - uint32_t tcg_exit_req; + union { + uint32_t u32; + icount_decr_u16 u16; + } icount_decr; bool hax_vcpu_dirty; struct hax_vcpu_state *hax_vcpu; diff --git a/qom/cpu.c b/qom/cpu.c index ed87c50cea..7e005af37a 100644 --- a/qom/cpu.c +++ b/qom/cpu.c @@ -123,7 +123,7 @@ void cpu_exit(CPUState *cpu) atomic_set(&cpu->exit_request, 1); /* Ensure cpu_exec will see the exit request after TCG has exited. */ smp_wmb(); - atomic_set(&cpu->tcg_exit_req, 1); + atomic_set(&cpu->icount_decr.u16.high, -1); } int cpu_write_elf32_qemunote(WriteCoreDumpFunction f, CPUState *cpu, diff --git a/tcg/tcg.h b/tcg/tcg.h index 631c6f69b1..167aa30254 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ -1108,7 +1108,6 @@ static inline unsigned get_mmuidx(TCGMemOpIdx oi) #define TB_EXIT_MASK 3 #define TB_EXIT_IDX0 0 #define TB_EXIT_IDX1 1 -#define TB_EXIT_ICOUNT_EXPIRED 2 #define TB_EXIT_REQUESTED 3 #ifdef HAVE_TCG_QEMU_TB_EXEC diff --git a/translate-all.c b/translate-all.c index 5f44ec844e..1a21e3fb1f 100644 --- a/translate-all.c +++ b/translate-all.c @@ -1958,7 +1958,7 @@ void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf) void cpu_interrupt(CPUState *cpu, int mask) { cpu->interrupt_request |= mask; - cpu->tcg_exit_req = 1; + cpu->icount_decr.u16.high = -1; } /* diff --git a/translate-common.c b/translate-common.c index 5e989cdf70..77762fd86c 100644 --- a/translate-common.c +++ b/translate-common.c @@ -43,14 +43,11 @@ static void tcg_handle_interrupt(CPUState *cpu, int mask) return; } - if (use_icount) { - cpu->icount_decr.u16.high = 0xffff; - if (!cpu->can_do_io - && (mask & ~old_mask) != 0) { - cpu_abort(cpu, "Raised interrupt while not in I/O function"); - } - } else { - cpu->tcg_exit_req = 1; + cpu->icount_decr.u16.high = -1; + if (use_icount && + !cpu->can_do_io + && (mask & ~old_mask) != 0) { + cpu_abort(cpu, "Raised interrupt while not in I/O function"); } } From cfb2d02be9413d45b30ed6d8e38800250b6b4b48 Mon Sep 17 00:00:00 2001 From: Pavel Dovgalyuk Date: Tue, 7 Feb 2017 09:54:57 +0300 Subject: [PATCH 02/21] replay: check icount in cpu exec loop This patch adds check to break cpu loop when icount expires without setting the TB_EXIT_ICOUNT_EXPIRED flag. It happens when there is no available translated blocks and all instructions were executed. In icount replay mode unnecessary tb_find will be called (which may cause an exception) and execution will be non-deterministic. Because cpu_loop_exec_tb cannot longjmp anymore, we can remove the anticipated call to align_clocks in cpu_loop_exec_tb, as well as the SyncClocks *sc argument. Signed-off-by: Pavel Dovgalyuk Message-Id: <002801d2810f$18809c20$4981d460$@ru> Signed-off-by: Paolo Bonzini Signed-off-by: Pavel Dovgalyuk --- cpu-exec.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cpu-exec.c b/cpu-exec.c index 8f094c8e15..7ddf66cf23 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -517,7 +517,10 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, *last_tb = NULL; } } - if (unlikely(atomic_read(&cpu->exit_request) || replay_has_interrupt())) { + + /* Finally, check if we need to exit to the main loop. */ + if (unlikely(atomic_read(&cpu->exit_request) + || (use_icount && cpu->icount_decr.u16.low + cpu->icount_extra == 0))) { atomic_set(&cpu->exit_request, 0); cpu->exception_index = EXCP_INTERRUPT; return true; @@ -527,8 +530,7 @@ static inline bool cpu_handle_interrupt(CPUState *cpu, } static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb, - TranslationBlock **last_tb, int *tb_exit, - SyncClocks *sc) + TranslationBlock **last_tb, int *tb_exit) { uintptr_t ret; int32_t insns_left; @@ -579,10 +581,7 @@ static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb, */ if (insns_left > 0) { cpu_exec_nocache(cpu, insns_left, tb, false); - align_clocks(sc, cpu); } - cpu->exception_index = EXCP_INTERRUPT; - cpu_loop_exit(cpu); } #endif } @@ -593,7 +592,7 @@ int cpu_exec(CPUState *cpu) { CPUClass *cc = CPU_GET_CLASS(cpu); int ret; - SyncClocks sc; + SyncClocks sc = { 0 }; /* replay_interrupt may need current_cpu */ current_cpu = cpu; @@ -643,7 +642,7 @@ int cpu_exec(CPUState *cpu) while (!cpu_handle_interrupt(cpu, &last_tb)) { TranslationBlock *tb = tb_find(cpu, last_tb, tb_exit); - cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit, &sc); + cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit); /* Try to align the host and virtual clocks if the guest is in advance */ align_clocks(&sc, cpu); From 55ac0a9bf4e1b1adfc7d73586a7aa085f58c9851 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 Feb 2017 16:26:47 +0100 Subject: [PATCH 03/21] cpu-exec: remove unnecessary check of cpu->exit_request The cpu->exit_request check in cpu_loop_exec_tb is unnecessary, because cpu->tcg_exit_req is always set after cpu->exit_request. So let the TB exit and we will pick up the exit request later in cpu_handle_interrupt. Signed-off-by: Paolo Bonzini --- cpu-exec.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpu-exec.c b/cpu-exec.c index 7ddf66cf23..7db959c821 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -535,10 +535,6 @@ static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb, uintptr_t ret; int32_t insns_left; - if (unlikely(atomic_read(&cpu->exit_request))) { - return; - } - trace_exec_tb(tb, tb->pc); ret = cpu_tb_exec(cpu, tb); tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK); From d98d407234713d05b77114237f839c43a8152089 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Feb 2017 13:22:12 +0100 Subject: [PATCH 04/21] cpus: remove ugly cast on sigbus_handler The cast is there because sigbus_handler is invoked via sigfd_handler. But it feels just wrong to use struct qemu_signalfd_siginfo in the prototype of a function that is passed to sigaction. Instead, do a simple-minded conversion of qemu_signalfd_siginfo to siginfo_t. Signed-off-by: Paolo Bonzini --- cpus.c | 12 +++--------- include/qemu/compatfd.h | 42 ----------------------------------------- include/qemu/osdep.h | 28 +++++++++++++++++++++++++++ util/compatfd.c | 1 - util/main-loop.c | 5 +---- util/oslib-posix.c | 33 ++++++++++++++++++++++++++++++++ 6 files changed, 65 insertions(+), 56 deletions(-) delete mode 100644 include/qemu/compatfd.h diff --git a/cpus.c b/cpus.c index 8200ac6b75..a628cde232 100644 --- a/cpus.c +++ b/cpus.c @@ -51,10 +51,6 @@ #include "hw/nmi.h" #include "sysemu/replay.h" -#ifndef _WIN32 -#include "qemu/compatfd.h" -#endif - #ifdef CONFIG_LINUX #include @@ -924,11 +920,9 @@ static void sigbus_reraise(void) abort(); } -static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo, - void *ctx) +static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx) { - if (kvm_on_sigbus(siginfo->ssi_code, - (void *)(intptr_t)siginfo->ssi_addr)) { + if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) { sigbus_reraise(); } } @@ -939,7 +933,7 @@ static void qemu_init_sigbus(void) memset(&action, 0, sizeof(action)); action.sa_flags = SA_SIGINFO; - action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler; + action.sa_sigaction = sigbus_handler; sigaction(SIGBUS, &action, NULL); prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0); diff --git a/include/qemu/compatfd.h b/include/qemu/compatfd.h deleted file mode 100644 index aa12ee9364..0000000000 --- a/include/qemu/compatfd.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * signalfd/eventfd compatibility - * - * Copyright IBM, Corp. 2008 - * - * Authors: - * Anthony Liguori - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#ifndef QEMU_COMPATFD_H -#define QEMU_COMPATFD_H - - -struct qemu_signalfd_siginfo { - uint32_t ssi_signo; /* Signal number */ - int32_t ssi_errno; /* Error number (unused) */ - int32_t ssi_code; /* Signal code */ - uint32_t ssi_pid; /* PID of sender */ - uint32_t ssi_uid; /* Real UID of sender */ - int32_t ssi_fd; /* File descriptor (SIGIO) */ - uint32_t ssi_tid; /* Kernel timer ID (POSIX timers) */ - uint32_t ssi_band; /* Band event (SIGIO) */ - uint32_t ssi_overrun; /* POSIX timer overrun count */ - uint32_t ssi_trapno; /* Trap number that caused signal */ - int32_t ssi_status; /* Exit status or signal (SIGCHLD) */ - int32_t ssi_int; /* Integer sent by sigqueue(2) */ - uint64_t ssi_ptr; /* Pointer sent by sigqueue(2) */ - uint64_t ssi_utime; /* User CPU time consumed (SIGCHLD) */ - uint64_t ssi_stime; /* System CPU time consumed (SIGCHLD) */ - uint64_t ssi_addr; /* Address that generated signal - (for hardware-generated signals) */ - uint8_t pad[48]; /* Pad size to 128 bytes (allow for - additional fields in the future) */ -}; - -int qemu_signalfd(const sigset_t *mask); - -#endif diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 56c9e22405..6932709e4e 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -297,6 +297,34 @@ void qemu_anon_ram_free(void *ptr, size_t size); # define QEMU_VMALLOC_ALIGN getpagesize() #endif +#ifdef CONFIG_POSIX +struct qemu_signalfd_siginfo { + uint32_t ssi_signo; /* Signal number */ + int32_t ssi_errno; /* Error number (unused) */ + int32_t ssi_code; /* Signal code */ + uint32_t ssi_pid; /* PID of sender */ + uint32_t ssi_uid; /* Real UID of sender */ + int32_t ssi_fd; /* File descriptor (SIGIO) */ + uint32_t ssi_tid; /* Kernel timer ID (POSIX timers) */ + uint32_t ssi_band; /* Band event (SIGIO) */ + uint32_t ssi_overrun; /* POSIX timer overrun count */ + uint32_t ssi_trapno; /* Trap number that caused signal */ + int32_t ssi_status; /* Exit status or signal (SIGCHLD) */ + int32_t ssi_int; /* Integer sent by sigqueue(2) */ + uint64_t ssi_ptr; /* Pointer sent by sigqueue(2) */ + uint64_t ssi_utime; /* User CPU time consumed (SIGCHLD) */ + uint64_t ssi_stime; /* System CPU time consumed (SIGCHLD) */ + uint64_t ssi_addr; /* Address that generated signal + (for hardware-generated signals) */ + uint8_t pad[48]; /* Pad size to 128 bytes (allow for + additional fields in the future) */ +}; + +int qemu_signalfd(const sigset_t *mask); +void sigaction_invoke(struct sigaction *action, + struct qemu_signalfd_siginfo *info); +#endif + int qemu_madvise(void *addr, size_t len, int advice); int qemu_open(const char *name, int flags, ...); diff --git a/util/compatfd.c b/util/compatfd.c index 9a43042ae6..980bd33e52 100644 --- a/util/compatfd.c +++ b/util/compatfd.c @@ -15,7 +15,6 @@ #include "qemu/osdep.h" #include "qemu-common.h" -#include "qemu/compatfd.h" #include "qemu/thread.h" #include diff --git a/util/main-loop.c b/util/main-loop.c index ad10bca211..ca7bb072f9 100644 --- a/util/main-loop.c +++ b/util/main-loop.c @@ -34,8 +34,6 @@ #ifndef _WIN32 -#include "qemu/compatfd.h" - /* If we have signalfd, we mask out the signals we want to handle and then * use signalfd to listen for them. We rely on whatever the current signal * handler is to dispatch the signals when we receive them. @@ -63,8 +61,7 @@ static void sigfd_handler(void *opaque) sigaction(info.ssi_signo, NULL, &action); if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) { - action.sa_sigaction(info.ssi_signo, - (siginfo_t *)&info, NULL); + sigaction_invoke(&action, &info); } else if (action.sa_handler) { action.sa_handler(info.ssi_signo); } diff --git a/util/oslib-posix.c b/util/oslib-posix.c index f63146407f..cd686aae3d 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -603,3 +603,36 @@ void qemu_free_stack(void *stack, size_t sz) munmap(stack, sz); } + +void sigaction_invoke(struct sigaction *action, + struct qemu_signalfd_siginfo *info) +{ + siginfo_t si = { 0 }; + si.si_signo = info->ssi_signo; + si.si_errno = info->ssi_errno; + si.si_code = info->ssi_code; + + /* Convert the minimal set of fields defined by POSIX. + * Positive si_code values are reserved for kernel-generated + * signals, where the valid siginfo fields are determined by + * the signal number. But according to POSIX, it is unspecified + * whether SI_USER and SI_QUEUE have values less than or equal to + * zero. + */ + if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE || + info->ssi_code <= 0) { + /* SIGTERM, etc. */ + si.si_pid = info->ssi_pid; + si.si_uid = info->ssi_uid; + } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE || + info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) { + si.si_addr = (void *)(uintptr_t)info->ssi_addr; + } else if (info->ssi_signo == SIGCHLD) { + si.si_pid = info->ssi_pid; + si.si_status = info->ssi_status; + si.si_uid = info->ssi_uid; + } else if (info->ssi_signo == SIGIO) { + si.si_band = info->ssi_band; + } + action->sa_sigaction(info->ssi_signo, &si, NULL); +} From 20e0ff59a96e72dc6785be31db23ed7030781d45 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Feb 2017 14:25:49 +0100 Subject: [PATCH 05/21] KVM: x86: cleanup SIGBUS handlers This patch should have no semantic change. Signed-off-by: Paolo Bonzini --- target/i386/kvm.c | 87 ++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 27fd0505df..0c48dfdae5 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -469,31 +469,34 @@ int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) ram_addr_t ram_addr; hwaddr paddr; - if ((env->mcg_cap & MCG_SER_P) && addr - && (code == BUS_MCEERR_AR || code == BUS_MCEERR_AO)) { - ram_addr = qemu_ram_addr_from_host(addr); - if (ram_addr == RAM_ADDR_INVALID || - !kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { - fprintf(stderr, "Hardware memory error for memory used by " - "QEMU itself instead of guest system!\n"); - /* Hope we are lucky for AO MCE */ - if (code == BUS_MCEERR_AO) { - return 0; - } else { - hardware_memory_error(); - } - } - kvm_hwpoison_page_add(ram_addr); - kvm_mce_inject(cpu, paddr, code); - } else { - if (code == BUS_MCEERR_AO) { - return 0; - } else if (code == BUS_MCEERR_AR) { - hardware_memory_error(); - } else { - return 1; - } + if (code != BUS_MCEERR_AR && code != BUS_MCEERR_AO) { + return 1; } + + /* Because the MCE happened while running the VCPU, KVM could have + * injected action required MCEs too. Action optional MCEs should + * be delivered to the main thread, which qemu_init_sigbus identifies + * as the "early kill" thread, but if we get one for whatever reason + * we just handle it just like the main thread would. + */ + if ((env->mcg_cap & MCG_SER_P) && addr) { + ram_addr = qemu_ram_addr_from_host(addr); + if (ram_addr != RAM_ADDR_INVALID && + kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { + kvm_hwpoison_page_add(ram_addr); + kvm_mce_inject(cpu, paddr, code); + return 0; + } + + fprintf(stderr, "Hardware memory error for memory used by " + "QEMU itself instead of guest system!\n"); + } + + if (code == BUS_MCEERR_AR) { + hardware_memory_error(); + } + + /* Hope we are lucky for AO MCE */ return 0; } @@ -501,29 +504,29 @@ int kvm_arch_on_sigbus(int code, void *addr) { X86CPU *cpu = X86_CPU(first_cpu); - if ((cpu->env.mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) { + if (code != BUS_MCEERR_AR && code != BUS_MCEERR_AO) { + return 1; + } + + if (code == BUS_MCEERR_AR) { + hardware_memory_error(); + } + + /* Hope we are lucky for AO MCE */ + if ((cpu->env.mcg_cap & MCG_SER_P) && addr) { ram_addr_t ram_addr; hwaddr paddr; - /* Hope we are lucky for AO MCE */ ram_addr = qemu_ram_addr_from_host(addr); - if (ram_addr == RAM_ADDR_INVALID || - !kvm_physical_memory_addr_from_host(first_cpu->kvm_state, - addr, &paddr)) { - fprintf(stderr, "Hardware memory error for memory used by " - "QEMU itself instead of guest system!: %p\n", addr); - return 0; - } - kvm_hwpoison_page_add(ram_addr); - kvm_mce_inject(X86_CPU(first_cpu), paddr, code); - } else { - if (code == BUS_MCEERR_AO) { - return 0; - } else if (code == BUS_MCEERR_AR) { - hardware_memory_error(); - } else { - return 1; + if (ram_addr != RAM_ADDR_INVALID && + kvm_physical_memory_addr_from_host(first_cpu->kvm_state, + addr, &paddr)) { + kvm_hwpoison_page_add(ram_addr); + kvm_mce_inject(X86_CPU(first_cpu), paddr, code); } + + fprintf(stderr, "Hardware memory error for memory used by " + "QEMU itself instead of guest system!: %p\n", addr); } return 0; } From a16fc07ebd58da51d5e1c2928069879c40a26f59 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Feb 2017 09:50:02 +0100 Subject: [PATCH 06/21] cpus: reorganize signal handling code Move the KVM "eat signals" code under CONFIG_LINUX, in preparation for moving it to kvm-all.c; reraise non-MCE SIGBUS immediately, without passing it to KVM. Signed-off-by: Paolo Bonzini --- cpus.c | 105 ++++++++++++++++++++++--------------------- include/qemu/osdep.h | 9 ++++ target/i386/kvm.c | 15 +------ 3 files changed, 64 insertions(+), 65 deletions(-) diff --git a/cpus.c b/cpus.c index a628cde232..399e2713b8 100644 --- a/cpus.c +++ b/cpus.c @@ -922,6 +922,10 @@ static void sigbus_reraise(void) static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx) { + if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) { + sigbus_reraise(); + } + if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) { sigbus_reraise(); } @@ -939,55 +943,6 @@ static void qemu_init_sigbus(void) prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0); } -static void qemu_kvm_eat_signals(CPUState *cpu) -{ - struct timespec ts = { 0, 0 }; - siginfo_t siginfo; - sigset_t waitset; - sigset_t chkset; - int r; - - sigemptyset(&waitset); - sigaddset(&waitset, SIG_IPI); - sigaddset(&waitset, SIGBUS); - - do { - r = sigtimedwait(&waitset, &siginfo, &ts); - if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { - perror("sigtimedwait"); - exit(1); - } - - switch (r) { - case SIGBUS: - if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) { - sigbus_reraise(); - } - break; - default: - break; - } - - r = sigpending(&chkset); - if (r == -1) { - perror("sigpending"); - exit(1); - } - } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS)); -} - -#else /* !CONFIG_LINUX */ - -static void qemu_init_sigbus(void) -{ -} - -static void qemu_kvm_eat_signals(CPUState *cpu) -{ -} -#endif /* !CONFIG_LINUX */ - -#ifndef _WIN32 static void dummy_signal(int sig) { } @@ -1012,12 +967,58 @@ static void qemu_kvm_init_cpu_signals(CPUState *cpu) } } -#else /* _WIN32 */ +static void qemu_kvm_eat_signals(CPUState *cpu) +{ + struct timespec ts = { 0, 0 }; + siginfo_t siginfo; + sigset_t waitset; + sigset_t chkset; + int r; + + sigemptyset(&waitset); + sigaddset(&waitset, SIG_IPI); + sigaddset(&waitset, SIGBUS); + + do { + r = sigtimedwait(&waitset, &siginfo, &ts); + if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { + perror("sigtimedwait"); + exit(1); + } + + switch (r) { + case SIGBUS: + if (siginfo.si_code != BUS_MCEERR_AO && siginfo.si_code != BUS_MCEERR_AR) { + sigbus_reraise(); + } + if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) { + sigbus_reraise(); + } + break; + default: + break; + } + + r = sigpending(&chkset); + if (r == -1) { + perror("sigpending"); + exit(1); + } + } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS)); +} +#else /* !CONFIG_LINUX */ +static void qemu_init_sigbus(void) +{ +} + +static void qemu_kvm_eat_signals(CPUState *cpu) +{ +} + static void qemu_kvm_init_cpu_signals(CPUState *cpu) { - abort(); } -#endif /* _WIN32 */ +#endif /* !CONFIG_LINUX */ static QemuMutex qemu_global_mutex; diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 6932709e4e..af37195fef 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -284,6 +284,15 @@ void qemu_anon_ram_free(void *ptr, size_t size); #endif +#if defined(CONFIG_LINUX) +#ifndef BUS_MCEERR_AR +#define BUS_MCEERR_AR 4 +#endif +#ifndef BUS_MCEERR_AO +#define BUS_MCEERR_AO 5 +#endif +#endif + #if defined(__linux__) && \ (defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)) /* Use 2 MiB alignment so transparent hugepages can be used by KVM. diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 0c48dfdae5..f49a786c98 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -64,13 +64,6 @@ * 255 kvm_msr_entry structs */ #define MSR_BUF_SIZE 4096 -#ifndef BUS_MCEERR_AR -#define BUS_MCEERR_AR 4 -#endif -#ifndef BUS_MCEERR_AO -#define BUS_MCEERR_AO 5 -#endif - const KVMCapabilityInfo kvm_arch_required_capabilities[] = { KVM_CAP_INFO(SET_TSS_ADDR), KVM_CAP_INFO(EXT_CPUID), @@ -469,9 +462,7 @@ int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) ram_addr_t ram_addr; hwaddr paddr; - if (code != BUS_MCEERR_AR && code != BUS_MCEERR_AO) { - return 1; - } + assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); /* Because the MCE happened while running the VCPU, KVM could have * injected action required MCEs too. Action optional MCEs should @@ -504,9 +495,7 @@ int kvm_arch_on_sigbus(int code, void *addr) { X86CPU *cpu = X86_CPU(first_cpu); - if (code != BUS_MCEERR_AR && code != BUS_MCEERR_AO) { - return 1; - } + assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); if (code == BUS_MCEERR_AR) { hardware_memory_error(); From 4d39892cca86a9162beaa3944057d118ef42edcd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Feb 2017 10:04:34 +0100 Subject: [PATCH 07/21] KVM: remove kvm_arch_on_sigbus Build it on kvm_arch_on_sigbus_vcpu instead. They do the same for "action optional" SIGBUSes, and the main thread should never get "action required" SIGBUSes because it blocks the signal. Signed-off-by: Paolo Bonzini --- include/sysemu/kvm.h | 1 - kvm-all.c | 9 ++++++++- target/arm/kvm.c | 5 ----- target/i386/kvm.c | 40 +++++----------------------------------- target/mips/kvm.c | 6 ------ target/ppc/kvm.c | 5 ----- target/s390x/kvm.c | 5 ----- 7 files changed, 13 insertions(+), 58 deletions(-) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 3045ee7678..6ecb61cdef 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -358,7 +358,6 @@ bool kvm_vcpu_id_is_valid(int vcpu_id); unsigned long kvm_arch_vcpu_id(CPUState *cpu); int kvm_arch_on_sigbus_vcpu(CPUState *cpu, int code, void *addr); -int kvm_arch_on_sigbus(int code, void *addr); void kvm_arch_init_irq_routing(KVMState *s); diff --git a/kvm-all.c b/kvm-all.c index 0c94637c46..a433ad3090 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -2391,6 +2391,7 @@ int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) return r; } + int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) { return kvm_arch_on_sigbus_vcpu(cpu, code, addr); @@ -2398,7 +2399,13 @@ int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) int kvm_on_sigbus(int code, void *addr) { - return kvm_arch_on_sigbus(code, addr); + /* Action required MCE kills the process if SIGBUS is blocked. Because + * that's what happens in the I/O thread, where we handle MCE via signalfd, + * we can only get action optional here. + */ + assert(code != BUS_MCEERR_AR); + kvm_arch_on_sigbus_vcpu(first_cpu, code, addr); + return 0; } int kvm_create_device(KVMState *s, uint64_t type, bool test) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 395e986973..e5218f6e5d 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -565,11 +565,6 @@ int kvm_arch_on_sigbus_vcpu(CPUState *cs, int code, void *addr) return 1; } -int kvm_arch_on_sigbus(int code, void *addr) -{ - return 1; -} - /* The #ifdef protections are until 32bit headers are imported and can * be removed once both 32 and 64 bit reach feature parity. */ diff --git a/target/i386/kvm.c b/target/i386/kvm.c index f49a786c98..2adf992c84 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -462,14 +462,13 @@ int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) ram_addr_t ram_addr; hwaddr paddr; + /* If we get an action required MCE, it has been injected by KVM + * while the VM was running. An action optional MCE instead should + * be coming from the main thread, which qemu_init_sigbus identifies + * as the "early kill" thread. + */ assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); - /* Because the MCE happened while running the VCPU, KVM could have - * injected action required MCEs too. Action optional MCEs should - * be delivered to the main thread, which qemu_init_sigbus identifies - * as the "early kill" thread, but if we get one for whatever reason - * we just handle it just like the main thread would. - */ if ((env->mcg_cap & MCG_SER_P) && addr) { ram_addr = qemu_ram_addr_from_host(addr); if (ram_addr != RAM_ADDR_INVALID && @@ -491,35 +490,6 @@ int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) return 0; } -int kvm_arch_on_sigbus(int code, void *addr) -{ - X86CPU *cpu = X86_CPU(first_cpu); - - assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); - - if (code == BUS_MCEERR_AR) { - hardware_memory_error(); - } - - /* Hope we are lucky for AO MCE */ - if ((cpu->env.mcg_cap & MCG_SER_P) && addr) { - ram_addr_t ram_addr; - hwaddr paddr; - - ram_addr = qemu_ram_addr_from_host(addr); - if (ram_addr != RAM_ADDR_INVALID && - kvm_physical_memory_addr_from_host(first_cpu->kvm_state, - addr, &paddr)) { - kvm_hwpoison_page_add(ram_addr); - kvm_mce_inject(X86_CPU(first_cpu), paddr, code); - } - - fprintf(stderr, "Hardware memory error for memory used by " - "QEMU itself instead of guest system!: %p\n", addr); - } - return 0; -} - static int kvm_inject_mce_oldstyle(X86CPU *cpu) { CPUX86State *env = &cpu->env; diff --git a/target/mips/kvm.c b/target/mips/kvm.c index 998c3412c3..3e686e73a5 100644 --- a/target/mips/kvm.c +++ b/target/mips/kvm.c @@ -186,12 +186,6 @@ int kvm_arch_on_sigbus_vcpu(CPUState *cs, int code, void *addr) return 1; } -int kvm_arch_on_sigbus(int code, void *addr) -{ - DPRINTF("%s\n", __func__); - return 1; -} - void kvm_arch_init_irq_routing(KVMState *s) { } diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index acc40ece65..75598cd779 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -2587,11 +2587,6 @@ int kvm_arch_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) return 1; } -int kvm_arch_on_sigbus(int code, void *addr) -{ - return 1; -} - void kvm_arch_init_irq_routing(KVMState *s) { } diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c index 5ec050cf89..e7eea6ddb6 100644 --- a/target/s390x/kvm.c +++ b/target/s390x/kvm.c @@ -2145,11 +2145,6 @@ int kvm_arch_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) return 1; } -int kvm_arch_on_sigbus(int code, void *addr) -{ - return 1; -} - void kvm_s390_io_interrupt(uint16_t subchannel_id, uint16_t subchannel_nr, uint32_t io_int_parm, uint32_t io_int_word) From 2ae41db262e02743b27719fe085e749d957613c0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Feb 2017 12:48:54 +0100 Subject: [PATCH 08/21] KVM: do not use sigtimedwait to catch SIGBUS Call kvm_on_sigbus_vcpu asynchronously from the VCPU thread. Information for the SIGBUS can be stored in thread-local variables and processed later in kvm_cpu_exec. Signed-off-by: Paolo Bonzini --- cpus.c | 31 +++++++++++++------------------ include/sysemu/kvm.h | 5 ++++- kvm-all.c | 35 ++++++++++++++++++++++++++++++++++- target/arm/kvm.c | 5 ----- target/i386/kvm.c | 5 ++--- target/mips/kvm.c | 6 ------ target/ppc/kvm.c | 5 ----- target/s390x/kvm.c | 5 ----- 8 files changed, 53 insertions(+), 44 deletions(-) diff --git a/cpus.c b/cpus.c index 399e2713b8..56b1338c87 100644 --- a/cpus.c +++ b/cpus.c @@ -926,8 +926,16 @@ static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx) sigbus_reraise(); } - if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) { - sigbus_reraise(); + if (current_cpu) { + /* Called asynchronously in VCPU thread. */ + if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) { + sigbus_reraise(); + } + } else { + /* Called synchronously (via signalfd) in main thread. */ + if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) { + sigbus_reraise(); + } } } @@ -958,8 +966,9 @@ static void qemu_kvm_init_cpu_signals(CPUState *cpu) sigaction(SIG_IPI, &sigact, NULL); pthread_sigmask(SIG_BLOCK, NULL, &set); - sigdelset(&set, SIG_IPI); sigdelset(&set, SIGBUS); + pthread_sigmask(SIG_SETMASK, &set, NULL); + sigdelset(&set, SIG_IPI); r = kvm_set_signal_mask(cpu, &set); if (r) { fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r)); @@ -977,7 +986,6 @@ static void qemu_kvm_eat_signals(CPUState *cpu) sigemptyset(&waitset); sigaddset(&waitset, SIG_IPI); - sigaddset(&waitset, SIGBUS); do { r = sigtimedwait(&waitset, &siginfo, &ts); @@ -986,25 +994,12 @@ static void qemu_kvm_eat_signals(CPUState *cpu) exit(1); } - switch (r) { - case SIGBUS: - if (siginfo.si_code != BUS_MCEERR_AO && siginfo.si_code != BUS_MCEERR_AR) { - sigbus_reraise(); - } - if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) { - sigbus_reraise(); - } - break; - default: - break; - } - r = sigpending(&chkset); if (r == -1) { perror("sigpending"); exit(1); } - } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS)); + } while (sigismember(&chkset, SIG_IPI)); } #else /* !CONFIG_LINUX */ static void qemu_init_sigbus(void) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index 6ecb61cdef..a1b019da6f 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -357,7 +357,10 @@ bool kvm_vcpu_id_is_valid(int vcpu_id); /* Returns VCPU ID to be used on KVM_CREATE_VCPU ioctl() */ unsigned long kvm_arch_vcpu_id(CPUState *cpu); -int kvm_arch_on_sigbus_vcpu(CPUState *cpu, int code, void *addr); +#ifdef TARGET_I386 +#define KVM_HAVE_MCE_INJECTION 1 +void kvm_arch_on_sigbus_vcpu(CPUState *cpu, int code, void *addr); +#endif void kvm_arch_init_irq_routing(KVMState *s); diff --git a/kvm-all.c b/kvm-all.c index a433ad3090..0baa193763 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1893,6 +1893,12 @@ void kvm_cpu_synchronize_post_init(CPUState *cpu) run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); } +#ifdef KVM_HAVE_MCE_INJECTION +static __thread void *pending_sigbus_addr; +static __thread int pending_sigbus_code; +static __thread bool have_sigbus_pending; +#endif + int kvm_cpu_exec(CPUState *cpu) { struct kvm_run *run = cpu->kvm_run; @@ -1930,6 +1936,16 @@ int kvm_cpu_exec(CPUState *cpu) attrs = kvm_arch_post_run(cpu, run); +#ifdef KVM_HAVE_MCE_INJECTION + if (unlikely(have_sigbus_pending)) { + qemu_mutex_lock_iothread(); + kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code, + pending_sigbus_addr); + have_sigbus_pending = false; + qemu_mutex_unlock_iothread(); + } +#endif + if (run_ret < 0) { if (run_ret == -EINTR || run_ret == -EAGAIN) { DPRINTF("io window exit\n"); @@ -2392,13 +2408,27 @@ int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) return r; } +/* Called asynchronously in VCPU thread. */ int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) { - return kvm_arch_on_sigbus_vcpu(cpu, code, addr); +#ifdef KVM_HAVE_MCE_INJECTION + if (have_sigbus_pending) { + return 1; + } + have_sigbus_pending = true; + pending_sigbus_addr = addr; + pending_sigbus_code = code; + atomic_set(&cpu->exit_request, 1); + return 0; +#else + return 1; +#endif } +/* Called synchronously (via signalfd) in main thread. */ int kvm_on_sigbus(int code, void *addr) { +#ifdef KVM_HAVE_MCE_INJECTION /* Action required MCE kills the process if SIGBUS is blocked. Because * that's what happens in the I/O thread, where we handle MCE via signalfd, * we can only get action optional here. @@ -2406,6 +2436,9 @@ int kvm_on_sigbus(int code, void *addr) assert(code != BUS_MCEERR_AR); kvm_arch_on_sigbus_vcpu(first_cpu, code, addr); return 0; +#else + return 1; +#endif } int kvm_create_device(KVMState *s, uint64_t type, bool test) diff --git a/target/arm/kvm.c b/target/arm/kvm.c index e5218f6e5d..45554682f2 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -560,11 +560,6 @@ int kvm_arch_process_async_events(CPUState *cs) return 0; } -int kvm_arch_on_sigbus_vcpu(CPUState *cs, int code, void *addr) -{ - return 1; -} - /* The #ifdef protections are until 32bit headers are imported and can * be removed once both 32 and 64 bit reach feature parity. */ diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 2adf992c84..7698421ae7 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -455,7 +455,7 @@ static void hardware_memory_error(void) exit(1); } -int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) +void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) { X86CPU *cpu = X86_CPU(c); CPUX86State *env = &cpu->env; @@ -475,7 +475,7 @@ int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { kvm_hwpoison_page_add(ram_addr); kvm_mce_inject(cpu, paddr, code); - return 0; + return; } fprintf(stderr, "Hardware memory error for memory used by " @@ -487,7 +487,6 @@ int kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) } /* Hope we are lucky for AO MCE */ - return 0; } static int kvm_inject_mce_oldstyle(X86CPU *cpu) diff --git a/target/mips/kvm.c b/target/mips/kvm.c index 3e686e73a5..0982e874bb 100644 --- a/target/mips/kvm.c +++ b/target/mips/kvm.c @@ -180,12 +180,6 @@ bool kvm_arch_stop_on_emulation_error(CPUState *cs) return true; } -int kvm_arch_on_sigbus_vcpu(CPUState *cs, int code, void *addr) -{ - DPRINTF("%s\n", __func__); - return 1; -} - void kvm_arch_init_irq_routing(KVMState *s) { } diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index 75598cd779..03f5097eab 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -2582,11 +2582,6 @@ bool kvm_arch_stop_on_emulation_error(CPUState *cpu) return true; } -int kvm_arch_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) -{ - return 1; -} - void kvm_arch_init_irq_routing(KVMState *s) { } diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c index e7eea6ddb6..ac47154b83 100644 --- a/target/s390x/kvm.c +++ b/target/s390x/kvm.c @@ -2140,11 +2140,6 @@ bool kvm_arch_stop_on_emulation_error(CPUState *cpu) return true; } -int kvm_arch_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) -{ - return 1; -} - void kvm_s390_io_interrupt(uint16_t subchannel_id, uint16_t subchannel_nr, uint32_t io_int_parm, uint32_t io_int_word) From 18268b6016930efe76c77ae590e244d42d9671ea Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Feb 2017 09:41:14 +0100 Subject: [PATCH 09/21] KVM: move SIG_IPI handling to kvm-all.c This lets us remove a bunch of CONFIG_LINUX defines. Signed-off-by: Paolo Bonzini --- cpus.c | 62 +------------------------------------------- include/sysemu/kvm.h | 5 ++-- kvm-all.c | 60 ++++++++++++++++++++++++++++++++++++++---- kvm-stub.c | 12 ++++----- 4 files changed, 63 insertions(+), 76 deletions(-) diff --git a/cpus.c b/cpus.c index 56b1338c87..c857ad2957 100644 --- a/cpus.c +++ b/cpus.c @@ -950,69 +950,10 @@ static void qemu_init_sigbus(void) prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0); } - -static void dummy_signal(int sig) -{ -} - -static void qemu_kvm_init_cpu_signals(CPUState *cpu) -{ - int r; - sigset_t set; - struct sigaction sigact; - - memset(&sigact, 0, sizeof(sigact)); - sigact.sa_handler = dummy_signal; - sigaction(SIG_IPI, &sigact, NULL); - - pthread_sigmask(SIG_BLOCK, NULL, &set); - sigdelset(&set, SIGBUS); - pthread_sigmask(SIG_SETMASK, &set, NULL); - sigdelset(&set, SIG_IPI); - r = kvm_set_signal_mask(cpu, &set); - if (r) { - fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r)); - exit(1); - } -} - -static void qemu_kvm_eat_signals(CPUState *cpu) -{ - struct timespec ts = { 0, 0 }; - siginfo_t siginfo; - sigset_t waitset; - sigset_t chkset; - int r; - - sigemptyset(&waitset); - sigaddset(&waitset, SIG_IPI); - - do { - r = sigtimedwait(&waitset, &siginfo, &ts); - if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { - perror("sigtimedwait"); - exit(1); - } - - r = sigpending(&chkset); - if (r == -1) { - perror("sigpending"); - exit(1); - } - } while (sigismember(&chkset, SIG_IPI)); -} #else /* !CONFIG_LINUX */ static void qemu_init_sigbus(void) { } - -static void qemu_kvm_eat_signals(CPUState *cpu) -{ -} - -static void qemu_kvm_init_cpu_signals(CPUState *cpu) -{ -} #endif /* !CONFIG_LINUX */ static QemuMutex qemu_global_mutex; @@ -1089,7 +1030,6 @@ static void qemu_kvm_wait_io_event(CPUState *cpu) qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex); } - qemu_kvm_eat_signals(cpu); qemu_wait_io_event_common(cpu); } @@ -1112,7 +1052,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg) exit(1); } - qemu_kvm_init_cpu_signals(cpu); + kvm_init_cpu_signals(cpu); /* signal CPU creation */ cpu->created = true; diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index a1b019da6f..24281fc7f8 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -238,9 +238,6 @@ int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, target_ulong len, int type); void kvm_remove_all_breakpoints(CPUState *cpu); int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap); -#ifndef _WIN32 -int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset); -#endif int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr); int kvm_on_sigbus(int code, void *addr); @@ -463,6 +460,8 @@ void kvm_cpu_synchronize_state(CPUState *cpu); void kvm_cpu_synchronize_post_reset(CPUState *cpu); void kvm_cpu_synchronize_post_init(CPUState *cpu); +void kvm_init_cpu_signals(CPUState *cpu); + /** * kvm_irqchip_add_msi_route - Add MSI route for specific vector * @s: KVM state diff --git a/kvm-all.c b/kvm-all.c index 0baa193763..1d7fc6c1e8 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1899,6 +1899,32 @@ static __thread int pending_sigbus_code; static __thread bool have_sigbus_pending; #endif +static void kvm_eat_signals(CPUState *cpu) +{ + struct timespec ts = { 0, 0 }; + siginfo_t siginfo; + sigset_t waitset; + sigset_t chkset; + int r; + + sigemptyset(&waitset); + sigaddset(&waitset, SIG_IPI); + + do { + r = sigtimedwait(&waitset, &siginfo, &ts); + if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { + perror("sigtimedwait"); + exit(1); + } + + r = sigpending(&chkset); + if (r == -1) { + perror("sigpending"); + exit(1); + } + } while (sigismember(&chkset, SIG_IPI)); +} + int kvm_cpu_exec(CPUState *cpu) { struct kvm_run *run = cpu->kvm_run; @@ -1949,6 +1975,7 @@ int kvm_cpu_exec(CPUState *cpu) if (run_ret < 0) { if (run_ret == -EINTR || run_ret == -EAGAIN) { DPRINTF("io window exit\n"); + kvm_eat_signals(cpu); ret = EXCP_INTERRUPT; break; } @@ -2388,16 +2415,12 @@ void kvm_remove_all_breakpoints(CPUState *cpu) } #endif /* !KVM_CAP_SET_GUEST_DEBUG */ -int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) +static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) { KVMState *s = kvm_state; struct kvm_signal_mask *sigmask; int r; - if (!sigset) { - return kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, NULL); - } - sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset)); sigmask->len = s->sigmask_len; @@ -2408,6 +2431,33 @@ int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) return r; } +static void dummy_signal(int sig) +{ +} + +void kvm_init_cpu_signals(CPUState *cpu) +{ + int r; + sigset_t set; + struct sigaction sigact; + + memset(&sigact, 0, sizeof(sigact)); + sigact.sa_handler = dummy_signal; + sigaction(SIG_IPI, &sigact, NULL); + + pthread_sigmask(SIG_BLOCK, NULL, &set); +#if defined KVM_HAVE_MCE_INJECTION + sigdelset(&set, SIGBUS); + pthread_sigmask(SIG_SETMASK, &set, NULL); +#endif + sigdelset(&set, SIG_IPI); + r = kvm_set_signal_mask(cpu, &set); + if (r) { + fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r)); + exit(1); + } +} + /* Called asynchronously in VCPU thread. */ int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) { diff --git a/kvm-stub.c b/kvm-stub.c index b1b6b96c96..ef0c7346af 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -95,13 +95,6 @@ void kvm_remove_all_breakpoints(CPUState *cpu) { } -#ifndef _WIN32 -int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) -{ - abort(); -} -#endif - int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) { return 1; @@ -157,4 +150,9 @@ bool kvm_has_free_slot(MachineState *ms) { return false; } + +void kvm_init_cpu_signals(CPUState *cpu) +{ + abort(); +} #endif From c5c6679d37547a2a5125e257529fcd3fd095b88f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 15 Feb 2017 15:36:11 +0100 Subject: [PATCH 10/21] kvm: use atomic_read/atomic_set to access cpu->exit_request Signed-off-by: Paolo Bonzini --- kvm-all.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 1d7fc6c1e8..434c720dbe 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1933,7 +1933,7 @@ int kvm_cpu_exec(CPUState *cpu) DPRINTF("kvm_cpu_exec()\n"); if (kvm_arch_process_async_events(cpu)) { - cpu->exit_request = 0; + atomic_set(&cpu->exit_request, 0); return EXCP_HLT; } @@ -1948,7 +1948,7 @@ int kvm_cpu_exec(CPUState *cpu) } kvm_arch_pre_run(cpu, run); - if (cpu->exit_request) { + if (atomic_read(&cpu->exit_request)) { DPRINTF("interrupt exit requested\n"); /* * KVM requires us to reenter the kernel after IO exits to complete @@ -2069,7 +2069,7 @@ int kvm_cpu_exec(CPUState *cpu) vm_stop(RUN_STATE_INTERNAL_ERROR); } - cpu->exit_request = 0; + atomic_set(&cpu->exit_request, 0); return ret; } From cf0f7cf903073f9dd9979dd33d52618b384ac2cb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Feb 2017 13:52:50 +0100 Subject: [PATCH 11/21] KVM: use KVM_CAP_IMMEDIATE_EXIT The purpose of the KVM_SET_SIGNAL_MASK API is to let userspace "kick" a VCPU out of KVM_RUN through a POSIX signal. A signal is attached to a dummy signal handler; by blocking the signal outside KVM_RUN and unblocking it inside, this possible race is closed: VCPU thread service thread -------------------------------------------------------------- check flag set flag raise signal (signal handler does nothing) KVM_RUN However, one issue with KVM_SET_SIGNAL_MASK is that it has to take tsk->sighand->siglock on every KVM_RUN. This lock is often on a remote NUMA node, because it is on the node of a thread's creator. Taking this lock can be very expensive if there are many userspace exits (as is the case for SMP Windows VMs without Hyper-V reference time counter). KVM_CAP_IMMEDIATE_EXIT provides an alternative, where the flag is placed directly in kvm_run so that KVM can see it: VCPU thread service thread -------------------------------------------------------------- raise signal signal handler set run->immediate_exit KVM_RUN check run->immediate_exit The previous patches changed QEMU so that the only blocked signal is SIG_IPI, so we can now stop using KVM_SET_SIGNAL_MASK and sigtimedwait if KVM_CAP_IMMEDIATE_EXIT is available. On a 14-VCPU guest, an "inl" operation goes down from 30k to 6k on an unlocked (no BQL) MemoryRegion, or from 30k to 15k if the BQL is involved. Signed-off-by: Paolo Bonzini --- kvm-all.c | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 434c720dbe..9040bd50a4 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -120,6 +120,7 @@ bool kvm_vm_attributes_allowed; bool kvm_direct_msi_allowed; bool kvm_ioeventfd_any_length_allowed; bool kvm_msi_use_devid; +static bool kvm_immediate_exit; static const KVMCapabilityInfo kvm_required_capabilites[] = { KVM_CAP_INFO(USER_MEMORY), @@ -1619,6 +1620,7 @@ static int kvm_init(MachineState *ms) goto err; } + kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); /* If unspecified, use the default value */ @@ -1899,6 +1901,20 @@ static __thread int pending_sigbus_code; static __thread bool have_sigbus_pending; #endif +static void kvm_cpu_kick(CPUState *cpu) +{ + atomic_set(&cpu->kvm_run->immediate_exit, 1); +} + +static void kvm_cpu_kick_self(void) +{ + if (kvm_immediate_exit) { + kvm_cpu_kick(current_cpu); + } else { + qemu_cpu_kick_self(); + } +} + static void kvm_eat_signals(CPUState *cpu) { struct timespec ts = { 0, 0 }; @@ -1907,6 +1923,15 @@ static void kvm_eat_signals(CPUState *cpu) sigset_t chkset; int r; + if (kvm_immediate_exit) { + atomic_set(&cpu->kvm_run->immediate_exit, 0); + /* Write kvm_run->immediate_exit before the cpu->exit_request + * write in kvm_cpu_exec. + */ + smp_wmb(); + return; + } + sigemptyset(&waitset); sigaddset(&waitset, SIG_IPI); @@ -1955,9 +1980,14 @@ int kvm_cpu_exec(CPUState *cpu) * instruction emulation. This self-signal will ensure that we * leave ASAP again. */ - qemu_cpu_kick_self(); + kvm_cpu_kick_self(); } + /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit. + * Matching barrier in kvm_eat_signals. + */ + smp_rmb(); + run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); attrs = kvm_arch_post_run(cpu, run); @@ -2431,8 +2461,12 @@ static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) return r; } -static void dummy_signal(int sig) +static void kvm_ipi_signal(int sig) { + if (current_cpu) { + assert(kvm_immediate_exit); + kvm_cpu_kick(current_cpu); + } } void kvm_init_cpu_signals(CPUState *cpu) @@ -2442,7 +2476,7 @@ void kvm_init_cpu_signals(CPUState *cpu) struct sigaction sigact; memset(&sigact, 0, sizeof(sigact)); - sigact.sa_handler = dummy_signal; + sigact.sa_handler = kvm_ipi_signal; sigaction(SIG_IPI, &sigact, NULL); pthread_sigmask(SIG_BLOCK, NULL, &set); @@ -2451,7 +2485,11 @@ void kvm_init_cpu_signals(CPUState *cpu) pthread_sigmask(SIG_SETMASK, &set, NULL); #endif sigdelset(&set, SIG_IPI); - r = kvm_set_signal_mask(cpu, &set); + if (kvm_immediate_exit) { + r = pthread_sigmask(SIG_SETMASK, &set, NULL); + } else { + r = kvm_set_signal_mask(cpu, &set); + } if (r) { fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r)); exit(1); From c3e31eaa21bc038c146cb196f7762a972eb9de5b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 21 Feb 2017 09:29:34 +0100 Subject: [PATCH 12/21] vmxcap: port to Python 3 Signed-off-by: Paolo Bonzini --- scripts/kvm/vmxcap | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap index 222025525b..af8de1558b 100755 --- a/scripts/kvm/vmxcap +++ b/scripts/kvm/vmxcap @@ -27,9 +27,9 @@ MSR_IA32_VMX_VMFUNC = 0x491 class msr(object): def __init__(self): try: - self.f = open('/dev/cpu/0/msr', 'r', 0) + self.f = open('/dev/cpu/0/msr', 'rb', 0) except: - self.f = open('/dev/msr0', 'r', 0) + self.f = open('/dev/msr0', 'rb', 0) def read(self, index, default = None): import struct self.f.seek(index) @@ -49,7 +49,7 @@ class Control(object): val = m.read(nr, 0) return (val & 0xffffffff, val >> 32) def show(self): - print self.name + print(self.name) mbz, mb1 = self.read2(self.cap_msr) tmbz, tmb1 = 0, 0 if self.true_cap_msr: @@ -69,7 +69,7 @@ class Control(object): s = 'forced' elif one and zero: s = 'yes' - print ' %-40s %s' % (self.bits[bit], s) + print(' %-40s %s' % (self.bits[bit], s)) class Misc(object): def __init__(self, name, bits, msr): @@ -77,9 +77,9 @@ class Misc(object): self.bits = bits self.msr = msr def show(self): - print self.name + print(self.name) value = msr().read(self.msr, 0) - print ' Hex: 0x%x' % (value) + print(' Hex: 0x%x' % (value)) def first_bit(key): if type(key) is tuple: return key[0] @@ -94,7 +94,7 @@ class Misc(object): def fmt(x): return { True: 'yes', False: 'no' }[x] v = (value >> lo) & ((1 << (hi - lo + 1)) - 1) - print ' %-40s %s' % (self.bits[bits], fmt(v)) + print(' %-40s %s' % (self.bits[bits], fmt(v))) controls = [ Misc( From 025533f6eeb4751ee6d8330a505d47a1128322d1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 21 Feb 2017 09:35:45 +0100 Subject: [PATCH 13/21] vmxcap: update for September 2016 SDM Signed-off-by: Paolo Bonzini --- scripts/kvm/vmxcap | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/kvm/vmxcap b/scripts/kvm/vmxcap index af8de1558b..d9a6db0bb7 100755 --- a/scripts/kvm/vmxcap +++ b/scripts/kvm/vmxcap @@ -170,9 +170,13 @@ controls = [ 12: 'Enable INVPCID', 13: 'Enable VM functions', 14: 'VMCS shadowing', + 15: 'Enable ENCLS exiting', 16: 'RDSEED exiting', + 17: 'Enable PML', 18: 'EPT-violation #VE', + 19: 'Conceal non-root operation from PT', 20: 'Enable XSAVES/XRSTORS', + 22: 'Mode-based execute control (XS/XU)', 25: 'TSC scaling', }, cap_msr = MSR_IA32_VMX_PROCBASED_CTLS2, @@ -190,6 +194,8 @@ controls = [ 20: 'Save IA32_EFER', 21: 'Load IA32_EFER', 22: 'Save VMX-preemption timer value', + 23: 'Clear IA32_BNDCFGS', + 24: 'Conceal VM exits from PT', }, cap_msr = MSR_IA32_VMX_EXIT_CTLS, true_cap_msr = MSR_IA32_VMX_TRUE_EXIT_CTLS, @@ -205,6 +211,8 @@ controls = [ 13: 'Load IA32_PERF_GLOBAL_CTRL', 14: 'Load IA32_PAT', 15: 'Load IA32_EFER', + 16: 'Load IA32_BNDCFGS', + 17: 'Conceal VM entries from PT', }, cap_msr = MSR_IA32_VMX_ENTRY_CTLS, true_cap_msr = MSR_IA32_VMX_TRUE_ENTRY_CTLS, @@ -223,6 +231,7 @@ controls = [ (25,27): 'MSR-load/store count recommendation', 28: 'IA32_SMM_MONITOR_CTL[2] can be set to 1', 29: 'VMWRITE to VM-exit information fields', + 30: 'Inject event with insn length=0', (32,63): 'MSEG revision identifier', }, msr = MSR_IA32_VMX_MISC_CTLS, From e8ed97a6478f55dde54f0188a54e094a1caa7965 Mon Sep 17 00:00:00 2001 From: Anton Nefedov Date: Mon, 20 Feb 2017 21:21:54 +0300 Subject: [PATCH 14/21] qapi: flatten GuestPanicInformation union Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Eric Blake Message-Id: <1487614915-18710-3-git-send-email-den@openvz.org> Signed-off-by: Paolo Bonzini --- qapi-schema.json | 12 ++++++++++++ target/i386/cpu.c | 15 ++++++--------- vl.c | 12 ++++++------ 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/qapi-schema.json b/qapi-schema.json index fb39d1dc11..6febfa7b90 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -5914,6 +5914,16 @@ { 'enum': 'GuestPanicAction', 'data': [ 'pause', 'poweroff' ] } +## +# @GuestPanicInformationType: +# +# An enumeration of the guest panic information types +# +# Since: 2.9 +## +{ 'enum': 'GuestPanicInformationType', + 'data': [ 'hyper-v'] } + ## # @GuestPanicInformation: # @@ -5922,6 +5932,8 @@ # Since: 2.9 ## {'union': 'GuestPanicInformation', + 'base': {'type': 'GuestPanicInformationType'}, + 'discriminator': 'type', 'data': { 'hyper-v': 'GuestPanicInformationHyperV' } } ## diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 89421c893b..aec5d9daf8 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3778,19 +3778,16 @@ static GuestPanicInformation *x86_cpu_get_crash_info(CPUState *cs) GuestPanicInformation *panic_info = NULL; if (env->features[FEAT_HYPERV_EDX] & HV_X64_GUEST_CRASH_MSR_AVAILABLE) { - GuestPanicInformationHyperV *panic_info_hv = - g_malloc0(sizeof(GuestPanicInformationHyperV)); panic_info = g_malloc0(sizeof(GuestPanicInformation)); - panic_info->type = GUEST_PANIC_INFORMATION_KIND_HYPER_V; - panic_info->u.hyper_v.data = panic_info_hv; + panic_info->type = GUEST_PANIC_INFORMATION_TYPE_HYPER_V; assert(HV_X64_MSR_CRASH_PARAMS >= 5); - panic_info_hv->arg1 = env->msr_hv_crash_params[0]; - panic_info_hv->arg2 = env->msr_hv_crash_params[1]; - panic_info_hv->arg3 = env->msr_hv_crash_params[2]; - panic_info_hv->arg4 = env->msr_hv_crash_params[3]; - panic_info_hv->arg5 = env->msr_hv_crash_params[4]; + panic_info->u.hyper_v.arg1 = env->msr_hv_crash_params[0]; + panic_info->u.hyper_v.arg2 = env->msr_hv_crash_params[1]; + panic_info->u.hyper_v.arg3 = env->msr_hv_crash_params[2]; + panic_info->u.hyper_v.arg4 = env->msr_hv_crash_params[3]; + panic_info->u.hyper_v.arg5 = env->msr_hv_crash_params[4]; } return panic_info; diff --git a/vl.c b/vl.c index e10a27bdd6..7f57deda28 100644 --- a/vl.c +++ b/vl.c @@ -1717,14 +1717,14 @@ void qemu_system_guest_panicked(GuestPanicInformation *info) } if (info) { - if (info->type == GUEST_PANIC_INFORMATION_KIND_HYPER_V) { + if (info->type == GUEST_PANIC_INFORMATION_TYPE_HYPER_V) { qemu_log_mask(LOG_GUEST_ERROR, "HV crash parameters: (%#"PRIx64 " %#"PRIx64" %#"PRIx64" %#"PRIx64" %#"PRIx64")\n", - info->u.hyper_v.data->arg1, - info->u.hyper_v.data->arg2, - info->u.hyper_v.data->arg3, - info->u.hyper_v.data->arg4, - info->u.hyper_v.data->arg5); + info->u.hyper_v.arg1, + info->u.hyper_v.arg2, + info->u.hyper_v.arg3, + info->u.hyper_v.arg4, + info->u.hyper_v.arg5); } qapi_free_GuestPanicInformation(info); } From 11953be792998c43bf2cad4ad3deaeaeaf89dbb4 Mon Sep 17 00:00:00 2001 From: Anton Nefedov Date: Mon, 20 Feb 2017 21:21:55 +0300 Subject: [PATCH 15/21] qmp-events: fix GUEST_PANICKED description formatting Signed-off-by: Anton Nefedov Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Eric Blake Message-Id: <1487614915-18710-4-git-send-email-den@openvz.org> Signed-off-by: Paolo Bonzini --- qapi/event.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qapi/event.json b/qapi/event.json index 970ff0255a..e02852cd8a 100644 --- a/qapi/event.json +++ b/qapi/event.json @@ -488,9 +488,9 @@ # # @action: action that has been taken, currently always "pause" # -# @info: optional information about a panic +# @info: #optional information about a panic (since 2.9) # -# Since: 1.5 (@info since 2.9) +# Since: 1.5 # # Example: # From c99a29e702528698c0ce2590f06ca7ff239f7c39 Mon Sep 17 00:00:00 2001 From: Yongji Xie Date: Mon, 27 Feb 2017 12:52:44 +0800 Subject: [PATCH 16/21] memory: Introduce DEVICE_HOST_ENDIAN for ram device At the moment ram device's memory regions are DEVICE_NATIVE_ENDIAN. It's incorrect. This memory region is backed by a MMIO area in host, so the uint64_t data that MemoryRegionOps read from/write to this area should be host-endian rather than target-endian. Hence, current code does not work when target and host endianness are different which is the most common case on PPC64. To fix it, this introduces DEVICE_HOST_ENDIAN for the ram device. This has been tested on PPC64 BE/LE host/guest in all possible combinations including TCG. Suggested-by: Paolo Bonzini Signed-off-by: Yongji Xie Message-Id: <1488171164-28319-1-git-send-email-xyjxie@linux.vnet.ibm.com> Signed-off-by: Paolo Bonzini --- include/exec/cpu-common.h | 6 ++++++ memory.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 8c305aa4fa..b62f0d82e4 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -36,6 +36,12 @@ enum device_endian { DEVICE_LITTLE_ENDIAN, }; +#if defined(HOST_WORDS_BIGENDIAN) +#define DEVICE_HOST_ENDIAN DEVICE_BIG_ENDIAN +#else +#define DEVICE_HOST_ENDIAN DEVICE_LITTLE_ENDIAN +#endif + /* address in the RAM (different from a physical address) */ #if defined(CONFIG_XEN_BACKEND) typedef uint64_t ram_addr_t; diff --git a/memory.c b/memory.c index d61caee867..573fa6e5f6 100644 --- a/memory.c +++ b/memory.c @@ -1182,7 +1182,7 @@ static void memory_region_ram_device_write(void *opaque, hwaddr addr, static const MemoryRegionOps ram_device_mem_ops = { .read = memory_region_ram_device_read, .write = memory_region_ram_device_write, - .endianness = DEVICE_NATIVE_ENDIAN, + .endianness = DEVICE_HOST_ENDIAN, .valid = { .min_access_size = 1, .max_access_size = 8, From f6f99b48087696812241d6c54f97444de6364c24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Poussineau?= Date: Mon, 20 Feb 2017 21:41:19 +0100 Subject: [PATCH 17/21] vl: disable default cdrom when using explicitely scsi-hd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit af6bf1328ef90fae617857c02697e0174b84d596 (May 2011), ide-hd, ide-cd and scsi-cd have been added to disable default cdrom, "or else you can't put one on secondary master without -nodefaults". Make it the same for scsi-hd, so you can put one on scsi-id 2 without using -nodefaults. scsi-hd has probably been forgotten, as it has been added in the preceding commit (b443ae67130d32ad06b06fc9aa6d04d05ccd93ce). Affected users are the ones using a machine with SCSI devices and start QEMU with -device scsi-hd but without -device scsi-cd or -cdrom In that case, the default cdrom device will disappear instead of being empty. Signed-off-by: Hervé Poussineau Message-Id: <1487623279-29930-1-git-send-email-hpoussin@reactos.org> Signed-off-by: Paolo Bonzini --- vl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vl.c b/vl.c index 7f57deda28..16a3b5ed8b 100644 --- a/vl.c +++ b/vl.c @@ -227,6 +227,7 @@ static struct { { .driver = "ide-hd", .flag = &default_cdrom }, { .driver = "ide-drive", .flag = &default_cdrom }, { .driver = "scsi-cd", .flag = &default_cdrom }, + { .driver = "scsi-hd", .flag = &default_cdrom }, { .driver = "virtio-serial-pci", .flag = &default_virtcon }, { .driver = "virtio-serial", .flag = &default_virtcon }, { .driver = "VGA", .flag = &default_vga }, From f20e6f8cd42acae9a130b9e0bcd47b0d7e39f253 Mon Sep 17 00:00:00 2001 From: Li Qiang Date: Tue, 21 Feb 2017 00:18:27 -0800 Subject: [PATCH 18/21] spice-char: fix segfault in char_spice_finalize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In 'qemu_chr_open_spice_vmc' if the 'psubtype' is NULL, it will call 'char_spice_finalize'. But as the SpiceChardev is not inserted in the 'spice_chars' list, the 'QLIST_REMOVE' will cause a segfault. Add a detect to avoid it. Signed-off-by: Li Qiang Message-Id: <1487665107-88004-1-git-send-email-liqiang6-s@360.cn> Reviewed-by: Marc-André Lureau Signed-off-by: Paolo Bonzini Signed-off-by: Li Qiang --- spice-qemu-char.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spice-qemu-char.c b/spice-qemu-char.c index 6f46f46b25..4d1c76e8a4 100644 --- a/spice-qemu-char.c +++ b/spice-qemu-char.c @@ -215,7 +215,10 @@ static void char_spice_finalize(Object *obj) SpiceChardev *s = SPICE_CHARDEV(obj); vmc_unregister_interface(s); - QLIST_REMOVE(s, next); + + if (s->next.le_prev) { + QLIST_REMOVE(s, next); + } g_free((char *)s->sin.subtype); #if SPICE_SERVER_VERSION >= 0x000c02 From fc3a1fd74fac0e3233060aaaf923fe8ec104b48f Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 23 Feb 2017 13:34:41 +0000 Subject: [PATCH 19/21] x86: Work around SMI migration breakages Migration from a 2.3.0 qemu results in a reboot on the receiving QEMU due to a disagreement about SM (System management) interrupts. 2.3.0 didn't have much SMI support, but it did set CPU_INTERRUPT_SMI and this gets into the migration stream, but on 2.3.0 it never got delivered. ~2.4.0 SMI interrupt support was added but was broken - so that when a 2.3.0 stream was received it cleared the CPU_INTERRUPT_SMI but never actually caused an interrupt. The SMI delivery was recently fixed by 68c6efe07a, but the effect now is that an incoming 2.3.0 stream takes the interrupt it had flagged but it's bios can't actually handle it(I think partly due to the original interrupt not being taken during boot?). The consequence is a triple(?) fault and a reboot. Tested from: 2.3.1 -M 2.3.0 2.7.0 -M 2.3.0 2.8.0 -M 2.3.0 2.8.0 -M 2.8.0 This corresponds to RH bugzilla entry 1420679. Signed-off-by: Dr. David Alan Gilbert Message-Id: <20170223133441.16010-1-dgilbert@redhat.com> Signed-off-by: Paolo Bonzini --- include/hw/i386/pc.h | 4 ++++ target/i386/cpu.c | 2 ++ target/i386/cpu.h | 3 +++ target/i386/kvm.c | 7 ++++++- 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index d1f45540a1..ab303c7fee 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -623,6 +623,10 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); .driver = "Broadwell-noTSX" "-" TYPE_X86_CPU,\ .property = "xlevel",\ .value = stringify(0x8000000a),\ + },{\ + .driver = TYPE_X86_CPU,\ + .property = "kvm-no-smi-migration",\ + .value = "on",\ }, #define PC_COMPAT_2_2 \ diff --git a/target/i386/cpu.c b/target/i386/cpu.c index aec5d9daf8..fba92125ab 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3983,6 +3983,8 @@ static Property x86_cpu_properties[] = { DEFINE_PROP_BOOL("cpuid-0xb", X86CPU, enable_cpuid_0xb, true), DEFINE_PROP_BOOL("lmce", X86CPU, enable_lmce, false), DEFINE_PROP_BOOL("l3-cache", X86CPU, enable_l3_cache, true), + DEFINE_PROP_BOOL("kvm-no-smi-migration", X86CPU, kvm_no_smi_migration, + false), DEFINE_PROP_BOOL("vmware-cpuid-freq", X86CPU, vmware_cpuid_freq, true), DEFINE_PROP_END_OF_LIST() }; diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 12a39d590f..ac2ad6d443 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1255,6 +1255,9 @@ struct X86CPU { /* if true override the phys_bits value with a value read from the host */ bool host_phys_bits; + /* Stop SMI delivery for migration compatibility with old machines */ + bool kvm_no_smi_migration; + /* Number of physical address bits supported */ uint32_t phys_bits; diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 7698421ae7..887a81268f 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -2492,7 +2492,12 @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level) events.smi.pending = 0; events.smi.latched_init = 0; } - events.flags |= KVM_VCPUEVENT_VALID_SMM; + /* Stop SMI delivery on old machine types to avoid a reboot + * on an inward migration of an old VM. + */ + if (!cpu->kvm_no_smi_migration) { + events.flags |= KVM_VCPUEVENT_VALID_SMM; + } } if (level >= KVM_PUT_RESET_STATE) { From 377a07aa0d1617736289bfa9a9cbdc73de4e6542 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 2 Mar 2017 22:49:41 +0100 Subject: [PATCH 20/21] memory: show region offset and ROM/RAM type in "info mtree -f" "info mtree -f" output is currently hard to use for large RAM regions, because there is no hint as to what part of the region is being mapped. Add the offset if it is nonzero. Secondly, FlatView has a readonly field, that can override the MemoryRegion in the presence of aliases. Take it into account. Together, with this patch this: address-space (flat view): KVM-SMRAM 0000000000000000-00000000000bffff (prio 0, ram): pc.ram 00000000000c0000-00000000000c9fff (prio 0, ram): pc.ram 00000000000ca000-00000000000ccfff (prio 0, ram): pc.ram 00000000000cd000-00000000000ebfff (prio 0, ram): pc.ram 00000000000ec000-00000000000effff (prio 0, ram): pc.ram 00000000000f0000-00000000000fffff (prio 0, ram): pc.ram 0000000000100000-00000000bfffffff (prio 0, ram): pc.ram 00000000fd000000-00000000fdffffff (prio 1, ram): vga.vram 00000000febc0000-00000000febdffff (prio 1, i/o): e1000-mmio 00000000febf0400-00000000febf041f (prio 0, i/o): vga ioports remapped 00000000febf0500-00000000febf0515 (prio 0, i/o): bochs dispi interface 00000000febf0600-00000000febf0607 (prio 0, i/o): qemu extended regs 00000000fec00000-00000000fec00fff (prio 0, i/o): kvm-ioapic 00000000fed00000-00000000fed003ff (prio 0, i/o): hpet 00000000fee00000-00000000feefffff (prio 4096, i/o): kvm-apic-msi 00000000fffc0000-00000000ffffffff (prio 0, rom): pc.bios 0000000100000000-000000013fffffff (prio 0, ram): pc.ram becomes this: address-space (flat view): KVM-SMRAM 0000000000000000-00000000000bffff (prio 0, ram): pc.ram 00000000000c0000-00000000000c9fff (prio 0, rom): pc.ram @00000000000c0000 00000000000ca000-00000000000ccfff (prio 0, ram): pc.ram @00000000000ca000 00000000000cd000-00000000000ebfff (prio 0, rom): pc.ram @00000000000cd000 00000000000ec000-00000000000effff (prio 0, ram): pc.ram @00000000000ec000 00000000000f0000-00000000000fffff (prio 0, rom): pc.ram @00000000000f0000 0000000000100000-00000000bfffffff (prio 0, ram): pc.ram @0000000000100000 00000000fd000000-00000000fdffffff (prio 1, ram): vga.vram 00000000febc0000-00000000febdffff (prio 1, i/o): e1000-mmio 00000000febf0400-00000000febf041f (prio 0, i/o): vga ioports remapped 00000000febf0500-00000000febf0515 (prio 0, i/o): bochs dispi interface 00000000febf0600-00000000febf0607 (prio 0, i/o): qemu extended regs 00000000fec00000-00000000fec00fff (prio 0, i/o): kvm-ioapic 00000000fed00000-00000000fed003ff (prio 0, i/o): hpet 00000000fee00000-00000000feefffff (prio 4096, i/o): kvm-apic-msi 00000000fffc0000-00000000ffffffff (prio 0, rom): pc.bios 0000000100000000-000000013fffffff (prio 0, ram): pc.ram @00000000c0000000 This should make it easier to understand what's going on. Cc: Peter Xu Cc: "William Tambe" Signed-off-by: Paolo Bonzini --- memory.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/memory.c b/memory.c index 573fa6e5f6..284894b135 100644 --- a/memory.c +++ b/memory.c @@ -2588,13 +2588,24 @@ static void mtree_print_flatview(fprintf_function p, void *f, while (n--) { mr = range->mr; - p(f, MTREE_INDENT TARGET_FMT_plx "-" - TARGET_FMT_plx " (prio %d, %s): %s\n", - int128_get64(range->addr.start), - int128_get64(range->addr.start) + MR_SIZE(range->addr.size), - mr->priority, - memory_region_type(mr), - memory_region_name(mr)); + if (range->offset_in_region) { + p(f, MTREE_INDENT TARGET_FMT_plx "-" + TARGET_FMT_plx " (prio %d, %s): %s @" TARGET_FMT_plx "\n", + int128_get64(range->addr.start), + int128_get64(range->addr.start) + MR_SIZE(range->addr.size), + mr->priority, + range->readonly ? "rom" : memory_region_type(mr), + memory_region_name(mr), + range->offset_in_region); + } else { + p(f, MTREE_INDENT TARGET_FMT_plx "-" + TARGET_FMT_plx " (prio %d, %s): %s\n", + int128_get64(range->addr.start), + int128_get64(range->addr.start) + MR_SIZE(range->addr.size), + mr->priority, + range->readonly ? "rom" : memory_region_type(mr), + memory_region_name(mr)); + } range++; } From f6eb0b319e4bad3d01d74d71e3a6cf40f0ede720 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 3 Mar 2017 16:23:36 +0100 Subject: [PATCH 21/21] iscsi: fix missing unlock Reported by Coverity. Signed-off-by: Paolo Bonzini --- block/iscsi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/iscsi.c b/block/iscsi.c index 76319a1a6e..75d890538e 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -637,6 +637,7 @@ retry: } #endif if (iTask.task == NULL) { + qemu_mutex_unlock(&iscsilun->mutex); return -ENOMEM; } #if LIBISCSI_API_VERSION < (20160603) @@ -864,6 +865,7 @@ retry: } #endif if (iTask.task == NULL) { + qemu_mutex_unlock(&iscsilun->mutex); return -ENOMEM; } #if LIBISCSI_API_VERSION < (20160603) @@ -904,6 +906,7 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) retry: if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, 0, iscsi_co_generic_cb, &iTask) == NULL) { + qemu_mutex_unlock(&iscsilun->mutex); return -ENOMEM; } @@ -1237,6 +1240,7 @@ retry: 0, 0, iscsi_co_generic_cb, &iTask); } if (iTask.task == NULL) { + qemu_mutex_unlock(&iscsilun->mutex); return -ENOMEM; }