From f3dfda6114fd12ca7caac456b1997962b5c48274 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Feb 2010 00:23:46 +0100 Subject: [PATCH 1/8] use eventfd for iothread Signed-off-by: Paolo Bonzini Signed-off-by: Avi Kivity --- osdep.c | 32 ++++++++++++++++++++++++++++++++ qemu-common.h | 1 + vl.c | 9 +++++---- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/osdep.c b/osdep.c index 9059f019e7..9e4b17bd48 100644 --- a/osdep.c +++ b/osdep.c @@ -37,6 +37,10 @@ #include #endif +#ifdef CONFIG_EVENTFD +#include +#endif + #ifdef _WIN32 #include #elif defined(CONFIG_BSD) @@ -280,6 +284,34 @@ ssize_t qemu_write_full(int fd, const void *buf, size_t count) } #ifndef _WIN32 +/* + * Creates an eventfd that looks like a pipe and has EFD_CLOEXEC set. + */ +int qemu_eventfd(int fds[2]) +{ + int ret; + +#ifdef CONFIG_EVENTFD + ret = eventfd(0, 0); + if (ret >= 0) { + fds[0] = ret; + qemu_set_cloexec(ret); + if ((fds[1] = dup(ret)) == -1) { + close(ret); + return -1; + } + qemu_set_cloexec(fds[1]); + return 0; + } + + if (errno != ENOSYS) { + return -1; + } +#endif + + return qemu_pipe(fds); +} + /* * Creates a pipe with FD_CLOEXEC set on both file descriptors */ diff --git a/qemu-common.h b/qemu-common.h index b09f71757a..c9410066e4 100644 --- a/qemu-common.h +++ b/qemu-common.h @@ -170,6 +170,7 @@ ssize_t qemu_write_full(int fd, const void *buf, size_t count) void qemu_set_cloexec(int fd); #ifndef _WIN32 +int qemu_eventfd(int pipefd[2]); int qemu_pipe(int pipefd[2]); #endif diff --git a/vl.c b/vl.c index 98918ac49d..1957018cf6 100644 --- a/vl.c +++ b/vl.c @@ -3211,14 +3211,15 @@ static int io_thread_fd = -1; static void qemu_event_increment(void) { - static const char byte = 0; + /* Write 8 bytes to be compatible with eventfd. */ + static uint64_t val = 1; ssize_t ret; if (io_thread_fd == -1) return; do { - ret = write(io_thread_fd, &byte, sizeof(byte)); + ret = write(io_thread_fd, &val, sizeof(val)); } while (ret < 0 && errno == EINTR); /* EAGAIN is fine, a read must be pending. */ @@ -3235,7 +3236,7 @@ static void qemu_event_read(void *opaque) ssize_t len; char buffer[512]; - /* Drain the notify pipe */ + /* Drain the notify pipe. For eventfd, only 8 bytes will be read. */ do { len = read(fd, buffer, sizeof(buffer)); } while ((len == -1 && errno == EINTR) || len == sizeof(buffer)); @@ -3246,7 +3247,7 @@ static int qemu_event_init(void) int err; int fds[2]; - err = qemu_pipe(fds); + err = qemu_eventfd(fds); if (err == -1) return -errno; From 14dcc3e2ac52d7a2a1cfe2e54c332d8042485a39 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 19 Feb 2010 18:21:20 +0100 Subject: [PATCH 2/8] kvm: Fix eflags corruption in kvm mode This should explain a lot of the weird breakages of upstream KVM we've seen recently (actually we should have seen it much earlier): Stop translating eflags into TCG format when in kvm mode as we never translate it back and rather sync this broken state into the kernel. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- cpu-exec.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpu-exec.c b/cpu-exec.c index 6a290fd6cd..4029ea25ff 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -228,11 +228,13 @@ int cpu_exec(CPUState *env1) env = env1; #if defined(TARGET_I386) - /* put eflags in CPU temporary format */ - CC_SRC = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); - DF = 1 - (2 * ((env->eflags >> 10) & 1)); - CC_OP = CC_OP_EFLAGS; - env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); + if (!kvm_enabled()) { + /* put eflags in CPU temporary format */ + CC_SRC = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); + DF = 1 - (2 * ((env->eflags >> 10) & 1)); + CC_OP = CC_OP_EFLAGS; + env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C); + } #elif defined(TARGET_SPARC) #elif defined(TARGET_M68K) env->cc_op = CC_OP_FLAGS; From 62f734a0d57306523e6ba0e7bd606e0d55449671 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Fri, 19 Feb 2010 07:43:24 +0100 Subject: [PATCH 3/8] kvm: Kill CR3_CACHE feature references Remove all references to KVM_CR3_CACHE as it was never implemented. Signed-off-by: Jes Sorensen Signed-off-by: Avi Kivity --- target-i386/kvm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 0d08cd532e..5d9aecc89d 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -157,9 +157,6 @@ struct kvm_para_features { #endif #ifdef KVM_CAP_PV_MMU { KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP }, -#endif -#ifdef KVM_CAP_CR3_CACHE - { KVM_CAP_CR3_CACHE, KVM_FEATURE_CR3_CACHE }, #endif { -1, -1 } }; From adc8c965c401ec3becc14eabf7a5a6ba7a002f46 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 17 Feb 2010 20:14:40 -0200 Subject: [PATCH 4/8] block SIGCHLD in vcpu thread(s) Otherwise a vcpu thread can run the sigchild handler causing waitpid() from iothread to fail. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- vl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vl.c b/vl.c index 1957018cf6..3bc618dd39 100644 --- a/vl.c +++ b/vl.c @@ -3515,6 +3515,7 @@ static void block_io_signals(void) sigaddset(&set, SIGUSR2); sigaddset(&set, SIGIO); sigaddset(&set, SIGALRM); + sigaddset(&set, SIGCHLD); pthread_sigmask(SIG_BLOCK, &set, NULL); sigemptyset(&set); From fed6c3444c714e88eb84dae58fcde0182865db8f Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 17 Feb 2010 20:14:41 -0200 Subject: [PATCH 5/8] kvm specific wait_io_event In KVM mode the global mutex is released when vcpus are executing, which means acquiring the fairness mutex is not required. Also for KVM there is one thread per vcpu, so tcg_has_work is meaningless. Add a new qemu_wait_io_event_common function to hold common code between TCG/KVM. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- vl.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/vl.c b/vl.c index 3bc618dd39..af198c11e6 100644 --- a/vl.c +++ b/vl.c @@ -3383,6 +3383,7 @@ static QemuCond qemu_pause_cond; static void block_io_signals(void); static void unblock_io_signals(void); static int tcg_has_work(void); +static int cpu_has_work(CPUState *env); static int qemu_init_main_loop(void) { @@ -3403,6 +3404,15 @@ static int qemu_init_main_loop(void) return 0; } +static void qemu_wait_io_event_common(CPUState *env) +{ + if (env->stop) { + env->stop = 0; + env->stopped = 1; + qemu_cond_signal(&qemu_pause_cond); + } +} + static void qemu_wait_io_event(CPUState *env) { while (!tcg_has_work()) @@ -3419,11 +3429,15 @@ static void qemu_wait_io_event(CPUState *env) qemu_mutex_unlock(&qemu_fair_mutex); qemu_mutex_lock(&qemu_global_mutex); - if (env->stop) { - env->stop = 0; - env->stopped = 1; - qemu_cond_signal(&qemu_pause_cond); - } + qemu_wait_io_event_common(env); +} + +static void qemu_kvm_wait_io_event(CPUState *env) +{ + while (!cpu_has_work(env)) + qemu_cond_timedwait(env->halt_cond, &qemu_global_mutex, 1000); + + qemu_wait_io_event_common(env); } static int qemu_cpu_exec(CPUState *env); @@ -3449,7 +3463,7 @@ static void *kvm_cpu_thread_fn(void *arg) while (1) { if (cpu_can_run(env)) qemu_cpu_exec(env); - qemu_wait_io_event(env); + qemu_kvm_wait_io_event(env); } return NULL; From cc84de9570ffe01a9c3c169bd62ab9586a9a080c Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 17 Feb 2010 20:14:42 -0200 Subject: [PATCH 6/8] kvm: consume internal signal with sigtimedwait Change the way the internal qemu signal, used for communication between iothread and vcpus, is handled. Block and consume it with sigtimedwait on the outer vcpu loop, which allows more precise timing control. Change from standard signal (SIGUSR1) to real-time one, so multiple signals are not collapsed. Set the signal number on KVM's in-kernel allowed sigmask. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- kvm-all.c | 19 ++++++++++++ kvm.h | 1 + vl.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 97 insertions(+), 12 deletions(-) diff --git a/kvm-all.c b/kvm-all.c index 79345b2837..38c372f34b 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -771,6 +771,7 @@ int kvm_cpu_exec(CPUState *env) kvm_arch_post_run(env, run); if (ret == -EINTR || ret == -EAGAIN) { + cpu_exit(env); dprintf("io window exit\n"); ret = 0; break; @@ -1116,3 +1117,21 @@ void kvm_remove_all_breakpoints(CPUState *current_env) { } #endif /* !KVM_CAP_SET_GUEST_DEBUG */ + +int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset) +{ + struct kvm_signal_mask *sigmask; + int r; + + if (!sigset) + return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL); + + sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset)); + + sigmask->len = 8; + memcpy(sigmask->sigset, sigset, sizeof(*sigset)); + r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask); + free(sigmask); + + return r; +} diff --git a/kvm.h b/kvm.h index e24bbde0dc..9a9cdd5fc0 100644 --- a/kvm.h +++ b/kvm.h @@ -53,6 +53,7 @@ int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr, target_ulong len, int type); void kvm_remove_all_breakpoints(CPUState *current_env); int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap); +int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset); int kvm_pit_in_kernel(void); int kvm_irqchip_in_kernel(void); diff --git a/vl.c b/vl.c index af198c11e6..dc05da36ca 100644 --- a/vl.c +++ b/vl.c @@ -271,6 +271,12 @@ uint8_t qemu_uuid[16]; static QEMUBootSetHandler *boot_set_handler; static void *boot_set_opaque; +#ifdef SIGRTMIN +#define SIG_IPI (SIGRTMIN+4) +#else +#define SIG_IPI SIGUSR1 +#endif + static int default_serial = 1; static int default_parallel = 1; static int default_virtcon = 1; @@ -3380,7 +3386,8 @@ static QemuCond qemu_cpu_cond; static QemuCond qemu_system_cond; static QemuCond qemu_pause_cond; -static void block_io_signals(void); +static void tcg_block_io_signals(void); +static void kvm_block_io_signals(CPUState *env); static void unblock_io_signals(void); static int tcg_has_work(void); static int cpu_has_work(CPUState *env); @@ -3432,11 +3439,36 @@ static void qemu_wait_io_event(CPUState *env) qemu_wait_io_event_common(env); } +static void qemu_kvm_eat_signal(CPUState *env, int timeout) +{ + struct timespec ts; + int r, e; + siginfo_t siginfo; + sigset_t waitset; + + ts.tv_sec = timeout / 1000; + ts.tv_nsec = (timeout % 1000) * 1000000; + + sigemptyset(&waitset); + sigaddset(&waitset, SIG_IPI); + + qemu_mutex_unlock(&qemu_global_mutex); + r = sigtimedwait(&waitset, &siginfo, &ts); + e = errno; + qemu_mutex_lock(&qemu_global_mutex); + + if (r == -1 && !(e == EAGAIN || e == EINTR)) { + fprintf(stderr, "sigtimedwait: %s\n", strerror(e)); + exit(1); + } +} + static void qemu_kvm_wait_io_event(CPUState *env) { while (!cpu_has_work(env)) qemu_cond_timedwait(env->halt_cond, &qemu_global_mutex, 1000); + qemu_kvm_eat_signal(env, 0); qemu_wait_io_event_common(env); } @@ -3446,11 +3478,12 @@ static void *kvm_cpu_thread_fn(void *arg) { CPUState *env = arg; - block_io_signals(); qemu_thread_self(env->thread); if (kvm_enabled()) kvm_init_vcpu(env); + kvm_block_io_signals(env); + /* signal CPU creation */ qemu_mutex_lock(&qemu_global_mutex); env->created = 1; @@ -3475,7 +3508,7 @@ static void *tcg_cpu_thread_fn(void *arg) { CPUState *env = arg; - block_io_signals(); + tcg_block_io_signals(); qemu_thread_self(env->thread); /* signal CPU creation */ @@ -3501,7 +3534,7 @@ void qemu_cpu_kick(void *_env) CPUState *env = _env; qemu_cond_broadcast(env->halt_cond); if (kvm_enabled()) - qemu_thread_signal(env->thread, SIGUSR1); + qemu_thread_signal(env->thread, SIG_IPI); } int qemu_cpu_self(void *_env) @@ -3520,7 +3553,7 @@ static void cpu_signal(int sig) cpu_exit(cpu_single_env); } -static void block_io_signals(void) +static void tcg_block_io_signals(void) { sigset_t set; struct sigaction sigact; @@ -3533,12 +3566,44 @@ static void block_io_signals(void) pthread_sigmask(SIG_BLOCK, &set, NULL); sigemptyset(&set); - sigaddset(&set, SIGUSR1); + sigaddset(&set, SIG_IPI); pthread_sigmask(SIG_UNBLOCK, &set, NULL); memset(&sigact, 0, sizeof(sigact)); sigact.sa_handler = cpu_signal; - sigaction(SIGUSR1, &sigact, NULL); + sigaction(SIG_IPI, &sigact, NULL); +} + +static void dummy_signal(int sig) +{ +} + +static void kvm_block_io_signals(CPUState *env) +{ + int r; + sigset_t set; + struct sigaction sigact; + + sigemptyset(&set); + sigaddset(&set, SIGUSR2); + sigaddset(&set, SIGIO); + sigaddset(&set, SIGALRM); + sigaddset(&set, SIGCHLD); + sigaddset(&set, SIG_IPI); + pthread_sigmask(SIG_BLOCK, &set, NULL); + + pthread_sigmask(SIG_BLOCK, NULL, &set); + sigdelset(&set, SIG_IPI); + + memset(&sigact, 0, sizeof(sigact)); + sigact.sa_handler = dummy_signal; + sigaction(SIG_IPI, &sigact, NULL); + + r = kvm_set_signal_mask(env, &set); + if (r) { + fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(r)); + exit(1); + } } static void unblock_io_signals(void) @@ -3552,7 +3617,7 @@ static void unblock_io_signals(void) pthread_sigmask(SIG_UNBLOCK, &set, NULL); sigemptyset(&set); - sigaddset(&set, SIGUSR1); + sigaddset(&set, SIG_IPI); pthread_sigmask(SIG_BLOCK, &set, NULL); } @@ -3561,7 +3626,7 @@ static void qemu_signal_lock(unsigned int msecs) qemu_mutex_lock(&qemu_fair_mutex); while (qemu_mutex_trylock(&qemu_global_mutex)) { - qemu_thread_signal(tcg_cpu_thread, SIGUSR1); + qemu_thread_signal(tcg_cpu_thread, SIG_IPI); if (!qemu_mutex_timedlock(&qemu_global_mutex, msecs)) break; } @@ -3602,7 +3667,7 @@ static void pause_all_vcpus(void) while (penv) { penv->stop = 1; - qemu_thread_signal(penv->thread, SIGUSR1); + qemu_thread_signal(penv->thread, SIG_IPI); qemu_cpu_kick(penv); penv = (CPUState *)penv->next_cpu; } @@ -3611,7 +3676,7 @@ static void pause_all_vcpus(void) qemu_cond_timedwait(&qemu_pause_cond, &qemu_global_mutex, 100); penv = first_cpu; while (penv) { - qemu_thread_signal(penv->thread, SIGUSR1); + qemu_thread_signal(penv->thread, SIG_IPI); penv = (CPUState *)penv->next_cpu; } } @@ -3624,7 +3689,7 @@ static void resume_all_vcpus(void) while (penv) { penv->stop = 0; penv->stopped = 0; - qemu_thread_signal(penv->thread, SIGUSR1); + qemu_thread_signal(penv->thread, SIG_IPI); qemu_cpu_kick(penv); penv = (CPUState *)penv->next_cpu; } From 6312b92853a7cd483533de0348dcd26edef74824 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 17 Feb 2010 20:14:43 -0200 Subject: [PATCH 7/8] kvm: remove pre-entry exit_request check with iothread enabled With SIG_IPI blocked vcpu loop exit notification happens via -EAGAIN from KVM_RUN. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- kvm-all.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kvm-all.c b/kvm-all.c index 38c372f34b..91d3cbd43e 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -753,11 +753,13 @@ int kvm_cpu_exec(CPUState *env) dprintf("kvm_cpu_exec()\n"); do { +#ifndef CONFIG_IOTHREAD if (env->exit_request) { dprintf("interrupt exit requested\n"); ret = 0; break; } +#endif if (env->kvm_vcpu_dirty) { kvm_arch_put_registers(env); From 85199474d0df23b87f0b4a3e330401b59aac3ec0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 22 Feb 2010 13:57:54 -0300 Subject: [PATCH 8/8] kvm-all.c: define smp_wmb and use it for coalesced mmio Acked-by: "Michael S. Tsirkin" Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- kvm-all.c | 3 ++- qemu-barrier.h | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 qemu-barrier.h diff --git a/kvm-all.c b/kvm-all.c index 91d3cbd43e..1a02076bfd 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -21,6 +21,7 @@ #include #include "qemu-common.h" +#include "qemu-barrier.h" #include "sysemu.h" #include "hw/hw.h" #include "gdbstub.h" @@ -730,7 +731,7 @@ void kvm_flush_coalesced_mmio_buffer(void) ent = &ring->coalesced_mmio[ring->first]; cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); - /* FIXME smp_wmb() */ + smp_wmb(); ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; } } diff --git a/qemu-barrier.h b/qemu-barrier.h new file mode 100644 index 0000000000..3bd1075d66 --- /dev/null +++ b/qemu-barrier.h @@ -0,0 +1,7 @@ +#ifndef __QEMU_BARRIER_H +#define __QEMU_BARRIER_H 1 + +/* FIXME: arch dependant, x86 version */ +#define smp_wmb() asm volatile("" ::: "memory") + +#endif