From 2654ace151c07bd6519a7b71ab98d763137302eb Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 16 Nov 2020 11:02:20 -0600 Subject: [PATCH 1/5] kvm/i386: Set proper nested state format for SVM Currently, the nested state format is hardcoded to VMX. This will result in kvm_put_nested_state() returning an error because the KVM SVM support checks for the nested state to be KVM_STATE_NESTED_FORMAT_SVM. As a result, kvm_arch_put_registers() errors out early. Update the setting of the format based on the virtualization feature: VMX - KVM_STATE_NESTED_FORMAT_VMX SVM - KVM_STATE_NESTED_FORMAT_SVM Also, fix the code formatting while at it. Fixes: b16c0e20c7 ("KVM: add support for AMD nested live migration") Cc: Eduardo Habkost Cc: Richard Henderson Cc: Paolo Bonzini Cc: Marcelo Tosatti Signed-off-by: Tom Lendacky Message-Id: Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini --- target/i386/kvm.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/target/i386/kvm.c b/target/i386/kvm.c index cf46259534..a2934dda02 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -1820,12 +1820,14 @@ int kvm_arch_init_vcpu(CPUState *cs) env->nested_state = g_malloc0(max_nested_state_len); env->nested_state->size = max_nested_state_len; - env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX; if (cpu_has_vmx(env)) { - vmx_hdr = &env->nested_state->hdr.vmx; - vmx_hdr->vmxon_pa = -1ull; - vmx_hdr->vmcs12_pa = -1ull; + env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX; + vmx_hdr = &env->nested_state->hdr.vmx; + vmx_hdr->vmxon_pa = -1ull; + vmx_hdr->vmcs12_pa = -1ull; + } else { + env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM; } } } From b430b51395650137a80e78ee7395165b80fe1752 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 16 Nov 2020 11:59:47 +0100 Subject: [PATCH 2/5] util/vfio-helpers.c: Use ram_block_discard_disable() in qemu_vfio_open_pci() Currently, when using "nvme://" for a block device, like -drive 
file=nvme://0000:01:00.0/1,if=none,id=drive0 \ -device virtio-blk,drive=drive0 \ VFIO may pin all guest memory, and discarding of RAM no longer works as expected. I was able to reproduce this easily with my 01:00.0 Non-Volatile memory controller: Samsung Electronics Co Ltd NVMe SSD Controller SM981/PM981/PM983 Similar to common VFIO, we have to disable it, making sure that: a) virtio-balloon won't discard any memory ("silently disabled") b) virtio-mem and nvme:// run mutually exclusive Cc: Paolo Bonzini Cc: "Michael S. Tsirkin" Cc: Alex Williamson Cc: Wei Yang Cc: Dr. David Alan Gilbert Cc: Igor Mammedov Cc: Pankaj Gupta Cc: Peter Xu Signed-off-by: David Hildenbrand Message-Id: <20201116105947.9194-1-david@redhat.com> Signed-off-by: Paolo Bonzini --- stubs/ram-block.c | 6 ++++++ util/vfio-helpers.c | 14 ++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/stubs/ram-block.c b/stubs/ram-block.c index 73c0a3ee08..108197683b 100644 --- a/stubs/ram-block.c +++ b/stubs/ram-block.c @@ -1,6 +1,7 @@ #include "qemu/osdep.h" #include "exec/ramlist.h" #include "exec/cpu-common.h" +#include "exec/memory.h" void *qemu_ram_get_host_addr(RAMBlock *rb) { @@ -29,3 +30,8 @@ int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque) { return 0; } + +int ram_block_discard_disable(bool state) +{ + return 0; +} diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c index c469beb061..2bec48e163 100644 --- a/util/vfio-helpers.c +++ b/util/vfio-helpers.c @@ -16,6 +16,7 @@ #include "qapi/error.h" #include "exec/ramlist.h" #include "exec/cpu-common.h" +#include "exec/memory.h" #include "trace.h" #include "qemu/error-report.h" #include "standard-headers/linux/pci_regs.h" @@ -494,8 +495,20 @@ QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp) int r; QEMUVFIOState *s = g_new0(QEMUVFIOState, 1); + /* + * VFIO may pin all memory inside mappings, resulting in it pinning + * all memory inside RAM blocks unconditionally. 
+ */ + r = ram_block_discard_disable(true); + if (r) { + error_setg_errno(errp, -r, "Cannot set discarding of RAM broken"); + g_free(s); + return NULL; + } + r = qemu_vfio_init_pci(s, device, errp); if (r) { + ram_block_discard_disable(false); g_free(s); return NULL; } @@ -837,4 +850,5 @@ void qemu_vfio_close(QEMUVFIOState *s) close(s->device); close(s->group); close(s->container); + ram_block_discard_disable(false); } From 3b12a7fd39307017c8968b8d05986a63b33752b5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 12 Nov 2020 10:52:04 +0100 Subject: [PATCH 3/5] scsi-disk: convert more errno values back to SCSI statuses Linux has some OS-specific (and sometimes weird) mappings for various SCSI statuses and sense codes. The most important is probably RESERVATION CONFLICT. Add them so that they can be reported back to the guest kernel. Cc: Hannes Reinecke Signed-off-by: Paolo Bonzini --- hw/scsi/scsi-disk.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index e859534eaf..90841ad791 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -461,6 +461,25 @@ static bool scsi_handle_rw_error(SCSIDiskReq *r, int error, bool acct_failed) } error = scsi_sense_buf_to_errno(r->req.sense, sizeof(r->req.sense)); break; +#ifdef CONFIG_LINUX + /* These errno mappings are specific to Linux. For more information: + * - scsi_decide_disposition in drivers/scsi/scsi_error.c + * - scsi_result_to_blk_status in drivers/scsi/scsi_lib.c + * - blk_errors[] in block/blk-core.c + */ + case EBADE: + /* DID_NEXUS_FAILURE -> BLK_STS_NEXUS. */ + scsi_req_complete(&r->req, RESERVATION_CONFLICT); + break; + case ENODATA: + /* DID_MEDIUM_ERROR -> BLK_STS_MEDIUM. */ + scsi_check_condition(r, SENSE_CODE(READ_ERROR)); + break; + case EREMOTEIO: + /* DID_TARGET_FAILURE -> BLK_STS_TARGET. 
*/ + scsi_req_complete(&r->req, HARDWARE_ERROR); + break; +#endif case ENOMEDIUM: scsi_check_condition(r, SENSE_CODE(NO_MEDIUM)); break; From 42ccce19818e4e8fb55026f50b20d533cccc48f6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Oct 2020 09:48:23 +0200 Subject: [PATCH 4/5] target/i386: avoid theoretical leak on MCE injection g_strdup_printf is used twice to write to the same variable, which can theoretically cause a leak. In practice, it is extremely unlikely that a guest is seeing a recursive MCE and has disabled CR4.MCE between the first and the second error, but we can fix it and we can also make a slight improvement on the logic: CR4.MCE=0 causes a triple fault even for a non-recursive machine check, so let's place its test first. Signed-off-by: Paolo Bonzini --- target/i386/helper.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/target/i386/helper.c b/target/i386/helper.c index 516ce0cad8..034f46bcc2 100644 --- a/target/i386/helper.c +++ b/target/i386/helper.c @@ -908,16 +908,14 @@ static void do_inject_x86_mce(CPUState *cs, run_on_cpu_data data) return; } - if (recursive) { - need_reset = true; - msg = g_strdup_printf("CPU %d: Previous MCE still in progress, " - "raising triple fault", cs->cpu_index); - } - if (!(cenv->cr[4] & CR4_MCE_MASK)) { need_reset = true; msg = g_strdup_printf("CPU %d: MCE capability is not enabled, " "raising triple fault", cs->cpu_index); + } else if (recursive) { + need_reset = true; + msg = g_strdup_printf("CPU %d: Previous MCE still in progress, " + "raising triple fault", cs->cpu_index); } if (need_reset) { From 1370d61ae3c9934861d2349349447605202f04e9 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Mon, 16 Nov 2020 21:22:10 +0800 Subject: [PATCH 5/5] memory: Skip dirty tracking for un-migratable memory regions It makes no sense to track dirty pages for those un-migratable memory regions (e.g., Memory BAR region of the VFIO PCI device) and doing so will potentially lead to some unpleasant 
issues during migration [1]. Skip dirty tracking for those regions by evaluating if the region is migratable before setting dirty_log_mask (DIRTY_MEMORY_MIGRATION). [1] https://lists.gnu.org/archive/html/qemu-devel/2020-11/msg03757.html Signed-off-by: Zenghui Yu Message-Id: <20201116132210.1730-1-yuzenghui@huawei.com> Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- softmmu/memory.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/softmmu/memory.c b/softmmu/memory.c index 71951fe4dc..aa393f1bb0 100644 --- a/softmmu/memory.c +++ b/softmmu/memory.c @@ -1806,7 +1806,10 @@ bool memory_region_is_ram_device(MemoryRegion *mr) uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr) { uint8_t mask = mr->dirty_log_mask; - if (global_dirty_log && (mr->ram_block || memory_region_is_iommu(mr))) { + RAMBlock *rb = mr->ram_block; + + if (global_dirty_log && ((rb && qemu_ram_is_migratable(rb)) || + memory_region_is_iommu(mr))) { mask |= (1 << DIRTY_MEMORY_MIGRATION); } return mask;