From e4bcec0c3c8527a3f2be2791fa6a89387b9116c1 Mon Sep 17 00:00:00 2001 From: Robert Hoo Date: Mon, 4 Jul 2022 16:58:52 +0800 Subject: [PATCH 01/16] acpi/nvdimm: Define trace events for NVDIMM and substitute nvdimm_debug() Signed-off-by: Robert Hoo Reviewed-by: Jingqi Liu Message-Id: <20220704085852.330005-1-robert.hu@linux.intel.com> Reviewed-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/nvdimm.c | 35 ++++++++++++++++------------------- hw/acpi/trace-events | 13 +++++++++++++ include/hw/mem/nvdimm.h | 8 -------- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 5f85b16327..31e46df0bd 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -35,6 +35,7 @@ #include "hw/nvram/fw_cfg.h" #include "hw/mem/nvdimm.h" #include "qemu/nvdimm-utils.h" +#include "trace.h" /* * define Byte Addressable Persistent Memory (PM) Region according to @@ -550,8 +551,8 @@ static void nvdimm_dsm_func_read_fit(NVDIMMState *state, NvdimmDsmIn *in, fit = fit_buf->fit; - nvdimm_debug("Read FIT: offset 0x%x FIT size 0x%x Dirty %s.\n", - read_fit->offset, fit->len, fit_buf->dirty ? "Yes" : "No"); + trace_acpi_nvdimm_read_fit(read_fit->offset, fit->len, + fit_buf->dirty ? "Yes" : "No"); if (read_fit->offset > fit->len) { func_ret_status = NVDIMM_DSM_RET_STATUS_INVALID; @@ -658,7 +659,7 @@ static void nvdimm_dsm_label_size(NVDIMMDevice *nvdimm, hwaddr dsm_mem_addr) label_size = nvdimm->label_size; mxfer = nvdimm_get_max_xfer_label_size(); - nvdimm_debug("label_size 0x%x, max_xfer 0x%x.\n", label_size, mxfer); + trace_acpi_nvdimm_label_info(label_size, mxfer); label_size_out.func_ret_status = cpu_to_le32(NVDIMM_DSM_RET_STATUS_SUCCESS); label_size_out.label_size = cpu_to_le32(label_size); @@ -674,20 +675,18 @@ static uint32_t nvdimm_rw_label_data_check(NVDIMMDevice *nvdimm, uint32_t ret = NVDIMM_DSM_RET_STATUS_INVALID; if (offset + length < offset) { - nvdimm_debug("offset 0x%x + length 0x%x is overflow.\n", offset, - length); + trace_acpi_nvdimm_label_overflow(offset, length); return ret; } if (nvdimm->label_size < offset + length) { - nvdimm_debug("position 0x%x is beyond label data (len = %" PRIx64 ").\n", - offset + length, nvdimm->label_size); + trace_acpi_nvdimm_label_oversize(offset + length, nvdimm->label_size); return ret; } if (length > nvdimm_get_max_xfer_label_size()) { - nvdimm_debug("length (0x%x) is larger than max_xfer (0x%x).\n", - length, nvdimm_get_max_xfer_label_size()); + trace_acpi_nvdimm_label_xfer_exceed(length, + nvdimm_get_max_xfer_label_size()); return ret; } @@ -710,8 +709,8 @@ static void nvdimm_dsm_get_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in, get_label_data->offset = le32_to_cpu(get_label_data->offset); get_label_data->length = le32_to_cpu(get_label_data->length); - nvdimm_debug("Read Label Data: offset 0x%x length 0x%x.\n", - get_label_data->offset, get_label_data->length); + trace_acpi_nvdimm_read_label(get_label_data->offset, + get_label_data->length); status = nvdimm_rw_label_data_check(nvdimm, get_label_data->offset, get_label_data->length); @@ -749,8 +748,8 @@ static void nvdimm_dsm_set_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in, set_label_data->offset = le32_to_cpu(set_label_data->offset); set_label_data->length = le32_to_cpu(set_label_data->length); - nvdimm_debug("Write Label Data: offset 0x%x length 0x%x.\n", - set_label_data->offset, set_label_data->length); + trace_acpi_nvdimm_write_label(set_label_data->offset, + set_label_data->length); status = 
nvdimm_rw_label_data_check(nvdimm, set_label_data->offset, set_label_data->length); @@ -821,7 +820,7 @@ static void nvdimm_dsm_device(NvdimmDsmIn *in, hwaddr dsm_mem_addr) static uint64_t nvdimm_dsm_read(void *opaque, hwaddr addr, unsigned size) { - nvdimm_debug("BUG: we never read _DSM IO Port.\n"); + trace_acpi_nvdimm_read_io_port(); return 0; } @@ -832,7 +831,7 @@ nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) NvdimmDsmIn *in; hwaddr dsm_mem_addr = val; - nvdimm_debug("dsm memory address 0x%" HWADDR_PRIx ".\n", dsm_mem_addr); + trace_acpi_nvdimm_dsm_mem_addr(dsm_mem_addr); /* * The DSM memory is mapped to guest address space so an evil guest @@ -846,12 +845,10 @@ nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) in->function = le32_to_cpu(in->function); in->handle = le32_to_cpu(in->handle); - nvdimm_debug("Revision 0x%x Handler 0x%x Function 0x%x.\n", in->revision, - in->handle, in->function); + trace_acpi_nvdimm_dsm_info(in->revision, in->handle, in->function); if (in->revision != 0x1 /* Currently we only support DSM Spec Rev1. */) { - nvdimm_debug("Revision 0x%x is not supported, expect 0x%x.\n", - in->revision, 0x1); + trace_acpi_nvdimm_invalid_revision(in->revision); nvdimm_dsm_no_payload(NVDIMM_DSM_RET_STATUS_UNSUPPORT, dsm_mem_addr); goto exit; } diff --git a/hw/acpi/trace-events b/hw/acpi/trace-events index 2250126a22..eb60b04f9b 100644 --- a/hw/acpi/trace-events +++ b/hw/acpi/trace-events @@ -70,3 +70,16 @@ acpi_erst_reset_out(unsigned record_count) "record_count %u" acpi_erst_post_load(void *header, unsigned slot_size) "header: 0x%p slot_size %u" acpi_erst_class_init_in(void) acpi_erst_class_init_out(void) + +# nvdimm.c +acpi_nvdimm_read_fit(uint32_t offset, uint32_t len, const char *dirty) "Read FIT: offset 0x%" PRIx32 " FIT size 0x%" PRIx32 " Dirty %s" +acpi_nvdimm_label_info(uint32_t label_size, uint32_t mxfer) "label_size 0x%" PRIx32 ", max_xfer 0x%" PRIx32 +acpi_nvdimm_label_overflow(uint32_t offset, uint32_t length) "offset 0x%" PRIx32 " + length 0x%" PRIx32 " is overflow" +acpi_nvdimm_label_oversize(uint32_t pos, uint64_t size) "position 0x%" PRIx32 " is beyond label data (len = %" PRIu64 ")" +acpi_nvdimm_label_xfer_exceed(uint32_t length, uint32_t max_xfer) "length (0x%" PRIx32 ") is larger than max_xfer (0x%" PRIx32 ")" +acpi_nvdimm_read_label(uint32_t offset, uint32_t length) "Read Label Data: offset 0x%" PRIx32 " length 0x%" PRIx32 +acpi_nvdimm_write_label(uint32_t offset, uint32_t length) "Write Label Data: offset 0x%" PRIx32 " length 0x%" PRIx32 +acpi_nvdimm_read_io_port(void) "Alert: we never read _DSM IO Port" +acpi_nvdimm_dsm_mem_addr(uint64_t dsm_mem_addr) "dsm memory address 0x%" PRIx64 +acpi_nvdimm_dsm_info(uint32_t revision, uint32_t handle, uint32_t function) "Revision 0x%" PRIx32 " Handle 0x%" PRIx32 " Function 0x%" PRIx32 +acpi_nvdimm_invalid_revision(uint32_t revision) "Revision 0x%" PRIx32 " is not supported, expect 0x1" diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index cf8f59be44..acf887c83d 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -29,14 +29,6 @@ #include "hw/acpi/aml-build.h" #include "qom/object.h" -#define NVDIMM_DEBUG 0 -#define nvdimm_debug(fmt, ...) 
\ - do { \ if (NVDIMM_DEBUG) { \ fprintf(stderr, "nvdimm: " fmt, ## __VA_ARGS__); \ } \ } while (0) - /* * The minimum label data size is required by NVDIMM Namespace * specification, see the chapter 2 Namespaces: From 71a5f07e75ba34072a074181d37009abc28d7083 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 1 Jul 2022 14:22:58 +0100 Subject: [PATCH 02/16] hw/machine: Clear out left over CXL related pointer from move of state handling to machines. This got left behind in the move of the CXL setup code from core files to the machines that support it. Link: https://gitlab.com/qemu-project/qemu/-/commit/1ebf9001fb2701e3c00b401334c8f3900a46adaa Signed-off-by: Jonathan Cameron Message-Id: <20220701132300.2264-2-Jonathan.Cameron@huawei.com> Acked-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/hw/boards.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/hw/boards.h b/include/hw/boards.h index d94edcef28..7b416c9787 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -360,7 +360,6 @@ struct MachineState { CpuTopology smp; struct NVDIMMState *nvdimms_state; struct NumaState *numa_state; - CXLFixedMemoryWindowOptionsList *cfmws_list; }; #define DEFINE_MACHINE(namestr, machine_initfn) \ From 4a447a710c523392b09391232d5a3dfb156a9d75 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 1 Jul 2022 14:22:59 +0100 Subject: [PATCH 03/16] hw/i386/pc: Always place CXL Memory Regions after device_memory Previously broken_reserved_end was taken into account, but Igor Mammedov identified that this could lead to a clash between potential RAM being mapped in the region and CXL usage. Hence always add the size of the device_memory memory region. This only affects the case where the broken_reserved_end flag was set. Fixes: 6e4e3ae936e6 ("hw/cxl/component: Implement host bridge MMIO (8.2.5, table 142)") Reported-by: Igor Mammedov Signed-off-by: Jonathan Cameron Message-Id: <20220701132300.2264-3-Jonathan.Cameron@huawei.com> Acked-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index d2b5823ffb..46ab1dcb47 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -922,10 +922,8 @@ void pc_memory_init(PCMachineState *pcms, hwaddr cxl_size = MiB; if (pcmc->has_reserved_memory && machine->device_memory->base) { - cxl_base = machine->device_memory->base; - if (!pcmc->broken_reserved_end) { - cxl_base += memory_region_size(&machine->device_memory->mr); - } + cxl_base = machine->device_memory->base + + memory_region_size(&machine->device_memory->mr); } else if (pcms->sgx_epc.size != 0) { cxl_base = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { From cb70b7e8712e17e5761a7447defdce5572cd4b80 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Fri, 1 Jul 2022 14:23:00 +0100 Subject: [PATCH 04/16] hw/cxl: Fix size of constant in interleave granularity function. Whilst the interleave granularity is always small enough that this isn't a real problem (much less than 4GiB), let's change the constant to ULL to fix the Coverity warning. Reported-by: Peter Maydell Fixes: 829de299d1 ("hw/cxl/component: Add utils for interleave parameter encoding/decoding") Fixes: Coverity CID 1488868 Signed-off-by: Jonathan Cameron Message-Id: <20220701132300.2264-4-Jonathan.Cameron@huawei.com> Acked-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- include/hw/cxl/cxl_component.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hw/cxl/cxl_component.h b/include/hw/cxl/cxl_component.h index 70b5018156..94ec2f07d7 100644 --- a/include/hw/cxl/cxl_component.h +++ b/include/hw/cxl/cxl_component.h @@ -215,7 +215,7 @@ uint8_t cxl_interleave_granularity_enc(uint64_t gran, Error **errp); static inline hwaddr cxl_decode_ig(int ig) { - return 1 << (ig + 8); + return 1ULL << (ig + 8); } CXLComponentState *cxl_get_hb_cstate(PCIHostState *hb); From 4ab4c33014b4876bc6d7888efecd6bfcca0d045a Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:04 +0100 Subject: [PATCH 05/16] hw/i386: add 4g boundary start to X86MachineState Rather than hardcoding the 4G boundary everywhere, introduce a X86MachineState field @above_4g_mem_start and use it accordingly. This is in preparation for relocating ram-above-4g to dynamically start at 1T on AMD platforms. Signed-off-by: Joao Martins Reviewed-by: Igor Mammedov Message-Id: <20220719170014.27028-2-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-build.c | 2 +- hw/i386/pc.c | 11 ++++++----- hw/i386/sgx.c | 2 +- hw/i386/x86.c | 1 + include/hw/i386/x86.h | 3 +++ 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index cad6f5ac41..0355bd3dda 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2024,7 +2024,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine) build_srat_memory(table_data, mem_base, mem_len, i - 1, MEM_AFFINITY_ENABLED); } - mem_base = 1ULL << 32; + mem_base = x86ms->above_4g_mem_start; mem_len = next_base - x86ms->below_4g_mem_size; next_base = mem_base + mem_len; } diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 46ab1dcb47..13b68307be 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -850,9 +850,10 @@ void pc_memory_init(PCMachineState *pcms, machine->ram, x86ms->below_4g_mem_size, x86ms->above_4g_mem_size); - memory_region_add_subregion(system_memory, 0x100000000ULL, + memory_region_add_subregion(system_memory, x86ms->above_4g_mem_start, ram_above_4g); - e820_add_entry(0x100000000ULL, x86ms->above_4g_mem_size, E820_RAM); + e820_add_entry(x86ms->above_4g_mem_start, x86ms->above_4g_mem_size, + E820_RAM); } if (pcms->sgx_epc.size != 0) { @@ -893,7 +894,7 @@ void pc_memory_init(PCMachineState *pcms, machine->device_memory->base = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { machine->device_memory->base = - 0x100000000ULL + x86ms->above_4g_mem_size; + x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } machine->device_memory->base = @@ -927,7 +928,7 @@ void pc_memory_init(PCMachineState *pcms, } else if (pcms->sgx_epc.size != 0) { cxl_base = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { - cxl_base = 0x100000000ULL + x86ms->above_4g_mem_size; + cxl_base = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } e820_add_entry(cxl_base, cxl_size, E820_RESERVED); @@ -1035,7 +1036,7 @@ uint64_t pc_pci_hole64_start(void) } else if (pcms->sgx_epc.size != 0) { hole64_start = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { - hole64_start = 0x100000000ULL + x86ms->above_4g_mem_size; + hole64_start = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } return ROUND_UP(hole64_start, 1 * GiB); diff --git a/hw/i386/sgx.c b/hw/i386/sgx.c index a44d66ba2a..09d9c7c73d 100644 --- a/hw/i386/sgx.c +++ b/hw/i386/sgx.c @@ -295,7 +295,7 @@ void pc_machine_init_sgx_epc(PCMachineState *pcms) return; } -
sgx_epc->base = 0x100000000ULL + x86ms->above_4g_mem_size; + sgx_epc->base = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; memory_region_init(&sgx_epc->mr, OBJECT(pcms), "sgx-epc", UINT64_MAX); memory_region_add_subregion(get_system_memory(), sgx_epc->base, diff --git a/hw/i386/x86.c b/hw/i386/x86.c index ecea25d249..050eedc0c8 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -1391,6 +1391,7 @@ static void x86_machine_initfn(Object *obj) x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); x86ms->bus_lock_ratelimit = 0; + x86ms->above_4g_mem_start = 4 * GiB; } static void x86_machine_class_init(ObjectClass *oc, void *data) diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h index 6bdf1f6ab2..62fa5774f8 100644 --- a/include/hw/i386/x86.h +++ b/include/hw/i386/x86.h @@ -56,6 +56,9 @@ struct X86MachineState { /* RAM information (sizes, addresses, configuration): */ ram_addr_t below_4g_mem_size, above_4g_mem_size; + /* Start address of the initial RAM above 4G */ + uint64_t above_4g_mem_start; + /* CPU and apic information: */ bool apic_xrupt_override; unsigned pci_irq_mask; From 4876778749021a59a03dd3da3518dc861bc95ff8 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:05 +0100 Subject: [PATCH 06/16] i386/pc: create pci-host qdev prior to pc_memory_init() At the start of pc_memory_init() we usually pass a range of 0..UINT64_MAX as pci_memory, when really it's 2G (i440fx) or 32G (q35). To get the real user value, we need to read the pci-host's passed property for the default pci_hole64_size. Thus, to get that, create the qdev prior to memory init, to better estimate the max used/phys addr. This is in preparation for determining that host-phys-bits are enough, and also for pci-hole64-size to be considered when relocating ram-above-4g to be at 1T (on AMD platforms). Signed-off-by: Joao Martins Reviewed-by: Igor Mammedov Message-Id: <20220719170014.27028-3-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc_piix.c | 7 +++++-- hw/i386/pc_q35.c | 6 +++--- hw/pci-host/i440fx.c | 5 ++--- include/hw/pci-host/i440fx.h | 3 ++- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index fbf9465318..b8b3ce3408 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -91,6 +91,7 @@ static void pc_init1(MachineState *machine, MemoryRegion *pci_memory; MemoryRegion *rom_memory; ram_addr_t lowmem; + DeviceState *i440fx_host; /* * Calculate ram split, for memory below and above 4G. It's a bit @@ -164,9 +165,11 @@ static void pc_init1(MachineState *machine, pci_memory = g_new(MemoryRegion, 1); memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); rom_memory = pci_memory; + i440fx_host = qdev_new(host_type); } else { pci_memory = NULL; rom_memory = system_memory; + i440fx_host = NULL; } pc_guest_info_init(pcms); @@ -200,8 +203,8 @@ static void pc_init1(MachineState *machine, const char *type = xen_enabled() ?
TYPE_PIIX3_XEN_DEVICE : TYPE_PIIX3_DEVICE; - pci_bus = i440fx_init(host_type, - pci_type, + pci_bus = i440fx_init(pci_type, + i440fx_host, system_memory, system_io, machine->ram_size, x86ms->below_4g_mem_size, x86ms->above_4g_mem_size, diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 12cc76aaf8..f4d23b1469 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -203,12 +203,12 @@ static void pc_q35_init(MachineState *machine) pcms->smbios_entry_point_type); } - /* allocate ram and load rom/bios */ - pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory); - /* create pci host bus */ q35_host = Q35_HOST_DEVICE(qdev_new(TYPE_Q35_HOST_DEVICE)); + /* allocate ram and load rom/bios */ + pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory); + object_property_add_child(qdev_get_machine(), "q35", OBJECT(q35_host)); object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_RAM_MEM, OBJECT(ram_memory), NULL); diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c index 1c5ad5f918..d5426ef4a5 100644 --- a/hw/pci-host/i440fx.c +++ b/hw/pci-host/i440fx.c @@ -237,7 +237,8 @@ static void i440fx_realize(PCIDevice *dev, Error **errp) } } -PCIBus *i440fx_init(const char *host_type, const char *pci_type, +PCIBus *i440fx_init(const char *pci_type, + DeviceState *dev, MemoryRegion *address_space_mem, MemoryRegion *address_space_io, ram_addr_t ram_size, @@ -246,7 +247,6 @@ PCIBus *i440fx_init(const char *host_type, const char *pci_type, MemoryRegion *pci_address_space, MemoryRegion *ram_memory) { - DeviceState *dev; PCIBus *b; PCIDevice *d; PCIHostState *s; @@ -254,7 +254,6 @@ PCIBus *i440fx_init(const char *host_type, const char *pci_type, unsigned i; I440FXState *i440fx; - dev = qdev_new(host_type); s = PCI_HOST_BRIDGE(dev); b = pci_root_bus_new(dev, NULL, pci_address_space, address_space_io, 0, TYPE_PCI_BUS); diff --git a/include/hw/pci-host/i440fx.h b/include/hw/pci-host/i440fx.h index 52518dbf08..d02bf1ed6b 100644 --- a/include/hw/pci-host/i440fx.h +++ b/include/hw/pci-host/i440fx.h @@ -35,7 +35,8 @@ struct PCII440FXState { #define TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE "igd-passthrough-i440FX" -PCIBus *i440fx_init(const char *host_type, const char *pci_type, +PCIBus *i440fx_init(const char *pci_type, + DeviceState *dev, MemoryRegion *address_space_mem, MemoryRegion *address_space_io, ram_addr_t ram_size, From c48eb7a4e869f338e5b3e233fed7c9dbb0520247 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:06 +0100 Subject: [PATCH 07/16] i386/pc: pass pci_hole64_size to pc_memory_init() Use the pre-initialized pci-host qdev and fetch the pci-hole64-size into pc_memory_init()'s newly added argument. Use the PCI_HOST_PROP_PCI_HOLE64_SIZE pci-host property for fetching pci-hole64-size. This is in preparation for determining that host-phys-bits are enough and for pci-hole64-size to be considered when relocating ram-above-4g to be at 1T (on AMD platforms). Signed-off-by: Joao Martins Reviewed-by: Igor Mammedov Message-Id: <20220719170014.27028-4-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- hw/i386/pc.c | 3 ++- hw/i386/pc_piix.c | 7 ++++++- hw/i386/pc_q35.c | 10 +++++++++- include/hw/i386/pc.h | 3 ++- 4 files changed, 19 insertions(+), 4 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 13b68307be..f4d5b25fdd 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -817,7 +817,8 @@ void xen_load_linux(PCMachineState *pcms) void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, - MemoryRegion **ram_memory) + MemoryRegion **ram_memory, + uint64_t pci_hole64_size) { int linux_boot, i; MemoryRegion *option_rom_mr; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index b8b3ce3408..aa191d405a 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -91,6 +91,7 @@ static void pc_init1(MachineState *machine, MemoryRegion *pci_memory; MemoryRegion *rom_memory; ram_addr_t lowmem; + uint64_t hole64_size; DeviceState *i440fx_host; /* @@ -166,10 +167,14 @@ static void pc_init1(MachineState *machine, memory_region_init(pci_memory, NULL, "pci", UINT64_MAX); rom_memory = pci_memory; i440fx_host = qdev_new(host_type); + hole64_size = object_property_get_uint(OBJECT(i440fx_host), + PCI_HOST_PROP_PCI_HOLE64_SIZE, + &error_abort); } else { pci_memory = NULL; rom_memory = system_memory; i440fx_host = NULL; + hole64_size = 0; } pc_guest_info_init(pcms); @@ -186,7 +191,7 @@ static void pc_init1(MachineState *machine, /* allocate ram and load rom/bios */ if (!xen_enabled()) { pc_memory_init(pcms, system_memory, - rom_memory, &ram_memory); + rom_memory, &ram_memory, hole64_size); } else { pc_system_flash_cleanup_unused(pcms); if (machine->kernel_filename != NULL) { diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index f4d23b1469..307910b33c 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -138,6 +138,7 @@ static void pc_q35_init(MachineState *machine) MachineClass *mc = MACHINE_GET_CLASS(machine); bool acpi_pcihp; bool keep_pci_slot_hpc; + uint64_t pci_hole64_size = 0; /* Check whether RAM fits below 4G (leaving 1/2 GByte for IO memory * and 256 Mbytes for PCI Express Enhanced Configuration Access Mapping @@ -206,8 +207,15 @@ static void pc_q35_init(MachineState *machine) /* create pci host bus */ q35_host = Q35_HOST_DEVICE(qdev_new(TYPE_Q35_HOST_DEVICE)); + if (pcmc->pci_enabled) { + pci_hole64_size = object_property_get_uint(OBJECT(q35_host), + PCI_HOST_PROP_PCI_HOLE64_SIZE, + &error_abort); + } + /* allocate ram and load rom/bios */ - pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory); + pc_memory_init(pcms, get_system_memory(), rom_memory, &ram_memory, + pci_hole64_size); object_property_add_child(qdev_get_machine(), "q35", OBJECT(q35_host)); object_property_set_link(OBJECT(q35_host), MCH_HOST_PROP_RAM_MEM, diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 2a8ffbcfa8..01938fce4c 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -162,7 +162,8 @@ void xen_load_linux(PCMachineState *pcms); void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, - MemoryRegion **ram_memory); + MemoryRegion **ram_memory, + uint64_t pci_hole64_size); uint64_t pc_pci_hole64_start(void); DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus); void pc_basic_device_init(struct PCMachineState *pcms, From 5ff62e2afedf5c984048f90a881794622a9eb3e8 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:07 +0100 Subject: [PATCH 08/16] i386/pc: factor out above-4g end to a helper There are a couple of places that duplicate this calculation of RAM size
above the 4G boundary. Move all those to a helper function. Signed-off-by: Joao Martins Reviewed-by: Igor Mammedov Message-Id: <20220719170014.27028-5-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index f4d5b25fdd..d1e20ccb27 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -814,6 +814,17 @@ void xen_load_linux(PCMachineState *pcms) #define PC_ROM_ALIGN 0x800 #define PC_ROM_SIZE (PC_ROM_MAX - PC_ROM_MIN_VGA) +static hwaddr pc_above_4g_end(PCMachineState *pcms) +{ + X86MachineState *x86ms = X86_MACHINE(pcms); + + if (pcms->sgx_epc.size != 0) { + return sgx_epc_above_4g_end(&pcms->sgx_epc); + } + + return x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; +} + void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, @@ -891,15 +902,8 @@ void pc_memory_init(PCMachineState *pcms, exit(EXIT_FAILURE); } - if (pcms->sgx_epc.size != 0) { - machine->device_memory->base = sgx_epc_above_4g_end(&pcms->sgx_epc); - } else { - machine->device_memory->base = - x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; - } - machine->device_memory->base = - ROUND_UP(machine->device_memory->base, 1 * GiB); + ROUND_UP(pc_above_4g_end(pcms), 1 * GiB); if (pcmc->enforce_aligned_dimm) { /* size device region assuming 1G page max alignment per slot */ @@ -926,10 +930,8 @@ void pc_memory_init(PCMachineState *pcms, if (pcmc->has_reserved_memory && machine->device_memory->base) { cxl_base = machine->device_memory->base + memory_region_size(&machine->device_memory->mr); - } else if (pcms->sgx_epc.size != 0) { - cxl_base = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { - cxl_base = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; + cxl_base = pc_above_4g_end(pcms); } e820_add_entry(cxl_base, cxl_size, E820_RESERVED); @@ -1016,7 +1018,6 @@ uint64_t pc_pci_hole64_start(void) PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); MachineState *ms = MACHINE(pcms); - X86MachineState *x86ms = X86_MACHINE(pcms); uint64_t hole64_start = 0; if (pcms->cxl_devices_state.host_mr.addr) { @@ -1034,10 +1035,8 @@ uint64_t pc_pci_hole64_start(void) if (!pcmc->broken_reserved_end) { hole64_start += memory_region_size(&ms->device_memory->mr); } - } else if (pcms->sgx_epc.size != 0) { - hole64_start = sgx_epc_above_4g_end(&pcms->sgx_epc); } else { - hole64_start = x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; + hole64_start = pc_above_4g_end(pcms); } return ROUND_UP(hole64_start, 1 * GiB); From 55668e409bcaf04e6af7acb44037d5f18c6b786e Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:08 +0100 Subject: [PATCH 09/16] i386/pc: factor out cxl range end to a helper Move the calculation of the CXL memory region end to a separate helper. This is in preparation for a future change that removes the CXL range dependency on the CXL memory region, with the goal of allowing pc_pci_hole64_start() to be called before any memory regions are initialized. Cc: Jonathan Cameron Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-6-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- hw/i386/pc.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index d1e20ccb27..cb27309e76 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -825,6 +825,25 @@ static hwaddr pc_above_4g_end(PCMachineState *pcms) return x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } +static uint64_t pc_get_cxl_range_end(PCMachineState *pcms) +{ + uint64_t start = 0; + + if (pcms->cxl_devices_state.host_mr.addr) { + start = pcms->cxl_devices_state.host_mr.addr + + memory_region_size(&pcms->cxl_devices_state.host_mr); + if (pcms->cxl_devices_state.fixed_windows) { + GList *it; + for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) { + CXLFixedWindow *fw = it->data; + start = fw->mr.addr + memory_region_size(&fw->mr); + } + } + } + + return start; +} + void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, @@ -1020,16 +1039,8 @@ uint64_t pc_pci_hole64_start(void) MachineState *ms = MACHINE(pcms); uint64_t hole64_start = 0; - if (pcms->cxl_devices_state.host_mr.addr) { - hole64_start = pcms->cxl_devices_state.host_mr.addr + - memory_region_size(&pcms->cxl_devices_state.host_mr); - if (pcms->cxl_devices_state.fixed_windows) { - GList *it; - for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) { - CXLFixedWindow *fw = it->data; - hole64_start = fw->mr.addr + memory_region_size(&fw->mr); - } - } + if (pcms->cxl_devices_state.is_enabled) { + hole64_start = pc_get_cxl_range_end(pcms); } else if (pcmc->has_reserved_memory && ms->device_memory->base) { hole64_start = ms->device_memory->base; if (!pcmc->broken_reserved_end) { From 42bed0712725ce338fc1cad725c3d9cf5cf443dc Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:09 +0100 Subject: [PATCH 10/16] i386/pc: factor out cxl range start to a helper Factor out the calculation of the base address of the memory region. It will be used later on for the CXL range end counterpart calculation, as well as in pc_memory_init()'s CXL memory region initialization, thus avoiding duplication. Cc: Jonathan Cameron Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-7-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin --- hw/i386/pc.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index cb27309e76..9e1a067c41 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -825,6 +825,22 @@ static hwaddr pc_above_4g_end(PCMachineState *pcms) return x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } +static uint64_t pc_get_cxl_range_start(PCMachineState *pcms) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + MachineState *machine = MACHINE(pcms); + hwaddr cxl_base; + + if (pcmc->has_reserved_memory && machine->device_memory->base) { + cxl_base = machine->device_memory->base + + memory_region_size(&machine->device_memory->mr); + } else { + cxl_base = pc_above_4g_end(pcms); + } + + return cxl_base; +} + static uint64_t pc_get_cxl_range_end(PCMachineState *pcms) { uint64_t start = 0; @@ -946,13 +962,7 @@ void pc_memory_init(PCMachineState *pcms, MemoryRegion *mr = &pcms->cxl_devices_state.host_mr; hwaddr cxl_size = MiB; - if (pcmc->has_reserved_memory && machine->device_memory->base) { - cxl_base = machine->device_memory->base - + memory_region_size(&machine->device_memory->mr); - } else { - cxl_base = pc_above_4g_end(pcms); - } - + cxl_base = pc_get_cxl_range_start(pcms); e820_add_entry(cxl_base, cxl_size, E820_RESERVED); memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size); memory_region_add_subregion(system_memory, cxl_base, mr); From 1065b21993628a00066a2ab1439bfff5d186c12d Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:10 +0100 Subject: [PATCH 11/16] i386/pc: handle uninitialized mr in pc_get_cxl_range_end() Remove pc_get_cxl_range_end()'s dependency on the CXL memory region, and replace it with one that does not require the CXL host_mr to determine the CXL range end. This is in preparation for allowing pc_pci_hole64_start() to be called early in pc_memory_init(), handling the CXL memory region end even when its underlying memory region isn't yet initialized. Cc: Jonathan Cameron Signed-off-by: Joao Martins Message-Id: <20220719170014.27028-8-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S.
Tsirkin Acked-by: Igor Mammedov --- hw/i386/pc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 9e1a067c41..611eb197da 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -843,17 +843,15 @@ static uint64_t pc_get_cxl_range_start(PCMachineState *pcms) static uint64_t pc_get_cxl_range_end(PCMachineState *pcms) { - uint64_t start = 0; + uint64_t start = pc_get_cxl_range_start(pcms) + MiB; - if (pcms->cxl_devices_state.host_mr.addr) { - start = pcms->cxl_devices_state.host_mr.addr + - memory_region_size(&pcms->cxl_devices_state.host_mr); - if (pcms->cxl_devices_state.fixed_windows) { - GList *it; - for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) { - CXLFixedWindow *fw = it->data; - start = fw->mr.addr + memory_region_size(&fw->mr); - } + if (pcms->cxl_devices_state.fixed_windows) { + GList *it; + + start = ROUND_UP(start, 256 * MiB); + for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) { + CXLFixedWindow *fw = it->data; + start += fw->size; } } From 8288a8286d00e074b765f1d47ee61159ed5291fd Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:11 +0100 Subject: [PATCH 12/16] i386/pc: factor out device_memory base/size to a helper Move obtaining hole64_start from the device_memory memory region base/size into a helper, alongside corresponding getters in pc_memory_init(), for when the hotplug range is uninitialized. While doing that, remove the memory-region-based logic from this newly added helper. This is the final step that allows pc_pci_hole64_start() to be callable at the beginning of pc_memory_init() before any memory regions are initialized. Cc: Jonathan Cameron Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-9-joao.m.martins@oracle.com> Reviewed-by: Michael S.
Tsirkin --- hw/i386/pc.c | 48 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 611eb197da..ebc27e4cb7 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -825,15 +825,36 @@ static hwaddr pc_above_4g_end(PCMachineState *pcms) return x86ms->above_4g_mem_start + x86ms->above_4g_mem_size; } -static uint64_t pc_get_cxl_range_start(PCMachineState *pcms) +static void pc_get_device_memory_range(PCMachineState *pcms, + hwaddr *base, + ram_addr_t *device_mem_size) { PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); MachineState *machine = MACHINE(pcms); - hwaddr cxl_base; + ram_addr_t size; + hwaddr addr; - if (pcmc->has_reserved_memory && machine->device_memory->base) { - cxl_base = machine->device_memory->base - + memory_region_size(&machine->device_memory->mr); + size = machine->maxram_size - machine->ram_size; + addr = ROUND_UP(pc_above_4g_end(pcms), 1 * GiB); + + if (pcmc->enforce_aligned_dimm) { + /* size device region assuming 1G page max alignment per slot */ + size += (1 * GiB) * machine->ram_slots; + } + + *base = addr; + *device_mem_size = size; +} + +static uint64_t pc_get_cxl_range_start(PCMachineState *pcms) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); + hwaddr cxl_base; + ram_addr_t size; + + if (pcmc->has_reserved_memory) { + pc_get_device_memory_range(pcms, &cxl_base, &size); + cxl_base += size; } else { cxl_base = pc_above_4g_end(pcms); } @@ -920,7 +941,7 @@ void pc_memory_init(PCMachineState *pcms, /* initialize device memory address space */ if (pcmc->has_reserved_memory && (machine->ram_size < machine->maxram_size)) { - ram_addr_t device_mem_size = machine->maxram_size - machine->ram_size; + ram_addr_t device_mem_size; if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) { error_report("unsupported amount of memory slots: %"PRIu64, @@ -935,13 +956,7 @@ void pc_memory_init(PCMachineState *pcms, exit(EXIT_FAILURE); } - machine->device_memory->base = - ROUND_UP(pc_above_4g_end(pcms), 1 * GiB); - - if (pcmc->enforce_aligned_dimm) { - /* size device region assuming 1G page max alignment per slot */ - device_mem_size += (1 * GiB) * machine->ram_slots; - } + pc_get_device_memory_range(pcms, &machine->device_memory->base, &device_mem_size); if ((machine->device_memory->base + device_mem_size) < device_mem_size) { @@ -1046,13 +1061,14 @@ uint64_t pc_pci_hole64_start(void) PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); MachineState *ms = MACHINE(pcms); uint64_t hole64_start = 0; + ram_addr_t size = 0; if (pcms->cxl_devices_state.is_enabled) { hole64_start = pc_get_cxl_range_end(pcms); - } else if (pcmc->has_reserved_memory && ms->device_memory->base) { - hole64_start = ms->device_memory->base; + } else if (pcmc->has_reserved_memory && (ms->ram_size < ms->maxram_size)) { + pc_get_device_memory_range(pcms, &hole64_start, &size); if (!pcmc->broken_reserved_end) { - hole64_start += memory_region_size(&ms->device_memory->mr); + hole64_start += size; } } else { hole64_start = pc_above_4g_end(pcms); } return ROUND_UP(hole64_start, 1 * GiB); From 1caab5cf86bdf02fa71079c4ca4a3c0748f39eb5 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:12 +0100 Subject: [PATCH 13/16] i386/pc: bounds check phys-bits against max used GPA Calculate the max *used* GPA against the CPU maximum possible address and error out if the former surpasses the latter. This ensures the max used GPA is reachable by the configured phys-bits.
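In essence, the check compares the highest guest-physical address the configured memory map will use against the highest address the vCPU's phys-bits can express. A minimal standalone sketch of that comparison (gpa_reachable() is a hypothetical stand-in for the pc_max_used_gpa()-based check in the diff below):

#include <stdbool.h>
#include <stdint.h>

/* True if a guest-physical address is expressible with phys_bits bits. */
static bool gpa_reachable(unsigned phys_bits, uint64_t max_used_gpa)
{
    uint64_t max_phys_addr = ((uint64_t)1 << phys_bits) - 1;

    return max_used_gpa <= max_phys_addr;
}

/* gpa_reachable(40, 0xffffffffffULL)  -> true:  40 bits reach exactly 1Tb - 1 */
/* gpa_reachable(40, 0x10000000000ULL) -> false: QEMU errors out instead       */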
Default phys-bits on QEMU is TCG_PHYS_ADDR_BITS (40), which is enough for the CPU to address 1Tb (0xff ffff ffff) or 1010G (0xfc ffff ffff) in AMD hosts with IOMMU. This is preparation for AMD guests with >1010G, where we will want to relocate ram-above-4g to be after 1Tb instead of 4G. Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-10-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index ebc27e4cb7..56d8c179ea 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -879,6 +879,18 @@ static uint64_t pc_get_cxl_range_end(PCMachineState *pcms) return start; } +static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size) +{ + X86CPU *cpu = X86_CPU(first_cpu); + + /* 32-bit systems don't have hole64 thus return max CPU address */ + if (cpu->phys_bits <= 32) { + return ((hwaddr)1 << cpu->phys_bits) - 1; + } + + return pc_pci_hole64_start() + pci_hole64_size - 1; +} + void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, @@ -893,13 +905,28 @@ void pc_memory_init(PCMachineState *pcms, MachineClass *mc = MACHINE_GET_CLASS(machine); PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); X86MachineState *x86ms = X86_MACHINE(pcms); + hwaddr maxphysaddr, maxusedaddr; hwaddr cxl_base, cxl_resv_end = 0; + X86CPU *cpu = X86_CPU(first_cpu); assert(machine->ram_size == x86ms->below_4g_mem_size + x86ms->above_4g_mem_size); linux_boot = (machine->kernel_filename != NULL); + /* + * phys-bits is required to be appropriately configured + * to make sure max used GPA is reachable. + */ + maxusedaddr = pc_max_used_gpa(pcms, pci_hole64_size); + maxphysaddr = ((hwaddr)1 << cpu->phys_bits) - 1; + if (maxphysaddr < maxusedaddr) { + error_report("Address space limit 0x%"PRIx64" < 0x%"PRIx64 " phys-bits too low (%u)", + maxphysaddr, maxusedaddr, cpu->phys_bits); + exit(EXIT_FAILURE); + } + /* * Split single memory region and use aliases to address portions of it, * done for backwards compatibility with older qemus. From 8504f129450b909c88e199ca44facd35d38ba4de Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:13 +0100 Subject: [PATCH 14/16] i386/pc: relocate 4g start to 1T where applicable It is assumed that the whole GPA space is available to be DMA addressable, within a given address space limit, except for a tiny region below 4G. Since Linux v5.4, VFIO validates whether the selected GPA is indeed valid, i.e. not reserved by the IOMMU on behalf of some specific devices or platform-defined restrictions, failing the ioctl(VFIO_DMA_MAP) with -EINVAL when it is not. AMD systems with an IOMMU are examples of such platforms and particularly may only have these ranges as allowed: 0000000000000000 - 00000000fedfffff (0 .. 3.982G) 00000000fef00000 - 000000fcffffffff (3.983G .. 1011.9G) 0000010000000000 - ffffffffffffffff (1Tb .. 16Pb[*]) We already account for the 4G hole, but if the guest is big enough we will fail to allocate a guest with >1010G due to the ~12G hole at the 1Tb boundary, reserved for HyperTransport (HT).
[*] There is another reserved region, unrelated to HT, at the 256T boundary on Fam 17h, according to Errata #1286, documented also in "Open-Source Register Reference for AMD Family 17h Processors (PUB)". When creating the region above 4G, take into account that on AMD platforms the HyperTransport range is reserved and hence cannot be used for GPAs either. In those cases, rather than establishing the start of ram-above-4g to be 4G, relocate it instead to 1Tb. See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology", for more information on the underlying restriction of IOVAs. After accounting for the 1Tb hole on AMD hosts, mtree should look like: 0000000000000000-000000007fffffff (prio 0, i/o): alias ram-below-4g @pc.ram 0000000000000000-000000007fffffff 0000010000000000-000001ff7fffffff (prio 0, i/o): alias ram-above-4g @pc.ram 0000000080000000-000000ffffffffff If the relocation is done or the address space covers it, we also add the reserved HT e820 range as reserved. Default phys-bits on QEMU is TCG_PHYS_ADDR_BITS (40), which is enough to address 1Tb (0xff ffff ffff). On AMD platforms, if a ram-above-4g relocation is attempted and the CPU wasn't configured with a big enough phys-bits, an error message will be printed due to the maxphysaddr vs maxusedaddr check previously added. Suggested-by: Igor Mammedov Signed-off-by: Joao Martins Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-11-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 56d8c179ea..1c5c9e17c6 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -891,6 +891,40 @@ static hwaddr pc_max_used_gpa(PCMachineState *pcms, uint64_t pci_hole64_size) return pc_pci_hole64_start() + pci_hole64_size - 1; } +/* + * AMD systems with an IOMMU have an additional hole close to the + * 1Tb, which are special GPAs that cannot be DMA mapped. Depending + * on kernel version, VFIO may or may not let you DMA map those ranges. + * Starting Linux v5.4 we validate it, and can't create guests on AMD machines + * with certain memory sizes. It's also wrong to use those IOVA ranges + * in detriment of leading to IOMMU INVALID_DEVICE_REQUEST or worse. + * The ranges reserved for Hyper-Transport are: + * + * FD_0000_0000h - FF_FFFF_FFFFh + * + * The ranges represent the following: + * + * Base Address Top Address Use + * + * FD_0000_0000h FD_F7FF_FFFFh Reserved interrupt address space + * FD_F800_0000h FD_F8FF_FFFFh Interrupt/EOI IntCtl + * FD_F900_0000h FD_F90F_FFFFh Legacy PIC IACK + * FD_F910_0000h FD_F91F_FFFFh System Management + * FD_F920_0000h FD_FAFF_FFFFh Reserved Page Tables + * FD_FB00_0000h FD_FBFF_FFFFh Address Translation + * FD_FC00_0000h FD_FDFF_FFFFh I/O Space + * FD_FE00_0000h FD_FFFF_FFFFh Configuration + * FE_0000_0000h FE_1FFF_FFFFh Extended Configuration/Device Messages + * FE_2000_0000h FF_FFFF_FFFFh Reserved + * + * See AMD IOMMU spec, section 2.1.2 "IOMMU Logical Topology", + * Table 3: Special Address Controls (GPA) for more information.
+ */ +#define AMD_HT_START 0xfd00000000UL +#define AMD_HT_END 0xffffffffffUL +#define AMD_ABOVE_1TB_START (AMD_HT_END + 1) +#define AMD_HT_SIZE (AMD_ABOVE_1TB_START - AMD_HT_START) void pc_memory_init(PCMachineState *pcms, MemoryRegion *system_memory, MemoryRegion *rom_memory, @@ -914,6 +948,26 @@ void pc_memory_init(PCMachineState *pcms, linux_boot = (machine->kernel_filename != NULL); + /* + * The HyperTransport range close to the 1T boundary is unique to AMD + * hosts with IOMMUs enabled. Restrict the ram-above-4g relocation + * to above 1T to AMD vCPUs only. + */ + if (IS_AMD_CPU(&cpu->env)) { + /* Bail out if max possible address does not cross HT range */ + if (pc_max_used_gpa(pcms, pci_hole64_size) >= AMD_HT_START) { + x86ms->above_4g_mem_start = AMD_ABOVE_1TB_START; + } + + /* + * Advertise the HT region if address space covers the reserved + * region or if we relocate. + */ + if (cpu->phys_bits >= 40) { + e820_add_entry(AMD_HT_START, AMD_HT_SIZE, E820_RESERVED); + } + } + /* * phys-bits is required to be appropriately configured * to make sure max used GPA is reachable. From b3e6982b4154c1c0ab8b25f2e1ac7838a1809824 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Tue, 19 Jul 2022 18:00:14 +0100 Subject: [PATCH 15/16] i386/pc: restrict AMD-only enforcing of 1Tb hole to new machine type The added enforcing is only relevant in the case of AMD, where the range right before 1TB is restricted and cannot be DMA mapped by the kernel, consequently leading to IOMMU INVALID_DEVICE_REQUEST or possibly other kinds of IOMMU events in the AMD IOMMU. There is, however, a case where it may make sense to disable the IOVA relocation/validation: when migrating from a non-amd-1tb-aware QEMU to one that supports it. Relocating RAM regions to after the 1Tb hole has consequences for guest ABI because we are changing the memory mapping, so make sure that only new machine types enforce it, but not older ones. Signed-off-by: Joao Martins Acked-by: Dr. David Alan Gilbert Acked-by: Igor Mammedov Message-Id: <20220719170014.27028-12-joao.m.martins@oracle.com> Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 6 ++++-- hw/i386/pc_piix.c | 1 + hw/i386/pc_q35.c | 1 + include/hw/i386/pc.h | 1 + 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 1c5c9e17c6..7280c02ce3 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -951,9 +951,10 @@ void pc_memory_init(PCMachineState *pcms, /* * The HyperTransport range close to the 1T boundary is unique to AMD * hosts with IOMMUs enabled. Restrict the ram-above-4g relocation - * to above 1T to AMD vCPUs only. + * to above 1T to AMD vCPUs only. @enforce_amd_1tb_hole is only false in + * older machine types (<= 7.0) for compatibility purposes. */ - if (IS_AMD_CPU(&cpu->env)) { + if (IS_AMD_CPU(&cpu->env) && pcmc->enforce_amd_1tb_hole) { /* Bail out if max possible address does not cross HT range */ if (pc_max_used_gpa(pcms, pci_hole64_size) >= AMD_HT_START) { x86ms->above_4g_mem_start = AMD_ABOVE_1TB_START; @@ -1902,6 +1903,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) pcmc->has_reserved_memory = true; pcmc->kvmclock_enabled = true; pcmc->enforce_aligned_dimm = true; + pcmc->enforce_amd_1tb_hole = true; /* BIOS ACPI tables: 128K. Other BIOS datastructures: less than 4K reported * to be used at the moment, 32K should be enough for a while.
*/ pcmc->acpi_data_size = 0x20000 + 0x8000; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index aa191d405a..a5c65c1c35 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -451,6 +451,7 @@ static void pc_i440fx_7_0_machine_options(MachineClass *m) m->alias = NULL; m->is_default = false; pcmc->legacy_no_rng_seed = true; + pcmc->enforce_amd_1tb_hole = false; compat_props_add(m->compat_props, hw_compat_7_0, hw_compat_7_0_len); compat_props_add(m->compat_props, pc_compat_7_0, pc_compat_7_0_len); } diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 307910b33c..3a35193ff7 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -387,6 +387,7 @@ static void pc_q35_7_0_machine_options(MachineClass *m) pc_q35_7_1_machine_options(m); m->alias = NULL; pcmc->legacy_no_rng_seed = true; + pcmc->enforce_amd_1tb_hole = false; compat_props_add(m->compat_props, hw_compat_7_0, hw_compat_7_0_len); compat_props_add(m->compat_props, pc_compat_7_0, pc_compat_7_0_len); } diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 01938fce4c..8435733bd6 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -118,6 +118,7 @@ struct PCMachineClass { bool has_reserved_memory; bool enforce_aligned_dimm; bool broken_reserved_end; + bool enforce_amd_1tb_hole; /* generate legacy CPU hotplug AML */ bool legacy_cpu_hotplug; From 0522be9a0c0094088ccef7aab352c57f483ca250 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 18 Jul 2022 14:56:37 +0100 Subject: [PATCH 16/16] hw/virtio/virtio-iommu: Enforce power-of-two notify for both MAP and UNMAP Currently we only enforce power-of-two mappings (required by the QEMU notifier) for UNMAP requests. A MAP request not aligned on a power-of-two may be successfully handled by VFIO, and then the corresponding UNMAP notify will fail because it will attempt to split that mapping. Ensure MAP and UNMAP notifications are consistent. Fixes: dde3f08b5cab ("virtio-iommu: Handle non power of 2 range invalidations") Reported-by: Tina Zhang Signed-off-by: Jean-Philippe Brucker Message-Id: <20220718135636.338264-1-jean-philippe@linaro.org> Tested-by: Tina Zhang Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/virtio/virtio-iommu.c | 47 ++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index 281152d338..62e07ec2e4 100644 --- a/hw/virtio/virtio-iommu.c +++ b/hw/virtio/virtio-iommu.c @@ -197,6 +197,32 @@ static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer user_data) } } +static void virtio_iommu_notify_map_unmap(IOMMUMemoryRegion *mr, + IOMMUTLBEvent *event, + hwaddr virt_start, hwaddr virt_end) +{ + uint64_t delta = virt_end - virt_start; + + event->entry.iova = virt_start; + event->entry.addr_mask = delta; + + if (delta == UINT64_MAX) { + memory_region_notify_iommu(mr, 0, *event); + } + + while (virt_start != virt_end + 1) { + uint64_t mask = dma_aligned_pow2_mask(virt_start, virt_end, 64); + + event->entry.addr_mask = mask; + event->entry.iova = virt_start; + memory_region_notify_iommu(mr, 0, *event); + virt_start += mask + 1; + if (event->entry.perm != IOMMU_NONE) { + event->entry.translated_addr += mask + 1; + } + } +} + static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start, hwaddr virt_end, hwaddr paddr, uint32_t flags) @@ -215,19 +241,16 @@ static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start, event.type = IOMMU_NOTIFIER_MAP; event.entry.target_as = &address_space_memory; - event.entry.addr_mask = virt_end - virt_start; - event.entry.iova = virt_start; event.entry.perm = perm; event.entry.translated_addr = paddr; - memory_region_notify_iommu(mr, 0, event); + virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end); } static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start, hwaddr virt_end) { IOMMUTLBEvent event; - uint64_t delta = virt_end - virt_start; if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_UNMAP)) { return; @@ -239,22 +262,8 @@ static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start, event.entry.target_as = &address_space_memory; event.entry.perm = IOMMU_NONE; event.entry.translated_addr = 0; - event.entry.addr_mask = delta; - event.entry.iova = virt_start; - if (delta == UINT64_MAX) { - memory_region_notify_iommu(mr, 0, event); - } - - - while (virt_start != virt_end + 1) { - uint64_t mask = dma_aligned_pow2_mask(virt_start, virt_end, 64); - - event.entry.addr_mask = mask; - event.entry.iova = virt_start; - memory_region_notify_iommu(mr, 0, event); - virt_start += mask + 1; - } + virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end); } static gboolean virtio_iommu_notify_unmap_cb(gpointer key, gpointer value,
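To make the notifier splitting concrete, below is a standalone sketch of how a range is cut into power-of-two-sized, size-aligned chunks, which is the path both MAP and UNMAP notifications now share. pow2_chunk_mask() is a simplified, hypothetical stand-in for QEMU's dma_aligned_pow2_mask(), and the delta == UINT64_MAX full-range case that the patch notifies up front is left out.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Largest mask M = 2^k - 1 such that start is 2^k-aligned and
 * start + M does not pass end. */
static uint64_t pow2_chunk_mask(uint64_t start, uint64_t end)
{
    uint64_t len = end - start + 1;                /* callers avoid full-range overflow */
    uint64_t align = start ? (start & -start) : 0; /* 0: alignment unconstrained */
    uint64_t size = 1;

    while (!(size >> 63) && (size << 1) <= len &&
           (align == 0 || (size << 1) <= align)) {
        size <<= 1;
    }
    return size - 1;
}

int main(void)
{
    uint64_t start = 0x1000, end = 0x3fff;         /* 12KiB: not a power of two */

    while (start != end + 1) {
        uint64_t mask = pow2_chunk_mask(start, end);

        /* One notification per chunk, mirroring the loop in the patch. */
        printf("notify iova=0x%" PRIx64 " addr_mask=0x%" PRIx64 "\n", start, mask);
        start += mask + 1;
    }
    return 0;                                      /* emits a 4KiB then an 8KiB chunk */
}

Because MAP now takes the same path, a VFIO listener sees mappings created in exactly the chunks it will later be asked to unmap, so an UNMAP notification never has to split a mapping it did not see being established.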