From bacabb0afadb47294806481a7ebb6fa5d4f1c7bd Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 3 Nov 2016 09:22:23 +0800 Subject: [PATCH 01/28] intel_iommu: fixing source id during IOTLB hash key calculation Using uint8_t for source id will lose bus num and get the wrong/invalid IOTLB entry. Fixing by using uint16_t instead and enlarge level shift. Cc: Paolo Bonzini Cc: Richard Henderson Cc: Eduardo Habkost Cc: Michael S. Tsirkin Signed-off-by: Jason Wang Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 2 +- hw/i386/intel_iommu_internal.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 1655a65bce..5a12ae7a93 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -218,7 +218,7 @@ static void vtd_reset_iotlb(IntelIOMMUState *s) g_hash_table_remove_all(s->iotlb); } -static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint8_t source_id, +static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, uint32_t level) { return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) | diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h index 0829a5064f..11abfa2233 100644 --- a/hw/i386/intel_iommu_internal.h +++ b/hw/i386/intel_iommu_internal.h @@ -115,7 +115,7 @@ /* The shift of source_id in the key of IOTLB hash table */ #define VTD_IOTLB_SID_SHIFT 36 -#define VTD_IOTLB_LVL_SHIFT 44 +#define VTD_IOTLB_LVL_SHIFT 52 #define VTD_IOTLB_MAX_SIZE 1024 /* Max size of the hash table */ /* IOTLB_REG */ From 27e57efe32f53a8788cd6b6b9b9bbc08446cc8ae Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Thu, 3 Nov 2016 09:55:49 +0100 Subject: [PATCH 02/28] virtio: rename virtqueue_discard to virtqueue_unpop The function undoes the effect of virtqueue_pop and doesn't do anything destructive or irreversible so virtqueue_unpop is a more fitting name. Signed-off-by: Ladi Prosek Reviewed-by: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/net/virtio-net.c | 2 +- hw/virtio/virtio-balloon.c | 2 +- hw/virtio/virtio.c | 8 ++++---- include/hw/virtio/virtio.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 06bfe4bcc9..20aa63e16c 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1177,7 +1177,7 @@ static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t * must have consumed the complete packet. * Otherwise, drop it. */ if (!n->mergeable_rx_bufs && offset < size) { - virtqueue_discard(q->rx_vq, elem, total); + virtqueue_unpop(q->rx_vq, elem, total); g_free(elem); return size; } diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index cfba053280..884570a57d 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -456,7 +456,7 @@ static void virtio_balloon_device_reset(VirtIODevice *vdev) VirtIOBalloon *s = VIRTIO_BALLOON(vdev); if (s->stats_vq_elem != NULL) { - virtqueue_discard(s->svq, s->stats_vq_elem, 0); + virtqueue_unpop(s->svq, s->stats_vq_elem, 0); g_free(s->stats_vq_elem); s->stats_vq_elem = NULL; } diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index bcbcfe063c..3a76dc6b8f 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -279,7 +279,7 @@ void virtqueue_detach_element(VirtQueue *vq, const VirtQueueElement *elem, virtqueue_unmap_sg(vq, elem, len); } -/* virtqueue_discard: +/* virtqueue_unpop: * @vq: The #VirtQueue * @elem: The #VirtQueueElement * @len: number of bytes written @@ -287,8 +287,8 @@ void virtqueue_detach_element(VirtQueue *vq, const VirtQueueElement *elem, * Pretend the most recent element wasn't popped from the virtqueue. The next * call to virtqueue_pop() will refetch the element. */ -void virtqueue_discard(VirtQueue *vq, const VirtQueueElement *elem, - unsigned int len) +void virtqueue_unpop(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) { vq->last_avail_idx--; virtqueue_detach_element(vq, elem, len); @@ -301,7 +301,7 @@ void virtqueue_discard(VirtQueue *vq, const VirtQueueElement *elem, * Pretend that elements weren't popped from the virtqueue. The next * virtqueue_pop() will refetch the oldest element. * - * Use virtqueue_discard() instead if you have a VirtQueueElement. + * Use virtqueue_unpop() instead if you have a VirtQueueElement. * * Returns: true on success, false if @num is greater than the number of in use * elements. diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index ac65d6a594..6a2f57c1f7 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -160,8 +160,8 @@ void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem, void virtqueue_flush(VirtQueue *vq, unsigned int count); void virtqueue_detach_element(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len); -void virtqueue_discard(VirtQueue *vq, const VirtQueueElement *elem, - unsigned int len); +void virtqueue_unpop(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len); bool virtqueue_rewind(VirtQueue *vq, unsigned int num); void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len, unsigned int idx); From bf91bd27924955aa243abfa5d422ee71e9e84b93 Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Thu, 3 Nov 2016 09:55:50 +0100 Subject: [PATCH 03/28] virtio: make virtqueue_alloc_element static The function does not fully initialize the returned VirtQueueElement and should be used only internally from the virtio module. Signed-off-by: Ladi Prosek Reviewed-by: Stefan Hajnoczi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio.c | 2 +- include/hw/virtio/virtio.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 3a76dc6b8f..1df5f4e5b0 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -632,7 +632,7 @@ void virtqueue_map(VirtQueueElement *elem) VIRTQUEUE_MAX_SIZE, 0); } -void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num) +static void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num) { VirtQueueElement *elem; size_t in_addr_ofs = QEMU_ALIGN_UP(sz, __alignof__(elem->in_addr[0])); diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 6a2f57c1f7..f12a1a8635 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -154,7 +154,6 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, void virtio_del_queue(VirtIODevice *vdev, int n); -void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num); void virtqueue_push(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len); void virtqueue_flush(VirtQueue *vq, unsigned int count); From 6e724d9d990b0672f3f311346f193394ae3def08 Mon Sep 17 00:00:00 2001 From: Gonglei Date: Mon, 31 Oct 2016 22:08:00 +0800 Subject: [PATCH 04/28] virtio-crypto: tag as not hotpluggable and migration Currently the virtio-crypto device hasn't supported hotpluggable and live migration well. Let's tag it as not hotpluggable and migration actively and reopen them once we support them well. Suggested-by: Michael S. Tsirkin Signed-off-by: Gonglei Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio-crypto-pci.c | 2 +- hw/virtio/virtio-crypto.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio-crypto-pci.c b/hw/virtio/virtio-crypto-pci.c index 21d998401a..a1b09064c0 100644 --- a/hw/virtio/virtio-crypto-pci.c +++ b/hw/virtio/virtio-crypto-pci.c @@ -48,7 +48,7 @@ static void virtio_crypto_pci_class_init(ObjectClass *klass, void *data) k->realize = virtio_crypto_pci_realize; set_bit(DEVICE_CATEGORY_MISC, dc->categories); dc->props = virtio_crypto_pci_properties; - + dc->hotpluggable = false; pcidev_k->class_id = PCI_CLASS_OTHERS; } diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c index 170114f52b..32938433b7 100644 --- a/hw/virtio/virtio-crypto.c +++ b/hw/virtio/virtio-crypto.c @@ -813,6 +813,7 @@ static void virtio_crypto_device_unrealize(DeviceState *dev, Error **errp) static const VMStateDescription vmstate_virtio_crypto = { .name = "virtio-crypto", + .unmigratable = 1, .minimum_version_id = VIRTIO_CRYPTO_VM_VERSION, .version_id = VIRTIO_CRYPTO_VM_VERSION, .fields = (VMStateField[]) { From 1a43713b02e515575134e18af1f5d0e8c3c26bcc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 31 Oct 2016 15:34:38 +0800 Subject: [PATCH 05/28] intel_iommu: fix several incorrect endianess and bit fields Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 2 +- include/hw/i386/intel_iommu.h | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 5a12ae7a93..20c4d2c3b7 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -2180,7 +2180,7 @@ static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, } addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; - if (le16_to_cpu(addr.addr.__head) != 0xfee) { + if (addr.addr.__head != 0xfee) { VTD_DPRINTF(GENERAL, "error: MSI addr low 32 bits invalid: " "0x%"PRIx32, addr.data); return -VTD_FR_IR_REQ_RSVD; diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h index 1989c1eec1..405c9d122e 100644 --- a/include/hw/i386/intel_iommu.h +++ b/include/hw/i386/intel_iommu.h @@ -123,7 +123,6 @@ enum { union VTD_IR_TableEntry { struct { #ifdef HOST_WORDS_BIGENDIAN - uint32_t dest_id:32; /* Destination ID */ uint32_t __reserved_1:8; /* Reserved 1 */ uint32_t vector:8; /* Interrupt Vector */ uint32_t irte_mode:1; /* IRTE Mode */ @@ -147,9 +146,9 @@ union VTD_IR_TableEntry { uint32_t irte_mode:1; /* IRTE Mode */ uint32_t vector:8; /* Interrupt Vector */ uint32_t __reserved_1:8; /* Reserved 1 */ - uint32_t dest_id:32; /* Destination ID */ #endif - uint16_t source_id:16; /* Source-ID */ + uint32_t dest_id; /* Destination ID */ + uint16_t source_id; /* Source-ID */ #ifdef HOST_WORDS_BIGENDIAN uint64_t __reserved_2:44; /* Reserved 2 */ uint64_t sid_vtype:2; /* Source-ID Validation Type */ @@ -220,7 +219,7 @@ struct VTD_MSIMessage { uint32_t dest:8; uint32_t __addr_head:12; /* 0xfee */ #endif - uint32_t __addr_hi:32; + uint32_t __addr_hi; } QEMU_PACKED; uint64_t msi_addr; }; @@ -239,7 +238,7 @@ struct VTD_MSIMessage { uint16_t level:1; uint16_t trigger_mode:1; #endif - uint16_t __resved1:16; + uint16_t __resved1; } QEMU_PACKED; uint32_t msi_data; }; From 8e7a0a1616faa852d65427bbbb6296dffb38990d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 31 Oct 2016 15:34:40 +0800 Subject: [PATCH 06/28] intel_iommu: fix incorrect assert Reported-by: Michael S. Tsirkin Signed-off-by: Peter Xu Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/intel_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 20c4d2c3b7..1b706adf9d 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -2463,7 +2463,7 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) IntelIOMMUState *s = opaque; VTDAddressSpace *vtd_as; - assert(0 <= devfn && devfn <= X86_IOMMU_PCI_DEVFN_MAX); + assert(0 <= devfn && devfn < X86_IOMMU_PCI_DEVFN_MAX); vtd_as = vtd_find_add_as(s, bus, devfn); return &vtd_as->as; From 1b39bc1cf67eee07518ee05ce9306eaa53d868e4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 31 Oct 2016 15:34:39 +0800 Subject: [PATCH 07/28] acpi: fix DMAR device scope for IOAPIC We should not use cpu_to_le16() here, instead each of device/function value is stored in a 8 byte field. Signed-off-by: Peter Xu Reviewed-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-build.c | 3 ++- include/hw/acpi/acpi-defs.h | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index c02f408ab2..a15585717a 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -2605,7 +2605,8 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker) scope->length = ioapic_scope_size; scope->enumeration_id = ACPI_BUILD_IOAPIC_ID; scope->bus = Q35_PSEUDO_BUS_PLATFORM; - scope->path[0] = cpu_to_le16(Q35_PSEUDO_DEVFN_IOAPIC); + scope->path[0].device = PCI_SLOT(Q35_PSEUDO_DEVFN_IOAPIC); + scope->path[0].function = PCI_FUNC(Q35_PSEUDO_DEVFN_IOAPIC); build_header(linker, table_data, (void *)(table_data->data + dmar_start), "DMAR", table_data->len - dmar_start, 1, NULL, NULL); diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h index d1d1d61fcb..154f3b82f6 100644 --- a/include/hw/acpi/acpi-defs.h +++ b/include/hw/acpi/acpi-defs.h @@ -619,7 +619,10 @@ struct AcpiDmarDeviceScope { uint16_t reserved; uint8_t enumeration_id; uint8_t bus; - uint16_t path[0]; /* list of dev:func pairs */ + struct { + uint8_t device; + uint8_t function; + } path[0]; } QEMU_PACKED; typedef struct AcpiDmarDeviceScope AcpiDmarDeviceScope; From 9b706dbbbb81f5cb7c67e491d38cd6077205e056 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 4 Nov 2016 12:04:23 +0200 Subject: [PATCH 08/28] virtio: allow per-device-class legacy features Legacy features are those that transitional devices only expose on the legacy interface. Allow different ones per device class. Cc: qemu-stable@nongnu.org # dependency for the next patch Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- hw/s390x/virtio-ccw.c | 4 +++- hw/virtio/virtio-pci.c | 4 +++- hw/virtio/virtio.c | 2 ++ include/hw/virtio/virtio.h | 5 +++++ 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c index 7d7f8f6e19..f5c1d98192 100644 --- a/hw/s390x/virtio-ccw.c +++ b/hw/s390x/virtio-ccw.c @@ -303,6 +303,8 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw) if (!ccw.cda) { ret = -EFAULT; } else { + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + features.index = address_space_ldub(&address_space_memory, ccw.cda + sizeof(features.features), @@ -312,7 +314,7 @@ static int virtio_ccw_cb(SubchDev *sch, CCW1 ccw) if (dev->revision >= 1) { /* Don't offer legacy features for modern devices. */ features.features = (uint32_t) - (vdev->host_features & ~VIRTIO_LEGACY_FEATURES); + (vdev->host_features & ~vdc->legacy_features); } else { features.features = (uint32_t)vdev->host_features; } diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 62001b46d7..97b32febaf 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -1175,7 +1175,9 @@ static uint64_t virtio_pci_common_read(void *opaque, hwaddr addr, break; case VIRTIO_PCI_COMMON_DF: if (proxy->dfselect <= 1) { - val = (vdev->host_features & ~VIRTIO_LEGACY_FEATURES) >> + VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev); + + val = (vdev->host_features & ~vdc->legacy_features) >> (32 * proxy->dfselect); } break; diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 1df5f4e5b0..72ee06be6c 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -2214,6 +2214,8 @@ static void virtio_device_class_init(ObjectClass *klass, void *data) dc->props = virtio_properties; vdc->start_ioeventfd = virtio_device_start_ioeventfd_impl; vdc->stop_ioeventfd = virtio_device_stop_ioeventfd_impl; + + vdc->legacy_features |= VIRTIO_LEGACY_FEATURES; } bool virtio_device_ioeventfd_enabled(VirtIODevice *vdev) diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index f12a1a8635..bdb3c4b46b 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -113,6 +113,11 @@ typedef struct VirtioDeviceClass { void (*set_config)(VirtIODevice *vdev, const uint8_t *config); void (*reset)(VirtIODevice *vdev); void (*set_status)(VirtIODevice *vdev, uint8_t val); + /* For transitional devices, this is a bitmap of features + * that are only exposed on the legacy interface but not + * the modern one. + */ + uint64_t legacy_features; /* Test and clear event pending status. * Should be called after unmask to avoid losing events. * If backend does not support masking, From 2a083ffd2e37ef08769749a5c7cfc6ca65c9f8ea Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 4 Nov 2016 12:27:52 +0200 Subject: [PATCH 09/28] virtio-net: mark VIRTIO_NET_F_GSO as legacy virtio 1.0 spec says this is a legacy feature bit, hide it from guests in modern mode. Note: for cross-version migration compatibility, we keep the bit set in host_features. The result will be that a guest migrating cross-version will see host features change under it. As guests only seem to read it once, this should not be an issue. Meanwhile, will work to fix guests to ignore this bit in virtio1 mode, too. Cc: qemu-stable@nongnu.org Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- hw/net/virtio-net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 20aa63e16c..b68c69d8f6 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1942,6 +1942,7 @@ static void virtio_net_class_init(ObjectClass *klass, void *data) vdc->guest_notifier_pending = virtio_net_guest_notifier_pending; vdc->load = virtio_net_load_device; vdc->save = virtio_net_save_device; + vdc->legacy_features |= (0x1 << VIRTIO_NET_F_GSO); } static const TypeInfo virtio_net_info = { From 0d34fbabc13891da41582b0823867dc5733fffef Mon Sep 17 00:00:00 2001 From: Rafael David Tinoco Date: Mon, 24 Oct 2016 15:35:03 +0000 Subject: [PATCH 10/28] vhost: migration blocker only if shared log is used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 31190ed7 added a migration blocker in vhost_dev_init() to check if memfd would succeed. It is better if this blocker first checks if vhost backend requires shared log. This will avoid a situation where a blocker is added inappropriately (e.g. shared log allocation fails when vhost backend doesn't support it). Signed-off-by: Rafael David Tinoco Reviewed-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 131f1643b2..25bf67f3cc 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -1122,7 +1122,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { error_setg(&hdev->migration_blocker, "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature."); - } else if (!qemu_memfd_check()) { + } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_check()) { error_setg(&hdev->migration_blocker, "Migration disabled: failed to allocate shared memory"); } From c7f8d0f3a52b5ef8fdcd305cce438f67d7e06a9f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:36 +0800 Subject: [PATCH 11/28] qdev: hotplug: drop HotplugHandler.post_plug callback as nvdimm acpi is okay to build fit when the nvdimm device has not been 'realized' Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/acpi/nvdimm.c | 6 +----- hw/core/hotplug.c | 11 ----------- hw/core/qdev.c | 22 +++++----------------- hw/i386/pc.c | 23 ++++------------------- include/hw/hotplug.h | 11 ----------- 5 files changed, 10 insertions(+), 63 deletions(-) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 602ec54485..623bb36ee3 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -38,11 +38,7 @@ static int nvdimm_plugged_device_list(Object *obj, void *opaque) GSList **list = opaque; if (object_dynamic_cast(obj, TYPE_NVDIMM)) { - DeviceState *dev = DEVICE(obj); - - if (dev->realized) { /* only realized NVDIMMs matter */ - *list = g_slist_append(*list, DEVICE(obj)); - } + *list = g_slist_append(*list, DEVICE(obj)); } object_child_foreach(obj, nvdimm_plugged_device_list, opaque); diff --git a/hw/core/hotplug.c b/hw/core/hotplug.c index ab34c19461..17ac986685 100644 --- a/hw/core/hotplug.c +++ b/hw/core/hotplug.c @@ -35,17 +35,6 @@ void hotplug_handler_plug(HotplugHandler *plug_handler, } } -void hotplug_handler_post_plug(HotplugHandler *plug_handler, - DeviceState *plugged_dev, - Error **errp) -{ - HotplugHandlerClass *hdc = HOTPLUG_HANDLER_GET_CLASS(plug_handler); - - if (hdc->post_plug) { - hdc->post_plug(plug_handler, plugged_dev, errp); - } -} - void hotplug_handler_unplug_request(HotplugHandler *plug_handler, DeviceState *plugged_dev, Error **errp) diff --git a/hw/core/qdev.c b/hw/core/qdev.c index d835e6259a..57834423b9 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -945,21 +945,10 @@ static void device_set_realized(Object *obj, bool value, Error **errp) goto child_realize_fail; } } - if (dev->hotplugged) { device_reset(dev); } dev->pending_deleted_event = false; - dev->realized = value; - - if (hotplug_ctrl) { - hotplug_handler_post_plug(hotplug_ctrl, dev, &local_err); - } - - if (local_err != NULL) { - dev->realized = value; - goto post_realize_fail; - } } else if (!value && dev->realized) { Error **local_errp = NULL; QLIST_FOREACH(bus, &dev->child_bus, sibling) { @@ -976,14 +965,13 @@ static void device_set_realized(Object *obj, bool value, Error **errp) } dev->pending_deleted_event = true; DEVICE_LISTENER_CALL(unrealize, Reverse, dev); - - if (local_err != NULL) { - goto fail; - } - - dev->realized = value; } + if (local_err != NULL) { + goto fail; + } + + dev->realized = value; return; child_realize_fail: diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 2c37a78c7a..cebaad2dc9 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1715,22 +1715,16 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, goto out; } + if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) { + nvdimm_acpi_hotplug(&pcms->acpi_nvdimm_state); + } + hhc = HOTPLUG_HANDLER_GET_CLASS(pcms->acpi_dev); hhc->plug(HOTPLUG_HANDLER(pcms->acpi_dev), dev, &error_abort); out: error_propagate(errp, local_err); } -static void pc_dimm_post_plug(HotplugHandler *hotplug_dev, - DeviceState *dev, Error **errp) -{ - PCMachineState *pcms = PC_MACHINE(hotplug_dev); - - if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) { - nvdimm_acpi_hotplug(&pcms->acpi_nvdimm_state); - } -} - static void pc_dimm_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -2008,14 +2002,6 @@ static void pc_machine_device_plug_cb(HotplugHandler *hotplug_dev, } } -static void pc_machine_device_post_plug_cb(HotplugHandler *hotplug_dev, - DeviceState *dev, Error **errp) -{ - if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { - pc_dimm_post_plug(hotplug_dev, dev, errp); - } -} - static void pc_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { @@ -2322,7 +2308,6 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) mc->reset = pc_machine_reset; hc->pre_plug = pc_machine_device_pre_plug_cb; hc->plug = pc_machine_device_plug_cb; - hc->post_plug = pc_machine_device_post_plug_cb; hc->unplug_request = pc_machine_device_unplug_request_cb; hc->unplug = pc_machine_device_unplug_cb; nc->nmi_monitor_handler = x86_nmi; diff --git a/include/hw/hotplug.h b/include/hw/hotplug.h index 10ca5b6504..1a0516a479 100644 --- a/include/hw/hotplug.h +++ b/include/hw/hotplug.h @@ -47,7 +47,6 @@ typedef void (*hotplug_fn)(HotplugHandler *plug_handler, * @parent: Opaque parent interface. * @pre_plug: pre plug callback called at start of device.realize(true) * @plug: plug callback called at end of device.realize(true). - * @post_pug: post plug callback called after device is successfully plugged. * @unplug_request: unplug request callback. * Used as a means to initiate device unplug for devices that * require asynchronous unplug handling. @@ -62,7 +61,6 @@ typedef struct HotplugHandlerClass { /* */ hotplug_fn pre_plug; hotplug_fn plug; - hotplug_fn post_plug; hotplug_fn unplug_request; hotplug_fn unplug; } HotplugHandlerClass; @@ -85,15 +83,6 @@ void hotplug_handler_pre_plug(HotplugHandler *plug_handler, DeviceState *plugged_dev, Error **errp); -/** - * hotplug_handler_post_plug: - * - * Call #HotplugHandlerClass.post_plug callback of @plug_handler. - */ -void hotplug_handler_post_plug(HotplugHandler *plug_handler, - DeviceState *plugged_dev, - Error **errp); - /** * hotplug_handler_unplug_request: * From 12f86b5b3e1bdf75e0a467d771c16cc42f3a1f1a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:37 +0800 Subject: [PATCH 12/28] nvdimm acpi: drop the lock of fit buffer as there is a global lock to protect vm-exit handlers and QMP/monitor, this lock can be dropped Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/acpi/nvdimm.c | 11 +---------- include/hw/mem/nvdimm.h | 17 +++++------------ 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 623bb36ee3..0fe35471ca 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -371,17 +371,14 @@ static GArray *nvdimm_build_device_structure(void) static void nvdimm_init_fit_buffer(NvdimmFitBuffer *fit_buf) { - qemu_mutex_init(&fit_buf->lock); fit_buf->fit = g_array_new(false, true /* clear */, 1); } static void nvdimm_build_fit_buffer(NvdimmFitBuffer *fit_buf) { - qemu_mutex_lock(&fit_buf->lock); g_array_free(fit_buf->fit, true); fit_buf->fit = nvdimm_build_device_structure(); fit_buf->dirty = true; - qemu_mutex_unlock(&fit_buf->lock); } void nvdimm_acpi_hotplug(AcpiNVDIMMState *state) @@ -395,11 +392,10 @@ static void nvdimm_build_nfit(AcpiNVDIMMState *state, GArray *table_offsets, NvdimmFitBuffer *fit_buf = &state->fit_buf; unsigned int header; - qemu_mutex_lock(&fit_buf->lock); /* NVDIMM device is not plugged? */ if (!fit_buf->fit->len) { - goto exit; + return; } acpi_add_table(table_offsets, table_data); @@ -413,9 +409,6 @@ static void nvdimm_build_nfit(AcpiNVDIMMState *state, GArray *table_offsets, build_header(linker, table_data, (void *)(table_data->data + header), "NFIT", sizeof(NvdimmNfitHeader) + fit_buf->fit->len, 1, NULL, NULL); - -exit: - qemu_mutex_unlock(&fit_buf->lock); } struct NvdimmDsmIn { @@ -544,7 +537,6 @@ static void nvdimm_dsm_func_read_fit(AcpiNVDIMMState *state, NvdimmDsmIn *in, read_fit = (NvdimmFuncReadFITIn *)in->arg3; le32_to_cpus(&read_fit->offset); - qemu_mutex_lock(&fit_buf->lock); fit = fit_buf->fit; nvdimm_debug("Read FIT: offset %#x FIT size %#x Dirty %s.\n", @@ -578,7 +570,6 @@ exit: cpu_physical_memory_write(dsm_mem_addr, read_fit_out, size); g_free(read_fit_out); - qemu_mutex_unlock(&fit_buf->lock); } static void nvdimm_dsm_reserved_root(AcpiNVDIMMState *state, NvdimmDsmIn *in, diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index 33cd421ace..d3ffb2569b 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -99,20 +99,13 @@ typedef struct NVDIMMClass NVDIMMClass; #define NVDIMM_ACPI_IO_LEN 4 /* - * The buffer, @fit, saves the FIT info for all the presented NVDIMM - * devices which is updated after the NVDIMM device is plugged or - * unplugged. - * - * Rules to use the buffer: - * 1) the user should hold the @lock to access the buffer. - * 2) mark @dirty whenever the buffer is updated. - * - * These rules preserve NVDIMM ACPI _FIT method to read incomplete - * or obsolete fit info if fit update happens during multiple RFIT - * calls. + * NvdimmFitBuffer: + * @fit: FIT structures for present NVDIMMs. It is updated when + * the NVDIMM device is plugged or unplugged. + * @dirty: It allows OSPM to detect change and restart read in + * progress if there is any. */ struct NvdimmFitBuffer { - QemuMutex lock; GArray *fit; bool dirty; }; From 75f27498220e6ff6f78bf08fbe2cc662ec76ba89 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:38 +0800 Subject: [PATCH 13/28] pc: memhp: move nvdimm hotplug out of memory hotplug as they use completely different way to handle hotplug event Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- default-configs/mips-softmmu-common.mak | 1 + docs/specs/acpi_mem_hotplug.txt | 3 --- docs/specs/acpi_nvdimm.txt | 5 ++++ hw/acpi/ich9.c | 8 +++++-- hw/acpi/memory_hotplug.c | 31 +++++++------------------ hw/acpi/nvdimm.c | 7 ++++++ hw/acpi/piix4.c | 7 +++++- include/hw/mem/nvdimm.h | 1 + 8 files changed, 34 insertions(+), 29 deletions(-) diff --git a/default-configs/mips-softmmu-common.mak b/default-configs/mips-softmmu-common.mak index 0394514b93..f0676f52ac 100644 --- a/default-configs/mips-softmmu-common.mak +++ b/default-configs/mips-softmmu-common.mak @@ -17,6 +17,7 @@ CONFIG_FDC=y CONFIG_ACPI=y CONFIG_ACPI_X86=y CONFIG_ACPI_MEMORY_HOTPLUG=y +CONFIG_ACPI_NVDIMM=y CONFIG_ACPI_CPU_HOTPLUG=y CONFIG_APM=y CONFIG_I8257=y diff --git a/docs/specs/acpi_mem_hotplug.txt b/docs/specs/acpi_mem_hotplug.txt index cb26dd27c4..3df3620ce4 100644 --- a/docs/specs/acpi_mem_hotplug.txt +++ b/docs/specs/acpi_mem_hotplug.txt @@ -4,9 +4,6 @@ QEMU<->ACPI BIOS memory hotplug interface ACPI BIOS GPE.3 handler is dedicated for notifying OS about memory hot-add and hot-remove events. -ACPI BIOS GPE.4 handler is dedicated for notifying OS about nvdimm device -hot-add and hot-remove events. - Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access): --------------------------------------------------------------- 0xa00: diff --git a/docs/specs/acpi_nvdimm.txt b/docs/specs/acpi_nvdimm.txt index 4aa5e3de29..d244147691 100644 --- a/docs/specs/acpi_nvdimm.txt +++ b/docs/specs/acpi_nvdimm.txt @@ -127,6 +127,11 @@ _DSM process diagram: | result from the page | | | +--------------------------+ +--------------+ +NVDIMM hotplug +-------------- +ACPI BIOS GPE.4 handler is dedicated for notifying OS about nvdimm device +hot-add event. + Device Handle Reservation ------------------------- As we mentioned above, byte 0 ~ byte 3 in the DSM memory save NVDIMM device diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c index e5a3c18e52..830c475127 100644 --- a/hw/acpi/ich9.c +++ b/hw/acpi/ich9.c @@ -490,8 +490,12 @@ void ich9_pm_device_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, if (lpc->pm.acpi_memory_hotplug.is_enabled && object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { - acpi_memory_plug_cb(hotplug_dev, &lpc->pm.acpi_memory_hotplug, - dev, errp); + if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) { + nvdimm_acpi_plug_cb(hotplug_dev, dev); + } else { + acpi_memory_plug_cb(hotplug_dev, &lpc->pm.acpi_memory_hotplug, + dev, errp); + } } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { if (lpc->pm.cpu_hotplug_legacy) { legacy_acpi_cpu_plug_cb(hotplug_dev, &lpc->pm.gpe_cpu, dev, errp); diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c index 70f64517fd..ec4e64b361 100644 --- a/hw/acpi/memory_hotplug.c +++ b/hw/acpi/memory_hotplug.c @@ -2,7 +2,6 @@ #include "hw/acpi/memory_hotplug.h" #include "hw/acpi/pc-hotplug.h" #include "hw/mem/pc-dimm.h" -#include "hw/mem/nvdimm.h" #include "hw/boards.h" #include "hw/qdev-core.h" #include "trace.h" @@ -233,8 +232,11 @@ void acpi_memory_plug_cb(HotplugHandler *hotplug_dev, MemHotplugState *mem_st, DeviceState *dev, Error **errp) { MemStatus *mdev; - AcpiEventStatusBits event; - bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM); + DeviceClass *dc = DEVICE_GET_CLASS(dev); + + if (!dc->hotpluggable) { + return; + } mdev = acpi_memory_slot_status(mem_st, dev, errp); if (!mdev) { @@ -242,23 +244,10 @@ void acpi_memory_plug_cb(HotplugHandler *hotplug_dev, MemHotplugState *mem_st, } mdev->dimm = dev; - - /* - * do not set is_enabled and is_inserting if the slot is plugged with - * a nvdimm device to stop OSPM inquires memory region from the slot. - */ - if (is_nvdimm) { - event = ACPI_NVDIMM_HOTPLUG_STATUS; - } else { - mdev->is_enabled = true; - event = ACPI_MEMORY_HOTPLUG_STATUS; - } - + mdev->is_enabled = true; if (dev->hotplugged) { - if (!is_nvdimm) { - mdev->is_inserting = true; - } - acpi_send_event(DEVICE(hotplug_dev), event); + mdev->is_inserting = true; + acpi_send_event(DEVICE(hotplug_dev), ACPI_MEMORY_HOTPLUG_STATUS); } } @@ -273,8 +262,6 @@ void acpi_memory_unplug_request_cb(HotplugHandler *hotplug_dev, return; } - /* nvdimm device hot unplug is not supported yet. */ - assert(!object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)); mdev->is_removing = true; acpi_send_event(DEVICE(hotplug_dev), ACPI_MEMORY_HOTPLUG_STATUS); } @@ -289,8 +276,6 @@ void acpi_memory_unplug_cb(MemHotplugState *mem_st, return; } - /* nvdimm device hot unplug is not supported yet. */ - assert(!object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)); mdev->is_enabled = false; mdev->dimm = NULL; } diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 0fe35471ca..5156565112 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -868,6 +868,13 @@ static const MemoryRegionOps nvdimm_dsm_ops = { }, }; +void nvdimm_acpi_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev) +{ + if (dev->hotplugged) { + acpi_send_event(DEVICE(hotplug_dev), ACPI_NVDIMM_HOTPLUG_STATUS); + } +} + void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io, FWCfgState *fw_cfg, Object *owner) { diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c index 2adc246b00..17d36bd595 100644 --- a/hw/acpi/piix4.c +++ b/hw/acpi/piix4.c @@ -378,7 +378,12 @@ static void piix4_device_plug_cb(HotplugHandler *hotplug_dev, if (s->acpi_memory_hotplug.is_enabled && object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { - acpi_memory_plug_cb(hotplug_dev, &s->acpi_memory_hotplug, dev, errp); + if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) { + nvdimm_acpi_plug_cb(hotplug_dev, dev); + } else { + acpi_memory_plug_cb(hotplug_dev, &s->acpi_memory_hotplug, + dev, errp); + } } else if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { acpi_pcihp_device_plug_cb(hotplug_dev, &s->acpi_pci_hotplug, dev, errp); } else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) { diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index d3ffb2569b..60585c3bbd 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -131,4 +131,5 @@ void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data, BIOSLinker *linker, AcpiNVDIMMState *state, uint32_t ram_slots); void nvdimm_acpi_hotplug(AcpiNVDIMMState *state); +void nvdimm_acpi_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev); #endif From 3e8522e23f6ab3c2b89ebb962ec4c2227d88aca6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:39 +0800 Subject: [PATCH 14/28] pc: memhp: stop handling nvdimm hotplug in pc_dimm_unplug as it is never called when nvdimm hotplug happens Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/i386/pc.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index cebaad2dc9..b69cd4836e 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1761,12 +1761,6 @@ static void pc_dimm_unplug(HotplugHandler *hotplug_dev, HotplugHandlerClass *hhc; Error *local_err = NULL; - if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) { - error_setg(&local_err, - "nvdimm device hot unplug is not supported yet."); - goto out; - } - hhc = HOTPLUG_HANDLER_GET_CLASS(pcms->acpi_dev); hhc->unplug(HOTPLUG_HANDLER(pcms->acpi_dev), dev, &local_err); From 264813cb9d0eea199d48c6ea917060683685d1e0 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:40 +0800 Subject: [PATCH 15/28] nvdimm acpi: clean up nvdimm_build_acpi To make the code more clearer, we 1) check ram_slots first, and build ssdt & nfit only when it is available 2) use nvdimm_get_plugged_device_list() to check if there is nvdimm device plugged Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/acpi/nvdimm.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 5156565112..65eb6c9567 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -392,12 +392,6 @@ static void nvdimm_build_nfit(AcpiNVDIMMState *state, GArray *table_offsets, NvdimmFitBuffer *fit_buf = &state->fit_buf; unsigned int header; - - /* NVDIMM device is not plugged? */ - if (!fit_buf->fit->len) { - return; - } - acpi_add_table(table_offsets, table_data); /* NFIT header. */ @@ -1275,14 +1269,22 @@ void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data, BIOSLinker *linker, AcpiNVDIMMState *state, uint32_t ram_slots) { - nvdimm_build_nfit(state, table_offsets, table_data, linker); + GSList *device_list; - /* - * NVDIMM device is allowed to be plugged only if there is available - * slot. - */ - if (ram_slots) { - nvdimm_build_ssdt(table_offsets, table_data, linker, state->dsm_mem, - ram_slots); + /* no nvdimm device can be plugged. */ + if (!ram_slots) { + return; } + + nvdimm_build_ssdt(table_offsets, table_data, linker, state->dsm_mem, + ram_slots); + + device_list = nvdimm_get_plugged_device_list(); + /* no NVDIMM device is plugged. */ + if (!device_list) { + return; + } + + nvdimm_build_nfit(state, table_offsets, table_data, linker); + g_slist_free(device_list); } From 45a994944a8773e67adaede8cc15a207f85dbef0 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:41 +0800 Subject: [PATCH 16/28] docs: improve the doc of Read FIT method Improve the description and clearly document the length field Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- docs/specs/acpi_nvdimm.txt | 90 +++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/docs/specs/acpi_nvdimm.txt b/docs/specs/acpi_nvdimm.txt index d244147691..3f322e6f55 100644 --- a/docs/specs/acpi_nvdimm.txt +++ b/docs/specs/acpi_nvdimm.txt @@ -65,8 +65,8 @@ _FIT(Firmware Interface Table) The detailed definition of the structure can be found at ACPI 6.0: 5.2.25 NVDIMM Firmware Interface Table (NFIT). -QEMU NVDIMM Implemention -======================== +QEMU NVDIMM Implementation +========================== QEMU uses 4 bytes IO Port starting from 0x0a18 and a RAM-based memory page for NVDIMM ACPI. @@ -80,8 +80,17 @@ Memory: emulates _DSM access and writes the output data to it. ACPI writes _DSM Input Data (based on the offset in the page): - [0x0 - 0x3]: 4 bytes, NVDIMM Device Handle, 0 is reserved for NVDIMM - Root device. + [0x0 - 0x3]: 4 bytes, NVDIMM Device Handle. + + The handle is completely QEMU internal thing, the values in + range [1, 0xFFFF] indicate nvdimm device. Other values are + reserved for other purposes. + + Reserved handles: + 0 is reserved for nvdimm root device named NVDR. + 0x10000 is reserved for QEMU internal DSM function called on + the root device. + [0x4 - 0x7]: 4 bytes, Revision ID, that is the Arg1 of _DSM method. [0x8 - 0xB]: 4 bytes. Function Index, that is the Arg2 of _DSM method. [0xC - 0xFFF]: 4084 bytes, the Arg3 of _DSM method. @@ -132,28 +141,12 @@ NVDIMM hotplug ACPI BIOS GPE.4 handler is dedicated for notifying OS about nvdimm device hot-add event. -Device Handle Reservation -------------------------- -As we mentioned above, byte 0 ~ byte 3 in the DSM memory save NVDIMM device -handle. The handle is completely QEMU internal thing, the values in range -[0, 0xFFFF] indicate nvdimm device (O means nvdimm root device named NVDR), -other values are reserved by other purpose. - -Current reserved handle: -0x10000 is reserved for QEMU internal DSM function called on the root -device. - QEMU internal use only _DSM function ------------------------------------ -UUID, 648B9CF2-CDA1-4312-8AD9-49C4AF32BD62, is reserved for QEMU internal -DSM function. - -There is the function introduced by QEMU and only used by QEMU internal. - 1) Read FIT - As we only reserved one page for NVDIMM ACPI it is impossible to map the - whole FIT data to guest's address space. This function is used by _FIT - method to read a piece of FIT data from QEMU. + _FIT method uses _DSM method to fetch NFIT structures blob from QEMU + in 1 page sized increments which are then concatenated and returned + as _FIT method result. Input parameters: Arg0 – UUID {set to 648B9CF2-CDA1-4312-8AD9-49C4AF32BD62} @@ -161,29 +154,34 @@ There is the function introduced by QEMU and only used by QEMU internal. Arg2 - Function Index, 0x1 Arg3 - A package containing a buffer whose layout is as follows: - +----------+-------------+-------------+-----------------------------------+ - | Filed | Byte Length | Byte Offset | Description | - +----------+-------------+-------------+-----------------------------------+ - | offset | 4 | 0 | the offset of FIT buffer | - +----------+-------------+-------------+-----------------------------------+ + +----------+--------+--------+-------------------------------------------+ + | Field | Length | Offset | Description | + +----------+--------+--------+-------------------------------------------+ + | offset | 4 | 0 | offset in QEMU's NFIT structures blob to | + | | | | read from | + +----------+--------+--------+-------------------------------------------+ - Output: - +----------+-------------+-------------+-----------------------------------+ - | Filed | Byte Length | Byte Offset | Description | - +----------+-------------+-------------+-----------------------------------+ - | | | | return status codes | - | | | | 0x100 indicates fit has been | - | status | 4 | 0 | updated | - | | | | other follows Chapter 3 in DSM | - | | | | Spec Rev1 | - +----------+-------------+-------------+-----------------------------------+ - | fit data | Varies | 4 | FIT data | - | | | | | - +----------+-------------+-------------+-----------------------------------+ + Output layout in the dsm memory page: + +----------+--------+--------+-------------------------------------------+ + | Field | Length | Offset | Description | + +----------+--------+--------+-------------------------------------------+ + | length | 4 | 0 | length of entire returned data | + | | | | (including this header) | + +----------+-----------------+-------------------------------------------+ + | | | | return status codes | + | | | | 0x0 - success | + | | | | 0x100 - error caused by NFIT update while | + | status | 4 | 4 | read by _FIT wasn't completed, other | + | | | | codes follow Chapter 3 in DSM Spec Rev1 | + +----------+-----------------+-------------------------------------------+ + | fit data | Varies | 8 | contains FIT data, this field is present | + | | | | if status field is 0; | + +----------+--------+--------+-------------------------------------------+ - The FIT offset is maintained by the caller itself, current offset plugs - the length returned by the function is the next offset we should read. - When all the FIT data has been read out, zero length is returned. + The FIT offset is maintained by the OSPM itself, current offset plus + the size of the fit data returned by the function is the next offset + OSPM should read. When all FIT data has been read out, zero fit data + size is returned. - If it returns 0x100, OSPM should restart to read FIT (read from offset 0 - again). + If it returns status code 0x100, OSPM should restart to read FIT (read + from offset 0 again). From cf7c0ff521b0710079aa28f21937fb7dbb3f5224 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:42 +0800 Subject: [PATCH 17/28] nvdimm acpi: rename nvdimm_plugged_device_list Its behavior has been changed as the nvdimm device which is being realized also will be handled in this function, so rename it to reflect the fact Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/acpi/nvdimm.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 65eb6c9567..f2c0659599 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -33,7 +33,7 @@ #include "hw/nvram/fw_cfg.h" #include "hw/mem/nvdimm.h" -static int nvdimm_plugged_device_list(Object *obj, void *opaque) +static int nvdimm_device_list(Object *obj, void *opaque) { GSList **list = opaque; @@ -41,23 +41,22 @@ static int nvdimm_plugged_device_list(Object *obj, void *opaque) *list = g_slist_append(*list, DEVICE(obj)); } - object_child_foreach(obj, nvdimm_plugged_device_list, opaque); + object_child_foreach(obj, nvdimm_device_list, opaque); return 0; } /* - * inquire plugged NVDIMM devices and link them into the list which is + * inquire NVDIMM devices and link them into the list which is * returned to the caller. * * Note: it is the caller's responsibility to free the list to avoid * memory leak. */ -static GSList *nvdimm_get_plugged_device_list(void) +static GSList *nvdimm_get_device_list(void) { GSList *list = NULL; - object_child_foreach(qdev_get_machine(), nvdimm_plugged_device_list, - &list); + object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list); return list; } @@ -215,7 +214,7 @@ static uint32_t nvdimm_slot_to_dcr_index(int slot) static NVDIMMDevice *nvdimm_get_device_by_handle(uint32_t handle) { NVDIMMDevice *nvdimm = NULL; - GSList *list, *device_list = nvdimm_get_plugged_device_list(); + GSList *list, *device_list = nvdimm_get_device_list(); for (list = device_list; list; list = list->next) { NVDIMMDevice *nvd = list->data; @@ -346,7 +345,7 @@ static void nvdimm_build_structure_dcr(GArray *structures, DeviceState *dev) static GArray *nvdimm_build_device_structure(void) { - GSList *device_list = nvdimm_get_plugged_device_list(); + GSList *device_list = nvdimm_get_device_list(); GArray *structures = g_array_new(false, true /* clear */, 1); for (; device_list; device_list = device_list->next) { @@ -1279,7 +1278,7 @@ void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data, nvdimm_build_ssdt(table_offsets, table_data, linker, state->dsm_mem, ram_slots); - device_list = nvdimm_get_plugged_device_list(); + device_list = nvdimm_get_device_list(); /* no NVDIMM device is plugged. */ if (!device_list) { return; From 880f36125371a969905a3be83a9099ca451b5c12 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:43 +0800 Subject: [PATCH 18/28] nvdimm acpi: cleanup nvdimm_build_fit inline buf_size to refine the code a bit Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/acpi/nvdimm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index f2c0659599..148999ee7a 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -1134,11 +1134,9 @@ static void nvdimm_build_fit(Aml *dev) aml_append(ifctx, aml_return(aml_buffer(0, NULL))); aml_append(method, ifctx); - aml_append(method, aml_store(aml_shiftleft(buf_size, aml_int(3)), - buf_size)); aml_append(method, aml_create_field(buf, aml_int(4 * BITS_PER_BYTE), /* offset at byte 4.*/ - buf_size, "BUFF")); + aml_shiftleft(buf_size, aml_int(3)), "BUFF")); aml_append(method, aml_return(aml_name("BUFF"))); aml_append(dev, method); From 284197e41f0fe98d58ce5e8acd4966c91f28c4bd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:44 +0800 Subject: [PATCH 19/28] nvdimm acpi: rename nvdimm_acpi_hotplug Rename it to nvdimm_plug() Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/acpi/nvdimm.c | 2 +- hw/i386/pc.c | 2 +- include/hw/mem/nvdimm.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 148999ee7a..7733f14aea 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -380,7 +380,7 @@ static void nvdimm_build_fit_buffer(NvdimmFitBuffer *fit_buf) fit_buf->dirty = true; } -void nvdimm_acpi_hotplug(AcpiNVDIMMState *state) +void nvdimm_plug(AcpiNVDIMMState *state) { nvdimm_build_fit_buffer(&state->fit_buf); } diff --git a/hw/i386/pc.c b/hw/i386/pc.c index b69cd4836e..a9b19507f1 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1716,7 +1716,7 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, } if (object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM)) { - nvdimm_acpi_hotplug(&pcms->acpi_nvdimm_state); + nvdimm_plug(&pcms->acpi_nvdimm_state); } hhc = HOTPLUG_HANDLER_GET_CLASS(pcms->acpi_dev); diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index 60585c3bbd..03e1ff9558 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -130,6 +130,6 @@ void nvdimm_init_acpi_state(AcpiNVDIMMState *state, MemoryRegion *io, void nvdimm_build_acpi(GArray *table_offsets, GArray *table_data, BIOSLinker *linker, AcpiNVDIMMState *state, uint32_t ram_slots); -void nvdimm_acpi_hotplug(AcpiNVDIMMState *state); +void nvdimm_plug(AcpiNVDIMMState *state); void nvdimm_acpi_plug_cb(HotplugHandler *hotplug_dev, DeviceState *dev); #endif From c2fa30757a2ba1bb5b053883773a9a61fbdd7082 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:45 +0800 Subject: [PATCH 20/28] nvdimm acpi: define DSM return codes and use these codes to refine the code Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/acpi/nvdimm.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 7733f14aea..c7e7744073 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -514,7 +514,13 @@ nvdimm_dsm_no_payload(uint32_t func_ret_status, hwaddr dsm_mem_addr) cpu_physical_memory_write(dsm_mem_addr, &out, sizeof(out)); } -#define NVDIMM_QEMU_RSVD_HANDLE_ROOT 0x10000 +#define NVDIMM_DSM_RET_STATUS_SUCCESS 0 /* Success */ +#define NVDIMM_DSM_RET_STATUS_UNSUPPORT 1 /* Not Supported */ +#define NVDIMM_DSM_RET_STATUS_NOMEMDEV 2 /* Non-Existing Memory Device */ +#define NVDIMM_DSM_RET_STATUS_INVALID 3 /* Invalid Input Parameters */ +#define NVDIMM_DSM_RET_STATUS_FIT_CHANGED 0x100 /* FIT Changed */ + +#define NVDIMM_QEMU_RSVD_HANDLE_ROOT 0x10000 /* Read FIT data, defined in docs/specs/acpi_nvdimm.txt. */ static void nvdimm_dsm_func_read_fit(AcpiNVDIMMState *state, NvdimmDsmIn *in, @@ -536,7 +542,7 @@ static void nvdimm_dsm_func_read_fit(AcpiNVDIMMState *state, NvdimmDsmIn *in, read_fit->offset, fit->len, fit_buf->dirty ? "Yes" : "No"); if (read_fit->offset > fit->len) { - func_ret_status = 3 /* Invalid Input Parameters */; + func_ret_status = NVDIMM_DSM_RET_STATUS_INVALID; goto exit; } @@ -544,11 +550,11 @@ static void nvdimm_dsm_func_read_fit(AcpiNVDIMMState *state, NvdimmDsmIn *in, if (!read_fit->offset) { fit_buf->dirty = false; } else if (fit_buf->dirty) { /* FIT has been changed during RFIT. */ - func_ret_status = 0x100 /* fit changed */; + func_ret_status = NVDIMM_DSM_RET_STATUS_FIT_CHANGED; goto exit; } - func_ret_status = 0 /* Success */; + func_ret_status = NVDIMM_DSM_RET_STATUS_SUCCESS; read_len = MIN(fit->len - read_fit->offset, 4096 - sizeof(NvdimmFuncReadFITOut)); @@ -577,7 +583,7 @@ static void nvdimm_dsm_reserved_root(AcpiNVDIMMState *state, NvdimmDsmIn *in, return; } - nvdimm_dsm_no_payload(1 /* Not Supported */, dsm_mem_addr); + nvdimm_dsm_no_payload(NVDIMM_DSM_RET_STATUS_UNSUPPORT, dsm_mem_addr); } static void nvdimm_dsm_root(NvdimmDsmIn *in, hwaddr dsm_mem_addr) @@ -593,7 +599,7 @@ static void nvdimm_dsm_root(NvdimmDsmIn *in, hwaddr dsm_mem_addr) } /* No function except function 0 is supported yet. */ - nvdimm_dsm_no_payload(1 /* Not Supported */, dsm_mem_addr); + nvdimm_dsm_no_payload(NVDIMM_DSM_RET_STATUS_UNSUPPORT, dsm_mem_addr); } /* @@ -639,7 +645,7 @@ static void nvdimm_dsm_label_size(NVDIMMDevice *nvdimm, hwaddr dsm_mem_addr) nvdimm_debug("label_size %#x, max_xfer %#x.\n", label_size, mxfer); - label_size_out.func_ret_status = cpu_to_le32(0 /* Success */); + label_size_out.func_ret_status = cpu_to_le32(NVDIMM_DSM_RET_STATUS_SUCCESS); label_size_out.label_size = cpu_to_le32(label_size); label_size_out.max_xfer = cpu_to_le32(mxfer); @@ -650,7 +656,7 @@ static void nvdimm_dsm_label_size(NVDIMMDevice *nvdimm, hwaddr dsm_mem_addr) static uint32_t nvdimm_rw_label_data_check(NVDIMMDevice *nvdimm, uint32_t offset, uint32_t length) { - uint32_t ret = 3 /* Invalid Input Parameters */; + uint32_t ret = NVDIMM_DSM_RET_STATUS_INVALID; if (offset + length < offset) { nvdimm_debug("offset %#x + length %#x is overflow.\n", offset, @@ -670,7 +676,7 @@ static uint32_t nvdimm_rw_label_data_check(NVDIMMDevice *nvdimm, return ret; } - return 0 /* Success */; + return NVDIMM_DSM_RET_STATUS_SUCCESS; } /* @@ -694,7 +700,7 @@ static void nvdimm_dsm_get_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in, status = nvdimm_rw_label_data_check(nvdimm, get_label_data->offset, get_label_data->length); - if (status != 0 /* Success */) { + if (status != NVDIMM_DSM_RET_STATUS_SUCCESS) { nvdimm_dsm_no_payload(status, dsm_mem_addr); return; } @@ -704,7 +710,8 @@ static void nvdimm_dsm_get_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in, get_label_data_out = g_malloc(size); get_label_data_out->len = cpu_to_le32(size); - get_label_data_out->func_ret_status = cpu_to_le32(0 /* Success */); + get_label_data_out->func_ret_status = + cpu_to_le32(NVDIMM_DSM_RET_STATUS_SUCCESS); nvc->read_label_data(nvdimm, get_label_data_out->out_buf, get_label_data->length, get_label_data->offset); @@ -732,7 +739,7 @@ static void nvdimm_dsm_set_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in, status = nvdimm_rw_label_data_check(nvdimm, set_label_data->offset, set_label_data->length); - if (status != 0 /* Success */) { + if (status != NVDIMM_DSM_RET_STATUS_SUCCESS) { nvdimm_dsm_no_payload(status, dsm_mem_addr); return; } @@ -742,7 +749,7 @@ static void nvdimm_dsm_set_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in, nvc->write_label_data(nvdimm, set_label_data->in_buf, set_label_data->length, set_label_data->offset); - nvdimm_dsm_no_payload(0 /* Success */, dsm_mem_addr); + nvdimm_dsm_no_payload(NVDIMM_DSM_RET_STATUS_SUCCESS, dsm_mem_addr); } static void nvdimm_dsm_device(NvdimmDsmIn *in, hwaddr dsm_mem_addr) @@ -766,7 +773,7 @@ static void nvdimm_dsm_device(NvdimmDsmIn *in, hwaddr dsm_mem_addr) } if (!nvdimm) { - nvdimm_dsm_no_payload(2 /* Non-Existing Memory Device */, + nvdimm_dsm_no_payload(NVDIMM_DSM_RET_STATUS_NOMEMDEV, dsm_mem_addr); return; } @@ -793,7 +800,7 @@ static void nvdimm_dsm_device(NvdimmDsmIn *in, hwaddr dsm_mem_addr) break; } - nvdimm_dsm_no_payload(1 /* Not Supported */, dsm_mem_addr); + nvdimm_dsm_no_payload(NVDIMM_DSM_RET_STATUS_UNSUPPORT, dsm_mem_addr); } static uint64_t @@ -830,7 +837,7 @@ nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) if (in->revision != 0x1 /* Currently we only support DSM Spec Rev1. */) { nvdimm_debug("Revision %#x is not supported, expect %#x.\n", in->revision, 0x1); - nvdimm_dsm_no_payload(1 /* Not Supported */, dsm_mem_addr); + nvdimm_dsm_no_payload(NVDIMM_DSM_RET_STATUS_UNSUPPORT, dsm_mem_addr); goto exit; } @@ -1018,7 +1025,7 @@ static void nvdimm_build_common_dsm(Aml *dev) aml_append(unsupport, ifctx); /* No function is supported yet. */ - byte_list[0] = 1 /* Not Supported */; + byte_list[0] = NVDIMM_DSM_RET_STATUS_UNSUPPORT; aml_append(unsupport, aml_return(aml_buffer(1, byte_list))); aml_append(method, unsupport); @@ -1119,7 +1126,8 @@ static void nvdimm_build_fit(Aml *dev) aml_name(NVDIMM_DSM_RFIT_STATUS))); /* if something is wrong during _DSM. */ - ifcond = aml_equal(aml_int(0 /* Success */), aml_name("STAU")); + ifcond = aml_equal(aml_int(NVDIMM_DSM_RET_STATUS_SUCCESS), + aml_name("STAU")); ifctx = aml_if(aml_lnot(ifcond)); aml_append(ifctx, aml_return(aml_buffer(0, NULL))); aml_append(method, ifctx); @@ -1156,7 +1164,7 @@ static void nvdimm_build_fit(Aml *dev) * again. */ ifctx = aml_if(aml_equal(aml_name(NVDIMM_DSM_RFIT_STATUS), - aml_int(0x100 /* fit changed */))); + aml_int(NVDIMM_DSM_RET_STATUS_FIT_CHANGED))); aml_append(ifctx, aml_store(aml_buffer(0, NULL), fit)); aml_append(ifctx, aml_store(aml_int(0), offset)); aml_append(whilectx, ifctx); From 7adbce633908fd093e24eee50018898dc11802cc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:46 +0800 Subject: [PATCH 21/28] nvdimm acpi: fix two comments fixed the English issue and code-style issue Suggested-by: Stefan Hajnoczi Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/acpi/nvdimm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index c7e7744073..6f6f51fef7 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -479,7 +479,7 @@ QEMU_BUILD_BUG_ON(sizeof(NvdimmFuncSetLabelDataIn) + offsetof(NvdimmDsmIn, arg3) > 4096); struct NvdimmFuncReadFITIn { - uint32_t offset; /* the offset of FIT buffer. */ + uint32_t offset; /* the offset into FIT buffer. */ } QEMU_PACKED; typedef struct NvdimmFuncReadFITIn NvdimmFuncReadFITIn; QEMU_BUILD_BUG_ON(sizeof(NvdimmFuncReadFITIn) + @@ -578,7 +578,7 @@ static void nvdimm_dsm_reserved_root(AcpiNVDIMMState *state, NvdimmDsmIn *in, case 0x0: nvdimm_dsm_function0(0x1 | 1 << 1 /* Read FIT */, dsm_mem_addr); return; - case 0x1 /*Read FIT */: + case 0x1 /* Read FIT */: nvdimm_dsm_func_read_fit(state, in, dsm_mem_addr); return; } From 5a33db78b0f3ee808a3d8dd5c427edfbe80bdc73 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:47 +0800 Subject: [PATCH 22/28] nvdimm acpi: rename nvdimm_dsm_reserved_root Rename it to nvdimm_dsm_handle_reserved_root_method Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/acpi/nvdimm.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 6f6f51fef7..6692648e7e 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -571,8 +571,9 @@ exit: g_free(read_fit_out); } -static void nvdimm_dsm_reserved_root(AcpiNVDIMMState *state, NvdimmDsmIn *in, - hwaddr dsm_mem_addr) +static void +nvdimm_dsm_handle_reserved_root_method(AcpiNVDIMMState *state, + NvdimmDsmIn *in, hwaddr dsm_mem_addr) { switch (in->function) { case 0x0: @@ -842,7 +843,7 @@ nvdimm_dsm_write(void *opaque, hwaddr addr, uint64_t val, unsigned size) } if (in->handle == NVDIMM_QEMU_RSVD_HANDLE_ROOT) { - nvdimm_dsm_reserved_root(state, in, dsm_mem_addr); + nvdimm_dsm_handle_reserved_root_method(state, in, dsm_mem_addr); goto exit; } From aef056c11d082fcde44c5cbd3f91548738c220a8 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:48 +0800 Subject: [PATCH 23/28] nvdimm acpi: use aml_name_decl to define named object to make the code more clearer Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/acpi/nvdimm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 6692648e7e..5f48b75b8e 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -1098,13 +1098,11 @@ static void nvdimm_build_fit(Aml *dev) buf_size = aml_local(1); fit = aml_local(2); - aml_append(dev, aml_create_dword_field(aml_buffer(4, NULL), - aml_int(0), NVDIMM_DSM_RFIT_STATUS)); + aml_append(dev, aml_name_decl(NVDIMM_DSM_RFIT_STATUS, aml_int(0))); /* build helper function, RFIT. */ method = aml_method("RFIT", 1, AML_SERIALIZED); - aml_append(method, aml_create_dword_field(aml_buffer(4, NULL), - aml_int(0), "OFST")); + aml_append(method, aml_name_decl("OFST", aml_int(0))); /* prepare input package. */ pkg = aml_package(1); From cb88ebd7542458c22f3051646f268dcea6109abc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Nov 2016 19:13:49 +0800 Subject: [PATCH 24/28] nvdimm acpi: introduce NVDIMM_DSM_MEMORY_SIZE and use it to replace the raw number Suggested-by: Igor Mammedov Signed-off-by: Xiao Guangrong Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/acpi/nvdimm.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 5f48b75b8e..8e7d6ec034 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -404,6 +404,8 @@ static void nvdimm_build_nfit(AcpiNVDIMMState *state, GArray *table_offsets, sizeof(NvdimmNfitHeader) + fit_buf->fit->len, 1, NULL, NULL); } +#define NVDIMM_DSM_MEMORY_SIZE 4096 + struct NvdimmDsmIn { uint32_t handle; uint32_t revision; @@ -414,7 +416,7 @@ struct NvdimmDsmIn { }; } QEMU_PACKED; typedef struct NvdimmDsmIn NvdimmDsmIn; -QEMU_BUILD_BUG_ON(sizeof(NvdimmDsmIn) != 4096); +QEMU_BUILD_BUG_ON(sizeof(NvdimmDsmIn) != NVDIMM_DSM_MEMORY_SIZE); struct NvdimmDsmOut { /* the size of buffer filled by QEMU. */ @@ -422,7 +424,7 @@ struct NvdimmDsmOut { uint8_t data[4092]; } QEMU_PACKED; typedef struct NvdimmDsmOut NvdimmDsmOut; -QEMU_BUILD_BUG_ON(sizeof(NvdimmDsmOut) != 4096); +QEMU_BUILD_BUG_ON(sizeof(NvdimmDsmOut) != NVDIMM_DSM_MEMORY_SIZE); struct NvdimmDsmFunc0Out { /* the size of buffer filled by QEMU. */ @@ -450,7 +452,7 @@ struct NvdimmFuncGetLabelSizeOut { uint32_t max_xfer; } QEMU_PACKED; typedef struct NvdimmFuncGetLabelSizeOut NvdimmFuncGetLabelSizeOut; -QEMU_BUILD_BUG_ON(sizeof(NvdimmFuncGetLabelSizeOut) > 4096); +QEMU_BUILD_BUG_ON(sizeof(NvdimmFuncGetLabelSizeOut) > NVDIMM_DSM_MEMORY_SIZE); struct NvdimmFuncGetLabelDataIn { uint32_t offset; /* the offset in the namespace label data area. */ @@ -458,7 +460,7 @@ struct NvdimmFuncGetLabelDataIn { } QEMU_PACKED; typedef struct NvdimmFuncGetLabelDataIn NvdimmFuncGetLabelDataIn; QEMU_BUILD_BUG_ON(sizeof(NvdimmFuncGetLabelDataIn) + - offsetof(NvdimmDsmIn, arg3) > 4096); + offsetof(NvdimmDsmIn, arg3) > NVDIMM_DSM_MEMORY_SIZE); struct NvdimmFuncGetLabelDataOut { /* the size of buffer filled by QEMU. */ @@ -467,7 +469,7 @@ struct NvdimmFuncGetLabelDataOut { uint8_t out_buf[0]; /* the data got via Get Namesapce Label function. */ } QEMU_PACKED; typedef struct NvdimmFuncGetLabelDataOut NvdimmFuncGetLabelDataOut; -QEMU_BUILD_BUG_ON(sizeof(NvdimmFuncGetLabelDataOut) > 4096); +QEMU_BUILD_BUG_ON(sizeof(NvdimmFuncGetLabelDataOut) > NVDIMM_DSM_MEMORY_SIZE); struct NvdimmFuncSetLabelDataIn { uint32_t offset; /* the offset in the namespace label data area. */ @@ -476,14 +478,14 @@ struct NvdimmFuncSetLabelDataIn { } QEMU_PACKED; typedef struct NvdimmFuncSetLabelDataIn NvdimmFuncSetLabelDataIn; QEMU_BUILD_BUG_ON(sizeof(NvdimmFuncSetLabelDataIn) + - offsetof(NvdimmDsmIn, arg3) > 4096); + offsetof(NvdimmDsmIn, arg3) > NVDIMM_DSM_MEMORY_SIZE); struct NvdimmFuncReadFITIn { uint32_t offset; /* the offset into FIT buffer. */ } QEMU_PACKED; typedef struct NvdimmFuncReadFITIn NvdimmFuncReadFITIn; QEMU_BUILD_BUG_ON(sizeof(NvdimmFuncReadFITIn) + - offsetof(NvdimmDsmIn, arg3) > 4096); + offsetof(NvdimmDsmIn, arg3) > NVDIMM_DSM_MEMORY_SIZE); struct NvdimmFuncReadFITOut { /* the size of buffer filled by QEMU. */ @@ -492,7 +494,7 @@ struct NvdimmFuncReadFITOut { uint8_t fit[0]; /* the FIT data. */ } QEMU_PACKED; typedef struct NvdimmFuncReadFITOut NvdimmFuncReadFITOut; -QEMU_BUILD_BUG_ON(sizeof(NvdimmFuncReadFITOut) > 4096); +QEMU_BUILD_BUG_ON(sizeof(NvdimmFuncReadFITOut) > NVDIMM_DSM_MEMORY_SIZE); static void nvdimm_dsm_function0(uint32_t supported_func, hwaddr dsm_mem_addr) @@ -556,7 +558,7 @@ static void nvdimm_dsm_func_read_fit(AcpiNVDIMMState *state, NvdimmDsmIn *in, func_ret_status = NVDIMM_DSM_RET_STATUS_SUCCESS; read_len = MIN(fit->len - read_fit->offset, - 4096 - sizeof(NvdimmFuncReadFITOut)); + NVDIMM_DSM_MEMORY_SIZE - sizeof(NvdimmFuncReadFITOut)); exit: size = sizeof(NvdimmFuncReadFITOut) + read_len; @@ -610,7 +612,9 @@ static void nvdimm_dsm_root(NvdimmDsmIn *in, hwaddr dsm_mem_addr) */ static uint32_t nvdimm_get_max_xfer_label_size(void) { - uint32_t max_get_size, max_set_size, dsm_memory_size = 4096; + uint32_t max_get_size, max_set_size, dsm_memory_size; + + dsm_memory_size = NVDIMM_DSM_MEMORY_SIZE; /* * the max data ACPI can read one time which is transferred by @@ -707,7 +711,7 @@ static void nvdimm_dsm_get_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in, } size = sizeof(*get_label_data_out) + get_label_data->length; - assert(size <= 4096); + assert(size <= NVDIMM_DSM_MEMORY_SIZE); get_label_data_out = g_malloc(size); get_label_data_out->len = cpu_to_le32(size); @@ -745,8 +749,8 @@ static void nvdimm_dsm_set_label_data(NVDIMMDevice *nvdimm, NvdimmDsmIn *in, return; } - assert(offsetof(NvdimmDsmIn, arg3) + - sizeof(*set_label_data) + set_label_data->length <= 4096); + assert(offsetof(NvdimmDsmIn, arg3) + sizeof(*set_label_data) + + set_label_data->length <= NVDIMM_DSM_MEMORY_SIZE); nvc->write_label_data(nvdimm, set_label_data->in_buf, set_label_data->length, set_label_data->offset); From f1f9e6c5961ffb36fd4a81cd7edcded7bfad2ab2 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 4 Nov 2016 09:39:15 +0100 Subject: [PATCH 25/28] vhost: adapt vhost_verify_ring_mappings() to virtio 1 ring layout With virtio 1, the vring layout is split in 3 separate regions of contiguous memory for the descriptor table, the available ring and the used ring, as opposed with legacy virtio which uses a single region. In case of memory re-mapping, the code ensures it doesn't affect the vring mapping. This is done in vhost_verify_ring_mappings() which assumes the device is legacy. This patch changes vhost_verify_ring_mappings() to check the mappings of each part of the vring separately. This works for legacy mappings as well. Cc: qemu-stable@nongnu.org Signed-off-by: Greg Kurz Reviewed-by: Cornelia Huck Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost.c | 79 +++++++++++++++++++++++++++++---------- include/hw/virtio/vhost.h | 4 ++ 2 files changed, 64 insertions(+), 19 deletions(-) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 25bf67f3cc..d88d34a044 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -421,32 +421,73 @@ static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size) dev->log_size = size; } + +static int vhost_verify_ring_part_mapping(void *part, + uint64_t part_addr, + uint64_t part_size, + uint64_t start_addr, + uint64_t size) +{ + hwaddr l; + void *p; + int r = 0; + + if (!ranges_overlap(start_addr, size, part_addr, part_size)) { + return 0; + } + l = part_size; + p = cpu_physical_memory_map(part_addr, &l, 1); + if (!p || l != part_size) { + r = -ENOMEM; + } + if (p != part) { + r = -EBUSY; + } + cpu_physical_memory_unmap(p, l, 0, 0); + return r; +} + static int vhost_verify_ring_mappings(struct vhost_dev *dev, uint64_t start_addr, uint64_t size) { - int i; + int i, j; int r = 0; + const char *part_name[] = { + "descriptor table", + "available ring", + "used ring" + }; - for (i = 0; !r && i < dev->nvqs; ++i) { + for (i = 0; i < dev->nvqs; ++i) { struct vhost_virtqueue *vq = dev->vqs + i; - hwaddr l; - void *p; - if (!ranges_overlap(start_addr, size, vq->ring_phys, vq->ring_size)) { - continue; + j = 0; + r = vhost_verify_ring_part_mapping(vq->desc, vq->desc_phys, + vq->desc_size, start_addr, size); + if (!r) { + break; } - l = vq->ring_size; - p = cpu_physical_memory_map(vq->ring_phys, &l, 1); - if (!p || l != vq->ring_size) { - error_report("Unable to map ring buffer for ring %d", i); - r = -ENOMEM; + + j++; + r = vhost_verify_ring_part_mapping(vq->avail, vq->avail_phys, + vq->avail_size, start_addr, size); + if (!r) { + break; } - if (p != vq->ring) { - error_report("Ring buffer relocated for ring %d", i); - r = -EBUSY; + + j++; + r = vhost_verify_ring_part_mapping(vq->used, vq->used_phys, + vq->used_size, start_addr, size); + if (!r) { + break; } - cpu_physical_memory_unmap(p, l, 0, 0); + } + + if (r == -ENOMEM) { + error_report("Unable to map %s for ring %d", part_name[j], i); + } else if (r == -EBUSY) { + error_report("%s relocated for ring %d", part_name[j], i); } return r; } @@ -860,15 +901,15 @@ static int vhost_virtqueue_start(struct vhost_dev *dev, } } - s = l = virtio_queue_get_desc_size(vdev, idx); - a = virtio_queue_get_desc_addr(vdev, idx); + vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx); + vq->desc_phys = a = virtio_queue_get_desc_addr(vdev, idx); vq->desc = cpu_physical_memory_map(a, &l, 0); if (!vq->desc || l != s) { r = -ENOMEM; goto fail_alloc_desc; } - s = l = virtio_queue_get_avail_size(vdev, idx); - a = virtio_queue_get_avail_addr(vdev, idx); + vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx); + vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx); vq->avail = cpu_physical_memory_map(a, &l, 0); if (!vq->avail || l != s) { r = -ENOMEM; diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index e433089ea9..56b567f199 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -14,6 +14,10 @@ struct vhost_virtqueue { void *avail; void *used; int num; + unsigned long long desc_phys; + unsigned desc_size; + unsigned long long avail_phys; + unsigned avail_size; unsigned long long used_phys; unsigned used_size; void *ring; From 1cdce7c54d26e64f5eddb10a6f4f7dd938dfc2c4 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 4 Nov 2016 09:39:22 +0100 Subject: [PATCH 26/28] vhost: drop legacy vring layout bits The legacy vring layout is not used anymore as we use the separate mappings even for legacy devices. This patch simply removes it. This also fixes a bug with virtio 1 devices when the vring descriptor table is mapped at a higher address than the used vring because the following function may return an insanely great value: hwaddr virtio_queue_get_ring_size(VirtIODevice *vdev, int n) { return vdev->vq[n].vring.used - vdev->vq[n].vring.desc + virtio_queue_get_used_size(vdev, n); } and the mapping fails. Signed-off-by: Greg Kurz Reviewed-by: Cornelia Huck Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost.c | 13 ------------- include/hw/virtio/vhost.h | 3 --- 2 files changed, 16 deletions(-) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index d88d34a044..30aee88a3e 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -923,14 +923,6 @@ static int vhost_virtqueue_start(struct vhost_dev *dev, goto fail_alloc_used; } - vq->ring_size = s = l = virtio_queue_get_ring_size(vdev, idx); - vq->ring_phys = a = virtio_queue_get_ring_addr(vdev, idx); - vq->ring = cpu_physical_memory_map(a, &l, 1); - if (!vq->ring || l != s) { - r = -ENOMEM; - goto fail_alloc_ring; - } - r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled); if (r < 0) { r = -errno; @@ -971,9 +963,6 @@ static int vhost_virtqueue_start(struct vhost_dev *dev, fail_vector: fail_kick: fail_alloc: - cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx), - 0, 0); -fail_alloc_ring: cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx), 0, 0); fail_alloc_used: @@ -1014,8 +1003,6 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev, vhost_vq_index); } - cpu_physical_memory_unmap(vq->ring, virtio_queue_get_ring_size(vdev, idx), - 0, virtio_queue_get_ring_size(vdev, idx)); cpu_physical_memory_unmap(vq->used, virtio_queue_get_used_size(vdev, idx), 1, virtio_queue_get_used_size(vdev, idx)); cpu_physical_memory_unmap(vq->avail, virtio_queue_get_avail_size(vdev, idx), diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 56b567f199..1fe5aadef5 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -20,9 +20,6 @@ struct vhost_virtqueue { unsigned avail_size; unsigned long long used_phys; unsigned used_size; - void *ring; - unsigned long long ring_phys; - unsigned ring_size; EventNotifier masked_notifier; }; From 435346d7484aa2aad1875e35948df3c3f4feee75 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 4 Nov 2016 09:39:29 +0100 Subject: [PATCH 27/28] virtio: drop virtio_queue_get_ring_{size,addr}() These are not used anymore. Signed-off-by: Greg Kurz Reviewed-by: Cornelia Huck Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/virtio.c | 11 ----------- include/hw/virtio/virtio.h | 2 -- 2 files changed, 13 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 72ee06be6c..55a00cdf9e 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -1935,11 +1935,6 @@ hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n) return vdev->vq[n].vring.used; } -hwaddr virtio_queue_get_ring_addr(VirtIODevice *vdev, int n) -{ - return vdev->vq[n].vring.desc; -} - hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n) { return sizeof(VRingDesc) * vdev->vq[n].vring.num; @@ -1957,12 +1952,6 @@ hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n) sizeof(VRingUsedElem) * vdev->vq[n].vring.num; } -hwaddr virtio_queue_get_ring_size(VirtIODevice *vdev, int n) -{ - return vdev->vq[n].vring.used - vdev->vq[n].vring.desc + - virtio_queue_get_used_size(vdev, n); -} - uint16_t virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n) { return vdev->vq[n].last_avail_idx; diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index bdb3c4b46b..5951997f22 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -259,11 +259,9 @@ typedef struct VirtIORNGConf VirtIORNGConf; hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n); hwaddr virtio_queue_get_avail_addr(VirtIODevice *vdev, int n); hwaddr virtio_queue_get_used_addr(VirtIODevice *vdev, int n); -hwaddr virtio_queue_get_ring_addr(VirtIODevice *vdev, int n); hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n); hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n); hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n); -hwaddr virtio_queue_get_ring_size(VirtIODevice *vdev, int n); uint16_t virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n); void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx); void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n); From 453ac8835b002263a6b7b0843e7c90fa8b19c869 Mon Sep 17 00:00:00 2001 From: Marcel Apfelbaum Date: Tue, 1 Nov 2016 15:39:47 +0200 Subject: [PATCH 28/28] docs: add PCIe devices placement guidelines Proposes best practices on how to use PCI Express/PCI device in PCI Express based machines and explain the reasoning behind them. Reviewed-by: Laszlo Ersek Signed-off-by: Marcel Apfelbaum Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- docs/pcie.txt | 310 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 docs/pcie.txt diff --git a/docs/pcie.txt b/docs/pcie.txt new file mode 100644 index 0000000000..9fb20aaed9 --- /dev/null +++ b/docs/pcie.txt @@ -0,0 +1,310 @@ +PCI EXPRESS GUIDELINES +====================== + +1. Introduction +================ +The doc proposes best practices on how to use PCI Express/PCI device +in PCI Express based machines and explains the reasoning behind them. + +The following presentations accompany this document: + (1) Q35 overview. + http://wiki.qemu.org/images/4/4e/Q35.pdf + (2) A comparison between PCI and PCI Express technologies. + http://wiki.qemu.org/images/f/f6/PCIvsPCIe.pdf + +Note: The usage examples are not intended to replace the full +documentation, please use QEMU help to retrieve all options. + +2. Device placement strategy +============================ +QEMU does not have a clear socket-device matching mechanism +and allows any PCI/PCI Express device to be plugged into any +PCI/PCI Express slot. +Plugging a PCI device into a PCI Express slot might not always work and +is weird anyway since it cannot be done for "bare metal". +Plugging a PCI Express device into a PCI slot will hide the Extended +Configuration Space thus is also not recommended. + +The recommendation is to separate the PCI Express and PCI hierarchies. +PCI Express devices should be plugged only into PCI Express Root Ports and +PCI Express Downstream ports. + +2.1 Root Bus (pcie.0) +===================== +Place only the following kinds of devices directly on the Root Complex: + (1) PCI Devices (e.g. network card, graphics card, IDE controller), + not controllers. Place only legacy PCI devices on + the Root Complex. These will be considered Integrated Endpoints. + Note: Integrated Endpoints are not hot-pluggable. + + Although the PCI Express spec does not forbid PCI Express devices as + Integrated Endpoints, existing hardware mostly integrates legacy PCI + devices with the Root Complex. Guest OSes are suspected to behave + strangely when PCI Express devices are integrated + with the Root Complex. + + (2) PCI Express Root Ports (ioh3420), for starting exclusively PCI Express + hierarchies. + + (3) DMI-PCI Bridges (i82801b11-bridge), for starting legacy PCI + hierarchies. + + (4) Extra Root Complexes (pxb-pcie), if multiple PCI Express Root Buses + are needed. + + pcie.0 bus + ---------------------------------------------------------------------------- + | | | | + ----------- ------------------ ------------------ -------------- + | PCI Dev | | PCIe Root Port | | DMI-PCI Bridge | | pxb-pcie | + ----------- ------------------ ------------------ -------------- + +2.1.1 To plug a device into pcie.0 as a Root Complex Integrated Endpoint use: + -device [,bus=pcie.0] +2.1.2 To expose a new PCI Express Root Bus use: + -device pxb-pcie,id=pcie.1,bus_nr=x[,numa_node=y][,addr=z] + Only PCI Express Root Ports and DMI-PCI bridges can be connected + to the pcie.1 bus: + -device ioh3420,id=root_port1[,bus=pcie.1][,chassis=x][,slot=y][,addr=z] \ + -device i82801b11-bridge,id=dmi_pci_bridge1,bus=pcie.1 + + +2.2 PCI Express only hierarchy +============================== +Always use PCI Express Root Ports to start PCI Express hierarchies. + +A PCI Express Root bus supports up to 32 devices. Since each +PCI Express Root Port is a function and a multi-function +device may support up to 8 functions, the maximum possible +number of PCI Express Root Ports per PCI Express Root Bus is 256. + +Prefer grouping PCI Express Root Ports into multi-function devices +to keep a simple flat hierarchy that is enough for most scenarios. +Only use PCI Express Switches (x3130-upstream, xio3130-downstream) +if there is no more room for PCI Express Root Ports. +Please see section 4. for further justifications. + +Plug only PCI Express devices into PCI Express Ports. + + + pcie.0 bus + ---------------------------------------------------------------------------------- + | | | + ------------- ------------- ------------- + | Root Port | | Root Port | | Root Port | + ------------ ------------- ------------- + | -------------------------|------------------------ + ------------ | ----------------- | + | PCIe Dev | | PCI Express | Upstream Port | | + ------------ | Switch ----------------- | + | | | | + | ------------------- ------------------- | + | | Downstream Port | | Downstream Port | | + | ------------------- ------------------- | + -------------|-----------------------|------------ + ------------ + | PCIe Dev | + ------------ + +2.2.1 Plugging a PCI Express device into a PCI Express Root Port: + -device ioh3420,id=root_port1,chassis=x,slot=y[,bus=pcie.0][,addr=z] \ + -device ,bus=root_port1 +2.2.2 Using multi-function PCI Express Root Ports: + -device ioh3420,id=root_port1,multifunction=on,chassis=x,slot=y[,bus=pcie.0][,addr=z.0] \ + -device ioh3420,id=root_port2,chassis=x1,slot=y1[,bus=pcie.0][,addr=z.1] \ + -device ioh3420,id=root_port3,chassis=x2,slot=y2[,bus=pcie.0][,addr=z.2] \ +2.2.2 Plugging a PCI Express device into a Switch: + -device ioh3420,id=root_port1,chassis=x,slot=y[,bus=pcie.0][,addr=z] \ + -device x3130-upstream,id=upstream_port1,bus=root_port1[,addr=x] \ + -device xio3130-downstream,id=downstream_port1,bus=upstream_port1,chassis=x1,slot=y1[,addr=z1]] \ + -device ,bus=downstream_port1 + +Notes: + - (slot, chassis) pair is mandatory and must be + unique for each PCI Express Root Port. + - 'addr' parameter can be 0 for all the examples above. + + +2.3 PCI only hierarchy +====================== +Legacy PCI devices can be plugged into pcie.0 as Integrated Endpoints, +but, as mentioned in section 5, doing so means the legacy PCI +device in question will be incapable of hot-unplugging. +Besides that use DMI-PCI Bridges (i82801b11-bridge) in combination +with PCI-PCI Bridges (pci-bridge) to start PCI hierarchies. + +Prefer flat hierarchies. For most scenarios a single DMI-PCI Bridge +(having 32 slots) and several PCI-PCI Bridges attached to it +(each supporting also 32 slots) will support hundreds of legacy devices. +The recommendation is to populate one PCI-PCI Bridge under the DMI-PCI Bridge +until is full and then plug a new PCI-PCI Bridge... + + pcie.0 bus + ---------------------------------------------- + | | + ----------- ------------------ + | PCI Dev | | DMI-PCI BRIDGE | + ---------- ------------------ + | | + ------------------ ------------------ + | PCI-PCI Bridge | | PCI-PCI Bridge | ... + ------------------ ------------------ + | | + ----------- ----------- + | PCI Dev | | PCI Dev | + ----------- ----------- + +2.3.1 To plug a PCI device into pcie.0 as an Integrated Endpoint use: + -device [,bus=pcie.0] +2.3.2 Plugging a PCI device into a PCI-PCI Bridge: + -device i82801b11-bridge,id=dmi_pci_bridge1[,bus=pcie.0] \ + -device pci-bridge,id=pci_bridge1,bus=dmi_pci_bridge1[,chassis_nr=x][,addr=y] \ + -device ,bus=pci_bridge1[,addr=x] + Note that 'addr' cannot be 0 unless shpc=off parameter is passed to + the PCI Bridge. + +3. IO space issues +=================== +The PCI Express Root Ports and PCI Express Downstream ports are seen by +Firmware/Guest OS as PCI-PCI Bridges. As required by the PCI spec, each +such Port should be reserved a 4K IO range for, even though only one +(multifunction) device can be plugged into each Port. This results in +poor IO space utilization. + +The firmware used by QEMU (SeaBIOS/OVMF) may try further optimizations +by not allocating IO space for each PCI Express Root / PCI Express +Downstream port if: + (1) the port is empty, or + (2) the device behind the port has no IO BARs. + +The IO space is very limited, to 65536 byte-wide IO ports, and may even be +fragmented by fixed IO ports owned by platform devices resulting in at most +10 PCI Express Root Ports or PCI Express Downstream Ports per system +if devices with IO BARs are used in the PCI Express hierarchy. Using the +proposed device placing strategy solves this issue by using only +PCI Express devices within PCI Express hierarchy. + +The PCI Express spec requires that PCI Express devices work properly +without using IO ports. The PCI hierarchy has no such limitations. + + +4. Bus numbers issues +====================== +Each PCI domain can have up to only 256 buses and the QEMU PCI Express +machines do not support multiple PCI domains even if extra Root +Complexes (pxb-pcie) are used. + +Each element of the PCI Express hierarchy (Root Complexes, +PCI Express Root Ports, PCI Express Downstream/Upstream ports) +uses one bus number. Since only one (multifunction) device +can be attached to a PCI Express Root Port or PCI Express Downstream +Port it is advised to plan in advance for the expected number of +devices to prevent bus number starvation. + +Avoiding PCI Express Switches (and thereby striving for a 'flatter' PCI +Express hierarchy) enables the hierarchy to not spend bus numbers on +Upstream Ports. + +The bus_nr properties of the pxb-pcie devices partition the 0..255 bus +number space. All bus numbers assigned to the buses recursively behind a +given pxb-pcie device's root bus must fit between the bus_nr property of +that pxb-pcie device, and the lowest of the higher bus_nr properties +that the command line sets for other pxb-pcie devices. + + +5. Hot-plug +============ +The PCI Express root buses (pcie.0 and the buses exposed by pxb-pcie devices) +do not support hot-plug, so any devices plugged into Root Complexes +cannot be hot-plugged/hot-unplugged: + (1) PCI Express Integrated Endpoints + (2) PCI Express Root Ports + (3) DMI-PCI Bridges + (4) pxb-pcie + +Be aware that PCI Express Downstream Ports can't be hot-plugged into +an existing PCI Express Upstream Port. + +PCI devices can be hot-plugged into PCI-PCI Bridges. The PCI hot-plug is ACPI +based and can work side by side with the PCI Express native hot-plug. + +PCI Express devices can be natively hot-plugged/hot-unplugged into/from +PCI Express Root Ports (and PCI Express Downstream Ports). + +5.1 Planning for hot-plug: + (1) PCI hierarchy + Leave enough PCI-PCI Bridge slots empty or add one + or more empty PCI-PCI Bridges to the DMI-PCI Bridge. + + For each such PCI-PCI Bridge the Guest Firmware is expected to reserve + 4K IO space and 2M MMIO range to be used for all devices behind it. + + Because of the hard IO limit of around 10 PCI Bridges (~ 40K space) + per system don't use more than 9 PCI-PCI Bridges, leaving 4K for the + Integrated Endpoints. (The PCI Express Hierarchy needs no IO space). + + (2) PCI Express hierarchy: + Leave enough PCI Express Root Ports empty. Use multifunction + PCI Express Root Ports (up to 8 ports per pcie.0 slot) + on the Root Complex(es), for keeping the + hierarchy as flat as possible, thereby saving PCI bus numbers. + Don't use PCI Express Switches if you don't have + to, each one of those uses an extra PCI bus (for its Upstream Port) + that could be put to better use with another Root Port or Downstream + Port, which may come handy for hot-plugging another device. + + +5.3 Hot-plug example: +Using HMP: (add -monitor stdio to QEMU command line) + device_add ,id=,bus= + + +6. Device assignment +==================== +Host devices are mostly PCI Express and should be plugged only into +PCI Express Root Ports or PCI Express Downstream Ports. +PCI-PCI Bridge slots can be used for legacy PCI host devices. + +6.1 How to detect if a device is PCI Express: + > lspci -s 03:00.0 -v (as root) + + 03:00.0 Network controller: Intel Corporation Wireless 7260 (rev 83) + Subsystem: Intel Corporation Dual Band Wireless-AC 7260 + Flags: bus master, fast devsel, latency 0, IRQ 50 + Memory at f0400000 (64-bit, non-prefetchable) [size=8K] + Capabilities: [c8] Power Management version 3 + Capabilities: [d0] MSI: Enable+ Count=1/1 Maskable- 64bit+ + Capabilities: [40] Express Endpoint, MSI 00 + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + Capabilities: [100] Advanced Error Reporting + Capabilities: [140] Device Serial Number 7c-7a-91-ff-ff-90-db-20 + Capabilities: [14c] Latency Tolerance Reporting + Capabilities: [154] Vendor Specific Information: ID=cafe Rev=1 Len=014 + +If you can see the "Express Endpoint" capability in the +output, then the device is indeed PCI Express. + + +7. Virtio devices +================= +Virtio devices plugged into the PCI hierarchy or as Integrated Endpoints +will remain PCI and have transitional behaviour as default. +Transitional virtio devices work in both IO and MMIO modes depending on +the guest support. The Guest firmware will assign both IO and MMIO resources +to transitional virtio devices. + +Virtio devices plugged into PCI Express ports are PCI Express devices and +have "1.0" behavior by default without IO support. +In both cases disable-legacy and disable-modern properties can be used +to override the behaviour. + +Note that setting disable-legacy=off will enable legacy mode (enabling +legacy behavior) for PCI Express virtio devices causing them to +require IO space, which, given the limited available IO space, may quickly +lead to resource exhaustion, and is therefore strongly discouraged. + + +8. Conclusion +============== +The proposal offers a usage model that is easy to understand and follow +and at the same time overcomes the PCI Express architecture limitations.