From 0f7a903ba3f0f8dfb347fb15b783aade4833826e Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:11 +0530 Subject: [PATCH 01/32] vfio: Add function to unmap VFIO region This function will be used for migration region. Migration region is mmaped when migration starts and will be unmapped when migration is complete. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/common.c | 32 ++++++++++++++++++++++++++++---- hw/vfio/trace-events | 1 + include/hw/vfio/vfio-common.h | 1 + 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 13471ae294..c6e98b8d61 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -924,6 +924,18 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, return 0; } +static void vfio_subregion_unmap(VFIORegion *region, int index) +{ + trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem), + region->mmaps[index].offset, + region->mmaps[index].offset + + region->mmaps[index].size - 1); + memory_region_del_subregion(region->mem, &region->mmaps[index].mem); + munmap(region->mmaps[index].mmap, region->mmaps[index].size); + object_unparent(OBJECT(&region->mmaps[index].mem)); + region->mmaps[index].mmap = NULL; +} + int vfio_region_mmap(VFIORegion *region) { int i, prot = 0; @@ -954,10 +966,7 @@ int vfio_region_mmap(VFIORegion *region) region->mmaps[i].mmap = NULL; for (i--; i >= 0; i--) { - memory_region_del_subregion(region->mem, &region->mmaps[i].mem); - munmap(region->mmaps[i].mmap, region->mmaps[i].size); - object_unparent(OBJECT(&region->mmaps[i].mem)); - region->mmaps[i].mmap = NULL; + vfio_subregion_unmap(region, i); } return ret; @@ -982,6 +991,21 @@ int vfio_region_mmap(VFIORegion *region) return 0; } +void vfio_region_unmap(VFIORegion *region) +{ + int i; + + if (!region->mem) { + return; + } + + for (i = 0; i < region->nr_mmaps; i++) { + if (region->mmaps[i].mmap) { + 
vfio_subregion_unmap(region, i); + } + } +} + void vfio_region_exit(VFIORegion *region) { int i; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 93a0bc2522..a0c7b49a2e 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -113,6 +113,7 @@ vfio_region_mmap(const char *name, unsigned long offset, unsigned long end) "Reg vfio_region_exit(const char *name, int index) "Device %s, region %d" vfio_region_finalize(const char *name, int index) "Device %s, region %d" vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps enabled: %d" +vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]" vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries" vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]" vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8" diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index c78f3ff559..dc95f527b5 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -171,6 +171,7 @@ int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, int index, const char *name); int vfio_region_mmap(VFIORegion *region); void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled); +void vfio_region_unmap(VFIORegion *region); void vfio_region_exit(VFIORegion *region); void vfio_region_finalize(VFIORegion *region); void vfio_reset_handler(void *opaque); From e93b733bcf8ee185af14a0f90a217d51cf40e7ea Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:12 +0530 Subject: [PATCH 02/32] vfio: Add vfio_get_object callback to VFIODeviceOps Hook vfio_get_object callback for PCI devices. 
Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Suggested-by: Cornelia Huck Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 8 ++++++++ include/hw/vfio/vfio-common.h | 1 + 2 files changed, 9 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 0d83eb0e47..bffd5bfe3b 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2394,10 +2394,18 @@ static void vfio_pci_compute_needs_reset(VFIODevice *vbasedev) } } +static Object *vfio_pci_get_object(VFIODevice *vbasedev) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + + return OBJECT(vdev); +} + static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, .vfio_eoi = vfio_intx_eoi, + .vfio_get_object = vfio_pci_get_object, }; int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index dc95f527b5..fe99c36a69 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -119,6 +119,7 @@ struct VFIODeviceOps { void (*vfio_compute_needs_reset)(VFIODevice *vdev); int (*vfio_hot_reset_multi)(VFIODevice *vdev); void (*vfio_eoi)(VFIODevice *vdev); + Object *(*vfio_get_object)(VFIODevice *vdev); }; typedef struct VFIOGroup { From c5e2fb3ce4dbb158732420fbd3b963eebbcd85c8 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:13 +0530 Subject: [PATCH 03/32] vfio: Add save and load functions for VFIO PCI devices Added functions to save and restore PCI device specific data, specifically config space of PCI device. 
Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 51 +++++++++++++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 2 ++ 2 files changed, 53 insertions(+) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index bffd5bfe3b..e27c88be6d 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -41,6 +41,7 @@ #include "trace.h" #include "qapi/error.h" #include "migration/blocker.h" +#include "migration/qemu-file.h" #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" @@ -2401,11 +2402,61 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev) return OBJECT(vdev); } +static bool vfio_msix_present(void *opaque, int version_id) +{ + PCIDevice *pdev = opaque; + + return msix_present(pdev); +} + +const VMStateDescription vmstate_vfio_pci_config = { + .name = "VFIOPCIDevice", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), + VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), + VMSTATE_END_OF_LIST() + } +}; + +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + + vmstate_save_state(f, &vmstate_vfio_pci_config, vdev, NULL); +} + +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + PCIDevice *pdev = &vdev->pdev; + int ret; + + ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); + if (ret) { + return ret; + } + + vfio_pci_write_config(pdev, PCI_COMMAND, + pci_get_word(pdev->config + PCI_COMMAND), 2); + + if (msi_enabled(pdev)) { + vfio_msi_enable(vdev); + } else if (msix_enabled(pdev)) { + vfio_msix_enable(vdev); + } + + return ret; +} + static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, .vfio_eoi = vfio_intx_eoi, .vfio_get_object = vfio_pci_get_object, + 
.vfio_save_config = vfio_pci_save_config, + .vfio_load_config = vfio_pci_load_config, }; int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index fe99c36a69..ba6169cd92 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -120,6 +120,8 @@ struct VFIODeviceOps { int (*vfio_hot_reset_multi)(VFIODevice *vdev); void (*vfio_eoi)(VFIODevice *vdev); Object *(*vfio_get_object)(VFIODevice *vdev); + void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f); + int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f); }; typedef struct VFIOGroup { From a9e271ec9b36ef4c7b5bc3b234c85d93931e192e Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:14 +0530 Subject: [PATCH 04/32] vfio: Add migration region initialization and finalize function Whether the VFIO device supports migration or not is decided based of migration region query. If migration region query is successful and migration region initialization is successful then migration is supported else migration is blocked. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Acked-by: Dr. 
David Alan Gilbert Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/meson.build | 1 + hw/vfio/migration.c | 122 ++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 3 + include/hw/vfio/vfio-common.h | 9 +++ 4 files changed, 135 insertions(+) create mode 100644 hw/vfio/migration.c diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build index 37efa74018..da9af297a0 100644 --- a/hw/vfio/meson.build +++ b/hw/vfio/meson.build @@ -2,6 +2,7 @@ vfio_ss = ss.source_set() vfio_ss.add(files( 'common.c', 'spapr.c', + 'migration.c', )) vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files( 'display.c', diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c new file mode 100644 index 0000000000..fd7faf423c --- /dev/null +++ b/hw/vfio/migration.c @@ -0,0 +1,122 @@ +/* + * Migration support for VFIO devices + * + * Copyright NVIDIA, Inc. 2020 + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include <linux/vfio.h> + +#include "hw/vfio/vfio-common.h" +#include "cpu.h" +#include "migration/migration.h" +#include "migration/qemu-file.h" +#include "migration/register.h" +#include "migration/blocker.h" +#include "migration/misc.h" +#include "qapi/error.h" +#include "exec/ramlist.h" +#include "exec/ram_addr.h" +#include "pci.h" +#include "trace.h" + +static void vfio_migration_exit(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + vfio_region_exit(&migration->region); + vfio_region_finalize(&migration->region); + g_free(vbasedev->migration); + vbasedev->migration = NULL; +} + +static int vfio_migration_init(VFIODevice *vbasedev, + struct vfio_region_info *info) +{ + int ret; + Object *obj; + + if (!vbasedev->ops->vfio_get_object) { + return -EINVAL; + } + + obj = vbasedev->ops->vfio_get_object(vbasedev); + if (!obj) { + return -EINVAL; + } + + vbasedev->migration = g_new0(VFIOMigration, 1); + + ret = vfio_region_setup(obj, vbasedev, 
&vbasedev->migration->region, + info->index, "migration"); + if (ret) { + error_report("%s: Failed to setup VFIO migration region %d: %s", + vbasedev->name, info->index, strerror(-ret)); + goto err; + } + + if (!vbasedev->migration->region.size) { + error_report("%s: Invalid zero-sized VFIO migration region %d", + vbasedev->name, info->index); + ret = -EINVAL; + goto err; + } + return 0; + +err: + vfio_migration_exit(vbasedev); + return ret; +} + +/* ---------------------------------------------------------------------- */ + +int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) +{ + struct vfio_region_info *info = NULL; + Error *local_err = NULL; + int ret; + + ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_MIGRATION, + VFIO_REGION_SUBTYPE_MIGRATION, &info); + if (ret) { + goto add_blocker; + } + + ret = vfio_migration_init(vbasedev, info); + if (ret) { + goto add_blocker; + } + + g_free(info); + trace_vfio_migration_probe(vbasedev->name, info->index); + return 0; + +add_blocker: + error_setg(&vbasedev->migration_blocker, + "VFIO device doesn't support migration"); + g_free(info); + + ret = migrate_add_blocker(vbasedev->migration_blocker, &local_err); + if (local_err) { + error_propagate(errp, local_err); + error_free(vbasedev->migration_blocker); + vbasedev->migration_blocker = NULL; + } + return ret; +} + +void vfio_migration_finalize(VFIODevice *vbasedev) +{ + if (vbasedev->migration) { + vfio_migration_exit(vbasedev); + } + + if (vbasedev->migration_blocker) { + migrate_del_blocker(vbasedev->migration_blocker); + error_free(vbasedev->migration_blocker); + vbasedev->migration_blocker = NULL; + } +} diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index a0c7b49a2e..9ced5ec627 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -145,3 +145,6 @@ vfio_display_edid_link_up(void) "" vfio_display_edid_link_down(void) "" vfio_display_edid_update(uint32_t prefx, uint32_t prefy) "%ux%u" vfio_display_edid_write_error(void) "" + +# 
migration.c +vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d" diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index ba6169cd92..8275c4c68f 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -57,6 +57,10 @@ typedef struct VFIORegion { uint8_t nr; /* cache the region number for debug */ } VFIORegion; +typedef struct VFIOMigration { + VFIORegion region; +} VFIOMigration; + typedef struct VFIOAddressSpace { AddressSpace *as; QLIST_HEAD(, VFIOContainer) containers; @@ -113,6 +117,8 @@ typedef struct VFIODevice { unsigned int num_irqs; unsigned int num_regions; unsigned int flags; + VFIOMigration *migration; + Error *migration_blocker; } VFIODevice; struct VFIODeviceOps { @@ -204,4 +210,7 @@ int vfio_spapr_create_window(VFIOContainer *container, int vfio_spapr_remove_window(VFIOContainer *container, hwaddr offset_within_address_space); +int vfio_migration_probe(VFIODevice *vbasedev, Error **errp); +void vfio_migration_finalize(VFIODevice *vbasedev); + #endif /* HW_VFIO_VFIO_COMMON_H */ From 02a7e71b1e5b1313060927e7c86a10be2d7083a7 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:15 +0530 Subject: [PATCH 05/32] vfio: Add VM state change handler to know state of VM VM state change handler is called on change in VM's state. Based on VM state, VFIO device state should be changed. Added read/write helper functions for migration region. Added function to set device_state. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Dr. 
David Alan Gilbert Reviewed-by: Cornelia Huck [aw: lx -> HWADDR_PRIx, remove redundant parens] Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 160 ++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 2 + include/hw/vfio/vfio-common.h | 4 + 3 files changed, 166 insertions(+) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index fd7faf423c..e1ffae05e2 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -10,6 +10,7 @@ #include "qemu/osdep.h" #include <linux/vfio.h> +#include "sysemu/runstate.h" #include "hw/vfio/vfio-common.h" #include "cpu.h" #include "migration/migration.h" @@ -22,6 +23,157 @@ #include "exec/ram_addr.h" #include "pci.h" #include "trace.h" +#include "hw/hw.h" + +static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count, + off_t off, bool iswrite) +{ + int ret; + + ret = iswrite ? pwrite(vbasedev->fd, val, count, off) : + pread(vbasedev->fd, val, count, off); + if (ret < count) { + error_report("vfio_mig_%s %d byte %s: failed at offset 0x%" + HWADDR_PRIx", err: %s", iswrite ? "write" : "read", count, + vbasedev->name, off, strerror(errno)); + return (ret < 0) ? 
ret : -EINVAL; + } + return 0; +} + +static int vfio_mig_rw(VFIODevice *vbasedev, __u8 *buf, size_t count, + off_t off, bool iswrite) +{ + int ret, done = 0; + __u8 *tbuf = buf; + + while (count) { + int bytes = 0; + + if (count >= 8 && !(off % 8)) { + bytes = 8; + } else if (count >= 4 && !(off % 4)) { + bytes = 4; + } else if (count >= 2 && !(off % 2)) { + bytes = 2; + } else { + bytes = 1; + } + + ret = vfio_mig_access(vbasedev, tbuf, bytes, off, iswrite); + if (ret) { + return ret; + } + + count -= bytes; + done += bytes; + off += bytes; + tbuf += bytes; + } + return done; +} + +#define vfio_mig_read(f, v, c, o) vfio_mig_rw(f, (__u8 *)v, c, o, false) +#define vfio_mig_write(f, v, c, o) vfio_mig_rw(f, (__u8 *)v, c, o, true) + +#define VFIO_MIG_STRUCT_OFFSET(f) \ + offsetof(struct vfio_device_migration_info, f) +/* + * Change the device_state register for device @vbasedev. Bits set in @mask + * are preserved, bits set in @value are set, and bits not set in either @mask + * or @value are cleared in device_state. If the register cannot be accessed, + * the resulting state would be invalid, or the device enters an error state, + * an error is returned. 
+ */ + +static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask, + uint32_t value) +{ + VFIOMigration *migration = vbasedev->migration; + VFIORegion *region = &migration->region; + off_t dev_state_off = region->fd_offset + + VFIO_MIG_STRUCT_OFFSET(device_state); + uint32_t device_state; + int ret; + + ret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state), + dev_state_off); + if (ret < 0) { + return ret; + } + + device_state = (device_state & mask) | value; + + if (!VFIO_DEVICE_STATE_VALID(device_state)) { + return -EINVAL; + } + + ret = vfio_mig_write(vbasedev, &device_state, sizeof(device_state), + dev_state_off); + if (ret < 0) { + int rret; + + rret = vfio_mig_read(vbasedev, &device_state, sizeof(device_state), + dev_state_off); + + if ((rret < 0) || (VFIO_DEVICE_STATE_IS_ERROR(device_state))) { + hw_error("%s: Device in error state 0x%x", vbasedev->name, + device_state); + return rret ? rret : -EIO; + } + return ret; + } + + migration->device_state = device_state; + trace_vfio_migration_set_state(vbasedev->name, device_state); + return 0; +} + +static void vfio_vmstate_change(void *opaque, int running, RunState state) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + uint32_t value, mask; + int ret; + + if (vbasedev->migration->vm_running == running) { + return; + } + + if (running) { + /* + * Here device state can have one of _SAVING, _RESUMING or _STOP bit. + * Transition from _SAVING to _RUNNING can happen if there is migration + * failure, in that case clear _SAVING bit. + * Transition from _RESUMING to _RUNNING occurs during resuming + * phase, in that case clear _RESUMING bit. + * In both the above cases, set _RUNNING bit. + */ + mask = ~VFIO_DEVICE_STATE_MASK; + value = VFIO_DEVICE_STATE_RUNNING; + } else { + /* + * Here device state could be either _RUNNING or _SAVING|_RUNNING. 
Reset + * _RUNNING bit + */ + mask = ~VFIO_DEVICE_STATE_RUNNING; + value = 0; + } + + ret = vfio_migration_set_state(vbasedev, mask, value); + if (ret) { + /* + * Migration should be aborted in this case, but vm_state_notify() + * currently does not support reporting failures. + */ + error_report("%s: Failed to set device state 0x%x", vbasedev->name, + (migration->device_state & mask) | value); + qemu_file_set_error(migrate_get_current()->to_dst_file, ret); + } + vbasedev->migration->vm_running = running; + trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state), + (migration->device_state & mask) | value); +} static void vfio_migration_exit(VFIODevice *vbasedev) { @@ -38,6 +190,7 @@ static int vfio_migration_init(VFIODevice *vbasedev, { int ret; Object *obj; + VFIOMigration *migration; if (!vbasedev->ops->vfio_get_object) { return -EINVAL; @@ -64,6 +217,10 @@ static int vfio_migration_init(VFIODevice *vbasedev, ret = -EINVAL; goto err; } + + migration = vbasedev->migration; + migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change, + vbasedev); return 0; err: @@ -111,6 +268,9 @@ add_blocker: void vfio_migration_finalize(VFIODevice *vbasedev) { if (vbasedev->migration) { + VFIOMigration *migration = vbasedev->migration; + + qemu_del_vm_change_state_handler(migration->vm_state); vfio_migration_exit(vbasedev); } diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 9ced5ec627..41de81f12f 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -148,3 +148,5 @@ vfio_display_edid_write_error(void) "" # migration.c vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d" +vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d" +vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d" diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 8275c4c68f..9a571f1fb5 100644 --- 
a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -29,6 +29,7 @@ #ifdef CONFIG_LINUX #include <linux/vfio.h> #endif +#include "sysemu/sysemu.h" #define VFIO_MSG_PREFIX "vfio %s: " @@ -58,7 +59,10 @@ typedef struct VFIORegion { } VFIORegion; typedef struct VFIOMigration { + VMChangeStateEntry *vm_state; VFIORegion region; + uint32_t device_state; + int vm_running; } VFIOMigration; typedef struct VFIOAddressSpace { From 050c588c2ef6edd75769e6c4869d0ad625d5be90 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:16 +0530 Subject: [PATCH 06/32] vfio: Add migration state change notifier Added migration state change notifier to get notification on migration state change. These states are translated to VFIO device state and conveyed to vendor driver. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Dr. David Alan Gilbert Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 28 ++++++++++++++++++++++++++++ hw/vfio/trace-events | 1 + include/hw/vfio/vfio-common.h | 2 ++ 3 files changed, 31 insertions(+) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index e1ffae05e2..7ec85b6469 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -175,6 +175,30 @@ static void vfio_vmstate_change(void *opaque, int running, RunState state) (migration->device_state & mask) | value); } +static void vfio_migration_state_notifier(Notifier *notifier, void *data) +{ + MigrationState *s = data; + VFIOMigration *migration = container_of(notifier, VFIOMigration, + migration_state); + VFIODevice *vbasedev = migration->vbasedev; + int ret; + + trace_vfio_migration_state_notifier(vbasedev->name, + MigrationStatus_str(s->state)); + + switch (s->state) { + case MIGRATION_STATUS_CANCELLING: + case MIGRATION_STATUS_CANCELLED: + case MIGRATION_STATUS_FAILED: + ret = vfio_migration_set_state(vbasedev, + ~(VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RESUMING), + VFIO_DEVICE_STATE_RUNNING); + if (ret) { + 
error_report("%s: Failed to set state RUNNING", vbasedev->name); + } + } +} + static void vfio_migration_exit(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; @@ -219,8 +243,11 @@ static int vfio_migration_init(VFIODevice *vbasedev, } migration = vbasedev->migration; + migration->vbasedev = vbasedev; migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change, vbasedev); + migration->migration_state.notify = vfio_migration_state_notifier; + add_migration_state_change_notifier(&migration->migration_state); return 0; err: @@ -270,6 +297,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev) if (vbasedev->migration) { VFIOMigration *migration = vbasedev->migration; + remove_migration_state_change_notifier(&migration->migration_state); qemu_del_vm_change_state_handler(migration->vm_state); vfio_migration_exit(vbasedev); } diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 41de81f12f..78d7d83b5e 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -150,3 +150,4 @@ vfio_display_edid_write_error(void) "" vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d" vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d" vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d" +vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s" diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 9a571f1fb5..2bd593ba38 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -59,10 +59,12 @@ typedef struct VFIORegion { } VFIORegion; typedef struct VFIOMigration { + struct VFIODevice *vbasedev; VMChangeStateEntry *vm_state; VFIORegion region; uint32_t device_state; int vm_running; + Notifier migration_state; } VFIOMigration; typedef struct VFIOAddressSpace { From 7c2f5f75f94a8820023a46169a4369fd8189a23c Mon Sep 17 00:00:00 2001 From: 
Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:17 +0530 Subject: [PATCH 07/32] vfio: Register SaveVMHandlers for VFIO device Define flags to be used as delimiter in migration stream for VFIO devices. Added .save_setup and .save_cleanup functions. Map & unmap migration region from these functions at source during saving or pre-copy phase. Set VFIO device state depending on VM's state. During live migration, VM is running when .save_setup is called, _SAVING | _RUNNING state is set for VFIO device. During save-restore, VM is paused, _SAVING state is set for VFIO device. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Cornelia Huck Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 102 +++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 2 + 2 files changed, 104 insertions(+) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 7ec85b6469..ca6fd89665 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -8,12 +8,15 @@ */ #include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "qemu/cutils.h" #include <linux/vfio.h> #include "sysemu/runstate.h" #include "hw/vfio/vfio-common.h" #include "cpu.h" #include "migration/migration.h" +#include "migration/vmstate.h" #include "migration/qemu-file.h" #include "migration/register.h" #include "migration/blocker.h" @@ -25,6 +28,22 @@ #include "trace.h" #include "hw/hw.h" +/* + * Flags to be used as unique delimiters for VFIO devices in the migration + * stream. These flags are composed as: + * 0xffffffff => MSB 32-bit all 1s + * 0xef10 => Magic ID, represents emulated (virtual) function IO + * 0x0000 => 16-bits reserved for flags + * + * The beginning of state information is marked by _DEV_CONFIG_STATE, + * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a + * certain state information is marked by _END_OF_STATE. 
+ */ +#define VFIO_MIG_FLAG_END_OF_STATE (0xffffffffef100001ULL) +#define VFIO_MIG_FLAG_DEV_CONFIG_STATE (0xffffffffef100002ULL) +#define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) +#define VFIO_MIG_FLAG_DEV_DATA_STATE (0xffffffffef100004ULL) + static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count, off_t off, bool iswrite) { @@ -129,6 +148,75 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask, return 0; } +static void vfio_migration_cleanup(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + + if (migration->region.mmaps) { + vfio_region_unmap(&migration->region); + } +} + +/* ---------------------------------------------------------------------- */ + +static int vfio_save_setup(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret; + + trace_vfio_save_setup(vbasedev->name); + + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE); + + if (migration->region.mmaps) { + /* + * Calling vfio_region_mmap() from migration thread. Memory API called + * from this function require locking the iothread when called from + * outside the main loop thread. 
+ */ + qemu_mutex_lock_iothread(); + ret = vfio_region_mmap(&migration->region); + qemu_mutex_unlock_iothread(); + if (ret) { + error_report("%s: Failed to mmap VFIO migration region: %s", + vbasedev->name, strerror(-ret)); + error_report("%s: Falling back to slow path", vbasedev->name); + } + } + + ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_MASK, + VFIO_DEVICE_STATE_SAVING); + if (ret) { + error_report("%s: Failed to set state SAVING", vbasedev->name); + return ret; + } + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + ret = qemu_file_get_error(f); + if (ret) { + return ret; + } + + return 0; +} + +static void vfio_save_cleanup(void *opaque) +{ + VFIODevice *vbasedev = opaque; + + vfio_migration_cleanup(vbasedev); + trace_vfio_save_cleanup(vbasedev->name); +} + +static SaveVMHandlers savevm_vfio_handlers = { + .save_setup = vfio_save_setup, + .save_cleanup = vfio_save_cleanup, +}; + +/* ---------------------------------------------------------------------- */ + static void vfio_vmstate_change(void *opaque, int running, RunState state) { VFIODevice *vbasedev = opaque; @@ -215,6 +303,8 @@ static int vfio_migration_init(VFIODevice *vbasedev, int ret; Object *obj; VFIOMigration *migration; + char id[256] = ""; + g_autofree char *path = NULL, *oid = NULL; if (!vbasedev->ops->vfio_get_object) { return -EINVAL; @@ -244,6 +334,18 @@ static int vfio_migration_init(VFIODevice *vbasedev, migration = vbasedev->migration; migration->vbasedev = vbasedev; + + oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj))); + if (oid) { + path = g_strdup_printf("%s/vfio", oid); + } else { + path = g_strdup("vfio"); + } + strpadcpy(id, sizeof(id), path, '\0'); + + register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers, + vbasedev); + migration->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change, vbasedev); migration->migration_state.notify = vfio_migration_state_notifier; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 
78d7d83b5e..f148b5e828 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -151,3 +151,5 @@ vfio_migration_probe(const char *name, uint32_t index) " (%s) Region %d" vfio_migration_set_state(const char *name, uint32_t state) " (%s) state %d" vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t dev_state) " (%s) running %d reason %s device state %d" vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s" +vfio_save_setup(const char *name) " (%s)" +vfio_save_cleanup(const char *name) " (%s)" From 1bc3c535ffbe512126a02b9f588497d5f5b7075b Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:18 +0530 Subject: [PATCH 08/32] vfio: Add save state functions to SaveVMHandlers Added .save_live_pending, .save_live_iterate and .save_live_complete_precopy functions. These functions handles pre-copy and stop-and-copy phase. In _SAVING|_RUNNING device state or pre-copy phase: - read pending_bytes. If pending_bytes > 0, go through below steps. - read data_offset - indicates kernel driver to write data to staging buffer. - read data_size - amount of data in bytes written by vendor driver in migration region. - read data_size bytes of data from data_offset in the migration region. - Write data packet to file stream as below: {VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data, VFIO_MIG_FLAG_END_OF_STATE } In _SAVING device state or stop-and-copy phase a. read config space of device and save to migration file stream. This doesn't need to be from vendor driver. Any other special config state from driver can be saved as data in following iteration. b. read pending_bytes. If pending_bytes > 0, go through below steps. c. read data_offset - indicates kernel driver to write data to staging buffer. d. read data_size - amount of data in bytes written by vendor driver in migration region. e. read data_size bytes of data from data_offset in the migration region. f. 
Write data packet as below: {VFIO_MIG_FLAG_DEV_DATA_STATE, data_size, actual data} g. iterate through steps b to f while (pending_bytes > 0) h. Write {VFIO_MIG_FLAG_END_OF_STATE} When data region is mapped, its user's responsibility to read data from data_offset of data_size before moving to next steps. Added fix suggested by Artem Polyakov to reset pending_bytes in vfio_save_iterate(). Added fix suggested by Zhi Wang to add 0 as data size in migration stream and add END_OF_STATE delimiter to indicate phase complete. Suggested-by: Artem Polyakov Suggested-by: Zhi Wang Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 276 ++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 6 + include/hw/vfio/vfio-common.h | 1 + 3 files changed, 283 insertions(+) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index ca6fd89665..5e0c9e8e61 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -148,6 +148,151 @@ static int vfio_migration_set_state(VFIODevice *vbasedev, uint32_t mask, return 0; } +static void *get_data_section_size(VFIORegion *region, uint64_t data_offset, + uint64_t data_size, uint64_t *size) +{ + void *ptr = NULL; + uint64_t limit = 0; + int i; + + if (!region->mmaps) { + if (size) { + *size = MIN(data_size, region->size - data_offset); + } + return ptr; + } + + for (i = 0; i < region->nr_mmaps; i++) { + VFIOMmap *map = region->mmaps + i; + + if ((data_offset >= map->offset) && + (data_offset < map->offset + map->size)) { + + /* check if data_offset is within sparse mmap areas */ + ptr = map->mmap + data_offset - map->offset; + if (size) { + *size = MIN(data_size, map->offset + map->size - data_offset); + } + break; + } else if ((data_offset < map->offset) && + (!limit || limit > map->offset)) { + /* + * data_offset is not within sparse mmap areas, find size of + * non-mapped area. Check through all list since region->mmaps list + * is not sorted. 
+ */ + limit = map->offset; + } + } + + if (!ptr && size) { + *size = limit ? MIN(data_size, limit - data_offset) : data_size; + } + return ptr; +} + +static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size) +{ + VFIOMigration *migration = vbasedev->migration; + VFIORegion *region = &migration->region; + uint64_t data_offset = 0, data_size = 0, sz; + int ret; + + ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset)); + if (ret < 0) { + return ret; + } + + ret = vfio_mig_read(vbasedev, &data_size, sizeof(data_size), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size)); + if (ret < 0) { + return ret; + } + + trace_vfio_save_buffer(vbasedev->name, data_offset, data_size, + migration->pending_bytes); + + qemu_put_be64(f, data_size); + sz = data_size; + + while (sz) { + void *buf; + uint64_t sec_size; + bool buf_allocated = false; + + buf = get_data_section_size(region, data_offset, sz, &sec_size); + + if (!buf) { + buf = g_try_malloc(sec_size); + if (!buf) { + error_report("%s: Error allocating buffer ", __func__); + return -ENOMEM; + } + buf_allocated = true; + + ret = vfio_mig_read(vbasedev, buf, sec_size, + region->fd_offset + data_offset); + if (ret < 0) { + g_free(buf); + return ret; + } + } + + qemu_put_buffer(f, buf, sec_size); + + if (buf_allocated) { + g_free(buf); + } + sz -= sec_size; + data_offset += sec_size; + } + + ret = qemu_file_get_error(f); + + if (!ret && size) { + *size = data_size; + } + + return ret; +} + +static int vfio_update_pending(VFIODevice *vbasedev) +{ + VFIOMigration *migration = vbasedev->migration; + VFIORegion *region = &migration->region; + uint64_t pending_bytes = 0; + int ret; + + ret = vfio_mig_read(vbasedev, &pending_bytes, sizeof(pending_bytes), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(pending_bytes)); + if (ret < 0) { + migration->pending_bytes = 0; + return ret; + } + + migration->pending_bytes = pending_bytes; + 
trace_vfio_update_pending(vbasedev->name, pending_bytes); + return 0; +} + +static int vfio_save_device_config_state(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE); + + if (vbasedev->ops && vbasedev->ops->vfio_save_config) { + vbasedev->ops->vfio_save_config(vbasedev, f); + } + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + trace_vfio_save_device_config_state(vbasedev->name); + + return qemu_file_get_error(f); +} + static void vfio_migration_cleanup(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; @@ -210,9 +355,140 @@ static void vfio_save_cleanup(void *opaque) trace_vfio_save_cleanup(vbasedev->name); } +static void vfio_save_pending(QEMUFile *f, void *opaque, + uint64_t threshold_size, + uint64_t *res_precopy_only, + uint64_t *res_compatible, + uint64_t *res_postcopy_only) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret; + + ret = vfio_update_pending(vbasedev); + if (ret) { + return; + } + + *res_precopy_only += migration->pending_bytes; + + trace_vfio_save_pending(vbasedev->name, *res_precopy_only, + *res_postcopy_only, *res_compatible); +} + +static int vfio_save_iterate(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + uint64_t data_size; + int ret; + + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); + + if (migration->pending_bytes == 0) { + ret = vfio_update_pending(vbasedev); + if (ret) { + return ret; + } + + if (migration->pending_bytes == 0) { + qemu_put_be64(f, 0); + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + /* indicates data finished, goto complete phase */ + return 1; + } + } + + ret = vfio_save_buffer(f, vbasedev, &data_size); + if (ret) { + error_report("%s: vfio_save_buffer failed %s", vbasedev->name, + strerror(errno)); + return ret; + } + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + ret = qemu_file_get_error(f); + if (ret) { + 
return ret; + } + + /* + * Reset pending_bytes as .save_live_pending is not called during savevm or + * snapshot case, in such case vfio_update_pending() at the start of this + * function updates pending_bytes. + */ + migration->pending_bytes = 0; + trace_vfio_save_iterate(vbasedev->name, data_size); + return 0; +} + +static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + uint64_t data_size; + int ret; + + ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_RUNNING, + VFIO_DEVICE_STATE_SAVING); + if (ret) { + error_report("%s: Failed to set state STOP and SAVING", + vbasedev->name); + return ret; + } + + ret = vfio_save_device_config_state(f, opaque); + if (ret) { + return ret; + } + + ret = vfio_update_pending(vbasedev); + if (ret) { + return ret; + } + + while (migration->pending_bytes > 0) { + qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE); + ret = vfio_save_buffer(f, vbasedev, &data_size); + if (ret < 0) { + error_report("%s: Failed to save buffer", vbasedev->name); + return ret; + } + + if (data_size == 0) { + break; + } + + ret = vfio_update_pending(vbasedev); + if (ret) { + return ret; + } + } + + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); + + ret = qemu_file_get_error(f); + if (ret) { + return ret; + } + + ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_SAVING, 0); + if (ret) { + error_report("%s: Failed to set state STOPPED", vbasedev->name); + return ret; + } + + trace_vfio_save_complete_precopy(vbasedev->name); + return ret; +} + static SaveVMHandlers savevm_vfio_handlers = { .save_setup = vfio_save_setup, .save_cleanup = vfio_save_cleanup, + .save_live_pending = vfio_save_pending, + .save_live_iterate = vfio_save_iterate, + .save_live_complete_precopy = vfio_save_complete_precopy, }; /* ---------------------------------------------------------------------- */ diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 
f148b5e828..9f5712dab1 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -153,3 +153,9 @@ vfio_vmstate_change(const char *name, int running, const char *reason, uint32_t vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s" vfio_save_setup(const char *name) " (%s)" vfio_save_cleanup(const char *name) " (%s)" +vfio_save_buffer(const char *name, uint64_t data_offset, uint64_t data_size, uint64_t pending) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64" pending 0x%"PRIx64 +vfio_update_pending(const char *name, uint64_t pending) " (%s) pending 0x%"PRIx64 +vfio_save_device_config_state(const char *name) " (%s)" +vfio_save_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64 +vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d" +vfio_save_complete_precopy(const char *name) " (%s)" diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 2bd593ba38..f4ebdae013 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -65,6 +65,7 @@ typedef struct VFIOMigration { uint32_t device_state; int vm_running; Notifier migration_state; + uint64_t pending_bytes; } VFIOMigration; typedef struct VFIOAddressSpace { From 3336d21710130a3d64c901aeae2dc66c364f93ad Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:19 +0530 Subject: [PATCH 09/32] vfio: Add load state functions to SaveVMHandlers Sequence during _RESUMING device state: While data for this device is available, repeat below steps: a. read data_offset from where user application should write data. b. write data of data_size to migration region from data_offset. c. write data_size which indicates vendor driver that data is written in staging buffer. For user, data is opaque. User should write data in the same order as received. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Dr. 
David Alan Gilbert Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 195 +++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 4 + 2 files changed, 199 insertions(+) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 5e0c9e8e61..1af0fce874 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -257,6 +257,77 @@ static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size) return ret; } +static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev, + uint64_t data_size) +{ + VFIORegion *region = &vbasedev->migration->region; + uint64_t data_offset = 0, size, report_size; + int ret; + + do { + ret = vfio_mig_read(vbasedev, &data_offset, sizeof(data_offset), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_offset)); + if (ret < 0) { + return ret; + } + + if (data_offset + data_size > region->size) { + /* + * If data_size is greater than the data section of migration region + * then iterate the write buffer operation. This case can occur if + * size of migration region at destination is smaller than size of + * migration region at source. 
+ */ + report_size = size = region->size - data_offset; + data_size -= size; + } else { + report_size = size = data_size; + data_size = 0; + } + + trace_vfio_load_state_device_data(vbasedev->name, data_offset, size); + + while (size) { + void *buf; + uint64_t sec_size; + bool buf_alloc = false; + + buf = get_data_section_size(region, data_offset, size, &sec_size); + + if (!buf) { + buf = g_try_malloc(sec_size); + if (!buf) { + error_report("%s: Error allocating buffer ", __func__); + return -ENOMEM; + } + buf_alloc = true; + } + + qemu_get_buffer(f, buf, sec_size); + + if (buf_alloc) { + ret = vfio_mig_write(vbasedev, buf, sec_size, + region->fd_offset + data_offset); + g_free(buf); + + if (ret < 0) { + return ret; + } + } + size -= sec_size; + data_offset += sec_size; + } + + ret = vfio_mig_write(vbasedev, &report_size, sizeof(report_size), + region->fd_offset + VFIO_MIG_STRUCT_OFFSET(data_size)); + if (ret < 0) { + return ret; + } + } while (data_size); + + return 0; +} + static int vfio_update_pending(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; @@ -293,6 +364,33 @@ static int vfio_save_device_config_state(QEMUFile *f, void *opaque) return qemu_file_get_error(f); } +static int vfio_load_device_config_state(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + uint64_t data; + + if (vbasedev->ops && vbasedev->ops->vfio_load_config) { + int ret; + + ret = vbasedev->ops->vfio_load_config(vbasedev, f); + if (ret) { + error_report("%s: Failed to load device config space", + vbasedev->name); + return ret; + } + } + + data = qemu_get_be64(f); + if (data != VFIO_MIG_FLAG_END_OF_STATE) { + error_report("%s: Failed loading device config space, " + "end flag incorrect 0x%"PRIx64, vbasedev->name, data); + return -EINVAL; + } + + trace_vfio_load_device_config_state(vbasedev->name); + return qemu_file_get_error(f); +} + static void vfio_migration_cleanup(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; @@ -483,12 
+581,109 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque) return ret; } +static int vfio_load_setup(QEMUFile *f, void *opaque) +{ + VFIODevice *vbasedev = opaque; + VFIOMigration *migration = vbasedev->migration; + int ret = 0; + + if (migration->region.mmaps) { + ret = vfio_region_mmap(&migration->region); + if (ret) { + error_report("%s: Failed to mmap VFIO migration region %d: %s", + vbasedev->name, migration->region.nr, + strerror(-ret)); + error_report("%s: Falling back to slow path", vbasedev->name); + } + } + + ret = vfio_migration_set_state(vbasedev, ~VFIO_DEVICE_STATE_MASK, + VFIO_DEVICE_STATE_RESUMING); + if (ret) { + error_report("%s: Failed to set state RESUMING", vbasedev->name); + if (migration->region.mmaps) { + vfio_region_unmap(&migration->region); + } + } + return ret; +} + +static int vfio_load_cleanup(void *opaque) +{ + VFIODevice *vbasedev = opaque; + + vfio_migration_cleanup(vbasedev); + trace_vfio_load_cleanup(vbasedev->name); + return 0; +} + +static int vfio_load_state(QEMUFile *f, void *opaque, int version_id) +{ + VFIODevice *vbasedev = opaque; + int ret = 0; + uint64_t data; + + data = qemu_get_be64(f); + while (data != VFIO_MIG_FLAG_END_OF_STATE) { + + trace_vfio_load_state(vbasedev->name, data); + + switch (data) { + case VFIO_MIG_FLAG_DEV_CONFIG_STATE: + { + ret = vfio_load_device_config_state(f, opaque); + if (ret) { + return ret; + } + break; + } + case VFIO_MIG_FLAG_DEV_SETUP_STATE: + { + data = qemu_get_be64(f); + if (data == VFIO_MIG_FLAG_END_OF_STATE) { + return ret; + } else { + error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64, + vbasedev->name, data); + return -EINVAL; + } + break; + } + case VFIO_MIG_FLAG_DEV_DATA_STATE: + { + uint64_t data_size = qemu_get_be64(f); + + if (data_size) { + ret = vfio_load_buffer(f, vbasedev, data_size); + if (ret < 0) { + return ret; + } + } + break; + } + default: + error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data); + return -EINVAL; + } + + data = 
qemu_get_be64(f); + ret = qemu_file_get_error(f); + if (ret) { + return ret; + } + } + return ret; +} + static SaveVMHandlers savevm_vfio_handlers = { .save_setup = vfio_save_setup, .save_cleanup = vfio_save_cleanup, .save_live_pending = vfio_save_pending, .save_live_iterate = vfio_save_iterate, .save_live_complete_precopy = vfio_save_complete_precopy, + .load_setup = vfio_load_setup, + .load_cleanup = vfio_load_cleanup, + .load_state = vfio_load_state, }; /* ---------------------------------------------------------------------- */ diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 9f5712dab1..a75b520881 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -159,3 +159,7 @@ vfio_save_device_config_state(const char *name) " (%s)" vfio_save_pending(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t compatible) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" compatible 0x%"PRIx64 vfio_save_iterate(const char *name, int data_size) " (%s) data_size %d" vfio_save_complete_precopy(const char *name) " (%s)" +vfio_load_device_config_state(const char *name) " (%s)" +vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 +vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64 +vfio_load_cleanup(const char *name) " (%s)" From 74ee653799f93dfb119de9a248bdf0a85a68904f Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:20 +0530 Subject: [PATCH 10/32] memory: Set DIRTY_MEMORY_MIGRATION when IOMMU is enabled mr->ram_block is NULL when mr->is_iommu is true, then fr.dirty_log_mask wasn't set correctly due to which memory listener's log_sync doesn't get called. This patch returns log_mask with DIRTY_MEMORY_MIGRATION set when IOMMU is enabled. 
Signed-off-by: Kirti Wankhede Reviewed-by: Yan Zhao Acked-by: Paolo Bonzini Signed-off-by: Alex Williamson --- softmmu/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/softmmu/memory.c b/softmmu/memory.c index ee4a6bc168..21d533d8ed 100644 --- a/softmmu/memory.c +++ b/softmmu/memory.c @@ -1806,7 +1806,7 @@ bool memory_region_is_ram_device(MemoryRegion *mr) uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr) { uint8_t mask = mr->dirty_log_mask; - if (global_dirty_log && mr->ram_block) { + if (global_dirty_log && (mr->ram_block || memory_region_is_iommu(mr))) { mask |= (1 << DIRTY_MEMORY_MIGRATION); } return mask; From 87ea529c5020124440cd892a038dffe6057fd613 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:21 +0530 Subject: [PATCH 11/32] vfio: Get migration capability flags for container Added helper functions to get IOMMU info capability chain. Added function to get migration capability information from that capability chain for IOMMU container. Similar change was proposed earlier: https://lists.gnu.org/archive/html/qemu-devel/2018-05/msg03759.html Disable migration for devices if IOMMU module doesn't support migration capability. 
Signed-off-by: Kirti Wankhede Cc: Shameer Kolothum Cc: Eric Auger Signed-off-by: Alex Williamson --- hw/vfio/common.c | 90 +++++++++++++++++++++++++++++++---- hw/vfio/migration.c | 7 ++- include/hw/vfio/vfio-common.h | 3 ++ 3 files changed, 91 insertions(+), 9 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index c6e98b8d61..d4959c036d 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1228,6 +1228,75 @@ static int vfio_init_container(VFIOContainer *container, int group_fd, return 0; } +static int vfio_get_iommu_info(VFIOContainer *container, + struct vfio_iommu_type1_info **info) +{ + + size_t argsz = sizeof(struct vfio_iommu_type1_info); + + *info = g_new0(struct vfio_iommu_type1_info, 1); +again: + (*info)->argsz = argsz; + + if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) { + g_free(*info); + *info = NULL; + return -errno; + } + + if (((*info)->argsz > argsz)) { + argsz = (*info)->argsz; + *info = g_realloc(*info, argsz); + goto again; + } + + return 0; +} + +static struct vfio_info_cap_header * +vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) +{ + struct vfio_info_cap_header *hdr; + void *ptr = info; + + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { + return NULL; + } + + for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + if (hdr->id == id) { + return hdr; + } + } + + return NULL; +} + +static void vfio_get_iommu_info_migration(VFIOContainer *container, + struct vfio_iommu_type1_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_cap_migration *cap_mig; + + hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION); + if (!hdr) { + return; + } + + cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration, + header); + + /* + * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of + * TARGET_PAGE_SIZE to mark those dirty. 
+ */ + if (cap_mig->pgsize_bitmap & TARGET_PAGE_SIZE) { + container->dirty_pages_supported = true; + container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size; + container->dirty_pgsizes = cap_mig->pgsize_bitmap; + } +} + static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, Error **errp) { @@ -1297,6 +1366,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->space = space; container->fd = fd; container->error = NULL; + container->dirty_pages_supported = false; QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->hostwin_list); @@ -1309,7 +1379,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_IOMMU: { - struct vfio_iommu_type1_info info; + struct vfio_iommu_type1_info *info; /* * FIXME: This assumes that a Type1 IOMMU can map any 64-bit @@ -1318,15 +1388,19 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, * existing Type1 IOMMUs generally support any IOVA we're * going to actually try in practice. 
*/ - info.argsz = sizeof(info); - ret = ioctl(fd, VFIO_IOMMU_GET_INFO, &info); - /* Ignore errors */ - if (ret || !(info.flags & VFIO_IOMMU_INFO_PGSIZES)) { + ret = vfio_get_iommu_info(container, &info); + + if (ret || !(info->flags & VFIO_IOMMU_INFO_PGSIZES)) { /* Assume 4k IOVA page size */ - info.iova_pgsizes = 4096; + info->iova_pgsizes = 4096; } - vfio_host_win_add(container, 0, (hwaddr)-1, info.iova_pgsizes); - container->pgsizes = info.iova_pgsizes; + vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes); + container->pgsizes = info->iova_pgsizes; + + if (!ret) { + vfio_get_iommu_info_migration(container, info); + } + g_free(info); break; } case VFIO_SPAPR_TCE_v2_IOMMU: diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 1af0fce874..39503b49e3 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -832,9 +832,14 @@ err: int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) { + VFIOContainer *container = vbasedev->group->container; struct vfio_region_info *info = NULL; Error *local_err = NULL; - int ret; + int ret = -ENOTSUP; + + if (!container->dirty_pages_supported) { + goto add_blocker; + } ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_MIGRATION, VFIO_REGION_SUBTYPE_MIGRATION, &info); diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index f4ebdae013..b1c1b18fd2 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -84,6 +84,9 @@ typedef struct VFIOContainer { unsigned iommu_type; Error *error; bool initialized; + bool dirty_pages_supported; + uint64_t dirty_pgsizes; + uint64_t max_dirty_bitmap_size; unsigned long pgsizes; QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list; From e663f516830c61f1dcafd2dda810126c14327b15 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:22 +0530 Subject: [PATCH 12/32] vfio: Add function to start and stop dirty pages tracking Call VFIO_IOMMU_DIRTY_PAGES ioctl to start 
and stop dirty pages tracking for VFIO devices. Signed-off-by: Kirti Wankhede Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Alex Williamson --- hw/vfio/migration.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index 39503b49e3..a248effb37 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -11,6 +11,7 @@ #include "qemu/main-loop.h" #include "qemu/cutils.h" #include +#include #include "sysemu/runstate.h" #include "hw/vfio/vfio-common.h" @@ -391,10 +392,40 @@ static int vfio_load_device_config_state(QEMUFile *f, void *opaque) return qemu_file_get_error(f); } +static int vfio_set_dirty_page_tracking(VFIODevice *vbasedev, bool start) +{ + int ret; + VFIOMigration *migration = vbasedev->migration; + VFIOContainer *container = vbasedev->group->container; + struct vfio_iommu_type1_dirty_bitmap dirty = { + .argsz = sizeof(dirty), + }; + + if (start) { + if (migration->device_state & VFIO_DEVICE_STATE_SAVING) { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; + } else { + return -EINVAL; + } + } else { + dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; + } + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); + if (ret) { + error_report("Failed to set dirty tracking flag 0x%x errno: %d", + dirty.flags, errno); + return -errno; + } + return ret; +} + static void vfio_migration_cleanup(VFIODevice *vbasedev) { VFIOMigration *migration = vbasedev->migration; + vfio_set_dirty_page_tracking(vbasedev, false); + if (migration->region.mmaps) { vfio_region_unmap(&migration->region); } @@ -435,6 +466,11 @@ static int vfio_save_setup(QEMUFile *f, void *opaque) return ret; } + ret = vfio_set_dirty_page_tracking(vbasedev, true); + if (ret) { + return ret; + } + qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); ret = qemu_file_get_error(f); From b6dd6504e303d4339df2db765433bffa36dfbf8b Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:23 +0530 Subject: [PATCH 
13/32] vfio: Add vfio_listener_log_sync to mark dirty pages vfio_listener_log_sync gets list of dirty pages from container using VFIO_IOMMU_GET_DIRTY_BITMAP ioctl and mark those pages dirty when all devices are stopped and saving state. Return early for the RAM block section of mapped MMIO region. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia [aw: fix error_report types, fix cpu_physical_memory_set_dirty_lebitmap() cast] Signed-off-by: Alex Williamson --- hw/vfio/common.c | 116 +++++++++++++++++++++++++++++++++++++++++++ hw/vfio/trace-events | 1 + 2 files changed, 117 insertions(+) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d4959c036d..0a97fbfefb 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -29,6 +29,7 @@ #include "hw/vfio/vfio.h" #include "exec/address-spaces.h" #include "exec/memory.h" +#include "exec/ram_addr.h" #include "hw/hw.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" @@ -37,6 +38,7 @@ #include "sysemu/reset.h" #include "trace.h" #include "qapi/error.h" +#include "migration/migration.h" VFIOGroupList vfio_group_list = QLIST_HEAD_INITIALIZER(vfio_group_list); @@ -286,6 +288,39 @@ const MemoryRegionOps vfio_region_ops = { }, }; +/* + * Device state interfaces + */ + +static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + MigrationState *ms = migrate_get_current(); + + if (!migration_is_setup_or_active(ms->state)) { + return false; + } + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + VFIOMigration *migration = vbasedev->migration; + + if (!migration) { + return false; + } + + if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) && + !(migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { + continue; + } else { + return false; + } + } + } + return true; +} + /* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ @@ -812,9 +847,90 @@ static void 
vfio_listener_region_del(MemoryListener *listener, } } +static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova, + uint64_t size, ram_addr_t ram_addr) +{ + struct vfio_iommu_type1_dirty_bitmap *dbitmap; + struct vfio_iommu_type1_dirty_bitmap_get *range; + uint64_t pages; + int ret; + + dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range)); + + dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range); + dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP; + range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data; + range->iova = iova; + range->size = size; + + /* + * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of + * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap's pgsize to + * TARGET_PAGE_SIZE. + */ + range->bitmap.pgsize = TARGET_PAGE_SIZE; + + pages = TARGET_PAGE_ALIGN(range->size) >> TARGET_PAGE_BITS; + range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + range->bitmap.data = g_try_malloc0(range->bitmap.size); + if (!range->bitmap.data) { + ret = -ENOMEM; + goto err_out; + } + + ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap); + if (ret) { + error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64 + " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova, + (uint64_t)range->size, errno); + goto err_out; + } + + cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data, + ram_addr, pages); + + trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size, + range->bitmap.size, ram_addr); +err_out: + g_free(range->bitmap.data); + g_free(dbitmap); + + return ret; +} + +static int vfio_sync_dirty_bitmap(VFIOContainer *container, + MemoryRegionSection *section) +{ + ram_addr_t ram_addr; + + ram_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; + + return vfio_get_dirty_bitmap(container, + TARGET_PAGE_ALIGN(section->offset_within_address_space), + int128_get64(section->size), ram_addr); +} + +static void 
vfio_listerner_log_sync(MemoryListener *listener, + MemoryRegionSection *section) +{ + VFIOContainer *container = container_of(listener, VFIOContainer, listener); + + if (vfio_listener_skipped_section(section) || + !container->dirty_pages_supported) { + return; + } + + if (vfio_devices_all_stopped_and_saving(container)) { + vfio_sync_dirty_bitmap(container, section); + } +} + static const MemoryListener vfio_memory_listener = { .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, + .log_sync = vfio_listerner_log_sync, }; static void vfio_listener_release(VFIOContainer *container) diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index a75b520881..dd991bd8f2 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -163,3 +163,4 @@ vfio_load_device_config_state(const char *name) " (%s)" vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64 vfio_load_cleanup(const char *name) " (%s)" +vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64 From 9a04fe09576b0399646e80e57ff2d2324f7cf64d Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:24 +0530 Subject: [PATCH 14/32] vfio: Dirty page tracking when vIOMMU is enabled When vIOMMU is enabled, register MAP notifier from log_sync when all devices in container are in stop and copy phase of migration. Call replay and get dirty pages from notifier callback. 
Suggested-by: Alex Williamson Signed-off-by: Kirti Wankhede Reviewed-by: Yan Zhao Signed-off-by: Alex Williamson --- hw/vfio/common.c | 88 +++++++++++++++++++++++++++++++++++++++++--- hw/vfio/trace-events | 1 + 2 files changed, 83 insertions(+), 6 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 0a97fbfefb..43e6e89090 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -442,8 +442,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) } /* Called with rcu_read_lock held. */ -static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr, - bool *read_only) +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, + ram_addr_t *ram_addr, bool *read_only) { MemoryRegion *mr; hwaddr xlat; @@ -474,8 +474,17 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr, return false; } - *vaddr = memory_region_get_ram_ptr(mr) + xlat; - *read_only = !writable || mr->readonly; + if (vaddr) { + *vaddr = memory_region_get_ram_ptr(mr) + xlat; + } + + if (ram_addr) { + *ram_addr = memory_region_get_ram_addr(mr) + xlat; + } + + if (read_only) { + *read_only = !writable || mr->readonly; + } return true; } @@ -485,7 +494,6 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainer *container = giommu->container; hwaddr iova = iotlb->iova + giommu->iommu_offset; - bool read_only; void *vaddr; int ret; @@ -501,7 +509,9 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_lock(); if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) { - if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) { + bool read_only; + + if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) { goto out; } /* @@ -899,11 +909,77 @@ err_out: return ret; } +typedef struct { + IOMMUNotifier n; + VFIOGuestIOMMU *giommu; +} vfio_giommu_dirty_notifier; + +static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) +{ + 
vfio_giommu_dirty_notifier *gdn = container_of(n, + vfio_giommu_dirty_notifier, n); + VFIOGuestIOMMU *giommu = gdn->giommu; + VFIOContainer *container = giommu->container; + hwaddr iova = iotlb->iova + giommu->iommu_offset; + ram_addr_t translated_addr; + + trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask); + + if (iotlb->target_as != &address_space_memory) { + error_report("Wrong target AS \"%s\", only system memory is allowed", + iotlb->target_as->name ? iotlb->target_as->name : "none"); + return; + } + + rcu_read_lock(); + if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) { + int ret; + + ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1, + translated_addr); + if (ret) { + error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%m)", + container, iova, + iotlb->addr_mask + 1, ret); + } + } + rcu_read_unlock(); +} + static int vfio_sync_dirty_bitmap(VFIOContainer *container, MemoryRegionSection *section) { ram_addr_t ram_addr; + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + + QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + if (MEMORY_REGION(giommu->iommu) == section->mr && + giommu->n.start == section->offset_within_region) { + Int128 llend; + vfio_giommu_dirty_notifier gdn = { .giommu = giommu }; + int idx = memory_region_iommu_attrs_to_index(giommu->iommu, + MEMTXATTRS_UNSPECIFIED); + + llend = int128_add(int128_make64(section->offset_within_region), + section->size); + llend = int128_sub(llend, int128_one()); + + iommu_notifier_init(&gdn.n, + vfio_iommu_map_dirty_notify, + IOMMU_NOTIFIER_MAP, + section->offset_within_region, + int128_get64(llend), + idx); + memory_region_iommu_replay(giommu->iommu, &gdn.n); + break; + } + } + return 0; + } + ram_addr = memory_region_get_ram_addr(section->mr) + section->offset_within_region; diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index dd991bd8f2..c0e75f24b7 100644 --- a/hw/vfio/trace-events 
+++ b/hw/vfio/trace-events @@ -164,3 +164,4 @@ vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64 vfio_load_state_device_data(const char *name, uint64_t data_offset, uint64_t data_size) " (%s) Offset 0x%"PRIx64" size 0x%"PRIx64 vfio_load_cleanup(const char *name) " (%s)" vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64 +vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 From 9e7b0442f23a92c27204d6f81a954f30f3126d33 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:25 +0530 Subject: [PATCH 15/32] vfio: Add ioctl to get dirty pages bitmap during dma unmap With vIOMMU, IO virtual address range can get unmapped while in pre-copy phase of migration. In that case, unmap ioctl should return pages pinned in that range and QEMU should find its corresponding guest physical addresses and report those dirty. 
Suggested-by: Alex Williamson Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia [aw: fix error_report types, fix cpu_physical_memory_set_dirty_lebitmap() cast] Signed-off-by: Alex Williamson --- hw/vfio/common.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 4 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 43e6e89090..620358a3d8 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -321,11 +321,95 @@ static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container) return true; } +static bool vfio_devices_all_running_and_saving(VFIOContainer *container) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + MigrationState *ms = migrate_get_current(); + + if (!migration_is_setup_or_active(ms->state)) { + return false; + } + + QLIST_FOREACH(group, &container->group_list, container_next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + VFIOMigration *migration = vbasedev->migration; + + if (!migration) { + return false; + } + + if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) && + (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) { + continue; + } else { + return false; + } + } + } + return true; +} + +static int vfio_dma_unmap_bitmap(VFIOContainer *container, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) +{ + struct vfio_iommu_type1_dma_unmap *unmap; + struct vfio_bitmap *bitmap; + uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS; + int ret; + + unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap)); + + unmap->argsz = sizeof(*unmap) + sizeof(*bitmap); + unmap->iova = iova; + unmap->size = size; + unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP; + bitmap = (struct vfio_bitmap *)&unmap->data; + + /* + * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of + * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to + * TARGET_PAGE_SIZE. 
+ */ + + bitmap->pgsize = TARGET_PAGE_SIZE; + bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) / + BITS_PER_BYTE; + + if (bitmap->size > container->max_dirty_bitmap_size) { + error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, + (uint64_t)bitmap->size); + ret = -E2BIG; + goto unmap_exit; + } + + bitmap->data = g_try_malloc0(bitmap->size); + if (!bitmap->data) { + ret = -ENOMEM; + goto unmap_exit; + } + + ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap); + if (!ret) { + cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data, + iotlb->translated_addr, pages); + } else { + error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m"); + } + + g_free(bitmap->data); +unmap_exit: + g_free(unmap); + return ret; +} + /* * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 */ static int vfio_dma_unmap(VFIOContainer *container, - hwaddr iova, ram_addr_t size) + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) { struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap), @@ -334,6 +418,11 @@ static int vfio_dma_unmap(VFIOContainer *container, .size = size, }; + if (iotlb && container->dirty_pages_supported && + vfio_devices_all_running_and_saving(container)) { + return vfio_dma_unmap_bitmap(container, iova, size, iotlb); + } + while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { /* * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c @@ -381,7 +470,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, * the VGA ROM space. 
*/ if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || - (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 && + (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 && ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { return 0; } @@ -531,7 +620,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) iotlb->addr_mask + 1, vaddr, ret); } } else { - ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1); + ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%m)", @@ -834,7 +923,7 @@ static void vfio_listener_region_del(MemoryListener *listener, } if (try_unmap) { - ret = vfio_dma_unmap(container, iova, int128_get64(llsize)); + ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL); if (ret) { error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%m)", From a22651053b59b7d40bf921e8819ea696a3b0a9d2 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:26 +0530 Subject: [PATCH 16/32] vfio: Make vfio-pci device migration capable If the device is not a failover primary device, call vfio_migration_probe() and vfio_migration_finalize() to enable migration support for those devices that support it respectively to tear it down again. Removed migration blocker from VFIO PCI device specific structure and use migration blocker from generic structure of VFIO device. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia Reviewed-by: Dr. 
David Alan Gilbert Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 28 ++++++++-------------------- hw/vfio/pci.h | 1 - 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index e27c88be6d..58c0ce8971 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2791,17 +2791,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) return; } - if (!pdev->failover_pair_id) { - error_setg(&vdev->migration_blocker, - "VFIO device doesn't support migration"); - ret = migrate_add_blocker(vdev->migration_blocker, errp); - if (ret) { - error_free(vdev->migration_blocker); - vdev->migration_blocker = NULL; - return; - } - } - vdev->vbasedev.name = g_path_get_basename(vdev->vbasedev.sysfsdev); vdev->vbasedev.ops = &vfio_pci_ops; vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI; @@ -3069,6 +3058,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } } + if (!pdev->failover_pair_id) { + ret = vfio_migration_probe(&vdev->vbasedev, errp); + if (ret) { + error_report("%s: Migration disabled", vdev->vbasedev.name); + } + } + vfio_register_err_notifier(vdev); vfio_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); @@ -3083,11 +3079,6 @@ out_teardown: vfio_bars_exit(vdev); error: error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name); - if (vdev->migration_blocker) { - migrate_del_blocker(vdev->migration_blocker); - error_free(vdev->migration_blocker); - vdev->migration_blocker = NULL; - } } static void vfio_instance_finalize(Object *obj) @@ -3099,10 +3090,6 @@ static void vfio_instance_finalize(Object *obj) vfio_bars_finalize(vdev); g_free(vdev->emulated_config_bits); g_free(vdev->rom); - if (vdev->migration_blocker) { - migrate_del_blocker(vdev->migration_blocker); - error_free(vdev->migration_blocker); - } /* * XXX Leaking igd_opregion is not an oversight, we can't remove the * fw_cfg entry therefore leaking this allocation seems like the safest @@ -3130,6 +3117,7 @@ static void vfio_exitfn(PCIDevice 
*pdev) } vfio_teardown_msi(vdev); vfio_bars_exit(vdev); + vfio_migration_finalize(&vdev->vbasedev); } static void vfio_pci_reset(DeviceState *dev) diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index bce71a9ac9..1574ef983f 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -172,7 +172,6 @@ struct VFIOPCIDevice { bool no_vfio_ioeventfd; bool enable_ramfb; VFIODisplay *dpy; - Error *migration_blocker; Notifier irqchip_change_notifier; }; From 3710586caa5d91a52c0cf247e1c829a50f2e7b98 Mon Sep 17 00:00:00 2001 From: Kirti Wankhede Date: Mon, 26 Oct 2020 15:06:27 +0530 Subject: [PATCH 17/32] qapi: Add VFIO devices migration stats in Migration stats Added amount of bytes transferred to the VM at destination by all VFIO devices Signed-off-by: Kirti Wankhede Reviewed-by: Dr. David Alan Gilbert Signed-off-by: Alex Williamson --- hw/vfio/common.c | 19 +++++++++++++++++++ hw/vfio/migration.c | 9 +++++++++ include/hw/vfio/vfio-common.h | 3 +++ migration/migration.c | 17 +++++++++++++++++ monitor/hmp-cmds.c | 6 ++++++ qapi/migration.json | 17 +++++++++++++++++ 6 files changed, 71 insertions(+) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 620358a3d8..d41ba67ffb 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -292,6 +292,25 @@ const MemoryRegionOps vfio_region_ops = { * Device state interfaces */ +bool vfio_mig_active(void) +{ + VFIOGroup *group; + VFIODevice *vbasedev; + + if (QLIST_EMPTY(&vfio_group_list)) { + return false; + } + + QLIST_FOREACH(group, &vfio_group_list, next) { + QLIST_FOREACH(vbasedev, &group->device_list, next) { + if (vbasedev->migration_blocker) { + return false; + } + } + } + return true; +} + static bool vfio_devices_all_stopped_and_saving(VFIOContainer *container) { VFIOGroup *group; diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c index a248effb37..3ce285ea39 100644 --- a/hw/vfio/migration.c +++ b/hw/vfio/migration.c @@ -45,6 +45,8 @@ #define VFIO_MIG_FLAG_DEV_SETUP_STATE (0xffffffffef100003ULL) #define VFIO_MIG_FLAG_DEV_DATA_STATE 
(0xffffffffef100004ULL) +static int64_t bytes_transferred; + static inline int vfio_mig_access(VFIODevice *vbasedev, void *val, int count, off_t off, bool iswrite) { @@ -255,6 +257,7 @@ static int vfio_save_buffer(QEMUFile *f, VFIODevice *vbasedev, uint64_t *size) *size = data_size; } + bytes_transferred += data_size; return ret; } @@ -785,6 +788,7 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data) case MIGRATION_STATUS_CANCELLING: case MIGRATION_STATUS_CANCELLED: case MIGRATION_STATUS_FAILED: + bytes_transferred = 0; ret = vfio_migration_set_state(vbasedev, ~(VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RESUMING), VFIO_DEVICE_STATE_RUNNING); @@ -866,6 +870,11 @@ err: /* ---------------------------------------------------------------------- */ +int64_t vfio_mig_bytes_transferred(void) +{ + return bytes_transferred; +} + int vfio_migration_probe(VFIODevice *vbasedev, Error **errp) { VFIOContainer *container = vbasedev->group->container; diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index b1c1b18fd2..24e299d974 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -203,6 +203,9 @@ extern const MemoryRegionOps vfio_region_ops; typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList; extern VFIOGroupList vfio_group_list; +bool vfio_mig_active(void); +int64_t vfio_mig_bytes_transferred(void); + #ifdef CONFIG_LINUX int vfio_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info); diff --git a/migration/migration.c b/migration/migration.c index 9bb4fee5ac..3263aa55a9 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -57,6 +57,10 @@ #include "qemu/queue.h" #include "multifd.h" +#ifdef CONFIG_VFIO +#include "hw/vfio/vfio-common.h" +#endif + #define MAX_THROTTLE (128 << 20) /* Migration transfer speed throttling */ /* Amount of time to allocate to each "chunk" of bandwidth-throttled @@ -1037,6 +1041,17 @@ static void populate_disk_info(MigrationInfo 
*info) } } +static void populate_vfio_info(MigrationInfo *info) +{ +#ifdef CONFIG_VFIO + if (vfio_mig_active()) { + info->has_vfio = true; + info->vfio = g_malloc0(sizeof(*info->vfio)); + info->vfio->transferred = vfio_mig_bytes_transferred(); + } +#endif +} + static void fill_source_migration_info(MigrationInfo *info) { MigrationState *s = migrate_get_current(); @@ -1061,6 +1076,7 @@ static void fill_source_migration_info(MigrationInfo *info) populate_time_info(info, s); populate_ram_info(info, s); populate_disk_info(info); + populate_vfio_info(info); break; case MIGRATION_STATUS_COLO: info->has_status = true; @@ -1069,6 +1085,7 @@ static void fill_source_migration_info(MigrationInfo *info) case MIGRATION_STATUS_COMPLETED: populate_time_info(info, s); populate_ram_info(info, s); + populate_vfio_info(info); break; case MIGRATION_STATUS_FAILED: info->has_status = true; diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c index 9789f4277f..56e9bad33d 100644 --- a/monitor/hmp-cmds.c +++ b/monitor/hmp-cmds.c @@ -357,6 +357,12 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) } monitor_printf(mon, "]\n"); } + + if (info->has_vfio) { + monitor_printf(mon, "vfio device transferred: %" PRIu64 " kbytes\n", + info->vfio->transferred >> 10); + } + qapi_free_MigrationInfo(info); } diff --git a/qapi/migration.json b/qapi/migration.json index a5da513c9e..3c75820527 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -147,6 +147,18 @@ 'active', 'postcopy-active', 'postcopy-paused', 'postcopy-recover', 'completed', 'failed', 'colo', 'pre-switchover', 'device', 'wait-unplug' ] } +## +# @VfioStats: +# +# Detailed VFIO devices migration statistics +# +# @transferred: amount of bytes transferred to the target VM by VFIO devices +# +# Since: 5.2 +# +## +{ 'struct': 'VfioStats', + 'data': {'transferred': 'int' } } ## # @MigrationInfo: @@ -208,11 +220,16 @@ # # @socket-address: Only used for tcp, to know what the real port is (Since 4.0) # +# @vfio: @VfioStats 
containing detailed VFIO devices migration statistics, +# only returned if VFIO device is present, migration is supported by all +# VFIO devices and status is 'active' or 'completed' (since 5.2) +# # Since: 0.14.0 ## { 'struct': 'MigrationInfo', 'data': {'*status': 'MigrationStatus', '*ram': 'MigrationStats', '*disk': 'MigrationStats', + '*vfio': 'VfioStats', '*xbzrle-cache': 'XBZRLECacheStats', '*total-time': 'int', '*expected-downtime': 'int', From 84567ea763874c633eddfd8c522d48bbe0bc5c8f Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:29 -0400 Subject: [PATCH 18/32] update-linux-headers: Add vfio_zdev.h vfio_zdev.h is used by s390x zPCI support to pass device-specific CLP information between host and userspace. Signed-off-by: Matthew Rosato Acked-by: Cornelia Huck Signed-off-by: Alex Williamson --- scripts/update-linux-headers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh index 29c27f4681..9efbaf2f84 100755 --- a/scripts/update-linux-headers.sh +++ b/scripts/update-linux-headers.sh @@ -141,7 +141,7 @@ done rm -rf "$output/linux-headers/linux" mkdir -p "$output/linux-headers/linux" -for header in kvm.h vfio.h vfio_ccw.h vhost.h \ +for header in kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h \ psci.h psp-sev.h userfaultfd.h mman.h; do cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux" done From 53ba2eee52bff5a746e96835539a1079f6bcadd1 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:30 -0400 Subject: [PATCH 19/32] linux-headers: update against 5.10-rc1 commit 3650b228f83adda7e5ee532e2b90429c03f7b9ec Signed-off-by: Matthew Rosato [aw: drop pvrdma_ring.h changes to avoid revert of d73415a31547 ("qemu/atomic.h: rename atomic_ to qatomic_")] Signed-off-by: Alex Williamson --- .../infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 2 +- include/standard-headers/linux/ethtool.h | 2 + include/standard-headers/linux/fuse.h | 50 
+++++++++++- .../linux/input-event-codes.h | 4 + include/standard-headers/linux/pci_regs.h | 6 +- include/standard-headers/linux/virtio_fs.h | 3 + include/standard-headers/linux/virtio_gpu.h | 19 +++++ include/standard-headers/linux/virtio_mmio.h | 11 +++ include/standard-headers/linux/virtio_pci.h | 11 ++- linux-headers/asm-arm64/kvm.h | 25 ++++++ linux-headers/asm-arm64/mman.h | 1 + linux-headers/asm-generic/hugetlb_encode.h | 1 + linux-headers/asm-generic/unistd.h | 18 ++--- linux-headers/asm-mips/unistd_n32.h | 1 + linux-headers/asm-mips/unistd_n64.h | 1 + linux-headers/asm-mips/unistd_o32.h | 1 + linux-headers/asm-powerpc/unistd_32.h | 1 + linux-headers/asm-powerpc/unistd_64.h | 1 + linux-headers/asm-s390/unistd_32.h | 1 + linux-headers/asm-s390/unistd_64.h | 1 + linux-headers/asm-x86/kvm.h | 20 +++++ linux-headers/asm-x86/unistd_32.h | 1 + linux-headers/asm-x86/unistd_64.h | 1 + linux-headers/asm-x86/unistd_x32.h | 1 + linux-headers/linux/kvm.h | 19 +++++ linux-headers/linux/mman.h | 1 + linux-headers/linux/vfio.h | 29 ++++++- linux-headers/linux/vfio_zdev.h | 78 +++++++++++++++++++ 28 files changed, 294 insertions(+), 16 deletions(-) create mode 100644 linux-headers/linux/vfio_zdev.h diff --git a/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index 1677208a41..0a8c7c9311 100644 --- a/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -176,7 +176,7 @@ struct pvrdma_port_attr { uint8_t subnet_timeout; uint8_t init_type_reply; uint8_t active_width; - uint8_t active_speed; + uint16_t active_speed; uint8_t phys_state; uint8_t reserved[2]; }; diff --git a/include/standard-headers/linux/ethtool.h b/include/standard-headers/linux/ethtool.h index e13eff4488..0df22f7538 100644 --- a/include/standard-headers/linux/ethtool.h +++ 
b/include/standard-headers/linux/ethtool.h @@ -1617,6 +1617,8 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, + ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, + ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS }; diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h index f4df0a40f6..82c0a38b59 100644 --- a/include/standard-headers/linux/fuse.h +++ b/include/standard-headers/linux/fuse.h @@ -172,6 +172,9 @@ * - add FUSE_WRITE_KILL_PRIV flag * - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING * - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag + * + * 7.32 + * - add flags to fuse_attr, add FUSE_ATTR_SUBMOUNT, add FUSE_SUBMOUNTS */ #ifndef _LINUX_FUSE_H @@ -203,7 +206,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 31 +#define FUSE_KERNEL_MINOR_VERSION 32 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -227,7 +230,7 @@ struct fuse_attr { uint32_t gid; uint32_t rdev; uint32_t blksize; - uint32_t padding; + uint32_t flags; }; struct fuse_kstatfs { @@ -309,7 +312,10 @@ struct fuse_file_lock { * FUSE_CACHE_SYMLINKS: cache READLINK responses * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir * FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request - * FUSE_MAP_ALIGNMENT: map_alignment field is valid + * FUSE_MAP_ALIGNMENT: init_out.map_alignment contains log2(byte alignment) for + * foffset and moffset fields in struct + * fuse_setupmapping_out and fuse_removemapping_one. 
+ * FUSE_SUBMOUNTS: kernel supports auto-mounting directory submounts */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -338,6 +344,7 @@ struct fuse_file_lock { #define FUSE_NO_OPENDIR_SUPPORT (1 << 24) #define FUSE_EXPLICIT_INVAL_DATA (1 << 25) #define FUSE_MAP_ALIGNMENT (1 << 26) +#define FUSE_SUBMOUNTS (1 << 27) /** * CUSE INIT request/reply flags @@ -413,6 +420,13 @@ struct fuse_file_lock { */ #define FUSE_FSYNC_FDATASYNC (1 << 0) +/** + * fuse_attr flags + * + * FUSE_ATTR_SUBMOUNT: Object is a submount root + */ +#define FUSE_ATTR_SUBMOUNT (1 << 0) + enum fuse_opcode { FUSE_LOOKUP = 1, FUSE_FORGET = 2, /* no reply */ @@ -888,4 +902,34 @@ struct fuse_copy_file_range_in { uint64_t flags; }; +#define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0) +#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1) +struct fuse_setupmapping_in { + /* An already open handle */ + uint64_t fh; + /* Offset into the file to start the mapping */ + uint64_t foffset; + /* Length of mapping required */ + uint64_t len; + /* Flags, FUSE_SETUPMAPPING_FLAG_* */ + uint64_t flags; + /* Offset in Memory Window */ + uint64_t moffset; +}; + +struct fuse_removemapping_in { + /* number of fuse_removemapping_one follows */ + uint32_t count; +}; + +struct fuse_removemapping_one { + /* Offset into the dax window start the unmapping */ + uint64_t moffset; + /* Length of mapping required */ + uint64_t len; +}; + +#define FUSE_REMOVEMAPPING_MAX_ENTRY \ + (PAGE_SIZE / sizeof(struct fuse_removemapping_one)) + #endif /* _LINUX_FUSE_H */ diff --git a/include/standard-headers/linux/input-event-codes.h b/include/standard-headers/linux/input-event-codes.h index e740ad9f2e..c403b9cb0d 100644 --- a/include/standard-headers/linux/input-event-codes.h +++ b/include/standard-headers/linux/input-event-codes.h @@ -515,6 +515,9 @@ #define KEY_10CHANNELSUP 0x1b8 /* 10 channels up (10+) */ #define KEY_10CHANNELSDOWN 0x1b9 /* 10 channels down (10-) */ #define KEY_IMAGES 0x1ba /* AL Image Browser */ +#define 
KEY_NOTIFICATION_CENTER 0x1bc /* Show/hide the notification center */ +#define KEY_PICKUP_PHONE 0x1bd /* Answer incoming call */ +#define KEY_HANGUP_PHONE 0x1be /* Decline incoming call */ #define KEY_DEL_EOL 0x1c0 #define KEY_DEL_EOS 0x1c1 @@ -542,6 +545,7 @@ #define KEY_FN_F 0x1e2 #define KEY_FN_S 0x1e3 #define KEY_FN_B 0x1e4 +#define KEY_FN_RIGHT_SHIFT 0x1e5 #define KEY_BRL_DOT1 0x1f1 #define KEY_BRL_DOT2 0x1f2 diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h index f9701410d3..a95d55f9f2 100644 --- a/include/standard-headers/linux/pci_regs.h +++ b/include/standard-headers/linux/pci_regs.h @@ -76,6 +76,7 @@ #define PCI_CACHE_LINE_SIZE 0x0c /* 8 bits */ #define PCI_LATENCY_TIMER 0x0d /* 8 bits */ #define PCI_HEADER_TYPE 0x0e /* 8 bits */ +#define PCI_HEADER_TYPE_MASK 0x7f #define PCI_HEADER_TYPE_NORMAL 0 #define PCI_HEADER_TYPE_BRIDGE 1 #define PCI_HEADER_TYPE_CARDBUS 2 @@ -246,7 +247,7 @@ #define PCI_PM_CAP_PME_D0 0x0800 /* PME# from D0 */ #define PCI_PM_CAP_PME_D1 0x1000 /* PME# from D1 */ #define PCI_PM_CAP_PME_D2 0x2000 /* PME# from D2 */ -#define PCI_PM_CAP_PME_D3 0x4000 /* PME# from D3 (hot) */ +#define PCI_PM_CAP_PME_D3hot 0x4000 /* PME# from D3 (hot) */ #define PCI_PM_CAP_PME_D3cold 0x8000 /* PME# from D3 (cold) */ #define PCI_PM_CAP_PME_SHIFT 11 /* Start of the PME Mask in PMC */ #define PCI_PM_CTRL 4 /* PM control and status register */ @@ -532,6 +533,8 @@ #define PCI_EXP_LNKCAP_SLS_32_0GB 0x00000005 /* LNKCAP2 SLS Vector bit 4 */ #define PCI_EXP_LNKCAP_MLW 0x000003f0 /* Maximum Link Width */ #define PCI_EXP_LNKCAP_ASPMS 0x00000c00 /* ASPM Support */ +#define PCI_EXP_LNKCAP_ASPM_L0S 0x00000400 /* ASPM L0s Support */ +#define PCI_EXP_LNKCAP_ASPM_L1 0x00000800 /* ASPM L1 Support */ #define PCI_EXP_LNKCAP_L0SEL 0x00007000 /* L0s Exit Latency */ #define PCI_EXP_LNKCAP_L1EL 0x00038000 /* L1 Exit Latency */ #define PCI_EXP_LNKCAP_CLKPM 0x00040000 /* Clock Power Management */ @@ -1056,6 +1059,7 @@ #define 
PCI_L1SS_CTL1_PCIPM_L1_1 0x00000002 /* PCI-PM L1.1 Enable */ #define PCI_L1SS_CTL1_ASPM_L1_2 0x00000004 /* ASPM L1.2 Enable */ #define PCI_L1SS_CTL1_ASPM_L1_1 0x00000008 /* ASPM L1.1 Enable */ +#define PCI_L1SS_CTL1_L1_2_MASK 0x00000005 #define PCI_L1SS_CTL1_L1SS_MASK 0x0000000f #define PCI_L1SS_CTL1_CM_RESTORE_TIME 0x0000ff00 /* Common_Mode_Restore_Time */ #define PCI_L1SS_CTL1_LTR_L12_TH_VALUE 0x03ff0000 /* LTR_L1.2_THRESHOLD_Value */ diff --git a/include/standard-headers/linux/virtio_fs.h b/include/standard-headers/linux/virtio_fs.h index 9d88817a6b..a32fe8a64c 100644 --- a/include/standard-headers/linux/virtio_fs.h +++ b/include/standard-headers/linux/virtio_fs.h @@ -16,4 +16,7 @@ struct virtio_fs_config { uint32_t num_request_queues; } QEMU_PACKED; +/* For the id field in virtio_pci_shm_cap */ +#define VIRTIO_FS_SHMCAP_ID_CACHE 0 + #endif /* _LINUX_VIRTIO_FS_H */ diff --git a/include/standard-headers/linux/virtio_gpu.h b/include/standard-headers/linux/virtio_gpu.h index b8fa15f0ac..4183cdc74b 100644 --- a/include/standard-headers/linux/virtio_gpu.h +++ b/include/standard-headers/linux/virtio_gpu.h @@ -50,6 +50,10 @@ * VIRTIO_GPU_CMD_GET_EDID */ #define VIRTIO_GPU_F_EDID 1 +/* + * VIRTIO_GPU_CMD_RESOURCE_ASSIGN_UUID + */ +#define VIRTIO_GPU_F_RESOURCE_UUID 2 enum virtio_gpu_ctrl_type { VIRTIO_GPU_UNDEFINED = 0, @@ -66,6 +70,7 @@ enum virtio_gpu_ctrl_type { VIRTIO_GPU_CMD_GET_CAPSET_INFO, VIRTIO_GPU_CMD_GET_CAPSET, VIRTIO_GPU_CMD_GET_EDID, + VIRTIO_GPU_CMD_RESOURCE_ASSIGN_UUID, /* 3d commands */ VIRTIO_GPU_CMD_CTX_CREATE = 0x0200, @@ -87,6 +92,7 @@ enum virtio_gpu_ctrl_type { VIRTIO_GPU_RESP_OK_CAPSET_INFO, VIRTIO_GPU_RESP_OK_CAPSET, VIRTIO_GPU_RESP_OK_EDID, + VIRTIO_GPU_RESP_OK_RESOURCE_UUID, /* error responses */ VIRTIO_GPU_RESP_ERR_UNSPEC = 0x1200, @@ -340,4 +346,17 @@ enum virtio_gpu_formats { VIRTIO_GPU_FORMAT_R8G8B8X8_UNORM = 134, }; +/* VIRTIO_GPU_CMD_RESOURCE_ASSIGN_UUID */ +struct virtio_gpu_resource_assign_uuid { + struct virtio_gpu_ctrl_hdr hdr; + 
uint32_t resource_id; + uint32_t padding; +}; + +/* VIRTIO_GPU_RESP_OK_RESOURCE_UUID */ +struct virtio_gpu_resp_resource_uuid { + struct virtio_gpu_ctrl_hdr hdr; + uint8_t uuid[16]; +}; + #endif diff --git a/include/standard-headers/linux/virtio_mmio.h b/include/standard-headers/linux/virtio_mmio.h index c4b09689ab..0650f91bea 100644 --- a/include/standard-headers/linux/virtio_mmio.h +++ b/include/standard-headers/linux/virtio_mmio.h @@ -122,6 +122,17 @@ #define VIRTIO_MMIO_QUEUE_USED_LOW 0x0a0 #define VIRTIO_MMIO_QUEUE_USED_HIGH 0x0a4 +/* Shared memory region id */ +#define VIRTIO_MMIO_SHM_SEL 0x0ac + +/* Shared memory region length, 64 bits in two halves */ +#define VIRTIO_MMIO_SHM_LEN_LOW 0x0b0 +#define VIRTIO_MMIO_SHM_LEN_HIGH 0x0b4 + +/* Shared memory region base address, 64 bits in two halves */ +#define VIRTIO_MMIO_SHM_BASE_LOW 0x0b8 +#define VIRTIO_MMIO_SHM_BASE_HIGH 0x0bc + /* Configuration atomicity value */ #define VIRTIO_MMIO_CONFIG_GENERATION 0x0fc diff --git a/include/standard-headers/linux/virtio_pci.h b/include/standard-headers/linux/virtio_pci.h index 9262acd130..db7a8e2fcb 100644 --- a/include/standard-headers/linux/virtio_pci.h +++ b/include/standard-headers/linux/virtio_pci.h @@ -113,6 +113,8 @@ #define VIRTIO_PCI_CAP_DEVICE_CFG 4 /* PCI configuration access */ #define VIRTIO_PCI_CAP_PCI_CFG 5 +/* Additional shared memory capability */ +#define VIRTIO_PCI_CAP_SHARED_MEMORY_CFG 8 /* This is the PCI capability header: */ struct virtio_pci_cap { @@ -121,11 +123,18 @@ struct virtio_pci_cap { uint8_t cap_len; /* Generic PCI field: capability length */ uint8_t cfg_type; /* Identifies the structure. */ uint8_t bar; /* Where to find it. */ - uint8_t padding[3]; /* Pad to full dword. */ + uint8_t id; /* Multiple capabilities of the same type */ + uint8_t padding[2]; /* Pad to full dword. */ uint32_t offset; /* Offset within bar. */ uint32_t length; /* Length of the structure, in bytes. 
*/ }; +struct virtio_pci_cap64 { + struct virtio_pci_cap cap; + uint32_t offset_hi; /* Most sig 32 bits of offset */ + uint32_t length_hi; /* Most sig 32 bits of length */ +}; + struct virtio_pci_notify_cap { struct virtio_pci_cap cap; uint32_t notify_off_multiplier; /* Multiplier for queue_notify_off. */ diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h index 9e34f0f875..a72de1ae4c 100644 --- a/linux-headers/asm-arm64/kvm.h +++ b/linux-headers/asm-arm64/kvm.h @@ -159,6 +159,21 @@ struct kvm_sync_regs { struct kvm_arch_memory_slot { }; +/* + * PMU filter structure. Describe a range of events with a particular + * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER. + */ +struct kvm_pmu_event_filter { + __u16 base_event; + __u16 nevents; + +#define KVM_PMU_EVENT_ALLOW 0 +#define KVM_PMU_EVENT_DENY 1 + + __u8 action; + __u8 pad[3]; +}; + /* for KVM_GET/SET_VCPU_EVENTS */ struct kvm_vcpu_events { struct { @@ -242,6 +257,15 @@ struct kvm_vcpu_events { #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL 0 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL 1 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED 2 + +/* + * Only two states can be presented by the host kernel: + * - NOT_REQUIRED: the guest doesn't need to do anything + * - NOT_AVAIL: the guest isn't mitigated (it can still use SSBS if available) + * + * All the other values are deprecated. The host still accepts all + * values (they are ABI), but will narrow them to the above two. 
+ */ #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 KVM_REG_ARM_FW_REG(2) #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL 0 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN 1 @@ -329,6 +353,7 @@ struct kvm_vcpu_events { #define KVM_ARM_VCPU_PMU_V3_CTRL 0 #define KVM_ARM_VCPU_PMU_V3_IRQ 0 #define KVM_ARM_VCPU_PMU_V3_INIT 1 +#define KVM_ARM_VCPU_PMU_V3_FILTER 2 #define KVM_ARM_VCPU_TIMER_CTRL 1 #define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 diff --git a/linux-headers/asm-arm64/mman.h b/linux-headers/asm-arm64/mman.h index e94b9af859..d0dbfe9587 100644 --- a/linux-headers/asm-arm64/mman.h +++ b/linux-headers/asm-arm64/mman.h @@ -5,5 +5,6 @@ #include #define PROT_BTI 0x10 /* BTI guarded page */ +#define PROT_MTE 0x20 /* Normal Tagged mapping */ #endif /* ! _UAPI__ASM_MMAN_H */ diff --git a/linux-headers/asm-generic/hugetlb_encode.h b/linux-headers/asm-generic/hugetlb_encode.h index b0f8e87235..4f3d5aaa11 100644 --- a/linux-headers/asm-generic/hugetlb_encode.h +++ b/linux-headers/asm-generic/hugetlb_encode.h @@ -20,6 +20,7 @@ #define HUGETLB_FLAG_ENCODE_SHIFT 26 #define HUGETLB_FLAG_ENCODE_MASK 0x3f +#define HUGETLB_FLAG_ENCODE_16KB (14 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h index 995b36c2ea..2056318988 100644 --- a/linux-headers/asm-generic/unistd.h +++ b/linux-headers/asm-generic/unistd.h @@ -140,7 +140,7 @@ __SYSCALL(__NR_renameat, sys_renameat) #define __NR_umount2 39 __SYSCALL(__NR_umount2, sys_umount) #define __NR_mount 40 -__SC_COMP(__NR_mount, sys_mount, compat_sys_mount) +__SYSCALL(__NR_mount, sys_mount) #define __NR_pivot_root 41 __SYSCALL(__NR_pivot_root, sys_pivot_root) @@ -207,9 +207,9 @@ __SYSCALL(__NR_read, sys_read) #define __NR_write 64 
__SYSCALL(__NR_write, sys_write) #define __NR_readv 65 -__SC_COMP(__NR_readv, sys_readv, compat_sys_readv) +__SC_COMP(__NR_readv, sys_readv, sys_readv) #define __NR_writev 66 -__SC_COMP(__NR_writev, sys_writev, compat_sys_writev) +__SC_COMP(__NR_writev, sys_writev, sys_writev) #define __NR_pread64 67 __SC_COMP(__NR_pread64, sys_pread64, compat_sys_pread64) #define __NR_pwrite64 68 @@ -237,7 +237,7 @@ __SC_COMP(__NR_signalfd4, sys_signalfd4, compat_sys_signalfd4) /* fs/splice.c */ #define __NR_vmsplice 75 -__SC_COMP(__NR_vmsplice, sys_vmsplice, compat_sys_vmsplice) +__SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_splice 76 __SYSCALL(__NR_splice, sys_splice) #define __NR_tee 77 @@ -727,11 +727,9 @@ __SYSCALL(__NR_setns, sys_setns) #define __NR_sendmmsg 269 __SC_COMP(__NR_sendmmsg, sys_sendmmsg, compat_sys_sendmmsg) #define __NR_process_vm_readv 270 -__SC_COMP(__NR_process_vm_readv, sys_process_vm_readv, \ - compat_sys_process_vm_readv) +__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv) #define __NR_process_vm_writev 271 -__SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \ - compat_sys_process_vm_writev) +__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev) #define __NR_kcmp 272 __SYSCALL(__NR_kcmp, sys_kcmp) #define __NR_finit_module 273 @@ -859,9 +857,11 @@ __SYSCALL(__NR_openat2, sys_openat2) __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) #define __NR_faccessat2 439 __SYSCALL(__NR_faccessat2, sys_faccessat2) +#define __NR_process_madvise 440 +__SYSCALL(__NR_process_madvise, sys_process_madvise) #undef __NR_syscalls -#define __NR_syscalls 440 +#define __NR_syscalls 441 /* * 32 bit systems traditionally used different diff --git a/linux-headers/asm-mips/unistd_n32.h b/linux-headers/asm-mips/unistd_n32.h index 246fbb6a78..aba284d190 100644 --- a/linux-headers/asm-mips/unistd_n32.h +++ b/linux-headers/asm-mips/unistd_n32.h @@ -369,6 +369,7 @@ #define __NR_openat2 (__NR_Linux + 437) #define __NR_pidfd_getfd (__NR_Linux + 438) #define __NR_faccessat2 
(__NR_Linux + 439) +#define __NR_process_madvise (__NR_Linux + 440) #endif /* _ASM_MIPS_UNISTD_N32_H */ diff --git a/linux-headers/asm-mips/unistd_n64.h b/linux-headers/asm-mips/unistd_n64.h index 194d777dfd..0465ab94db 100644 --- a/linux-headers/asm-mips/unistd_n64.h +++ b/linux-headers/asm-mips/unistd_n64.h @@ -345,6 +345,7 @@ #define __NR_openat2 (__NR_Linux + 437) #define __NR_pidfd_getfd (__NR_Linux + 438) #define __NR_faccessat2 (__NR_Linux + 439) +#define __NR_process_madvise (__NR_Linux + 440) #endif /* _ASM_MIPS_UNISTD_N64_H */ diff --git a/linux-headers/asm-mips/unistd_o32.h b/linux-headers/asm-mips/unistd_o32.h index 3e093dd913..5222a0dd50 100644 --- a/linux-headers/asm-mips/unistd_o32.h +++ b/linux-headers/asm-mips/unistd_o32.h @@ -415,6 +415,7 @@ #define __NR_openat2 (__NR_Linux + 437) #define __NR_pidfd_getfd (__NR_Linux + 438) #define __NR_faccessat2 (__NR_Linux + 439) +#define __NR_process_madvise (__NR_Linux + 440) #endif /* _ASM_MIPS_UNISTD_O32_H */ diff --git a/linux-headers/asm-powerpc/unistd_32.h b/linux-headers/asm-powerpc/unistd_32.h index 0db9481d49..21066a3d5f 100644 --- a/linux-headers/asm-powerpc/unistd_32.h +++ b/linux-headers/asm-powerpc/unistd_32.h @@ -422,6 +422,7 @@ #define __NR_openat2 437 #define __NR_pidfd_getfd 438 #define __NR_faccessat2 439 +#define __NR_process_madvise 440 #endif /* _ASM_POWERPC_UNISTD_32_H */ diff --git a/linux-headers/asm-powerpc/unistd_64.h b/linux-headers/asm-powerpc/unistd_64.h index 9f74310988..c153da29f2 100644 --- a/linux-headers/asm-powerpc/unistd_64.h +++ b/linux-headers/asm-powerpc/unistd_64.h @@ -394,6 +394,7 @@ #define __NR_openat2 437 #define __NR_pidfd_getfd 438 #define __NR_faccessat2 439 +#define __NR_process_madvise 440 #endif /* _ASM_POWERPC_UNISTD_64_H */ diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h index 1803cd0c3b..3b4f2dda60 100644 --- a/linux-headers/asm-s390/unistd_32.h +++ b/linux-headers/asm-s390/unistd_32.h @@ -412,5 +412,6 @@ #define 
__NR_openat2 437 #define __NR_pidfd_getfd 438 #define __NR_faccessat2 439 +#define __NR_process_madvise 440 #endif /* _ASM_S390_UNISTD_32_H */ diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h index 228d5004e5..030a51fa38 100644 --- a/linux-headers/asm-s390/unistd_64.h +++ b/linux-headers/asm-s390/unistd_64.h @@ -360,5 +360,6 @@ #define __NR_openat2 437 #define __NR_pidfd_getfd 438 #define __NR_faccessat2 439 +#define __NR_process_madvise 440 #endif /* _ASM_S390_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h index 0780f97c18..89e5f3d1bb 100644 --- a/linux-headers/asm-x86/kvm.h +++ b/linux-headers/asm-x86/kvm.h @@ -192,6 +192,26 @@ struct kvm_msr_list { __u32 indices[0]; }; +/* Maximum size of any access bitmap in bytes */ +#define KVM_MSR_FILTER_MAX_BITMAP_SIZE 0x600 + +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range { +#define KVM_MSR_FILTER_READ (1 << 0) +#define KVM_MSR_FILTER_WRITE (1 << 1) + __u32 flags; + __u32 nmsrs; /* number of msrs in bitmap */ + __u32 base; /* MSR index the bitmap starts at */ + __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */ +}; + +#define KVM_MSR_FILTER_MAX_RANGES 16 +struct kvm_msr_filter { +#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) +#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) + __u32 flags; + struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; struct kvm_cpuid_entry { __u32 function; diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h index 356c12c2db..cfba368f9d 100644 --- a/linux-headers/asm-x86/unistd_32.h +++ b/linux-headers/asm-x86/unistd_32.h @@ -430,6 +430,7 @@ #define __NR_openat2 437 #define __NR_pidfd_getfd 438 #define __NR_faccessat2 439 +#define __NR_process_madvise 440 #endif /* _ASM_X86_UNISTD_32_H */ diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h index ef70e1c7c9..61af725095 100644 --- a/linux-headers/asm-x86/unistd_64.h +++ 
b/linux-headers/asm-x86/unistd_64.h @@ -352,6 +352,7 @@ #define __NR_openat2 437 #define __NR_pidfd_getfd 438 #define __NR_faccessat2 439 +#define __NR_process_madvise 440 #endif /* _ASM_X86_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h index 84ae8e9f5f..a6890cb1f5 100644 --- a/linux-headers/asm-x86/unistd_x32.h +++ b/linux-headers/asm-x86/unistd_x32.h @@ -305,6 +305,7 @@ #define __NR_openat2 (__X32_SYSCALL_BIT + 437) #define __NR_pidfd_getfd (__X32_SYSCALL_BIT + 438) #define __NR_faccessat2 (__X32_SYSCALL_BIT + 439) +#define __NR_process_madvise (__X32_SYSCALL_BIT + 440) #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512) #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513) #define __NR_ioctl (__X32_SYSCALL_BIT + 514) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 43580c767c..56ce14ad20 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -248,6 +248,8 @@ struct kvm_hyperv_exit { #define KVM_EXIT_IOAPIC_EOI 26 #define KVM_EXIT_HYPERV 27 #define KVM_EXIT_ARM_NISV 28 +#define KVM_EXIT_X86_RDMSR 29 +#define KVM_EXIT_X86_WRMSR 30 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -413,6 +415,17 @@ struct kvm_run { __u64 esr_iss; __u64 fault_ipa; } arm_nisv; + /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ + struct { + __u8 error; /* user -> kernel */ + __u8 pad[7]; +#define KVM_MSR_EXIT_REASON_INVAL (1 << 0) +#define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1) +#define KVM_MSR_EXIT_REASON_FILTER (1 << 2) + __u32 reason; /* kernel -> user */ + __u32 index; /* kernel -> user */ + __u64 data; /* kernel <-> user */ + } msr; /* Fix the size of the union. 
*/ char padding[256]; }; @@ -1037,6 +1050,9 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SMALLER_MAXPHYADDR 185 #define KVM_CAP_S390_DIAG318 186 #define KVM_CAP_STEAL_TIME 187 +#define KVM_CAP_X86_USER_SPACE_MSR 188 +#define KVM_CAP_X86_MSR_FILTER 189 +#define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 #ifdef KVM_CAP_IRQ_ROUTING @@ -1538,6 +1554,9 @@ struct kvm_pv_cmd { /* Available with KVM_CAP_S390_PROTECTED */ #define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd) +/* Available with KVM_CAP_X86_MSR_FILTER */ +#define KVM_X86_SET_MSR_FILTER _IOW(KVMIO, 0xc6, struct kvm_msr_filter) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ diff --git a/linux-headers/linux/mman.h b/linux-headers/linux/mman.h index 51ea363759..434986fbe3 100644 --- a/linux-headers/linux/mman.h +++ b/linux-headers/linux/mman.h @@ -27,6 +27,7 @@ #define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT #define MAP_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK +#define MAP_HUGE_16KB HUGETLB_FLAG_ENCODE_16KB #define MAP_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB #define MAP_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB #define MAP_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index a90672494d..b92dcc4daf 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -201,8 +201,11 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ #define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */ #define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */ +#define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6) /* vfio-fsl-mc device */ +#define VFIO_DEVICE_FLAGS_CAPS (1 << 7) /* Info supports caps */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ + __u32 cap_offset; /* Offset within info struct of first cap */ }; #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) @@ -218,6 +221,15 @@ struct vfio_device_info { #define 
VFIO_DEVICE_API_CCW_STRING "vfio-ccw" #define VFIO_DEVICE_API_AP_STRING "vfio-ap" +/* + * The following capabilities are unique to s390 zPCI devices. Their contents + * are further-defined in vfio_zdev.h + */ +#define VFIO_DEVICE_INFO_CAP_ZPCI_BASE 1 +#define VFIO_DEVICE_INFO_CAP_ZPCI_GROUP 2 +#define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL 3 +#define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP 4 + /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, * struct vfio_region_info) @@ -462,7 +474,7 @@ struct vfio_region_gfx_edid { * 5. Resumed * |--------->| * - * 0. Default state of VFIO device is _RUNNNG when the user application starts. + * 0. Default state of VFIO device is _RUNNING when the user application starts. * 1. During normal shutdown of the user application, the user application may * optionally change the VFIO device state from _RUNNING to _STOP. This * transition is optional. The vendor driver must support this transition but @@ -1039,6 +1051,21 @@ struct vfio_iommu_type1_info_cap_migration { __u64 max_dirty_bitmap_size; /* in bytes */ }; +/* + * The DMA available capability allows to report the current number of + * simultaneously outstanding DMA mappings that are allowed. + * + * The structure below defines version 1 of this capability. + * + * avail: specifies the current number of outstanding DMA mappings allowed. + */ +#define VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL 3 + +struct vfio_iommu_type1_info_dma_avail { + struct vfio_info_cap_header header; + __u32 avail; +}; + #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) /** diff --git a/linux-headers/linux/vfio_zdev.h b/linux-headers/linux/vfio_zdev.h new file mode 100644 index 0000000000..b4309397b6 --- /dev/null +++ b/linux-headers/linux/vfio_zdev.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * VFIO Region definitions for ZPCI devices + * + * Copyright IBM Corp. 
2020 + * + * Author(s): Pierre Morel + * Matthew Rosato + */ + +#ifndef _VFIO_ZDEV_H_ +#define _VFIO_ZDEV_H_ + +#include +#include + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_BASE - Base PCI Function information + * + * This capability provides a set of descriptive information about the + * associated PCI function. + */ +struct vfio_device_info_cap_zpci_base { + struct vfio_info_cap_header header; + __u64 start_dma; /* Start of available DMA addresses */ + __u64 end_dma; /* End of available DMA addresses */ + __u16 pchid; /* Physical Channel ID */ + __u16 vfn; /* Virtual function number */ + __u16 fmb_length; /* Measurement Block Length (in bytes) */ + __u8 pft; /* PCI Function Type */ + __u8 gid; /* PCI function group ID */ +}; + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_GROUP - Base PCI Function Group information + * + * This capability provides a set of descriptive information about the group of + * PCI functions that the associated device belongs to. + */ +struct vfio_device_info_cap_zpci_group { + struct vfio_info_cap_header header; + __u64 dasm; /* DMA Address space mask */ + __u64 msi_addr; /* MSI address */ + __u64 flags; +#define VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH 1 /* Program-specified TLB refresh */ + __u16 mui; /* Measurement Block Update Interval */ + __u16 noi; /* Maximum number of MSIs */ + __u16 maxstbl; /* Maximum Store Block Length */ + __u8 version; /* Supported PCI Version */ +}; + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_UTIL - Utility String + * + * This capability provides the utility string for the associated device, which + * is a device identifier string made up of EBCDID characters. 'size' specifies + * the length of 'util_str'. + */ +struct vfio_device_info_cap_zpci_util { + struct vfio_info_cap_header header; + __u32 size; + __u8 util_str[]; +}; + +/** + * VFIO_DEVICE_INFO_CAP_ZPCI_PFIP - PCI Function Path + * + * This capability provides the PCI function path string, which is an identifier + * that describes the internal hardware path of the device. 
'size' specifies + * the length of 'pfip'. + */ +struct vfio_device_info_cap_zpci_pfip { + struct vfio_info_cap_header header; + __u32 size; + __u8 pfip[]; +}; + +#endif From 408b55db8be3e3edae041d46ef8786fabc1476aa Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:31 -0400 Subject: [PATCH 20/32] s390x/pci: Move header files to include/hw/s390x Seems a more appropriate location for them. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- MAINTAINERS | 1 + hw/s390x/s390-pci-bus.c | 4 ++-- hw/s390x/s390-pci-inst.c | 4 ++-- hw/s390x/s390-virtio-ccw.c | 2 +- {hw => include/hw}/s390x/s390-pci-bus.h | 0 {hw => include/hw}/s390x/s390-pci-inst.h | 0 6 files changed, 6 insertions(+), 5 deletions(-) rename {hw => include/hw}/s390x/s390-pci-bus.h (100%) rename {hw => include/hw}/s390x/s390-pci-inst.h (100%) diff --git a/MAINTAINERS b/MAINTAINERS index 8c744a9bdf..2c22bbca5a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1436,6 +1436,7 @@ S390 PCI M: Matthew Rosato S: Supported F: hw/s390x/s390-pci* +F: include/hw/s390x/s390-pci* L: qemu-s390x@nongnu.org UniCore32 Machines diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index fb4cee87a4..a929340688 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -15,8 +15,8 @@ #include "qapi/error.h" #include "qapi/visitor.h" #include "cpu.h" -#include "s390-pci-bus.h" -#include "s390-pci-inst.h" +#include "hw/s390x/s390-pci-bus.h" +#include "hw/s390x/s390-pci-inst.h" #include "hw/pci/pci_bus.h" #include "hw/qdev-properties.h" #include "hw/pci/pci_bridge.h" diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index 2f7a7d7bd1..639b13c8d6 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -13,12 +13,12 @@ #include "qemu/osdep.h" #include "cpu.h" -#include "s390-pci-inst.h" -#include "s390-pci-bus.h" #include "exec/memop.h" #include "exec/memory-internal.h" #include "qemu/error-report.h" #include 
"sysemu/hw_accel.h" +#include "hw/s390x/s390-pci-inst.h" +#include "hw/s390x/s390-pci-bus.h" #include "hw/s390x/tod.h" #ifndef DEBUG_S390PCI_INST diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 2e900335ea..22222c4fd5 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -28,7 +28,7 @@ #include "qemu/error-report.h" #include "qemu/option.h" #include "qemu/qemu-print.h" -#include "s390-pci-bus.h" +#include "hw/s390x/s390-pci-bus.h" #include "sysemu/reset.h" #include "hw/s390x/storage-keys.h" #include "hw/s390x/storage-attributes.h" diff --git a/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h similarity index 100% rename from hw/s390x/s390-pci-bus.h rename to include/hw/s390x/s390-pci-bus.h diff --git a/hw/s390x/s390-pci-inst.h b/include/hw/s390x/s390-pci-inst.h similarity index 100% rename from hw/s390x/s390-pci-inst.h rename to include/hw/s390x/s390-pci-inst.h From 3ab7a0b40d4be5ade3b61d4afd1518193b199423 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:32 -0400 Subject: [PATCH 21/32] vfio: Create shared routine for scanning info capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than duplicating the same loop in multiple locations, create a static function to do the work. 
Signed-off-by: Matthew Rosato Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/common.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index d41ba67ffb..693d3a2943 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1125,17 +1125,12 @@ static void vfio_listener_release(VFIOContainer *container) } } -struct vfio_info_cap_header * -vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) +static struct vfio_info_cap_header * +vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id) { struct vfio_info_cap_header *hdr; - void *ptr = info; - if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { - return NULL; - } - - for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) { + for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) { if (hdr->id == id) { return hdr; } @@ -1144,6 +1139,16 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) return NULL; } +struct vfio_info_cap_header * +vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) +{ + if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) { + return NULL; + } + + return vfio_get_cap((void *)info, info->cap_offset, id); +} + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, struct vfio_region_info *info) { From 7486a62845b1e12011dd99973e4739f69d57cd38 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:33 -0400 Subject: [PATCH 22/32] vfio: Find DMA available capability The underlying host may be limiting the number of outstanding DMA requests for type 1 IOMMU. Add helper functions to check for the DMA available capability and retrieve the current number of DMA mappings allowed. 
Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck [aw: vfio_get_info_dma_avail moved inside CONFIG_LINUX] Signed-off-by: Alex Williamson --- hw/vfio/common.c | 31 +++++++++++++++++++++++++++++++ include/hw/vfio/vfio-common.h | 2 ++ 2 files changed, 33 insertions(+) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 693d3a2943..920786a23e 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1149,6 +1149,37 @@ vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id) return vfio_get_cap((void *)info, info->cap_offset, id); } +static struct vfio_info_cap_header * +vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) +{ + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) { + return NULL; + } + + return vfio_get_cap((void *)info, info->cap_offset, id); +} + +bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, + unsigned int *avail) +{ + struct vfio_info_cap_header *hdr; + struct vfio_iommu_type1_info_dma_avail *cap; + + /* If the capability cannot be found, assume no DMA limiting */ + hdr = vfio_get_iommu_type1_info_cap(info, + VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL); + if (hdr == NULL) { + return false; + } + + if (avail != NULL) { + cap = (void *) hdr; + *avail = cap->avail; + } + + return true; +} + static int vfio_setup_region_sparse_mmaps(VFIORegion *region, struct vfio_region_info *info) { diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 24e299d974..1d14946a9d 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -214,6 +214,8 @@ int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type, bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type); struct vfio_info_cap_header * vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id); +bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, + unsigned int *avail); #endif extern const MemoryListener vfio_prereg_listener; From 
cd7498d07fbb20fa04790ff7ee168a8a8d01cb30 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:34 -0400 Subject: [PATCH 23/32] s390x/pci: Add routine to get the vfio dma available count Create new files for separating out vfio-specific work for s390 pci. Add the first such routine, which issues VFIO_IOMMU_GET_INFO ioctl to collect the current dma available count. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck [aw: Fix non-Linux build with CONFIG_LINUX] Signed-off-by: Alex Williamson --- hw/s390x/meson.build | 1 + hw/s390x/s390-pci-vfio.c | 54 ++++++++++++++++++++++++++++++++ include/hw/s390x/s390-pci-vfio.h | 24 ++++++++++++++ 3 files changed, 79 insertions(+) create mode 100644 hw/s390x/s390-pci-vfio.c create mode 100644 include/hw/s390x/s390-pci-vfio.h diff --git a/hw/s390x/meson.build b/hw/s390x/meson.build index 948ceae7a7..f4663a8355 100644 --- a/hw/s390x/meson.build +++ b/hw/s390x/meson.build @@ -27,6 +27,7 @@ s390x_ss.add(when: 'CONFIG_KVM', if_true: files( )) s390x_ss.add(when: 'CONFIG_S390_CCW_VIRTIO', if_true: files('s390-virtio-ccw.c')) s390x_ss.add(when: 'CONFIG_TERMINAL3270', if_true: files('3270-ccw.c')) +s390x_ss.add(when: 'CONFIG_LINUX', if_true: files('s390-pci-vfio.c')) virtio_ss = ss.source_set() virtio_ss.add(files('virtio-ccw.c')) diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c new file mode 100644 index 0000000000..cb3f4d98ad --- /dev/null +++ b/hw/s390x/s390-pci-vfio.c @@ -0,0 +1,54 @@ +/* + * s390 vfio-pci interfaces + * + * Copyright 2020 IBM Corp. + * Author(s): Matthew Rosato + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ + +#include + +#include "qemu/osdep.h" +#include "hw/s390x/s390-pci-vfio.h" +#include "hw/vfio/vfio-common.h" + +/* + * Get the current DMA available count from vfio. Returns true if vfio is + * limiting DMA requests, false otherwise. 
The current available count read + * from vfio is returned in avail. + */ +bool s390_pci_update_dma_avail(int fd, unsigned int *avail) +{ + g_autofree struct vfio_iommu_type1_info *info; + uint32_t argsz; + + assert(avail); + + argsz = sizeof(struct vfio_iommu_type1_info); + info = g_malloc0(argsz); + + /* + * If the specified argsz is not large enough to contain all capabilities + * it will be updated upon return from the ioctl. Retry until we have + * a big enough buffer to hold the entire capability chain. + */ +retry: + info->argsz = argsz; + + if (ioctl(fd, VFIO_IOMMU_GET_INFO, info)) { + return false; + } + + if (info->argsz > argsz) { + argsz = info->argsz; + info = g_realloc(info, argsz); + goto retry; + } + + /* If the capability exists, update with the current value */ + return vfio_get_info_dma_avail(info, avail); +} + diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h new file mode 100644 index 0000000000..1727292e9b --- /dev/null +++ b/include/hw/s390x/s390-pci-vfio.h @@ -0,0 +1,24 @@ +/* + * s390 vfio-pci interfaces + * + * Copyright 2020 IBM Corp. + * Author(s): Matthew Rosato + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ + +#ifndef HW_S390_PCI_VFIO_H +#define HW_S390_PCI_VFIO_H + +#ifdef CONFIG_LINUX +bool s390_pci_update_dma_avail(int fd, unsigned int *avail); +#else +static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail) +{ + return false; +} +#endif + +#endif From 37fa32de707340f3a93959ad5a1ebc41ba1520ee Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:35 -0400 Subject: [PATCH 24/32] s390x/pci: Honor DMA limits set by vfio When an s390 guest is using lazy unmapping, it can result in a very large number of outstanding DMA requests, far beyond the default limit configured for vfio.
Let's track DMA usage similar to vfio in the host, and trigger the guest to flush their DMA mappings before vfio runs out. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck [aw: non-Linux build fixes] Signed-off-by: Alex Williamson --- hw/s390x/s390-pci-bus.c | 18 ++++++++----- hw/s390x/s390-pci-inst.c | 45 +++++++++++++++++++++++++++----- hw/s390x/s390-pci-vfio.c | 42 +++++++++++++++++++++++++++++ include/hw/s390x/s390-pci-bus.h | 9 +++++++ include/hw/s390x/s390-pci-inst.h | 3 +++ include/hw/s390x/s390-pci-vfio.h | 12 +++++++++ 6 files changed, 117 insertions(+), 12 deletions(-) diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index a929340688..218717397a 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -17,6 +17,7 @@ #include "cpu.h" #include "hw/s390x/s390-pci-bus.h" #include "hw/s390x/s390-pci-inst.h" +#include "hw/s390x/s390-pci-vfio.h" #include "hw/pci/pci_bus.h" #include "hw/qdev-properties.h" #include "hw/pci/pci_bridge.h" @@ -764,6 +765,7 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) s->bus_no = 0; QTAILQ_INIT(&s->pending_sei); QTAILQ_INIT(&s->zpci_devs); + QTAILQ_INIT(&s->zpci_dma_limit); css_register_io_adapters(CSS_IO_ADAPTER_PCI, true, false, S390_ADAPTER_SUPPRESSIBLE, errp); @@ -941,17 +943,18 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, } } - if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { - pbdev->fh |= FH_SHM_VFIO; - } else { - pbdev->fh |= FH_SHM_EMUL; - } - pbdev->pdev = pdev; pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn); pbdev->iommu->pbdev = pbdev; pbdev->state = ZPCI_FS_DISABLED; + if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { + pbdev->fh |= FH_SHM_VFIO; + pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev); + } else { + pbdev->fh |= FH_SHM_EMUL; + } + if (s390_pci_msix_init(pbdev)) { error_setg(errp, "MSI-X support is mandatory " "in the S390 architecture"); @@ -1004,6 +1007,9 @@ static void 
s390_pcihost_unplug(HotplugHandler *hotplug_dev, DeviceState *dev, pbdev->fid = 0; QTAILQ_REMOVE(&s->zpci_devs, pbdev, link); g_hash_table_remove(s->zpci_table, &pbdev->idx); + if (pbdev->iommu->dma_limit) { + s390_pci_end_dma_count(s, pbdev->iommu->dma_limit); + } qdev_unrealize(dev); } } diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index 639b13c8d6..4eadd9e794 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -32,6 +32,20 @@ } \ } while (0) +static inline void inc_dma_avail(S390PCIIOMMU *iommu) +{ + if (iommu->dma_limit) { + iommu->dma_limit->avail++; + } +} + +static inline void dec_dma_avail(S390PCIIOMMU *iommu) +{ + if (iommu->dma_limit) { + iommu->dma_limit->avail--; + } +} + static void s390_set_status_code(CPUS390XState *env, uint8_t r, uint64_t status_code) { @@ -572,7 +586,8 @@ int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) return 0; } -static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) +static uint32_t s390_pci_update_iotlb(S390PCIIOMMU *iommu, + S390IOTLBEntry *entry) { S390IOTLBEntry *cache = g_hash_table_lookup(iommu->iotlb, &entry->iova); IOMMUTLBEntry notify = { @@ -585,14 +600,15 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) if (entry->perm == IOMMU_NONE) { if (!cache) { - return; + goto out; } g_hash_table_remove(iommu->iotlb, &entry->iova); + inc_dma_avail(iommu); } else { if (cache) { if (cache->perm == entry->perm && cache->translated_addr == entry->translated_addr) { - return; + goto out; } notify.perm = IOMMU_NONE; @@ -606,9 +622,13 @@ static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) cache->len = PAGE_SIZE; cache->perm = entry->perm; g_hash_table_replace(iommu->iotlb, &cache->iova, cache); + dec_dma_avail(iommu); } memory_region_notify_iommu(&iommu->iommu_mr, 0, notify); + +out: + return iommu->dma_limit ? 
iommu->dma_limit->avail : 1; } int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) @@ -620,6 +640,7 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) S390PCIIOMMU *iommu; S390IOTLBEntry entry; hwaddr start, end; + uint32_t dma_avail; if (env->psw.mask & PSW_MASK_PSTATE) { s390_program_interrupt(env, PGM_PRIVILEGED, ra); @@ -658,6 +679,11 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) } iommu = pbdev->iommu; + if (iommu->dma_limit) { + dma_avail = iommu->dma_limit->avail; + } else { + dma_avail = 1; + } if (!iommu->g_iota) { error = ERR_EVENT_INVALAS; goto err; @@ -675,8 +701,9 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) } start += entry.len; - while (entry.iova < start && entry.iova < end) { - s390_pci_update_iotlb(iommu, &entry); + while (entry.iova < start && entry.iova < end && + (dma_avail > 0 || entry.perm == IOMMU_NONE)) { + dma_avail = s390_pci_update_iotlb(iommu, &entry); entry.iova += PAGE_SIZE; entry.translated_addr += PAGE_SIZE; } @@ -689,7 +716,13 @@ err: s390_pci_generate_error_event(error, pbdev->fh, pbdev->fid, start, 0); } else { pbdev->fmb.counter[ZPCI_FMB_CNT_RPCIT]++; - setcc(cpu, ZPCI_PCI_LS_OK); + if (dma_avail > 0) { + setcc(cpu, ZPCI_PCI_LS_OK); + } else { + /* vfio DMA mappings are exhausted, trigger a RPCIT */ + setcc(cpu, ZPCI_PCI_LS_ERR); + s390_set_status_code(env, r1, ZPCI_RPCIT_ST_INSUFF_RES); + } } return 0; } diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c index cb3f4d98ad..0621fa386c 100644 --- a/hw/s390x/s390-pci-vfio.c +++ b/hw/s390x/s390-pci-vfio.c @@ -12,7 +12,9 @@ #include #include "qemu/osdep.h" +#include "hw/s390x/s390-pci-bus.h" #include "hw/s390x/s390-pci-vfio.h" +#include "hw/vfio/pci.h" #include "hw/vfio/vfio-common.h" /* @@ -52,3 +54,43 @@ retry: return vfio_get_info_dma_avail(info, avail); } +S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, + S390PCIBusDevice *pbdev) +{ + 
S390PCIDMACount *cnt; + uint32_t avail; + VFIOPCIDevice *vpdev = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + int id; + + assert(vpdev); + + id = vpdev->vbasedev.group->container->fd; + + if (!s390_pci_update_dma_avail(id, &avail)) { + return NULL; + } + + QTAILQ_FOREACH(cnt, &s->zpci_dma_limit, link) { + if (cnt->id == id) { + cnt->users++; + return cnt; + } + } + + cnt = g_new0(S390PCIDMACount, 1); + cnt->id = id; + cnt->users = 1; + cnt->avail = avail; + QTAILQ_INSERT_TAIL(&s->zpci_dma_limit, cnt, link); + return cnt; +} + +void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt) +{ + assert(cnt); + + cnt->users--; + if (cnt->users == 0) { + QTAILQ_REMOVE(&s->zpci_dma_limit, cnt, link); + } +} diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h index 97464d0ad3..6a35f1365b 100644 --- a/include/hw/s390x/s390-pci-bus.h +++ b/include/hw/s390x/s390-pci-bus.h @@ -262,6 +262,13 @@ typedef struct S390IOTLBEntry { uint64_t perm; } S390IOTLBEntry; +typedef struct S390PCIDMACount { + int id; + int users; + uint32_t avail; + QTAILQ_ENTRY(S390PCIDMACount) link; +} S390PCIDMACount; + struct S390PCIIOMMU { Object parent_obj; S390PCIBusDevice *pbdev; @@ -273,6 +280,7 @@ struct S390PCIIOMMU { uint64_t pba; uint64_t pal; GHashTable *iotlb; + S390PCIDMACount *dma_limit; }; typedef struct S390PCIIOMMUTable { @@ -348,6 +356,7 @@ struct S390pciState { GHashTable *zpci_table; QTAILQ_HEAD(, SeiContainer) pending_sei; QTAILQ_HEAD(, S390PCIBusDevice) zpci_devs; + QTAILQ_HEAD(, S390PCIDMACount) zpci_dma_limit; }; S390pciState *s390_get_phb(void); diff --git a/include/hw/s390x/s390-pci-inst.h b/include/hw/s390x/s390-pci-inst.h index fa3bf8b5aa..8ee3a3c237 100644 --- a/include/hw/s390x/s390-pci-inst.h +++ b/include/hw/s390x/s390-pci-inst.h @@ -254,6 +254,9 @@ typedef struct ClpReqRspQueryPciGrp { #define ZPCI_STPCIFC_ST_INVAL_DMAAS 28 #define ZPCI_STPCIFC_ST_ERROR_RECOVER 40 +/* Refresh PCI Translations status codes */ +#define 
ZPCI_RPCIT_ST_INSUFF_RES 16 + /* FIB function controls */ #define ZPCI_FIB_FC_ENABLED 0x80 #define ZPCI_FIB_FC_ERROR 0x40 diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h index 1727292e9b..539bcf04eb 100644 --- a/include/hw/s390x/s390-pci-vfio.h +++ b/include/hw/s390x/s390-pci-vfio.h @@ -12,13 +12,25 @@ #ifndef HW_S390_PCI_VFIO_H #define HW_S390_PCI_VFIO_H +#include "hw/s390x/s390-pci-bus.h" + #ifdef CONFIG_LINUX bool s390_pci_update_dma_avail(int fd, unsigned int *avail); +S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, + S390PCIBusDevice *pbdev); +void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt); #else static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail) { return false; } +static inline S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, + S390PCIBusDevice *pbdev) +{ + return NULL; +} +static inline void s390_pci_end_dma_count(S390pciState *s, + S390PCIDMACount *cnt) { } #endif #endif From c04274f49e0dd1f1279c0f74cbb89a902d8372eb Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Mon, 26 Oct 2020 11:34:36 -0400 Subject: [PATCH 25/32] s390x/pci: create a header dedicated to PCI CLP To have a clean separation between s390-pci-bus.h and s390-pci-inst.h headers we export the PCI CLP instructions in a dedicated header. 
Signed-off-by: Pierre Morel Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- include/hw/s390x/s390-pci-bus.h | 1 + include/hw/s390x/s390-pci-clp.h | 211 +++++++++++++++++++++++++++++++ include/hw/s390x/s390-pci-inst.h | 196 ---------------------------- 3 files changed, 212 insertions(+), 196 deletions(-) create mode 100644 include/hw/s390x/s390-pci-clp.h diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h index 6a35f1365b..5f339e57fb 100644 --- a/include/hw/s390x/s390-pci-bus.h +++ b/include/hw/s390x/s390-pci-bus.h @@ -19,6 +19,7 @@ #include "hw/s390x/sclp.h" #include "hw/s390x/s390_flic.h" #include "hw/s390x/css.h" +#include "hw/s390x/s390-pci-clp.h" #include "qom/object.h" #define TYPE_S390_PCI_HOST_BRIDGE "s390-pcihost" diff --git a/include/hw/s390x/s390-pci-clp.h b/include/hw/s390x/s390-pci-clp.h new file mode 100644 index 0000000000..3708acd173 --- /dev/null +++ b/include/hw/s390x/s390-pci-clp.h @@ -0,0 +1,211 @@ +/* + * s390 CLP instruction definitions + * + * Copyright 2019 IBM Corp. + * Author(s): Pierre Morel + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. 
+ */ + +#ifndef HW_S390_PCI_CLP +#define HW_S390_PCI_CLP + +/* CLP common request & response block size */ +#define CLP_BLK_SIZE 4096 +#define PCI_BAR_COUNT 6 +#define PCI_MAX_FUNCTIONS 4096 + +typedef struct ClpReqHdr { + uint16_t len; + uint16_t cmd; +} QEMU_PACKED ClpReqHdr; + +typedef struct ClpRspHdr { + uint16_t len; + uint16_t rsp; +} QEMU_PACKED ClpRspHdr; + +/* CLP Response Codes */ +#define CLP_RC_OK 0x0010 /* Command request successfully */ +#define CLP_RC_CMD 0x0020 /* Command code not recognized */ +#define CLP_RC_PERM 0x0030 /* Command not authorized */ +#define CLP_RC_FMT 0x0040 /* Invalid command request format */ +#define CLP_RC_LEN 0x0050 /* Invalid command request length */ +#define CLP_RC_8K 0x0060 /* Command requires 8K LPCB */ +#define CLP_RC_RESNOT0 0x0070 /* Reserved field not zero */ +#define CLP_RC_NODATA 0x0080 /* No data available */ +#define CLP_RC_FC_UNKNOWN 0x0100 /* Function code not recognized */ + +/* + * Call Logical Processor - Command Codes + */ +#define CLP_LIST_PCI 0x0002 +#define CLP_QUERY_PCI_FN 0x0003 +#define CLP_QUERY_PCI_FNGRP 0x0004 +#define CLP_SET_PCI_FN 0x0005 + +/* PCI function handle list entry */ +typedef struct ClpFhListEntry { + uint16_t device_id; + uint16_t vendor_id; +#define CLP_FHLIST_MASK_CONFIG 0x80000000 + uint32_t config; + uint32_t fid; + uint32_t fh; +} QEMU_PACKED ClpFhListEntry; + +#define CLP_RC_SETPCIFN_FH 0x0101 /* Invalid PCI fn handle */ +#define CLP_RC_SETPCIFN_FHOP 0x0102 /* Fn handle not valid for op */ +#define CLP_RC_SETPCIFN_DMAAS 0x0103 /* Invalid DMA addr space */ +#define CLP_RC_SETPCIFN_RES 0x0104 /* Insufficient resources */ +#define CLP_RC_SETPCIFN_ALRDY 0x0105 /* Fn already in requested state */ +#define CLP_RC_SETPCIFN_ERR 0x0106 /* Fn in permanent error state */ +#define CLP_RC_SETPCIFN_RECPND 0x0107 /* Error recovery pending */ +#define CLP_RC_SETPCIFN_BUSY 0x0108 /* Fn busy */ +#define CLP_RC_LISTPCI_BADRT 0x010a /* Resume token not recognized */ +#define 
CLP_RC_QUERYPCIFG_PFGID 0x010b /* Unrecognized PFGID */ + +/* request or response block header length */ +#define LIST_PCI_HDR_LEN 32 + +/* Number of function handles fitting in response block */ +#define CLP_FH_LIST_NR_ENTRIES \ + ((CLP_BLK_SIZE - 2 * LIST_PCI_HDR_LEN) \ + / sizeof(ClpFhListEntry)) + +#define CLP_SET_ENABLE_PCI_FN 0 /* Yes, 0 enables it */ +#define CLP_SET_DISABLE_PCI_FN 1 /* Yes, 1 disables it */ + +#define CLP_UTIL_STR_LEN 64 + +#define CLP_MASK_FMT 0xf0000000 + +/* List PCI functions request */ +typedef struct ClpReqListPci { + ClpReqHdr hdr; + uint32_t fmt; + uint64_t reserved1; + uint64_t resume_token; + uint64_t reserved2; +} QEMU_PACKED ClpReqListPci; + +/* List PCI functions response */ +typedef struct ClpRspListPci { + ClpRspHdr hdr; + uint32_t fmt; + uint64_t reserved1; + uint64_t resume_token; + uint32_t mdd; + uint16_t max_fn; + uint8_t flags; + uint8_t entry_size; + ClpFhListEntry fh_list[CLP_FH_LIST_NR_ENTRIES]; +} QEMU_PACKED ClpRspListPci; + +/* Query PCI function request */ +typedef struct ClpReqQueryPci { + ClpReqHdr hdr; + uint32_t fmt; + uint64_t reserved1; + uint32_t fh; /* function handle */ + uint32_t reserved2; + uint64_t reserved3; +} QEMU_PACKED ClpReqQueryPci; + +/* Query PCI function response */ +typedef struct ClpRspQueryPci { + ClpRspHdr hdr; + uint32_t fmt; + uint64_t reserved1; + uint16_t vfn; /* virtual fn number */ +#define CLP_RSP_QPCI_MASK_UTIL 0x100 +#define CLP_RSP_QPCI_MASK_PFGID 0xff + uint16_t ug; + uint32_t fid; /* pci function id */ + uint8_t bar_size[PCI_BAR_COUNT]; + uint16_t pchid; + uint32_t bar[PCI_BAR_COUNT]; + uint64_t reserved2; + uint64_t sdma; /* start dma as */ + uint64_t edma; /* end dma as */ + uint32_t reserved3[11]; + uint32_t uid; + uint8_t util_str[CLP_UTIL_STR_LEN]; /* utility string */ +} QEMU_PACKED ClpRspQueryPci; + +/* Query PCI function group request */ +typedef struct ClpReqQueryPciGrp { + ClpReqHdr hdr; + uint32_t fmt; + uint64_t reserved1; +#define CLP_REQ_QPCIG_MASK_PFGID 0xff + 
uint32_t g; + uint32_t reserved2; + uint64_t reserved3; +} QEMU_PACKED ClpReqQueryPciGrp; + +/* Query PCI function group response */ +typedef struct ClpRspQueryPciGrp { + ClpRspHdr hdr; + uint32_t fmt; + uint64_t reserved1; +#define CLP_RSP_QPCIG_MASK_NOI 0xfff + uint16_t i; + uint8_t version; +#define CLP_RSP_QPCIG_MASK_FRAME 0x2 +#define CLP_RSP_QPCIG_MASK_REFRESH 0x1 + uint8_t fr; + uint16_t maxstbl; + uint16_t mui; + uint64_t reserved3; + uint64_t dasm; /* dma address space mask */ + uint64_t msia; /* MSI address */ + uint64_t reserved4; + uint64_t reserved5; +} QEMU_PACKED ClpRspQueryPciGrp; + +/* Set PCI function request */ +typedef struct ClpReqSetPci { + ClpReqHdr hdr; + uint32_t fmt; + uint64_t reserved1; + uint32_t fh; /* function handle */ + uint16_t reserved2; + uint8_t oc; /* operation controls */ + uint8_t ndas; /* number of dma spaces */ + uint64_t reserved3; +} QEMU_PACKED ClpReqSetPci; + +/* Set PCI function response */ +typedef struct ClpRspSetPci { + ClpRspHdr hdr; + uint32_t fmt; + uint64_t reserved1; + uint32_t fh; /* function handle */ + uint32_t reserved3; + uint64_t reserved4; +} QEMU_PACKED ClpRspSetPci; + +typedef struct ClpReqRspListPci { + ClpReqListPci request; + ClpRspListPci response; +} QEMU_PACKED ClpReqRspListPci; + +typedef struct ClpReqRspSetPci { + ClpReqSetPci request; + ClpRspSetPci response; +} QEMU_PACKED ClpReqRspSetPci; + +typedef struct ClpReqRspQueryPci { + ClpReqQueryPci request; + ClpRspQueryPci response; +} QEMU_PACKED ClpReqRspQueryPci; + +typedef struct ClpReqRspQueryPciGrp { + ClpReqQueryPciGrp request; + ClpRspQueryPciGrp response; +} QEMU_PACKED ClpReqRspQueryPciGrp; + +#endif diff --git a/include/hw/s390x/s390-pci-inst.h b/include/hw/s390x/s390-pci-inst.h index 8ee3a3c237..a55c448aad 100644 --- a/include/hw/s390x/s390-pci-inst.h +++ b/include/hw/s390x/s390-pci-inst.h @@ -17,202 +17,6 @@ #include "s390-pci-bus.h" #include "sysemu/dma.h" -/* CLP common request & response block size */ -#define CLP_BLK_SIZE 4096 
-#define PCI_BAR_COUNT 6 -#define PCI_MAX_FUNCTIONS 4096 - -typedef struct ClpReqHdr { - uint16_t len; - uint16_t cmd; -} QEMU_PACKED ClpReqHdr; - -typedef struct ClpRspHdr { - uint16_t len; - uint16_t rsp; -} QEMU_PACKED ClpRspHdr; - -/* CLP Response Codes */ -#define CLP_RC_OK 0x0010 /* Command request successfully */ -#define CLP_RC_CMD 0x0020 /* Command code not recognized */ -#define CLP_RC_PERM 0x0030 /* Command not authorized */ -#define CLP_RC_FMT 0x0040 /* Invalid command request format */ -#define CLP_RC_LEN 0x0050 /* Invalid command request length */ -#define CLP_RC_8K 0x0060 /* Command requires 8K LPCB */ -#define CLP_RC_RESNOT0 0x0070 /* Reserved field not zero */ -#define CLP_RC_NODATA 0x0080 /* No data available */ -#define CLP_RC_FC_UNKNOWN 0x0100 /* Function code not recognized */ - -/* - * Call Logical Processor - Command Codes - */ -#define CLP_LIST_PCI 0x0002 -#define CLP_QUERY_PCI_FN 0x0003 -#define CLP_QUERY_PCI_FNGRP 0x0004 -#define CLP_SET_PCI_FN 0x0005 - -/* PCI function handle list entry */ -typedef struct ClpFhListEntry { - uint16_t device_id; - uint16_t vendor_id; -#define CLP_FHLIST_MASK_CONFIG 0x80000000 - uint32_t config; - uint32_t fid; - uint32_t fh; -} QEMU_PACKED ClpFhListEntry; - -#define CLP_RC_SETPCIFN_FH 0x0101 /* Invalid PCI fn handle */ -#define CLP_RC_SETPCIFN_FHOP 0x0102 /* Fn handle not valid for op */ -#define CLP_RC_SETPCIFN_DMAAS 0x0103 /* Invalid DMA addr space */ -#define CLP_RC_SETPCIFN_RES 0x0104 /* Insufficient resources */ -#define CLP_RC_SETPCIFN_ALRDY 0x0105 /* Fn already in requested state */ -#define CLP_RC_SETPCIFN_ERR 0x0106 /* Fn in permanent error state */ -#define CLP_RC_SETPCIFN_RECPND 0x0107 /* Error recovery pending */ -#define CLP_RC_SETPCIFN_BUSY 0x0108 /* Fn busy */ -#define CLP_RC_LISTPCI_BADRT 0x010a /* Resume token not recognized */ -#define CLP_RC_QUERYPCIFG_PFGID 0x010b /* Unrecognized PFGID */ - -/* request or response block header length */ -#define LIST_PCI_HDR_LEN 32 - -/* Number of 
function handles fitting in response block */ -#define CLP_FH_LIST_NR_ENTRIES \ - ((CLP_BLK_SIZE - 2 * LIST_PCI_HDR_LEN) \ - / sizeof(ClpFhListEntry)) - -#define CLP_SET_ENABLE_PCI_FN 0 /* Yes, 0 enables it */ -#define CLP_SET_DISABLE_PCI_FN 1 /* Yes, 1 disables it */ - -#define CLP_UTIL_STR_LEN 64 - -#define CLP_MASK_FMT 0xf0000000 - -/* List PCI functions request */ -typedef struct ClpReqListPci { - ClpReqHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint64_t resume_token; - uint64_t reserved2; -} QEMU_PACKED ClpReqListPci; - -/* List PCI functions response */ -typedef struct ClpRspListPci { - ClpRspHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint64_t resume_token; - uint32_t mdd; - uint16_t max_fn; - uint8_t flags; - uint8_t entry_size; - ClpFhListEntry fh_list[CLP_FH_LIST_NR_ENTRIES]; -} QEMU_PACKED ClpRspListPci; - -/* Query PCI function request */ -typedef struct ClpReqQueryPci { - ClpReqHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint32_t fh; /* function handle */ - uint32_t reserved2; - uint64_t reserved3; -} QEMU_PACKED ClpReqQueryPci; - -/* Query PCI function response */ -typedef struct ClpRspQueryPci { - ClpRspHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint16_t vfn; /* virtual fn number */ -#define CLP_RSP_QPCI_MASK_UTIL 0x100 -#define CLP_RSP_QPCI_MASK_PFGID 0xff - uint16_t ug; - uint32_t fid; /* pci function id */ - uint8_t bar_size[PCI_BAR_COUNT]; - uint16_t pchid; - uint32_t bar[PCI_BAR_COUNT]; - uint64_t reserved2; - uint64_t sdma; /* start dma as */ - uint64_t edma; /* end dma as */ - uint32_t reserved3[11]; - uint32_t uid; - uint8_t util_str[CLP_UTIL_STR_LEN]; /* utility string */ -} QEMU_PACKED ClpRspQueryPci; - -/* Query PCI function group request */ -typedef struct ClpReqQueryPciGrp { - ClpReqHdr hdr; - uint32_t fmt; - uint64_t reserved1; -#define CLP_REQ_QPCIG_MASK_PFGID 0xff - uint32_t g; - uint32_t reserved2; - uint64_t reserved3; -} QEMU_PACKED ClpReqQueryPciGrp; - -/* Query PCI function group response */ -typedef struct 
ClpRspQueryPciGrp { - ClpRspHdr hdr; - uint32_t fmt; - uint64_t reserved1; -#define CLP_RSP_QPCIG_MASK_NOI 0xfff - uint16_t i; - uint8_t version; -#define CLP_RSP_QPCIG_MASK_FRAME 0x2 -#define CLP_RSP_QPCIG_MASK_REFRESH 0x1 - uint8_t fr; - uint16_t maxstbl; - uint16_t mui; - uint64_t reserved3; - uint64_t dasm; /* dma address space mask */ - uint64_t msia; /* MSI address */ - uint64_t reserved4; - uint64_t reserved5; -} QEMU_PACKED ClpRspQueryPciGrp; - -/* Set PCI function request */ -typedef struct ClpReqSetPci { - ClpReqHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint32_t fh; /* function handle */ - uint16_t reserved2; - uint8_t oc; /* operation controls */ - uint8_t ndas; /* number of dma spaces */ - uint64_t reserved3; -} QEMU_PACKED ClpReqSetPci; - -/* Set PCI function response */ -typedef struct ClpRspSetPci { - ClpRspHdr hdr; - uint32_t fmt; - uint64_t reserved1; - uint32_t fh; /* function handle */ - uint32_t reserved3; - uint64_t reserved4; -} QEMU_PACKED ClpRspSetPci; - -typedef struct ClpReqRspListPci { - ClpReqListPci request; - ClpRspListPci response; -} QEMU_PACKED ClpReqRspListPci; - -typedef struct ClpReqRspSetPci { - ClpReqSetPci request; - ClpRspSetPci response; -} QEMU_PACKED ClpReqRspSetPci; - -typedef struct ClpReqRspQueryPci { - ClpReqQueryPci request; - ClpRspQueryPci response; -} QEMU_PACKED ClpReqRspQueryPci; - -typedef struct ClpReqRspQueryPciGrp { - ClpReqQueryPciGrp request; - ClpRspQueryPciGrp response; -} QEMU_PACKED ClpReqRspQueryPciGrp; - /* Load/Store status codes */ #define ZPCI_PCI_ST_FUNC_NOT_ENABLED 4 #define ZPCI_PCI_ST_FUNC_IN_ERR 8 From 28dc86a07299fba784ca2352f95e30fe603e17ab Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Mon, 26 Oct 2020 11:34:37 -0400 Subject: [PATCH 26/32] s390x/pci: use a PCI Group structure We use a S390PCIGroup structure to hold the information related to a zPCI Function group. This allows us to be ready to support multiple groups and to retrieve the group information from the host. 
Signed-off-by: Pierre Morel Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/s390x/s390-pci-bus.c | 42 +++++++++++++++++++++++++++++++++ hw/s390x/s390-pci-inst.c | 23 +++++++++++------- include/hw/s390x/s390-pci-bus.h | 10 ++++++++ 3 files changed, 66 insertions(+), 9 deletions(-) diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 218717397a..4c7f06d5cf 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -738,6 +738,46 @@ static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn) object_unref(OBJECT(iommu)); } +static S390PCIGroup *s390_group_create(int id) +{ + S390PCIGroup *group; + S390pciState *s = s390_get_phb(); + + group = g_new0(S390PCIGroup, 1); + group->id = id; + QTAILQ_INSERT_TAIL(&s->zpci_groups, group, link); + return group; +} + +S390PCIGroup *s390_group_find(int id) +{ + S390PCIGroup *group; + S390pciState *s = s390_get_phb(); + + QTAILQ_FOREACH(group, &s->zpci_groups, link) { + if (group->id == id) { + return group; + } + } + return NULL; +} + +static void s390_pci_init_default_group(void) +{ + S390PCIGroup *group; + ClpRspQueryPciGrp *resgrp; + + group = s390_group_create(ZPCI_DEFAULT_FN_GRP); + resgrp = &group->zpci_group; + resgrp->fr = 1; + stq_p(&resgrp->dasm, 0); + stq_p(&resgrp->msia, ZPCI_MSI_ADDR); + stw_p(&resgrp->mui, DEFAULT_MUI); + stw_p(&resgrp->i, 128); + stw_p(&resgrp->maxstbl, 128); + resgrp->version = 0; +} + static void s390_pcihost_realize(DeviceState *dev, Error **errp) { PCIBus *b; @@ -766,7 +806,9 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) QTAILQ_INIT(&s->pending_sei); QTAILQ_INIT(&s->zpci_devs); QTAILQ_INIT(&s->zpci_dma_limit); + QTAILQ_INIT(&s->zpci_groups); + s390_pci_init_default_group(); css_register_io_adapters(CSS_IO_ADAPTER_PCI, true, false, S390_ADAPTER_SUPPRESSIBLE, errp); } diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index 4eadd9e794..c25b2a67ef 100644 --- 
a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -298,21 +298,25 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) stq_p(&resquery->edma, ZPCI_EDMA_ADDR); stl_p(&resquery->fid, pbdev->fid); stw_p(&resquery->pchid, 0); - stw_p(&resquery->ug, 1); + stw_p(&resquery->ug, ZPCI_DEFAULT_FN_GRP); stl_p(&resquery->uid, pbdev->uid); stw_p(&resquery->hdr.rsp, CLP_RC_OK); break; } case CLP_QUERY_PCI_FNGRP: { ClpRspQueryPciGrp *resgrp = (ClpRspQueryPciGrp *)resh; - resgrp->fr = 1; - stq_p(&resgrp->dasm, 0); - stq_p(&resgrp->msia, ZPCI_MSI_ADDR); - stw_p(&resgrp->mui, DEFAULT_MUI); - stw_p(&resgrp->i, 128); - stw_p(&resgrp->maxstbl, 128); - resgrp->version = 0; + ClpReqQueryPciGrp *reqgrp = (ClpReqQueryPciGrp *)reqh; + S390PCIGroup *group; + + group = s390_group_find(reqgrp->g); + if (!group) { + /* We do not allow access to unknown groups */ + /* The group must have been obtained with a vfio device */ + stw_p(&resgrp->hdr.rsp, CLP_RC_QUERYPCIFG_PFGID); + goto out; + } + memcpy(resgrp, &group->zpci_group, sizeof(ClpRspQueryPciGrp)); stw_p(&resgrp->hdr.rsp, CLP_RC_OK); break; } @@ -787,7 +791,8 @@ int pcistb_service_call(S390CPU *cpu, uint8_t r1, uint8_t r3, uint64_t gaddr, } /* Length must be greater than 8, a multiple of 8 */ /* and not greater than maxstbl */ - if ((len <= 8) || (len % 8) || (len > pbdev->maxstbl)) { + if ((len <= 8) || (len % 8) || + (len > pbdev->pci_group->zpci_group.maxstbl)) { goto specification_error; } /* Do not cross a 4K-byte boundary */ diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h index 5f339e57fb..869c0f254b 100644 --- a/include/hw/s390x/s390-pci-bus.h +++ b/include/hw/s390x/s390-pci-bus.h @@ -316,6 +316,14 @@ typedef struct ZpciFmb { } ZpciFmb; QEMU_BUILD_BUG_MSG(offsetof(ZpciFmb, fmt0) != 48, "padding in ZpciFmb"); +#define ZPCI_DEFAULT_FN_GRP 0x20 +typedef struct S390PCIGroup { + ClpRspQueryPciGrp zpci_group; + int id; + QTAILQ_ENTRY(S390PCIGroup) link; +} S390PCIGroup; +S390PCIGroup 
*s390_group_find(int id); + struct S390PCIBusDevice { DeviceState qdev; PCIDevice *pdev; @@ -333,6 +341,7 @@ struct S390PCIBusDevice { uint16_t noi; uint16_t maxstbl; uint8_t sum; + S390PCIGroup *pci_group; S390MsixInfo msix; AdapterRoutes routes; S390PCIIOMMU *iommu; @@ -358,6 +367,7 @@ struct S390pciState { QTAILQ_HEAD(, SeiContainer) pending_sei; QTAILQ_HEAD(, S390PCIBusDevice) zpci_devs; QTAILQ_HEAD(, S390PCIDMACount) zpci_dma_limit; + QTAILQ_HEAD(, S390PCIGroup) zpci_groups; }; S390pciState *s390_get_phb(void); From b354d5d8049c513444b51ce841bd3136fed2e234 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:38 -0400 Subject: [PATCH 27/32] s390x/pci: clean up s390 PCI groups Add a step to remove all stashed PCI groups to avoid stale data between machine resets. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/s390x/s390-pci-bus.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 4c7f06d5cf..036cf4635a 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -813,6 +813,18 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp) S390_ADAPTER_SUPPRESSIBLE, errp); } +static void s390_pcihost_unrealize(DeviceState *dev) +{ + S390PCIGroup *group; + S390pciState *s = S390_PCI_HOST_BRIDGE(dev); + + while (!QTAILQ_EMPTY(&s->zpci_groups)) { + group = QTAILQ_FIRST(&s->zpci_groups); + QTAILQ_REMOVE(&s->zpci_groups, group, link); + g_free(group); /* allocated in s390_group_create() */ + } +} + static int s390_pci_msix_init(S390PCIBusDevice *pbdev) { char *name; @@ -1171,6 +1183,7 @@ static void s390_pcihost_class_init(ObjectClass *klass, void *data) dc->reset = s390_pcihost_reset; dc->realize = s390_pcihost_realize; + dc->unrealize = s390_pcihost_unrealize; hc->pre_plug = s390_pcihost_pre_plug; hc->plug = s390_pcihost_plug; hc->unplug_request = s390_pcihost_unplug_request; From 9670ee752727945d8ce4f76efc0b68364b832f20 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date:
Mon, 26 Oct 2020 11:34:39 -0400 Subject: [PATCH 28/32] s390x/pci: use a PCI Function structure We use a ClpRspQueryPci structure to hold the information related to a zPCI Function. This allows us to be ready to support different zPCI functions and to retrieve the zPCI function information from the host. Signed-off-by: Pierre Morel Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/s390x/s390-pci-bus.c | 12 ++++++++++++ hw/s390x/s390-pci-inst.c | 8 ++------ include/hw/s390x/s390-pci-bus.h | 1 + 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 036cf4635a..072b56e45e 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -778,6 +778,17 @@ static void s390_pci_init_default_group(void) resgrp->version = 0; } +static void set_pbdev_info(S390PCIBusDevice *pbdev) +{ + pbdev->zpci_fn.sdma = ZPCI_SDMA_ADDR; + pbdev->zpci_fn.edma = ZPCI_EDMA_ADDR; + pbdev->zpci_fn.pchid = 0; + pbdev->zpci_fn.ug = ZPCI_DEFAULT_FN_GRP; + pbdev->zpci_fn.fid = pbdev->fid; + pbdev->zpci_fn.uid = pbdev->uid; + pbdev->pci_group = s390_group_find(ZPCI_DEFAULT_FN_GRP); +} + static void s390_pcihost_realize(DeviceState *dev, Error **errp) { PCIBus *b; @@ -1000,6 +1011,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn); pbdev->iommu->pbdev = pbdev; pbdev->state = ZPCI_FS_DISABLED; + set_pbdev_info(pbdev); if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { pbdev->fh |= FH_SHM_VFIO; diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index c25b2a67ef..58cd041d17 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -281,6 +281,8 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) goto out; } + memcpy(resquery, &pbdev->zpci_fn, sizeof(*resquery)); + for (i = 0; i < PCI_BAR_COUNT; i++) { uint32_t data = pci_get_long(pbdev->pdev->config + 
PCI_BASE_ADDRESS_0 + (i * 4)); @@ -294,12 +296,6 @@ int clp_service_call(S390CPU *cpu, uint8_t r2, uintptr_t ra) resquery->bar_size[i]); } - stq_p(&resquery->sdma, ZPCI_SDMA_ADDR); - stq_p(&resquery->edma, ZPCI_EDMA_ADDR); - stl_p(&resquery->fid, pbdev->fid); - stw_p(&resquery->pchid, 0); - stw_p(&resquery->ug, ZPCI_DEFAULT_FN_GRP); - stl_p(&resquery->uid, pbdev->uid); stw_p(&resquery->hdr.rsp, CLP_RC_OK); break; } diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h index 869c0f254b..fe36f163ab 100644 --- a/include/hw/s390x/s390-pci-bus.h +++ b/include/hw/s390x/s390-pci-bus.h @@ -342,6 +342,7 @@ struct S390PCIBusDevice { uint16_t maxstbl; uint8_t sum; S390PCIGroup *pci_group; + ClpRspQueryPci zpci_fn; S390MsixInfo msix; AdapterRoutes routes; S390PCIIOMMU *iommu; From 92fe289ace3e559e2d18d0c2e49cdfb4cbd5a59b Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:40 -0400 Subject: [PATCH 29/32] vfio: Add routine for finding VFIO_DEVICE_GET_INFO capabilities Now that VFIO_DEVICE_GET_INFO supports capability chains, add a helper function to find specific capabilities in the chain. 
Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- hw/vfio/common.c | 10 ++++++++++ include/hw/vfio/vfio-common.h | 2 ++ 2 files changed, 12 insertions(+) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 920786a23e..57f55f0447 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -1159,6 +1159,16 @@ vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id) return vfio_get_cap((void *)info, info->cap_offset, id); } +struct vfio_info_cap_header * +vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id) +{ + if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) { + return NULL; + } + + return vfio_get_cap((void *)info, info->cap_offset, id); +} + bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, unsigned int *avail) { diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 1d14946a9d..baeb4dcff1 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -216,6 +216,8 @@ struct vfio_info_cap_header * vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id); bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info, unsigned int *avail); +struct vfio_info_cap_header * +vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id); #endif extern const MemoryListener vfio_prereg_listener; From 1e7552ff5c34972a7a17d2b06900a0b66c79a68b Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Mon, 26 Oct 2020 11:34:41 -0400 Subject: [PATCH 30/32] s390x/pci: get zPCI function info from host We use the capability chains of the VFIO_DEVICE_GET_INFO ioctl to retrieve the CLP information that the kernel exports. To be compatible with previous kernel versions we fall back on previous predefined values, same as the emulation values, when the ioctl is found to not support capability chains. If individual CLP capabilities are not found, we fall back on default values for only those capabilities missing from the chain. 
This patch is based on work previously done by Pierre Morel. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck [aw: non-Linux build fixes] Signed-off-by: Alex Williamson --- hw/s390x/s390-pci-bus.c | 9 +- hw/s390x/s390-pci-vfio.c | 180 +++++++++++++++++++++++++++++++ hw/s390x/trace-events | 5 + include/hw/s390x/s390-pci-bus.h | 1 + include/hw/s390x/s390-pci-clp.h | 12 ++- include/hw/s390x/s390-pci-vfio.h | 2 + 6 files changed, 202 insertions(+), 7 deletions(-) diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 072b56e45e..48a3be802f 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -738,7 +738,7 @@ static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn) object_unref(OBJECT(iommu)); } -static S390PCIGroup *s390_group_create(int id) +S390PCIGroup *s390_group_create(int id) { S390PCIGroup *group; S390pciState *s = s390_get_phb(); @@ -783,7 +783,7 @@ static void set_pbdev_info(S390PCIBusDevice *pbdev) pbdev->zpci_fn.sdma = ZPCI_SDMA_ADDR; pbdev->zpci_fn.edma = ZPCI_EDMA_ADDR; pbdev->zpci_fn.pchid = 0; - pbdev->zpci_fn.ug = ZPCI_DEFAULT_FN_GRP; + pbdev->zpci_fn.pfgid = ZPCI_DEFAULT_FN_GRP; pbdev->zpci_fn.fid = pbdev->fid; pbdev->zpci_fn.uid = pbdev->uid; pbdev->pci_group = s390_group_find(ZPCI_DEFAULT_FN_GRP); @@ -863,7 +863,8 @@ static int s390_pci_msix_init(S390PCIBusDevice *pbdev) name = g_strdup_printf("msix-s390-%04x", pbdev->uid); memory_region_init_io(&pbdev->msix_notify_mr, OBJECT(pbdev), &s390_msi_ctrl_ops, pbdev, name, PAGE_SIZE); - memory_region_add_subregion(&pbdev->iommu->mr, ZPCI_MSI_ADDR, + memory_region_add_subregion(&pbdev->iommu->mr, + pbdev->pci_group->zpci_group.msia, &pbdev->msix_notify_mr); g_free(name); @@ -1016,6 +1017,8 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev, if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) { pbdev->fh |= FH_SHM_VFIO; pbdev->iommu->dma_limit = s390_pci_start_dma_count(s, pbdev); + /* Fill in CLP information passed via the 
vfio region */ + s390_pci_get_clp_info(pbdev); } else { pbdev->fh |= FH_SHM_EMUL; } diff --git a/hw/s390x/s390-pci-vfio.c b/hw/s390x/s390-pci-vfio.c index 0621fa386c..d5c78063b5 100644 --- a/hw/s390x/s390-pci-vfio.c +++ b/hw/s390x/s390-pci-vfio.c @@ -10,9 +10,13 @@ */ #include +#include +#include #include "qemu/osdep.h" +#include "trace.h" #include "hw/s390x/s390-pci-bus.h" +#include "hw/s390x/s390-pci-clp.h" #include "hw/s390x/s390-pci-vfio.h" #include "hw/vfio/pci.h" #include "hw/vfio/vfio-common.h" @@ -94,3 +98,179 @@ void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt) QTAILQ_REMOVE(&s->zpci_dma_limit, cnt, link); } } + +static void s390_pci_read_base(S390PCIBusDevice *pbdev, + struct vfio_device_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_device_info_cap_zpci_base *cap; + VFIOPCIDevice *vpci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_BASE); + + /* If capability not provided, just leave the defaults in place */ + if (hdr == NULL) { + trace_s390_pci_clp_cap(vpci->vbasedev.name, + VFIO_DEVICE_INFO_CAP_ZPCI_BASE); + return; + } + cap = (void *) hdr; + + pbdev->zpci_fn.sdma = cap->start_dma; + pbdev->zpci_fn.edma = cap->end_dma; + pbdev->zpci_fn.pchid = cap->pchid; + pbdev->zpci_fn.vfn = cap->vfn; + pbdev->zpci_fn.pfgid = cap->gid; + /* The following values remain 0 until we support other FMB formats */ + pbdev->zpci_fn.fmbl = 0; + pbdev->zpci_fn.pft = 0; +} + +static void s390_pci_read_group(S390PCIBusDevice *pbdev, + struct vfio_device_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_device_info_cap_zpci_group *cap; + ClpRspQueryPciGrp *resgrp; + VFIOPCIDevice *vpci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_GROUP); + + /* If capability not provided, just use the default group */ + if (hdr == NULL) { + trace_s390_pci_clp_cap(vpci->vbasedev.name, + 
VFIO_DEVICE_INFO_CAP_ZPCI_GROUP); + pbdev->zpci_fn.pfgid = ZPCI_DEFAULT_FN_GRP; + pbdev->pci_group = s390_group_find(ZPCI_DEFAULT_FN_GRP); + return; + } + cap = (void *) hdr; + + /* See if the PCI group is already defined, create if not */ + pbdev->pci_group = s390_group_find(pbdev->zpci_fn.pfgid); + + if (!pbdev->pci_group) { + pbdev->pci_group = s390_group_create(pbdev->zpci_fn.pfgid); + + resgrp = &pbdev->pci_group->zpci_group; + if (cap->flags & VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH) { + resgrp->fr = 1; + } + stq_p(&resgrp->dasm, cap->dasm); + stq_p(&resgrp->msia, cap->msi_addr); + stw_p(&resgrp->mui, cap->mui); + stw_p(&resgrp->i, cap->noi); + stw_p(&resgrp->maxstbl, cap->maxstbl); + stb_p(&resgrp->version, cap->version); + } +} + +static void s390_pci_read_util(S390PCIBusDevice *pbdev, + struct vfio_device_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_device_info_cap_zpci_util *cap; + VFIOPCIDevice *vpci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_UTIL); + + /* If capability not provided, just leave the defaults in place */ + if (hdr == NULL) { + trace_s390_pci_clp_cap(vpci->vbasedev.name, + VFIO_DEVICE_INFO_CAP_ZPCI_UTIL); + return; + } + cap = (void *) hdr; + + if (cap->size > CLP_UTIL_STR_LEN) { + trace_s390_pci_clp_cap_size(vpci->vbasedev.name, cap->size, + VFIO_DEVICE_INFO_CAP_ZPCI_UTIL); + return; + } + + pbdev->zpci_fn.flags |= CLP_RSP_QPCI_MASK_UTIL; + memcpy(pbdev->zpci_fn.util_str, cap->util_str, cap->size); +} + +static void s390_pci_read_pfip(S390PCIBusDevice *pbdev, + struct vfio_device_info *info) +{ + struct vfio_info_cap_header *hdr; + struct vfio_device_info_cap_zpci_pfip *cap; + VFIOPCIDevice *vpci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_PFIP); + + /* If capability not provided, just leave the defaults in place */ + if (hdr == NULL) { +
trace_s390_pci_clp_cap(vpci->vbasedev.name, + VFIO_DEVICE_INFO_CAP_ZPCI_PFIP); + return; + } + cap = (void *) hdr; + + if (cap->size > CLP_PFIP_NR_SEGMENTS) { + trace_s390_pci_clp_cap_size(vpci->vbasedev.name, cap->size, + VFIO_DEVICE_INFO_CAP_ZPCI_PFIP); + return; + } + + memcpy(pbdev->zpci_fn.pfip, cap->pfip, cap->size); +} + +/* + * This function will issue the VFIO_DEVICE_GET_INFO ioctl and look for + * capabilities that contain information about CLP features provided by the + * underlying host. + * On entry, defaults have already been placed into the guest CLP response + * buffers. On exit, defaults will have been overwritten for any CLP features + * found in the capability chain; defaults will remain for any CLP features not + * found in the chain. + */ +void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) +{ + g_autofree struct vfio_device_info *info; + VFIOPCIDevice *vfio_pci; + uint32_t argsz; + int fd; + + argsz = sizeof(*info); + info = g_malloc0(argsz); + + vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev); + fd = vfio_pci->vbasedev.fd; + + /* + * If the specified argsz is not large enough to contain all capabilities + * it will be updated upon return from the ioctl. Retry until we have + * a big enough buffer to hold the entire capability chain. On error, + * just exit and rely on CLP defaults. + */ +retry: + info->argsz = argsz; + + if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) { + trace_s390_pci_clp_dev_info(vfio_pci->vbasedev.name); + return; + } + + if (info->argsz > argsz) { + argsz = info->argsz; + info = g_realloc(info, argsz); + goto retry; + } + + /* + * Find the CLP features provided and fill in the guest CLP responses. + * Always call s390_pci_read_base first as information from this could + * determine which function group is used in s390_pci_read_group. + * For any feature not found, the default values will remain in the CLP + * response.
+ */ + s390_pci_read_base(pbdev, info); + s390_pci_read_group(pbdev, info); + s390_pci_read_util(pbdev, info); + s390_pci_read_pfip(pbdev, info); + + return; +} diff --git a/hw/s390x/trace-events b/hw/s390x/trace-events index 0dc5b818c4..8156693749 100644 --- a/hw/s390x/trace-events +++ b/hw/s390x/trace-events @@ -14,3 +14,8 @@ css_do_sic(uint16_t mode, uint8_t isc) "CSS: set interruption mode 0x%x on isc 0 virtio_ccw_interpret_ccw(int cssid, int ssid, int schid, int cmd_code) "VIRTIO-CCW: %x.%x.%04x: interpret command 0x%x" virtio_ccw_new_device(int cssid, int ssid, int schid, int devno, const char *devno_mode) "VIRTIO-CCW: add subchannel %x.%x.%04x, devno 0x%04x (%s)" virtio_ccw_set_ind(uint64_t ind_loc, uint8_t ind_old, uint8_t ind_new) "VIRTIO-CCW: indicator at %" PRIu64 ": 0x%x->0x%x" + +# s390-pci-vfio.c +s390_pci_clp_cap(const char *id, uint32_t cap) "PCI: %s: missing expected CLP capability %u" +s390_pci_clp_cap_size(const char *id, uint32_t size, uint32_t cap) "PCI: %s: bad size (%u) for CLP capability %u" +s390_pci_clp_dev_info(const char *id) "PCI: %s: cannot read vfio device info" diff --git a/include/hw/s390x/s390-pci-bus.h b/include/hw/s390x/s390-pci-bus.h index fe36f163ab..49ae9f03d3 100644 --- a/include/hw/s390x/s390-pci-bus.h +++ b/include/hw/s390x/s390-pci-bus.h @@ -322,6 +322,7 @@ typedef struct S390PCIGroup { int id; QTAILQ_ENTRY(S390PCIGroup) link; } S390PCIGroup; +S390PCIGroup *s390_group_create(int id); S390PCIGroup *s390_group_find(int id); struct S390PCIBusDevice { diff --git a/include/hw/s390x/s390-pci-clp.h b/include/hw/s390x/s390-pci-clp.h index 3708acd173..ea2b1378cd 100644 --- a/include/hw/s390x/s390-pci-clp.h +++ b/include/hw/s390x/s390-pci-clp.h @@ -79,6 +79,7 @@ typedef struct ClpFhListEntry { #define CLP_SET_DISABLE_PCI_FN 1 /* Yes, 1 disables it */ #define CLP_UTIL_STR_LEN 64 +#define CLP_PFIP_NR_SEGMENTS 4 #define CLP_MASK_FMT 0xf0000000 @@ -120,14 +121,17 @@ typedef struct ClpRspQueryPci { uint32_t fmt; uint64_t reserved1; 
uint16_t vfn; /* virtual fn number */ -#define CLP_RSP_QPCI_MASK_UTIL 0x100 -#define CLP_RSP_QPCI_MASK_PFGID 0xff - uint16_t ug; +#define CLP_RSP_QPCI_MASK_UTIL 0x01 + uint8_t flags; + uint8_t pfgid; uint32_t fid; /* pci function id */ uint8_t bar_size[PCI_BAR_COUNT]; uint16_t pchid; uint32_t bar[PCI_BAR_COUNT]; - uint64_t reserved2; + uint8_t pfip[CLP_PFIP_NR_SEGMENTS]; + uint16_t reserved2; + uint8_t fmbl; + uint8_t pft; uint64_t sdma; /* start dma as */ uint64_t edma; /* end dma as */ uint32_t reserved3[11]; diff --git a/include/hw/s390x/s390-pci-vfio.h b/include/hw/s390x/s390-pci-vfio.h index 539bcf04eb..c7984905b3 100644 --- a/include/hw/s390x/s390-pci-vfio.h +++ b/include/hw/s390x/s390-pci-vfio.h @@ -19,6 +19,7 @@ bool s390_pci_update_dma_avail(int fd, unsigned int *avail); S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, S390PCIBusDevice *pbdev); void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt); +void s390_pci_get_clp_info(S390PCIBusDevice *pbdev); #else static inline bool s390_pci_update_dma_avail(int fd, unsigned int *avail) { @@ -31,6 +32,7 @@ static inline S390PCIDMACount *s390_pci_start_dma_count(S390pciState *s, } static inline void s390_pci_end_dma_count(S390pciState *s, S390PCIDMACount *cnt) { } +static inline void s390_pci_get_clp_info(S390PCIBusDevice *pbdev) { } #endif #endif From 88eef59796f91271e3d288f64457e975dd7c8ac9 Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Fri, 23 Oct 2020 18:13:42 +0530 Subject: [PATCH 31/32] hw/vfio: Use lock guard macros Use qemu LOCK_GUARD macros in hw/vfio. 
This removes the need for manual unlock calls. Signed-off-by: Amey Narkhede Signed-off-by: Alex Williamson --- hw/vfio/platform.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 869ed2c39d..cc3f66f7e4 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -166,7 +166,7 @@ static void vfio_intp_mmap_enable(void *opaque) VFIOINTp *tmp; VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque; - qemu_mutex_lock(&vdev->intp_mutex); + QEMU_LOCK_GUARD(&vdev->intp_mutex); QLIST_FOREACH(tmp, &vdev->intp_list, next) { if (tmp->state == VFIO_IRQ_ACTIVE) { trace_vfio_platform_intp_mmap_enable(tmp->pin); @@ -174,12 +174,10 @@ static void vfio_intp_mmap_enable(void *opaque) timer_mod(vdev->mmap_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->mmap_timeout); - qemu_mutex_unlock(&vdev->intp_mutex); return; } } vfio_mmap_set_enabled(vdev, true); - qemu_mutex_unlock(&vdev->intp_mutex); } /** @@ -289,7 +287,7 @@ static void vfio_platform_eoi(VFIODevice *vbasedev) VFIOPlatformDevice *vdev = container_of(vbasedev, VFIOPlatformDevice, vbasedev); - qemu_mutex_lock(&vdev->intp_mutex); + QEMU_LOCK_GUARD(&vdev->intp_mutex); QLIST_FOREACH(intp, &vdev->intp_list, next) { if (intp->state == VFIO_IRQ_ACTIVE) { trace_vfio_platform_eoi(intp->pin, @@ -314,7 +312,6 @@ static void vfio_platform_eoi(VFIODevice *vbasedev) vfio_intp_inject_pending_lockheld(intp); QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext); } - qemu_mutex_unlock(&vdev->intp_mutex); } /** From c624b6b312680b76d2a19a4c65cfdb234e875e1b Mon Sep 17 00:00:00 2001 From: Zhengui li Date: Mon, 19 Oct 2020 14:23:46 +0000 Subject: [PATCH 32/32] vfio: fix incorrect print type The size argument is an unsigned int, but the format string prints it with %d, which expects a signed int. Use %u so that the conversion specifier matches the argument type.
Signed-off-by: Zhengui li Signed-off-by: Alex Williamson --- hw/vfio/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 57f55f0447..e18ea2cf91 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -205,7 +205,7 @@ void vfio_region_write(void *opaque, hwaddr addr, buf.qword = cpu_to_le64(data); break; default: - hw_error("vfio: unsupported write size, %d bytes", size); + hw_error("vfio: unsupported write size, %u bytes", size); break; } @@ -262,7 +262,7 @@ uint64_t vfio_region_read(void *opaque, data = le64_to_cpu(buf.qword); break; default: - hw_error("vfio: unsupported read size, %d bytes", size); + hw_error("vfio: unsupported read size, %u bytes", size); break; }