From 2dca1b37a7605abb135559ef1b0d63929e7ae60d Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Mon, 20 Mar 2023 16:35:22 +0900 Subject: [PATCH 1/3] vfio/pci: add support for VF token VF token was introduced [1] to kernel vfio-pci along with SR-IOV support [2]. This patch adds support VF token among PF and VF(s). To passthu PCIe VF to a VM, kernel >= v5.7 needs this. It can be configured with UUID like: -device vfio-pci,host=DDDD:BB:DD:F,vf-token=,... [1] https://lore.kernel.org/linux-pci/158396393244.5601.10297430724964025753.stgit@gimli.home/ [2] https://lore.kernel.org/linux-pci/158396044753.5601.14804870681174789709.stgit@gimli.home/ Cc: Alex Williamson Signed-off-by: Minwoo Im Reviewed-by: Klaus Jensen Link: https://lore.kernel.org/r/20230320073522epcms2p48f682ecdb73e0ae1a4850ad0712fd780@epcms2p4 Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 13 ++++++++++++- hw/vfio/pci.h | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index ec9a854361..cf27f28936 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2856,6 +2856,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) int groupid; int i, ret; bool is_mdev; + char uuid[UUID_FMT_LEN]; + char *name; if (!vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || @@ -2936,7 +2938,15 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto error; } - ret = vfio_get_device(group, vbasedev->name, vbasedev, errp); + if (!qemu_uuid_is_null(&vdev->vf_token)) { + qemu_uuid_unparse(&vdev->vf_token, uuid); + name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid); + } else { + name = vbasedev->name; + } + + ret = vfio_get_device(group, name, vbasedev, errp); + g_free(name); if (ret) { vfio_put_group(group); goto error; @@ -3268,6 +3278,7 @@ static void vfio_instance_init(Object *obj) static Property vfio_pci_dev_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), + DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token), DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, vbasedev.pre_copy_dirty_page_tracking, diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 177abcc8fb..2674476d6c 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -137,6 +137,7 @@ struct VFIOPCIDevice { VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ void *igd_opregion; PCIHostDeviceAddress host; + QemuUUID vf_token; EventNotifier err_notifier; EventNotifier req_notifier; int (*resetfn)(struct VFIOPCIDevice *); From ff180c6bd7a8fad48099b767ad5e8340cec4fb87 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Mon, 3 Apr 2023 16:00:00 +0300 Subject: [PATCH 2/3] vfio/migration: Skip log_sync during migration SETUP state Currently, VFIO log_sync can be issued while migration is in SETUP state. However, doing this log_sync is at best redundant and at worst can fail. Redundant -- all RAM is marked dirty in migration SETUP state and is transferred only after migration is set to ACTIVE state, so doing log_sync during migration SETUP is pointless. Can fail -- there is a time window, between setting migration state to SETUP and starting dirty tracking by RAM save_live_setup handler, during which dirty tracking is still not started. Any VFIO log_sync call that is issued during this time window will fail. For example, this error can be triggered by migrating a VM when a GUI is active, which constantly calls log_sync. Fix it by skipping VFIO log_sync while migration is in SETUP state. Fixes: 758b96b61d5c ("vfio/migrate: Move switch of dirty tracking into vfio_memory_listener") Signed-off-by: Avihai Horon Link: https://lore.kernel.org/r/20230403130000.6422-1-avihaih@nvidia.com Signed-off-by: Alex Williamson --- hw/vfio/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 4d01ea3515..78358ede27 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -478,7 +478,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container) VFIODevice *vbasedev; MigrationState *ms = migrate_get_current(); - if (!migration_is_setup_or_active(ms->state)) { + if (ms->state != MIGRATION_STATUS_ACTIVE && + ms->state != MIGRATION_STATUS_DEVICE) { return false; } From b5048a4cbfa0362abc720b5198fe9a35441bf5fe Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 4 May 2023 14:42:48 -0600 Subject: [PATCH 3/3] vfio/pci: Static Resizable BAR capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PCI Resizable BAR (ReBAR) capability is currently hidden from the VM because the protocol for interacting with the capability does not support a mechanism for the device to reject an advertised supported BAR size. However, when assigned to a VM, the act of resizing the BAR requires adjustment of host resources for the device, which absolutely can fail. Linux does not currently allow us to reserve resources for the device independent of the current usage. The only writable field within the ReBAR capability is the BAR Size register. The PCIe spec indicates that when written, the device should immediately begin to operate with the provided BAR size. The spec however also notes that software must only write values corresponding to supported sizes as indicated in the capability and control registers. Writing unsupported sizes produces undefined results. Therefore, if the hypervisor were to virtualize the capability and control registers such that the current size is the only indicated available size, then a write of anything other than the current size falls into the category of undefined behavior, where we can essentially expose the modified ReBAR capability as read-only. This may seem pointless, but users have reported that virtualizing the capability in this way not only allows guest software to expose related features as available (even if only cosmetic), but in some scenarios can resolve guest driver issues. Additionally, no regressions in behavior have been reported for this change. A caveat here is that the PCIe spec requires for compatibility that devices report support for a size in the range of 1MB to 512GB, therefore if the current BAR size falls outside that range we revert to hiding the capability. Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20230505232308.2869912-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index cf27f28936..bf27a39905 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2066,6 +2066,54 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) return 0; } +static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos) +{ + uint32_t ctrl; + int i, nbar; + + ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL); + nbar = (ctrl & PCI_REBAR_CTRL_NBAR_MASK) >> PCI_REBAR_CTRL_NBAR_SHIFT; + + for (i = 0; i < nbar; i++) { + uint32_t cap; + int size; + + ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL + (i * 8)); + size = (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> PCI_REBAR_CTRL_BAR_SHIFT; + + /* The cap register reports sizes 1MB to 128TB, with 4 reserved bits */ + cap = size <= 27 ? 1U << (size + 4) : 0; + + /* + * The PCIe spec (v6.0.1, 7.8.6) requires HW to support at least one + * size in the range 1MB to 512GB. We intend to mask all sizes except + * the one currently enabled in the size field, therefore if it's + * outside the range, hide the whole capability as this virtualization + * trick won't work. If >512GB resizable BARs start to appear, we + * might need an opt-in or reservation scheme in the kernel. + */ + if (!(cap & PCI_REBAR_CAP_SIZES)) { + return -EINVAL; + } + + /* Hide all sizes reported in the ctrl reg per above requirement. */ + ctrl &= (PCI_REBAR_CTRL_BAR_SIZE | + PCI_REBAR_CTRL_NBAR_MASK | + PCI_REBAR_CTRL_BAR_IDX); + + /* + * The BAR size field is RW, however we've mangled the capability + * register such that we only report a single size, ie. the current + * BAR size. A write of an unsupported value is undefined, therefore + * the register field is essentially RO. + */ + vfio_add_emulated_long(vdev, pos + PCI_REBAR_CAP + (i * 8), cap, ~0); + vfio_add_emulated_long(vdev, pos + PCI_REBAR_CTRL + (i * 8), ctrl, ~0); + } + + return 0; +} + static void vfio_add_ext_cap(VFIOPCIDevice *vdev) { PCIDevice *pdev = &vdev->pdev; @@ -2139,9 +2187,13 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) case 0: /* kernel masked capability */ case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */ case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */ - case PCI_EXT_CAP_ID_REBAR: /* Can't expose read-only */ trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next); break; + case PCI_EXT_CAP_ID_REBAR: + if (!vfio_setup_rebar_ecap(vdev, next)) { + pcie_add_capability(pdev, cap_id, cap_ver, next, size); + } + break; default: pcie_add_capability(pdev, cap_id, cap_ver, next, size); }