On 4/22/20 10:13 AM, Jon Derrick wrote: > The VMD endpoint provides a real PCIe domain to the guest, including > bridges and endpoints. The IOMMU performs Host Physical Address to Guest > Physical Address translation when assigning downstream endpoint BARs and > when translating MMIO addresses. > > This translation is not desired when assigning bridge windows. When MMIO > goes to an endpoint after being translated to HPA, the bridge will > reject the HPA transaction because the bridge window has been programmed > with translated GPAs. > > VMD device 28C0 natively supports passthrough by providing the Host > Physical Address in shadow registers accessible to the guest for bridge > window assignment. The shadow registers are valid if bit 1 is set in VMD > VMLOCK config register 0x70. > > This quirk emulates the VMLOCK and HPA shadow registers for all VMD > device ids which don't natively offer this feature. The Linux VMD driver > is updated to match the QEMU subsystem id to enable this feature. > > Signed-off-by: Jon Derrick <jonathan.derrick@xxxxxxxxx> > --- > hw/vfio/pci-quirks.c | 119 +++++++++++++++++++++++++++++++++++++++++++ > hw/vfio/pci.c | 7 +++ > hw/vfio/pci.h | 2 + > hw/vfio/trace-events | 4 ++ > 4 files changed, 132 insertions(+) > > diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c > index 2d348f8237..2fd27cc8f6 100644 > --- a/hw/vfio/pci-quirks.c > +++ b/hw/vfio/pci-quirks.c > @@ -1709,3 +1709,122 @@ free_exit: > > return ret; > } > + > +/* > + * The VMD endpoint provides a real PCIe domain to the guest. The IOMMU > + * performs Host Physical Address to Guest Physical Address translation when > + * assigning downstream endpoint BARs and when translating MMIO addresses. > + * However this translation is not desired when assigning bridge windows. When > + * MMIO goes to an endpoint after being translated to HPA, the bridge rejects > + * the transaction because the window has been programmed with translated GPAs. > + * > + * VMD uses the Host Physical Address in order to correctly program the bridge > + * windows in its PCIe domain. VMD device 28C0 has HPA shadow registers located > + * at offset 0x2000 in MEMBAR2 (BAR 4). The shadow registers are valid if bit 1 > + * is set in the VMD VMLOCK config register 0x70. > + * > + * This quirk emulates the VMLOCK and HPA shadow registers for all VMD device > + * ids which don't natively offer this feature. The subsystem vendor/device > + * id is set to the QEMU subsystem vendor/device id, where the driver matches > + * the id to enable this feature. > + */ > +typedef struct VFIOVMDQuirk { > + VFIOPCIDevice *vdev; > + uint64_t membar_phys[2]; > +} VFIOVMDQuirk; > + > +static uint64_t vfio_vmd_quirk_read(void *opaque, hwaddr addr, unsigned size) > +{ > + VFIOVMDQuirk *data = opaque; > + uint64_t val = 0; > + > + memcpy(&val, (void *)data->membar_phys + addr, size); > + return val; > +} > + > +static const MemoryRegionOps vfio_vmd_quirk = { > + .read = vfio_vmd_quirk_read, > + .endianness = DEVICE_LITTLE_ENDIAN, > +}; > + > +#define VMD_VMLOCK 0x70 > +#define VMD_SHADOW 0x2000 > +#define VMD_MEMBAR2 4 > + > +static int vfio_vmd_emulate_shadow_registers(VFIOPCIDevice *vdev) > +{ > + VFIOQuirk *quirk; > + VFIOVMDQuirk *data; > + PCIDevice *pdev = &vdev->pdev; > + int ret; > + > + data = g_malloc0(sizeof(*data)); > + ret = pread(vdev->vbasedev.fd, data->membar_phys, 16, > + vdev->config_offset + PCI_BASE_ADDRESS_2); > + if (ret != 16) { > + error_report("VMD %s cannot read MEMBARs (%d)", > + vdev->vbasedev.name, ret); > + g_free(data); > + return -EFAULT; > + } > + > + quirk = vfio_quirk_alloc(1); > + quirk->data = data; > + data->vdev = vdev; > + > + /* Emulate Shadow Registers */ > + memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_vmd_quirk, data, > + "vfio-vmd-quirk", sizeof(data->membar_phys)); > + memory_region_add_subregion_overlap(vdev->bars[VMD_MEMBAR2].region.mem, > + VMD_SHADOW, quirk->mem, 1); > + memory_region_set_readonly(quirk->mem, true); > + memory_region_set_enabled(quirk->mem, true); > + > + QLIST_INSERT_HEAD(&vdev->bars[VMD_MEMBAR2].quirks, quirk, next); > + > + trace_vfio_pci_vmd_quirk_shadow_regs(vdev->vbasedev.name, > + data->membar_phys[0], > + data->membar_phys[1]); > + > + /* Advertise Shadow Register support */ > + pci_byte_test_and_set_mask(pdev->config + VMD_VMLOCK, 0x2); > + pci_set_byte(pdev->wmask + VMD_VMLOCK, 0); > + pci_set_byte(vdev->emulated_config_bits + VMD_VMLOCK, 0x2); > + > + trace_vfio_pci_vmd_quirk_vmlock(vdev->vbasedev.name, > + pci_get_byte(pdev->config + VMD_VMLOCK)); > + > + /* Drivers can match the subsystem vendor/device id */ > + pci_set_word(pdev->config + PCI_SUBSYSTEM_VENDOR_ID, > + PCI_SUBVENDOR_ID_REDHAT_QUMRANET); > + pci_set_word(vdev->emulated_config_bits + PCI_SUBSYSTEM_VENDOR_ID, ~0); > + > + pci_set_word(pdev->config + PCI_SUBSYSTEM_ID, PCI_SUBDEVICE_ID_QEMU); > + pci_set_word(vdev->emulated_config_bits + PCI_SUBSYSTEM_ID, ~0); > + > + trace_vfio_pci_vmd_quirk_subsystem(vdev->vbasedev.name, > + vdev->sub_vendor_id, vdev->sub_device_id, > + pci_get_word(pdev->config + PCI_SUBSYSTEM_VENDOR_ID), > + pci_get_word(pdev->config + PCI_SUBSYSTEM_ID)); > + > + return 0; > +} > + > +int vfio_pci_vmd_init(VFIOPCIDevice *vdev) > +{ > + int ret = 0; > + > + switch (vdev->device_id) { > + case 0x28C0: /* Native passthrough support */ > + break; > + /* Emulates Native passthrough support */ > + case 0x201D: > + case 0x467F: > + case 0x4C3D: > + case 0x9A0B: > + ret = vfio_vmd_emulate_shadow_registers(vdev); > + break; > + } > + > + return ret; > +} > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 5e75a95129..85425a1a6f 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -3024,6 +3024,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > } > } > > + if (vdev->vendor_id == PCI_VENDOR_ID_INTEL) { > + ret = vfio_pci_vmd_init(vdev); > + if (ret) { > + error_report("Failed to setup VMD"); > + } > + } > + > vfio_register_err_notifier(vdev); > vfio_register_req_notifier(vdev); > vfio_setup_resetfn_quirk(vdev); > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index 0da7a20a7e..e8632d806b 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -217,6 +217,8 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, > int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp); > int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp); > > +int vfio_pci_vmd_init(VFIOPCIDevice *vdev); > + > void vfio_display_reset(VFIOPCIDevice *vdev); > int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp); > void vfio_display_finalize(VFIOPCIDevice *vdev); > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events > index b1ef55a33f..aabbd2693a 100644 > --- a/hw/vfio/trace-events > +++ b/hw/vfio/trace-events > @@ -90,6 +90,10 @@ vfio_pci_nvidia_gpu_setup_quirk(const char *name, uint64_t tgt, uint64_t size) " > vfio_pci_nvlink2_setup_quirk_ssatgt(const char *name, uint64_t tgt, uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64 > vfio_pci_nvlink2_setup_quirk_lnkspd(const char *name, uint32_t link_speed) "%s link_speed=0x%x" > > +vfio_pci_vmd_quirk_shadow_regs(const char *name, uint64_t mb1, uint64_t mb2) "%s membar1_phys=0x%"PRIx64" membar2_phys=0x%"PRIx64" > +vfio_pci_vmd_quirk_vmlock(const char *name, uint8_t vmlock) "%s vmlock=0x%x" > +vfio_pci_vmd_quirk_subsystem(const char *name, uint16_t old_svid, uint16_t old_sdid, uint16_t new_svid, uint16_t new_sdid) "%s subsystem id 0x%04x:0x%04x -> 0x%04x:0x%04x" > + > # common.c > vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" > vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64 Reviewed-by: Andrzej Jakowski <andrzej.jakowski@xxxxxxxxxxxxxxx>