Thanks Alex, comments inline.

> -----Original Message-----
> From: Alex Williamson <alex.williamson@xxxxxxxxxx>
> Sent: Thursday, April 20, 2023 4:17 PM
> To: Ankit Agrawal <ankita@xxxxxxxxxx>
> Cc: Jason Gunthorpe <jgg@xxxxxxxxxx>; Aniket Agashe <aniketa@xxxxxxxxxx>; Neo Jia <cjia@xxxxxxxxxx>; Kirti Wankhede <kwankhede@xxxxxxxxxx>; Tarun Gupta (SW-GPU) <targupta@xxxxxxxxxx>; Vikram Sethi <vsethi@xxxxxxxxxx>; Andy Currid <acurrid@xxxxxxxxxx>; kvm@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx
> Subject: Re: [PATCH v1 1/1] vfio/nvgpu: Add vfio pci variant module for grace hopper device
>
> External email: Use caution opening links or attachments
>
> On Wed, 19 Apr 2023 15:28:36 -0700 <ankita@xxxxxxxxxx> wrote:
>
> > From: Ankit Agrawal <ankita@xxxxxxxxxx>
> >
> > NVIDIA's upcoming Grace Hopper Superchip provides a PCI-like device
> > for the on-chip GPU that is the logical OS representation of the
> > internal proprietary cache-coherent interconnect.
> >
> > This representation has a number of limitations compared to a real PCI
> > device; in particular, it does not model the coherent GPU memory
> > aperture as a PCI config space BAR, and PCI doesn't know anything
> > about cacheable memory types.
> >
> > Provide a VFIO PCI variant driver that adapts this unique PCI
> > representation into a more standard PCI representation facing
> > userspace. The GPU memory aperture is obtained from ACPI using
> > device_property_read_u64(), according to the FW specification, and
> > exported to userspace as the VFIO_REGION that covers the first PCI
> > BAR. qemu will naturally generate a PCI device in the VM where the
> > cacheable aperture is reported in BAR1.
> >
> > Since this memory region is actually cache coherent with the CPU, the
> > VFIO variant driver will mmap it into the VMA using a cacheable mapping.
> > The mapping is done using remap_pfn_range().
> >
> > This goes along with a qemu series that provides the necessary
> > implementation of the Grace Hopper Superchip firmware specification, so
> > that the guest operating system can see the correct ACPI modeling for
> > the coherent GPU device.
> > https://github.com/qemu/qemu/compare/master...ankita-nv:qemu:dev-ankit/cohmem-0330
> >
> > This patch is split from a patch series being pursued separately:
> > https://lore.kernel.org/lkml/20230405180134.16932-2-ankita@xxxxxxxxxx/
> >
> > Applied and tested over v6.3-rc4.
> >
> > Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
> > ---
> >  MAINTAINERS                     |   6 +
> >  drivers/vfio/pci/Kconfig        |   2 +
> >  drivers/vfio/pci/Makefile       |   2 +
> >  drivers/vfio/pci/nvgpu/Kconfig  |  10 ++
> >  drivers/vfio/pci/nvgpu/Makefile |   3 +
> >  drivers/vfio/pci/nvgpu/main.c   | 255 ++++++++++++++++++++++++++++++++
> >  6 files changed, 278 insertions(+)
> >  create mode 100644 drivers/vfio/pci/nvgpu/Kconfig
> >  create mode 100644 drivers/vfio/pci/nvgpu/Makefile
> >  create mode 100644 drivers/vfio/pci/nvgpu/main.c
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index 1dc8bd26b6cf..6b48756c30d3 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -21954,6 +21954,12 @@ L: kvm@xxxxxxxxxxxxxxx
> >  S: Maintained
> >  F: drivers/vfio/pci/mlx5/
> >
> > +VFIO NVIDIA PCI DRIVER
> > +M: Ankit Agrawal <ankita@xxxxxxxxxx>
> > +L: kvm@xxxxxxxxxxxxxxx
> > +S: Maintained
> > +F: drivers/vfio/pci/nvgpu/
> > +
> >  VGA_SWITCHEROO
> >  R: Lukas Wunner <lukas@xxxxxxxxx>
> >  S: Maintained
> > diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> > index f9d0c908e738..ade18b0ffb7b 100644
> > --- a/drivers/vfio/pci/Kconfig
> > +++ b/drivers/vfio/pci/Kconfig
> > @@ -59,4 +59,6 @@ source "drivers/vfio/pci/mlx5/Kconfig"
> >
> >  source "drivers/vfio/pci/hisilicon/Kconfig"
> >
> > +source "drivers/vfio/pci/nvgpu/Kconfig"
> > +
> >  endif
> > diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> > index 24c524224da5..0c93d452d0da 100644
> > --- a/drivers/vfio/pci/Makefile
> > +++ b/drivers/vfio/pci/Makefile
> > @@ -11,3 +11,5 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
> >  obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/
> >
> >  obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
> > +
> > +obj-$(CONFIG_NVGPU_VFIO_PCI) += nvgpu/
> > diff --git a/drivers/vfio/pci/nvgpu/Kconfig b/drivers/vfio/pci/nvgpu/Kconfig
> > new file mode 100644
> > index 000000000000..066f764f7c5f
> > --- /dev/null
> > +++ b/drivers/vfio/pci/nvgpu/Kconfig
> > @@ -0,0 +1,10 @@
> > +# SPDX-License-Identifier: GPL-2.0-only
> > +config NVGPU_VFIO_PCI
> > +	tristate "VFIO support for the GPU in the NVIDIA Grace Hopper Superchip"
> > +	depends on ARM64 || (COMPILE_TEST && 64BIT)
> > +	select VFIO_PCI_CORE
> > +	help
> > +	  VFIO support for the GPU in the NVIDIA Grace Hopper Superchip is
> > +	  required to assign the GPU device to a VM using KVM/qemu/etc.
> > +
> > +	  If you don't know what to do here, say N.
> > diff --git a/drivers/vfio/pci/nvgpu/Makefile b/drivers/vfio/pci/nvgpu/Makefile
> > new file mode 100644
> > index 000000000000..00fd3a078218
> > --- /dev/null
> > +++ b/drivers/vfio/pci/nvgpu/Makefile
> > @@ -0,0 +1,3 @@
> > +# SPDX-License-Identifier: GPL-2.0-only
> > +obj-$(CONFIG_NVGPU_VFIO_PCI) += nvgpu-vfio-pci.o
> > +nvgpu-vfio-pci-y := main.o
> > diff --git a/drivers/vfio/pci/nvgpu/main.c b/drivers/vfio/pci/nvgpu/main.c
> > new file mode 100644
> > index 000000000000..9e08e8cf4606
> > --- /dev/null
> > +++ b/drivers/vfio/pci/nvgpu/main.c
> > @@ -0,0 +1,255 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
> > + */
> > +
> > +#include <linux/pci.h>
> > +#include <linux/vfio_pci_core.h>
> > +
> > +#define DUMMY_PFN \
> > +	(((nvdev->mem_prop.hpa + nvdev->mem_prop.mem_length) >> PAGE_SHIFT) - 1)
> > +
> > +struct dev_mem_properties {
> > +	uint64_t hpa;
> > +	uint64_t mem_length;
> > +	int bar1_start_offset;
> > +};
> > +
> > +struct nvgpu_vfio_pci_core_device {
> > +	struct vfio_pci_core_device core_device;
> > +	struct dev_mem_properties mem_prop;
> > +};
> > +
> > +static int vfio_get_bar1_start_offset(struct vfio_pci_core_device *vdev)
> > +{
> > +	u8 val = 0;
> > +
> > +	pci_read_config_byte(vdev->pdev, 0x10, &val);
> > +	/*
> > +	 * The BAR1 start offset in the PCI config space depends on the BAR0 size.
> > +	 * Check if the BAR0 is 64b and return the appropriate BAR1 offset.
> > +	 */
> > +	if (val & PCI_BASE_ADDRESS_MEM_TYPE_64)
> > +		return VFIO_PCI_BAR2_REGION_INDEX;
> > +
> > +	return VFIO_PCI_BAR1_REGION_INDEX;
> > +}
>
> This is really confusing offsets vs indexes, it's clearly returning a region
> index, not offset.  Also we already have resources setup for BAR0, so rather
> than working on the raw BAR value, how about:
>
>	return pci_resource_flags(vdev->pdev, 0) & IORESOURCE_MEM_64 ?
>	       VFIO_PCI_BAR2_REGION_INDEX : VFIO_PCI_BAR1_REGION_INDEX;
>
> OTOH, why are we trying to pack the BARs, couldn't we always put it at BAR2?
>
Right, it should have said region index. And yes, I think we can put it at index 2; a rough sketch of what that does to the code is at the end of this mail.

> > +
> > +static int nvgpu_vfio_pci_open_device(struct vfio_device *core_vdev)
> > +{
> > +	struct nvgpu_vfio_pci_core_device *nvdev = container_of(
> > +		core_vdev, struct nvgpu_vfio_pci_core_device, core_device.vdev);
> > +	struct vfio_pci_core_device *vdev =
> > +		container_of(core_vdev, struct vfio_pci_core_device, vdev);
> > +	int ret;
> > +
> > +	ret = vfio_pci_core_enable(vdev);
> > +	if (ret)
> > +		return ret;
> > +
> > +	vfio_pci_core_finish_enable(vdev);
> > +
> > +	nvdev->mem_prop.bar1_start_offset =
> > +		vfio_get_bar1_start_offset(vdev);
> > +
> > +	return ret;
> > +}
> > +
> > +static int nvgpu_vfio_pci_mmap(struct vfio_device *core_vdev,
> > +			       struct vm_area_struct *vma)
> > +{
> > +	struct nvgpu_vfio_pci_core_device *nvdev = container_of(
> > +		core_vdev, struct nvgpu_vfio_pci_core_device, core_device.vdev);
> > +
> > +	unsigned long start_pfn;
> > +	unsigned int index;
> > +	u64 req_len, pgoff;
> > +	int ret = 0;
> > +
> > +	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
> > +	if (index != nvdev->mem_prop.bar1_start_offset)
>
> offset vs index...

Ack, will name the variable appropriately at this and other places.

> > +		return vfio_pci_core_mmap(core_vdev, vma);
> > +
> > +	/*
> > +	 * Request to mmap the BAR1. Map to the CPU accessible memory on the
>
> But it might be BAR2...
>
> > +	 * GPU using the memory information gathered from the system ACPI
> > +	 * tables.
> > +	 */
> > +	start_pfn = nvdev->mem_prop.hpa >> PAGE_SHIFT;
> > +	req_len = vma->vm_end - vma->vm_start;
> > +	pgoff = vma->vm_pgoff &
> > +		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> > +	if (pgoff >= (nvdev->mem_prop.mem_length >> PAGE_SHIFT))
> > +		return -EINVAL;
> > +
> > +	/*
> > +	 * Perform a PFN map to the memory. The device BAR1 is backed by the
> > +	 * GPU memory now. Check that the mapping does not overflow out of
> > +	 * the GPU memory size.
> > +	 */
> > +	ret = remap_pfn_range(vma, vma->vm_start, start_pfn + pgoff,
> > +			      min(req_len, nvdev->mem_prop.mem_length - pgoff),
> > +			      vma->vm_page_prot);
>
> What's the behavior of this "BAR" relative to things like
> PCI_COMMAND_MEMORY or reset?  If the user generates a secondary bus
> reset on the parent bridge in one thread, while accessing the device in
> another thread, isn't that susceptible to platform error handling just like
> any other PCI device?  This is why vfio-pci-core has all the zapping and
> faulting of mmaps to real BARs.
>
> > +	if (ret)
> > +		return ret;
> > +
> > +	vma->vm_pgoff = start_pfn + pgoff;
> > +
> > +	return 0;
> > +}
> > +
> > +static long nvgpu_vfio_pci_ioctl(struct vfio_device *core_vdev,
> > +				 unsigned int cmd, unsigned long arg)
> > +{
> > +	struct nvgpu_vfio_pci_core_device *nvdev = container_of(
> > +		core_vdev, struct nvgpu_vfio_pci_core_device, core_device.vdev);
> > +
> > +	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
> > +	struct vfio_region_info info;
> > +
> > +	switch (cmd) {
> > +	case VFIO_DEVICE_GET_REGION_INFO:
> > +		if (copy_from_user(&info, (void __user *)arg, minsz))
> > +			return -EFAULT;
> > +
> > +		if (info.argsz < minsz)
> > +			return -EINVAL;
> > +
> > +		if (info.index == nvdev->mem_prop.bar1_start_offset) {
>
> index vs offset...
>
> > +			/*
> > +			 * Request to determine the BAR1 region information. Send the
> > +			 * GPU memory information.
> > +			 */
> > +			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
> > +			info.size = nvdev->mem_prop.mem_length;
> > +			info.flags = VFIO_REGION_INFO_FLAG_READ |
> > +				     VFIO_REGION_INFO_FLAG_WRITE |
> > +				     VFIO_REGION_INFO_FLAG_MMAP;
> > +			return copy_to_user((void __user *)arg, &info, minsz) ?
> > +				       -EFAULT : 0;
> > +		}
> > +
> > +		if (info.index == nvdev->mem_prop.bar1_start_offset + 1) {
> > +			/*
> > +			 * The BAR1 region is 64b. Ignore this access.
> > +			 */
> > +			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
> > +			info.size = 0;
> > +			info.flags = 0;
> > +			return copy_to_user((void __user *)arg, &info, minsz) ?
> > +				       -EFAULT : 0;
> > +		}
>
> Not sure why the core code doesn't handle BAR+1
>
> > +
> > +		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
> > +
> > +	default:
> > +		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
> > +	}
>
> This might work better as simply:
>
>	if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
>		/* virtual BAR returns... */
>	}
>
>	return vfio_pci_core_ioctl(...);
>
> It at least avoids the duplication.

Right, that looks better.
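Roughly what I have in mind for the next version, folding in the fixed BAR2 region index discussed above (untested sketch, and leaving the BAR+1 reporting out of it pending your question about the core code):

static long nvgpu_vfio_pci_ioctl(struct vfio_device *core_vdev,
				 unsigned int cmd, unsigned long arg)
{
	struct nvgpu_vfio_pci_core_device *nvdev = container_of(
		core_vdev, struct nvgpu_vfio_pci_core_device, core_device.vdev);
	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
	struct vfio_region_info info;

	if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		if (info.index == VFIO_PCI_BAR2_REGION_INDEX) {
			/*
			 * The cacheable device memory is always exposed at
			 * the fixed BAR2 region index; report its size and
			 * access flags from the ACPI-provided properties.
			 */
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = nvdev->mem_prop.mem_length;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE |
				     VFIO_REGION_INFO_FLAG_MMAP;
			return copy_to_user((void __user *)arg, &info, minsz) ?
				       -EFAULT : 0;
		}
	}

	/* Everything else falls through to the core ioctl handler. */
	return vfio_pci_core_ioctl(core_vdev, cmd, arg);
}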
> > +}
> > +
> > +static const struct vfio_device_ops nvgpu_vfio_pci_ops = {
> > +	.name = "nvgpu-vfio-pci",
> > +	.init = vfio_pci_core_init_dev,
> > +	.release = vfio_pci_core_release_dev,
> > +	.open_device = nvgpu_vfio_pci_open_device,
> > +	.close_device = vfio_pci_core_close_device,
> > +	.ioctl = nvgpu_vfio_pci_ioctl,
> > +	.read = vfio_pci_core_read,
> > +	.write = vfio_pci_core_write,
> > +	.mmap = nvgpu_vfio_pci_mmap,
> > +	.request = vfio_pci_core_request,
> > +	.match = vfio_pci_core_match,
> > +	.bind_iommufd = vfio_iommufd_physical_bind,
> > +	.unbind_iommufd = vfio_iommufd_physical_unbind,
> > +	.attach_ioas = vfio_iommufd_physical_attach_ioas,
> > +};
> > +
> > +static struct nvgpu_vfio_pci_core_device *nvgpu_drvdata(struct pci_dev *pdev)
> > +{
> > +	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
> > +
> > +	return container_of(core_device, struct nvgpu_vfio_pci_core_device,
> > +			    core_device);
> > +}
> > +
> > +static int
> > +nvgpu_vfio_pci_fetch_memory_property(struct pci_dev *pdev,
> > +				     struct nvgpu_vfio_pci_core_device *nvdev)
> > +{
> > +	int ret = 0;
>
> Unnecessary initialization.  Thanks,

Ack, will fix in the next version.

> Alex
>
> > +
> > +	/*
> > +	 * The memory information is present in the system ACPI tables as DSD
> > +	 * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size.
> > +	 */
> > +	ret = device_property_read_u64(&(pdev->dev), "nvidia,gpu-mem-base-pa",
> > +				       &(nvdev->mem_prop.hpa));
> > +	if (ret)
> > +		return ret;
> > +
> > +	ret = device_property_read_u64(&(pdev->dev), "nvidia,gpu-mem-size",
> > +				       &(nvdev->mem_prop.mem_length));
> > +	return ret;
> > +}
> > +
> > +static int nvgpu_vfio_pci_probe(struct pci_dev *pdev,
> > +				const struct pci_device_id *id)
> > +{
> > +	struct nvgpu_vfio_pci_core_device *nvdev;
> > +	int ret;
> > +
> > +	nvdev = vfio_alloc_device(nvgpu_vfio_pci_core_device, core_device.vdev,
> > +				  &pdev->dev, &nvgpu_vfio_pci_ops);
> > +	if (IS_ERR(nvdev))
> > +		return PTR_ERR(nvdev);
> > +
> > +	dev_set_drvdata(&pdev->dev, nvdev);
> > +
> > +	ret = nvgpu_vfio_pci_fetch_memory_property(pdev, nvdev);
> > +	if (ret)
> > +		goto out_put_vdev;
> > +
> > +	ret = vfio_pci_core_register_device(&nvdev->core_device);
> > +	if (ret)
> > +		goto out_put_vdev;
> > +
> > +	return ret;
> > +
> > +out_put_vdev:
> > +	vfio_put_device(&nvdev->core_device.vdev);
> > +	return ret;
> > +}
> > +
> > +static void nvgpu_vfio_pci_remove(struct pci_dev *pdev)
> > +{
> > +	struct nvgpu_vfio_pci_core_device *nvdev = nvgpu_drvdata(pdev);
> > +	struct vfio_pci_core_device *vdev = &nvdev->core_device;
> > +
> > +	vfio_pci_core_unregister_device(vdev);
> > +	vfio_put_device(&vdev->vdev);
> > +}
> > +
> > +static const struct pci_device_id nvgpu_vfio_pci_table[] = {
> > +	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
> > +	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2343) },
> > +	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
> > +	{}
> > +};
> > +
> > +MODULE_DEVICE_TABLE(pci, nvgpu_vfio_pci_table);
> > +
> > +static struct pci_driver nvgpu_vfio_pci_driver = {
> > +	.name = KBUILD_MODNAME,
> > +	.id_table = nvgpu_vfio_pci_table,
> > +	.probe = nvgpu_vfio_pci_probe,
> > +	.remove = nvgpu_vfio_pci_remove,
> > +	.err_handler = &vfio_pci_core_err_handlers,
> > +	.driver_managed_dma = true,
> > +};
> > +
> > +module_pci_driver(nvgpu_vfio_pci_driver);
> > +
> > +MODULE_LICENSE("GPL v2");
> > +MODULE_AUTHOR("Ankit Agrawal <ankita@xxxxxxxxxx>");
> > +MODULE_AUTHOR("Aniket Agashe <aniketa@xxxxxxxxxx>");
> > +MODULE_DESCRIPTION(
> > +	"VFIO NVGPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory");
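
As mentioned above, since the coherent memory can sit at a fixed region index, I'm planning to drop vfio_get_bar1_start_offset() and the bar1_start_offset field entirely in the next version, with the mmap and GET_REGION_INFO paths comparing the region index directly against VFIO_PCI_BAR2_REGION_INDEX. open_device() would then reduce to the plain core enable sequence, roughly (untested sketch):

static int nvgpu_vfio_pci_open_device(struct vfio_device *core_vdev)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	/* No per-device region index left to compute here. */
	vfio_pci_core_finish_enable(vdev);

	return 0;
}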