On Sun, 5 Jan 2025 17:36:14 +0000
<ankita@xxxxxxxxxx> wrote:

> From: Ankit Agrawal <ankita@xxxxxxxxxx>
>
> There is a HW defect on Grace Hopper (GH) related to supporting the
> Multi-Instance GPU (MIG) feature [1] that necessitated the presence
> of a 1G region carved out from the device memory and mapped as
> uncached. The 1G region is shown as a fake BAR (comprising region 2 and 3)
> to work around the issue.
>
> The Grace Blackwell systems (GB) differ from GH systems in the following
> aspects:
> 1. The aforementioned HW defect is fixed on GB systems.
> 2. There is a usable BAR1 (region 2 and 3) on GB systems for the
>    GPUdirect RDMA feature [2].
>
> This patch accommodates those GB changes by showing the 64b physical
> device BAR1 (region 2 and 3) to the VM instead of the fake one. This
> takes care of both the differences.
>
> Moreover, the entire device memory is exposed on GB as cacheable to
> the VM as there is no carveout required.
>
> Link: https://www.nvidia.com/en-in/technologies/multi-instance-gpu/ [1]
> Link: https://docs.nvidia.com/cuda/gpudirect-rdma/ [2]
>
> Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
> ---
>  drivers/vfio/pci/nvgrace-gpu/main.c | 32 +++++++++++++++++++++--------
>  1 file changed, 24 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index 85eacafaffdf..44a276c886e1 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -72,7 +72,7 @@ nvgrace_gpu_memregion(int index,
>  	if (index == USEMEM_REGION_INDEX)
>  		return &nvdev->usemem;
>
> -	if (index == RESMEM_REGION_INDEX)
> +	if (!nvdev->has_mig_hw_bug_fix && index == RESMEM_REGION_INDEX)
>  		return &nvdev->resmem;
>
>  	return NULL;
> @@ -715,6 +715,16 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
>  	.detach_ioas = vfio_iommufd_physical_detach_ioas,
>  };
>
> +static void
> +nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
> +			      struct nvgrace_gpu_pci_core_device *nvdev,
> +			      u64 memphys, u64 memlength)
> +{
> +	nvdev->usemem.memphys = memphys;
> +	nvdev->usemem.memlength = memlength;
> +	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
> +}
> +
>  static int
>  nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
>  				  u64 *pmemphys, u64 *pmemlength)
> @@ -752,9 +762,9 @@ nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
>  }
>
>  static int
> -nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
> -			      struct nvgrace_gpu_pci_core_device *nvdev,
> -			      u64 memphys, u64 memlength)
> +nvgrace_gpu_nvdev_struct_workaround(struct pci_dev *pdev,
> +				    struct nvgrace_gpu_pci_core_device *nvdev,
> +				    u64 memphys, u64 memlength)
>  {
>  	int ret = 0;
>
> @@ -864,10 +874,16 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
>  	 * Device memory properties are identified in the host ACPI
>  	 * table. Set the nvgrace_gpu_pci_core_device structure.
>  	 */
> -	ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
> -					    memphys, memlength);
> -	if (ret)
> -		goto out_put_vdev;
> +	if (nvdev->has_mig_hw_bug_fix) {
> +		nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
> +					      memphys, memlength);
> +	} else {
> +		ret = nvgrace_gpu_nvdev_struct_workaround(pdev, nvdev,
> +							  memphys,
> +							  memlength);
> +		if (ret)
> +			goto out_put_vdev;
> +	}

Doesn't this work out much more naturally if we just do something like:

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 85eacafaffdf..43a9457442ff 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -17,9 +17,6 @@
 #define RESMEM_REGION_INDEX	VFIO_PCI_BAR2_REGION_INDEX
 #define USEMEM_REGION_INDEX	VFIO_PCI_BAR4_REGION_INDEX
 
-/* Memory size expected as non cached and reserved by the VM driver */
-#define RESMEM_SIZE SZ_1G
-
 /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
 #define MEMBLK_SIZE SZ_512M
 
@@ -72,7 +69,7 @@ nvgrace_gpu_memregion(int index,
 	if (index == USEMEM_REGION_INDEX)
 		return &nvdev->usemem;
 
-	if (index == RESMEM_REGION_INDEX)
+	if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
 		return &nvdev->resmem;
 
 	return NULL;
@@ -757,6 +754,13 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 			      u64 memphys, u64 memlength)
 {
 	int ret = 0;
+	u64 resmem_size = 0;
+
+	/*
+	 * Comment about the GH bug that requires this and fix in GB
+	 */
+	if (!nvdev->has_mig_hw_bug_fix)
+		resmem_size = SZ_1G;
 
 	/*
 	 * The VM GPU device driver needs a non-cacheable region to support
@@ -780,7 +784,7 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 	 * memory (usemem) is added to the kernel for usage by the VM
 	 * workloads. Make the usable memory size memblock aligned.
 	 */
-	if (check_sub_overflow(memlength, RESMEM_SIZE,
+	if (check_sub_overflow(memlength, resmem_size,
 			       &nvdev->usemem.memlength)) {
 		ret = -EOVERFLOW;
 		goto done;
@@ -813,7 +817,9 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 	 * the BAR size for them.
 	 */
 	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
-	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
+	if (nvdev->resmem.memlength)
+		nvdev->resmem.bar_size =
+			roundup_pow_of_two(nvdev->resmem.memlength);
 done:
 	return ret;
 }

Thanks,
Alex
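
For illustration only, a rough sketch of how a consolidated
nvgrace_gpu_init_nvdev_struct() might read with the suggestion above
folded in. The comment wording is a placeholder, and the existing
memblock-alignment and resmem placement logic of the current helper is
elided, so treat this as a sketch rather than the final code:

static int
nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
			      struct nvgrace_gpu_pci_core_device *nvdev,
			      u64 memphys, u64 memlength)
{
	u64 resmem_size = 0;
	int ret = 0;

	/*
	 * Grace Hopper needs a 1G uncached carveout (exposed as a fake
	 * BAR) to work around the MIG HW defect; Grace Blackwell fixes
	 * the defect, so resmem_size stays 0 and resmem remains empty.
	 */
	if (!nvdev->has_mig_hw_bug_fix)
		resmem_size = SZ_1G;

	nvdev->usemem.memphys = memphys;

	/* usemem is what remains after the (possibly zero-sized) carveout. */
	if (check_sub_overflow(memlength, resmem_size,
			       &nvdev->usemem.memlength)) {
		ret = -EOVERFLOW;
		goto done;
	}

	/* ... existing memblock alignment and resmem placement elided ... */

	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
	if (nvdev->resmem.memlength)
		nvdev->resmem.bar_size =
			roundup_pow_of_two(nvdev->resmem.memlength);
done:
	return ret;
}

With resmem.memlength left at zero on GB, nvgrace_gpu_memregion() hides
the resmem region without a second has_mig_hw_bug_fix check, and the
probe path keeps a single unconditional call into the helper.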