On Sun, 5 Jan 2025 17:36:14 +0000
<ankita@xxxxxxxxxx> wrote:

> From: Ankit Agrawal <ankita@xxxxxxxxxx>
>
> There is a HW defect on Grace Hopper (GH) related to supporting the
> Multi-Instance GPU (MIG) feature [1] that necessitated the presence
> of a 1G region carved out from the device memory and mapped as
> uncached. The 1G region is shown as a fake BAR (comprising region 2 and 3)
> to work around the issue.
>
> The Grace Blackwell systems (GB) differ from GH systems in the following
> aspects:
> 1. The aforementioned HW defect is fixed on GB systems.
> 2. There is a usable BAR1 (region 2 and 3) on GB systems for the
>    GPUdirect RDMA feature [2].
>
> This patch accommodates those GB changes by showing the 64b physical
> device BAR1 (region 2 and 3) to the VM instead of the fake one. This
> takes care of both the differences.
>
> Moreover, the entire device memory is exposed on GB as cacheable to
> the VM as there is no carveout required.
>
> Link: https://www.nvidia.com/en-in/technologies/multi-instance-gpu/ [1]
> Link: https://docs.nvidia.com/cuda/gpudirect-rdma/ [2]
>
> Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
> ---
>  drivers/vfio/pci/nvgrace-gpu/main.c | 32 +++++++++++++++++++++--------
>  1 file changed, 24 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index 85eacafaffdf..44a276c886e1 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -72,7 +72,7 @@ nvgrace_gpu_memregion(int index,
>  	if (index == USEMEM_REGION_INDEX)
>  		return &nvdev->usemem;
>
> -	if (index == RESMEM_REGION_INDEX)
> +	if (!nvdev->has_mig_hw_bug_fix && index == RESMEM_REGION_INDEX)
>  		return &nvdev->resmem;
>
>  	return NULL;
> @@ -715,6 +715,16 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
>  	.detach_ioas = vfio_iommufd_physical_detach_ioas,
>  };
>
> +static void
> +nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
> +			      struct nvgrace_gpu_pci_core_device *nvdev,
> +			      u64 memphys, u64 memlength)
> +{
> +	nvdev->usemem.memphys = memphys;
> +	nvdev->usemem.memlength = memlength;
> +	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
> +}
> +
>  static int
>  nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
>  				  u64 *pmemphys, u64 *pmemlength)
> @@ -752,9 +762,9 @@ nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
>  }
>
>  static int
> -nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
> -			      struct nvgrace_gpu_pci_core_device *nvdev,
> -			      u64 memphys, u64 memlength)
> +nvgrace_gpu_nvdev_struct_workaround(struct pci_dev *pdev,
> +				    struct nvgrace_gpu_pci_core_device *nvdev,
> +				    u64 memphys, u64 memlength)
>  {
>  	int ret = 0;
>
> @@ -864,10 +874,16 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
>  	 * Device memory properties are identified in the host ACPI
>  	 * table. Set the nvgrace_gpu_pci_core_device structure.
>  	 */
> -	ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
> -					    memphys, memlength);
> -	if (ret)
> -		goto out_put_vdev;
> +	if (nvdev->has_mig_hw_bug_fix) {
> +		nvgrace_gpu_init_nvdev_struct(pdev, nvdev,
> +					      memphys, memlength);
> +	} else {
> +		ret = nvgrace_gpu_nvdev_struct_workaround(pdev, nvdev,
> +							  memphys,
> +							  memlength);
> +		if (ret)
> +			goto out_put_vdev;
> +	}

Doesn't this work out much more naturally if we just do something like:

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 85eacafaffdf..43a9457442ff 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -17,9 +17,6 @@
 #define RESMEM_REGION_INDEX	VFIO_PCI_BAR2_REGION_INDEX
 #define USEMEM_REGION_INDEX	VFIO_PCI_BAR4_REGION_INDEX
 
-/* Memory size expected as non cached and reserved by the VM driver */
-#define RESMEM_SIZE SZ_1G
-
 /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
 #define MEMBLK_SIZE SZ_512M
 
@@ -72,7 +69,7 @@ nvgrace_gpu_memregion(int index,
 	if (index == USEMEM_REGION_INDEX)
 		return &nvdev->usemem;
 
-	if (index == RESMEM_REGION_INDEX)
+	if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
 		return &nvdev->resmem;
 
 	return NULL;
@@ -757,6 +754,13 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 			      u64 memphys, u64 memlength)
 {
 	int ret = 0;
+	u64 resmem_size = 0;
+
+	/*
+	 * Comment about the GH bug that requires this and fix in GB
+	 */
+	if (!nvdev->has_mig_hw_bug_fix)
+		resmem_size = SZ_1G;
 
 	/*
 	 * The VM GPU device driver needs a non-cacheable region to support
@@ -780,7 +784,7 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 	 * memory (usemem) is added to the kernel for usage by the VM
 	 * workloads. Make the usable memory size memblock aligned.
 	 */
-	if (check_sub_overflow(memlength, RESMEM_SIZE,
+	if (check_sub_overflow(memlength, resmem_size,
 			       &nvdev->usemem.memlength)) {
 		ret = -EOVERFLOW;
 		goto done;
@@ -813,7 +817,9 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 	 * the BAR size for them.
 	 */
 	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
-	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
+	if (nvdev->resmem.memlength)
+		nvdev->resmem.bar_size =
+			roundup_pow_of_two(nvdev->resmem.memlength);
 done:
 	return ret;
 }

Thanks,
Alex
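
For illustration only, a rough sketch of how a consolidated
nvgrace_gpu_init_nvdev_struct() might read with the suggestion above
folded in. The comment wording is a placeholder, and the existing
memblock-alignment and resmem placement logic of the current helper is
elided, so treat this as a sketch rather than the final code:

static int
nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
			      struct nvgrace_gpu_pci_core_device *nvdev,
			      u64 memphys, u64 memlength)
{
	u64 resmem_size = 0;
	int ret = 0;

	/*
	 * Grace Hopper needs a 1G uncached carveout (exposed as a fake
	 * BAR) to work around the MIG HW defect; Grace Blackwell fixes
	 * the defect, so resmem_size stays 0 and resmem remains empty.
	 */
	if (!nvdev->has_mig_hw_bug_fix)
		resmem_size = SZ_1G;

	nvdev->usemem.memphys = memphys;

	/* usemem is what remains after the (possibly zero-sized) carveout. */
	if (check_sub_overflow(memlength, resmem_size,
			       &nvdev->usemem.memlength)) {
		ret = -EOVERFLOW;
		goto done;
	}

	/* ... existing memblock alignment and resmem placement elided ... */

	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
	if (nvdev->resmem.memlength)
		nvdev->resmem.bar_size =
			roundup_pow_of_two(nvdev->resmem.memlength);
done:
	return ret;
}

With resmem.memlength left at zero on GB, nvgrace_gpu_memregion() hides
the resmem region without a second has_mig_hw_bug_fix check, and the
probe path keeps a single unconditional call into the helper.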