From: Ankit Agrawal <ankita@xxxxxxxxxx>

The nvgpu-vfio-pci module maps the QEMU VMA to device memory through
remap_pfn_range(). The new mechanism for handling poison on memory not
backed by struct page is leveraged here.

nvgpu-vfio-pci defines a function pfn_memory_failure() to receive the
ECC PFN from the MM. The function is registered with the kernel MM,
along with the address space and PFN range, through
register_pfn_address_space().

Track poisoned PFNs in the nvgpu-vfio-pci module as a bitmap with one
bit per PFN. The PFN is communicated by the kernel MM to the module
through the failure function, which sets the corresponding bit in the
bitmap.

Register VMA fault ops for the module. The fault handler returns
VM_FAULT_HWPOISON when the bit for the faulting PFN is set in the
bitmap.

Clear the bitmap on reset to reflect the clean state of the device
memory after reset.

Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
---
 drivers/vfio/pci/nvgpu/main.c | 116 ++++++++++++++++++++++++++++++++--
 1 file changed, 110 insertions(+), 6 deletions(-)

diff --git a/drivers/vfio/pci/nvgpu/main.c b/drivers/vfio/pci/nvgpu/main.c
index 2dd8cc6e0145..8ccd3fe33a0f 100644
--- a/drivers/vfio/pci/nvgpu/main.c
+++ b/drivers/vfio/pci/nvgpu/main.c
@@ -5,6 +5,8 @@
 
 #include <linux/pci.h>
 #include <linux/vfio_pci_core.h>
+#include <linux/bitmap.h>
+#include <linux/memory-failure.h>
 
 #define DUMMY_PFN \
 	(((nvdev->mem_prop.hpa + nvdev->mem_prop.mem_length) >> PAGE_SHIFT) - 1)
@@ -12,12 +14,78 @@
 struct dev_mem_properties {
 	uint64_t hpa;
 	uint64_t mem_length;
+	unsigned long *pfn_bitmap;
 	int bar1_start_offset;
 };
 
 struct nvgpu_vfio_pci_core_device {
 	struct vfio_pci_core_device core_device;
 	struct dev_mem_properties mem_prop;
+	struct pfn_address_space pfn_address_space;
+};
+
+void nvgpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
+				       unsigned long pfn)
+{
+	struct nvgpu_vfio_pci_core_device *nvdev = container_of(
+		pfn_space, struct nvgpu_vfio_pci_core_device, pfn_address_space);
+
+	/*
+	 * MM has called to notify a poisoned page. Track that in the bitmap.
+	 */
+	__set_bit(pfn - (pfn_space->node.start), nvdev->mem_prop.pfn_bitmap);
+}
+
+struct pfn_address_space_ops nvgpu_vfio_pci_pas_ops = {
+	.failure = nvgpu_vfio_pci_pfn_memory_failure,
+};
+
+static int
+nvgpu_vfio_pci_register_pfn_range(struct nvgpu_vfio_pci_core_device *nvdev,
+				  struct vm_area_struct *vma)
+{
+	unsigned long nr_pages;
+	int ret = 0;
+
+	nr_pages = nvdev->mem_prop.mem_length >> PAGE_SHIFT;
+
+	nvdev->pfn_address_space.node.start = vma->vm_pgoff;
+	nvdev->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1;
+	nvdev->pfn_address_space.ops = &nvgpu_vfio_pci_pas_ops;
+	nvdev->pfn_address_space.mapping = vma->vm_file->f_mapping;
+
+	ret = register_pfn_address_space(&(nvdev->pfn_address_space));
+
+	return ret;
+}
+
+static vm_fault_t nvgpu_vfio_pci_fault(struct vm_fault *vmf)
+{
+	unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff;
+	struct nvgpu_vfio_pci_core_device *nvdev = container_of(
+		vmf->vma->vm_file->private_data,
+		struct nvgpu_vfio_pci_core_device, core_device.vdev);
+	int ret;
+
+	/*
+	 * Check if the page is poisoned.
+	 */
+	if (mem_offset < (nvdev->mem_prop.mem_length >> PAGE_SHIFT) &&
+	    test_bit(mem_offset, nvdev->mem_prop.pfn_bitmap))
+		return VM_FAULT_HWPOISON;
+
+	ret = remap_pfn_range(vmf->vma,
+			vmf->vma->vm_start + (mem_offset << PAGE_SHIFT),
+			DUMMY_PFN, PAGE_SIZE,
+			vmf->vma->vm_page_prot);
+	if (ret)
+		return VM_FAULT_ERROR;
+
+	return VM_FAULT_NOPAGE;
+}
+
+static const struct vm_operations_struct nvgpu_vfio_pci_mmap_ops = {
+	.fault = nvgpu_vfio_pci_fault,
 };
 
 static int vfio_get_bar1_start_offset(struct vfio_pci_core_device *vdev)
@@ -26,8 +94,9 @@ static int vfio_get_bar1_start_offset(struct vfio_pci_core_device *vdev)
 	pci_read_config_byte(vdev->pdev, 0x10, &val);
 
 	/*
-	 * The BAR1 start offset in the PCI config space depends on the BAR0size.
-	 * Check if the BAR0 is 64b and return the approproiate BAR1 offset.
+	 * The BAR1 start offset in the PCI config space depends on the BAR0
+	 * size. Check if the BAR0 is 64b and return the appropriate BAR1
+	 * offset.
 	 */
 	if (val & PCI_BASE_ADDRESS_MEM_TYPE_64)
 		return VFIO_PCI_BAR2_REGION_INDEX;
@@ -54,6 +123,16 @@ static int nvgpu_vfio_pci_open_device(struct vfio_device *core_vdev)
 	return ret;
 }
 
+void nvgpu_vfio_pci_close_device(struct vfio_device *core_vdev)
+{
+	struct nvgpu_vfio_pci_core_device *nvdev = container_of(
+		core_vdev, struct nvgpu_vfio_pci_core_device, core_device.vdev);
+
+	unregister_pfn_address_space(&(nvdev->pfn_address_space));
+
+	vfio_pci_core_close_device(core_vdev);
+}
+
 int nvgpu_vfio_pci_mmap(struct vfio_device *core_vdev,
 			struct vm_area_struct *vma)
 {
@@ -93,8 +172,11 @@ int nvgpu_vfio_pci_mmap(struct vfio_device *core_vdev,
 		return ret;
 
 	vma->vm_pgoff = start_pfn + pgoff;
+	vma->vm_ops = &nvgpu_vfio_pci_mmap_ops;
 
-	return 0;
+	ret = nvgpu_vfio_pci_register_pfn_range(nvdev, vma);
+
+	return ret;
 }
 
 long nvgpu_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
@@ -140,7 +222,14 @@ long nvgpu_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
 		}
 
 		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
-
+	case VFIO_DEVICE_RESET:
+		/*
+		 * Resetting the GPU clears up the poisoned pages. Reset the
+		 * poisoned page bitmap.
+		 */
+		memset(nvdev->mem_prop.pfn_bitmap, 0,
+		       nvdev->mem_prop.mem_length >> (PAGE_SHIFT + 3));
+		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
 	default:
 		return vfio_pci_core_ioctl(core_vdev, cmd, arg);
 	}
@@ -151,7 +240,7 @@ static const struct vfio_device_ops nvgpu_vfio_pci_ops = {
 	.init = vfio_pci_core_init_dev,
 	.release = vfio_pci_core_release_dev,
 	.open_device = nvgpu_vfio_pci_open_device,
-	.close_device = vfio_pci_core_close_device,
+	.close_device = nvgpu_vfio_pci_close_device,
 	.ioctl = nvgpu_vfio_pci_ioctl,
 	.read = vfio_pci_core_read,
 	.write = vfio_pci_core_write,
@@ -188,7 +277,20 @@ nvgpu_vfio_pci_fetch_memory_property(struct pci_dev *pdev,
 	ret = device_property_read_u64(&(pdev->dev), "nvidia,gpu-mem-size",
 				       &(nvdev->mem_prop.mem_length));
-	return ret;
+	if (ret)
+		return ret;
+
+	/*
+	 * A bitmap is maintained to track the pages that are poisoned. Each
+	 * page is represented by a bit. The allocation size in bytes is
+	 * determined by shifting the device memory size by PAGE_SHIFT to
+	 * get the number of pages, and further by 3 as each byte tracks
+	 * 8 pages.
+	 */
+	nvdev->mem_prop.pfn_bitmap
+		= vzalloc(nvdev->mem_prop.mem_length >> (PAGE_SHIFT + 3));
+
+	return 0;
 }
 
 static int nvgpu_vfio_pci_probe(struct pci_dev *pdev,
@@ -224,6 +326,8 @@ static void nvgpu_vfio_pci_remove(struct pci_dev *pdev)
 	struct nvgpu_vfio_pci_core_device *nvdev = nvgpu_drvdata(pdev);
 	struct vfio_pci_core_device *vdev = &nvdev->core_device;
 
+	vfree(nvdev->mem_prop.pfn_bitmap);
+
 	vfio_pci_core_unregister_device(vdev);
 	vfio_put_device(&vdev->vdev);
 }
-- 
2.17.1
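For reference, a minimal userspace sketch (not part of this patch) of what
the VM_FAULT_HWPOISON returned by nvgpu_vfio_pci_fault() looks like to a
VMM such as QEMU: the kernel fault path converts it into a SIGBUS with
si_code BUS_MCEERR_AR on the faulting access. The device fd, region offset
and mapping length below are placeholders for illustration only, not
values defined by this patch.

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

/* Report the hardware-poison SIGBUS raised when a poisoned PFN is touched. */
static void sigbus_handler(int sig, siginfo_t *info, void *ctx)
{
	(void)sig;
	(void)ctx;
	if (info->si_code == BUS_MCEERR_AR)
		fprintf(stderr, "hwpoison: uncorrected error at %p\n",
			info->si_addr);
	_exit(EXIT_FAILURE);
}

int main(void)
{
	struct sigaction act = {
		.sa_flags = SA_SIGINFO,
		.sa_sigaction = sigbus_handler,
	};
	/* Placeholders: a real VMM would use the opened VFIO device fd and
	 * the region offset reported by VFIO_DEVICE_GET_REGION_INFO. */
	int device_fd = -1;
	off_t region_offset = 0;
	size_t len = 2 * 1024 * 1024;
	volatile char *mem;

	sigaction(SIGBUS, &act, NULL);

	mem = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
		   device_fd, region_offset);
	if (mem == MAP_FAILED)
		return 1;

	/*
	 * Reading a page whose bit is set in pfn_bitmap makes the driver's
	 * fault handler return VM_FAULT_HWPOISON; the kernel then delivers
	 * SIGBUS/BUS_MCEERR_AR to this thread instead of completing the read.
	 */
	return mem[0];
}

If device_fd referred to an nvgpu-vfio-pci device and the touched page had
its bit set in pfn_bitmap, the read above would terminate in the SIGBUS
handler; once VFIO_DEVICE_RESET clears the bitmap, the same access would
instead fault in a fresh mapping through remap_pfn_range().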