From: Ankit Agrawal <ankita@xxxxxxxxxx> The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA (Qemu) using remap_pfn_range() without adding the memory to the kernel. The device memory pages are not backed by struct page. Patches 1-3 implement the mechanism to handle ECC/poison on memory pages without struct page and expose a registration function. This new mechanism is leveraged here. The module registers its memory region with the kernel MM for ECC handling using the register_pfn_address_space() registration API exposed by the kernel. It also defines a failure callback function pfn_memory_failure() to get the poisoned PFN from the MM. The module tracks poisoned PFNs as a bitmap with a bit per PFN. The PFN is communicated by the kernel MM to the module through the failure function, which sets the appropriate bit in the bitmap. The module also defines VMA fault ops. It returns VM_FAULT_HWPOISON in case the bit for the PFN is set in the bitmap. [1] https://lore.kernel.org/all/20230915025415.6762-1-ankita@xxxxxxxxxx/ Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx> --- drivers/vfio/pci/nvgrace-gpu/main.c | 107 +++++++++++++++++++++++++++- drivers/vfio/vfio.h | 11 --- drivers/vfio/vfio_main.c | 3 +- include/linux/vfio.h | 15 ++++ 4 files changed, 123 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index ba323f2d8ea1..1c89ce0cc1cc 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -6,6 +6,10 @@ #include <linux/pci.h> #include <linux/vfio_pci_core.h> #include <linux/vfio.h> +#ifdef CONFIG_MEMORY_FAILURE +#include <linux/bitmap.h> +#include <linux/memory-failure.h> +#endif struct nvgrace_gpu_vfio_pci_core_device { struct vfio_pci_core_device core_device; @@ -13,8 +17,85 @@ struct nvgrace_gpu_vfio_pci_core_device { size_t memlength; void *memmap; struct mutex memmap_lock; +#ifdef CONFIG_MEMORY_FAILURE + struct pfn_address_space 
pfn_address_space; + unsigned long *pfn_bitmap; +#endif }; +#ifdef CONFIG_MEMORY_FAILURE +void nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space, + unsigned long pfn) +{ + struct nvgrace_gpu_vfio_pci_core_device *nvdev = container_of( + pfn_space, struct nvgrace_gpu_vfio_pci_core_device, pfn_address_space); + unsigned long mem_offset = pfn - pfn_space->node.start; + + if (mem_offset >= nvdev->memlength) + return; + + /* + * MM has called to notify a poisoned page. Track that in the bitmap. + */ + __set_bit(mem_offset, nvdev->pfn_bitmap); +} + +struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = { + .failure = nvgrace_gpu_vfio_pci_pfn_memory_failure, +}; + +static int +nvgrace_gpu_vfio_pci_register_pfn_range(struct nvgrace_gpu_vfio_pci_core_device *nvdev, + struct vm_area_struct *vma) +{ + unsigned long nr_pages; + int ret = 0; + + nr_pages = nvdev->memlength >> PAGE_SHIFT; + + nvdev->pfn_address_space.node.start = vma->vm_pgoff; + nvdev->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1; + nvdev->pfn_address_space.ops = &nvgrace_gpu_vfio_pci_pas_ops; + nvdev->pfn_address_space.mapping = vma->vm_file->f_mapping; + + ret = register_pfn_address_space(&(nvdev->pfn_address_space)); + + return ret; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) +{ + unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff; + struct vfio_device *core_vdev; + struct nvgrace_gpu_vfio_pci_core_device *nvdev; + + if (!(vmf->vma->vm_file)) + goto error_exit; + + core_vdev = vfio_device_from_file(vmf->vma->vm_file); + + if (!core_vdev) + goto error_exit; + + nvdev = container_of(core_vdev, + struct nvgrace_gpu_vfio_pci_core_device, core_device.vdev); + + /* + * Check if the page is poisoned. 
+ */ + if (mem_offset < (nvdev->memlength >> PAGE_SHIFT) && + test_bit(mem_offset, nvdev->pfn_bitmap)) + return VM_FAULT_HWPOISON; + +error_exit: + return VM_FAULT_ERROR; +} + +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { + .fault = nvgrace_gpu_vfio_pci_fault, +}; +#endif + static int nvgrace_gpu_vfio_pci_open_device(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = @@ -46,6 +127,9 @@ static void nvgrace_gpu_vfio_pci_close_device(struct vfio_device *core_vdev) mutex_destroy(&nvdev->memmap_lock); +#ifdef CONFIG_MEMORY_FAILURE + unregister_pfn_address_space(&(nvdev->pfn_address_space)); +#endif vfio_pci_core_close_device(core_vdev); } @@ -104,8 +188,12 @@ static int nvgrace_gpu_vfio_pci_mmap(struct vfio_device *core_vdev, return ret; vma->vm_pgoff = start_pfn; +#ifdef CONFIG_MEMORY_FAILURE + vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; - return 0; + ret = nvgrace_gpu_vfio_pci_register_pfn_range(nvdev, vma); +#endif + return ret; } static long @@ -406,6 +494,19 @@ nvgrace_gpu_vfio_pci_fetch_memory_property(struct pci_dev *pdev, nvdev->memlength = memlength; +#ifdef CONFIG_MEMORY_FAILURE + /* + * A bitmap is maintained to track the pages that are poisoned. Each + * page is represented by a bit. Allocation size in bytes is + * determined by shifting the device memory size by PAGE_SHIFT to + * determine the number of pages; and further shifted by 3 as each + * byte could track 8 pages. 
+ */ + nvdev->pfn_bitmap + = vzalloc((nvdev->memlength >> PAGE_SHIFT)/BITS_PER_TYPE(char)); + if (!nvdev->pfn_bitmap) + ret = -ENOMEM; +#endif return ret; } @@ -442,6 +543,10 @@ static void nvgrace_gpu_vfio_pci_remove(struct pci_dev *pdev) struct nvgrace_gpu_vfio_pci_core_device *nvdev = nvgrace_gpu_drvdata(pdev); struct vfio_pci_core_device *vdev = &nvdev->core_device; +#ifdef CONFIG_MEMORY_FAILURE + vfree(nvdev->pfn_bitmap); +#endif + vfio_pci_core_unregister_device(vdev); vfio_put_device(&vdev->vdev); } diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 307e3f29b527..747094503909 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -16,17 +16,6 @@ struct iommufd_ctx; struct iommu_group; struct vfio_container; -struct vfio_device_file { - struct vfio_device *device; - struct vfio_group *group; - - u8 access_granted; - u32 devid; /* only valid when iommufd is valid */ - spinlock_t kvm_ref_lock; /* protect kvm field */ - struct kvm *kvm; - struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */ -}; - void vfio_device_put_registration(struct vfio_device *device); bool vfio_device_try_get_registration(struct vfio_device *device); int vfio_df_open(struct vfio_device_file *df); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 40732e8ed4c6..a7dafd7c64a6 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1309,7 +1309,7 @@ const struct file_operations vfio_device_fops = { .mmap = vfio_device_fops_mmap, }; -static struct vfio_device *vfio_device_from_file(struct file *file) +struct vfio_device *vfio_device_from_file(struct file *file) { struct vfio_device_file *df = file->private_data; @@ -1317,6 +1317,7 @@ static struct vfio_device *vfio_device_from_file(struct file *file) return NULL; return df->device; } +EXPORT_SYMBOL_GPL(vfio_device_from_file); /** * vfio_file_is_valid - True if the file is valid vfio file diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 
454e9295970c..d88af251e931 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -361,4 +361,19 @@ int vfio_virqfd_enable(void *opaque, int (*handler)(void *, void *), struct virqfd **pvirqfd, int fd); void vfio_virqfd_disable(struct virqfd **pvirqfd); +/* + * VFIO device file. + */ +struct vfio_device_file { + struct vfio_device *device; + struct vfio_group *group; + u8 access_granted; + u32 devid; /* only valid when iommufd is valid */ + spinlock_t kvm_ref_lock; /* protect kvm field */ + struct kvm *kvm; + struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */ +}; + +struct vfio_device *vfio_device_from_file(struct file *file); + #endif /* VFIO_H */ -- 2.17.1