On Wed, 20 Sep 2023 19:32:10 +0530
<ankita@xxxxxxxxxx> wrote:

> From: Ankit Agrawal <ankita@xxxxxxxxxx>
>
> The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA
> (QEMU) using remap_pfn_range() without adding the memory to the kernel.
> The device memory pages are not backed by struct page. Patches 1-3
> implement the mechanism to handle ECC/poison on memory pages without
> struct page and expose a registration function. This new mechanism is
> leveraged here.
>
> The module registers its memory region with the kernel MM for ECC
> handling using the register_pfn_address_space() registration API exposed
> by the kernel. It also defines a failure callback, pfn_memory_failure(),
> to receive the poisoned PFN from the MM.
>
> The module tracks poisoned PFNs in a bitmap with one bit per PFN. The
> kernel MM communicates the PFN to the module through the failure
> callback, which sets the corresponding bit in the bitmap.
>
> The module also defines VMA fault ops; the fault handler returns
> VM_FAULT_HWPOISON if the bit for the PFN is set in the bitmap.
>
> [1] https://lore.kernel.org/all/20230915025415.6762-1-ankita@xxxxxxxxxx/
>
> Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx>
> ---
>  drivers/vfio/pci/nvgrace-gpu/main.c | 107 +++++++++++++++++++++++++++-
>  drivers/vfio/vfio.h                 |  11 ---
>  drivers/vfio/vfio_main.c            |   3 +-
>  include/linux/vfio.h                |  15 ++++
>  4 files changed, 123 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
> index ba323f2d8ea1..1c89ce0cc1cc 100644
> --- a/drivers/vfio/pci/nvgrace-gpu/main.c
> +++ b/drivers/vfio/pci/nvgrace-gpu/main.c
> @@ -6,6 +6,10 @@
>  #include <linux/pci.h>
>  #include <linux/vfio_pci_core.h>
>  #include <linux/vfio.h>
> +#ifdef CONFIG_MEMORY_FAILURE
> +#include <linux/bitmap.h>
> +#include <linux/memory-failure.h>
> +#endif
>
>  struct nvgrace_gpu_vfio_pci_core_device {
>          struct vfio_pci_core_device core_device;
> @@ -13,8 +17,85 @@ struct nvgrace_gpu_vfio_pci_core_device {
>          size_t memlength;
>          void *memmap;
>          struct mutex memmap_lock;
> +#ifdef CONFIG_MEMORY_FAILURE
> +        struct pfn_address_space pfn_address_space;
> +        unsigned long *pfn_bitmap;
> +#endif
>  };
>
> +#ifdef CONFIG_MEMORY_FAILURE
> +void nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
> +                                             unsigned long pfn)
> +{
> +        struct nvgrace_gpu_vfio_pci_core_device *nvdev = container_of(
> +                pfn_space, struct nvgrace_gpu_vfio_pci_core_device, pfn_address_space);
> +        unsigned long mem_offset = pfn - pfn_space->node.start;
> +
> +        if (mem_offset >= nvdev->memlength)
> +                return;
> +
> +        /*
> +         * MM has called to notify a poisoned page. Track that in the bitmap.
> +         */
> +        __set_bit(mem_offset, nvdev->pfn_bitmap);
> +}
> +
> +struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = {
> +        .failure = nvgrace_gpu_vfio_pci_pfn_memory_failure,
> +};
> +
> +static int
> +nvgrace_gpu_vfio_pci_register_pfn_range(struct nvgrace_gpu_vfio_pci_core_device *nvdev,
> +                                        struct vm_area_struct *vma)
> +{
> +        unsigned long nr_pages;
> +        int ret = 0;
> +
> +        nr_pages = nvdev->memlength >> PAGE_SHIFT;
> +
> +        nvdev->pfn_address_space.node.start = vma->vm_pgoff;
> +        nvdev->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1;
> +        nvdev->pfn_address_space.ops = &nvgrace_gpu_vfio_pci_pas_ops;
> +        nvdev->pfn_address_space.mapping = vma->vm_file->f_mapping;
> +
> +        ret = register_pfn_address_space(&(nvdev->pfn_address_space));
> +
> +        return ret;
> +}
> +
> +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
> +{
> +        unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff;
> +        struct vfio_device *core_vdev;
> +        struct nvgrace_gpu_vfio_pci_core_device *nvdev;
> +
> +        if (!(vmf->vma->vm_file))
> +                goto error_exit;
> +
> +        core_vdev = vfio_device_from_file(vmf->vma->vm_file);
> +
> +        if (!core_vdev)
> +                goto error_exit;
> +
> +        nvdev = container_of(core_vdev,
> +                struct nvgrace_gpu_vfio_pci_core_device, core_device.vdev);
> +
> +        /*
> +         * Check if the page is poisoned.
> +         */
> +        if (mem_offset < (nvdev->memlength >> PAGE_SHIFT) &&
> +            test_bit(mem_offset, nvdev->pfn_bitmap))
> +                return VM_FAULT_HWPOISON;
> +
> +error_exit:
> +        return VM_FAULT_ERROR;
> +}
> +
> +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
> +        .fault = nvgrace_gpu_vfio_pci_fault,
> +};
> +#endif
> +
>  static int nvgrace_gpu_vfio_pci_open_device(struct vfio_device *core_vdev)
>  {
>          struct vfio_pci_core_device *vdev =
> @@ -46,6 +127,9 @@ static void nvgrace_gpu_vfio_pci_close_device(struct vfio_device *core_vdev)
>
>          mutex_destroy(&nvdev->memmap_lock);
>
> +#ifdef CONFIG_MEMORY_FAILURE
> +        unregister_pfn_address_space(&(nvdev->pfn_address_space));
> +#endif
>          vfio_pci_core_close_device(core_vdev);
>  }
>
> @@ -104,8 +188,12 @@ static int nvgrace_gpu_vfio_pci_mmap(struct vfio_device *core_vdev,
>                  return ret;
>
>          vma->vm_pgoff = start_pfn;
> +#ifdef CONFIG_MEMORY_FAILURE
> +        vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
>
> -        return 0;
> +        ret = nvgrace_gpu_vfio_pci_register_pfn_range(nvdev, vma);
> +#endif
> +        return ret;
>  }
>
>  static long
> @@ -406,6 +494,19 @@ nvgrace_gpu_vfio_pci_fetch_memory_property(struct pci_dev *pdev,
>
>          nvdev->memlength = memlength;
>
> +#ifdef CONFIG_MEMORY_FAILURE
> +        /*
> +         * A bitmap is maintained to track the pages that are poisoned. Each
> +         * page is represented by a bit. Allocation size in bytes is
> +         * determined by shifting the device memory size by PAGE_SHIFT to
> +         * determine the number of pages; and further shifted by 3 as each
> +         * byte could track 8 pages.
> +         */
> +        nvdev->pfn_bitmap
> +                = vzalloc((nvdev->memlength >> PAGE_SHIFT)/BITS_PER_TYPE(char));
> +        if (!nvdev->pfn_bitmap)
> +                ret = -ENOMEM;
> +#endif
>          return ret;
>  }
>
> @@ -442,6 +543,10 @@ static void nvgrace_gpu_vfio_pci_remove(struct pci_dev *pdev)
>          struct nvgrace_gpu_vfio_pci_core_device *nvdev = nvgrace_gpu_drvdata(pdev);
>          struct vfio_pci_core_device *vdev = &nvdev->core_device;
>
> +#ifdef CONFIG_MEMORY_FAILURE
> +        vfree(nvdev->pfn_bitmap);
> +#endif
> +
>          vfio_pci_core_unregister_device(vdev);
>          vfio_put_device(&vdev->vdev);
>  }
> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> index 307e3f29b527..747094503909 100644
> --- a/drivers/vfio/vfio.h
> +++ b/drivers/vfio/vfio.h
> @@ -16,17 +16,6 @@ struct iommufd_ctx;
>  struct iommu_group;
>  struct vfio_container;
>
> -struct vfio_device_file {
> -        struct vfio_device *device;
> -        struct vfio_group *group;
> -
> -        u8 access_granted;
> -        u32 devid; /* only valid when iommufd is valid */
> -        spinlock_t kvm_ref_lock; /* protect kvm field */
> -        struct kvm *kvm;
> -        struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
> -};
> -
>  void vfio_device_put_registration(struct vfio_device *device);
>  bool vfio_device_try_get_registration(struct vfio_device *device);
>  int vfio_df_open(struct vfio_device_file *df);
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index 40732e8ed4c6..a7dafd7c64a6 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1309,7 +1309,7 @@ const struct file_operations vfio_device_fops = {
>          .mmap = vfio_device_fops_mmap,
>  };
>
> -static struct vfio_device *vfio_device_from_file(struct file *file)
> +struct vfio_device *vfio_device_from_file(struct file *file)
>  {
>          struct vfio_device_file *df = file->private_data;
>
> @@ -1317,6 +1317,7 @@ static struct vfio_device *vfio_device_from_file(struct file *file)
>                  return NULL;
>          return df->device;
>  }
> +EXPORT_SYMBOL_GPL(vfio_device_from_file);
>
>  /**
>   * vfio_file_is_valid - True if the file is valid vfio file
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 454e9295970c..d88af251e931 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -361,4 +361,19 @@ int vfio_virqfd_enable(void *opaque, int (*handler)(void *, void *),
>                                 struct virqfd **pvirqfd, int fd);
>  void vfio_virqfd_disable(struct virqfd **pvirqfd);
>
> +/*
> + * VFIO device file.
> + */
> +struct vfio_device_file {
> +        struct vfio_device *device;
> +        struct vfio_group *group;
> +        u8 access_granted;
> +        u32 devid; /* only valid when iommufd is valid */
> +        spinlock_t kvm_ref_lock; /* protect kvm field */
> +        struct kvm *kvm;
> +        struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
> +};

What here necessitates moving this to the more public header?

Thanks,
Alex

> +
> +struct vfio_device *vfio_device_from_file(struct file *file);
> +
>  #endif /* VFIO_H */
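To illustrate the question above, here is a minimal sketch (not part of the
patch, and the example_fault() helper below is hypothetical) assuming the
driver only ever consumes the struct vfio_device returned by the lookup
helper: in that case only the vfio_device_from_file() prototype would need
to be visible in include/linux/vfio.h, while struct vfio_device_file could
stay in drivers/vfio/vfio.h.

/*
 * Sketch only.  Assumes the nvgrace-gpu fault path needs nothing beyond
 * the struct vfio_device returned by the lookup helper.
 */

/* include/linux/vfio.h: expose only the prototype */
struct vfio_device *vfio_device_from_file(struct file *file);

/* drivers/vfio/vfio.h: the file-private state stays here */
struct vfio_device_file {
        struct vfio_device *device;
        /* ... existing fields unchanged ... */
};

/* Hypothetical driver-side use, mirroring nvgrace_gpu_vfio_pci_fault() */
static vm_fault_t example_fault(struct vm_fault *vmf)
{
        struct vfio_device *core_vdev;

        if (!vmf->vma->vm_file)
                return VM_FAULT_ERROR;

        core_vdev = vfio_device_from_file(vmf->vma->vm_file);
        if (!core_vdev)
                return VM_FAULT_ERROR;

        /* container_of(core_vdev, ...) to reach driver-private state */
        return VM_FAULT_ERROR;
}

If struct vfio_device_file is never dereferenced outside drivers/vfio/,
exposing only the prototype would appear to be enough, which seems to be
the point of the question.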