From: Ankit Agrawal <ankita@xxxxxxxxxx> The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA (QEMU) using remap_pfn_range() without adding the memory to the kernel. The device memory pages are not backed by struct page. Patches 1-3 implement the mechanism to handle ECC/poison on memory pages without struct page and expose a registration function. This new mechanism is leveraged here. The module registers its memory region with the kernel MM for ECC handling using the register_pfn_address_space() registration API exposed by the kernel. It also defines a failure callback function pfn_memory_failure() to get the poisoned PFN from the MM. The module tracks poisoned PFNs using a hashtable. The PFN is communicated by the kernel MM to the module through the failure function, which pushes the appropriate memory offset to the hashtable. The module also defines a VMA fault handler. It returns VM_FAULT_HWPOISON in case the memory offset is found in the hashtable. [1] https://lore.kernel.org/all/20231114081611.30550-1-ankita@xxxxxxxxxx/ Signed-off-by: Ankit Agrawal <ankita@xxxxxxxxxx> --- drivers/vfio/pci/nvgrace-gpu/main.c | 123 +++++++++++++++++++++++++++- drivers/vfio/vfio_main.c | 3 +- 2 files changed, 124 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index b8634974e5cc..5a567375bd14 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -6,6 +6,16 @@ #include <linux/pci.h> #include <linux/vfio_pci_core.h> #include <linux/vfio.h> +#ifdef CONFIG_MEMORY_FAILURE +#include <linux/bitmap.h> +#include <linux/memory-failure.h> +#include <linux/hashtable.h> +#endif + +struct h_node { + unsigned long mem_offset; + struct hlist_node node; +}; struct nvgrace_gpu_vfio_pci_core_device { struct vfio_pci_core_device core_device; @@ -13,8 +23,96 @@ struct nvgrace_gpu_vfio_pci_core_device { size_t memlength; void *memmap; struct mutex memmap_lock; +#ifdef
CONFIG_MEMORY_FAILURE + struct pfn_address_space pfn_address_space; + DECLARE_HASHTABLE(htbl, 8); +#endif +}; + +#ifdef CONFIG_MEMORY_FAILURE +static void +nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space, + unsigned long pfn) +{ + struct nvgrace_gpu_vfio_pci_core_device *nvdev = container_of( + pfn_space, struct nvgrace_gpu_vfio_pci_core_device, pfn_address_space); + unsigned long mem_offset = pfn - pfn_space->node.start; + struct h_node *ecc; + + if (mem_offset >= (nvdev->memlength >> PAGE_SHIFT)) + return; + + /* + * MM has called to notify a poisoned page. Track that in the hastable. + */ + ecc = (struct h_node *)(vzalloc(sizeof(struct h_node))); + ecc->mem_offset = mem_offset; + hash_add(nvdev->htbl, &(ecc->node), ecc->mem_offset); +} + +struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = { + .failure = nvgrace_gpu_vfio_pci_pfn_memory_failure, }; +static int +nvgrace_gpu_vfio_pci_register_pfn_range(struct nvgrace_gpu_vfio_pci_core_device *nvdev, + struct vm_area_struct *vma) +{ + unsigned long nr_pages; + int ret = 0; + + nr_pages = nvdev->memlength >> PAGE_SHIFT; + + nvdev->pfn_address_space.node.start = vma->vm_pgoff; + nvdev->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1; + nvdev->pfn_address_space.ops = &nvgrace_gpu_vfio_pci_pas_ops; + nvdev->pfn_address_space.mapping = vma->vm_file->f_mapping; + + ret = register_pfn_address_space(&(nvdev->pfn_address_space)); + + return ret; +} + +extern struct vfio_device *vfio_device_from_file(struct file *file); + +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) +{ + unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff; + struct vfio_device *core_vdev; + struct nvgrace_gpu_vfio_pci_core_device *nvdev; + bool found = false; + struct h_node *cur; + + if (!(vmf->vma->vm_file)) + goto error_exit; + + core_vdev = vfio_device_from_file(vmf->vma->vm_file); + + if (!core_vdev) + goto error_exit; + + nvdev = container_of(core_vdev, + struct 
nvgrace_gpu_vfio_pci_core_device, core_device.vdev); + + if (mem_offset < (nvdev->memlength >> PAGE_SHIFT)) { + /* + * Check if the page is poisoned. + */ + hash_for_each_possible(nvdev->htbl, cur, node, mem_offset) { + if (cur->mem_offset == mem_offset) + return VM_FAULT_HWPOISON; + } + } + +error_exit: + return VM_FAULT_ERROR; +} + +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { + .fault = nvgrace_gpu_vfio_pci_fault, +}; +#endif + static int nvgrace_gpu_vfio_pci_open_device(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = @@ -46,6 +144,9 @@ static void nvgrace_gpu_vfio_pci_close_device(struct vfio_device *core_vdev) mutex_destroy(&nvdev->memmap_lock); +#ifdef CONFIG_MEMORY_FAILURE + unregister_pfn_address_space(&(nvdev->pfn_address_space)); +#endif vfio_pci_core_close_device(core_vdev); } @@ -103,8 +204,12 @@ static int nvgrace_gpu_vfio_pci_mmap(struct vfio_device *core_vdev, return ret; vma->vm_pgoff = start_pfn; +#ifdef CONFIG_MEMORY_FAILURE + vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; - return 0; + ret = nvgrace_gpu_vfio_pci_register_pfn_range(nvdev, vma); +#endif + return ret; } static long @@ -413,6 +518,12 @@ nvgrace_gpu_vfio_pci_fetch_memory_property(struct pci_dev *pdev, nvdev->memlength = memlength; +#ifdef CONFIG_MEMORY_FAILURE + /* + * Initialize the hashtable tracking the poisoned pages. 
+ */ + hash_init(nvdev->htbl); +#endif return ret; } @@ -448,6 +559,16 @@ static void nvgrace_gpu_vfio_pci_remove(struct pci_dev *pdev) { struct nvgrace_gpu_vfio_pci_core_device *nvdev = nvgrace_gpu_drvdata(pdev); struct vfio_pci_core_device *vdev = &nvdev->core_device; +#ifdef CONFIG_MEMORY_FAILURE + struct h_node *cur; + unsigned long bkt; + struct hlist_node *tmp_node; + + hash_for_each_safe(nvdev->htbl, bkt, tmp_node, cur, node) { + hash_del(&cur->node); + vfree(cur); + } +#endif vfio_pci_core_unregister_device(vdev); vfio_put_device(&vdev->vdev); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 8d4995ada74a..290431ac2e00 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1319,7 +1319,7 @@ const struct file_operations vfio_device_fops = { .mmap = vfio_device_fops_mmap, }; -static struct vfio_device *vfio_device_from_file(struct file *file) +struct vfio_device *vfio_device_from_file(struct file *file) { struct vfio_device_file *df = file->private_data; @@ -1327,6 +1327,7 @@ static struct vfio_device *vfio_device_from_file(struct file *file) return NULL; return df->device; } +EXPORT_SYMBOL_GPL(vfio_device_from_file); /** * vfio_file_is_valid - True if the file is valid vfio file -- 2.17.1