On Thu, Sep 16, 2021 at 05:40:59PM -0600, Logan Gunthorpe wrote: > Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap > a hunk of p2pmem into userspace. > > Pages are allocated from the genalloc in bulk and their reference count > incremented. They are returned to the genalloc when the page is put. > > The VMA does not take a reference to the pages when they are inserted > with vmf_insert_mixed() (which is necessary for zone device pages) so > the backing P2P memory is stored in a structure in vm_private_data. > > A pseudo mount is used to allocate an inode for each PCI device. The > inode's address_space is used in the file doing the mmap so that all > VMAs are collected and can be unmapped if the PCI device is unbound. > After unmapping, the VMAs are iterated through and their pages are > put so the device can continue to be unbound. An active flag is used > to signal to VMAs not to allocate any further P2P memory once the > removal process starts. The flag is synchronized against concurrent > access using an RCU lock. > > The VMAs and inode will survive after the unbind of the device, but no > pages will be present in the VMA and a subsequent access will result > in a SIGBUS error. > > Signed-off-by: Logan Gunthorpe <logang@xxxxxxxxxxxx> Acked-by: Bjorn Helgaas <bhelgaas@xxxxxxxxxx> I would capitalize "Introduce" in the subject line. 
> --- > drivers/pci/p2pdma.c | 263 ++++++++++++++++++++++++++++++++++++- > include/linux/pci-p2pdma.h | 11 ++ > include/uapi/linux/magic.h | 1 + > 3 files changed, 273 insertions(+), 2 deletions(-) > > diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c > index 2422af5a529c..a5adf57af53a 100644 > --- a/drivers/pci/p2pdma.c > +++ b/drivers/pci/p2pdma.c > @@ -16,14 +16,19 @@ > #include <linux/genalloc.h> > #include <linux/memremap.h> > #include <linux/percpu-refcount.h> > +#include <linux/pfn_t.h> > +#include <linux/pseudo_fs.h> > #include <linux/random.h> > #include <linux/seq_buf.h> > #include <linux/xarray.h> > +#include <uapi/linux/magic.h> > > struct pci_p2pdma { > struct gen_pool *pool; > bool p2pmem_published; > struct xarray map_types; > + struct inode *inode; > + bool active; > }; > > struct pci_p2pdma_pagemap { > @@ -32,6 +37,14 @@ struct pci_p2pdma_pagemap { > u64 bus_offset; > }; > > +struct pci_p2pdma_map { > + struct kref ref; > + struct pci_dev *pdev; > + struct inode *inode; > + void *kaddr; > + size_t len; > +}; > + > static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) > { > return container_of(pgmap, struct pci_p2pdma_pagemap, pgmap); > @@ -100,6 +113,26 @@ static const struct attribute_group p2pmem_group = { > .name = "p2pmem", > }; > > +/* > + * P2PDMA internal mount > + * Fake an internal VFS mount-point in order to allocate struct address_space > + * mappings to remove VMAs on unbind events. > + */ > +static int pci_p2pdma_fs_cnt; > +static struct vfsmount *pci_p2pdma_fs_mnt; > + > +static int pci_p2pdma_fs_init_fs_context(struct fs_context *fc) > +{ > + return init_pseudo(fc, P2PDMA_MAGIC) ? 
0 : -ENOMEM; > +} > + > +static struct file_system_type pci_p2pdma_fs_type = { > + .name = "p2dma", > + .owner = THIS_MODULE, > + .init_fs_context = pci_p2pdma_fs_init_fs_context, > + .kill_sb = kill_anon_super, > +}; > + > static void p2pdma_page_free(struct page *page) > { > struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap); > @@ -128,6 +161,9 @@ static void pci_p2pdma_release(void *data) > gen_pool_destroy(p2pdma->pool); > sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group); > xa_destroy(&p2pdma->map_types); > + > + iput(p2pdma->inode); > + simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt); > } > > static int pci_p2pdma_setup(struct pci_dev *pdev) > @@ -145,17 +181,32 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) > if (!p2p->pool) > goto out; > > - error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); > + error = simple_pin_fs(&pci_p2pdma_fs_type, &pci_p2pdma_fs_mnt, > + &pci_p2pdma_fs_cnt); > if (error) > goto out_pool_destroy; > > + p2p->inode = alloc_anon_inode(pci_p2pdma_fs_mnt->mnt_sb); > + if (IS_ERR(p2p->inode)) { > + error = -ENOMEM; > + goto out_unpin_fs; > + } > + > + error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); > + if (error) > + goto out_put_inode; > + > error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); > if (error) > - goto out_pool_destroy; > + goto out_put_inode; > > rcu_assign_pointer(pdev->p2pdma, p2p); > return 0; > > +out_put_inode: > + iput(p2p->inode); > +out_unpin_fs: > + simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt); > out_pool_destroy: > gen_pool_destroy(p2p->pool); > out: > @@ -163,6 +214,45 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) > return error; > } > > +static void pci_p2pdma_map_free_pages(struct pci_p2pdma_map *pmap) > +{ > + int i; > + > + if (!pmap->kaddr) > + return; > + > + for (i = 0; i < pmap->len; i += PAGE_SIZE) > + put_page(virt_to_page(pmap->kaddr + i)); > + > + pmap->kaddr = NULL; > +} > + > +static void 
pci_p2pdma_free_mappings(struct address_space *mapping) > +{ > + struct vm_area_struct *vma; > + > + i_mmap_lock_write(mapping); > + if (RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) > + goto out; > + > + vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, -1) > + pci_p2pdma_map_free_pages(vma->vm_private_data); > + > +out: > + i_mmap_unlock_write(mapping); > +} > + > +static void pci_p2pdma_unmap_mappings(void *data) > +{ > + struct pci_dev *pdev = data; > + struct pci_p2pdma *p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); > + > + p2pdma->active = false; > + synchronize_rcu(); > + unmap_mapping_range(p2pdma->inode->i_mapping, 0, 0, 1); > + pci_p2pdma_free_mappings(p2pdma->inode->i_mapping); > +} > + > /** > * pci_p2pdma_add_resource - add memory for use as p2p memory > * @pdev: the device to add the memory to > @@ -221,6 +311,11 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, > goto pgmap_free; > } > > + error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, > + pdev); > + if (error) > + goto pages_free; > + > p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); > error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr, > pci_bus_address(pdev, bar) + offset, > @@ -229,6 +324,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, > if (error) > goto pages_free; > > + p2pdma->active = true; > pci_info(pdev, "added peer-to-peer DMA memory %#llx-%#llx\n", > pgmap->range.start, pgmap->range.end); > > @@ -1029,3 +1125,166 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, > return sprintf(page, "%s\n", pci_name(p2p_dev)); > } > EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show); > + > +static struct pci_p2pdma_map *pci_p2pdma_map_alloc(struct pci_dev *pdev, > + size_t len) > +{ > + struct pci_p2pdma_map *pmap; > + > + pmap = kzalloc(sizeof(*pmap), GFP_KERNEL); > + if (!pmap) > + return NULL; > + > + kref_init(&pmap->ref); > + pmap->pdev = pci_dev_get(pdev); > + pmap->len = len; > + > + 
return pmap; > +} > + > +static void pci_p2pdma_map_free(struct kref *ref) > +{ > + struct pci_p2pdma_map *pmap = > + container_of(ref, struct pci_p2pdma_map, ref); > + > + pci_p2pdma_map_free_pages(pmap); > + pci_dev_put(pmap->pdev); > + iput(pmap->inode); > + simple_release_fs(&pci_p2pdma_fs_mnt, &pci_p2pdma_fs_cnt); > + kfree(pmap); > +} > + > +static void pci_p2pdma_vma_open(struct vm_area_struct *vma) > +{ > + struct pci_p2pdma_map *pmap = vma->vm_private_data; > + > + kref_get(&pmap->ref); > +} > + > +static void pci_p2pdma_vma_close(struct vm_area_struct *vma) > +{ > + struct pci_p2pdma_map *pmap = vma->vm_private_data; > + > + kref_put(&pmap->ref, pci_p2pdma_map_free); > +} > + > +static vm_fault_t pci_p2pdma_vma_fault(struct vm_fault *vmf) > +{ > + struct pci_p2pdma_map *pmap = vmf->vma->vm_private_data; > + struct pci_p2pdma *p2pdma; > + void *vaddr; > + pfn_t pfn; > + int i; > + > + if (!pmap->kaddr) { > + rcu_read_lock(); > + p2pdma = rcu_dereference(pmap->pdev->p2pdma); > + if (!p2pdma) > + goto err_out; > + > + if (!p2pdma->active) > + goto err_out; > + > + pmap->kaddr = (void *)gen_pool_alloc(p2pdma->pool, pmap->len); > + if (!pmap->kaddr) > + goto err_out; > + > + for (i = 0; i < pmap->len; i += PAGE_SIZE) > + get_page(virt_to_page(pmap->kaddr + i)); > + > + rcu_read_unlock(); > + } > + > + vaddr = pmap->kaddr + (vmf->pgoff << PAGE_SHIFT); > + pfn = phys_to_pfn_t(virt_to_phys(vaddr), PFN_DEV | PFN_MAP); > + > + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); > + > +err_out: > + rcu_read_unlock(); > + return VM_FAULT_SIGBUS; > +} > +static const struct vm_operations_struct pci_p2pdma_vmops = { > + .open = pci_p2pdma_vma_open, > + .close = pci_p2pdma_vma_close, > + .fault = pci_p2pdma_vma_fault, > +}; > + > +/** > + * pci_p2pdma_mmap_file_open - setup file mapping to store P2PMEM VMAs > + * @pdev: the device to allocate memory from > + * @vma: the userspace vma to map the memory to > + * > + * Set f_mapping of the file to the p2pdma inode so 
that mappings > + * are can be torn down on device unbind. > + * > + * Returns 0 on success, or a negative error code on failure > + */ > +void pci_p2pdma_mmap_file_open(struct pci_dev *pdev, struct file *file) > +{ > + struct pci_p2pdma *p2pdma; > + > + rcu_read_lock(); > + p2pdma = rcu_dereference(pdev->p2pdma); > + if (p2pdma) > + file->f_mapping = p2pdma->inode->i_mapping; > + rcu_read_unlock(); > +} > +EXPORT_SYMBOL_GPL(pci_p2pdma_mmap_file_open); > + > +/** > + * pci_mmap_p2pmem - setup an mmap region to be backed with P2PDMA memory > + * that was registered with pci_p2pdma_add_resource() > + * @pdev: the device to allocate memory from > + * @vma: the userspace vma to map the memory to > + * > + * The file must call pci_p2pdma_mmap_file_open() in its open() operation. > + * > + * Returns 0 on success, or a negative error code on failure > + */ > +int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma) > +{ > + struct pci_p2pdma_map *pmap; > + struct pci_p2pdma *p2pdma; > + int ret; > + > + /* prevent private mappings from being established */ > + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { > + pci_info_ratelimited(pdev, > + "%s: fail, attempted private mapping\n", > + current->comm); > + return -EINVAL; > + } > + > + pmap = pci_p2pdma_map_alloc(pdev, vma->vm_end - vma->vm_start); > + if (!pmap) > + return -ENOMEM; > + > + rcu_read_lock(); > + p2pdma = rcu_dereference(pdev->p2pdma); > + if (!p2pdma) { > + ret = -ENODEV; > + goto out; > + } > + > + ret = simple_pin_fs(&pci_p2pdma_fs_type, &pci_p2pdma_fs_mnt, > + &pci_p2pdma_fs_cnt); > + if (ret) > + goto out; > + > + ihold(p2pdma->inode); > + pmap->inode = p2pdma->inode; > + rcu_read_unlock(); > + > + vma->vm_flags |= VM_MIXEDMAP; > + vma->vm_private_data = pmap; > + vma->vm_ops = &pci_p2pdma_vmops; > + > + return 0; > + > +out: > + rcu_read_unlock(); > + kfree(pmap); > + return ret; > +} > +EXPORT_SYMBOL_GPL(pci_mmap_p2pmem); > diff --git a/include/linux/pci-p2pdma.h 
b/include/linux/pci-p2pdma.h > index 0c33a40a86e7..f9f19f3db676 100644 > --- a/include/linux/pci-p2pdma.h > +++ b/include/linux/pci-p2pdma.h > @@ -81,6 +81,8 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, > bool *use_p2pdma); > ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, > bool use_p2pdma); > +void pci_p2pdma_mmap_file_open(struct pci_dev *pdev, struct file *file); > +int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma); > #else /* CONFIG_PCI_P2PDMA */ > static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, > size_t size, u64 offset) > @@ -152,6 +154,15 @@ static inline ssize_t pci_p2pdma_enable_show(char *page, > { > return sprintf(page, "none\n"); > } > +static inline void pci_p2pdma_mmap_file_open(struct pci_dev *pdev, > + struct file *file) > +{ > +} > +static inline int pci_mmap_p2pmem(struct pci_dev *pdev, > + struct vm_area_struct *vma) > +{ > + return -EOPNOTSUPP; > +} > #endif /* CONFIG_PCI_P2PDMA */ > > > diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h > index 35687dcb1a42..af737842c56f 100644 > --- a/include/uapi/linux/magic.h > +++ b/include/uapi/linux/magic.h > @@ -88,6 +88,7 @@ > #define BPF_FS_MAGIC 0xcafe4a11 > #define AAFS_MAGIC 0x5a3c69f0 > #define ZONEFS_MAGIC 0x5a4f4653 > +#define P2PDMA_MAGIC 0x70327064 > > /* Since UDF 2.01 is ISO 13346 based... */ > #define UDF_SUPER_MAGIC 0x15013346 > -- > 2.30.2 >