On Fri, 20 Sep 2024 15:34:38 -0700
Zhi Wang <zhiw@xxxxxxxxxx> wrote:

> To directly access the device memory, a CXL region is required. Creating
> a CXL region requires configuring the HDM decoders on the path, mapping
> the HPA access level by level until it eventually hits the DPA in the
> CXL topology.
>
> For userspace, e.g. QEMU, to access the CXL region, the region needs to
> be exposed via the VFIO interfaces.
>
> Introduce a new VFIO device region and region ops to expose the created
> CXL region when initializing the device in the vfio-cxl-core. Introduce
> a new sub-region type for userspace to identify a CXL region.
>
> Signed-off-by: Zhi Wang <zhiw@xxxxxxxxxx>
> ---
>  drivers/vfio/pci/vfio_cxl_core.c   | 140 ++++++++++++++++++++++++++++-
>  drivers/vfio/pci/vfio_pci_config.c |   1 +
>  include/linux/vfio_pci_core.h      |   1 +
>  include/uapi/linux/vfio.h          |   3 +
>  4 files changed, 144 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vfio/pci/vfio_cxl_core.c b/drivers/vfio/pci/vfio_cxl_core.c
> index 6a7859333f67..ffc15fd94b22 100644
> --- a/drivers/vfio/pci/vfio_cxl_core.c
> +++ b/drivers/vfio/pci/vfio_cxl_core.c
> @@ -102,6 +102,13 @@ static int create_cxl_region(struct vfio_pci_core_device *core_dev)
>  	cxl_accel_get_region_params(cxl->region.region, &start, &end);
>  
>  	cxl->region.addr = start;
> +	cxl->region.vaddr = ioremap(start, end - start);
> +	if (!cxl->region.addr) {

This is testing addr, not the newly added vaddr.

> +		pci_err(pdev, "Fail to map CXL region\n");
> +		cxl_region_detach(cxl->cxled);
> +		cxl_dpa_free(cxl->cxled);
> +		goto out;
> +	}
>  out:
>  	cxl_release_endpoint(cxl->cxlmd, cxl->endpoint);
>  	return ret;
> @@ -152,17 +159,135 @@ static void disable_cxl(struct vfio_pci_core_device *core_dev)
>  {
>  	struct vfio_cxl *cxl = &core_dev->cxl;
>  
> -	if (cxl->region.region)
> +	if (cxl->region.region) {
> +		iounmap(cxl->region.vaddr);
>  		cxl_region_detach(cxl->cxled);
> +	}
>  
>  	if (cxl->cxled)
>  		cxl_dpa_free(cxl->cxled);
>  }
>  
> +static unsigned long vma_to_pfn(struct vm_area_struct *vma)
> +{
> +	struct vfio_pci_core_device *vdev = vma->vm_private_data;
> +	struct vfio_cxl *cxl = &vdev->cxl;
> +	u64 pgoff;
> +
> +	pgoff = vma->vm_pgoff &
> +		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> +
> +	return (cxl->region.addr >> PAGE_SHIFT) + pgoff;
> +}
> +
> +static vm_fault_t vfio_cxl_mmap_fault(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct vfio_pci_core_device *vdev = vma->vm_private_data;
> +	unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
> +	unsigned long addr = vma->vm_start;
> +	vm_fault_t ret = VM_FAULT_SIGBUS;
> +
> +	pfn = vma_to_pfn(vma);
> +
> +	down_read(&vdev->memory_lock);
> +
> +	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
> +		goto out_unlock;
> +
> +	ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
> +	if (ret & VM_FAULT_ERROR)
> +		goto out_unlock;
> +
> +	for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) {
> +		if (addr == vmf->address)
> +			continue;
> +
> +		if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
> +			break;
> +	}
> +
> +out_unlock:
> +	up_read(&vdev->memory_lock);
> +
> +	return ret;
> +}

This is identical to vfio_pci_mmap_fault(); we should figure out how to
reuse it rather than duplicate it.  A sketch follows below.

> +
> +static const struct vm_operations_struct vfio_cxl_mmap_ops = {
> +	.fault = vfio_cxl_mmap_fault,
> +};

This should also make use of the huge_fault support, similar to what we
added in vfio-pci, right?
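
Roughly (untested, and the vfio_pci_core_mmap_fault() /
vfio_pci_core_mmap_huge_fault() exports below are hypothetical, the current
handlers are static in vfio_pci_core.c), the core versions could take the
base pfn from the caller, so that vfio-cxl only keeps its own vma_to_pfn():

/* vfio_pci_core.h: hypothetical exports of the existing fault logic,
 * taking the base pfn from the caller instead of deriving it from the
 * BAR inside the handler: */
vm_fault_t vfio_pci_core_mmap_fault(struct vm_fault *vmf,
                                    unsigned long base_pfn);
vm_fault_t vfio_pci_core_mmap_huge_fault(struct vm_fault *vmf,
                                         unsigned int order,
                                         unsigned long base_pfn);

/* vfio_cxl_core.c: the duplicated handler reduces to a thin wrapper
 * and picks up the huge_fault path along the way: */
static vm_fault_t vfio_cxl_mmap_fault(struct vm_fault *vmf)
{
        return vfio_pci_core_mmap_fault(vmf, vma_to_pfn(vmf->vma));
}

#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
static vm_fault_t vfio_cxl_mmap_huge_fault(struct vm_fault *vmf,
                                           unsigned int order)
{
        return vfio_pci_core_mmap_huge_fault(vmf, order,
                                             vma_to_pfn(vmf->vma));
}
#endif

static const struct vm_operations_struct vfio_cxl_mmap_ops = {
        .fault = vfio_cxl_mmap_fault,
#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
        .huge_fault = vfio_cxl_mmap_huge_fault,
#endif
};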
> +
> +static int vfio_cxl_region_mmap(struct vfio_pci_core_device *core_dev,
> +				struct vfio_pci_region *region,
> +				struct vm_area_struct *vma)
> +{
> +	struct vfio_cxl *cxl = &core_dev->cxl;
> +	u64 phys_len, req_len, pgoff, req_start;
> +
> +	if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
> +		return -EINVAL;
> +
> +	if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) &&
> +	    (vma->vm_flags & VM_READ))
> +		return -EPERM;
> +
> +	if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) &&
> +	    (vma->vm_flags & VM_WRITE))
> +		return -EPERM;
> +
> +	phys_len = cxl->region.size;
> +	req_len = vma->vm_end - vma->vm_start;
> +	pgoff = vma->vm_pgoff &
> +		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> +	req_start = pgoff << PAGE_SHIFT;
> +
> +	if (req_start + req_len > phys_len)
> +		return -EINVAL;
> +
> +	vma->vm_private_data = core_dev;
> +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

I thought a large part of CXL was that the memory is coherent; maybe I
don't understand what we're providing access to here.

> +	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
> +
> +	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
> +			  VM_DONTEXPAND | VM_DONTDUMP);
> +	vma->vm_ops = &vfio_cxl_mmap_ops;
> +
> +	return 0;
> +}
> +
> +static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
> +				  char __user *buf, size_t count, loff_t *ppos,
> +				  bool iswrite)
> +{
> +	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
> +	struct vfio_cxl_region *cxl_region = core_dev->region[i].data;
> +	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
> +
> +	if (!count)
> +		return 0;
> +
> +	return vfio_pci_core_do_io_rw(core_dev, false,
> +				      cxl_region->vaddr,
> +				      (char __user *)buf, pos, count,
> +				      0, 0, iswrite);
> +}
> +
> +static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
> +				    struct vfio_pci_region *region)
> +{
> +}
> +
> +static const struct vfio_pci_regops vfio_cxl_regops = {
> +	.rw = vfio_cxl_region_rw,
> +	.mmap = vfio_cxl_region_mmap,
> +	.release = vfio_cxl_region_release,
> +};
> +
>  int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
>  {
>  	struct vfio_cxl *cxl = &core_dev->cxl;
>  	struct pci_dev *pdev = core_dev->pdev;
> +	u32 flags;
>  	u16 dvsec;
>  	int ret;
>  
> @@ -182,8 +307,21 @@ int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
>  	if (ret)
>  		goto err_enable_cxl_device;
>  
> +	flags = VFIO_REGION_INFO_FLAG_READ |
> +		VFIO_REGION_INFO_FLAG_WRITE |
> +		VFIO_REGION_INFO_FLAG_MMAP;
> +
> +	ret = vfio_pci_core_register_dev_region(core_dev,
> +			PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
> +			VFIO_REGION_SUBTYPE_CXL, &vfio_cxl_regops,
> +			cxl->region.size, flags, &cxl->region);
> +	if (ret)
> +		goto err_register_cxl_region;
> +
>  	return 0;
>  
> +err_register_cxl_region:
> +	disable_cxl(core_dev);
>  err_enable_cxl_device:
>  	vfio_pci_core_disable(core_dev);
>  	return ret;
> diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
> index 97422aafaa7b..98f3ac2d305c 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -412,6 +412,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
>  	return pdev->current_state < PCI_D3hot &&
>  	       (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
>  }
> +EXPORT_SYMBOL(__vfio_pci_memory_enabled);

This should be EXPORT_SYMBOL_GPL().  There should also be a lockdep assert
added if this is exported.  With that, it could also drop the underscore
prefix.
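
I.e., something like the below (untested; the body and the command register
read are unchanged from the current function, only the rename, the assert,
and the _GPL export are new):

bool vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
{
        struct pci_dev *pdev = vdev->pdev;
        u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]);

        /* Callers must hold memory_lock, for read or write. */
        lockdep_assert_held(&vdev->memory_lock);

        return pdev->current_state < PCI_D3hot &&
               (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
}
EXPORT_SYMBOL_GPL(vfio_pci_memory_enabled);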
>  
>  /*
>   * Restore the *real* BARs after we detect a FLR or backdoor reset.
> diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
> index 7762d4a3e825..6523d9d1bffe 100644
> --- a/include/linux/vfio_pci_core.h
> +++ b/include/linux/vfio_pci_core.h
> @@ -54,6 +54,7 @@ struct vfio_pci_region {
>  struct vfio_cxl_region {
>  	u64 size;
>  	u64 addr;
> +	void *vaddr;
>  	struct cxl_region *region;
>  };
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 2b68e6cdf190..71f766c29060 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -372,6 +372,9 @@ struct vfio_region_info_cap_type {
>  /* sub-types for VFIO_REGION_TYPE_GFX */
>  #define VFIO_REGION_SUBTYPE_GFX_EDID (1)
>  
> +/* sub-types for VFIO CXL region */
> +#define VFIO_REGION_SUBTYPE_CXL (1)

This is a PCI vendor sub-type with vendor ID 0x1e98.  The comment should
reflect that.

Thanks,
Alex

> +
>  /**
>   * struct vfio_region_gfx_edid - EDID region layout.
>   *