Re: [RFC 05/13] vfio/cxl: expose CXL region to the userspace via a new VFIO device region

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, 20 Sep 2024 15:34:38 -0700
Zhi Wang <zhiw@xxxxxxxxxx> wrote:

> To directly access the device memory, a CXL region is required. Creating
> a CXL region requires configuring HDM decoders on the path to map the
> HPA access level by level and eventually hit the DPA in the CXL
> topology.
> 
> For the userspace, e.g. QEMU, to access the CXL region, the region is
> required to be exposed via VFIO interfaces.
> 
> Introduce a new VFIO device region and region ops to expose the created
> CXL region when initializing the device in the vfio-cxl-core. Introduce a
> new sub region type for the userspace to identify a CXL region.
> 
> Signed-off-by: Zhi Wang <zhiw@xxxxxxxxxx>
> ---
>  drivers/vfio/pci/vfio_cxl_core.c   | 140 ++++++++++++++++++++++++++++-
>  drivers/vfio/pci/vfio_pci_config.c |   1 +
>  include/linux/vfio_pci_core.h      |   1 +
>  include/uapi/linux/vfio.h          |   3 +
>  4 files changed, 144 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/vfio/pci/vfio_cxl_core.c b/drivers/vfio/pci/vfio_cxl_core.c
> index 6a7859333f67..ffc15fd94b22 100644
> --- a/drivers/vfio/pci/vfio_cxl_core.c
> +++ b/drivers/vfio/pci/vfio_cxl_core.c
> @@ -102,6 +102,13 @@ static int create_cxl_region(struct vfio_pci_core_device *core_dev)
>  	cxl_accel_get_region_params(cxl->region.region, &start, &end);
>  
>  	cxl->region.addr = start;
> +	cxl->region.vaddr = ioremap(start, end - start);
> +	if (!cxl->region.addr) {

This is testing addr, not the newly added vaddr.

> +		pci_err(pdev, "Fail to map CXL region\n");
> +		cxl_region_detach(cxl->cxled);
> +		cxl_dpa_free(cxl->cxled);
> +		goto out;
> +	}
>  out:
>  	cxl_release_endpoint(cxl->cxlmd, cxl->endpoint);
>  	return ret;
> @@ -152,17 +159,135 @@ static void disable_cxl(struct vfio_pci_core_device *core_dev)
>  {
>  	struct vfio_cxl *cxl = &core_dev->cxl;
>  
> -	if (cxl->region.region)
> +	if (cxl->region.region) {
> +		iounmap(cxl->region.vaddr);
>  		cxl_region_detach(cxl->cxled);
> +	}
>  
>  	if (cxl->cxled)
>  		cxl_dpa_free(cxl->cxled);
>  }
>  
> +static unsigned long vma_to_pfn(struct vm_area_struct *vma)
> +{
> +	struct vfio_pci_core_device *vdev = vma->vm_private_data;
> +	struct vfio_cxl *cxl = &vdev->cxl;
> +	u64 pgoff;
> +
> +	pgoff = vma->vm_pgoff &
> +		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> +
> +	return (cxl->region.addr >> PAGE_SHIFT) + pgoff;
> +}
> +
> +static vm_fault_t vfio_cxl_mmap_fault(struct vm_fault *vmf)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct vfio_pci_core_device *vdev = vma->vm_private_data;
> +	unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
> +	unsigned long addr = vma->vm_start;
> +	vm_fault_t ret = VM_FAULT_SIGBUS;
> +
> +	pfn = vma_to_pfn(vma);
> +
> +	down_read(&vdev->memory_lock);
> +
> +	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
> +		goto out_unlock;
> +
> +	ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
> +	if (ret & VM_FAULT_ERROR)
> +		goto out_unlock;
> +
> +	for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) {
> +		if (addr == vmf->address)
> +			continue;
> +
> +		if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
> +			break;
> +	}
> +
> +out_unlock:
> +	up_read(&vdev->memory_lock);
> +
> +	return ret;
> +}

This is identical to vfio_pci_mmap_fault(), we should figure out how to
reuse it rather than duplicate it.

> +
> +static const struct vm_operations_struct vfio_cxl_mmap_ops = {
> +	.fault = vfio_cxl_mmap_fault,
> +};

This should make use of the huge_fault support similar to what we added
in vfio-pci too, right?

> +
> +static int vfio_cxl_region_mmap(struct vfio_pci_core_device *core_dev,
> +				struct vfio_pci_region *region,
> +				struct vm_area_struct *vma)
> +{
> +	struct vfio_cxl *cxl = &core_dev->cxl;
> +	u64 phys_len, req_len, pgoff, req_start;
> +
> +	if (!(region->flags & VFIO_REGION_INFO_FLAG_MMAP))
> +		return -EINVAL;
> +
> +	if (!(region->flags & VFIO_REGION_INFO_FLAG_READ) &&
> +	    (vma->vm_flags & VM_READ))
> +		return -EPERM;
> +
> +	if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE) &&
> +	    (vma->vm_flags & VM_WRITE))
> +		return -EPERM;
> +
> +	phys_len = cxl->region.size;
> +	req_len = vma->vm_end - vma->vm_start;
> +	pgoff = vma->vm_pgoff &
> +		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> +	req_start = pgoff << PAGE_SHIFT;
> +
> +	if (req_start + req_len > phys_len)
> +		return -EINVAL;
> +
> +	vma->vm_private_data = core_dev;
> +	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

I thought a large part of CXL was that the memory is coherent, maybe I
don't understand what we're providing access to here.

> +	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
> +
> +	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
> +			VM_DONTEXPAND | VM_DONTDUMP);
> +	vma->vm_ops = &vfio_cxl_mmap_ops;
> +
> +	return 0;
> +}
> +
> +static ssize_t vfio_cxl_region_rw(struct vfio_pci_core_device *core_dev,
> +				  char __user *buf, size_t count, loff_t *ppos,
> +				  bool iswrite)
> +{
> +	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
> +	struct vfio_cxl_region *cxl_region = core_dev->region[i].data;
> +	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
> +
> +	if (!count)
> +		return 0;
> +
> +	return vfio_pci_core_do_io_rw(core_dev, false,
> +				      cxl_region->vaddr,
> +				      (char __user *)buf, pos, count,
> +				      0, 0, iswrite);
> +}
> +
> +static void vfio_cxl_region_release(struct vfio_pci_core_device *vdev,
> +				    struct vfio_pci_region *region)
> +{
> +}
> +
> +static const struct vfio_pci_regops vfio_cxl_regops = {
> +	.rw		= vfio_cxl_region_rw,
> +	.mmap		= vfio_cxl_region_mmap,
> +	.release	= vfio_cxl_region_release,
> +};
> +
>  int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
>  {
>  	struct vfio_cxl *cxl = &core_dev->cxl;
>  	struct pci_dev *pdev = core_dev->pdev;
> +	u32 flags;
>  	u16 dvsec;
>  	int ret;
>  
> @@ -182,8 +307,21 @@ int vfio_cxl_core_enable(struct vfio_pci_core_device *core_dev)
>  	if (ret)
>  		goto err_enable_cxl_device;
>  
> +	flags = VFIO_REGION_INFO_FLAG_READ |
> +		VFIO_REGION_INFO_FLAG_WRITE |
> +		VFIO_REGION_INFO_FLAG_MMAP;
> +
> +	ret = vfio_pci_core_register_dev_region(core_dev,
> +		PCI_VENDOR_ID_CXL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
> +		VFIO_REGION_SUBTYPE_CXL, &vfio_cxl_regops,
> +		cxl->region.size, flags, &cxl->region);
> +	if (ret)
> +		goto err_register_cxl_region;
> +
>  	return 0;
>  
> +err_register_cxl_region:
> +	disable_cxl(core_dev);
>  err_enable_cxl_device:
>  	vfio_pci_core_disable(core_dev);
>  	return ret;
> diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
> index 97422aafaa7b..98f3ac2d305c 100644
> --- a/drivers/vfio/pci/vfio_pci_config.c
> +++ b/drivers/vfio/pci/vfio_pci_config.c
> @@ -412,6 +412,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev)
>  	return pdev->current_state < PCI_D3hot &&
>  	       (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY));
>  }
> +EXPORT_SYMBOL(__vfio_pci_memory_enabled);

_GPL

There should also be a lockdep assert added if this is exported.  With
that it could also drop the underscore prefix.

>  
>  /*
>   * Restore the *real* BARs after we detect a FLR or backdoor reset.
> diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
> index 7762d4a3e825..6523d9d1bffe 100644
> --- a/include/linux/vfio_pci_core.h
> +++ b/include/linux/vfio_pci_core.h
> @@ -54,6 +54,7 @@ struct vfio_pci_region {
>  struct vfio_cxl_region {
>  	u64 size;
>  	u64 addr;
> +	void *vaddr;
>  	struct cxl_region *region;
>  };
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 2b68e6cdf190..71f766c29060 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -372,6 +372,9 @@ struct vfio_region_info_cap_type {
>  /* sub-types for VFIO_REGION_TYPE_GFX */
>  #define VFIO_REGION_SUBTYPE_GFX_EDID            (1)
>  
> +/* sub-types for VFIO CXL region */
> +#define VFIO_REGION_SUBTYPE_CXL                 (1)

This is a PCI vendor sub-type with vendor ID 0x1e98.  The comment
should reflect that.  Thanks,

Alex

> +
>  /**
>   * struct vfio_region_gfx_edid - EDID region layout.
>   *





[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux