Re: [PATCH kernel] vfio-pci: Allow mapping MSIX BAR

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Dec 08, 2017 at 03:18:29PM +1100, Alexey Kardashevskiy wrote:
> By default VFIO disables mapping of MSIX BAR to the userspace as
> the userspace may program it in a way allowing spurious interrupts;
> instead the userspace uses the VFIO_DEVICE_SET_IRQS ioctl.
> In order to eliminate guessing from the userspace about what is
> mmapable, VFIO also advertises a sparse list of regions allowed to mmap.
> 
> This works fine as long as the system page size equals to the MSIX
> alignment requirement which is 4KB. However with a bigger page size
> the existing code prohibits mapping non-MSIX parts of a page with MSIX
> structures so these parts have to be emulated via slow reads/writes on
> a VFIO device fd. If these emulated bits are accessed often, this has
> serious impact on performance.
> 
> This allows mmap of the entire BAR containing MSIX vector table.
> 
> This removes the sparse capability for PCI devices as it becomes useless.
> 
> As the userspace needs to know for sure whether mmapping of the MSIX
> vector containing data can succeed, this adds a new capability -
> VFIO_REGION_INFO_CAP_MSIX_MAPPABLE - which explicitly tells the userspace
> that the entire BAR can be mmapped.
> 
> This does not touch the MSIX mangling in the BAR read/write handlers as
> we are doing this just to enable direct access to non MSIX registers.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx>

This seems reasonable, except for the interface issue that Alex
pointed out already.

TBH, I'm not really sold on the need for a new kernel cap, rather than
just having qemu attempt the mamp() and fall back if it fails.  But
I'm not really fussed either way.

> ---
>  include/uapi/linux/vfio.h   | 10 +++++++
>  drivers/vfio/pci/vfio_pci.c | 65 +++++++--------------------------------------
>  drivers/vfio/vfio.c         |  9 +++++++
>  3 files changed, 29 insertions(+), 55 deletions(-)
> 
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index ae46105..0fb4407 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -300,6 +300,16 @@ struct vfio_region_info_cap_type {
>  #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG	(2)
>  #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
>  
> +/*
> + * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
> + * which allows direct access to non-MSIX registers which happened to be within
> + * the same system page.
> + *
> + * Even though the userspace gets direct access to the MSIX data, the existing
> + * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration.
> + */
> +#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE	3
> +
>  /**
>   * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
>   *				    struct vfio_irq_info)
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index a98681d..9bcc1e1 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -565,46 +565,16 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>  	return walk.ret;
>  }
>  
> -static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev,
> -				struct vfio_info_cap *caps)
> +static int msix_sparse_msix_mmappable_cap(struct vfio_pci_device *vdev,
> +		struct vfio_info_cap *caps)
>  {
> -	struct vfio_region_info_cap_sparse_mmap *sparse;
> -	size_t end, size;
> -	int nr_areas = 2, i = 0, ret;
> +	struct vfio_info_cap_header header = {
> +		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
> +		.version = 1
> +	};
>  
> -	end = pci_resource_len(vdev->pdev, vdev->msix_bar);
> -
> -	/* If MSI-X table is aligned to the start or end, only one area */
> -	if (((vdev->msix_offset & PAGE_MASK) == 0) ||
> -	    (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) >= end))
> -		nr_areas = 1;
> -
> -	size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas));
> -
> -	sparse = kzalloc(size, GFP_KERNEL);
> -	if (!sparse)
> -		return -ENOMEM;
> -
> -	sparse->nr_areas = nr_areas;
> -
> -	if (vdev->msix_offset & PAGE_MASK) {
> -		sparse->areas[i].offset = 0;
> -		sparse->areas[i].size = vdev->msix_offset & PAGE_MASK;
> -		i++;
> -	}
> -
> -	if (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) < end) {
> -		sparse->areas[i].offset = PAGE_ALIGN(vdev->msix_offset +
> -						     vdev->msix_size);
> -		sparse->areas[i].size = end - sparse->areas[i].offset;
> -		i++;
> -	}
> -
> -	ret = vfio_info_add_capability(caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP,
> -				       sparse);
> -	kfree(sparse);
> -
> -	return ret;
> +	return vfio_info_add_capability(caps,
> +			VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, &header);
>  }
>  
>  int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
> @@ -695,7 +665,8 @@ static long vfio_pci_ioctl(void *device_data,
>  			if (vdev->bar_mmap_supported[info.index]) {
>  				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
>  				if (info.index == vdev->msix_bar) {
> -					ret = msix_sparse_mmap_cap(vdev, &caps);
> +					ret = msix_sparse_msix_mmappable_cap(
> +							vdev, &caps);
>  					if (ret)
>  						return ret;
>  				}
> @@ -1125,22 +1096,6 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
>  	if (req_start + req_len > phys_len)
>  		return -EINVAL;
>  
> -	if (index == vdev->msix_bar) {
> -		/*
> -		 * Disallow mmaps overlapping the MSI-X table; users don't
> -		 * get to touch this directly.  We could find somewhere
> -		 * else to map the overlap, but page granularity is only
> -		 * a recommendation, not a requirement, so the user needs
> -		 * to know which bits are real.  Requiring them to mmap
> -		 * around the table makes that clear.
> -		 */
> -
> -		/* If neither entirely above nor below, then it overlaps */
> -		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
> -		      req_start + req_len <= vdev->msix_offset))
> -			return -EINVAL;
> -	}
> -
>  	/*
>  	 * Even though we don't make use of the barmap for the mmap,
>  	 * we need to request the region and the barmap tracks that.
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index f5a86f6..376b4e7 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1898,6 +1898,7 @@ int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
>  			     void *cap_type)
>  {
>  	int ret = -EINVAL;
> +	struct vfio_info_cap_header *header, *cap;
>  
>  	if (!cap_type)
>  		return 0;
> @@ -1910,6 +1911,14 @@ int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
>  	case VFIO_REGION_INFO_CAP_TYPE:
>  		ret = region_type_cap(caps, cap_type);
>  		break;
> +
> +	case VFIO_REGION_INFO_CAP_MSIX_MAPPABLE:
> +		cap = cap_type;
> +		header = vfio_info_cap_add(caps, sizeof(*header), cap->id,
> +				cap->version);
> +		if (IS_ERR(header))
> +			return PTR_ERR(header);
> +		return 0;
>  	}
>  
>  	return ret;

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: signature.asc
Description: PGP signature


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux