Re: [PATCH v3] vfio/pci: Add OpRegion 2.0+ Extended VBT support.

On Thu,  9 Sep 2021 13:09:34 +0800
Colin Xu <colin.xu@xxxxxxxxx> wrote:

> For historical reasons, some legacy shipped systems don't follow the
> OpRegion 2.1 spec and still stick to OpRegion 2.0, in which the
> extended VBT is not contiguous after the OpRegion in physical address
> space, but sits at an arbitrary location pointed to by RVDA as an
> absolute address. And although on current OpRegion 2.1+ systems the
> extended VBT appears to follow the OpRegion, with RVDA being an
> address relative to the OpRegion head, the extended VBT location may
> change so that it is no longer contiguous with the OpRegion. In both
> cases, it's impossible to map a contiguous range holding both the
> OpRegion and the extended VBT and expose it via one vfio region.
> 
> The only difference between OpRegion 2.0 and 2.1 is where the
> extended VBT is stored: for 2.0, RVDA is the absolute address of the
> extended VBT, while for 2.1, RVDA is the address of the extended VBT
> relative to the OpRegion base; there is no other difference between
> OpRegion 2.0 and 2.1. To support the non-contiguous cases described
> above, the updated read op patches the OpRegion version and RVDA on
> the fly, so that from the vfio igd OpRegion point of view, only 2.1+
> with the extended VBT contiguous after the OpRegion is exposed,
> regardless of whether the underlying host OpRegion is 2.0 or 2.1+.
> This mechanism makes it possible to support the legacy OpRegion 2.0
> extended VBT systems on the market, as well as OpRegion 2.1+ systems
> where the extended VBT isn't contiguous after the OpRegion.
> Also split the write op from the read op to leave flexibility for
> OpRegion write support in the future.
> 
> V2:
> Validate RVDA for 2.1+ before increasing total size. (Alex)
> 
> V3: (Alex)
> Split read and write ops.
> On-the-fly modify OpRegion version and RVDA.
> Fix sparse error on assigning a value to a cast pointer.
> 
> Cc: Zhenyu Wang <zhenyuw@xxxxxxxxxxxxxxx>
> Cc: Hang Yuan <hang.yuan@xxxxxxxxxxxxxxx>
> Cc: Swee Yee Fonn <swee.yee.fonn@xxxxxxxxx>
> Cc: Fred Gao <fred.gao@xxxxxxxxx>
> Signed-off-by: Colin Xu <colin.xu@xxxxxxxxx>
> ---
>  drivers/vfio/pci/vfio_pci_igd.c | 229 +++++++++++++++++++++++---------
>  1 file changed, 169 insertions(+), 60 deletions(-)


BTW, this does not apply on current mainline.


> diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c
> index 228df565e9bc..fd6ad80f0c5f 100644
> --- a/drivers/vfio/pci/vfio_pci_igd.c
> +++ b/drivers/vfio/pci/vfio_pci_igd.c
> @@ -25,30 +25,131 @@
>  #define OPREGION_RVDS		0x3c2
>  #define OPREGION_VERSION	0x16
>  
> -static size_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf,
> -			      size_t count, loff_t *ppos, bool iswrite)
> +struct igd_opregion_vbt {
> +	void *opregion;
> +	void *vbt_ex;

	__le16 version; // see below

> +};
> +
> +static size_t vfio_pci_igd_read(struct igd_opregion_vbt *opregionvbt,
> +				char __user *buf, size_t count, loff_t *ppos)
>  {
> -	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
> -	void *base = vdev->region[i].data;
> +	u16 version = le16_to_cpu(*(__le16 *)(opregionvbt->opregion + OPREGION_VERSION));

80 column throughout please (I know we already have some violations in
this file).
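
For instance, the version read above could be wrapped along these
lines (a whitespace-only sketch):

	u16 version = le16_to_cpu(*(__le16 *)(opregionvbt->opregion +
					      OPREGION_VERSION));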

>  	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
> +	void *base, *shadow = NULL;
>  
> -	if (pos >= vdev->region[i].size || iswrite)
> -		return -EINVAL;
> +	/* Shift into the range for reading the extended VBT only */
> +	if (pos >= OPREGION_SIZE) {
> +		base = opregionvbt->vbt_ex + pos - OPREGION_SIZE;
> +		goto done;
> +	}
>  
> -	count = min(count, (size_t)(vdev->region[i].size - pos));
> +	/* Simply read from OpRegion if the extended VBT doesn't exist */
> +	if (!opregionvbt->vbt_ex) {
> +		base = opregionvbt->opregion + pos;
> +		goto done;
> +	} else {
> +		shadow = kzalloc(count, GFP_KERNEL);
> +

I don't really see any value in this shadow buffer; I don't think we
have any requirement to fulfill the read in a single copy_to_user().
Therefore we could do something like:

	size_t remaining = count;
	loff_t off = 0;

	if (remaining && pos < OPREGION_VERSION) {
		size_t bytes = min(remaining, OPREGION_VERSION - pos);

		if (copy_to_user(buf + off, opregionvbt->opregion + pos, bytes))
			return -EFAULT;

		pos += bytes;
		off += bytes;
		remaining -= bytes;
	}

	if (remaining && pos < OPREGION_VERSION + sizeof(__le16)) {
		size_t bytes = min(remaining, OPREGION_VERSION + sizeof(__le16) - pos);

		/* reported version cached in struct igd_opregion_vbt.version */
		if (copy_to_user(buf + off, (u8 *)&opregionvbt->version +
				 (pos - OPREGION_VERSION), bytes))
			return -EFAULT;

		pos += bytes;
		off += bytes;
		remaining -= bytes;
	}

	if (remaining && pos < OPREGION_RVDA) {
		size_t bytes = min(remaining, OPREGION_RVDA - pos);

		if (copy_to_user(buf + off, opregionvbt->opregion + pos, bytes))
			return -EFAULT;

		pos += bytes;
		off += bytes;
		remaining -= bytes;
	}

	if (remaining && pos < OPREGION_RVDA + sizeof(__le64)) {
		size_t bytes = min(remaining, OPREGION_RVDA + sizeof(__le64) - pos);
		__le64 rvda = cpu_to_le64(opregionvbt->vbt_ex ? OPREGION_SIZE : 0);

		if (copy_to_user(buf + off,
				 (u8 *)&rvda + (pos - OPREGION_RVDA), bytes))
			return -EFAULT;

		pos += bytes;
		off += bytes;
		remaining -= bytes;
	}

	if (remaining && pos < OPREGION_SIZE) {
		size_t bytes = min(remaining, OPREGION_SIZE - pos);

		if (copy_to_user(buf + off, opregionvbt->opregion + pos, bytes))
			return -EFAULT;

		pos += bytes;
		off += bytes;
		remaining -= bytes;
	}

	if (remaining) {
		if (copy_to_user(buf + off,
				 opregionvbt->vbt_ex + (pos - OPREGION_SIZE),
				 remaining))
			return -EFAULT;
	}

	*ppos += count;

	return count;
		
It's tedious, but extensible and simple (and avoids the partial read
problem below).  Maybe there's a macro or helper function that'd make
it less tedious.
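
One way to make it less tedious, purely as an untested sketch, would
be a small helper that copies whichever part of a [start, end) chunk
the current read intersects and advances the cursors (igd_copy_chunk
and its signature are made up here for illustration):

static int igd_copy_chunk(char __user *buf, const void *chunk,
			  loff_t start, loff_t end, loff_t *pos,
			  loff_t *off, size_t *remaining)
{
	size_t bytes;

	/* nothing left, or the position is outside this chunk */
	if (!*remaining || *pos < start || *pos >= end)
		return 0;

	bytes = min_t(size_t, *remaining, end - *pos);

	/* chunk points at the data backing the [start, end) range */
	if (copy_to_user(buf + *off, chunk + (*pos - start), bytes))
		return -EFAULT;

	*pos += bytes;
	*off += bytes;
	*remaining -= bytes;

	return 0;
}

Each block above then collapses to a single call, e.g. the version
range would become:

	if (igd_copy_chunk(buf, &opregionvbt->version, OPREGION_VERSION,
			   OPREGION_VERSION + sizeof(__le16), &pos, &off,
			   &remaining))
		return -EFAULT;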


> +		if (!shadow)
> +			return -ENOMEM;
> +	}
>  
> -	if (copy_to_user(buf, base + pos, count))
> +	/*
> +	 * If the extended VBT exist, need shift for non-contiguous reading and
> +	 * may need patch OpRegion version (for 2.0) and RVDA (for 2.0 and above)
> +	 * Use a temporary buffer to simplify the stitch and patch
> +	 */
> +
> +	/* Either crossing OpRegion and VBT or in OpRegion range only */
> +	if (pos < OPREGION_SIZE && (pos + count) > OPREGION_SIZE) {
> +		memcpy(shadow, opregionvbt->opregion + pos, OPREGION_SIZE - pos);
> +		memcpy(shadow + OPREGION_SIZE - pos, opregionvbt->vbt_ex,
> +		       pos + count - OPREGION_SIZE);
> +	} else {
> +		memcpy(shadow, opregionvbt->opregion + pos, count);
> +	}
> +
> +	/*
> +	 * Patch OpRegion 2.0 to 2.1 if extended VBT exist and reading the version
> +	 */
> +	if (opregionvbt->vbt_ex && version == 0x0200 &&
> +	    pos <= OPREGION_VERSION && pos + count > OPREGION_VERSION) {
> +		/* May only read 1 byte minor version */
> +		if (pos + count == OPREGION_VERSION + 1)
> +			*(u8 *)(shadow + OPREGION_VERSION - pos) = (u8)0x01;
> +		else
> +			*(__le16 *)(shadow + OPREGION_VERSION - pos) = cpu_to_le16(0x0201);
> +	}
> +
> +	/*
> +	 * Patch RVDA for OpRegion 2.0 and above to make the region contiguous.
> +	 * For 2.0, the requestor always see 2.1 with RVDA as relative.
> +	 * For 2.1+, RVDA is already relative, but possibly non-contiguous
> +	 *   after OpRegion.
> +	 * In both cases, patch RVDA to OpRegion size to make the extended
> +	 * VBT follows OpRegion and show the requestor a contiguous region.
> +	 * Always fail partial RVDA reading to prevent malicious reading to offset
> +	 *   of OpRegion by construct arbitrary offset.
> +	 */
> +	if (opregionvbt->vbt_ex) {
> +		/* Full RVDA reading */
> +		if (pos <= OPREGION_RVDA && pos + count >= OPREGION_RVDA + 8) {
> +			*(__le64 *)(shadow + OPREGION_RVDA - pos) = cpu_to_le64(OPREGION_SIZE);
> +		/* Fail partial reading to avoid construct arbitrary RVDA */
> +		} else {
> +			kfree(shadow);
> +			pr_err("%s: partial RVDA reading!\n", __func__);
> +			return -EFAULT;
> +		}
> +	}
> +
> +	base = shadow;
> +
> +done:
> +	if (copy_to_user(buf, base, count))
>  		return -EFAULT;
>  
> +	kfree(shadow);
> +
>  	*ppos += count;
> 
>  	return count;
>  }
>  
> +static size_t vfio_pci_igd_write(struct igd_opregion_vbt *opregionvbt,
> +				 char __user *buf, size_t count, loff_t *ppos)
> +{
> +	// Not supported yet.
> +	return -EINVAL;
> +}
> +
> +static size_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf,
> +			      size_t count, loff_t *ppos, bool iswrite)
> +{
> +	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
> +	struct igd_opregion_vbt *opregionvbt = vdev->region[i].data;
> +	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
> +
> +	if (pos >= vdev->region[i].size)
> +		return -EINVAL;
> +
> +	count = min(count, (size_t)(vdev->region[i].size - pos));
> +
> +	return (iswrite ?
> +		vfio_pci_igd_write(opregionvbt, buf, count, ppos) :
> +		vfio_pci_igd_read(opregionvbt, buf, count, ppos));
> +}

I don't think we need to go this far towards enabling write support;
I'd roll the range and iswrite check into your _read function (rename
back to _rw()) and call it good.
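
i.e., roughly (an untested sketch):

static size_t vfio_pci_igd_rw(struct vfio_pci_device *vdev,
			      char __user *buf, size_t count, loff_t *ppos,
			      bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
	struct igd_opregion_vbt *opregionvbt = vdev->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos >= vdev->region[i].size || iswrite)
		return -EINVAL;

	count = min(count, (size_t)(vdev->region[i].size - pos));

	/* ...chunked copy_to_user() reads as sketched above... */

	*ppos += count;

	return count;
}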

> +
>  static void vfio_pci_igd_release(struct vfio_pci_device *vdev,
>  				 struct vfio_pci_region *region)
>  {
> -	memunmap(region->data);
> +	struct igd_opregion_vbt *opregionvbt = region->data;
> +
> +	if (opregionvbt->vbt_ex)
> +		memunmap(opregionvbt->vbt_ex);
> +
> +	memunmap(opregionvbt->opregion);
> +	kfree(opregionvbt);
>  }
>  
>  static const struct vfio_pci_regops vfio_pci_igd_regops = {
> @@ -60,7 +161,7 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev)
>  {
>  	__le32 *dwordp = (__le32 *)(vdev->vconfig + OPREGION_PCI_ADDR);
>  	u32 addr, size;
> -	void *base;
> +	struct igd_opregion_vbt *base;


@base doesn't seem like an appropriate name for this; it was called
opregionvbt in the function above.


>  	int ret;
>  	u16 version;
>  
> @@ -71,84 +172,92 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev)
>  	if (!addr || !(~addr))
>  		return -ENODEV;
>  
> -	base = memremap(addr, OPREGION_SIZE, MEMREMAP_WB);
> +	base = kzalloc(sizeof(*base), GFP_KERNEL);
>  	if (!base)
>  		return -ENOMEM;
>  
> -	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
> -		memunmap(base);
> +	base->opregion = memremap(addr, OPREGION_SIZE, MEMREMAP_WB);
> +	if (!base->opregion) {
> +		kfree(base);
> +		return -ENOMEM;
> +	}
> +
> +	if (memcmp(base->opregion, OPREGION_SIGNATURE, 16)) {
> +		memunmap(base->opregion);
> +		kfree(base);
>  		return -EINVAL;
>  	}
>  
> -	size = le32_to_cpu(*(__le32 *)(base + 16));
> +	size = le32_to_cpu(*(__le32 *)(base->opregion + 16));
>  	if (!size) {
> -		memunmap(base);
> +		memunmap(base->opregion);
> +		kfree(base);
>  		return -EINVAL;
>  	}
>  
>  	size *= 1024; /* In KB */
>  
>  	/*
> -	 * Support opregion v2.1+
> -	 * When VBT data exceeds 6KB size and cannot be within mailbox #4, then
> -	 * the Extended VBT region next to opregion is used to hold the VBT data.
> -	 * RVDA (Relative Address of VBT Data from Opregion Base) and RVDS
> -	 * (Raw VBT Data Size) from opregion structure member are used to hold the
> -	 * address from region base and size of VBT data. RVDA/RVDS are not
> -	 * defined before opregion 2.0.
> -	 *
> -	 * opregion 2.1+: RVDA is unsigned, relative offset from
> -	 * opregion base, and should point to the end of opregion.
> -	 * otherwise, exposing to userspace to allow read access to everything between
> -	 * the OpRegion and VBT is not safe.
> -	 * RVDS is defined as size in bytes.
> +	 * OpRegion and VBT:
> +	 * When VBT data doesn't exceed 6KB, it's stored in Mailbox #4.
> +	 * When VBT data exceeds 6KB size, Mailbox #4 is no longer large enough
> +	 * to hold the VBT data, the Extended VBT region is introduced since
> +	 * OpRegion 2.0 to hold the VBT data. Since OpRegion 2.0, RVDA/RVDS are
> +	 * introduced to define the extended VBT data location and size.
> +	 * OpRegion 2.0: RVDA defines the absolute physical address of the
> +	 *   extended VBT data, RVDS defines the VBT data size.
> +	 * OpRegion 2.1 and above: RVDA defines the relative address of the
> +	 *   extended VBT data to OpRegion base, RVDS defines the VBT data size.
>  	 *
> -	 * opregion 2.0: rvda is the physical VBT address.
> -	 * Since rvda is HPA it cannot be directly used in guest.
> -	 * And it should not be practically available for end user,so it is not supported.
> +	 * Due to the RVDA difference in OpRegion VBT (also the only diff between
> +	 * 2.0 and 2.1), expose OpRegion and VBT as a contiguous range for
> +	 * OpRegion 2.0 and above makes it possible to support the non-contiguous
> +	 * VBT via a single vfio region. From r/w ops view, only contiguous VBT
> +	 * after OpRegion with version 2.1+ is exposed regardless the underneath
> +	 * host is 2.0 or non-contiguous 2.1+. The r/w ops will on-the-fly shift
> +	 * the actural offset into VBT so that data at correct position can be
> +	 * returned to the requester.
>  	 */
> -	version = le16_to_cpu(*(__le16 *)(base + OPREGION_VERSION));
> +	version = le16_to_cpu(*(__le16 *)(base->opregion + OPREGION_VERSION));
> +

	opregionvbt->version = *(__le16 *)(opregionvbt->opregion +
					   OPREGION_VERSION);
	version = le16_to_cpu(opregionvbt->version);


>  	if (version >= 0x0200) {
> -		u64 rvda;
> -		u32 rvds;
> +		u64 rvda = le64_to_cpu(*(__le64 *)(base->opregion + OPREGION_RVDA));
> +		u32 rvds = le32_to_cpu(*(__le32 *)(base->opregion + OPREGION_RVDS));
>  
> -		rvda = le64_to_cpu(*(__le64 *)(base + OPREGION_RVDA));
> -		rvds = le32_to_cpu(*(__le32 *)(base + OPREGION_RVDS));
> +		/* The extended VBT is valid only when RVDA/RVDS are non-zero. */
>  		if (rvda && rvds) {
> -			/* no support for opregion v2.0 with physical VBT address */
> -			if (version == 0x0200) {
> -				memunmap(base);
> -				pci_err(vdev->pdev,
> -					"IGD assignment does not support opregion v2.0 with an extended VBT region\n");
> -				return -EINVAL;
> -			}
> +			size += rvds;
>  
> -			if (rvda != size) {
> -				memunmap(base);
> -				pci_err(vdev->pdev,
> -					"Extended VBT does not follow opregion on version 0x%04x\n",
> -					version);
> -				return -EINVAL;
> +			if (version == 0x0200) {
> +				/* Absolute physical address for 2.0 */


			if (version == 0x0200) {
				opregionvbt->version = cpu_to_le16(0x0201);
				addr = rvda;
			} else {
				addr += rvda;
			}

			... single memremap and error path
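
e.g., the elided single memremap and error path might look something
like this (untested, using the opregionvbt naming suggested above):

			opregionvbt->vbt_ex = memremap(addr, rvds,
						       MEMREMAP_WB);
			if (!opregionvbt->vbt_ex) {
				memunmap(opregionvbt->opregion);
				kfree(opregionvbt);
				return -ENOMEM;
			}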

Thanks,

Alex

> +				base->vbt_ex = memremap(rvda, rvds, MEMREMAP_WB);
> +				if (!base->vbt_ex) {
> +					memunmap(base->opregion);
> +					kfree(base);
> +					return -ENOMEM;
> +				}
> +			} else {
> +				/* Relative address to OpRegion header for 2.1+ */
> +				base->vbt_ex = memremap(addr + rvda, rvds, MEMREMAP_WB);
> +				if (!base->vbt_ex) {
> +					memunmap(base->opregion);
> +					kfree(base);
> +					return -ENOMEM;
> +				}
>  			}
> -
> -			/* region size for opregion v2.0+: opregion and VBT size. */
> -			size += rvds;
>  		}
>  	}
> 
> -	if (size != OPREGION_SIZE) {
> -		memunmap(base);
> -		base = memremap(addr, size, MEMREMAP_WB);
> -		if (!base)
> -			return -ENOMEM;
> -	}
> -
>  	ret = vfio_pci_register_dev_region(vdev,
>  		PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
>  		VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
>  		&vfio_pci_igd_regops, size, VFIO_REGION_INFO_FLAG_READ, base);
>  	if (ret) {
> -		memunmap(base);
> +		if (base->vbt_ex)
> +			memunmap(base->vbt_ex);
> +
> +		memunmap(base->opregion);
> +		kfree(base);
>  		return ret;
>  	}
>  



