Re: [RFC PATCH] vfio/pci: add PCIe TPH to device feature ioctl

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, 21 Feb 2025 22:46:33 +0000
Wathsala Vithanage <wathsala.vithanage@xxxxxxx> wrote:

> Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature for
> direct cache injection. As described in the relevant patch set [1],
> direct cache injection in supported hardware allows optimal platform
> resource utilization for specific requests on the PCIe bus. This feature
> is currently available only for kernel device drivers. However,
> user space applications, especially those whose performance is sensitive
> to the latency of inbound writes as seen by a CPU core, may benefit from
> using this information (E.g., DPDK cache stashing RFC [2] or an HPC
> application running in a VM).
> 
> This patch enables configuring of TPH from the user space via
> VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space
> drivers and VMMs to enable/disable the TPH feature on PCIe devices and
> set steering tags in MSI-X or steering-tag table entries using
> VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel using
> VFIO_DEVICE_FEATURE_GET to operate in device-specific mode.

Unless I'm missing it, the RFC in [2] doesn't make use of this
proposal.  Is there published code anywhere that does use this
interface?

> [1] lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@xxxxxxx
> [2] inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@xxxxxxx
> 
> Signed-off-by: Wathsala Vithanage <wathsala.vithanage@xxxxxxx>
> ---
>  drivers/vfio/pci/vfio_pci_core.c | 163 +++++++++++++++++++++++++++++++
>  include/uapi/linux/vfio.h        |  68 +++++++++++++
>  2 files changed, 231 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 586e49efb81b..d6dd0495b08b 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -29,6 +29,7 @@
>  #include <linux/nospec.h>
>  #include <linux/sched/mm.h>
>  #include <linux/iommufd.h>
> +#include <linux/pci-tph.h>
>  #if IS_ENABLED(CONFIG_EEH)
>  #include <asm/eeh.h>
>  #endif
> @@ -1510,6 +1511,165 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
>  	return 0;
>  }
>  
> +static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph,
> +				      void __user *arg, size_t argsz,
> +				      struct vfio_pci_tph_info **info)
> +{
> +	size_t minsz;
> +
> +	if (tph->count > VFIO_TPH_INFO_MAX)
> +		return -EINVAL;
> +	if (!tph->count)
> +		return 0;
> +
> +	minsz = tph->count * sizeof(struct vfio_pci_tph_info);
> +	if (minsz < argsz)
> +		return -EINVAL;
> +
> +	*info = memdup_user(arg, minsz);
> +	if (IS_ERR(info))
> +		return PTR_ERR(info);
> +
> +	return minsz;
> +}
> +
> +static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device *vdev,
> +				      struct vfio_pci_tph *tph,
> +				      void __user *arg, size_t argsz)
> +{
> +	int i, mtype, err = 0;
> +	u32 cpu_uid;
> +	struct vfio_pci_tph_info *info = NULL;
> +	ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz, &info);
> +
> +	if (data_size <= 0)
> +		return data_size;
> +
> +	for (i = 0; i < tph->count; i++) {
> +		if (!(info[i].cpu_id < nr_cpu_ids && cpu_present(info[i].cpu_id))) {

Not intuitive logic, imo.  This could easily be:

		if (info[i].cpu_id >= nr_cpu_ids || !cpu_present(info[i].cpu_id))

> +			info[i].err = -EINVAL;
> +			continue;
> +		}
> +		cpu_uid = topology_core_id(info[i].cpu_id);
> +		mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >>
> +			VFIO_TPH_MEM_TYPE_SHIFT;
> +
> +		/* processing hints are always ignored */
> +		info[i].ph_ignore = 1;
> +
> +		info[i].err = pcie_tph_get_cpu_st(vdev->pdev, mtype, cpu_uid,
> +						  &info[i].st);
> +		if (info[i].err)
> +			continue;
> +
> +		if (tph->flags & VFIO_DEVICE_FEATURE_TPH_SET_ST) {
> +			info[i].err = pcie_tph_set_st_entry(vdev->pdev,
> +							    info[i].index,
> +							    info[i].st);
> +		}
> +	}
> +
> +	if (copy_to_user(arg, info, data_size))
> +		err = -EFAULT;
> +
> +	kfree(info);
> +	return err;
> +}
> +
> +
> +static int vfio_pci_feature_tph_enable(struct vfio_pci_core_device *vdev,
> +				       struct vfio_pci_tph *arg)
> +{
> +	int mode = arg->flags & VFIO_TPH_ST_MODE_MASK;
> +
> +	switch (mode) {
> +	case VFIO_TPH_ST_NS_MODE:
> +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_NS_MODE);
> +
> +	case VFIO_TPH_ST_IV_MODE:
> +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_IV_MODE);
> +
> +	case VFIO_TPH_ST_DS_MODE:
> +		return pcie_enable_tph(vdev->pdev, PCI_TPH_ST_DS_MODE);
> +
> +	default:
> +		return -EINVAL;
> +	}
> +
> +}
> +
> +static int vfio_pci_feature_tph_disable(struct vfio_pci_core_device *vdev)
> +{
> +	pcie_disable_tph(vdev->pdev);
> +	return 0;
> +}
> +
> +static int vfio_pci_feature_tph_prepare(struct vfio_pci_tph __user *arg,
> +					size_t argsz, u32 flags,
> +					struct vfio_pci_tph *tph)
> +{
> +	u32 op;
> +	int err = vfio_check_feature(flags, argsz,
> +				 VFIO_DEVICE_FEATURE_SET |
> +				 VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(struct vfio_pci_tph));
> +	if (err != 1)
> +		return err;

We don't seem to account for a host booted with pci=notph.

> +
> +	if (copy_from_user(tph, arg, sizeof(struct vfio_pci_tph)))
> +		return -EFAULT;
> +
> +	op = tph->flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> +
> +	switch (op) {
> +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> +		return (flags & VFIO_DEVICE_FEATURE_SET) ? 0 : -EINVAL;
> +
> +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> +		return (flags & VFIO_DEVICE_FEATURE_GET) ? 0 : -EINVAL;

This is a convoluted mangling of an ioctl into a vfio feature.

> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static int vfio_pci_core_feature_tph(struct vfio_device *device, u32 flags,
> +				     struct vfio_pci_tph __user *arg,
> +				     size_t argsz)
> +{
> +	u32 op;
> +	struct vfio_pci_tph tph;
> +	void __user *uinfo;
> +	size_t infosz;
> +	struct vfio_pci_core_device *vdev =
> +		container_of(device, struct vfio_pci_core_device, vdev);
> +	int err = vfio_pci_feature_tph_prepare(arg, argsz, flags, &tph);
> +
> +	if (err)
> +		return err;
> +
> +	op = tph.flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK;
> +
> +	switch (op) {
> +	case VFIO_DEVICE_FEATURE_TPH_ENABLE:
> +		return vfio_pci_feature_tph_enable(vdev, &tph);
> +
> +	case VFIO_DEVICE_FEATURE_TPH_DISABLE:
> +		return vfio_pci_feature_tph_disable(vdev);
> +
> +	case VFIO_DEVICE_FEATURE_TPH_GET_ST:
> +	case VFIO_DEVICE_FEATURE_TPH_SET_ST:
> +		uinfo = (u8 *)(arg) + offsetof(struct vfio_pci_tph, info);
> +		infosz = argsz - sizeof(struct vfio_pci_tph);
> +		return vfio_pci_feature_tph_st_op(vdev, &tph, uinfo, infosz);

This is effectively encoding a regular ioctl as a feature.  See below.

> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  				void __user *arg, size_t argsz)
>  {
> @@ -1523,6 +1683,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
>  		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
>  	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
>  		return vfio_pci_core_feature_token(device, flags, arg, argsz);
> +	case VFIO_DEVICE_FEATURE_PCI_TPH:
> +		return vfio_pci_core_feature_tph(device, flags,
> +						 arg, argsz);
>  	default:
>  		return -ENOTTY;
>  	}
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index c8dbf8219c4f..608d57dfe279 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1458,6 +1458,74 @@ struct vfio_device_feature_bus_master {
>  };
>  #define VFIO_DEVICE_FEATURE_BUS_MASTER 10
>  
> +/*
> + * Upon VFIO_DEVICE_FEATURE_SET, enable or disable PCIe TPH or set steering tags
> + * on the device. Data provided when setting this feature is a __u32 with the
> + * following flags. VFIO_DEVICE_FEATURE_TPH_ENABLE enables PCIe TPH in
> + * no-steering-tag, interrupt-vector, or device-specific mode when feature flags
> + * VFIO_TPH_ST_NS_MODE, VFIO_TPH_ST_IV_MODE, and VFIO_TPH_ST_DS_MODE are set
> + * respectively.
> + * VFIO_DEVICE_FEATURE_TPH_DISABLE disables PCIe TPH on the device.
> + * VFIO_DEVICE_FEATURE_TPH_SET_ST set steering tags on a device at an index in
> + * MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used and device
> + * capabilities. The caller can set multiple steering tags by passing an array
> + * of vfio_pci_tph_info objects containing cpu_id, cache_level, and
> + * MSI-X/ST-table index. The caller can also set the intended memory type and
> + * the processing hint by setting VFIO_TPH_MEM_TYPE_x and VFIO_TPH_HINT_x flags,
> + * respectively. The return value for each vfio_pci_tph_info object is stored in
> + * err, with the steering-tag set on the device and the ph_ignore status bit
> + * resulting from the steering-tag lookup operation. If err < 0, the values
> + * stored in the st and ph_ignore fields should be considered invalid.
> + *

Sorry, I don't understand ph_ignore as described here.  It's only ever
set to 1.  I guess we assume the incoming state is zero.  But what does
it mean?

> + * Upon VFIO_DEVICE_FEATURE_GET,  return steering tags to the caller.
> + * VFIO_DEVICE_FEATURE_TPH_GET_ST returns steering tags to the caller.
> + * The return values per vfio_pci_tph_info object are stored in the st,
> + * ph_ignore, and err fields.

Why do we need to provide an interface to return the steering tags set
by the user?  Seems like this could be a write-only, SET, interface.

> + */
> +struct vfio_pci_tph_info {

This seems more of an _entry than an _info.  Note that vfio has various
INFO ioctls which make this nomenclature confusing.

> +	/* in */
> +	__u32 cpu_id;

The logical ID?

> +	__u32 cache_level;

Zero based?  1-based?  How many cache levels are there?  What's valid
here?

> +	__u8  flags;
> +#define VFIO_TPH_MEM_TYPE_MASK		0x1
> +#define VFIO_TPH_MEM_TYPE_SHIFT		0
> +#define VFIO_TPH_MEM_TYPE_VMEM		0	/* Request volatile memory ST */
> +#define VFIO_TPH_MEM_TYPE_PMEM		1	/* Request persistent memory ST */

Is there a relation to the cache_level here?  Spec references here and
below, assuming those are relevant to these values.

> +
> +#define VFIO_TPH_HINT_MASK		0x3
> +#define VFIO_TPH_HINT_SHIFT		1
> +#define VFIO_TPH_HINT_BIDIR		0
> +#define VFIO_TPH_HINT_REQSTR		(1 << VFIO_TPH_HINT_SHIFT)
> +#define VFIO_TPH_HINT_TARGET		(2 << VFIO_TPH_HINT_SHIFT)
> +#define VFIO_TPH_HINT_TARGET_PRIO	(3 << VFIO_TPH_HINT_SHIFT)

There needs to be a __u8 padding in here somewhere or flags extended to
__u16.

> +	__u16 index;			/* MSI-X/ST-table index to set ST */
> +	/* out */
> +	__u16 st;			/* Steering-Tag */

Sorry if I'm just not familiar with TPH, but why do we need to return
the ST?  Doesn't hardware make use of the ST index and the physical
value gets applied automatically in HW?

> +	__u8  ph_ignore;		/* Processing hint was ignored by */

Padding/alignment, same as above.  "ignored by..."  By what?  Is that
an error?

> +	__s32 err;			/* Error on getting/setting Steering-Tag*/
> +};

Generally we'd expect a system call either works or fails.  Why do we
need per entry error report?  Can't we validate and prepare the entire
operation before setting any of it into the device?  Ultimately we're
just writing hints to config space or MSI-X table space, so the write
operation itself is not likely to be the point of failure.

> +
> +struct vfio_pci_tph {
> +	__u32 argsz;			/* Size of vfio_pci_tph and info[] */
> +	__u32 flags;
> +#define VFIO_DEVICE_FEATURE_TPH_OP_MASK		0x7
> +#define VFIO_DEVICE_FEATURE_TPH_OP_SHIFT	3
> +#define VFIO_DEVICE_FEATURE_TPH_ENABLE		0	/* Enable TPH on device */
> +#define VFIO_DEVICE_FEATURE_TPH_DISABLE	1	/* Disable TPH on device */
> +#define VFIO_DEVICE_FEATURE_TPH_GET_ST		2	/* Get steering-tags */
> +#define VFIO_DEVICE_FEATURE_TPH_SET_ST		4	/* Set steering-rags */

s/rags/tags/

vfio device features already have GET and SET as part of their base
structure, why are they duplicated here?  It really seems like there
are two separate features here, one that allows enabling with a given
mode or disable, and another that allows writing specific steering
tags.  Both seem like they could be SET-only features.  It's also not
clear to me that there's a significant advantage to providing an array
of steering tags.  Isn't updating STs an infrequent operation and
likely bound to at most 2K tags in the case of MSI-X?

> +
> +#define	VFIO_TPH_ST_MODE_MASK	(0x3 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_NS_MODE	(0 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_IV_MODE	(1 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +#define	VFIO_TPH_ST_DS_MODE	(2 << VFIO_DEVICE_FEATURE_TPH_OP_SHIFT)
> +	__u32 count;				/* Number of entries in info[] */
> +	struct vfio_pci_tph_info info[];
> +#define VFIO_TPH_INFO_MAX	64		/* Max entries allowed in info[] */

This seems to match the limit if the table is located in extended
config space, but it's not particularly relevant if the table is
located in MSI-X space.  Why is this a good limit?

Also, try to keep these all in 80 column.  Thanks,

Alex

> +};
> +
> +#define VFIO_DEVICE_FEATURE_PCI_TPH 11
> +
>  /* -------- API for Type1 VFIO IOMMU -------- */
>  
>  /**






[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux