On Fri, 2025-02-21 at 22:46 +0000, Wathsala Vithanage wrote: > Linux v6.13 introduced the PCIe TLP Processing Hints (TPH) feature > for > direct cache injection. As described in the relevant patch set [1], > direct cache injection in supported hardware allows optimal platform > resource utilization for specific requests on the PCIe bus. This > feature > is currently available only for kernel device drivers. However, > user space applications, especially those whose performance is > sensitive > to the latency of inbound writes as seen by a CPU core, may benefit > from > using this information (E.g., DPDK cache stashing RFC [2] or an HPC > application running in a VM). > > This patch enables configuring of TPH from the user space via > VFIO_DEVICE_FEATURE IOCLT. It provides an interface to user space > drivers and VMMs to enable/disable the TPH feature on PCIe devices > and > set steering tags in MSI-X or steering-tag table entries using > VFIO_DEVICE_FEATURE_SET flag or read steering tags from the kernel > using > VFIO_DEVICE_FEATURE_GET to operate in device-specific mode. > > [1] > lore.kernel.org/linux-pci/20241002165954.128085-1-wei.huang2@xxxxxxx > [2] > inbox.dpdk.org/dev/20241021015246.304431-2-wathsala.vithanage@xxxxxxx > > Signed-off-by: Wathsala Vithanage <wathsala.vithanage@xxxxxxx> > --- > drivers/vfio/pci/vfio_pci_core.c | 163 > +++++++++++++++++++++++++++++++ > include/uapi/linux/vfio.h | 68 +++++++++++++ > 2 files changed, 231 insertions(+) > > diff --git a/drivers/vfio/pci/vfio_pci_core.c > b/drivers/vfio/pci/vfio_pci_core.c > index 586e49efb81b..d6dd0495b08b 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -29,6 +29,7 @@ > #include <linux/nospec.h> > #include <linux/sched/mm.h> > #include <linux/iommufd.h> > +#include <linux/pci-tph.h> > #if IS_ENABLED(CONFIG_EEH) > #include <asm/eeh.h> > #endif > @@ -1510,6 +1511,165 @@ static int vfio_pci_core_feature_token(struct > vfio_device *device, u32 flags, > return 0; > } > > +static ssize_t vfio_pci_tph_uinfo_dup(struct vfio_pci_tph *tph, > + void __user *arg, size_t > argsz, > + struct vfio_pci_tph_info > **info) > +{ > + size_t minsz; > + > + if (tph->count > VFIO_TPH_INFO_MAX) > + return -EINVAL; > + if (!tph->count) > + return 0; > + > + minsz = tph->count * sizeof(struct vfio_pci_tph_info); > + if (minsz < argsz) > + return -EINVAL; > + > + *info = memdup_user(arg, minsz); You can use memdup_array_user() instead of the lines above. It does the multiplication plus overflow check for you and will make your code more compact. > + if (IS_ERR(info)) > + return PTR_ERR(info); > + > + return minsz; see below… > +} > + > +static int vfio_pci_feature_tph_st_op(struct vfio_pci_core_device > *vdev, > + struct vfio_pci_tph *tph, > + void __user *arg, size_t > argsz) > +{ > + int i, mtype, err = 0; > + u32 cpu_uid; > + struct vfio_pci_tph_info *info = NULL; > + ssize_t data_size = vfio_pci_tph_uinfo_dup(tph, arg, argsz, > &info); > + > + if (data_size <= 0) > + return data_size; So it seems you return here in case of an error. However, that would result in a length of 0 being an error? I would try to avoid to return 0 for an error whenever possible. That breaks convention. How about you return the result value of memdup_array_user() in …uinfo_dup()? The only thing I can't tell is whether tph->count == 0 should be treated as an error. Maybe map it to -EINVAL? Regards, P. > + > + for (i = 0; i < tph->count; i++) { > + if (!(info[i].cpu_id < nr_cpu_ids && > cpu_present(info[i].cpu_id))) { > + info[i].err = -EINVAL; > + continue; > + } > + cpu_uid = topology_core_id(info[i].cpu_id); > + mtype = (info[i].flags & VFIO_TPH_MEM_TYPE_MASK) >> > + VFIO_TPH_MEM_TYPE_SHIFT; > + > + /* processing hints are always ignored */ > + info[i].ph_ignore = 1; > + > + info[i].err = pcie_tph_get_cpu_st(vdev->pdev, mtype, > cpu_uid, > + &info[i].st); > + if (info[i].err) > + continue; > + > + if (tph->flags & VFIO_DEVICE_FEATURE_TPH_SET_ST) { > + info[i].err = pcie_tph_set_st_entry(vdev- > >pdev, > + > info[i].index, > + > info[i].st); > + } > + } > + > + if (copy_to_user(arg, info, data_size)) > + err = -EFAULT; > + > + kfree(info); > + return err; > +} > + > + > +static int vfio_pci_feature_tph_enable(struct vfio_pci_core_device > *vdev, > + struct vfio_pci_tph *arg) > +{ > + int mode = arg->flags & VFIO_TPH_ST_MODE_MASK; > + > + switch (mode) { > + case VFIO_TPH_ST_NS_MODE: > + return pcie_enable_tph(vdev->pdev, > PCI_TPH_ST_NS_MODE); > + > + case VFIO_TPH_ST_IV_MODE: > + return pcie_enable_tph(vdev->pdev, > PCI_TPH_ST_IV_MODE); > + > + case VFIO_TPH_ST_DS_MODE: > + return pcie_enable_tph(vdev->pdev, > PCI_TPH_ST_DS_MODE); > + > + default: > + return -EINVAL; > + } > + > +} > + > +static int vfio_pci_feature_tph_disable(struct vfio_pci_core_device > *vdev) > +{ > + pcie_disable_tph(vdev->pdev); > + return 0; > +} > + > +static int vfio_pci_feature_tph_prepare(struct vfio_pci_tph __user > *arg, > + size_t argsz, u32 flags, > + struct vfio_pci_tph *tph) > +{ > + u32 op; > + int err = vfio_check_feature(flags, argsz, > + VFIO_DEVICE_FEATURE_SET | > + VFIO_DEVICE_FEATURE_GET, > + sizeof(struct vfio_pci_tph)); > + if (err != 1) > + return err; > + > + if (copy_from_user(tph, arg, sizeof(struct vfio_pci_tph))) > + return -EFAULT; > + > + op = tph->flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK; > + > + switch (op) { > + case VFIO_DEVICE_FEATURE_TPH_ENABLE: > + case VFIO_DEVICE_FEATURE_TPH_DISABLE: > + case VFIO_DEVICE_FEATURE_TPH_SET_ST: > + return (flags & VFIO_DEVICE_FEATURE_SET) ? 0 : - > EINVAL; > + > + case VFIO_DEVICE_FEATURE_TPH_GET_ST: > + return (flags & VFIO_DEVICE_FEATURE_GET) ? 0 : - > EINVAL; > + > + default: > + return -EINVAL; > + } > +} > + > +static int vfio_pci_core_feature_tph(struct vfio_device *device, u32 > flags, > + struct vfio_pci_tph __user > *arg, > + size_t argsz) > +{ > + u32 op; > + struct vfio_pci_tph tph; > + void __user *uinfo; > + size_t infosz; > + struct vfio_pci_core_device *vdev = > + container_of(device, struct vfio_pci_core_device, > vdev); > + int err = vfio_pci_feature_tph_prepare(arg, argsz, flags, > &tph); > + > + if (err) > + return err; > + > + op = tph.flags & VFIO_DEVICE_FEATURE_TPH_OP_MASK; > + > + switch (op) { > + case VFIO_DEVICE_FEATURE_TPH_ENABLE: > + return vfio_pci_feature_tph_enable(vdev, &tph); > + > + case VFIO_DEVICE_FEATURE_TPH_DISABLE: > + return vfio_pci_feature_tph_disable(vdev); > + > + case VFIO_DEVICE_FEATURE_TPH_GET_ST: > + case VFIO_DEVICE_FEATURE_TPH_SET_ST: > + uinfo = (u8 *)(arg) + offsetof(struct vfio_pci_tph, > info); > + infosz = argsz - sizeof(struct vfio_pci_tph); > + return vfio_pci_feature_tph_st_op(vdev, &tph, uinfo, > infosz); > + > + default: > + return -EINVAL; > + } > +} > + > int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 > flags, > void __user *arg, size_t argsz) > { > @@ -1523,6 +1683,9 @@ int vfio_pci_core_ioctl_feature(struct > vfio_device *device, u32 flags, > return vfio_pci_core_pm_exit(device, flags, arg, > argsz); > case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: > return vfio_pci_core_feature_token(device, flags, > arg, argsz); > + case VFIO_DEVICE_FEATURE_PCI_TPH: > + return vfio_pci_core_feature_tph(device, flags, > + arg, argsz); > default: > return -ENOTTY; > } > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index c8dbf8219c4f..608d57dfe279 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -1458,6 +1458,74 @@ struct vfio_device_feature_bus_master { > }; > #define VFIO_DEVICE_FEATURE_BUS_MASTER 10 > > +/* > + * Upon VFIO_DEVICE_FEATURE_SET, enable or disable PCIe TPH or set > steering tags > + * on the device. Data provided when setting this feature is a __u32 > with the > + * following flags. VFIO_DEVICE_FEATURE_TPH_ENABLE enables PCIe TPH > in > + * no-steering-tag, interrupt-vector, or device-specific mode when > feature flags > + * VFIO_TPH_ST_NS_MODE, VFIO_TPH_ST_IV_MODE, and VFIO_TPH_ST_DS_MODE > are set > + * respectively. > + * VFIO_DEVICE_FEATURE_TPH_DISABLE disables PCIe TPH on the device. > + * VFIO_DEVICE_FEATURE_TPH_SET_ST set steering tags on a device at > an index in > + * MSI-X or ST-table depending on the VFIO_TPH_ST_x_MODE flag used > and device > + * capabilities. The caller can set multiple steering tags by > passing an array > + * of vfio_pci_tph_info objects containing cpu_id, cache_level, and > + * MSI-X/ST-table index. The caller can also set the intended memory > type and > + * the processing hint by setting VFIO_TPH_MEM_TYPE_x and > VFIO_TPH_HINT_x flags, > + * respectively. The return value for each vfio_pci_tph_info object > is stored in > + * err, with the steering-tag set on the device and the ph_ignore > status bit > + * resulting from the steering-tag lookup operation. If err < 0, the > values > + * stored in the st and ph_ignore fields should be considered > invalid. > + * > + * Upon VFIO_DEVICE_FEATURE_GET, return steering tags to the > caller. > + * VFIO_DEVICE_FEATURE_TPH_GET_ST returns steering tags to the > caller. > + * The return values per vfio_pci_tph_info object are stored in the > st, > + * ph_ignore, and err fields. > + */ > +struct vfio_pci_tph_info { > + /* in */ > + __u32 cpu_id; > + __u32 cache_level; > + __u8 flags; > +#define VFIO_TPH_MEM_TYPE_MASK 0x1 > +#define VFIO_TPH_MEM_TYPE_SHIFT 0 > +#define VFIO_TPH_MEM_TYPE_VMEM 0 /* Request volatile > memory ST */ > +#define VFIO_TPH_MEM_TYPE_PMEM 1 /* Request > persistent memory ST */ > + > +#define VFIO_TPH_HINT_MASK 0x3 > +#define VFIO_TPH_HINT_SHIFT 1 > +#define VFIO_TPH_HINT_BIDIR 0 > +#define VFIO_TPH_HINT_REQSTR (1 << VFIO_TPH_HINT_SHIFT) > +#define VFIO_TPH_HINT_TARGET (2 << VFIO_TPH_HINT_SHIFT) > +#define VFIO_TPH_HINT_TARGET_PRIO (3 << VFIO_TPH_HINT_SHIFT) > + __u16 index; /* MSI-X/ST-table index to > set ST */ > + /* out */ > + __u16 st; /* Steering-Tag */ > + __u8 ph_ignore; /* Processing hint was > ignored by */ > + __s32 err; /* Error on getting/setting > Steering-Tag*/ > +}; > + > +struct vfio_pci_tph { > + __u32 argsz; /* Size of vfio_pci_tph and > info[] */ > + __u32 flags; > +#define VFIO_DEVICE_FEATURE_TPH_OP_MASK 0x7 > +#define VFIO_DEVICE_FEATURE_TPH_OP_SHIFT 3 > +#define VFIO_DEVICE_FEATURE_TPH_ENABLE 0 /* Enable > TPH on device */ > +#define VFIO_DEVICE_FEATURE_TPH_DISABLE 1 /* Disable > TPH on device */ > +#define VFIO_DEVICE_FEATURE_TPH_GET_ST 2 /* Get > steering-tags */ > +#define VFIO_DEVICE_FEATURE_TPH_SET_ST 4 /* Set > steering-rags */ > + > +#define VFIO_TPH_ST_MODE_MASK (0x3 << > VFIO_DEVICE_FEATURE_TPH_OP_SHIFT) > +#define VFIO_TPH_ST_NS_MODE (0 << > VFIO_DEVICE_FEATURE_TPH_OP_SHIFT) > +#define VFIO_TPH_ST_IV_MODE (1 << > VFIO_DEVICE_FEATURE_TPH_OP_SHIFT) > +#define VFIO_TPH_ST_DS_MODE (2 << > VFIO_DEVICE_FEATURE_TPH_OP_SHIFT) > + __u32 count; /* Number of entries > in info[] */ > + struct vfio_pci_tph_info info[]; > +#define VFIO_TPH_INFO_MAX 64 /* Max entries > allowed in info[] */ > +}; > + > +#define VFIO_DEVICE_FEATURE_PCI_TPH 11 > + > /* -------- API for Type1 VFIO IOMMU -------- */ > > /**