On Mon, 10 Jun 2024 12:57:08 +0000 Fred Griffoul <fgriffo@xxxxxxxxxxxx> wrote: > The usual way to configure a device interrupt from userland is to write > the /proc/irq/<irq>/smp_affinity or smp_affinity_list files. When using > vfio to implement a device driver or a virtual machine monitor, this may > not be ideal: the process managing the vfio device interrupts may not be > granted root privilege, for security reasons. Thus it cannot directly > control the interrupt affinity and has to rely on an external command. > > This patch extends the VFIO_DEVICE_SET_IRQS ioctl() with a new data flag > to specify the affinity of interrupts of a vfio pci device. > > The CPU affinity mask argument must be a subset of the process cpuset, > otherwise an error -EPERM is returned. > > The vfio_irq_set argument shall be set-up in the following way: > > - the 'flags' field has the new flag VFIO_IRQ_SET_DATA_AFFINITY set > as well as VFIO_IRQ_SET_ACTION_TRIGGER. > > - the variable-length 'data' field is a cpu_set_t structure, as > for the sched_setaffinity() syscall, the size of which is derived > from 'argsz'. 
> > Signed-off-by: Fred Griffoul <fgriffo@xxxxxxxxxxxx> > --- > drivers/vfio/pci/vfio_pci_core.c | 27 +++++++++++++++++---- > drivers/vfio/pci/vfio_pci_intrs.c | 39 +++++++++++++++++++++++++++++++ > drivers/vfio/vfio_main.c | 13 +++++++---- > include/uapi/linux/vfio.h | 10 +++++++- > 4 files changed, 80 insertions(+), 9 deletions(-) > > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c > index 80cae87fff36..2e3419560480 100644 > --- a/drivers/vfio/pci/vfio_pci_core.c > +++ b/drivers/vfio/pci/vfio_pci_core.c > @@ -1192,6 +1192,7 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, > { > unsigned long minsz = offsetofend(struct vfio_irq_set, count); > struct vfio_irq_set hdr; > + cpumask_var_t mask; > u8 *data = NULL; > int max, ret = 0; > size_t data_size = 0; > @@ -1207,9 +1208,22 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, > return ret; > > if (data_size) { > - data = memdup_user(&arg->data, data_size); > - if (IS_ERR(data)) > - return PTR_ERR(data); > + if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY) { > + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) > + return -ENOMEM; > + > + if (copy_from_user(mask, &arg->data, data_size)) { > + ret = -EFAULT; > + goto out; > + } > + > + data = (u8 *)mask; Seems like this could just use the memdup_user() path, why do we care to copy it into a cpumask_var_t here? If we do care, wouldn't we implement something like get_user_cpu_mask() used by sched_setaffinity(2)? 
> + > + } else { > + data = memdup_user(&arg->data, data_size); > + if (IS_ERR(data)) > + return PTR_ERR(data); > + } > } > > mutex_lock(&vdev->igate); > @@ -1218,7 +1232,12 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, > hdr.count, data); > > mutex_unlock(&vdev->igate); > - kfree(data); > + > +out: > + if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY && data_size) > + free_cpumask_var(mask); > + else > + kfree(data); > > return ret; > } > diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c > index 8382c5834335..fe01303cf94e 100644 > --- a/drivers/vfio/pci/vfio_pci_intrs.c > +++ b/drivers/vfio/pci/vfio_pci_intrs.c > @@ -19,6 +19,7 @@ > #include <linux/vfio.h> > #include <linux/wait.h> > #include <linux/slab.h> > +#include <linux/cpuset.h> > > #include "vfio_pci_priv.h" > > @@ -675,6 +676,41 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev, > return 0; > } > > +static int vfio_pci_set_msi_affinity(struct vfio_pci_core_device *vdev, > + unsigned int start, unsigned int count, > + struct cpumask *irq_mask) Aside from the name, what makes this unique to MSI vectors? > +{ > + struct vfio_pci_irq_ctx *ctx; > + cpumask_var_t allowed_mask; > + unsigned int i; > + int err = 0; > + > + if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL)) > + return -ENOMEM; > + > + cpuset_cpus_allowed(current, allowed_mask); > + if (!cpumask_subset(irq_mask, allowed_mask)) { > + err = -EPERM; > + goto finish; > + } > + > + for (i = start; i < start + count; i++) { > + ctx = vfio_irq_ctx_get(vdev, i); > + if (!ctx) { > + err = -EINVAL; > + break; > + } > + > + err = irq_set_affinity(ctx->producer.irq, irq_mask); > + if (err) > + break; > + } Is this typical/useful behavior to set a series of vectors to the same cpu_set_t? It's unusual behavior for this ioctl to apply the same data across multiple vectors. Should the DATA_AFFINITY case support an array of cpu_set_t? 
> + > +finish: > + free_cpumask_var(allowed_mask); > + return err; > +} > + > static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, > unsigned index, unsigned start, > unsigned count, uint32_t flags, void *data) > @@ -713,6 +749,9 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, > if (!irq_is(vdev, index)) > return -EINVAL; > > + if (flags & VFIO_IRQ_SET_DATA_AFFINITY) > + return vfio_pci_set_msi_affinity(vdev, start, count, data); > + > for (i = start; i < start + count; i++) { > ctx = vfio_irq_ctx_get(vdev, i); > if (!ctx) > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c > index e97d796a54fb..e75c5d66681c 100644 > --- a/drivers/vfio/vfio_main.c > +++ b/drivers/vfio/vfio_main.c > @@ -1505,23 +1505,28 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, > size = 0; > break; > case VFIO_IRQ_SET_DATA_BOOL: > - size = sizeof(uint8_t); > + size = hdr->count * sizeof(uint8_t); > break; > case VFIO_IRQ_SET_DATA_EVENTFD: > - size = sizeof(int32_t); > + size = size_mul(hdr->count, sizeof(int32_t)); Why use size_mul() in one place and not the other? > + break; > + case VFIO_IRQ_SET_DATA_AFFINITY: > + size = hdr->argsz - minsz; > + if (size > cpumask_size()) > + size = cpumask_size(); Or just set size = (hdr->argsz - minsz) / count? Generate an error if (hdr->argsz - minsz) % count? It seems like all the cpumask'isms can be contained to the set affinity function. 
> break; > default: > return -EINVAL; > } > > if (size) { > - if (hdr->argsz - minsz < hdr->count * size) > + if (hdr->argsz - minsz < size) > return -EINVAL; > > if (!data_size) > return -EINVAL; > > - *data_size = hdr->count * size; > + *data_size = size; > } > > return 0; > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index 2b68e6cdf190..5ba2ca223550 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -580,6 +580,12 @@ struct vfio_irq_info { > * > * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while > * ACTION_TRIGGER specifies kernel->user signaling. > + * > + * DATA_AFFINITY specifies the affinity for the range of interrupt vectors. > + * It must be set with ACTION_TRIGGER in 'flags'. The variable-length 'data' > + * array is a CPU affinity mask 'cpu_set_t' structure, as for the > + * sched_setaffinity() syscall argument: the 'argsz' field is used to check > + * the actual cpu_set_t size. > */ DATA_CPUSET? The IRQ_INFO ioctl should probably also report support for this feature. Is there any proposed userspace code that takes advantage of this interface? 
Thanks, Alex > struct vfio_irq_set { > __u32 argsz; > @@ -587,6 +593,7 @@ struct vfio_irq_set { > #define VFIO_IRQ_SET_DATA_NONE (1 << 0) /* Data not present */ > #define VFIO_IRQ_SET_DATA_BOOL (1 << 1) /* Data is bool (u8) */ > #define VFIO_IRQ_SET_DATA_EVENTFD (1 << 2) /* Data is eventfd (s32) */ > +#define VFIO_IRQ_SET_DATA_AFFINITY (1 << 6) /* Data is cpu_set_t */ > #define VFIO_IRQ_SET_ACTION_MASK (1 << 3) /* Mask interrupt */ > #define VFIO_IRQ_SET_ACTION_UNMASK (1 << 4) /* Unmask interrupt */ > #define VFIO_IRQ_SET_ACTION_TRIGGER (1 << 5) /* Trigger interrupt */ > @@ -599,7 +606,8 @@ struct vfio_irq_set { > > #define VFIO_IRQ_SET_DATA_TYPE_MASK (VFIO_IRQ_SET_DATA_NONE | \ > VFIO_IRQ_SET_DATA_BOOL | \ > - VFIO_IRQ_SET_DATA_EVENTFD) > + VFIO_IRQ_SET_DATA_EVENTFD | \ > + VFIO_IRQ_SET_DATA_AFFINITY) > #define VFIO_IRQ_SET_ACTION_TYPE_MASK (VFIO_IRQ_SET_ACTION_MASK | \ > VFIO_IRQ_SET_ACTION_UNMASK | \ > VFIO_IRQ_SET_ACTION_TRIGGER)