Re: [PATCH v5 2/2] vfio/pci: add msi interrupt affinity support

Frederic Griffoul <griffoul@xxxxxxxxx> · Tue, 11 Jun 2024 09:58:48 +0100

On Mon, Jun 10, 2024 at 8:32 PM Alex Williamson
<alex.williamson@xxxxxxxxxx> wrote:
>
> On Mon, 10 Jun 2024 12:57:08 +0000
> Fred Griffoul <fgriffo@xxxxxxxxxxxx> wrote:
>
> > The usual way to configure a device interrupt from userland is to write
> > the /proc/irq/<irq>/smp_affinity or smp_affinity_list files. When using
> > vfio to implement a device driver or a virtual machine monitor, this may
> > not be ideal: the process managing the vfio device interrupts may not be
> > granted root privilege, for security reasons. Thus it cannot directly
> > control the interrupt affinity and has to rely on an external command.
> >
> > This patch extends the VFIO_DEVICE_SET_IRQS ioctl() with a new data flag
> > to specify the affinity of interrupts of a vfio pci device.
> >
> > The CPU affinity mask argument must be a subset of the process cpuset,
> > otherwise an error -EPERM is returned.
> >
> > The vfio_irq_set argument shall be set-up in the following way:
> >
> > - the 'flags' field have the new flag VFIO_IRQ_SET_DATA_AFFINITY set
> > as well as VFIO_IRQ_SET_ACTION_TRIGGER.
> >
> > - the variable-length 'data' field is a cpu_set_t structure, as
> > for the sched_setaffinity() syscall, the size of which is derived
> > from 'argsz'.
> >
> > Signed-off-by: Fred Griffoul <fgriffo@xxxxxxxxxxxx>
> > ---
> >  drivers/vfio/pci/vfio_pci_core.c  | 27 +++++++++++++++++----
> >  drivers/vfio/pci/vfio_pci_intrs.c | 39 +++++++++++++++++++++++++++++++
> >  drivers/vfio/vfio_main.c          | 13 +++++++----
> >  include/uapi/linux/vfio.h         | 10 +++++++-
> >  4 files changed, 80 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> > index 80cae87fff36..2e3419560480 100644
> > --- a/drivers/vfio/pci/vfio_pci_core.c
> > +++ b/drivers/vfio/pci/vfio_pci_core.c
> > @@ -1192,6 +1192,7 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
> >  {
> >       unsigned long minsz = offsetofend(struct vfio_irq_set, count);
> >       struct vfio_irq_set hdr;
> > +     cpumask_var_t mask;
> >       u8 *data = NULL;
> >       int max, ret = 0;
> >       size_t data_size = 0;
> > @@ -1207,9 +1208,22 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
> >               return ret;
> >
> >       if (data_size) {
> > -             data = memdup_user(&arg->data, data_size);
> > -             if (IS_ERR(data))
> > -                     return PTR_ERR(data);
> > +             if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY) {
> > +                     if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
> > +                             return -ENOMEM;
> > +
> > +                     if (copy_from_user(mask, &arg->data, data_size)) {
> > +                             ret = -EFAULT;
> > +                             goto out;
> > +                     }
> > +
> > +                     data = (u8 *)mask;
>
> Seems like this could just use the memdup_user() path, why do we care
> to copy it into a cpumask_var_t here?  If we do care, wouldn't we
> implement something like get_user_cpu_mask() used by
> sched_setaffinity(2)?
>

A valid cpu_set_t argument could be smaller than a cpumask_var_t so we
have to allocate a cpumask_var_t and zero it if the argument size is smaller.
Moreover depending on the kernel configuration the cpumask_var_t could
be allocated on the stack, avoiding an actual memory allocation.

Exporting get_user_cpu_mask() may be better, although here the size
is checked in a separate function, as there are other explicit user
cpumask handling (in io_uring for instance).

> > +
> > +             } else {
> > +                     data = memdup_user(&arg->data, data_size);
> > +                     if (IS_ERR(data))
> > +                             return PTR_ERR(data);
> > +             }
> >       }
> >
> >       mutex_lock(&vdev->igate);
> > @@ -1218,7 +1232,12 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
> >                                     hdr.count, data);
> >
> >       mutex_unlock(&vdev->igate);
> > -     kfree(data);
> > +
> > +out:
> > +     if (hdr.flags & VFIO_IRQ_SET_DATA_AFFINITY && data_size)
> > +             free_cpumask_var(mask);
> > +     else
> > +             kfree(data);
> >
> >       return ret;
> >  }
> > diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> > index 8382c5834335..fe01303cf94e 100644
> > --- a/drivers/vfio/pci/vfio_pci_intrs.c
> > +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> > @@ -19,6 +19,7 @@
> >  #include <linux/vfio.h>
> >  #include <linux/wait.h>
> >  #include <linux/slab.h>
> > +#include <linux/cpuset.h>
> >
> >  #include "vfio_pci_priv.h"
> >
> > @@ -675,6 +676,41 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev,
> >       return 0;
> >  }
> >
> > +static int vfio_pci_set_msi_affinity(struct vfio_pci_core_device *vdev,
> > +                                  unsigned int start, unsigned int count,
> > +                                  struct cpumask *irq_mask)
>
> Aside from the name, what makes this unique to MSI vectors?
>

Actually nothing, I had a use case for VFIO msi and msi-x based devices only.

> > +{
> > +     struct vfio_pci_irq_ctx *ctx;
> > +     cpumask_var_t allowed_mask;
> > +     unsigned int i;
> > +     int err = 0;
> > +
> > +     if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
> > +             return -ENOMEM;
> > +
> > +     cpuset_cpus_allowed(current, allowed_mask);
> > +     if (!cpumask_subset(irq_mask, allowed_mask)) {
> > +             err = -EPERM;
> > +             goto finish;
> > +     }
> > +
> > +     for (i = start; i < start + count; i++) {
> > +             ctx = vfio_irq_ctx_get(vdev, i);
> > +             if (!ctx) {
> > +                     err = -EINVAL;
> > +                     break;
> > +             }
> > +
> > +             err = irq_set_affinity(ctx->producer.irq, irq_mask);
> > +             if (err)
> > +                     break;
> > +     }
>
> Is this typical/userful behavior to set a series of vectors to the same
> cpu_set_t?  It's unusual behavior for this ioctl to apply the same data
> across multiple vectors.  Should the DATA_AFFINITY case support an
> array of cpu_set_t?
>

My main use case is to configure NVMe queues in a virtual machine monitor
to interrupt only the physical CPUs assigned to that vmm. Then we can
set the same cpu_set_t to all the admin and I/O queues with a single ioctl().

I reckon another usage would be to assign a specific CPU for each interrupt
vector: with this interface it requires multiple ioctl().

I'm worried about the size of the argument if we allow an array of cpu_set_t
for a device with many interrupt vectors.

> > +
> > +finish:
> > +     free_cpumask_var(allowed_mask);
> > +     return err;
> > +}
> > +
> >  static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
> >                                   unsigned index, unsigned start,
> >                                   unsigned count, uint32_t flags, void *data)
> > @@ -713,6 +749,9 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev,
> >       if (!irq_is(vdev, index))
> >               return -EINVAL;
> >
> > +     if (flags & VFIO_IRQ_SET_DATA_AFFINITY)
> > +             return vfio_pci_set_msi_affinity(vdev, start, count, data);
> > +
> >       for (i = start; i < start + count; i++) {
> >               ctx = vfio_irq_ctx_get(vdev, i);
> >               if (!ctx)
> > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> > index e97d796a54fb..e75c5d66681c 100644
> > --- a/drivers/vfio/vfio_main.c
> > +++ b/drivers/vfio/vfio_main.c
> > @@ -1505,23 +1505,28 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
> >               size = 0;
> >               break;
> >       case VFIO_IRQ_SET_DATA_BOOL:
> > -             size = sizeof(uint8_t);
> > +             size = hdr->count * sizeof(uint8_t);
> >               break;
> >       case VFIO_IRQ_SET_DATA_EVENTFD:
> > -             size = sizeof(int32_t);
> > +             size = size_mul(hdr->count, sizeof(int32_t));
>
> Why use size_mul() in one place and not the other?
>

Right. The DATA_BOOL cannot overflow this `hdr->count` has been checked
already but it would be more consistent to use it there too.

> > +             break;
> > +     case VFIO_IRQ_SET_DATA_AFFINITY:
> > +             size = hdr->argsz - minsz;
> > +             if (size > cpumask_size())
> > +                     size = cpumask_size();
>
> Or just set size = (hdr->argsz - minsz) / count?
>
> Generate an error if (hdr->argsz - minsz) % count?
>
> It seems like all the cpumask'items can be contained to the set affinity
> function.
>

Ok. Indeed we can just copy the hdr->argz - minsz, then allocate and copy
the cpumask_var_t only in the set affinity function. It only costs 1 memory
allocation but the patch will be less intrusive in the generic ioctl()) code.

> >               break;
> >       default:
> >               return -EINVAL;
> >       }
> >
> >       if (size) {
> > -             if (hdr->argsz - minsz < hdr->count * size)
> > +             if (hdr->argsz - minsz < size)
> >                       return -EINVAL;
> >
> >               if (!data_size)
> >                       return -EINVAL;
> >
> > -             *data_size = hdr->count * size;
> > +             *data_size = size;
> >       }
> >
> >       return 0;
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 2b68e6cdf190..5ba2ca223550 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -580,6 +580,12 @@ struct vfio_irq_info {
> >   *
> >   * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
> >   * ACTION_TRIGGER specifies kernel->user signaling.
> > + *
> > + * DATA_AFFINITY specifies the affinity for the range of interrupt vectors.
> > + * It must be set with ACTION_TRIGGER in 'flags'. The variable-length 'data'
> > + * array is a CPU affinity mask 'cpu_set_t' structure, as for the
> > + * sched_setaffinity() syscall argument: the 'argsz' field is used to check
> > + * the actual cpu_set_t size.
> >   */
>
> DATA_CPUSET?
>
> The IRQ_INFO ioctl should probably also report support for this
> feature.
>

Ok, I will do it in the next revision.

> Is there any proposed userspace code that takes advantage of this
> interface?  Thanks,
>

Not yet but I will work on it.

Thanks for your review.

Fred

> Alex
>
> >  struct vfio_irq_set {
> >       __u32   argsz;
> > @@ -587,6 +593,7 @@ struct vfio_irq_set {
> >  #define VFIO_IRQ_SET_DATA_NONE               (1 << 0) /* Data not present */
> >  #define VFIO_IRQ_SET_DATA_BOOL               (1 << 1) /* Data is bool (u8) */
> >  #define VFIO_IRQ_SET_DATA_EVENTFD    (1 << 2) /* Data is eventfd (s32) */
> > +#define VFIO_IRQ_SET_DATA_AFFINITY   (1 << 6) /* Data is cpu_set_t */
> >  #define VFIO_IRQ_SET_ACTION_MASK     (1 << 3) /* Mask interrupt */
> >  #define VFIO_IRQ_SET_ACTION_UNMASK   (1 << 4) /* Unmask interrupt */
> >  #define VFIO_IRQ_SET_ACTION_TRIGGER  (1 << 5) /* Trigger interrupt */
> > @@ -599,7 +606,8 @@ struct vfio_irq_set {
> >
> >  #define VFIO_IRQ_SET_DATA_TYPE_MASK  (VFIO_IRQ_SET_DATA_NONE | \
> >                                        VFIO_IRQ_SET_DATA_BOOL | \
> > -                                      VFIO_IRQ_SET_DATA_EVENTFD)
> > +                                      VFIO_IRQ_SET_DATA_EVENTFD | \
> > +                                      VFIO_IRQ_SET_DATA_AFFINITY)
> >  #define VFIO_IRQ_SET_ACTION_TYPE_MASK        (VFIO_IRQ_SET_ACTION_MASK | \
> >                                        VFIO_IRQ_SET_ACTION_UNMASK | \
> >                                        VFIO_IRQ_SET_ACTION_TRIGGER)
>