On Tue, 26 Mar 2024 09:38:04 +0530 Nipun Gupta <nipun.gupta@xxxxxxx> wrote: > Support the following ioctls for CDX devices: > - VFIO_DEVICE_GET_IRQ_INFO > - VFIO_DEVICE_SET_IRQS > > This allows user to set an eventfd for cdx device interrupts and > trigger this interrupt eventfd from userspace. > All CDX device interrupts are MSIs. The MSIs are allocated from the > CDX-MSI domain. > > Signed-off-by: Nipun Gupta <nipun.gupta@xxxxxxx> > Reviewed-by: Pieter Jansen van Vuuren <pieter.jansen-van-vuuren@xxxxxxx> > --- > > Changes v4->v5: > - Rebased on 6.9-rc1 > > Changes v3->v4: > - Added VFIO_IRQ_INFO_NORESIZE flag > > Changes v2->v3: > - Use generic MSI alloc/free APIs instead of CDX MSI APIs > - Rebased on Linux 6.8-rc6 > > Changes v1->v2: > - Rebased on Linux 6.7-rc1 > > drivers/vfio/cdx/Makefile | 2 +- > drivers/vfio/cdx/intr.c | 211 +++++++++++++++++++++++++++++++++++++ > drivers/vfio/cdx/main.c | 60 ++++++++++- > drivers/vfio/cdx/private.h | 18 ++++ > 4 files changed, 289 insertions(+), 2 deletions(-) > create mode 100644 drivers/vfio/cdx/intr.c > > diff --git a/drivers/vfio/cdx/Makefile b/drivers/vfio/cdx/Makefile > index cd4a2e6fe609..df92b320122a 100644 > --- a/drivers/vfio/cdx/Makefile > +++ b/drivers/vfio/cdx/Makefile > @@ -5,4 +5,4 @@ > > obj-$(CONFIG_VFIO_CDX) += vfio-cdx.o > > -vfio-cdx-objs := main.o > +vfio-cdx-objs := main.o intr.o > diff --git a/drivers/vfio/cdx/intr.c b/drivers/vfio/cdx/intr.c > new file mode 100644 > index 000000000000..4637b57d0242 > --- /dev/null > +++ b/drivers/vfio/cdx/intr.c > @@ -0,0 +1,211 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. > + */ > + > +#include <linux/vfio.h> > +#include <linux/slab.h> > +#include <linux/types.h> > +#include <linux/eventfd.h> > +#include <linux/msi.h> > +#include <linux/interrupt.h> > + > +#include "linux/cdx/cdx_bus.h" > +#include "private.h" > + > +static irqreturn_t vfio_cdx_msihandler(int irq_no, void *arg) > +{ > + struct eventfd_ctx *trigger = arg; > + > + eventfd_signal(trigger); > + return IRQ_HANDLED; > +} > + > +static int vfio_cdx_msi_enable(struct vfio_cdx_device *vdev, int nvec) > +{ > + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); > + struct device *dev = vdev->vdev.dev; > + int msi_idx, ret; > + > + vdev->cdx_irqs = kcalloc(nvec, sizeof(struct vfio_cdx_irq), GFP_KERNEL); > + if (!vdev->cdx_irqs) > + return -ENOMEM; > + > + ret = cdx_enable_msi(cdx_dev); > + if (ret) { > + kfree(vdev->cdx_irqs); > + return ret; > + } > + > + /* Allocate cdx MSIs */ > + ret = msi_domain_alloc_irqs(dev, MSI_DEFAULT_DOMAIN, nvec); > + if (ret) { > + cdx_disable_msi(cdx_dev); > + kfree(vdev->cdx_irqs); > + return ret; > + } > + > + for (msi_idx = 0; msi_idx < nvec; msi_idx++) > + vdev->cdx_irqs[msi_idx].irq_no = msi_get_virq(dev, msi_idx); > + > + vdev->irq_count = nvec; > + vdev->config_msi = 1; > + > + return 0; > +} > + > +static int vfio_cdx_msi_set_vector_signal(struct vfio_cdx_device *vdev, > + int vector, int fd) > +{ > + struct eventfd_ctx *trigger; > + int irq_no, ret; > + > + if (vector < 0 || vector >= vdev->irq_count) > + return -EINVAL; > + > + irq_no = vdev->cdx_irqs[vector].irq_no; > + > + if (vdev->cdx_irqs[vector].trigger) { > + free_irq(irq_no, vdev->cdx_irqs[vector].trigger); > + kfree(vdev->cdx_irqs[vector].name); > + eventfd_ctx_put(vdev->cdx_irqs[vector].trigger); > + vdev->cdx_irqs[vector].trigger = NULL; > + } > + > + if (fd < 0) > + return 0; > + > + vdev->cdx_irqs[vector].name = kasprintf(GFP_KERNEL, "vfio-msi[%d](%s)", > + vector, dev_name(vdev->vdev.dev)); > + if (!vdev->cdx_irqs[vector].name) > + return -ENOMEM; > + > + trigger = eventfd_ctx_fdget(fd); > + if (IS_ERR(trigger)) { > + kfree(vdev->cdx_irqs[vector].name); > + return PTR_ERR(trigger); > + } > + > + ret = request_irq(irq_no, vfio_cdx_msihandler, 0, > + vdev->cdx_irqs[vector].name, trigger); > + if (ret) { > + kfree(vdev->cdx_irqs[vector].name); > + eventfd_ctx_put(trigger); > + return ret; > + } > + > + vdev->cdx_irqs[vector].trigger = trigger; > + > + return 0; > +} > + > +static int vfio_cdx_msi_set_block(struct vfio_cdx_device *vdev, > + unsigned int start, unsigned int count, > + int32_t *fds) > +{ > + int i, j, ret = 0; > + > + if (start >= vdev->irq_count || start + count > vdev->irq_count) > + return -EINVAL; > + > + for (i = 0, j = start; i < count && !ret; i++, j++) { > + int fd = fds ? fds[i] : -1; > + > + ret = vfio_cdx_msi_set_vector_signal(vdev, j, fd); > + } > + > + if (ret) { > + for (--j; j >= (int)start; j--) > + vfio_cdx_msi_set_vector_signal(vdev, j, -1); > + } > + > + return ret; > +} > + > +static void vfio_cdx_msi_disable(struct vfio_cdx_device *vdev) > +{ > + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); > + struct device *dev = vdev->vdev.dev; > + > + vfio_cdx_msi_set_block(vdev, 0, vdev->irq_count, NULL); > + > + if (!vdev->config_msi) > + return; > + > + msi_domain_free_irqs_all(dev, MSI_DEFAULT_DOMAIN); > + cdx_disable_msi(cdx_dev); > + kfree(vdev->cdx_irqs); > + > + vdev->cdx_irqs = NULL; > + vdev->irq_count = 0; > + vdev->config_msi = 0; > +} > + > +static int vfio_cdx_set_msi_trigger(struct vfio_cdx_device *vdev, > + unsigned int index, unsigned int start, > + unsigned int count, u32 flags, > + void *data) > +{ > + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); > + int i; > + > + if (start + count > cdx_dev->num_msi) > + return -EINVAL; > + > + if (!count && (flags & VFIO_IRQ_SET_DATA_NONE)) { > + vfio_cdx_msi_disable(vdev); > + return 0; > + } > + > + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { > + s32 *fds = data; > + int ret; > + > + if (vdev->config_msi) > + return vfio_cdx_msi_set_block(vdev, start, count, > + fds); > + ret = vfio_cdx_msi_enable(vdev, cdx_dev->num_msi); > + if (ret) > + return ret; > + > + ret = vfio_cdx_msi_set_block(vdev, start, count, fds); > + if (ret) > + vfio_cdx_msi_disable(vdev); > + > + return ret; > + } > + > + for (i = start; i < start + count; i++) { > + if (!vdev->cdx_irqs[i].trigger) > + continue; > + if (flags & VFIO_IRQ_SET_DATA_NONE) > + eventfd_signal(vdev->cdx_irqs[i].trigger); Typically DATA_BOOL support is also added here. > + } > + > + return 0; > +} > + > +int vfio_cdx_set_irqs_ioctl(struct vfio_cdx_device *vdev, > + u32 flags, unsigned int index, > + unsigned int start, unsigned int count, > + void *data) > +{ > + if (flags & VFIO_IRQ_SET_ACTION_TRIGGER) > + return vfio_cdx_set_msi_trigger(vdev, index, start, > + count, flags, data); > + else > + return -EINVAL; > +} > + > +/* Free All IRQs for the given device */ > +void vfio_cdx_irqs_cleanup(struct vfio_cdx_device *vdev) > +{ > + /* > + * Device does not support any interrupt or the interrupts > + * were not configured > + */ > + if (!vdev->cdx_irqs) > + return; > + > + vfio_cdx_set_msi_trigger(vdev, 1, 0, 0, VFIO_IRQ_SET_DATA_NONE, NULL); @index is passed as 1 here. AFAICT only index zero is supported. The SET_IRQS ioctl path catches this in vfio_set_irqs_validate_and_prepare() but it might cause some strange behavior here if another index were ever added. > +} > diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c > index 9cff8d75789e..f0861a38ae10 100644 > --- a/drivers/vfio/cdx/main.c > +++ b/drivers/vfio/cdx/main.c > @@ -61,6 +61,7 @@ static void vfio_cdx_close_device(struct vfio_device *core_vdev) > > kfree(vdev->regions); > cdx_dev_reset(core_vdev->dev); > + vfio_cdx_irqs_cleanup(vdev); > } > > static int vfio_cdx_bm_ctrl(struct vfio_device *core_vdev, u32 flags, > @@ -123,7 +124,7 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, > info.flags |= VFIO_DEVICE_FLAGS_RESET; > > info.num_regions = cdx_dev->res_count; > - info.num_irqs = 0; > + info.num_irqs = cdx_dev->num_msi ? 1 : 0; > > return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; > } > @@ -152,6 +153,59 @@ static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev, > return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; > } > > +static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev, > + struct vfio_irq_info __user *arg) > +{ > + unsigned long minsz = offsetofend(struct vfio_irq_info, count); > + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); > + struct vfio_irq_info info; > + > + if (copy_from_user(&info, arg, minsz)) > + return -EFAULT; > + > + if (info.argsz < minsz) > + return -EINVAL; > + > + if (info.index >= 1) > + return -EINVAL; > + > + info.flags = VFIO_IRQ_INFO_EVENTFD | VFIO_IRQ_INFO_NORESIZE; > + info.count = cdx_dev->num_msi; This should return -EINVAL if cdx_dev->num_msi is zero. > + > + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; > +} > + > +static int vfio_cdx_ioctl_set_irqs(struct vfio_cdx_device *vdev, > + struct vfio_irq_set __user *arg) > +{ > + unsigned long minsz = offsetofend(struct vfio_irq_set, count); > + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); > + struct vfio_irq_set hdr; > + size_t data_size = 0; > + u8 *data = NULL; > + int ret = 0; > + > + if (copy_from_user(&hdr, arg, minsz)) > + return -EFAULT; > + > + ret = vfio_set_irqs_validate_and_prepare(&hdr, cdx_dev->num_msi, > + 1, &data_size); > + if (ret) > + return ret; > + > + if (data_size) { > + data = memdup_user(arg->data, data_size); > + if (IS_ERR(data)) > + return PTR_ERR(data); > + } > + > + ret = vfio_cdx_set_irqs_ioctl(vdev, hdr.flags, hdr.index, > + hdr.start, hdr.count, data); > + kfree(data); > + > + return ret; > +} > + > static long vfio_cdx_ioctl(struct vfio_device *core_vdev, > unsigned int cmd, unsigned long arg) > { > @@ -164,6 +218,10 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev, > return vfio_cdx_ioctl_get_info(vdev, uarg); > case VFIO_DEVICE_GET_REGION_INFO: > return vfio_cdx_ioctl_get_region_info(vdev, uarg); > + case VFIO_DEVICE_GET_IRQ_INFO: > + return vfio_cdx_ioctl_get_irq_info(vdev, uarg); > + case VFIO_DEVICE_SET_IRQS: > + return vfio_cdx_ioctl_set_irqs(vdev, uarg); > case VFIO_DEVICE_RESET: > return cdx_dev_reset(core_vdev->dev); > default: > diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h > index 8e9d25913728..7a8477ae4652 100644 > --- a/drivers/vfio/cdx/private.h > +++ b/drivers/vfio/cdx/private.h > @@ -13,6 +13,14 @@ static inline u64 vfio_cdx_index_to_offset(u32 index) > return ((u64)(index) << VFIO_CDX_OFFSET_SHIFT); > } > > +struct vfio_cdx_irq { > + u32 flags; > + u32 count; > + int irq_no; > + struct eventfd_ctx *trigger; > + char *name; > +}; > + > struct vfio_cdx_region { > u32 flags; > u32 type; > @@ -25,6 +33,16 @@ struct vfio_cdx_device { > struct vfio_cdx_region *regions; > u32 flags; > #define BME_SUPPORT BIT(0) > + struct vfio_cdx_irq *cdx_irqs; > + u32 irq_count; > + u32 config_msi; > }; You might want to reorder these to avoid holes in the data structures. vfio_cdx_irq will have a 4byte hole in the middle, vfio_cdx_device will have a 4byte hole after flags. config_msi is used as a bool, I'm not sure why it's defined as a u32. irq_count also holds a value up to 1, so a u32 might be oversized. There's clearly latent support here for devices with multiple MSI vectors, is this just copying vfio-pci-core or will CDX devices have multiple MSIs? Thanks, Alex > > +int vfio_cdx_set_irqs_ioctl(struct vfio_cdx_device *vdev, > + u32 flags, unsigned int index, > + unsigned int start, unsigned int count, > + void *data); > + > +void vfio_cdx_irqs_cleanup(struct vfio_cdx_device *vdev); > + > #endif /* VFIO_CDX_PRIVATE_H */