I'm looking for opinions on this approach. For vfio device assignment we minimally need a way to get EOIs from the in-kernel irqchip out to userspace. Getting that out via an eventfd would allow us to bounce all level interrupts out to userspace, where we would de-assert the device interrupt in qemu and unmask the physical device. Ideally we could deassert the interrupt in KVM, which allows us to send the EOI directly to vfio. To do that, we need to use a new IRQ source ID so the guest sees the logical OR of qemu requested state and external device state. This allows external devices to share interrupts with emulated devices, just like KVM assignment. That means the means we also need to use a new source ID when injecting the interrupt via irqfd. Rather than creating a source ID allocation interface, extending irqfd to support a user supplied source ID, and creating another new interface to get the EOI out, I think it works out better to bundle these all together as just a level irqfd interface. This way we don't allow users to create unbalanced states where a level interrupt is asserted, but has no way of being deasserted. I believe the below does this, but needs testing and validation with an implementation in qemu. Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx> --- arch/x86/kvm/x86.c | 1 + include/linux/kvm.h | 6 +++ include/linux/kvm_host.h | 4 +- virt/kvm/eventfd.c | 90 ++++++++++++++++++++++++++++++++++++++-------- virt/kvm/kvm_main.c | 2 + 5 files changed, 84 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a01a424..80bed07 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2148,6 +2148,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_PCI_2_3: case KVM_CAP_KVMCLOCK_CTRL: + case KVM_CAP_IRQFD_LEVEL: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2ce09aa..cca49a1 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_PPC_GET_SMMU_INFO 78 #define KVM_CAP_S390_COW 79 #define KVM_CAP_PPC_ALLOC_HTAB 80 +#define KVM_CAP_IRQFD_LEVEL 81 #ifdef KVM_CAP_IRQ_ROUTING @@ -683,12 +684,15 @@ struct kvm_xen_hvm_config { #endif #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) +/* Available with KVM_CAP_IRQFD_LEVEL */ +#define KVM_IRQFD_FLAG_LEVEL (1 << 1) struct kvm_irqfd { __u32 fd; __u32 gsi; __u32 flags; - __u8 pad[20]; + __u32 eoi_fd; + __u8 pad[16]; }; struct kvm_clock_data { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 27ac8a4..ae3b426 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -824,7 +824,7 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} #ifdef CONFIG_HAVE_KVM_EVENTFD void kvm_eventfd_init(struct kvm *kvm); -int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); +int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); void kvm_irqfd_release(struct kvm *kvm); void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); @@ -833,7 +833,7 @@ int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); static inline void kvm_eventfd_init(struct kvm *kvm) {} -static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) +static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { return -EINVAL; } diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index f59c1e8..89a6fa9 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -49,9 +49,13 @@ struct _irqfd { wait_queue_t wait; /* Update side is protected by irqfds.lock */ struct kvm_kernel_irq_routing_entry __rcu *irq_entry; - /* Used for level IRQ fast-path */ + /* Used for IRQ fast-path */ int gsi; struct work_struct inject; + /* Used for level EOI path */ + int irq_source_id; + struct eventfd_ctx *eoi_eventfd; + struct kvm_irq_ack_notifier notifier; /* Used for setup/shutdown */ struct eventfd_ctx *eventfd; struct list_head list; @@ -62,7 +66,7 @@ struct _irqfd { static struct workqueue_struct *irqfd_cleanup_wq; static void -irqfd_inject(struct work_struct *work) +irqfd_inject_edge(struct work_struct *work) { struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); struct kvm *kvm = irqfd->kvm; @@ -71,6 +75,23 @@ irqfd_inject(struct work_struct *work) kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); } +static void +irqfd_inject_level(struct work_struct *work) +{ + struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); + + kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 1); +} + +static void +irqfd_ack_level(struct kvm_irq_ack_notifier *notifier) +{ + struct _irqfd *irqfd = container_of(notifier, struct _irqfd, notifier); + + kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 0); + eventfd_signal(irqfd->eoi_eventfd, 1); +} + /* * Race-free decouple logic (ordering is critical) */ @@ -96,6 +117,14 @@ irqfd_shutdown(struct work_struct *work) * It is now safe to release the object's resources */ eventfd_ctx_put(irqfd->eventfd); + + if (irqfd->eoi_eventfd) { + kvm_unregister_irq_ack_notifier(irqfd->kvm, &irqfd->notifier); + eventfd_ctx_put(irqfd->eoi_eventfd); + kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 0); + kvm_free_irq_source_id(irqfd->kvm, irqfd->irq_source_id); + } + kfree(irqfd); } @@ -198,13 +227,13 @@ static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, } static int -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) +kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { struct kvm_irq_routing_table *irq_rt; struct _irqfd *irqfd, *tmp; struct file *file = NULL; - struct eventfd_ctx *eventfd = NULL; - int ret; + struct eventfd_ctx *eventfd = NULL, *eoi_eventfd = NULL; + int ret, irq_source_id = -1; unsigned int events; irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); @@ -212,12 +241,35 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) return -ENOMEM; irqfd->kvm = kvm; - irqfd->gsi = gsi; + irqfd->gsi = args->gsi; INIT_LIST_HEAD(&irqfd->list); - INIT_WORK(&irqfd->inject, irqfd_inject); + + if (args->flags & KVM_IRQFD_FLAG_LEVEL) { + irq_source_id = kvm_request_irq_source_id(kvm); + if (irq_source_id < 0) { + ret = irq_source_id; + goto fail; + } + + eoi_eventfd = eventfd_ctx_fdget(args->eoi_fd); + if (IS_ERR(eoi_eventfd)) { + ret = PTR_ERR(eoi_eventfd); + goto fail; + } + + irqfd->irq_source_id = irq_source_id; + irqfd->eoi_eventfd = eoi_eventfd; + irqfd->notifier.gsi = args->gsi; + irqfd->notifier.irq_acked = irqfd_ack_level; + kvm_register_irq_ack_notifier(kvm, &irqfd->notifier); + + INIT_WORK(&irqfd->inject, irqfd_inject_level); + } else + INIT_WORK(&irqfd->inject, irqfd_inject_edge); + INIT_WORK(&irqfd->shutdown, irqfd_shutdown); - file = eventfd_fget(fd); + file = eventfd_fget(args->fd); if (IS_ERR(file)) { ret = PTR_ERR(file); goto fail; @@ -282,6 +334,14 @@ fail: if (!IS_ERR(file)) fput(file); + if (eoi_eventfd && !IS_ERR(eoi_eventfd)) { + kvm_unregister_irq_ack_notifier(kvm, &irqfd->notifier); + eventfd_ctx_put(eoi_eventfd); + } + + if (irq_source_id >= 0) + kvm_free_irq_source_id(kvm, irq_source_id); + kfree(irqfd); return ret; } @@ -298,19 +358,19 @@ kvm_eventfd_init(struct kvm *kvm) * shutdown any irqfd's that match fd+gsi */ static int -kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) +kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) { struct _irqfd *irqfd, *tmp; struct eventfd_ctx *eventfd; - eventfd = eventfd_ctx_fdget(fd); + eventfd = eventfd_ctx_fdget(args->fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { + if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { /* * This rcu_assign_pointer is needed for when * another thread calls kvm_irq_routing_update before @@ -338,12 +398,12 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) } int -kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) +kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { - if (flags & KVM_IRQFD_FLAG_DEASSIGN) - return kvm_irqfd_deassign(kvm, fd, gsi); + if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) + return kvm_irqfd_deassign(kvm, args); - return kvm_irqfd_assign(kvm, fd, gsi); + return kvm_irqfd_assign(kvm, args); } /* diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 02cb440..b4ad14cc 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2059,7 +2059,7 @@ static long kvm_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&data, argp, sizeof data)) goto out; - r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); + r = kvm_irqfd(kvm, &data); break; } case KVM_IOEVENTFD: { -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html