This new ioctl enables an eventfd to be triggered when an EOI is written for a specified irqchip pin. The first user of this will be external device assignment through VFIO, using a level irqfd for asserting a PCI INTx interrupt and this interface for de-assert and notification once the interrupt is serviced. Here we make use of the reference counting of the _irq_source object allowing us to share it with an irqfd and cleanup regardless of the release order. Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx> --- Documentation/virtual/kvm/api.txt | 21 ++ arch/x86/kvm/x86.c | 2 include/linux/kvm.h | 15 ++ include/linux/kvm_host.h | 13 + virt/kvm/eventfd.c | 336 +++++++++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 11 + 6 files changed, 398 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 3911e62..8cd6b36 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1989,6 +1989,27 @@ return the hash table order in the parameter. (If the guest is using the virtualized real-mode area (VRMA) facility, the kernel will re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) +4.77 KVM_EOIFD + +Capability: KVM_CAP_EOIFD +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_eoifd (in) +Returns: 0 on success, < 0 on error + +KVM_EOIFD allows userspace to receive interrupt EOI notification +through an eventfd. kvm_eoifd.fd specifies the eventfd used for +notification. KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd +once assigned. KVM_EOIFD also requires additional bits set in +kvm_eoifd.flags to bind to the proper interrupt line. The +KVM_EOIFD_FLAG_LEVEL_IRQFD indicates that kvm_eoifd.key is provided +and is a key from a level triggered interrupt (configured from +KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL). The EOI notification is bound +to the same GSI and irqchip input as the irqfd. 
Both kvm_eoifd.key +and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on assignment and +de-assignment of KVM_EOIFD. A level irqfd may only be bound to a +single eoifd. KVM_CAP_EOIFD_LEVEL_IRQFD indicates support of +KVM_EOIFD_FLAG_LEVEL_IRQFD. 5. The kvm_run structure ------------------------ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ded39d..8f3164e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2171,6 +2171,8 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PCI_2_3: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_IRQFD_LEVEL: + case KVM_CAP_EOIFD: + case KVM_CAP_EOIFD_LEVEL_IRQFD: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index b2e6e4f..effb916 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -619,6 +619,8 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_COW 79 #define KVM_CAP_PPC_ALLOC_HTAB 80 #define KVM_CAP_IRQFD_LEVEL 81 +#define KVM_CAP_EOIFD 82 +#define KVM_CAP_EOIFD_LEVEL_IRQFD 83 #ifdef KVM_CAP_IRQ_ROUTING @@ -694,6 +696,17 @@ struct kvm_irqfd { __u8 pad[20]; }; +#define KVM_EOIFD_FLAG_DEASSIGN (1 << 0) +/* Available with KVM_CAP_EOIFD_LEVEL_IRQFD */ +#define KVM_EOIFD_FLAG_LEVEL_IRQFD (1 << 1) + +struct kvm_eoifd { + __u32 fd; + __u32 flags; + __u32 key; + __u8 pad[20]; +}; + struct kvm_clock_data { __u64 clock; __u32 flags; @@ -834,6 +847,8 @@ struct kvm_s390_ucas_mapping { #define KVM_PPC_GET_SMMU_INFO _IOR(KVMIO, 0xa6, struct kvm_ppc_smmu_info) /* Available with KVM_CAP_PPC_ALLOC_HTAB */ #define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32) +/* Available with KVM_CAP_EOIFD */ +#define KVM_EOIFD _IOW(KVMIO, 0xa8, struct kvm_eoifd) /* * ioctls for vcpu fds diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c73f071..01e72a6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -289,6 +289,10 @@ struct kvm { struct mutex lock; struct list_head items; } irqsources; + struct { + spinlock_t lock; + struct list_head 
items; + } eoifds; #endif struct kvm_vm_stat stat; struct kvm_arch arch; @@ -832,6 +836,8 @@ int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); void kvm_irqfd_release(struct kvm *kvm); void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); +int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args); +void kvm_eoifd_release(struct kvm *kvm); #else @@ -857,6 +863,13 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) return -ENOSYS; } +static inline int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args) +{ + return -ENOSYS; +} + +static inline void kvm_eoifd_release(struct kvm *kvm) {} + #endif /* CONFIG_HAVE_KVM_EVENTFD */ #ifdef CONFIG_KVM_APIC_ARCHITECTURE diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 878cb52..3aa2d62 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -95,6 +95,25 @@ static struct _irq_source *_irq_source_alloc(struct kvm *kvm, int gsi) return source; } +static struct _irq_source *_irq_source_get_from_key(struct kvm *kvm, int key) +{ + struct _irq_source *tmp, *source = ERR_PTR(-ENOENT); + + mutex_lock(&kvm->irqsources.lock); + + list_for_each_entry(tmp, &kvm->irqsources.items, list) { + if (tmp->id == key) { + source = tmp; + kref_get(&source->kref); + break; + } + } + + mutex_unlock(&kvm->irqsources.lock); + + return source; +} + /* * -------------------------------------------------------------------- * irqfd: Allows an fd to be used to inject an interrupt to the guest @@ -406,6 +425,8 @@ kvm_eventfd_init(struct kvm *kvm) INIT_LIST_HEAD(&kvm->ioeventfds); mutex_init(&kvm->irqsources.lock); INIT_LIST_HEAD(&kvm->irqsources.items); + spin_lock_init(&kvm->eoifds.lock); + INIT_LIST_HEAD(&kvm->eoifds.items); } /* @@ -772,3 +793,318 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) return kvm_assign_ioeventfd(kvm, args); } + +/* + * -------------------------------------------------------------------- + * 
eoifd: Translate KVM APIC/IOAPIC EOI into eventfd signal. + * + * userspace can register with an eventfd for receiving + * notification when an EOI occurs. + * -------------------------------------------------------------------- + */ + +struct _eoifd { + /* eventfd triggered on EOI */ + struct eventfd_ctx *eventfd; + /* irq source ID de-asserted on EOI */ + struct _irq_source *source; + wait_queue_t wait; + /* EOI notification from KVM */ + struct kvm_irq_ack_notifier notifier; + struct list_head list; + poll_table pt; + struct work_struct shutdown; +}; + +/* Work item run from irqfd_cleanup_wq; eoifds.lock is NOT held here */ +static void eoifd_shutdown(struct work_struct *work) +{ + struct _eoifd *eoifd = container_of(work, struct _eoifd, shutdown); + struct kvm *kvm = eoifd->source->kvm; + u64 cnt; + + /* + * Stop EOI signaling + */ + kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier); + + /* + * Synchronize with the wait-queue and unhook ourselves to prevent + * further events. + */ + eventfd_ctx_remove_wait_queue(eoifd->eventfd, &eoifd->wait, &cnt); + + /* + * Release resources + */ + eventfd_ctx_put(eoifd->eventfd); + _irq_source_put(eoifd->source); + kfree(eoifd); +} + +/* assumes kvm->eoifds.lock is held */ +static bool eoifd_is_active(struct _eoifd *eoifd) +{ + return list_empty(&eoifd->list) ? 
false : true; +} + +/* + * Mark the eoifd as inactive and schedule it for removal + * + * assumes kvm->eoifds.lock is held + */ +static void eoifd_deactivate(struct _eoifd *eoifd) +{ + BUG_ON(!eoifd_is_active(eoifd)); + + list_del_init(&eoifd->list); + + queue_work(irqfd_cleanup_wq, &eoifd->shutdown); +} + +/* + * Called with wqh->lock held and interrupts disabled + */ +static int eoifd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + unsigned long flags = (unsigned long)key; + + if (unlikely(flags & POLLHUP)) { + /* The eventfd is closing, detach from KVM */ + struct _eoifd *eoifd = container_of(wait, struct _eoifd, wait); + struct kvm *kvm = eoifd->source->kvm; + unsigned long flags; + + spin_lock_irqsave(&kvm->eoifds.lock, flags); + + /* + * We must check if someone deactivated the eoifd before + * we could acquire the eoifds.lock since the item is + * deactivated from the KVM side before it is unhooked from + * the wait-queue. If it is already deactivated, we can + * simply return knowing the other side will clean up for us. + * We cannot race against the eoifd going away since the + * other side is required to acquire wqh->lock, which we hold. + */ + if (eoifd_is_active(eoifd)) + eoifd_deactivate(eoifd); + + spin_unlock_irqrestore(&kvm->eoifds.lock, flags); + } + + return 0; +} + +static void eoifd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct _eoifd *eoifd = container_of(pt, struct _eoifd, pt); + add_wait_queue(wqh, &eoifd->wait); +} + +/* + * This function is called as the kvm VM fd is being released. 
Shutdown all + * eoifds that still remain open + */ +void kvm_eoifd_release(struct kvm *kvm) +{ + struct _eoifd *tmp, *eoifd; + + spin_lock_irq(&kvm->eoifds.lock); + + list_for_each_entry_safe(eoifd, tmp, &kvm->eoifds.items, list) + eoifd_deactivate(eoifd); + + spin_unlock_irq(&kvm->eoifds.lock); + + flush_workqueue(irqfd_cleanup_wq); +} + +static void eoifd_event(struct kvm_irq_ack_notifier *notifier) +{ + struct _eoifd *eoifd; + + eoifd = container_of(notifier, struct _eoifd, notifier); + + if (unlikely(!eoifd->source)) + return; + + /* + * De-assert and send EOI, user needs to re-assert if + * device still requires service. + */ + kvm_set_irq(eoifd->source->kvm, + eoifd->source->id, eoifd->source->gsi, 0); + eventfd_signal(eoifd->eventfd, 1); +} + +static int kvm_assign_eoifd(struct kvm *kvm, struct kvm_eoifd *args) +{ + struct file *file = NULL; + struct eventfd_ctx *eventfd = NULL; + struct _eoifd *eoifd = NULL, *tmp; + struct _irq_source *source = NULL; + int ret; + u64 cnt; + + if (!(args->flags & KVM_EOIFD_FLAG_LEVEL_IRQFD)) + return -EINVAL; + + file = eventfd_fget(args->fd); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto fail; + } + + eventfd = eventfd_ctx_fileget(file); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto fail; + } + + eoifd = kzalloc(sizeof(*eoifd), GFP_KERNEL); + if (!eoifd) { + ret = -ENOMEM; + goto fail; + } + + source = _irq_source_get_from_key(kvm, args->key); + if (IS_ERR(source)) { + ret = PTR_ERR(source); + goto fail; + } + + INIT_LIST_HEAD(&eoifd->list); + INIT_WORK(&eoifd->shutdown, eoifd_shutdown); + eoifd->eventfd = eventfd; + eoifd->notifier.gsi = source->gsi; + eoifd->notifier.irq_acked = eoifd_event; + + /* + * Install our own custom wake-up handling so we are notified via + * a callback whenever someone releases the underlying eventfd + */ + init_waitqueue_func_entry(&eoifd->wait, eoifd_wakeup); + init_poll_funcptr(&eoifd->pt, eoifd_ptable_queue_proc); + + /* + * Clear out any previously released eoifds that 
might conflict + */ + flush_workqueue(irqfd_cleanup_wq); + + /* + * This can sleep, so register before acquiring spinlock, notifier + * becomes a nop until we finish. + */ + kvm_register_irq_ack_notifier(kvm, &eoifd->notifier); + + /* + * Install the wait queue function to allow cleanup when the + * eventfd is closed by the user. This grabs the wqh lock, so + * we do it out of spinlock, holding the file reference ensures + * we won't see a POLLHUP until setup is complete. + */ + file->f_op->poll(file, &eoifd->pt); + + spin_lock_irq(&kvm->eoifds.lock); + + /* + * Enforce a one-to-one relationship between irq source and eoifd so + * that this interface can't be used to consume all kernel memory. + * NB. single eventfd can still be used by multiple eoifds. + */ + list_for_each_entry(tmp, &kvm->eoifds.items, list) { + if (tmp->source == source) { + spin_unlock_irq(&kvm->eoifds.lock); + ret = -EBUSY; + goto fail_unregister; + } + } + + list_add_tail(&eoifd->list, &kvm->eoifds.items); + eoifd->source = source; /* Enable ack notifier */ + + spin_unlock_irq(&kvm->eoifds.lock); + + fput(file); /* Enable POLLHUP */ + + return 0; + +fail_unregister: + eventfd_ctx_remove_wait_queue(eventfd, &eoifd->wait, &cnt); + kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier); +fail: + if (source && !IS_ERR(source)) + _irq_source_put(source); + + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + + if (file && !IS_ERR(file)) + fput(file); + + kfree(eoifd); + return ret; +} + +static int kvm_deassign_eoifd(struct kvm *kvm, struct kvm_eoifd *args) +{ + struct eventfd_ctx *eventfd = NULL; + struct _irq_source *source = NULL; + struct _eoifd *eoifd; + int ret = -ENOENT; + + if (!(args->flags & KVM_EOIFD_FLAG_LEVEL_IRQFD)) + return -EINVAL; + + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) { + ret = PTR_ERR(eventfd); + goto fail; + } + + source = _irq_source_get_from_key(kvm, args->key); + if (IS_ERR(source)) { + ret = PTR_ERR(source); + goto fail; + } + + 
spin_lock_irq(&kvm->eoifds.lock); + + list_for_each_entry(eoifd, &kvm->eoifds.items, list) { + if (eoifd->eventfd == eventfd && eoifd->source == source) { + eoifd_deactivate(eoifd); + ret = 0; + break; + } + } + + spin_unlock_irq(&kvm->eoifds.lock); + +fail: + if (source && !IS_ERR(source)) + _irq_source_put(source); + if (eventfd && !IS_ERR(eventfd)) + eventfd_ctx_put(eventfd); + + /* + * Block until we know all outstanding shutdown jobs have completed + * so that we guarantee there will not be any more EOIs signaled on + * this eventfd once this deassign function returns. + */ + flush_workqueue(irqfd_cleanup_wq); + + return ret; +} + +int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args) +{ + if (args->flags & ~(KVM_EOIFD_FLAG_DEASSIGN | + KVM_EOIFD_FLAG_LEVEL_IRQFD)) + return -EINVAL; + + if (args->flags & KVM_EOIFD_FLAG_DEASSIGN) + return kvm_deassign_eoifd(kvm, args); + + return kvm_assign_eoifd(kvm, args); +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2468523..0b241bf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -620,6 +620,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) kvm_irqfd_release(kvm); + kvm_eoifd_release(kvm); + kvm_put_kvm(kvm); return 0; } @@ -2093,6 +2095,15 @@ static long kvm_vm_ioctl(struct file *filp, break; } #endif + case KVM_EOIFD: { + struct kvm_eoifd data; + + r = -EFAULT; + if (copy_from_user(&data, argp, sizeof data)) + goto out; + r = kvm_eoifd(kvm, &data); + break; + } default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); if (r == -ENOTTY) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html