On Fri, Aug 10, 2012 at 04:37:48PM -0600, Alex Williamson wrote: > Enable a mechanism for IRQ ACKs to be exposed through an eventfd. The > user can specify the GSI and optionally an IRQ source ID and have the > provided eventfd trigger whenever the irqchip resamples its inputs, > for instance on EOI. > > Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx> > --- > > Documentation/virtual/kvm/api.txt | 18 ++ > arch/x86/kvm/x86.c | 2 > include/linux/kvm.h | 16 ++ > include/linux/kvm_host.h | 13 ++ > virt/kvm/eventfd.c | 285 +++++++++++++++++++++++++++++++++++++ > virt/kvm/kvm_main.c | 10 + > 6 files changed, 344 insertions(+) > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt > index 17cd599..77b4837 100644 > --- a/Documentation/virtual/kvm/api.txt > +++ b/Documentation/virtual/kvm/api.txt > @@ -2011,6 +2011,24 @@ of IRQ source IDs. These IDs are also shared with KVM internal users > (ex. KVM assigned devices, PIT, shared user ID), therefore not all IDs > may be allocated through this interface. > > +4.78 KVM_IRQ_ACKFD > + > +Capability: KVM_CAP_IRQ_ACKFD > +Architectures: x86 > +Type: vm ioctl > +Parameters: struct kvm_irq_ackfd (in) > +Returns: 0 on success, -errno on error > + > +Allows userspace notification of IRQ ACKs, or resampling of irqchip > +inputs, often associated with an EOI. User provided kvm_irq_ackfd.fd > +and kvm_irq_ackfd.gsi are required and result in an eventfd trigger > +whenever the GSI is acknowledged. When KVM_CAP_IRQ_ACKFD_IRQ_SOURCE_FD > +is available, KVM_IRQ_ACKFD supports the KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID > +flag which indicates kvm_irqfd.irq_source_id is provided. With this, > +the eventfd is only triggered when the specified IRQ source ID is > +pending. On deassign, fd, gsi, and irq_source_id (if provided on assign) > +must be provided. 
> + > This seems to imply that ACKFD can be used with edge GSIs, but this does not actually work: with source ID because of the filtering, and without because of PV EOI. It seems that what we are really tracking is level change 1->0. For edge no level -> no notification? Or does 'resampling of irqchip inputs' imply level somehow? > 5. The kvm_run structure > ------------------------ > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 19680ed..3d98e59 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2176,6 +2176,8 @@ int kvm_dev_ioctl_check_extension(long ext) > case KVM_CAP_KVMCLOCK_CTRL: > case KVM_CAP_IRQFD_IRQ_SOURCE_ID: > case KVM_CAP_IRQFD_ASSERT_ONLY: > + case KVM_CAP_IRQ_ACKFD: > + case KVM_CAP_IRQ_ACKFD_IRQ_SOURCE_ID: > r = 1; > break; > case KVM_CAP_COALESCED_MMIO: > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > index 19b1235..0f53bd5 100644 > --- a/include/linux/kvm.h > +++ b/include/linux/kvm.h > @@ -621,6 +621,8 @@ struct kvm_ppc_smmu_info { > #define KVM_CAP_NR_IRQ_SOURCE_ID 81 > #define KVM_CAP_IRQFD_IRQ_SOURCE_ID 82 > #define KVM_CAP_IRQFD_ASSERT_ONLY 83 > +#define KVM_CAP_IRQ_ACKFD 84 > +#define KVM_CAP_IRQ_ACKFD_IRQ_SOURCE_ID 85 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -707,6 +709,18 @@ struct kvm_irq_source_id { > __u8 pad[24]; > }; > > +#define KVM_IRQ_ACKFD_FLAG_DEASSIGN (1 << 0) > +/* Available with KVM_CAP_IRQ_ACKFD_IRQ_SOURCE_ID */ > +#define KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID (1 << 1) > + > +struct kvm_irq_ackfd { > + __u32 flags; > + __u32 fd; > + __u32 gsi; > + __u32 irq_source_id; > + __u8 pad[16]; > +}; > + > struct kvm_clock_data { > __u64 clock; > __u32 flags; > @@ -849,6 +863,8 @@ struct kvm_s390_ucas_mapping { > #define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32) > /* Available with KVM_CAP_IRQ_SOURCE_ID */ > #define KVM_IRQ_SOURCE_ID _IOWR(KVMIO, 0xa8, struct kvm_irq_source_id) > +/* Available with KVM_CAP_IRQ_ACKFD */ > +#define KVM_IRQ_ACKFD _IOW(KVMIO, 0xa9, struct kvm_irq_ackfd) > > /* > * 
ioctls for vcpu fds > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index ea6d7a1..cdc55c2 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -285,6 +285,10 @@ struct kvm { > struct list_head items; > } irqfds; > struct list_head ioeventfds; > + struct { > + spinlock_t lock; > + struct list_head items; > + } irq_ackfds; > #endif > struct kvm_vm_stat stat; > struct kvm_arch arch; > @@ -831,6 +835,8 @@ int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); > void kvm_irqfd_release(struct kvm *kvm); > void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *); > int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); > +int kvm_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args); > +void kvm_irq_ackfd_release(struct kvm *kvm); > > #else > > @@ -843,6 +849,13 @@ static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) > > static inline void kvm_irqfd_release(struct kvm *kvm) {} > > +static inline int kvm_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args) > +{ > + return -EINVAL; > +} > + > +static inline void kvm_irq_ackfd_release(struct kvm *kvm) {} > + > #ifdef CONFIG_HAVE_KVM_IRQCHIP > static inline void kvm_irq_routing_update(struct kvm *kvm, > struct kvm_irq_routing_table *irq_rt) > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c > index df41038..ff5c784 100644 > --- a/virt/kvm/eventfd.c > +++ b/virt/kvm/eventfd.c > @@ -300,6 +300,8 @@ kvm_eventfd_init(struct kvm *kvm) > spin_lock_init(&kvm->irqfds.lock); > INIT_LIST_HEAD(&kvm->irqfds.items); > INIT_LIST_HEAD(&kvm->ioeventfds); > + spin_lock_init(&kvm->irq_ackfds.lock); > + INIT_LIST_HEAD(&kvm->irq_ackfds.items); > } > > /* > @@ -668,3 +670,286 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) > > return kvm_assign_ioeventfd(kvm, args); > } > + > +/* > + * -------------------------------------------------------------------- > + * irq_ackfd: Expose IRQ ACKs to userspace via eventfd > + * > + * userspace can 
register to receive eventfd notification for ACKs of > + interrupt lines. > + -------------------------------------------------------------------- > + */ > + > +struct _irq_ackfd { > + struct kvm *kvm; > + struct eventfd_ctx *eventfd; /* signaled on ack */ > + struct kvm_irq_ack_notifier notifier; > + /* Setup/shutdown */ > + wait_queue_t wait; > + poll_table pt; > + struct work_struct shutdown; > + struct list_head list; > +}; > + > +/* Called under irq_ackfds.lock */ > +static void irq_ackfd_shutdown(struct work_struct *work) > +{ > + struct _irq_ackfd *ackfd = container_of(work, struct _irq_ackfd, > + shutdown); > + struct kvm *kvm = ackfd->kvm; > + u64 cnt; > + > + /* > + * Stop ACK signaling > + */ > + kvm_unregister_irq_ack_notifier(kvm, &ackfd->notifier); > + > + /* > + * Synchronize with the wait-queue and unhook ourselves to prevent > + * further events. > + */ > + eventfd_ctx_remove_wait_queue(ackfd->eventfd, &ackfd->wait, &cnt); > + > + /* > + * Release resources > + */ > + eventfd_ctx_put(ackfd->eventfd); > + kfree(ackfd); > +} > + > +/* assumes kvm->irq_ackfds.lock is held */ > +static bool irq_ackfd_is_active(struct _irq_ackfd *ackfd) > +{ > + return list_empty(&ackfd->list) ? false : true; > +} > + > +/* > + * Mark the irq_ackfd as inactive and schedule it for removal > + * > + * assumes kvm->irq_ackfds.lock is held > + */ > +static void irq_ackfd_deactivate(struct _irq_ackfd *ackfd) > +{ > + BUG_ON(!irq_ackfd_is_active(ackfd)); > + > + list_del_init(&ackfd->list); > + > + /* Borrow irqfd's workqueue, we don't need our own. 
*/ > + queue_work(irqfd_cleanup_wq, &ackfd->shutdown); > +} > + > +/* > + * Called with wqh->lock held and interrupts disabled > + */ > +static int irq_ackfd_wakeup(wait_queue_t *wait, unsigned mode, > + int sync, void *key) > +{ > + unsigned long flags = (unsigned long)key; > + > + if (unlikely(flags & POLLHUP)) { > + /* The eventfd is closing, detach from KVM */ > + struct _irq_ackfd *ackfd; > + unsigned long flags; > + > + ackfd = container_of(wait, struct _irq_ackfd, wait); > + > + spin_lock_irqsave(&ackfd->kvm->irq_ackfds.lock, flags); > + > + /* > + * We must check if someone deactivated the irq_ackfd before > + * we could acquire the irq_ackfds.lock since the item is > + * deactivated from the KVM side before it is unhooked from > + * the wait-queue. If it is already deactivated, we can > + * simply return knowing the other side will cleanup for us. > + * We cannot race against the irq_ackfd going away since the > + * other side is required to acquire wqh->lock, which we hold > + */ > + if (irq_ackfd_is_active(ackfd)) > + irq_ackfd_deactivate(ackfd); > + > + spin_unlock_irqrestore(&ackfd->kvm->irq_ackfds.lock, flags); > + } > + > + return 0; > +} > + > +static void irq_ackfd_ptable_queue_proc(struct file *file, > + wait_queue_head_t *wqh, > + poll_table *pt) > +{ > + struct _irq_ackfd *ackfd = container_of(pt, struct _irq_ackfd, pt); > + add_wait_queue(wqh, &ackfd->wait); > +} > + > +/* > + * This function is called as the kvm VM fd is being released. 
Shutdown all > + * irq_ackfds that still remain open > + */ > +void kvm_irq_ackfd_release(struct kvm *kvm) > +{ > + struct _irq_ackfd *tmp, *ackfd; > + > + spin_lock_irq(&kvm->irq_ackfds.lock); > + > + list_for_each_entry_safe(ackfd, tmp, &kvm->irq_ackfds.items, list) > + irq_ackfd_deactivate(ackfd); > + > + spin_unlock_irq(&kvm->irq_ackfds.lock); > + > + flush_workqueue(irqfd_cleanup_wq); > +} > + > +static void irq_ackfd_acked(struct kvm_irq_ack_notifier *notifier) > +{ > + struct _irq_ackfd *ackfd; > + > + ackfd = container_of(notifier, struct _irq_ackfd, notifier); > + > + eventfd_signal(ackfd->eventfd, 1); > +} > + > +static int kvm_assign_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args) > +{ > + struct file *file = NULL; > + struct eventfd_ctx *eventfd = NULL; > + struct _irq_ackfd *ackfd = NULL, *tmp; > + int ret; > + u64 cnt; > + > + file = eventfd_fget(args->fd); > + if (IS_ERR(file)) { > + ret = PTR_ERR(file); > + goto fail; > + } > + > + eventfd = eventfd_ctx_fileget(file); > + if (IS_ERR(eventfd)) { > + ret = PTR_ERR(eventfd); > + goto fail; > + } > + > + ackfd = kzalloc(sizeof(*ackfd), GFP_KERNEL); > + if (!ackfd) { > + ret = -ENOMEM; > + goto fail; > + } > + > + INIT_LIST_HEAD(&ackfd->list); > + INIT_WORK(&ackfd->shutdown, irq_ackfd_shutdown); > + ackfd->kvm = kvm; > + ackfd->eventfd = eventfd; > + ackfd->notifier.gsi = args->gsi; > + if (args->flags & KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID) > + ackfd->notifier.irq_source_id = args->irq_source_id; > + else > + ackfd->notifier.irq_source_id = -1; > + ackfd->notifier.irq_acked = irq_ackfd_acked; > + > + /* > + * Install our own custom wake-up handling so we are notified via > + * a callback whenever someone releases the underlying eventfd > + */ > + init_waitqueue_func_entry(&ackfd->wait, irq_ackfd_wakeup); > + init_poll_funcptr(&ackfd->pt, irq_ackfd_ptable_queue_proc); > + > + /* > + * Install the wait queue function to allow cleanup when the > + * eventfd is closed by the user. 
This grabs the wqh lock, so > + we do it out of spinlock, holding the file reference ensures > + we won't see a POLLHUP until setup is complete. > + */ > + file->f_op->poll(file, &ackfd->pt); > + > + spin_lock_irq(&kvm->irq_ackfds.lock); > + > + /* Check for duplicates. */ > + list_for_each_entry(tmp, &kvm->irq_ackfds.items, list) { > + if (ackfd->eventfd == tmp->eventfd && > + ackfd->notifier.gsi == tmp->notifier.gsi && > + ackfd->notifier.irq_source_id == > + tmp->notifier.irq_source_id) { > + spin_unlock_irq(&kvm->irq_ackfds.lock); > + ret = -EBUSY; > + goto fail_unqueue; > + } > + } > + So the source ID is not validated at all, which means there are 4G (2^32) ways for the same eventfd to be registered, consuming unbounded kernel memory via the kzalloc above. > + list_add_tail(&ackfd->list, &kvm->irq_ackfds.items); > + > + spin_unlock_irq(&kvm->irq_ackfds.lock); > + > + /* This can sleep, so register after spinlock. */ > + kvm_register_irq_ack_notifier(kvm, &ackfd->notifier); > + > + fput(file); /* Enable POLLHUP */ > + > + return 0; > + > +fail_unqueue: > + eventfd_ctx_remove_wait_queue(eventfd, &ackfd->wait, &cnt); > +fail: > + if (eventfd && !IS_ERR(eventfd)) > + eventfd_ctx_put(eventfd); > + > + if (file && !IS_ERR(file)) > + fput(file); > + > + kfree(ackfd); > + > + return ret; > +} > + > +static int kvm_deassign_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args) > +{ > + struct eventfd_ctx *eventfd = NULL; > + struct _irq_ackfd *ackfd; > + int irq_source_id = -1, ret = -ENOENT; > + > + eventfd = eventfd_ctx_fdget(args->fd); > + if (IS_ERR(eventfd)) { > + ret = PTR_ERR(eventfd); > + goto fail; > + } > + > + if (args->flags & KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID) > + irq_source_id = args->irq_source_id; > + > + spin_lock_irq(&kvm->irq_ackfds.lock); > + > + list_for_each_entry(ackfd, &kvm->irq_ackfds.items, list) { > + if (ackfd->eventfd == eventfd && > + ackfd->notifier.gsi == args->gsi && > + ackfd->notifier.irq_source_id == irq_source_id) { > + 
irq_ackfd_deactivate(ackfd); > + ret = 0; > + break; > + } > + } > + > + spin_unlock_irq(&kvm->irq_ackfds.lock); > + > +fail: > + if (eventfd && !IS_ERR(eventfd)) > + eventfd_ctx_put(eventfd); > + > + /* > + * Block until we know all outstanding shutdown jobs have completed > + * so that we guarantee there will not be any more ACKs signaled on > + * this eventfd once this deassign function returns. > + */ > + flush_workqueue(irqfd_cleanup_wq); > + > + return ret; > +} > + > +int kvm_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args) > +{ > + if (args->flags & ~(KVM_IRQ_ACKFD_FLAG_DEASSIGN | > + KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID)) > + return -EINVAL; > + > + if (args->flags & KVM_IRQ_ACKFD_FLAG_DEASSIGN) > + return kvm_deassign_irq_ackfd(kvm, args); > + > + return kvm_assign_irq_ackfd(kvm, args); > +} > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index e120cb3..bbe3a20 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -619,6 +619,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) > struct kvm *kvm = filp->private_data; > > kvm_irqfd_release(kvm); > + kvm_irq_ackfd_release(kvm); > > kvm_put_kvm(kvm); > return 0; > @@ -2109,6 +2110,15 @@ static long kvm_vm_ioctl(struct file *filp, > break; > } > #endif > + case KVM_IRQ_ACKFD: { > + struct kvm_irq_ackfd data; > + > + r = -EFAULT; > + if (copy_from_user(&data, argp, sizeof(data))) > + goto out; > + r = kvm_irq_ackfd(kvm, &data); > + break; > + } > default: > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > if (r == -ENOTTY) -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html