This new ioctl enables an eventfd to be triggered when an EOI is
written for a specified irqchip pin.  The first user of this will be
external device assignment through VFIO, using a level irqfd for
asserting a PCI INTx interrupt and this interface for de-assertion
and notification once the interrupt is serviced.  Here we make use of
the reference counting of the _irq_source object, which allows us to
share it with an irqfd and clean up regardless of the release order.

Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
---

 Documentation/virtual/kvm/api.txt |   21 ++
 arch/x86/kvm/x86.c                |    2 
 include/linux/kvm.h               |   15 ++
 include/linux/kvm_host.h          |   13 +
 virt/kvm/eventfd.c                |  335 +++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c               |   11 +
 6 files changed, 397 insertions(+)
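For context, the intended pairing with a level irqfd looks roughly
like the sketch below.  This is illustrative only: setup_intx() is a
hypothetical helper, and it assumes the level irqfd's key is handed
back to userspace through the kvm_irqfd structure by the companion
KVM_IRQFD_FLAG_LEVEL patch; the exact mechanism is defined there, not
here.

/*
 * Illustrative userspace sketch, not part of this patch.  Pairs a level
 * irqfd (assert) with an eoifd (de-assert/notify) for one INTx line.
 * ASSUMPTION: the companion KVM_IRQFD_FLAG_LEVEL patch returns the irq
 * source key in kvm_irqfd.key; adjust to whatever that patch defines.
 */
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int setup_intx(int vmfd, unsigned int gsi)
{
	struct kvm_irqfd irqfd;
	struct kvm_eoifd eoifd;
	int trigger = eventfd(0, 0);	/* write 1 here to assert INTx */
	int unmask = eventfd(0, 0);	/* becomes readable on guest EOI */

	if (trigger < 0 || unmask < 0)
		return -1;

	memset(&irqfd, 0, sizeof(irqfd));
	irqfd.fd = trigger;
	irqfd.gsi = gsi;
	irqfd.flags = KVM_IRQFD_FLAG_LEVEL;
	if (ioctl(vmfd, KVM_IRQFD, &irqfd))
		return -1;

	memset(&eoifd, 0, sizeof(eoifd));
	eoifd.fd = unmask;
	eoifd.flags = KVM_EOIFD_FLAG_LEVEL_IRQFD;
	eoifd.key = irqfd.key;		/* assumed out-parameter, see above */
	if (ioctl(vmfd, KVM_EOIFD, &eoifd))
		return -1;

	/*
	 * The device interrupt handler writes to 'trigger'; a read() on
	 * 'unmask' blocks until the guest EOIs, at which point KVM has
	 * already de-asserted the line and userspace may re-assert if
	 * the device still needs service.
	 */
	return 0;
}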
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 3911e62..8cd6b36 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1989,6 +1989,27 @@ return the hash table order in the parameter.  (If the guest is using
 the virtualized real-mode area (VRMA) facility, the kernel will
 re-create the VRMA HPTEs on the next KVM_RUN of any vcpu.)
 
+4.77 KVM_EOIFD
+
+Capability: KVM_CAP_EOIFD
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_eoifd (in)
+Returns: 0 on success, < 0 on error
+
+KVM_EOIFD allows userspace to receive interrupt EOI notification
+through an eventfd.  kvm_eoifd.fd specifies the eventfd used for
+notification.  KVM_EOIFD_FLAG_DEASSIGN is used to de-assign an eoifd
+once assigned.  KVM_EOIFD also requires additional bits set in
+kvm_eoifd.flags to bind to the proper interrupt line.  The
+KVM_EOIFD_FLAG_LEVEL_IRQFD flag indicates that kvm_eoifd.key is
+provided and is the key of a level triggered interrupt (configured
+via KVM_IRQFD using KVM_IRQFD_FLAG_LEVEL).  The EOI notification is
+bound to the same GSI and irqchip input as the irqfd.  Both
+kvm_eoifd.key and KVM_EOIFD_FLAG_LEVEL_IRQFD must be specified on
+assignment and de-assignment of KVM_EOIFD.  A level irqfd may only
+be bound to a single eoifd.  KVM_CAP_EOIFD_LEVEL_IRQFD indicates
+support for KVM_EOIFD_FLAG_LEVEL_IRQFD.
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9ded39d..8f3164e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2171,6 +2171,8 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PCI_2_3:
 	case KVM_CAP_KVMCLOCK_CTRL:
 	case KVM_CAP_IRQFD_LEVEL:
+	case KVM_CAP_EOIFD:
+	case KVM_CAP_EOIFD_LEVEL_IRQFD:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index b2e6e4f..effb916 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -619,6 +619,8 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_S390_COW 79
 #define KVM_CAP_PPC_ALLOC_HTAB 80
 #define KVM_CAP_IRQFD_LEVEL 81
+#define KVM_CAP_EOIFD 82
+#define KVM_CAP_EOIFD_LEVEL_IRQFD 83
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -694,6 +696,17 @@ struct kvm_irqfd {
 	__u8  pad[20];
 };
 
+#define KVM_EOIFD_FLAG_DEASSIGN (1 << 0)
+/* Available with KVM_CAP_EOIFD_LEVEL_IRQFD */
+#define KVM_EOIFD_FLAG_LEVEL_IRQFD (1 << 1)
+
+struct kvm_eoifd {
+	__u32 fd;
+	__u32 flags;
+	__u32 key;
+	__u8  pad[20];
+};
+
 struct kvm_clock_data {
 	__u64 clock;
 	__u32 flags;
@@ -834,6 +847,8 @@ struct kvm_s390_ucas_mapping {
 #define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
 /* Available with KVM_CAP_PPC_ALLOC_HTAB */
 #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
+/* Available with KVM_CAP_EOIFD */
+#define KVM_EOIFD		  _IOW(KVMIO,  0xa8, struct kvm_eoifd)
 
 /*
  * ioctls for vcpu fds
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c73f071..01e72a6 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -289,6 +289,10 @@ struct kvm {
 		struct mutex lock;
 		struct list_head items;
 	} irqsources;
+	struct {
+		spinlock_t lock;
+		struct list_head items;
+	} eoifds;
 #endif
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
@@ -832,6 +836,8 @@ int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
 void kvm_irqfd_release(struct kvm *kvm);
 void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
 int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
+int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args);
+void kvm_eoifd_release(struct kvm *kvm);
 
 #else
 
@@ -857,6 +863,13 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	return -ENOSYS;
 }
 
+static inline int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
+{
+	return -ENOSYS;
+}
+
+static inline void kvm_eoifd_release(struct kvm *kvm) {}
+
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 878cb52..5ebddad 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -95,6 +95,25 @@ static struct _irq_source *_irq_source_alloc(struct kvm *kvm, int gsi)
 	return source;
 }
 
+static struct _irq_source *_irq_source_get_from_key(struct kvm *kvm, int key)
+{
+	struct _irq_source *tmp, *source = ERR_PTR(-ENOENT);
+
+	mutex_lock(&kvm->irqsources.lock);
+
+	list_for_each_entry(tmp, &kvm->irqsources.items, list) {
+		if (tmp->id == key) {
+			source = tmp;
+			kref_get(&source->kref);
+			break;
+		}
+	}
+
+	mutex_unlock(&kvm->irqsources.lock);
+
+	return source;
+}
+
 /*
  * --------------------------------------------------------------------
  * irqfd: Allows an fd to be used to inject an interrupt to the guest
@@ -406,6 +425,8 @@ kvm_eventfd_init(struct kvm *kvm)
 {
 	INIT_LIST_HEAD(&kvm->ioeventfds);
 	mutex_init(&kvm->irqsources.lock);
 	INIT_LIST_HEAD(&kvm->irqsources.items);
+	spin_lock_init(&kvm->eoifds.lock);
+	INIT_LIST_HEAD(&kvm->eoifds.items);
 }
 
 /*
@@ -772,3 +793,317 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
 	return kvm_assign_ioeventfd(kvm, args);
 }
+
+/*
+ * --------------------------------------------------------------------
+ * eoifd: Translate KVM APIC/IOAPIC EOI into eventfd signal.
+ *
+ * Userspace can register an eventfd to receive notification
+ * when an EOI occurs.
+ * --------------------------------------------------------------------
+ */
+
+struct _eoifd {
+	/* eventfd triggered on EOI */
+	struct eventfd_ctx *eventfd;
+	/* irq source ID de-asserted on EOI */
+	struct _irq_source *source;
+	wait_queue_t wait;
+	/* EOI notification from KVM */
+	struct kvm_irq_ack_notifier notifier;
+	struct list_head list;
+	poll_table pt;
+	struct work_struct shutdown;
+};
+
+/* Queued under eoifds.lock, runs from the cleanup workqueue */
+static void eoifd_shutdown(struct work_struct *work)
+{
+	struct _eoifd *eoifd = container_of(work, struct _eoifd, shutdown);
+	struct kvm *kvm = eoifd->source->kvm;
+	u64 cnt;
+
+	/*
+	 * Stop EOI signaling
+	 */
+	kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
+
+	/*
+	 * Synchronize with the wait-queue and unhook ourselves to prevent
+	 * further events.
+	 */
+	eventfd_ctx_remove_wait_queue(eoifd->eventfd, &eoifd->wait, &cnt);
+
+	/*
+	 * Release resources
+	 */
+	eventfd_ctx_put(eoifd->eventfd);
+	_irq_source_put(eoifd->source);
+	kfree(eoifd);
+}
+
+/* assumes kvm->eoifds.lock is held */
+static bool eoifd_is_active(struct _eoifd *eoifd)
+{
+	return !list_empty(&eoifd->list);
+}
+
+/*
+ * Mark the eoifd as inactive and schedule it for removal
+ *
+ * assumes kvm->eoifds.lock is held
+ */
+static void eoifd_deactivate(struct _eoifd *eoifd)
+{
+	BUG_ON(!eoifd_is_active(eoifd));
+
+	list_del_init(&eoifd->list);
+
+	queue_work(irqfd_cleanup_wq, &eoifd->shutdown);
+}
+
+/*
+ * Called with wqh->lock held and interrupts disabled
+ */
+static int eoifd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	unsigned long flags = (unsigned long)key;
+
+	if (unlikely(flags & POLLHUP)) {
+		/* The eventfd is closing, detach from KVM */
+		struct _eoifd *eoifd = container_of(wait, struct _eoifd, wait);
+		struct kvm *kvm = eoifd->source->kvm;
+		unsigned long flags;
+
+		spin_lock_irqsave(&kvm->eoifds.lock, flags);
+
+		/*
+		 * We must check if someone deactivated the eoifd before
+		 * we could acquire the eoifds.lock since the item is
+		 * deactivated from the KVM side before it is unhooked from
+		 * the wait-queue.  If it is already deactivated, we can
+		 * simply return knowing the other side will cleanup for us.
+		 * We cannot race against the eoifd going away since the
+		 * other side is required to acquire wqh->lock, which we
+		 * hold.
+		 */
+		if (eoifd_is_active(eoifd))
+			eoifd_deactivate(eoifd);
+
+		spin_unlock_irqrestore(&kvm->eoifds.lock, flags);
+	}
+
+	return 0;
+}
+
+static void eoifd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
+				    poll_table *pt)
+{
+	struct _eoifd *eoifd = container_of(pt, struct _eoifd, pt);
+	add_wait_queue(wqh, &eoifd->wait);
+}
+
+/*
+ * This function is called as the kvm VM fd is being released.  Shut
+ * down all eoifds that still remain open.
+ */
+void kvm_eoifd_release(struct kvm *kvm)
+{
+	struct _eoifd *tmp, *eoifd;
+
+	spin_lock_irq(&kvm->eoifds.lock);
+
+	list_for_each_entry_safe(eoifd, tmp, &kvm->eoifds.items, list)
+		eoifd_deactivate(eoifd);
+
+	spin_unlock_irq(&kvm->eoifds.lock);
+
+	flush_workqueue(irqfd_cleanup_wq);
+}
+
+static void eoifd_event(struct kvm_irq_ack_notifier *notifier)
+{
+	struct _eoifd *eoifd;
+
+	eoifd = container_of(notifier, struct _eoifd, notifier);
+
+	if (unlikely(!eoifd->source))
+		return;
+
+	/*
+	 * De-assert and send EOI; the user needs to re-assert if the
+	 * device still requires service.
+	 */
+	kvm_set_irq(eoifd->source->kvm,
+		    eoifd->source->id, eoifd->source->gsi, 0);
+	eventfd_signal(eoifd->eventfd, 1);
+}
+
+static int kvm_assign_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
+{
+	struct file *file = NULL;
+	struct eventfd_ctx *eventfd = NULL;
+	struct _eoifd *eoifd = NULL, *tmp;
+	struct _irq_source *source = NULL;
+	int ret;
+
+	if (!(args->flags & KVM_EOIFD_FLAG_LEVEL_IRQFD))
+		return -EINVAL;
+
+	file = eventfd_fget(args->fd);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto fail;
+	}
+
+	eventfd = eventfd_ctx_fileget(file);
+	if (IS_ERR(eventfd)) {
+		ret = PTR_ERR(eventfd);
+		goto fail;
+	}
+
+	eoifd = kzalloc(sizeof(*eoifd), GFP_KERNEL);
+	if (!eoifd) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	source = _irq_source_get_from_key(kvm, args->key);
+	if (IS_ERR(source)) {
+		ret = PTR_ERR(source);
+		goto fail;
+	}
+
+	INIT_LIST_HEAD(&eoifd->list);
+	INIT_WORK(&eoifd->shutdown, eoifd_shutdown);
+	eoifd->eventfd = eventfd;
+	eoifd->notifier.gsi = source->gsi;
+	eoifd->notifier.irq_acked = eoifd_event;
+
+	/*
+	 * Install our own custom wake-up handling so we are notified via
+	 * a callback whenever someone releases the underlying eventfd.
+	 */
+	init_waitqueue_func_entry(&eoifd->wait, eoifd_wakeup);
+	init_poll_funcptr(&eoifd->pt, eoifd_ptable_queue_proc);
+
+	/*
+	 * Clear out any previously released eoifds that might conflict.
+	 */
+	flush_workqueue(irqfd_cleanup_wq);
+
+	/*
+	 * This can sleep, so register before acquiring the spinlock; the
+	 * notifier is a nop until we finish.
+	 */
+	kvm_register_irq_ack_notifier(kvm, &eoifd->notifier);
+
+	spin_lock_irq(&kvm->eoifds.lock);
+
+	/*
+	 * Enforce a one-to-one relationship between irq source and eoifd so
+	 * that this interface can't be used to consume all kernel memory.
+	 * NB: a single eventfd can still be used by multiple eoifds.
+	 */
+	list_for_each_entry(tmp, &kvm->eoifds.items, list) {
+		if (tmp->source == source) {
+			spin_unlock_irq(&kvm->eoifds.lock);
+			ret = -EBUSY;
+			goto fail_unregister;
+		}
+	}
+
+	/*
+	 * Install the wait queue callback.  This allows cleanup when the
+	 * eventfd is closed by the user, just like irqfd.
+	 */
+	file->f_op->poll(file, &eoifd->pt);
+
+	list_add_tail(&eoifd->list, &kvm->eoifds.items);
+	eoifd->source = source;	/* Enable ack notifier */
+
+	spin_unlock_irq(&kvm->eoifds.lock);
+
+	/*
+	 * No need to check for POLLHUP above; dropping our file reference
+	 * here enables it.
+	 */
+	fput(file);
+
+	return 0;
+
+fail_unregister:
+	kvm_unregister_irq_ack_notifier(kvm, &eoifd->notifier);
+fail:
+	if (source && !IS_ERR(source))
+		_irq_source_put(source);
+
+	if (eventfd && !IS_ERR(eventfd))
+		eventfd_ctx_put(eventfd);
+
+	if (file && !IS_ERR(file))
+		fput(file);
+
+	kfree(eoifd);
+	return ret;
+}
+
+static int kvm_deassign_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
+{
+	struct eventfd_ctx *eventfd = NULL;
+	struct _irq_source *source = NULL;
+	struct _eoifd *eoifd;
+	int ret = -ENOENT;
+
+	if (!(args->flags & KVM_EOIFD_FLAG_LEVEL_IRQFD))
+		return -EINVAL;
+
+	eventfd = eventfd_ctx_fdget(args->fd);
+	if (IS_ERR(eventfd)) {
+		ret = PTR_ERR(eventfd);
+		goto fail;
+	}
+
+	source = _irq_source_get_from_key(kvm, args->key);
+	if (IS_ERR(source)) {
+		ret = PTR_ERR(source);
+		goto fail;
+	}
+
+	spin_lock_irq(&kvm->eoifds.lock);
+
+	list_for_each_entry(eoifd, &kvm->eoifds.items, list) {
+		if (eoifd->eventfd == eventfd && eoifd->source == source) {
+			eoifd_deactivate(eoifd);
+			ret = 0;
+			break;
+		}
+	}
+
+	spin_unlock_irq(&kvm->eoifds.lock);
+
+fail:
+	if (source && !IS_ERR(source))
+		_irq_source_put(source);
+	if (eventfd && !IS_ERR(eventfd))
+		eventfd_ctx_put(eventfd);
+
+	/*
+	 * Block until we know all outstanding shutdown jobs have completed
+	 * so that we guarantee there will not be any more EOIs signaled on
+	 * this eventfd once this deassign function returns.
+	 */
+	flush_workqueue(irqfd_cleanup_wq);
+
+	return ret;
+}
+
+int kvm_eoifd(struct kvm *kvm, struct kvm_eoifd *args)
+{
+	if (args->flags & ~(KVM_EOIFD_FLAG_DEASSIGN |
+			    KVM_EOIFD_FLAG_LEVEL_IRQFD))
+		return -EINVAL;
+
+	if (args->flags & KVM_EOIFD_FLAG_DEASSIGN)
+		return kvm_deassign_eoifd(kvm, args);
+
+	return kvm_assign_eoifd(kvm, args);
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2468523..0b241bf 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -620,6 +620,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 
 	kvm_irqfd_release(kvm);
 
+	kvm_eoifd_release(kvm);
+
 	kvm_put_kvm(kvm);
 	return 0;
 }
@@ -2093,6 +2095,15 @@ static long kvm_vm_ioctl(struct file *filp,
 		break;
 	}
 #endif
+	case KVM_EOIFD: {
+		struct kvm_eoifd data;
+
+		r = -EFAULT;
+		if (copy_from_user(&data, argp, sizeof data))
+			goto out;
+		r = kvm_eoifd(kvm, &data);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)
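For completeness, tearing down the binding mirrors assignment: per
the API text above, the same key and the LEVEL_IRQFD flag must be
supplied again together with DEASSIGN.  A minimal sketch, assuming
the same vm fd, eventfd, and key used at assignment time
(teardown_eoifd() is a hypothetical helper):

/*
 * Illustrative userspace sketch, not part of this patch.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int teardown_eoifd(int vmfd, int eoi_fd, __u32 key)
{
	struct kvm_eoifd eoifd;

	memset(&eoifd, 0, sizeof(eoifd));
	eoifd.fd = eoi_fd;
	eoifd.flags = KVM_EOIFD_FLAG_DEASSIGN | KVM_EOIFD_FLAG_LEVEL_IRQFD;
	eoifd.key = key;

	/* fails with -ENOENT if no matching eoifd is currently assigned */
	return ioctl(vmfd, KVM_EOIFD, &eoifd);
}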