Re: [PATCH 0/2] eventfd: new EFD_STATE flag

On Thu, 7 Jan 2010, Michael S. Tsirkin wrote:

> Sure, I was trying to be as brief as possible; here's a detailed summary.
> 
> Description of the system (MSI emulation in KVM):
> 
> KVM supports an ioctl to assign/deassign an eventfd file to an interrupt
> message in the guest OS.  When this eventfd is signalled, the interrupt
> message is sent.  This assignment is done from the qemu system emulator.
> 
> The eventfd is signalled from device emulation, either in another
> userspace thread or in the kernel, which talks with the guest OS through
> another eventfd and shared memory (an out-of-process design was
> discussed but has not been implemented yet).
> 
> Note: it's okay to delay messages from a correctness point of view, but
> this is generally a latency-sensitive path.  If multiple identical
> messages are requested, it's okay to send only a single last message,
> but missing a message altogether causes deadlocks.  Sending a message
> when none was requested might in theory cause crashes; in practice it
> causes performance degradation.
> 
> Another KVM feature is interrupt masking: the guest OS requests that we
> stop sending some interrupt message, possibly modifies the mapping, and
> then re-enables the message.  This needs to be done without involving
> the device, which might keep requesting events: while masked, the
> message is marked "pending", and the guest might test the pending
> status.
> 
> We can implement masking in the system emulator in userspace by using
> the assign/deassign ioctls: when a message is masked, we simply
> deassign all eventfds, and when it is unmasked, we assign them back
> (see the userspace sketch below).
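> 
> Roughly, a minimal userspace sketch of this mask/unmask cycle, assuming
> the KVM_IRQFD ioctl with its KVM_IRQFD_FLAG_DEASSIGN flag (the helper
> name is made up and error handling is elided):
> 
> 	#include <sys/ioctl.h>
> 	#include <linux/kvm.h>
> 
> 	static int msi_route_set(int vmfd, int efd, int gsi, int deassign)
> 	{
> 		struct kvm_irqfd req = {
> 			.fd = efd,
> 			.gsi = gsi,
> 			.flags = deassign ? KVM_IRQFD_FLAG_DEASSIGN : 0,
> 		};
> 
> 		return ioctl(vmfd, KVM_IRQFD, &req);
> 	}
> 
> 	/* mask:   msi_route_set(vmfd, efd, gsi, 1);
> 	 * unmask: msi_route_set(vmfd, efd, gsi, 0); */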
> 
> Here's some code to illustrate how this all works.  The assign/deassign
> code in the kernel looks like the following:
> 
> 
> This is called to unmask an interrupt:
> 
> static int
> kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
> {
> 	struct _irqfd *irqfd, *tmp;
> 	struct file *file = NULL;
> 	struct eventfd_ctx *eventfd = NULL;
> 	int ret;
> 	unsigned int events;
> 
> 	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
> 
> ...
> 
> 	file = eventfd_fget(fd);
> 	if (IS_ERR(file)) {
> 		ret = PTR_ERR(file);
> 		goto fail;
> 	}
> 
> 	eventfd = eventfd_ctx_fileget(file);
> 	if (IS_ERR(eventfd)) {
> 		ret = PTR_ERR(eventfd);
> 		goto fail;
> 	}
> 
> 	irqfd->eventfd = eventfd;
> 
> 	/*
> 	 * Install our own custom wake-up handling so we are notified via
> 	 * a callback whenever someone signals the underlying eventfd
> 	 */
> 	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
> 	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
> 
> 	spin_lock_irq(&kvm->irqfds.lock);
> 
> 	events = file->f_op->poll(file, &irqfd->pt);
> 
> 	list_add_tail(&irqfd->list, &kvm->irqfds.items);
> 	spin_unlock_irq(&kvm->irqfds.lock);
> 
> A.
> 	/*
> 	 * Check if there was an event already pending on the eventfd
> 	 * before we registered, and trigger it as if we didn't miss it.
> 	 */
> 	if (events & POLLIN)
> 		schedule_work(&irqfd->inject);
> 
> 	/*
> 	 * do not drop the file until the irqfd is fully initialized, otherwise
> 	 * we might race against the POLLHUP
> 	 */
> 	fput(file);
> 
> 	return 0;
> 
> fail:
> 	...
> }
> 
> This is called to mask an interrupt:
> 
> /*
>  * shutdown any irqfd's that match fd+gsi
>  */
> static int
> kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
> {
> 	struct _irqfd *irqfd, *tmp;
> 	struct eventfd_ctx *eventfd;
> 
> 	eventfd = eventfd_ctx_fdget(fd);
> 	if (IS_ERR(eventfd))
> 		return PTR_ERR(eventfd);
> 
> 	spin_lock_irq(&kvm->irqfds.lock);
> 
> 	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
> 		if (irqfd->eventfd == eventfd && irqfd->gsi == gsi)
> 			irqfd_deactivate(irqfd);
> 	}
> 
> 	spin_unlock_irq(&kvm->irqfds.lock);
> 	eventfd_ctx_put(eventfd);
> 
> 	/*
> 	 * Block until we know all outstanding shutdown jobs have completed
> 	 * so that we guarantee there will not be any more interrupts on this
> 	 * gsi once this deassign function returns.
> 	 */
> 	flush_workqueue(irqfd_cleanup_wq);
> 
> 	return 0;
> }
> 
> 
> And deactivation, deep down, does this (from the irqfd_cleanup_wq
> workqueue, so it is not under the spinlock):
> 
>         /*
>          * Synchronize with the wait-queue and unhook ourselves to
>          * prevent further events.
>          */
> B.
>         remove_wait_queue(irqfd->wqh, &irqfd->wait);
> 
> 	....
> 
>         /*
>          * It is now safe to release the object's resources
>          */
>         eventfd_ctx_put(irqfd->eventfd);
>         kfree(irqfd);
> 
> 
> The problems (really the same bug) in KVM that I am trying to fix:
> 1. Because of A above, if an event was requested while the message was
>    masked, we will not miss a message.  However, because we never
>    clear the counter, we currently get a spurious message each time
>    we unmask.
> 
>    We should clear the counter either each time we deliver a message,
>    or at least when the message is masked.
> 
> 2. We currently always return pending status 0 to guests, but for
>    strict spec compliance we must do it correctly and report a pending
>    message if one was requested while the interrupt was masked.
> 
>    An obvious way to do this is a nonblocking read on the eventfd.
>    Since this clears the counter, we would have to write the value
>    back.  But again, this requires that all messages sent before the
>    mask are cleared from the counter, otherwise we will always report
>    pending messages.  A sketch of this probe follows.
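> 
>    Roughly (a hedged sketch: a plain read()/write() pair on an eventfd
>    opened with EFD_NONBLOCK; the helper name is made up):
> 
> 	#include <unistd.h>
> 	#include <linux/types.h>
> 
> 	static int msi_probe_pending(int efd)
> 	{
> 		__u64 cnt;
> 
> 		/* nonblocking read fails with EAGAIN if the counter is 0 */
> 		if (read(efd, &cnt, sizeof(cnt)) < 0)
> 			return 0;		/* nothing pending */
> 		/* write the value back so the event is not lost */
> 		write(efd, &cnt, sizeof(cnt));
> 		return 1;			/* pending */
> 	}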
> 
> 
> So, how should I clear the counter?
> 
> I started with the idea of clearing it in irqfd_wakeup.  However, as
> you point out, this would cause another list scan, so it's messy.  That
> path currently uses a workqueue to work around a KVM locking issue, but
> the issue is fixed and we will stop doing that soon, so I cannot rely
> on it.
> 
> Thus I came up with the idea to do this on unmask: this is a slow path
> and can always be done from workqueue or ioctl context.  So I would add
> an eventfd_ctx_read call in the code at point B:
> 
> 
> 		eventfd_ctx_read(irqfd->eventfd, 1, &ucnt);
> 	->
> 		remove_wait_queue(irqfd->wqh, &irqfd->wait);
> 
> now, if the device signals the eventfd at the point marked by ->,
> the message will be sent but the counter will not be cleared,
> so we will get a spurious message when we unmask.
> 
> Or if I do it the other way:
> 		remove_wait_queue(irqfd->wqh, &irqfd->wait);
> 	->
> 		eventfd_ctx_read(irqfd->eventfd, 1, &ucnt);
> 
> now, if the device signals the eventfd at the point marked by ->,
> the message will not be sent but the counter will be cleared,
> so we will lose a message.
> 
> There does not appear to be a way to avoid this race unless we both
> remove ourselves from the wait queue and clear the counter under the
> wait queue lock, just like eventfd_read currently does.
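> 
> In other words, what seems to be needed is a helper inside fs/eventfd.c
> (where struct eventfd_ctx is visible) that does both steps in one
> critical section.  A hedged sketch, with a made-up name and without the
> EFD_SEMAPHORE case:
> 
> 	int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
> 					  wait_queue_t *wait, __u64 *cnt)
> 	{
> 		unsigned long flags;
> 
> 		spin_lock_irqsave(&ctx->wqh.lock, flags);
> 		/* unhook first, so no further wakeups can reach us ... */
> 		__remove_wait_queue(&ctx->wqh, wait);
> 		/* ... then drain the counter in the same critical section */
> 		*cnt = ctx->count;
> 		ctx->count = 0;
> 		if (*cnt != 0 && waitqueue_active(&ctx->wqh))
> 			wake_up_locked_poll(&ctx->wqh, POLLOUT);
> 		spin_unlock_irqrestore(&ctx->wqh.lock, flags);
> 
> 		return *cnt != 0 ? 0 : -EAGAIN;
> 	}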
> 
> I hope this clarifies.

Yes, it sure does, thanks.
When you unmask, shouldn't the fd be in a "clear" state anyway?
So, can't you just do  eventfd_ctx_read(ctx, 1, &ucnt)  on unmask, and 
ignore the value (and the POLLIN flags)?
Another solution is to keep track of the counter (using
eventfd_ctx_read() and eventfd_ctx_peek_value()) inside your code, and
decide the status based on its changes.
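
A minimal sketch of the first option (drain at unmask time), using the
eventfd_ctx_read() added by the patch below; the helper name is made
up:

	static void irqfd_drain_on_unmask(struct eventfd_ctx *ctx)
	{
		__u64 ucnt;

		/* no_wait=1: returns -EAGAIN if the counter is already
		 * zero, which is fine since we only want it cleared */
		eventfd_ctx_read(ctx, 1, &ucnt);
	}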




- Davide


---
 fs/eventfd.c            |   74 ++++++++++++++++++++++++++++++++++++++----------
 include/linux/eventfd.h |   13 ++++++++
 2 files changed, 72 insertions(+), 15 deletions(-)

Index: linux-2.6.mod/fs/eventfd.c
===================================================================
--- linux-2.6.mod.orig/fs/eventfd.c	2010-01-06 12:37:28.000000000 -0800
+++ linux-2.6.mod/fs/eventfd.c	2010-01-06 17:43:09.000000000 -0800
@@ -135,26 +135,56 @@ static unsigned int eventfd_poll(struct 
 	return events;
 }
 
-static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
-			    loff_t *ppos)
+/**
+ * eventfd_ctx_peek_value - Reads the eventfd counter.
+ * @ctx: [in] Pointer to eventfd context.
+ *
+ * NOTE: Must be called with the wait queue lock held.  The locking of
+ * the eventfd_ctx interface is not exposed outside of the eventfd
+ * implementation, but when a poll callback registers with the eventfd
+ * poll, such callbacks are invoked with the wait queue lock held.  IOW,
+ * at the moment this function can only be called from inside a
+ * registered poll callback.
+ * The function returns the current counter value, without consuming it.
+ */
+__u64 eventfd_ctx_peek_value(struct eventfd_ctx *ctx)
+{
+	assert_spin_locked(&ctx->wqh.lock);
+
+	return ctx->count;
+}
+EXPORT_SYMBOL_GPL(eventfd_ctx_peek_value);
+
+/**
+ * eventfd_ctx_read - Reads the eventfd counter or waits if it is zero.
+ * @ctx: [in] Pointer to eventfd context.
+ * @no_wait: [in] Nonzero if the operation should not block.
+ * @cnt: [out] Pointer to the 64-bit counter value.
+ *
+ * Returns zero if successful, or the following error codes:
+ *
+ * -EAGAIN      : The operation would have blocked but @no_wait was nonzero.
+ * -ERESTARTSYS : A signal interrupted the wait operation.
+ *
+ * If @no_wait is zero, the function might sleep until the eventfd internal
+ * counter becomes greater than zero.
+ */
+ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
 {
-	struct eventfd_ctx *ctx = file->private_data;
 	ssize_t res;
-	__u64 ucnt = 0;
 	DECLARE_WAITQUEUE(wait, current);
 
-	if (count < sizeof(ucnt))
-		return -EINVAL;
 	spin_lock_irq(&ctx->wqh.lock);
+	*cnt = 0;
 	res = -EAGAIN;
 	if (ctx->count > 0)
-		res = sizeof(ucnt);
-	else if (!(file->f_flags & O_NONBLOCK)) {
+		res = 0;
+	else if (!no_wait) {
 		__add_wait_queue(&ctx->wqh, &wait);
-		for (res = 0;;) {
+		for (;;) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (ctx->count > 0) {
-				res = sizeof(ucnt);
+				res = 0;
 				break;
 			}
 			if (signal_pending(current)) {
@@ -168,18 +198,32 @@ static ssize_t eventfd_read(struct file 
 		__remove_wait_queue(&ctx->wqh, &wait);
 		__set_current_state(TASK_RUNNING);
 	}
-	if (likely(res > 0)) {
-		ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
-		ctx->count -= ucnt;
+	if (likely(res == 0)) {
+		*cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+		ctx->count -= *cnt;
 		if (waitqueue_active(&ctx->wqh))
 			wake_up_locked_poll(&ctx->wqh, POLLOUT);
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
-	if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
-		return -EFAULT;
 
 	return res;
 }
+EXPORT_SYMBOL_GPL(eventfd_ctx_read);
+
+static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct eventfd_ctx *ctx = file->private_data;
+	ssize_t res;
+	__u64 cnt;
+
+	if (count < sizeof(cnt))
+		return -EINVAL;
+	if ((res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt)) < 0)
+		return res;
+
+	return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
+}
 
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
 			     loff_t *ppos)
Index: linux-2.6.mod/include/linux/eventfd.h
===================================================================
--- linux-2.6.mod.orig/include/linux/eventfd.h	2010-01-06 12:37:28.000000000 -0800
+++ linux-2.6.mod/include/linux/eventfd.h	2010-01-06 14:54:36.000000000 -0800
@@ -34,6 +34,8 @@ struct file *eventfd_fget(int fd);
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
 int eventfd_signal(struct eventfd_ctx *ctx, int n);
+ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt);
+__u64 eventfd_ctx_peek_value(struct eventfd_ctx *ctx);
 
 #else /* CONFIG_EVENTFD */
 
@@ -61,6 +63,17 @@ static inline void eventfd_ctx_put(struc
 
 }
 
+static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait,
+				       __u64 *cnt)
+{
+	return -ENOSYS;
+}
+
+static inline __u64 eventfd_ctx_peek_value(struct eventfd_ctx *ctx)
+{
+	return 0;
+}
+
 #endif
 
 #endif /* _LINUX_EVENTFD_H */
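
For reference, a hedged sketch of where eventfd_ctx_peek_value() can
legally be called: inside a wakeup callback hooked onto the eventfd's
waitqueue, since the waker holds ctx->wqh.lock there.  The last_seen
field is hypothetical; the other irqfd names follow the code quoted
earlier in the thread:

	static int irqfd_wakeup(wait_queue_t *wait, unsigned mode,
				int sync, void *key)
	{
		struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
		unsigned long events = (unsigned long)key;

		if (events & POLLIN) {
			/* the wait queue lock is held by the waker,
			 * so peeking at the counter is safe here */
			irqfd->last_seen = eventfd_ctx_peek_value(irqfd->eventfd);
			schedule_work(&irqfd->inject);
		}
		return 0;
	}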