On Sun, Jan 10, 2010 at 11:04:23AM -0800, Davide Libenzi wrote: > On Sun, 10 Jan 2010, Michael S. Tsirkin wrote: > > > On Sun, Jan 10, 2010 at 09:27:34AM -0800, Davide Libenzi wrote: > > > On Sun, 10 Jan 2010, Michael S. Tsirkin wrote: > > > > > > > Not sure what you mean by irqfd locking. IMO we must take waitqueue > > > > lock, otherwise we can not prevent wait queue callback from being > > > > invoked, right? Note that if we take our own lock under wait queue > > > > callback, we won't be able to use this lock around remove_wait_queue. > > > > Also note how fs/evetfd.c uses __remove_wait_queue after taking wait > > > > queue lock - we'd like to do the same with our wait queue. > > > > > > > > Could you clarify pls? > > > > > > Wait a second. Looking at the current code in Linus tree, when you > > > deassign an irqfd, you are actually destroying it, so the idea of saving > > > the counter on deassign is not going to work. > > > > > > > > > > > > - Davide > > > > Yes, the only context passed between deassign and assign is the eventfd > > itself. So I think the following code for deassign would work provided > > eventfd_ctx_read_locked works with wait queue lock taken: > > > > spin_lock_irqsave(&irqfd->wqh.lock, flags); > > eventfd_ctx_read_locked(ctx->eventfd, 1, &ucnt); > > __remove_wait_queue(irqfd->wqh, &irqfd->wait); > > spin_lock_irqrestore(&irqfd->wqh.lock, flags); > > As I said, you cannot do that from KVM, since write-witers will nedd to be > woke up. > Use the eventfd_ctx_remove_wait_queue() below (NOTE: compiled, not tested). > > > > - Davide > Yes, I think this will work. Will test and report. Thanks! > --- > fs/eventfd.c | 88 +++++++++++++++++++++++++++++++++++++++--------- > include/linux/eventfd.h | 15 ++++++++ > 2 files changed, 88 insertions(+), 15 deletions(-) > > Index: linux-2.6.mod/fs/eventfd.c > =================================================================== > --- linux-2.6.mod.orig/fs/eventfd.c 2010-01-06 18:33:27.000000000 -0800 > +++ linux-2.6.mod/fs/eventfd.c 2010-01-10 11:02:00.000000000 -0800 > @@ -135,26 +135,71 @@ static unsigned int eventfd_poll(struct > return events; > } > > -static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, > - loff_t *ppos) > +static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) > +{ > + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1: ctx->count; > + ctx->count -= *cnt; > +} > + > +/** > + * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. > + * @ctx: [in] Pointer to eventfd context. > + * @wait: [in] Wait queue to be removed. > + * @cnt: [out] Pointer to the 64bit conter value. > + * > + * Returns zero if successful, or the following error codes: > + * > + * -EAGAIN : The operation would have blocked. > + * > + * This is used to atomically remove a wait queue entry from the eventfd wait > + * queue head, and read/reset the counter value. > + */ > +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, > + __u64 *cnt) > +{ > + unsigned long flags; > + > + spin_lock_irqsave(&ctx->wqh.lock, flags); > + eventfd_ctx_do_read(ctx, cnt); > + __remove_wait_queue(&ctx->wqh, wait); > + if (*cnt != 0 && waitqueue_active(&ctx->wqh)) > + wake_up_locked_poll(&ctx->wqh, POLLOUT); > + spin_unlock_irqrestore(&ctx->wqh.lock, flags); > + > + return *cnt != 0 ? 0: -EAGAIN; > +} > +EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); > + > +/** > + * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. > + * @ctx: [in] Pointer to eventfd context. > + * @no_wait: [in] Different from zero if the operation should not block. > + * @cnt: [out] Pointer to the 64bit conter value. > + * > + * Returns zero if successful, or the following error codes: > + * > + * -EAGAIN : The operation would have blocked but @no_wait was nonzero. > + * -ERESTARTSYS : A signal interrupted the wait operation. > + * > + * If @no_wait is zero, the function might sleep until the eventfd internal > + * counter becomes greater than zero. > + */ > +ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt) > { > - struct eventfd_ctx *ctx = file->private_data; > ssize_t res; > - __u64 ucnt = 0; > DECLARE_WAITQUEUE(wait, current); > > - if (count < sizeof(ucnt)) > - return -EINVAL; > spin_lock_irq(&ctx->wqh.lock); > + *cnt = 0; > res = -EAGAIN; > if (ctx->count > 0) > - res = sizeof(ucnt); > - else if (!(file->f_flags & O_NONBLOCK)) { > + res = 0; > + else if (!no_wait) { > __add_wait_queue(&ctx->wqh, &wait); > - for (res = 0;;) { > + for (;;) { > set_current_state(TASK_INTERRUPTIBLE); > if (ctx->count > 0) { > - res = sizeof(ucnt); > + res = 0; > break; > } > if (signal_pending(current)) { > @@ -168,18 +213,31 @@ static ssize_t eventfd_read(struct file > __remove_wait_queue(&ctx->wqh, &wait); > __set_current_state(TASK_RUNNING); > } > - if (likely(res > 0)) { > - ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; > - ctx->count -= ucnt; > + if (likely(res == 0)) { > + eventfd_ctx_do_read(ctx, cnt); > if (waitqueue_active(&ctx->wqh)) > wake_up_locked_poll(&ctx->wqh, POLLOUT); > } > spin_unlock_irq(&ctx->wqh.lock); > - if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) > - return -EFAULT; > > return res; > } > +EXPORT_SYMBOL_GPL(eventfd_ctx_read); > + > +static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, > + loff_t *ppos) > +{ > + struct eventfd_ctx *ctx = file->private_data; > + ssize_t res; > + __u64 cnt; > + > + if (count < sizeof(cnt)) > + return -EINVAL; > + if ((res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt)) < 0) > + return res; > + > + return put_user(cnt, (__u64 __user *) buf) ? -EFAULT: sizeof(cnt); > +} > > static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, > loff_t *ppos) > Index: linux-2.6.mod/include/linux/eventfd.h > =================================================================== > --- linux-2.6.mod.orig/include/linux/eventfd.h 2010-01-06 18:33:27.000000000 -0800 > +++ linux-2.6.mod/include/linux/eventfd.h 2010-01-10 10:57:06.000000000 -0800 > @@ -34,6 +34,9 @@ struct file *eventfd_fget(int fd); > struct eventfd_ctx *eventfd_ctx_fdget(int fd); > struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); > int eventfd_signal(struct eventfd_ctx *ctx, int n); > +ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt); > +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, > + __u64 *cnt); > > #else /* CONFIG_EVENTFD */ > > @@ -61,6 +64,18 @@ static inline void eventfd_ctx_put(struc > > } > > +static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, > + __u64 *cnt) > +{ > + return -ENOSYS; > +} > + > +static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, > + wait_queue_t *wait, __u64 *cnt) > +{ > + return -ENOSYS; > +} > + > #endif > > #endif /* _LINUX_EVENTFD_H */ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html