While trying to fix a race when closing a cgroup eventfd, I took a look at how kvm deals with this problem, and I found it doesn't. I may be wrong, as I don't know the kvm code, so correct me if I'm wrong.

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)

I don't think it's race-free!

static int
irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	...
	/*
	 * ...
	 * We cannot race against the irqfd going away since the
	 * other side is required to acquire wqh->lock, which we hold
	 */
	if (irqfd_is_active(irqfd))
		irqfd_deactivate(irqfd);
}

But in kvm_irqfd_deassign() and kvm_irqfd_release(), where irqfds are freed, wqh->lock is not acquired!

So here is the race:

CPU0                                        CPU1
-----------------------------------------   -----------------------------------------
kvm_irqfd_release()
  spin_lock(&kvm->irqfds.lock);
  ...
  irqfd_deactivate(irqfd);
    list_del_init(&irqfd->list);
  spin_unlock(&kvm->irqfds.lock);
  ...
                                            close(eventfd)
                                              irqfd_wakeup();
  irqfd_shutdown();
    remove_wait_queue(irqfd->wait);
    kfree(irqfd);
                                                spin_lock(&kvm->irqfds.lock);
                                                if (!list_empty(&irqfd->list))
                                                        irqfd_deactivate(irqfd);
                                                          list_del_init(&irqfd->list);
                                                spin_unlock(&kvm->irqfds.lock);

Look: CPU1 is accessing irqfd even though it has already been freed!

Here's the fix I have; please enlighten me with a better one.
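Before the patch itself, here is a minimal userspace model of the window. This is only an illustration of the ordering, not kernel code: pthread mutexes stand in for the spinlocks, and all names (fake_irqfd, wqh_lock, items_lock, waker, releaser) are made up. waker() plays the irqfd_wakeup() role and runs with wqh_lock held, like a waiter invoked from the eventfd wait queue; releaser() plays kvm_irqfd_release() plus irqfd_shutdown(). Without the wqh_lock acquisition in releaser(), waker() could dereference freed memory; with it, the two paths fully serialize.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t wqh_lock   = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t items_lock = PTHREAD_MUTEX_INITIALIZER;

struct fake_irqfd {
	int active;	/* stands in for !list_empty(&irqfd->list) */
};

static struct fake_irqfd *the_irqfd;

/* POLLHUP path: like irqfd_wakeup(), runs with wqh_lock held. */
static void *waker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&wqh_lock);
	pthread_mutex_lock(&items_lock);
	if (the_irqfd && the_irqfd->active)	/* freed-memory access if the   */
		the_irqfd->active = 0;		/* release path skips wqh_lock  */
	pthread_mutex_unlock(&items_lock);
	pthread_mutex_unlock(&wqh_lock);
	return NULL;
}

/* Release path: deactivate, unhook and free.  Taking wqh_lock here is
 * what the patch adds; it excludes waker() for the whole sequence. */
static void *releaser(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&wqh_lock);
	pthread_mutex_lock(&items_lock);
	the_irqfd->active = 0;			/* irqfd_deactivate()          */
	free(the_irqfd);			/* irqfd_shutdown()'s kfree()  */
	the_irqfd = NULL;
	pthread_mutex_unlock(&items_lock);
	pthread_mutex_unlock(&wqh_lock);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	the_irqfd = calloc(1, sizeof(*the_irqfd));
	the_irqfd->active = 1;

	pthread_create(&t1, NULL, waker, NULL);
	pthread_create(&t2, NULL, releaser, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	puts("paths serialized on wqh_lock; no use-after-free");
	return 0;
}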
Signed-off-by: Li Zefan <lizefan@xxxxxxxxxx>
---
 fs/eventfd.c            | 34 ++++++++++++++--------------------
 include/linux/eventfd.h | 17 +++++++++++++++++
 virt/kvm/eventfd.c      | 47 ++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 68 insertions(+), 30 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 35470d9..cda8a4c 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -22,21 +22,6 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
-struct eventfd_ctx {
-	struct kref kref;
-	wait_queue_head_t wqh;
-	/*
-	 * Every time that a write(2) is performed on an eventfd, the
-	 * value of the __u64 being written is added to "count" and a
-	 * wakeup is performed on "wqh". A read(2) will return the "count"
-	 * value to userspace, and will reset "count" to zero. The kernel
-	 * side eventfd_signal() also, adds to the "count" counter and
-	 * issue a wakeup.
-	 */
-	__u64 count;
-	unsigned int flags;
-};
-
 /**
  * eventfd_signal - Adds @n to the eventfd counter.
  * @ctx: [in] Pointer to the eventfd context.
@@ -153,20 +138,29 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
  * This is used to atomically remove a wait queue entry from the eventfd wait
  * queue head, and read/reset the counter value.
  */
-int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+int eventfd_ctx_remove_wait_queue_locked(struct eventfd_ctx *ctx, wait_queue_t *wait,
 				  __u64 *cnt)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&ctx->wqh.lock, flags);
 	eventfd_ctx_do_read(ctx, cnt);
 	__remove_wait_queue(&ctx->wqh, wait);
 	if (*cnt != 0 && waitqueue_active(&ctx->wqh))
 		wake_up_locked_poll(&ctx->wqh, POLLOUT);
-	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return *cnt != 0 ? 0 : -EAGAIN;
 }
+EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue_locked);
+
+int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
+				  __u64 *cnt)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&ctx->wqh.lock, flags);
+	ret = eventfd_ctx_remove_wait_queue_locked(ctx, wait, cnt);
+	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+	return ret;
+}
 EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
 
 /**
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 3c3ef19..61085ac 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -26,6 +26,21 @@
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
 
+struct eventfd_ctx {
+	struct kref kref;
+	wait_queue_head_t wqh;
+	/*
+	 * Every time that a write(2) is performed on an eventfd, the
+	 * value of the __u64 being written is added to "count" and a
+	 * wakeup is performed on "wqh". A read(2) will return the "count"
+	 * value to userspace, and will reset "count" to zero. The kernel
+	 * side eventfd_signal() also, adds to the "count" counter and
+	 * issue a wakeup.
+	 */
+	__u64 count;
+	unsigned int flags;
+};
+
 #ifdef CONFIG_EVENTFD
 
 struct file *eventfd_file_create(unsigned int count, int flags);
@@ -38,6 +53,8 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
 ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt);
 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait,
 				  __u64 *cnt);
+int eventfd_ctx_remove_wait_queue_locked(struct eventfd_ctx *ctx, wait_queue_t *wait,
+				  __u64 *cnt);
 
 #else /* CONFIG_EVENTFD */
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index b6eea5c..ccbeb9a 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -89,6 +89,7 @@ struct _irqfd {
 	struct list_head list;
 	poll_table pt;
 	struct work_struct shutdown;
+	wait_queue_head_t *wqh;
 };
 
 static struct workqueue_struct *irqfd_cleanup_wq;
@@ -160,13 +161,6 @@ static void
 irqfd_shutdown(struct work_struct *work)
 {
 	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
-	u64 cnt;
-
-	/*
-	 * Synchronize with the wait-queue and unhook ourselves to prevent
-	 * further events.
-	 */
-	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
 
 	/*
 	 * We know no new events will be scheduled at this point, so block
@@ -197,15 +191,23 @@ irqfd_is_active(struct _irqfd *irqfd)
 /*
  * Mark the irqfd as inactive and schedule it for removal
  *
- * assumes kvm->irqfds.lock is held
+ * assumes kvm->irqfds.lock and irqfd->wqh.lock are held
  */
 static void
 irqfd_deactivate(struct _irqfd *irqfd)
 {
+	u64 cnt;
+
 	BUG_ON(!irqfd_is_active(irqfd));
 
 	list_del_init(&irqfd->list);
 
+	/*
+	 * Synchronize with the wait-queue and unhook ourselves to prevent
+	 * further events.
+	 */
+	eventfd_ctx_remove_wait_queue_locked(irqfd->eventfd, &irqfd->wait, &cnt);
+
 	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
 }
 
@@ -260,6 +262,7 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
 			poll_table *pt)
 {
 	struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
+	irqfd->wqh = wqh;
 	add_wait_queue(wqh, &irqfd->wait);
 }
 
@@ -454,6 +457,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
 	if (IS_ERR(eventfd))
 		return PTR_ERR(eventfd);
 
+	spin_lock_irq(&eventfd->wqh.lock);
 	spin_lock_irq(&kvm->irqfds.lock);
 
 	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
@@ -472,6 +476,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
 	}
 
 	spin_unlock_irq(&kvm->irqfds.lock);
+	spin_unlock_irq(&eventfd->wqh.lock);
 	eventfd_ctx_put(eventfd);
 
 	/*
@@ -504,14 +509,36 @@ void
 kvm_irqfd_release(struct kvm *kvm)
 {
 	struct _irqfd *irqfd, *tmp;
+	struct eventfd_ctx *ctx;
 
 	spin_lock_irq(&kvm->irqfds.lock);
+again:
+	if (list_empty(&kvm->irqfds.items)) {
+		spin_unlock_irq(&kvm->irqfds.lock);
+		goto out;
+	}
+
+	irqfd = list_first_entry(&kvm->irqfds.items, struct _irqfd, list);
+	ctx = eventfd_ctx_get(irqfd->eventfd);
+
+	spin_unlock_irq(&kvm->irqfds.lock);
 
-	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
-		irqfd_deactivate(irqfd);
+	spin_lock_irq(&ctx->wqh.lock);
+	spin_lock_irq(&kvm->irqfds.lock);
+
+	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
+		if (irqfd->eventfd == ctx)
+			irqfd_deactivate(irqfd);
+	}
 
 	spin_unlock_irq(&kvm->irqfds.lock);
+	spin_unlock_irq(&ctx->wqh.lock);
+	eventfd_ctx_put(ctx);
+	/* retake the lock that the check at "again" expects to be held */
+	spin_lock_irq(&kvm->irqfds.lock);
+	goto again;
 
+out:
 	/*
 	 * Block until we know all outstanding shutdown jobs have completed
 	 * since we do not take a kvm* reference.
-- 
1.8.0.2
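P.S. For reviewers: the lock ordering the patch ends up with, outermost first, is ctx->wqh.lock -> kvm->irqfds.lock, which matches the wakeup side, where the waker already holds wqh->lock by the time irqfd_wakeup() takes kvm->irqfds.lock. A condensed view (illustrative only, not part of the patch):

	/* wakeup side (eventfd close/signal)         teardown side (with this patch)
	 *
	 * spin_lock(&ctx->wqh.lock); [in the waker]  spin_lock_irq(&ctx->wqh.lock);
	 *   irqfd_wakeup()                           spin_lock_irq(&kvm->irqfds.lock);
	 *     spin_lock(&kvm->irqfds.lock);            irqfd_deactivate(irqfd);
	 *     ...                                    ...
	 */

This ordering is also why kvm_irqfd_release() can't simply take wqh.lock while already holding irqfds.lock -- that would invert the order -- so it pins one eventfd_ctx, drops irqfds.lock, re-acquires both locks in the right order, and restarts the walk; hence the "again" loop.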