From: Al Viro <viro@xxxxxxxxxxxxxxxxxx> The only reason for taking the extra ref to iocb is that we want to access it after vfs_poll() and an early wakeup could have already completed it by the time vfs_poll() returns. That's very easy to avoid, though - we need to know which lock to grab and, having grabbed it, we need to check if an early wakeup has already happened. So let's just copy the reference to the waitqueue into aio_poll_table and instead of having the "woken" flag in iocb, move it to the aio_poll() stack frame and put its address into iocb (and make sure it's cleared, so aio_poll_wake() won't step onto it after aio_poll() exits). That's enough to get rid of the refcount. In the async case freeing is done by aio_poll_complete() (and aio_poll() will only touch the iocb after vfs_poll() if it's guaranteed that aio_poll_complete() has not happened yet); in all other cases we make sure that wakeups haven't happened and won't happen, and deal with disposal of iocb ourselves. Signed-off-by: Al Viro <viro@xxxxxxxxxxxxxxxxxx> --- fs/aio.c | 55 +++++++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 22b288997441..ee062253e303 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -180,8 +180,8 @@ struct fsync_iocb { struct poll_iocb { struct file *file; struct wait_queue_head *head; + bool *taken; __poll_t events; - bool woken; bool cancelled; struct wait_queue_entry wait; struct work_struct work; @@ -209,8 +209,6 @@ struct aio_kiocb { struct list_head ki_list; /* the aio core uses this * for cancellation */ - refcount_t ki_refcnt; - /* * If the aio_resfd field of the userspace iocb is not zero, * this is the underlying eventfd context to deliver events to. 
@@ -1034,7 +1032,6 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) percpu_ref_get(&ctx->reqs); req->ki_ctx = ctx; INIT_LIST_HEAD(&req->ki_list); - refcount_set(&req->ki_refcnt, 0); req->ki_eventfd = NULL; return req; } @@ -1069,13 +1066,10 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) static inline void iocb_put(struct aio_kiocb *iocb) { - if (refcount_read(&iocb->ki_refcnt) == 0 || - refcount_dec_and_test(&iocb->ki_refcnt)) { - if (iocb->ki_filp) - fput(iocb->ki_filp); - percpu_ref_put(&iocb->ki_ctx->reqs); - kmem_cache_free(kiocb_cachep, iocb); - } + if (iocb->ki_filp) + fput(iocb->ki_filp); + percpu_ref_put(&iocb->ki_ctx->reqs); + kmem_cache_free(kiocb_cachep, iocb); } static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, @@ -1672,8 +1666,10 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & req->events)) return 0; - req->woken = true; - + if (unlikely(req->taken)) { + *req->taken = true; + req->taken = NULL; + } if (mask) { /* * Try to complete the iocb inline if we can. 
Use @@ -1698,6 +1694,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct aio_poll_table { struct poll_table_struct pt; + struct wait_queue_head *head; struct aio_kiocb *iocb; int error; }; @@ -1715,7 +1712,7 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, } pt->error = 0; - pt->iocb->poll.head = head; + pt->head = pt->iocb->poll.head = head; add_wait_queue(head, &pt->iocb->poll.wait); } @@ -1738,7 +1735,7 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; req->head = NULL; - req->woken = false; + req->taken = &async; req->cancelled = false; apt.pt._qproc = aio_poll_queue_proc; @@ -1750,36 +1747,38 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) INIT_LIST_HEAD(&req->wait.entry); init_waitqueue_func_entry(&req->wait, aio_poll_wake); - /* one for removal from waitqueue, one for this function */ - refcount_set(&aiocb->ki_refcnt, 2); - - mask = vfs_poll(req->file, &apt.pt) & req->events; - if (unlikely(!req->head)) { + mask = req->events; + mask &= vfs_poll(req->file, &apt.pt); + /* + * Careful: we might've been put into waitqueue *and* already + * woken up before vfs_poll() returns. The caller is holding + * a reference to file, so it couldn't have been killed under + * us, no matter what. However, in case of early wakeup, @req + * might be already gone by now. 
+ */ + if (unlikely(!apt.head)) { /* we did not manage to set up a waitqueue, done */ goto out; } - spin_lock_irq(&ctx->ctx_lock); - spin_lock(&req->head->lock); - if (req->woken) { /* already taken up by aio_poll_wake() */ - async = true; + spin_lock(&apt.head->lock); + if (async) { /* already taken up by aio_poll_wake() */ apt.error = 0; } else if (!mask && !apt.error) { /* actually waiting for an event */ list_add_tail(&aiocb->ki_list, &ctx->active_reqs); aiocb->ki_cancel = aio_poll_cancel; + req->taken = NULL; async = true; } else { /* if we get an error or a mask we are done */ WARN_ON_ONCE(list_empty(&req->wait.entry)); list_del_init(&req->wait.entry); /* no wakeup in the future either; aiocb is ours to dispose of */ } - spin_unlock(&req->head->lock); + spin_unlock(&apt.head->lock); spin_unlock_irq(&ctx->ctx_lock); - out: - if (async && !apt.error) + if (!async && !apt.error) aio_poll_complete(aiocb, mask); - iocb_put(aiocb); return apt.error; } -- 2.11.0