After discussing this one with Thomas yesterday at KR2023, I had this nagging feeling that something was still amiss. I took a closer look at it, and there is an issue with the odd case of futex_wait_multiple_setup() returning 1. It does so if a wakeup was triggered during setup. That in itself is fine, except it then also unqueues ALL the futexes at that point, which is unlike the normal wakeup path on the io_uring side. It'd be nice to unify those and leave the cleanup to the caller, but since we also re-loop in that setup handler if nobody was woken, AND we use futex_unqueue_multiple() to see whether we were woken to begin with, I think it's cleaner to just note this fact in io_uring and deal with it.
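
To make the resulting flow easier to follow, here's roughly what the handling of the futex_wait_multiple_setup() return value in io_futexv_wait() boils down to after this change. This is a condensed paraphrase with the skip label unrolled, not the verbatim function:

	ret = futex_wait_multiple_setup(futexv, iof->futex_nr, &woken);

	if (ret == 1) {
		/* Wakeup raced with setup; setup already unqueued every futex */
		iof->futexv_unqueued = 1;
		io_req_set_res(req, woken, 0);
		io_ring_submit_unlock(ctx, issue_flags);
		return IOU_ISSUE_SKIP_COMPLETE;
	}

	__set_current_state(TASK_RUNNING);

	if (!ret) {
		/* Waiters armed, no wakeup yet; completion runs via task_work */
		hlist_add_head(&req->hash_node, &ctx->futex_list);
		io_ring_submit_unlock(ctx, issue_flags);
		return IOU_ISSUE_SKIP_COMPLETE;
	}

	/* ret < 0: setup failed, nothing was queued */
	io_ring_submit_unlock(ctx, issue_flags);
	req_set_fail(req);
	io_req_set_res(req, ret, 0);
	kfree(futexv);
	req->flags &= ~REQ_F_ASYNC_DATA;
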
I'm folding in the below incremental for now. It has a few cleanups in there too that I spotted while doing that; the important bit is the ->futexv_unqueued part.

diff --git a/io_uring/futex.c b/io_uring/futex.c
index 0c07df8668aa..8a2b0a260d5b 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -23,6 +23,7 @@ struct io_futex {
 	unsigned long	futexv_owned;
 	u32		futex_flags;
 	unsigned int	futex_nr;
+	bool		futexv_unqueued;
 };
 
 struct io_futex_data {
@@ -71,25 +72,21 @@ static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
 	struct futex_vector *futexv = req->async_data;
-	struct io_ring_ctx *ctx = req->ctx;
-	int res = 0;
 
-	io_tw_lock(ctx, ts);
+	io_tw_lock(req->ctx, ts);
+
+	if (!iof->futexv_unqueued) {
+		int res = futex_unqueue_multiple(futexv, iof->futex_nr);
 
-	res = futex_unqueue_multiple(futexv, iof->futex_nr);
-	if (res != -1)
-		io_req_set_res(req, res, 0);
+		if (res != -1)
+			io_req_set_res(req, res, 0);
+	}
 
 	kfree(req->async_data);
 	req->flags &= ~REQ_F_ASYNC_DATA;
 	__io_futex_complete(req, ts);
 }
 
-static bool io_futexv_claimed(struct io_futex *iof)
-{
-	return test_bit(0, &iof->futexv_owned);
-}
-
 static bool io_futexv_claim(struct io_futex *iof)
 {
 	if (test_bit(0, &iof->futexv_owned) ||
@@ -238,6 +235,7 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}
 
 	iof->futexv_owned = 0;
+	iof->futexv_unqueued = 0;
 	req->flags |= REQ_F_ASYNC_DATA;
 	req->async_data = futexv;
 	return 0;
@@ -278,6 +276,18 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
 
 	ret = futex_wait_multiple_setup(futexv, iof->futex_nr, &woken);
 
+	/*
+	 * We got woken while setting up, let that side do the completion.
+	 * Note that futex_wait_multiple_setup() will have unqueued all
+	 * the futexes in this case. Mark us as having done that already,
+	 * since this is different from normal wakeup.
+	 */
+	if (ret == 1) {
+		iof->futexv_unqueued = 1;
+		io_req_set_res(req, woken, 0);
+		goto skip;
+	}
+
 	/*
 	 * The above call leaves us potentially non-running. This is fine
 	 * for the sync syscall as it'll be blocking unless we already got
@@ -287,29 +297,23 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
 	__set_current_state(TASK_RUNNING);
 
 	/*
-	 * We got woken while setting up, let that side do the completion
+	 * 0 return means that we successfully setup the waiters, and that
+	 * nobody triggered a wakeup while we were doing so. If the wakeup
+	 * happened post setup, the task_work will be run post this issue
+	 * and under the submission lock.
 	 */
-	if (io_futexv_claimed(iof)) {
+	if (!ret) {
+		hlist_add_head(&req->hash_node, &ctx->futex_list);
 skip:
 		io_ring_submit_unlock(ctx, issue_flags);
 		return IOU_ISSUE_SKIP_COMPLETE;
 	}
 
 	/*
-	 * 0 return means that we successfully setup the waiters, and that
-	 * nobody triggered a wakeup while we were doing so. < 0 or 1 return
-	 * is either an error or we got a wakeup while setting up.
+	 * Error case, ret is < 0. Mark the request as failed.
 	 */
-	if (!ret) {
-		hlist_add_head(&req->hash_node, &ctx->futex_list);
-		goto skip;
-	}
-
 	io_ring_submit_unlock(ctx, issue_flags);
-	if (ret < 0)
-		req_set_fail(req);
-	else if (woken != -1)
-		ret = woken;
+	req_set_fail(req);
 	io_req_set_res(req, ret, 0);
 	kfree(futexv);
 	req->flags &= ~REQ_F_ASYNC_DATA;

-- 
Jens Axboe