It can be quite beneficial to mark appropriate functions with
__attribute__((hot)); it mostly helps the compiler to rearrange
functions so they are cached better. E.g. the nops test showed a
31->32 MIOPS improvement with it.

Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx>
---
 fs/io_uring.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 62dc128e9b6b..e35569df7f80 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -114,6 +114,8 @@
 
 #define IO_TCTX_REFS_CACHE_NR	(1U << 10)
 
+#define __hot			__attribute__((__hot__))
+
 struct io_uring {
 	u32 head ____cacheline_aligned_in_smp;
 	u32 tail ____cacheline_aligned_in_smp;
@@ -1816,8 +1818,8 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,
 	io_cqring_ev_posted(ctx);
 }
 
-static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
-					 u32 cflags)
+static inline __hot void io_req_complete_state(struct io_kiocb *req,
+					       s32 res, u32 cflags)
 {
 	req->result = res;
 	req->cflags = cflags;
@@ -2260,8 +2262,8 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked)
 	io_free_req(req);
 }
 
-static void io_free_batch_list(struct io_ring_ctx *ctx,
-			       struct io_wq_work_node *node)
+static __hot void io_free_batch_list(struct io_ring_ctx *ctx,
+				     struct io_wq_work_node *node)
 	__must_hold(&ctx->uring_lock)
 {
 	struct task_struct *task = NULL;
@@ -2294,7 +2296,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
 	io_put_task(task, task_refs);
 }
 
-static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
+static __hot void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	__must_hold(&ctx->uring_lock)
 {
 	struct io_wq_work_node *node, *prev;
@@ -2389,7 +2391,7 @@ static inline bool io_run_task_work(void)
 	return false;
 }
 
-static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
+static __hot int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
 	int nr_events = 0;
@@ -2479,7 +2481,7 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
 	mutex_unlock(&ctx->uring_lock);
 }
 
-static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
+static __hot int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 {
 	unsigned int nr_events = 0;
 	int ret = 0;
@@ -6541,7 +6543,7 @@ static void io_clean_op(struct io_kiocb *req)
 	req->flags &= ~IO_REQ_CLEAN_FLAGS;
 }
 
-static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
+static __hot int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	const struct cred *creds = NULL;
@@ -6882,7 +6884,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
 	io_queue_linked_timeout(linked_timeout);
 }
 
-static inline void __io_queue_sqe(struct io_kiocb *req)
+static inline __hot void __io_queue_sqe(struct io_kiocb *req)
 	__must_hold(&req->ctx->uring_lock)
 {
 	struct io_kiocb *linked_timeout;
@@ -6926,7 +6928,7 @@ static void io_queue_sqe_fallback(struct io_kiocb *req)
 	}
 }
 
-static inline void io_queue_sqe(struct io_kiocb *req)
+static inline __hot void io_queue_sqe(struct io_kiocb *req)
 	__must_hold(&req->ctx->uring_lock)
 {
 	if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
@@ -6977,8 +6979,8 @@ static void io_init_req_drain(struct io_kiocb *req)
 	}
 }
 
-static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
-		       const struct io_uring_sqe *sqe)
+static __hot int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			     const struct io_uring_sqe *sqe)
 	__must_hold(&ctx->uring_lock)
 {
 	struct io_submit_state *state;
@@ -7050,8 +7052,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	return io_req_prep(req, sqe);
 }
 
-static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-			 const struct io_uring_sqe *sqe)
+static __hot int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			       const struct io_uring_sqe *sqe)
 	__must_hold(&ctx->uring_lock)
 {
 	struct io_submit_link *link = &ctx->submit_state.link;
@@ -7174,7 +7176,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
  * used, it's important that those reads are done through READ_ONCE() to
  * prevent a re-load down the line.
  */
-static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
+static inline const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 {
 	unsigned head, mask = ctx->sq_entries - 1;
 	unsigned sq_idx = ctx->cached_sq_head++ & mask;
@@ -7198,7 +7200,7 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 	return NULL;
 }
 
-static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
+static __hot int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	__must_hold(&ctx->uring_lock)
 {
 	unsigned int entries = io_sqring_entries(ctx);
-- 
2.33.0
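
For readers unfamiliar with the attribute, here is a minimal standalone
sketch (not part of the patch; the file and function names are made up)
of what the __hot wrapper expands to and why it matters: GCC optimizes
hot-marked functions more aggressively and, on many targets, groups them
into a .text.hot subsection so frequently executed code stays close
together in the instruction cache.

/* hot_sketch.c -- illustrative only; build with: gcc -O2 hot_sketch.c */
#include <stdio.h>

/* Same shorthand the patch introduces for fs/io_uring.c. */
#define __hot	__attribute__((__hot__))

/*
 * A hot-marked function: the compiler treats it as a likely hot spot
 * and, on many targets, emits its out-of-line copy next to other hot
 * functions for better i-cache locality on the fast path.
 */
__hot int sum_ring(const int *ring, unsigned int entries)
{
	unsigned int i;
	int sum = 0;

	for (i = 0; i < entries; i++)
		sum += ring[i];
	return sum;
}

int main(void)
{
	int ring[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

	printf("sum = %d\n", sum_ring(ring, 8));
	return 0;
}

On toolchains where -freorder-functions is enabled, inspecting the
object file (e.g. with objdump -t) should show the function placed in a
.text.hot* section; the kernel build benefits from the same grouping,
which is what the nops benchmark number above reflects.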