On 2/6/24 5:43 PM, Pavel Begunkov wrote:
> On 2/6/24 16:22, Jens Axboe wrote:
>> We're out of space here, and none of the flags are easily reclaimable.
>> Bump it to 64-bits and re-arrange the struct a bit to avoid gaps.
>>
>> Add a specific bitwise type for the request flags, io_request_flags_t.
>> This will help catch violations of casting this value to a smaller type
>> on 32-bit archs, like unsigned int.
>>
>> No functional changes intended in this patch.
>>
>> Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
>> ---
>>  include/linux/io_uring_types.h  | 87 ++++++++++++++++++---------------
>>  include/trace/events/io_uring.h | 14 +++---
>>  io_uring/filetable.h            |  2 +-
>>  io_uring/io_uring.c             |  9 ++--
>>  4 files changed, 60 insertions(+), 52 deletions(-)
>>
>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>> index 854ad67a5f70..5ac18b05d4ee 100644
>> --- a/include/linux/io_uring_types.h
>> +++ b/include/linux/io_uring_types.h
>> @@ -428,7 +428,7 @@ struct io_tw_state {
>>  	bool locked;
>>  };
>>
>> -enum {
>> +enum io_req_flags {
>>  	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
>>  	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
>>  	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
>> @@ -468,70 +468,73 @@ enum {
>>  	__REQ_F_LAST_BIT,
>>  };
>>
>> +typedef enum io_req_flags __bitwise io_req_flags_t;
>> +#define IO_REQ_FLAG(bitno)	((__force io_req_flags_t) BIT_ULL((bitno)))
>> +
>>  enum {
>>  	/* ctx owns file */
>> -	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
>> +	REQ_F_FIXED_FILE	= IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT),
>>  	/* drain existing IO first */
>> -	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
>> +	REQ_F_IO_DRAIN		= IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT),
>>  	/* linked sqes */
>> -	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
>> +	REQ_F_LINK		= IO_REQ_FLAG(REQ_F_LINK_BIT),
>>  	/* doesn't sever on completion < 0 */
>> -	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
>> +	REQ_F_HARDLINK		= IO_REQ_FLAG(REQ_F_HARDLINK_BIT),
>>  	/* IOSQE_ASYNC */
>> -	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
>> +	REQ_F_FORCE_ASYNC	= IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT),
>>  	/* IOSQE_BUFFER_SELECT */
>> -	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
>> +	REQ_F_BUFFER_SELECT	= IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT),
>>  	/* IOSQE_CQE_SKIP_SUCCESS */
>> -	REQ_F_CQE_SKIP		= BIT(REQ_F_CQE_SKIP_BIT),
>> +	REQ_F_CQE_SKIP		= IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT),
>>  	/* fail rest of links */
>> -	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
>> +	REQ_F_FAIL		= IO_REQ_FLAG(REQ_F_FAIL_BIT),
>>  	/* on inflight list, should be cancelled and waited on exit reliably */
>> -	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
>> +	REQ_F_INFLIGHT		= IO_REQ_FLAG(REQ_F_INFLIGHT_BIT),
>>  	/* read/write uses file position */
>> -	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
>> +	REQ_F_CUR_POS		= IO_REQ_FLAG(REQ_F_CUR_POS_BIT),
>>  	/* must not punt to workers */
>> -	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
>> +	REQ_F_NOWAIT		= IO_REQ_FLAG(REQ_F_NOWAIT_BIT),
>>  	/* has or had linked timeout */
>> -	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
>> +	REQ_F_LINK_TIMEOUT	= IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT),
>>  	/* needs cleanup */
>> -	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
>> +	REQ_F_NEED_CLEANUP	= IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
>>  	/* already went through poll handler */
>> -	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
>> +	REQ_F_POLLED		= IO_REQ_FLAG(REQ_F_POLLED_BIT),
>>  	/* buffer already selected */
>> -	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
>> +	REQ_F_BUFFER_SELECTED	= IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
>>  	/* buffer selected from ring, needs commit */
>> -	REQ_F_BUFFER_RING	= BIT(REQ_F_BUFFER_RING_BIT),
>> +	REQ_F_BUFFER_RING	= IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT),
>>  	/* caller should reissue async */
>> -	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
>> +	REQ_F_REISSUE		= IO_REQ_FLAG(REQ_F_REISSUE_BIT),
>>  	/* supports async reads/writes */
>> -	REQ_F_SUPPORT_NOWAIT	= BIT(REQ_F_SUPPORT_NOWAIT_BIT),
>> +	REQ_F_SUPPORT_NOWAIT	= IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT),
>>  	/* regular file */
>> -	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
>> +	REQ_F_ISREG		= IO_REQ_FLAG(REQ_F_ISREG_BIT),
>>  	/* has creds assigned */
>> -	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
>> +	REQ_F_CREDS		= IO_REQ_FLAG(REQ_F_CREDS_BIT),
>>  	/* skip refcounting if not set */
>> -	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
>> +	REQ_F_REFCOUNT		= IO_REQ_FLAG(REQ_F_REFCOUNT_BIT),
>>  	/* there is a linked timeout that has to be armed */
>> -	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
>> +	REQ_F_ARM_LTIMEOUT	= IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT),
>>  	/* ->async_data allocated */
>> -	REQ_F_ASYNC_DATA	= BIT(REQ_F_ASYNC_DATA_BIT),
>> +	REQ_F_ASYNC_DATA	= IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT),
>>  	/* don't post CQEs while failing linked requests */
>> -	REQ_F_SKIP_LINK_CQES	= BIT(REQ_F_SKIP_LINK_CQES_BIT),
>> +	REQ_F_SKIP_LINK_CQES	= IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT),
>>  	/* single poll may be active */
>> -	REQ_F_SINGLE_POLL	= BIT(REQ_F_SINGLE_POLL_BIT),
>> +	REQ_F_SINGLE_POLL	= IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT),
>>  	/* double poll may active */
>> -	REQ_F_DOUBLE_POLL	= BIT(REQ_F_DOUBLE_POLL_BIT),
>> +	REQ_F_DOUBLE_POLL	= IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT),
>>  	/* request has already done partial IO */
>> -	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
>> +	REQ_F_PARTIAL_IO	= IO_REQ_FLAG(REQ_F_PARTIAL_IO_BIT),
>>  	/* fast poll multishot mode */
>> -	REQ_F_APOLL_MULTISHOT	= BIT(REQ_F_APOLL_MULTISHOT_BIT),
>> +	REQ_F_APOLL_MULTISHOT	= IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
>>  	/* recvmsg special flag, clear EPOLLIN */
>> -	REQ_F_CLEAR_POLLIN	= BIT(REQ_F_CLEAR_POLLIN_BIT),
>> +	REQ_F_CLEAR_POLLIN	= IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
>>  	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
>> -	REQ_F_HASH_LOCKED	= BIT(REQ_F_HASH_LOCKED_BIT),
>> +	REQ_F_HASH_LOCKED	= IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
>>  	/* don't use lazy poll wake for this request */
>> -	REQ_F_POLL_NO_LAZY	= BIT(REQ_F_POLL_NO_LAZY_BIT),
>> +	REQ_F_POLL_NO_LAZY	= IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
>>  };
>>
>>  typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
>> @@ -592,15 +595,14 @@ struct io_kiocb {
>>  	 * and after selection it points to the buffer ID itself.
>>  	 */
>>  	u16				buf_index;
>> -	unsigned int			flags;
>> -	struct io_cqe			cqe;
>
> With the current layout the min number of lines we touch per
> request is 2 (including the op specific 64B), that includes
> setting up the cqe at init and using it for completing. Moving
> cqe down makes it 3.
>
>> +	atomic_t			refs;
>
> We're pulling in refs, which is not touched at all in the hot
> path. Even if there's a hole I'd argue it's better to leave it
> at the end.
>
>> +
>> +	io_req_flags_t			flags;
>>  	struct io_ring_ctx		*ctx;
>>  	struct task_struct		*task;
>> -	struct io_rsrc_node		*rsrc_node;
>
> It's used in hot paths, registered buffers/files, would be
> unfortunate to move it to the next line.

Yep, I did feel a bit bad about that one... Let me take another stab
at it.
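FWIW, the easy way to double check the hole situation while shuffling
things around is pahole on the compiled object:

$ pahole -C io_kiocb io_uring/io_uring.o

which prints field offsets, holes, and cacheline boundaries directly.
We could also consider a compile-time guard so a later re-shuffle
doesn't silently grow the struct - something like the below, totally
untested, and the bound is just illustrative of whatever the current
layout works out to:

	/* don't let io_kiocb silently grow past 4 cachelines */
	static_assert(sizeof(struct io_kiocb) <= 4 * 64);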
>> -
>>  	union {
>>  		/* store used ubuf, so we can prevent reloading */
>>  		struct io_mapped_ubuf	*imu;
>> @@ -615,18 +617,23 @@ struct io_kiocb {
>>  		struct io_buffer_list	*buf_list;
>>  	};
>>
>> +	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
>> +	struct hlist_node		hash_node;
>> +
>
> And we're pulling hash_node into the hottest line, which is
> used only when we arm a poll and remove poll. So, it's mostly
> for networking, sends wouldn't use it much, and multishots
> wouldn't normally touch it.
>
> As for ideas how to find space:
> 1) iopoll_completed can be converted to flags2

That's a good idea, but it won't immediately find any space, as it'd
just leave a hole anyway. It would be good to note in there though, as
you never know when it needs re-arranging again.
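Roughly the kind of note I have in mind - names invented here, with
the actual conversion left for whenever we need the space back:

	/*
	 * Only used by iopoll right now. If we ever need more request
	 * state bits, this could become a flags2 byte instead, eg:
	 *
	 *	u8	flags2;
	 *	#define REQ_F2_IOPOLL_COMPLETED	BIT(0)
	 *
	 * Note that iopoll_completed is written with smp_store_release()
	 * and read with READ_ONCE(), so any conversion would need to
	 * preserve that ordering.
	 */
	bool				iopoll_completed;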
> 2) REQ_F_{SINGLE,DOUBLE}_POLL is a weird duplication. Can
> probably be combined into one flag, or removed altogether.
> Again, sends are usually not so poll heavy, and the hot
> path for recv is multishot.

Normal receive is also a hot path, even if multishot should be
preferred in general. Ditto for non-socket but still pollable files,
eg doing a regular read.

> 3) we can probably move req->task down and replace it with
>
> get_task() {
> 	if (req->ctx->flags & DEFER_TASKRUN)
> 		task = ctx->submitter_task;
> 	else
> 		task = req->task;
> }

Assuming ctx->flags is hot, which it would generally be, that's not a
bad idea at all. I'll do another loop over this one.

-- 
Jens Axboe