On 7/8/22 1:33 AM, Dylan Yudaken wrote:
> On Thu, 2022-07-07 at 17:23 -0600, Jens Axboe wrote:
>> For recvmsg/sendmsg, if they don't complete inline, we currently need
>> to allocate a struct io_async_msghdr for each request. This is a
>> somewhat large struct.
>>
>> Hook up sendmsg/recvmsg to use the io_alloc_cache. This reduces the
>> alloc + free overhead considerably, yielding 4-5% of extra performance
>> running netbench.
>>
>> Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
>> ---
>>  include/linux/io_uring_types.h |  6 ++-
>>  io_uring/io_uring.c            |  3 ++
>>  io_uring/net.c                 | 73 +++++++++++++++++++++++++++++-----
>>  io_uring/net.h                 | 11 ++++-
>>  4 files changed, 81 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
>> index bf8f95332eda..d54b8b7e0746 100644
>> --- a/include/linux/io_uring_types.h
>> +++ b/include/linux/io_uring_types.h
>> @@ -222,8 +222,7 @@ struct io_ring_ctx {
>>  		struct io_hash_table	cancel_table_locked;
>>  		struct list_head	cq_overflow_list;
>>  		struct io_alloc_cache	apoll_cache;
>> -		struct xarray		personalities;
>> -		u32			pers_next;
>> +		struct io_alloc_cache	netmsg_cache;
>>  	} ____cacheline_aligned_in_smp;
>>
>>  	/* IRQ completion list, under ->completion_lock */
>> @@ -241,6 +240,9 @@ struct io_ring_ctx {
>>  	unsigned int		file_alloc_start;
>>  	unsigned int		file_alloc_end;
>>
>> +	struct xarray		personalities;
>> +	u32			pers_next;
>> +
>>  	struct {
>>  		/*
>>  		 * We cache a range of free CQEs we can use, once exhausted it
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index b5098773d924..32110c5b4059 100644
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -89,6 +89,7 @@
>>  #include "kbuf.h"
>>  #include "rsrc.h"
>>  #include "cancel.h"
>> +#include "net.h"
>>
>>  #include "timeout.h"
>>  #include "poll.h"
>> @@ -297,6 +298,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
>>  	INIT_LIST_HEAD(&ctx->cq_overflow_list);
>>  	INIT_LIST_HEAD(&ctx->io_buffers_cache);
>>  	io_alloc_cache_init(&ctx->apoll_cache);
>> +	io_alloc_cache_init(&ctx->netmsg_cache);
>>  	init_completion(&ctx->ref_comp);
>>  	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
>>  	mutex_init(&ctx->uring_lock);
>> @@ -2473,6 +2475,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
>>  	__io_cqring_overflow_flush(ctx, true);
>>  	io_eventfd_unregister(ctx);
>>  	io_flush_apoll_cache(ctx);
>> +	io_flush_netmsg_cache(ctx);
>>  	mutex_unlock(&ctx->uring_lock);
>>  	io_destroy_buffers(ctx);
>>  	if (ctx->sq_creds)
>> diff --git a/io_uring/net.c b/io_uring/net.c
>> index 6679069eeef1..ba7e94ff287c 100644
>> --- a/io_uring/net.c
>> +++ b/io_uring/net.c
>> @@ -12,6 +12,7 @@
>>
>>  #include "io_uring.h"
>>  #include "kbuf.h"
>> +#include "alloc_cache.h"
>>  #include "net.h"
>>
>>  #if defined(CONFIG_NET)
>> @@ -97,18 +98,57 @@ static bool io_net_retry(struct socket *sock, int flags)
>>  	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
>>  }
>>
>> +static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
>> +{
>> +	struct io_async_msghdr *hdr = req->async_data;
>> +
>> +	if (!hdr || issue_flags & IO_URING_F_UNLOCKED)
>> +		return;
>> +
>> +	if (io_alloc_cache_store(&req->ctx->netmsg_cache)) {
>> +		hlist_add_head(&hdr->cache_list, &req->ctx->netmsg_cache.list);
>
> can io_alloc_cache_store just do the store?
> would be nicer to have cache::list be generally unused outside of the
> cache code.
We could do that if we just make the hlist_node be inside a struct. That
would probably allow cleaning up the get-entry side too; let me give it
a whirl.
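Something along these lines, perhaps. Note this is a rough, untested
sketch: the io_cache_entry type and the put/get names are made up for
illustration, and the size capping that io_alloc_cache_store() does is
left out here.

struct io_cache_entry {
	struct hlist_node	node;
};

/*
 * The cache does the list add itself, so ->list stays private to the
 * cache code, per your comment above.
 */
static inline void io_alloc_cache_put(struct io_alloc_cache *cache,
				      struct io_cache_entry *entry)
{
	hlist_add_head(&entry->node, &cache->list);
}

static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
{
	if (!hlist_empty(&cache->list)) {
		struct hlist_node *node = cache->list.first;

		hlist_del(node);
		return container_of(node, struct io_cache_entry, node);
	}
	return NULL;
}

Callers would then embed a struct io_cache_entry rather than a bare
hlist_node, so io_netmsg_recycle() would end up as something like:

	io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache);

with the recycled object recovered via container_of() on the get side.

-- 
Jens Axboe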