From: Ammar Faizi <ammarfaizi2@xxxxxxxxxxx> These functions are called at initialization, which is a slow path. Mark them as __cold so that the compiler will optimize for code size. Here is the result of compiling with Ubuntu clang 15.0.0-++20220601012204+ec2711b35411-1~exp1~20220601012300.510 Without this patch: $ wc -c src/liburing.so.2.3 71288 src/liburing.so.2.3 With this patch: $ wc -c src/liburing.so.2.3 69448 src/liburing.so.2.3 Taking one slow-path function as an example, using __cold avoids aggressive inlining. Without this patch: 00000000000024f0 <io_uring_queue_init>: 24f0: pushq %r14 24f2: pushq %rbx 24f3: subq $0x78,%rsp 24f7: movq %rsi,%r14 24fa: xorps %xmm0,%xmm0 24fd: movaps %xmm0,(%rsp) 2501: movaps %xmm0,0x60(%rsp) 2506: movaps %xmm0,0x50(%rsp) 250b: movaps %xmm0,0x40(%rsp) 2510: movaps %xmm0,0x30(%rsp) 2515: movaps %xmm0,0x20(%rsp) 251a: movaps %xmm0,0x10(%rsp) 251f: movq $0x0,0x70(%rsp) 2528: movl %edx,0x8(%rsp) 252c: movq %rsp,%rsi 252f: movl $0x1a9,%eax 2534: syscall 2536: movq %rax,%rbx 2539: testl %ebx,%ebx 253b: js 256a <io_uring_queue_init+0x7a> 253d: movq %rsp,%rsi 2540: movl %ebx,%edi 2542: movq %r14,%rdx 2545: callq 2080 <io_uring_queue_mmap@plt> 254a: testl %eax,%eax 254c: je 255d <io_uring_queue_init+0x6d> 254e: movl %eax,%edx 2550: movl $0x3,%eax 2555: movl %ebx,%edi 2557: syscall 2559: movl %edx,%ebx 255b: jmp 256a <io_uring_queue_init+0x7a> 255d: movl 0x14(%rsp),%eax 2561: movl %eax,0xc8(%r14) 2568: xorl %ebx,%ebx 256a: movl %ebx,%eax 256c: addq $0x78,%rsp 2570: popq %rbx 2571: popq %r14 2573: retq With this patch: 000000000000240c <io_uring_queue_init>: 240c: subq $0x78,%rsp 2410: xorps %xmm0,%xmm0 2413: movq %rsp,%rax 2416: movaps %xmm0,(%rax) 2419: movaps %xmm0,0x60(%rax) 241d: movaps %xmm0,0x50(%rax) 2421: movaps %xmm0,0x40(%rax) 2425: movaps %xmm0,0x30(%rax) 2429: movaps %xmm0,0x20(%rax) 242d: movaps %xmm0,0x10(%rax) 2431: movq $0x0,0x70(%rax) 2439: movl %edx,0x8(%rax) 243c: movq %rax,%rdx 243f: callq 2090 <io_uring_queue_init_params@plt> 
2444: addq $0x78,%rsp 2448: retq Signed-off-by: Ammar Faizi <ammarfaizi2@xxxxxxxxxxx> --- src/setup.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/setup.c b/src/setup.c index d2adc7f..2badcc1 100644 --- a/src/setup.c +++ b/src/setup.c @@ -89,7 +89,8 @@ err: * Returns -errno on error, or zero on success. On success, 'ring' * contains the necessary information to read/write to the rings. */ -int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring) +__cold int io_uring_queue_mmap(int fd, struct io_uring_params *p, + struct io_uring *ring) { int ret; @@ -107,7 +108,7 @@ int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring * Ensure that the mmap'ed rings aren't available to a child after a fork(2). * This uses madvise(..., MADV_DONTFORK) on the mmap'ed ranges. */ -int io_uring_ring_dontfork(struct io_uring *ring) +__cold int io_uring_ring_dontfork(struct io_uring *ring) { size_t len; int ret; @@ -138,8 +139,8 @@ int io_uring_ring_dontfork(struct io_uring *ring) return 0; } -int io_uring_queue_init_params(unsigned entries, struct io_uring *ring, - struct io_uring_params *p) +__cold int io_uring_queue_init_params(unsigned entries, struct io_uring *ring, + struct io_uring_params *p) { int fd, ret; @@ -161,7 +162,8 @@ int io_uring_queue_init_params(unsigned entries, struct io_uring *ring, * Returns -errno on error, or zero on success. On success, 'ring' * contains the necessary information to read/write to the rings. 
*/ -int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) +__cold int io_uring_queue_init(unsigned entries, struct io_uring *ring, + unsigned flags) { struct io_uring_params p; @@ -171,7 +173,7 @@ int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) return io_uring_queue_init_params(entries, ring, &p); } -void io_uring_queue_exit(struct io_uring *ring) +__cold void io_uring_queue_exit(struct io_uring *ring) { struct io_uring_sq *sq = &ring->sq; struct io_uring_cq *cq = &ring->cq; @@ -191,7 +193,7 @@ void io_uring_queue_exit(struct io_uring *ring) __sys_close(ring->ring_fd); } -struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring) +__cold struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring) { struct io_uring_probe *probe; size_t len; @@ -211,7 +213,7 @@ struct io_uring_probe *io_uring_get_probe_ring(struct io_uring *ring) return NULL; } -struct io_uring_probe *io_uring_get_probe(void) +__cold struct io_uring_probe *io_uring_get_probe(void) { struct io_uring ring; struct io_uring_probe *probe; @@ -226,7 +228,7 @@ struct io_uring_probe *io_uring_get_probe(void) return probe; } -void io_uring_free_probe(struct io_uring_probe *probe) +__cold void io_uring_free_probe(struct io_uring_probe *probe) { uring_free(probe); } @@ -284,7 +286,8 @@ static size_t rings_size(struct io_uring_params *p, unsigned entries, * return the required memory so that the caller can ensure that enough space * is available before setting up a ring with the specified parameters. */ -ssize_t io_uring_mlock_size_params(unsigned entries, struct io_uring_params *p) +__cold ssize_t io_uring_mlock_size_params(unsigned entries, + struct io_uring_params *p) { struct io_uring_params lp = { }; struct io_uring ring; @@ -343,7 +346,7 @@ ssize_t io_uring_mlock_size_params(unsigned entries, struct io_uring_params *p) * Return required ulimit -l memory space for a given ring setup. See * @io_uring_mlock_size_params(). 
*/ -ssize_t io_uring_mlock_size(unsigned entries, unsigned flags) +__cold ssize_t io_uring_mlock_size(unsigned entries, unsigned flags) { struct io_uring_params p = { .flags = flags, }; -- Ammar Faizi