Our provided buffer interface only allows selection of a single buffer. Add an API that allows getting/peeking multiple buffers at the same time. This is only implemented for the ring provided buffers. It could be added for the legacy provided buffers as well, but since it's strongly encouraged to use the new interface, let's keep it simpler and just provide it for the new API. The legacy interface will always just select a single buffer. There are two new main functions: io_buffers_select(), which selects as many buffers as it can. The caller supplies the iovec array, and io_buffers_select() may allocate a bigger array if the 'out_len' being passed in is non-zero and bigger than what we can fit in the provided iovec. Buffers grabbed with this helper are permanently assigned. io_buffers_peek(), which works like io_buffers_select(), except that the buffers can be recycled, if needed. Callers using either of these functions should call io_put_kbufs() rather than io_put_kbuf() at completion time. The peek interface must be called with the ctx locked from peek to completion. This adds a bit of state for the request: - REQ_F_BUFFERS_COMMIT, which means that the buffers have been peeked and should be committed to the buffer ring head when they are put as part of completion. Prior to this, we used the fact that req->buf_list was cleared to NULL when committed. But with the peek interface requiring the ring to be locked throughout the operation, we can use that as a lookup cache instead. 
Signed-off-by: Jens Axboe <axboe@xxxxxxxxx> --- include/linux/io_uring_types.h | 3 + io_uring/kbuf.c | 203 ++++++++++++++++++++++++++++++--- io_uring/kbuf.h | 39 +++++-- 3 files changed, 223 insertions(+), 22 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e24893625085..971294dfd22e 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -481,6 +481,7 @@ enum { REQ_F_CAN_POLL_BIT, REQ_F_BL_EMPTY_BIT, REQ_F_BL_NO_RECYCLE_BIT, + REQ_F_BUFFERS_COMMIT_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -559,6 +560,8 @@ enum { REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT), /* don't recycle provided buffers for this request */ REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT), + /* buffer ring head needs incrementing on put */ + REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 9be42bff936b..921e8e25e027 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -140,34 +140,57 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, return NULL; } +static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + struct iovec *iov) +{ + void __user *buf; + + buf = io_provided_buffer_select(req, len, bl); + if (unlikely(!buf)) + return -ENOBUFS; + + iov[0].iov_base = buf; + iov[0].iov_len = *len; + return 0; +} + +static struct io_uring_buf *io_ring_head_to_buf(struct io_buffer_list *bl, + __u16 head) +{ + head &= bl->mask; + + /* mmaped buffers are always contig */ + if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { + return &bl->buf_ring->bufs[head]; + } else { + int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); + int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; + struct io_uring_buf *buf; + + buf = 
page_address(bl->buf_pages[index]); + return buf + off; + } +} + static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl, unsigned int issue_flags) { - struct io_uring_buf_ring *br = bl->buf_ring; __u16 tail, head = bl->head; struct io_uring_buf *buf; - tail = smp_load_acquire(&br->tail); + tail = smp_load_acquire(&bl->buf_ring->tail); if (unlikely(tail == head)) return NULL; if (head + 1 == tail) req->flags |= REQ_F_BL_EMPTY; - head &= bl->mask; - /* mmaped buffers are always contig */ - if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { - buf = &br->bufs[head]; - } else { - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; - buf = page_address(bl->buf_pages[index]); - buf += off; - } + buf = io_ring_head_to_buf(bl, head); if (*len == 0 || *len > buf->len) *len = buf->len; - req->flags |= REQ_F_BUFFER_RING; + req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_list = bl; req->buf_index = buf->bid; @@ -182,6 +205,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, * the transfer completes (or if we get -EAGAIN and must poll of * retry). 
*/ + req->flags &= ~REQ_F_BUFFERS_COMMIT; req->buf_list = NULL; bl->head++; } @@ -208,6 +232,159 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, return ret; } +static int io_ring_buffers_peek(struct io_kiocb *req, struct iovec **iovs, + int nr_iovs, size_t *out_len, + struct io_buffer_list *bl) +{ + struct iovec *iov = *iovs; + __u16 nr_avail, tail, head; + struct io_uring_buf *buf; + size_t max_len = 0; + int i; + + if (*out_len) { + max_len = *out_len; + *out_len = 0; + } + + tail = smp_load_acquire(&bl->buf_ring->tail); + head = bl->head; + nr_avail = tail - head; + if (unlikely(!nr_avail)) + return -ENOBUFS; + + buf = io_ring_head_to_buf(bl, head); + if (max_len) { + int needed; + + needed = (max_len + buf->len - 1) / buf->len; + /* cap it at a reasonable 256, will be one page even for 4K */ + needed = min(needed, 256); + if (nr_avail > needed) + nr_avail = needed; + } + + if (nr_avail > UIO_MAXIOV) + nr_avail = UIO_MAXIOV; + + /* + * only alloc a bigger array if we know we have data to map, eg not + * a speculative peek operation. 
+ */ + if (nr_iovs == UIO_FASTIOV && nr_avail > nr_iovs && max_len) { + iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + nr_iovs = nr_avail; + } else if (nr_avail < nr_iovs) { + nr_iovs = nr_avail; + } + + buf = io_ring_head_to_buf(bl, head); + req->buf_index = buf->bid; + + i = 0; + while (nr_iovs--) { + void __user *ubuf; + + /* truncate end piece, if needed */ + if (max_len && buf->len > max_len) + buf->len = max_len; + + ubuf = u64_to_user_ptr(buf->addr); + if (!access_ok(ubuf, buf->len)) + break; + iov[i].iov_base = ubuf; + iov[i].iov_len = buf->len; + *out_len += buf->len; + i++; + head++; + if (max_len) { + max_len -= buf->len; + if (!max_len) + break; + } + buf = io_ring_head_to_buf(bl, head); + } + + if (head == tail) + req->flags |= REQ_F_BL_EMPTY; + + if (i) { + req->flags |= REQ_F_BUFFER_RING; + *iovs = iov; + return i; + } + + if (iov != *iovs) + kfree(iov); + *iovs = NULL; + return -EFAULT; +} + +int io_buffers_select(struct io_kiocb *req, struct iovec **iovs, int nr_iovs, + size_t *out_len, unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret = -ENOENT; + + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, req->buf_index); + if (unlikely(!bl)) + goto out_unlock; + + if (bl->is_mapped) { + ret = io_ring_buffers_peek(req, iovs, nr_iovs, out_len, bl); + /* + * Don't recycle these buffers if we need to go through poll. + * Nobody else can use them anyway, and holding on to provided + * buffers for a send/write operation would happen on the app + * side anyway with normal buffers. Besides, we already + * committed them, they cannot be put back in the queue. 
+ */ + req->buf_list = bl; + if (ret > 0) { + req->flags |= REQ_F_BL_NO_RECYCLE; + req->buf_list->head += ret; + } + } else { + ret = io_provided_buffers_select(req, out_len, bl, *iovs); + } +out_unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +int io_buffers_peek(struct io_kiocb *req, struct iovec **iovs, int nr_iovs, + size_t *out_len) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; + + lockdep_assert_held(&ctx->uring_lock); + + if (req->buf_list) { + bl = req->buf_list; + } else { + bl = io_buffer_get_list(ctx, req->buf_index); + if (unlikely(!bl)) + return -ENOENT; + } + + /* don't support multiple buffer selections for legacy */ + if (!bl->is_mapped) + return io_provided_buffers_select(req, out_len, bl, *iovs); + + ret = io_ring_buffers_peek(req, iovs, nr_iovs, out_len, bl); + if (ret > 0) { + req->buf_list = bl; + req->flags |= REQ_F_BUFFERS_COMMIT; + } + return ret; +} + static __cold int io_init_bl_list(struct io_ring_ctx *ctx) { struct io_buffer_list *bl; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 5218bfd79e87..b4f48a144b73 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -43,6 +43,10 @@ struct io_buffer { void __user *io_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags); +int io_buffers_select(struct io_kiocb *req, struct iovec **iovs, int nr_iovs, + size_t *out_len, unsigned int issue_flags); +int io_buffers_peek(struct io_kiocb *req, struct iovec **iovs, int nr_iovs, + size_t *out_len); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); @@ -74,7 +78,7 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) */ if (req->buf_list) { req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; + req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; } return false; @@ -98,11 +102,16 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, 
unsigned issue_flags) return false; } -static inline void __io_put_kbuf_ring(struct io_kiocb *req) +static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr) { - if (req->buf_list) { - req->buf_index = req->buf_list->bgid; - req->buf_list->head++; + struct io_buffer_list *bl = req->buf_list; + + if (bl) { + if (req->flags & REQ_F_BUFFERS_COMMIT) { + bl->head += nr; + req->flags &= ~REQ_F_BUFFERS_COMMIT; + } + req->buf_index = bl->bgid; } req->flags &= ~REQ_F_BUFFER_RING; } @@ -111,7 +120,7 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req, struct list_head *list) { if (req->flags & REQ_F_BUFFER_RING) { - __io_put_kbuf_ring(req); + __io_put_kbuf_ring(req, 1); } else { req->buf_index = req->kbuf->bgid; list_add(&req->kbuf->list, list); @@ -133,8 +142,8 @@ static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) return ret; } -static inline unsigned int io_put_kbuf(struct io_kiocb *req, - unsigned issue_flags) +static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, + unsigned issue_flags) { unsigned int ret; @@ -143,9 +152,21 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); if (req->flags & REQ_F_BUFFER_RING) - __io_put_kbuf_ring(req); + __io_put_kbuf_ring(req, nbufs); else __io_put_kbuf(req, issue_flags); return ret; } + +static inline unsigned int io_put_kbuf(struct io_kiocb *req, + unsigned issue_flags) +{ + return __io_put_kbufs(req, 1, issue_flags); +} + +static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs, + unsigned issue_flags) +{ + return __io_put_kbufs(req, nbufs, issue_flags); +} #endif -- 2.43.0