From: Keith Busch <kbusch@xxxxxxxxxx>

Provide a new register operation that can request to pre-map a known
bvec to the driver of the requested file descriptor's specific
implementation. If successful, io_uring will use the returned dma tag
for future fixed buffer requests to the same file.

Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx>
---
 include/uapi/linux/io_uring.h |  12 ++++
 io_uring/io_uring.c           | 151 ++++++++++++++++++++++++++++++++++
 io_uring/net.c                |   2 +-
 io_uring/rsrc.c               |  13 +++-
 io_uring/rsrc.h               |  16 ++++-
 io_uring/rw.c                 |   2 +-
 6 files changed, 188 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1463cfecb56b..daacbe899d1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -485,6 +485,10 @@ enum {
 	IORING_REGISTER_NOTIFIERS		= 26,
 	IORING_UNREGISTER_NOTIFIERS		= 27,
 
+	/* dma map registered buffers */
+	IORING_REGISTER_MAP_BUFFERS		= 28,
+	IORING_REGISTER_UNMAP_BUFFERS		= 29,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -661,4 +665,12 @@ struct io_uring_recvmsg_out {
 	__u32 flags;
 };
 
+struct io_uring_map_buffers {
+	__s32	fd;
+	__s32	buf_start;
+	__s32	buf_end;
+	__u32	flags;
+	__u64	rsvd[2];
+};
+
 #endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 1d600a63643b..12f7354e0423 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3704,6 +3704,145 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 	return ret;
 }
 
+#ifdef CONFIG_BLOCK
+/*
+ * Validate a map/unmap request and clamp the buffer range to the
+ * registered buffer table.  Returns the number of buffers in the range
+ * or a negative errno.
+ */
+static int get_map_range(struct io_ring_ctx *ctx,
+			 struct io_uring_map_buffers *map, void __user *arg)
+{
+	int ret;
+
+	if (copy_from_user(map, arg, sizeof(*map)))
+		return -EFAULT;
+	if (map->flags || map->rsvd[0] || map->rsvd[1])
+		return -EINVAL;
+	/* both ends must be non-negative before the unsigned comparisons */
+	if (map->buf_start < 0 || map->buf_end < 0)
+		return -EINVAL;
+	if (map->buf_start >= ctx->nr_user_bufs)
+		return -EINVAL;
+	if (map->buf_end > ctx->nr_user_bufs)
+		map->buf_end = ctx->nr_user_bufs;
+
+	ret = map->buf_end - map->buf_start;
+	if (ret <= 0)
+		return -EINVAL;
+
+	return ret;
+}
+
+/*
+ * Release a buffer's DMA mapping, if present.  Drops the file reference
+ * taken when the mapping was created and clears the tag, making the
+ * unmap idempotent (this also runs at buffer unregistration).
+ */
+void io_dma_unmap(struct io_mapped_ubuf *imu)
+{
+	if (!imu->dma_tag)
+		return;
+
+	block_dma_unmap(imu->bdev, imu->dma_tag);
+	fput(imu->dma_file);
+	imu->dma_file = NULL;
+	imu->dma_tag = NULL;
+}
+
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_map_buffers map;
+	int i, ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	ret = get_map_range(ctx, &map, arg);
+	if (ret < 0)
+		return ret;
+
+	for (i = map.buf_start; i < map.buf_end; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		io_dma_unmap(imu);
+	}
+
+	return 0;
+}
+
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_map_buffers map;
+	struct block_device *bdev;
+	struct file *file;
+	int ret, i;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = get_map_range(ctx, &map, arg);
+	if (ret < 0)
+		return ret;
+
+	file = fget(map.fd);
+	if (!file)
+		return -EBADF;
+
+	if (S_ISBLK(file_inode(file)->i_mode)) {
+		bdev = I_BDEV(file->f_mapping->host);
+	} else if (S_ISREG(file_inode(file)->i_mode)) {
+		bdev = file->f_inode->i_sb->s_bdev;
+	} else {
+		/* don't leak the fget() reference on unsupported file types */
+		ret = -EOPNOTSUPP;
+		goto out_fput;
+	}
+
+	for (i = map.buf_start; i < map.buf_end; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+		void *tag;
+
+		if (imu->dma_tag) {
+			ret = -EBUSY;
+			goto err;
+		}
+
+		tag = block_dma_map(bdev, imu->bvec, imu->nr_bvecs);
+		if (IS_ERR(tag)) {
+			ret = PTR_ERR(tag);
+			goto err;
+		}
+
+		imu->dma_tag = tag;
+		/* each mapped buffer holds its own reference to the file */
+		imu->dma_file = get_file(file);
+		imu->bdev = bdev;
+	}
+
+	ret = 0;
+out_fput:
+	fput(file);
+	return ret;
+err:
+	while (--i >= map.buf_start) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		io_dma_unmap(imu);
+	}
+	fput(file);
+	return ret;
+}
+#else /* CONFIG_BLOCK */
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BLOCK */
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg,
unsigned nr_args) __releases(ctx->uring_lock) @@ -3870,6 +3987,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_notif_unregister(ctx); break; + case IORING_REGISTER_MAP_BUFFERS: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_map_buffers(ctx, arg); + break; + case IORING_REGISTER_UNMAP_BUFFERS: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_unmap_buffers(ctx, arg); + break; default: ret = -EINVAL; break; diff --git a/io_uring/net.c b/io_uring/net.c index 8276b9537194..68a996318959 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -977,7 +977,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) if (zc->flags & IORING_RECVSEND_FIXED_BUF) { ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu, - (u64)(uintptr_t)zc->buf, zc->len); + (u64)(uintptr_t)zc->buf, zc->len, NULL); if (unlikely(ret)) return ret; } else { diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 59704b9ac537..1a7a8dedbbd5 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -148,6 +148,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo unpin_user_page(imu->bvec[i].bv_page); if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); + io_dma_unmap(imu); kvfree(imu); } *slot = NULL; @@ -1285,6 +1286,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, imu->ubuf = (unsigned long) iov->iov_base; imu->ubuf_end = imu->ubuf + iov->iov_len; imu->nr_bvecs = nr_pages; + imu->dma_tag = NULL; *pimu = imu; ret = 0; done: @@ -1359,9 +1361,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len) +int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len, struct file *file) { u64 buf_end; size_t offset; @@ -1379,6 +1380,12 @@ int 
io_import_fixed(int ddir, struct iov_iter *iter,
 	 * and advance us to the beginning.
 	 */
 	offset = buf_addr - imu->ubuf;
+	if (imu->dma_tag && file == imu->dma_file) {
+		unsigned long nr_segs = DIV_ROUND_UP(
+				(buf_addr & (PAGE_SIZE - 1)) + len, PAGE_SIZE);
+		iov_iter_dma_tag(iter, ddir, imu->dma_tag, offset, nr_segs, len);
+		return 0;
+	}
 	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
 
 	if (offset) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index f3a9a177941f..6e63b7a57b34 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -50,6 +50,11 @@ struct io_mapped_ubuf {
 	u64		ubuf_end;
 	unsigned int	nr_bvecs;
 	unsigned long	acct_pages;
+	void		*dma_tag;
+	struct file	*dma_file;
+#ifdef CONFIG_BLOCK
+	struct block_device	*bdev;
+#endif
 	struct bio_vec	bvec[];
 };
 
@@ -64,9 +69,14 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 			 struct io_rsrc_data *data_to_kill);
 
-int io_import_fixed(int ddir, struct iov_iter *iter,
-		    struct io_mapped_ubuf *imu,
-		    u64 buf_addr, size_t len);
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+		    u64 buf_addr, size_t len, struct file *file);
+
+#ifdef CONFIG_BLOCK
+void io_dma_unmap(struct io_mapped_ubuf *imu);
+#else
+static inline void io_dma_unmap(struct io_mapped_ubuf *imu) {}
+#endif
 
 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 2b784795103c..9e2164d09adb 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -359,7 +359,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
 	ssize_t ret;
 
 	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
-		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
+		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len, req->file);
 		if (ret)
 			return ERR_PTR(ret);
 		return NULL;
-- 
2.30.2