On Mon, Mar 3, 2025 at 7:51 AM Pavel Begunkov <asml.silence@xxxxxxxxx> wrote: > > Add io_import_reg_vec(), which will be responsible for importing > vectored registered buffers. iovecs are overlapped with the resulting > bvec in memory, which is why the iovec is expected to be padded in > iou_vec. > > Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx> > --- > include/linux/io_uring_types.h | 5 +- > io_uring/rsrc.c | 124 +++++++++++++++++++++++++++++++++ > io_uring/rsrc.h | 5 ++ > 3 files changed, 133 insertions(+), 1 deletion(-) > > diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h > index 9101f12d21ef..b770a2b12da6 100644 > --- a/include/linux/io_uring_types.h > +++ b/include/linux/io_uring_types.h > @@ -111,7 +111,10 @@ struct io_uring_task { > }; > > struct iou_vec { > - struct iovec *iovec; > + union { > + struct iovec *iovec; > + struct bio_vec *bvec; > + }; If I understand correctly, io_import_reg_vec() converts the iovecs to bio_vecs in place. If an iovec expands to more than one bio_vec (i.e. crosses a folio boundary), wouldn't the bio_vecs overwrite iovecs that hadn't been processed yet? > unsigned nr; > }; > > diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c > index 9b05e614819e..1ec1f5b3e385 100644 > --- a/io_uring/rsrc.c > +++ b/io_uring/rsrc.c > @@ -1267,9 +1267,133 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) > > void io_vec_free(struct iou_vec *iv) > { > + BUILD_BUG_ON(sizeof(struct bio_vec) > sizeof(struct iovec)); > + > if (!iv->iovec) > return; > kfree(iv->iovec); > iv->iovec = NULL; > iv->nr = 0; > } > + > +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) > +{ > + struct iovec *iov; > + > + WARN_ON_ONCE(nr_entries <= 0); > + > + iov = kmalloc_array(nr_entries, sizeof(iov[0]), GFP_KERNEL); > + if (!iov) > + return -ENOMEM; > + > + io_vec_free(iv); > + iv->iovec = iov; > + iv->nr = nr_entries; > + return 0; > +} > + > +static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, > + struct io_mapped_ubuf *imu, > + struct iovec *iovec, int nr_iovs, > + struct iou_vec *vec) > +{ > + unsigned long folio_size = (1 << imu->folio_shift); > + unsigned long folio_mask = folio_size - 1; > + struct bio_vec *res_bvec = vec->bvec; > + size_t total_len = 0; > + int bvec_idx = 0; > + int iov_idx; > + > + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { > + size_t iov_len = iovec[iov_idx].iov_len; > + u64 buf_addr = (u64)iovec[iov_idx].iov_base; > + u64 folio_addr = imu->ubuf & ~folio_mask; The computation of folio_addr could be moved out of the loop. > + struct bio_vec *src_bvec; > + size_t offset; > + u64 buf_end; > + > + if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end))) > + return -EFAULT; > + if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) > + return -EFAULT; > + > + total_len += iov_len; > + /* by using folio address it also accounts for bvec offset */ > + offset = buf_addr - folio_addr; > + src_bvec = imu->bvec + (offset >> imu->folio_shift); > + offset &= folio_mask; > + > + for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { > + size_t seg_size = min_t(size_t, iov_len, > + folio_size - offset); > + > + res_bvec[bvec_idx].bv_page = src_bvec->bv_page; > + res_bvec[bvec_idx].bv_offset = offset; > + res_bvec[bvec_idx].bv_len = seg_size; Could just increment res_bvec to avoid the variable bvec_idx? > + iov_len -= seg_size; > + } > + } > + if (total_len > MAX_RW_COUNT) > + return -EINVAL; > + > + iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); > + return 0; > +} > + > +static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, > + struct io_mapped_ubuf *imu) > +{ > + unsigned shift = imu->folio_shift; > + size_t max_segs = 0; > + unsigned i; > + > + for (i = 0; i < nr_iovs; i++) > + max_segs += (iov[i].iov_len >> shift) + 2; Sees like this may overestimate a bit. I think something like this would give the exact number of segments for each iovec? (((u64)iov_base & folio_mask) + iov_len + folio_mask) >> folio_shift > + return max_segs; > +} > + > +int io_import_reg_vec(int ddir, struct iov_iter *iter, > + struct io_kiocb *req, struct iou_vec *vec, > + int nr_iovs, unsigned iovec_off, > + unsigned issue_flags) > +{ > + struct io_rsrc_node *node; > + struct io_mapped_ubuf *imu; > + unsigned cache_nr; > + struct iovec *iov; > + unsigned nr_segs; > + int ret; > + > + node = io_find_buf_node(req, issue_flags); > + if (!node) > + return -EFAULT; > + imu = node->buf; > + if (imu->is_kbuf) > + return -EOPNOTSUPP; > + > + iov = vec->iovec + iovec_off; > + ret = io_estimate_bvec_size(iov, nr_iovs, imu); > + if (ret < 0) > + return ret; io_estimate_bvec_size() doesn't (intentionally) return an error code, just an unsigned value cast to an int. Best, Caleb > + nr_segs = ret; > + cache_nr = vec->nr; > + > + if (WARN_ON_ONCE(iovec_off + nr_iovs != cache_nr) || > + nr_segs > cache_nr) { > + struct iou_vec tmp_vec = {}; > + > + ret = io_vec_realloc(&tmp_vec, nr_segs); > + if (ret) > + return ret; > + > + iovec_off = tmp_vec.nr - nr_iovs; > + memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); > + io_vec_free(vec); > + > + *vec = tmp_vec; > + iov = vec->iovec + iovec_off; > + req->flags |= REQ_F_NEED_CLEANUP; > + } > + > + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); > +} > diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h > index e3f1cfb2ff7b..769ef5d76a4b 100644 > --- a/io_uring/rsrc.h > +++ b/io_uring/rsrc.h > @@ -61,6 +61,10 @@ struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, > int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, > u64 buf_addr, size_t len, int ddir, > unsigned issue_flags); > +int io_import_reg_vec(int ddir, struct iov_iter *iter, > + struct io_kiocb *req, struct iou_vec *vec, > + int nr_iovs, unsigned iovec_off, > + unsigned issue_flags); > > int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); > int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); > @@ -146,6 +150,7 @@ static inline void __io_unaccount_mem(struct user_struct *user, > } > > void io_vec_free(struct iou_vec *iv); > +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries); > > static inline void io_vec_reset_iovec(struct iou_vec *iv, > struct iovec *iovec, unsigned nr) > -- > 2.48.1 > >