On 10/27/2023 11:49 PM, Keith Busch wrote: > From: Keith Busch <kbusch@xxxxxxxxxx> > > Passthrough commands that utilize metadata currently need to bounce the > user space buffer through the kernel. Add support for mapping user space > directly so that we can avoid this costly overhead. This is similiar to > how the normal bio data payload utilizes user addresses with > bio_map_user_iov(). > > If the user address can't directly be used for reasons like too many > segments or address unalignement, fallback to a copy of the user vec > while keeping the user address pinned for the IO duration so that it > can safely be copied on completion in any process context. > > Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx> > --- > block/bio-integrity.c | 195 ++++++++++++++++++++++++++++++++++++++++++ > include/linux/bio.h | 9 ++ > 2 files changed, 204 insertions(+) > > diff --git a/block/bio-integrity.c b/block/bio-integrity.c > index ec8ac8cf6e1b9..7f9d242ad79df 100644 > --- a/block/bio-integrity.c > +++ b/block/bio-integrity.c > @@ -91,6 +91,37 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, > } > EXPORT_SYMBOL(bio_integrity_alloc); > > +static void bio_integrity_unmap_user(struct bio_integrity_payload *bip) > +{ > + bool dirty = bio_data_dir(bip->bip_bio) == READ; > + struct bio_vec *copy = bip->copy_vec; > + struct bvec_iter iter; > + struct bio_vec bv; > + > + if (copy) { > + unsigned short nr_vecs = bip->bip_max_vcnt; > + size_t bytes = bip->bip_iter.bi_size; > + void *buf = bvec_virt(bip->bip_vec); > + > + if (dirty) { > + struct iov_iter iter; > + > + iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes); > + WARN_ON(copy_to_iter(buf, bytes, &iter) != bytes); > + } > + > + memcpy(bip->bip_vec, copy, nr_vecs * sizeof(*copy)); > + kfree(copy); > + kfree(buf); > + } > + > + bip_for_each_vec(bv, bip, iter) { > + if (dirty && !PageCompound(bv.bv_page)) > + set_page_dirty_lock(bv.bv_page); > + unpin_user_page(bv.bv_page); > + } > +} Leak here, 
the page-unpinning loop will not execute for the common (i.e., no-copy) case... > +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len, > + u32 seed) > +{ > + struct request_queue *q = bdev_get_queue(bio->bi_bdev); > + unsigned long offs, align = q->dma_pad_mask | queue_dma_alignment(q); > + int ret, direction, nr_vecs, i, j, folios = 0; > + struct bio_vec stack_vec[UIO_FASTIOV]; > + struct bio_vec bv, *bvec = stack_vec; > + struct page *stack_pages[UIO_FASTIOV]; > + struct page **pages = stack_pages; > + struct bio_integrity_payload *bip; > + struct iov_iter iter; > + struct bvec_iter bi; > + u32 bytes; > + > + if (bio_integrity(bio)) > + return -EINVAL; > + if (len >> SECTOR_SHIFT > queue_max_hw_sectors(q)) > + return -E2BIG; > + > + if (bio_data_dir(bio) == READ) > + direction = ITER_DEST; > + else > + direction = ITER_SOURCE; > + > + iov_iter_ubuf(&iter, direction, ubuf, len); > + nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1); > + if (nr_vecs > BIO_MAX_VECS) > + return -E2BIG; > + if (nr_vecs > UIO_FASTIOV) { > + bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL); > + if (!bvec) > + return -ENOMEM; > + pages = NULL; > + } > + > + bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs); > + if (unlikely(bytes < 0)) { > + ret = bytes; > + goto free_bvec; > + } > + > + for (i = 0; i < nr_vecs; i = j) { > + size_t size = min_t(size_t, bytes, PAGE_SIZE - offs); > + struct folio *folio = page_folio(pages[i]); > + > + bytes -= size; > + for (j = i + 1; j < nr_vecs; j++) { > + size_t next = min_t(size_t, PAGE_SIZE, bytes); > + > + if (page_folio(pages[j]) != folio || > + pages[j] != pages[j - 1] + 1) > + break; > + unpin_user_page(pages[j]); > + size += next; > + bytes -= next; > + } > + > + bvec_set_page(&bvec[folios], pages[i], size, offs); > + offs = 0; > + folios++; > + } > + > + if (pages != stack_pages) > + kvfree(pages); > + > + if (folios > queue_max_integrity_segments(q) || > + !iov_iter_is_aligned(&iter, align,
align)) { > + ret = bio_integrity_copy_user(bio, bvec, folios, len, > + direction, seed); > + if (ret) > + goto release_pages; > + return 0; > + } > + > + bip = bio_integrity_alloc(bio, GFP_KERNEL, folios); > + if (IS_ERR(bip)) { > + ret = PTR_ERR(bip); > + goto release_pages; > + } > + > + memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec)); Because with this way of copying, bip->bip_iter.bi_size will remain zero. Second, is it fine not to have those virt-alignment checks that are done by bvec_gap_to_prev() when the pages are added using bio_integrity_add_page()?