Provide a function for filling in a scatterlist from the list of pages contained in an iterator. The function is passed FOLL_SOURCE_BUF or FOLL_DEST_BUF to indicate how the extracted pages are to be used. If the iterator is UBUF- or IOBUF-type, the pages have a ref (FOLL_SOURCE_BUF) or a pin (FOLL_DEST_BUF) taken on them. If the iterator is BVEC-, KVEC- or XARRAY-type, no ref is taken on the pages and it is left to the caller to manage their lifetime. It cannot be assumed that a ref can be validly taken, particularly in the case of a KVEC iterator. Changes: ======== ver #6) - Pass in a gup_flags argument to allow FOLL_SOURCE_BUF and FOLL_DEST_BUF and other FOLL_* flags to be passed in. - Don't pass back the cleanup mode - iov_iter_extract_mode() can be used to determine that. ver #3) - Switch to using EXPORT_SYMBOL_GPL to prevent indirect 3rd-party access to get/pin_user_pages_fast()[1]. Signed-off-by: David Howells <dhowells@xxxxxxxxxx> cc: Jeff Layton <jlayton@xxxxxxxxxx> cc: Steve French <sfrench@xxxxxxxxx> cc: Shyam Prasad N <nspmangalore@xxxxxxxxx> cc: Rohith Surabattula <rohiths.msft@xxxxxxxxx> cc: linux-cachefs@xxxxxxxxxx cc: linux-cifs@xxxxxxxxxxxxxxx cc: linux-fsdevel@xxxxxxxxxxxxxxx Link: https://lore.kernel.org/r/Y3zFzdWnWlEJ8X8/@infradead.org/ [1] Link: https://lore.kernel.org/r/166697255985.61150.16489950598033809487.stgit@xxxxxxxxxxxxxxxxxxxxxx/ # rfc Link: https://lore.kernel.org/r/166732027275.3186319.5186488812166611598.stgit@xxxxxxxxxxxxxxxxxxxxxx/ # rfc Link: https://lore.kernel.org/r/166869691313.3723671.10714823767342163891.stgit@xxxxxxxxxxxxxxxxxxxxxx/ # rfc Link: https://lore.kernel.org/r/166920905749.1461876.12079195122363691498.stgit@xxxxxxxxxxxxxxxxxxxxxx/ # v2 Link: https://lore.kernel.org/r/166997423514.9475.11145024341505464337.stgit@xxxxxxxxxxxxxxxxxxxxxx/ # v3 Link: https://lore.kernel.org/r/167305165398.1521586.12353215176136705725.stgit@xxxxxxxxxxxxxxxxxxxxxx/ # v4 Link: https://lore.kernel.org/r/167344730041.2425628.14391053364759792950.stgit@xxxxxxxxxxxxxxxxxxxxxx/ # v5 --- fs/netfs/iterator.c | 269 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/netfs.h | 4 + mm/vmalloc.c | 1 3 files changed, 274 insertions(+) diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c index f7f26de1a247..1d20ad2123b5 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -7,7 +7,9 @@ #include <linux/export.h> #include <linux/slab.h> +#include <linux/mm.h> #include <linux/uio.h> +#include <linux/scatterlist.h> #include <linux/netfs.h> #include "internal.h" @@ -100,3 +102,270 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, return npages; } EXPORT_SYMBOL_GPL(netfs_extract_user_iter); + +/* + * Extract as list of up to sg_max pages from UBUF- or IOVEC-class iterators, + * pin or get refs on them appropriate and add them to the scatterlist. + */ +static ssize_t netfs_extract_user_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + unsigned int gup_flags) +{ + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + struct page **pages; + unsigned int npages; + ssize_t ret = 0, res; + size_t len, off; + + /* We decant the page list into the tail of the scatterlist */ + pages = (void *)sgtable->sgl + array_size(sg_max, sizeof(struct scatterlist)); + pages -= sg_max; + + do { + res = iov_iter_extract_pages(iter, &pages, maxsize, sg_max, + gup_flags, &off); + if (res < 0) + goto failed; + + len = res; + maxsize -= len; + ret += len; + npages = DIV_ROUND_UP(off + len, PAGE_SIZE); + sg_max -= npages; + + for (; npages < 0; npages--) { + struct page *page = *pages; + size_t seg = min_t(size_t, PAGE_SIZE - off, len); + + *pages++ = NULL; + sg_set_page(sg, page, len, off); + sgtable->nents++; + sg++; + len -= seg; + off = 0; + } + } while (maxsize > 0 && sg_max > 0); + + return ret; + +failed: + while (sgtable->nents > sgtable->orig_nents) + put_page(sg_page(&sgtable->sgl[--sgtable->nents])); + return res; +} + +/* + * Extract up to sg_max pages from a BVEC-type iterator and add them to the + * scatterlist. The pages are not pinned. + */ +static ssize_t netfs_extract_bvec_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + unsigned int gup_flags) +{ + const struct bio_vec *bv = iter->bvec; + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + size_t off, len; + + len = bv[i].bv_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + off = bv[i].bv_offset + start; + + sg_set_page(sg, bv[i].bv_page, len, off); + sgtable->nents++; + sg++; + sg_max--; + + ret += len; + maxsize -= len; + if (maxsize <= 0 || sg_max == 0) + break; + start = 0; + } + + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/* + * Extract up to sg_max pages from a KVEC-type iterator and add them to the + * scatterlist. This can deal with vmalloc'd buffers as well as kmalloc'd or + * static buffers. The pages are not pinned. + */ +static ssize_t netfs_extract_kvec_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + unsigned int gup_flags) +{ + const struct kvec *kv = iter->kvec; + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + struct page *page; + unsigned long kaddr; + size_t off, len, seg; + + len = kv[i].iov_len; + if (start >= len) { + start -= len; + continue; + } + + kaddr = (unsigned long)kv[i].iov_base + start; + off = kaddr & ~PAGE_MASK; + len = min_t(size_t, maxsize, len - start); + kaddr &= PAGE_MASK; + + maxsize -= len; + ret += len; + do { + seg = min_t(size_t, len, PAGE_SIZE - off); + if (is_vmalloc_or_module_addr((void *)kaddr)) + page = vmalloc_to_page((void *)kaddr); + else + page = virt_to_page(kaddr); + + sg_set_page(sg, page, len, off); + sgtable->nents++; + sg++; + sg_max--; + + len -= seg; + kaddr += PAGE_SIZE; + off = 0; + } while (len > 0 && sg_max > 0); + + if (maxsize <= 0 || sg_max == 0) + break; + start = 0; + } + + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/* + * Extract up to sg_max folios from an XARRAY-type iterator and add them to + * the scatterlist. The pages are not pinned. + */ +static ssize_t netfs_extract_xarray_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + unsigned int gup_flags) +{ + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + struct xarray *xa = iter->xarray; + struct folio *folio; + loff_t start = iter->xarray_start + iter->iov_offset; + pgoff_t index = start / PAGE_SIZE; + ssize_t ret = 0; + size_t offset, len; + XA_STATE(xas, xa, index); + + rcu_read_lock(); + + xas_for_each(&xas, folio, ULONG_MAX) { + if (xas_retry(&xas, folio)) + continue; + if (WARN_ON(xa_is_value(folio))) + break; + if (WARN_ON(folio_test_hugetlb(folio))) + break; + + offset = offset_in_folio(folio, start); + len = min_t(size_t, maxsize, folio_size(folio) - offset); + + sg_set_page(sg, folio_page(folio, 0), len, offset); + sgtable->nents++; + sg++; + sg_max--; + + maxsize -= len; + ret += len; + if (maxsize <= 0 || sg_max == 0) + break; + } + + rcu_read_unlock(); + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/** + * netfs_extract_iter_to_sg - Extract pages from an iterator and add ot an sglist + * @iter: The iterator to extract from + * @maxsize: The amount of iterator to copy + * @sgtable: The scatterlist table to fill in + * @sg_max: Maximum number of elements in @sgtable that may be filled + * @gup_flags: Direction indicator and additional flags + * + * Extract the page fragments from the given amount of the source iterator and + * add them to a scatterlist that refers to all of those bits, to a maximum + * addition of @sg_max elements. + * + * The pages referred to by UBUF- and IOVEC-type iterators are extracted and + * pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE- + * and DISCARD-type are not supported. + * + * No end mark is placed on the scatterlist; that's left to the caller. + * + * @gup_flags should indicate FOLL_SOURCE_BUF or FOLL_DEST_BUF plus any + * additional flags needed. + * + * If successul, @sgtable->nents is updated to include the number of elements + * added and the number of bytes added is returned. @sgtable->orig_nents is + * left unaltered. + * + * The iov_iter_extract_mode() function should be used to query how cleanup + * should be performed. + */ +ssize_t netfs_extract_iter_to_sg(struct iov_iter *iter, size_t maxsize, + struct sg_table *sgtable, unsigned int sg_max, + unsigned int gup_flags) +{ + if (maxsize == 0) + return 0; + + switch (iov_iter_type(iter)) { + case ITER_UBUF: + case ITER_IOVEC: + return netfs_extract_user_to_sg(iter, maxsize, sgtable, sg_max, + gup_flags); + case ITER_BVEC: + return netfs_extract_bvec_to_sg(iter, maxsize, sgtable, sg_max, + gup_flags); + case ITER_KVEC: + return netfs_extract_kvec_to_sg(iter, maxsize, sgtable, sg_max, + gup_flags); + case ITER_XARRAY: + return netfs_extract_xarray_to_sg(iter, maxsize, sgtable, sg_max, + gup_flags); + default: + pr_err("netfs_extract_iter_to_sg(%u) unsupported\n", + iov_iter_type(iter)); + WARN_ON_ONCE(1); + return -EIO; + } +} +EXPORT_SYMBOL_GPL(netfs_extract_iter_to_sg); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index a45757dd382d..2493df855f05 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -298,6 +298,10 @@ void netfs_put_subrequest(struct netfs_io_subrequest *subreq, void netfs_stats_show(struct seq_file *); ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, struct iov_iter *new, unsigned int gup_flags); +struct sg_table; +ssize_t netfs_extract_iter_to_sg(struct iov_iter *iter, size_t len, + struct sg_table *sgtable, unsigned int sg_max, + unsigned int gup_flags); /** * netfs_inode - Get the netfs inode context from the inode diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ca71de7c9d77..61f5bec0f2b6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -656,6 +656,7 @@ int is_vmalloc_or_module_addr(const void *x) #endif return is_vmalloc_addr(x); } +EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr); /* * Walk a vmap address to the struct page it maps. Huge vmap mappings will