Use the direction in the iterator functions rather than READ/WRITE: make
them take an enum iter_dir (ITER_SOURCE/ITER_DEST) instead of an unsigned
int.  Add a check into __iov_iter_get_pages_alloc() that the supplied
FOLL_SOURCE/DEST_BUF gup_flag matches the ITER_SOURCE/DEST flag on the
iterator.

Changes
=======
ver #6)
 - Add a check on FOLL_SOURCE/DEST_BUF into __iov_iter_get_pages_alloc().

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
Link: https://lore.kernel.org/r/167305162465.1521586.18077838937455153675.stgit@xxxxxxxxxxxxxxxxxxxxxx/ # v4
Link: https://lore.kernel.org/r/167344727112.2425628.995771894170560721.stgit@xxxxxxxxxxxxxxxxxxxxxx/ # v5
---
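[Usage note for reviewers, not part of the commit: a minimal sketch of how
the typed direction is meant to pair with the gup_flags direction check.
It assumes the gup_flags-taking iov_iter_get_pages() whose declaration is
visible in the hunk below; example_fill_user_buf() is hypothetical.]

	/*
	 * A read(2)-style path fills the user buffer, so the iterator is a
	 * destination (ITER_DEST) and page extraction for it must say
	 * FOLL_DEST_BUF.
	 */
	static ssize_t example_fill_user_buf(void __user *ubuf, size_t len)
	{
		struct page *pages[16];
		struct iov_iter iter;
		struct iovec iov;
		size_t offset;
		ssize_t ret;

		ret = import_single_range(ITER_DEST, ubuf, len, &iov, &iter);
		if (ret < 0)
			return ret;

		/*
		 * FOLL_DEST_BUF matches the ITER_DEST iterator; passing
		 * FOLL_SOURCE_BUF here instead would now trip the
		 * WARN_ON_ONCE() in __iov_iter_get_pages_alloc() and return
		 * -EIO.  Any refs/pins taken on the pages must still be
		 * released as usual once the I/O completes.
		 */
		return iov_iter_get_pages(&iter, pages, len, ARRAY_SIZE(pages),
					  &offset, FOLL_DEST_BUF);
	}
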
 include/linux/uio.h |   22 +-
 lib/iov_iter.c      |  409 ++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 396 insertions(+), 35 deletions(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 8d0dabfcb2fe..18b64068cc6d 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -256,16 +256,16 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
 			   unsigned len_mask);
 unsigned long iov_iter_alignment(const struct iov_iter *i);
 unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
-void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov,
+void iov_iter_init(struct iov_iter *i, enum iter_dir direction, const struct iovec *iov,
 			unsigned long nr_segs, size_t count);
-void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec,
+void iov_iter_kvec(struct iov_iter *i, enum iter_dir direction, const struct kvec *kvec,
 			unsigned long nr_segs, size_t count);
-void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
+void iov_iter_bvec(struct iov_iter *i, enum iter_dir direction, const struct bio_vec *bvec,
 			unsigned long nr_segs, size_t count);
-void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe,
+void iov_iter_pipe(struct iov_iter *i, enum iter_dir direction, struct pipe_inode_info *pipe,
 			size_t count);
-void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
-void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
+void iov_iter_discard(struct iov_iter *i, enum iter_dir direction, size_t count);
+void iov_iter_xarray(struct iov_iter *i, enum iter_dir direction, struct xarray *xarray,
 		     loff_t start, size_t count);
 ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
 			size_t maxsize, unsigned maxpages, size_t *start,
@@ -351,19 +351,19 @@ size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 struct iovec *iovec_from_user(const struct iovec __user *uvector,
 		unsigned long nr_segs, unsigned long fast_segs,
 		struct iovec *fast_iov, bool compat);
-ssize_t import_iovec(int type, const struct iovec __user *uvec,
+ssize_t import_iovec(enum iter_dir direction, const struct iovec __user *uvec,
 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
 		 struct iov_iter *i);
-ssize_t __import_iovec(int type, const struct iovec __user *uvec,
+ssize_t __import_iovec(enum iter_dir direction, const struct iovec __user *uvec,
 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
 		 struct iov_iter *i, bool compat);
-int import_single_range(int type, void __user *buf, size_t len,
+int import_single_range(enum iter_dir direction, void __user *buf, size_t len,
 		 struct iovec *iov, struct iov_iter *i);
 
-static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
+static inline void iov_iter_ubuf(struct iov_iter *i, enum iter_dir direction,
 				 void __user *buf, size_t count)
 {
-	WARN_ON(direction & ~(READ | WRITE));
+	WARN_ON(!iov_iter_dir_valid(direction));
 	*i = (struct iov_iter) {
 		.iter_type = ITER_UBUF,
 		.user_backed = true,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index ca89ffa9d6e1..6436438bf46b 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -421,11 +421,11 @@ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
 }
 EXPORT_SYMBOL(fault_in_iov_iter_writeable);
 
-void iov_iter_init(struct iov_iter *i, unsigned int direction,
+void iov_iter_init(struct iov_iter *i, enum iter_dir direction,
 		   const struct iovec *iov, unsigned long nr_segs,
 		   size_t count)
 {
-	WARN_ON(direction & ~(READ | WRITE));
+	WARN_ON(!iov_iter_dir_valid(direction));
 	*i = (struct iov_iter) {
 		.iter_type = ITER_IOVEC,
 		.nofault = false,
@@ -994,11 +994,11 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
 }
 EXPORT_SYMBOL(iov_iter_single_seg_count);
 
-void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
+void iov_iter_kvec(struct iov_iter *i, enum iter_dir direction,
 		   const struct kvec *kvec, unsigned long nr_segs,
 		   size_t count)
 {
-	WARN_ON(direction & ~(READ | WRITE));
+	WARN_ON(!iov_iter_dir_valid(direction));
 	*i = (struct iov_iter){
 		.iter_type = ITER_KVEC,
 		.data_source = direction,
@@ -1010,11 +1010,11 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_kvec);
 
-void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
+void iov_iter_bvec(struct iov_iter *i, enum iter_dir direction,
 		   const struct bio_vec *bvec, unsigned long nr_segs,
 		   size_t count)
 {
-	WARN_ON(direction & ~(READ | WRITE));
+	WARN_ON(!iov_iter_dir_valid(direction));
 	*i = (struct iov_iter){
 		.iter_type = ITER_BVEC,
 		.data_source = direction,
@@ -1026,15 +1026,15 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_bvec);
 
-void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
+void iov_iter_pipe(struct iov_iter *i, enum iter_dir direction,
 		   struct pipe_inode_info *pipe, size_t count)
 {
-	BUG_ON(direction != READ);
+	BUG_ON(direction != ITER_DEST);
 	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
 	*i = (struct iov_iter){
 		.iter_type = ITER_PIPE,
-		.data_source = false,
+		.data_source = ITER_DEST,
 		.pipe = pipe,
 		.head = pipe->head,
 		.start_head = pipe->head,
@@ -1057,10 +1057,10 @@ EXPORT_SYMBOL(iov_iter_pipe);
  * from evaporation, either by taking a ref on them or locking them by the
  * caller.
  */
-void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
+void iov_iter_xarray(struct iov_iter *i, enum iter_dir direction,
 		     struct xarray *xarray, loff_t start, size_t count)
 {
-	BUG_ON(direction & ~1);
+	WARN_ON(!iov_iter_dir_valid(direction));
 	*i = (struct iov_iter) {
 		.iter_type = ITER_XARRAY,
 		.data_source = direction,
@@ -1079,14 +1079,14 @@ EXPORT_SYMBOL(iov_iter_xarray);
  * @count: The size of the I/O buffer in bytes.
  *
  * Set up an I/O iterator that just discards everything that's written to it.
- * It's only available as a READ iterator.
+ * It's only available as a destination iterator.
  */
-void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
+void iov_iter_discard(struct iov_iter *i, enum iter_dir direction, size_t count)
 {
-	BUG_ON(direction != READ);
+	BUG_ON(direction != ITER_DEST);
 	*i = (struct iov_iter){
 		.iter_type = ITER_DISCARD,
-		.data_source = false,
+		.data_source = ITER_DEST,
 		.count = count,
 		.iov_offset = 0
 	};
@@ -1444,10 +1444,10 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
 		maxsize = MAX_RW_COUNT;
 
 	if (WARN_ON_ONCE((gup_flags & FOLL_BUF_MASK) == FOLL_SOURCE_BUF &&
-			 i->data_source == ITER_DEST))
+			 iov_iter_is_dest(i)))
 		return -EIO;
 	if (WARN_ON_ONCE((gup_flags & FOLL_BUF_MASK) == FOLL_DEST_BUF &&
-			 i->data_source == ITER_SOURCE))
+			 iov_iter_is_source(i)))
 		return -EIO;
 
 	if (likely(user_backed_iter(i))) {
@@ -1775,7 +1775,7 @@ struct iovec *iovec_from_user(const struct iovec __user *uvec,
 	return iov;
 }
 
-ssize_t __import_iovec(int type, const struct iovec __user *uvec,
+ssize_t __import_iovec(enum iter_dir direction, const struct iovec __user *uvec,
 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
 		 struct iov_iter *i, bool compat)
 {
@@ -1814,7 +1814,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
 		total_len += len;
 	}
 
-	iov_iter_init(i, type, iov, nr_segs, total_len);
+	iov_iter_init(i, direction, iov, nr_segs, total_len);
 	if (iov == *iovp)
 		*iovp = NULL;
 	else
@@ -1827,7 +1827,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
  * into the kernel, check that it is valid, and initialize a new
  * &struct iov_iter iterator to access it.
  *
- * @type: One of %READ or %WRITE.
+ * @direction: One of %ITER_SOURCE or %ITER_DEST.
  * @uvec: Pointer to the userspace array.
  * @nr_segs: Number of elements in userspace array.
  * @fast_segs: Number of elements in @iov.
@@ -1844,16 +1844,16 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
  *
  * Return: Negative error code on error, bytes imported on success
  */
-ssize_t import_iovec(int type, const struct iovec __user *uvec,
+ssize_t import_iovec(enum iter_dir direction, const struct iovec __user *uvec,
 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
 		 struct iov_iter *i)
 {
-	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
+	return __import_iovec(direction, uvec, nr_segs, fast_segs, iovp, i,
 			      in_compat_syscall());
 }
 EXPORT_SYMBOL(import_iovec);
 
-int import_single_range(int rw, void __user *buf, size_t len,
+int import_single_range(enum iter_dir direction, void __user *buf, size_t len,
 		 struct iovec *iov, struct iov_iter *i)
 {
 	if (len > MAX_RW_COUNT)
@@ -1863,7 +1863,7 @@ int import_single_range(int rw, void __user *buf, size_t len,
 	iov->iov_base = buf;
 	iov->iov_len = len;
 
-	iov_iter_init(i, rw, iov, 1, len);
+	iov_iter_init(i, direction, iov, 1, len);
 	return 0;
 }
 EXPORT_SYMBOL(import_single_range);
@@ -1905,3 +1905,364 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
 	i->iov -= state->nr_segs - i->nr_segs;
 	i->nr_segs = state->nr_segs;
 }
+
+/*
+ * Extract a list of contiguous pages from an ITER_PIPE iterator.  This does
+ * not get references of its own on the pages, nor does it get a pin on them.
+ * If there's a partial page, it adds that first and will then allocate and
+ * add pages into the pipe to make up the buffer space to the amount required.
+ *
+ * The caller must hold the pipe lock, and only transferring into a pipe is
+ * supported.
+ */
+static ssize_t iov_iter_extract_pipe_pages(struct iov_iter *i,
+					   struct page ***pages, size_t maxsize,
+					   unsigned int maxpages,
+					   unsigned int gup_flags,
+					   size_t *offset0)
+{
+	unsigned int nr, offset, chunk, j;
+	struct page **p;
+	size_t left;
+
+	if (!sanity(i))
+		return -EFAULT;
+
+	offset = pipe_npages(i, &nr);
+	if (!nr)
+		return -EFAULT;
+	*offset0 = offset;
+
+	maxpages = min_t(size_t, nr, maxpages);
+	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
+	if (!maxpages)
+		return -ENOMEM;
+	p = *pages;
+
+	left = maxsize;
+	for (j = 0; j < maxpages; j++) {
+		struct page *page = append_pipe(i, left, &offset);
+		if (!page)
+			break;
+		chunk = min_t(size_t, left, PAGE_SIZE - offset);
+		left -= chunk;
+		*p++ = page;
+	}
+	if (!j)
+		return -EFAULT;
+	return maxsize - left;
+}
+
+/*
+ * Extract a list of contiguous pages from an ITER_XARRAY iterator.  This does
+ * not get references on the pages, nor does it get a pin on them.
+ */
+static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
+					     struct page ***pages, size_t maxsize,
+					     unsigned int maxpages,
+					     unsigned int gup_flags,
+					     size_t *offset0)
+{
+	struct page *page, **p;
+	unsigned int nr = 0, offset;
+	loff_t pos = i->xarray_start + i->iov_offset;
+	pgoff_t index = pos >> PAGE_SHIFT;
+	XA_STATE(xas, i->xarray, index);
+
+	offset = pos & ~PAGE_MASK;
+	*offset0 = offset;
+
+	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
+	if (!maxpages)
+		return -ENOMEM;
+	p = *pages;
+
+	rcu_read_lock();
+	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
+		if (xas_retry(&xas, page))
+			continue;
+
+		/* Has the page moved or been split? */
+		if (unlikely(page != xas_reload(&xas))) {
+			xas_reset(&xas);
+			continue;
+		}
+
+		p[nr++] = find_subpage(page, xas.xa_index);
+		if (nr == maxpages)
+			break;
+	}
+	rcu_read_unlock();
+
+	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
+	i->iov_offset += maxsize;
+	i->count -= maxsize;
+	return maxsize;
+}
+
+/*
+ * Extract a list of contiguous pages from an ITER_BVEC iterator.  This does
+ * not get references on the pages, nor does it get a pin on them.
+ */
+static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
+					   struct page ***pages, size_t maxsize,
+					   unsigned int maxpages,
+					   unsigned int gup_flags,
+					   size_t *offset0)
+{
+	struct page **p, *page;
+	size_t skip = i->iov_offset, offset;
+	int k;
+
+	maxsize = min(maxsize, i->bvec->bv_len - skip);
+	skip += i->bvec->bv_offset;
+	page = i->bvec->bv_page + skip / PAGE_SIZE;
+	offset = skip % PAGE_SIZE;
+	*offset0 = offset;
+
+	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
+	if (!maxpages)
+		return -ENOMEM;
+	p = *pages;
+	for (k = 0; k < maxpages; k++)
+		p[k] = page + k;
+
+	maxsize = min_t(size_t, maxsize, maxpages * PAGE_SIZE - offset);
+	i->count -= maxsize;
+	i->iov_offset += maxsize;
+	if (i->iov_offset == i->bvec->bv_len) {
+		i->iov_offset = 0;
+		i->bvec++;
+		i->nr_segs--;
+	}
+	return maxsize;
+}
+
+/*
+ * Get the first segment from an ITER_UBUF or ITER_IOVEC iterator.  The
+ * iterator must not be empty.
+ */
+static unsigned long iov_iter_extract_first_user_segment(const struct iov_iter *i,
+							 size_t *size)
+{
+	size_t skip;
+	long k;
+
+	if (iter_is_ubuf(i))
+		return (unsigned long)i->ubuf + i->iov_offset;
+
+	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
+		size_t len = i->iov[k].iov_len - skip;
+
+		if (unlikely(!len))
+			continue;
+		if (*size > len)
+			*size = len;
+		return (unsigned long)i->iov[k].iov_base + skip;
+	}
+	BUG(); // if it had been empty, we wouldn't get called
+}
+
+/*
+ * Extract a list of contiguous pages from a user iterator and get references
+ * on them.  This should only be used if the iterator is user-backed
+ * (IOBUF/UBUF) and data is being transferred out of the buffer described by
+ * the iterator (ie. this is the source).
+ *
+ * The pages are returned with incremented refcounts that the caller must undo
+ * once the transfer is complete, but no additional pins are obtained.
+ *
+ * This is only safe to be used where background IO/DMA is not going to be
+ * modifying the buffer, and so won't cause a problem with CoW on fork.
+ */
+static ssize_t iov_iter_extract_user_pages_and_get(struct iov_iter *i,
+						   struct page ***pages,
+						   size_t maxsize,
+						   unsigned int maxpages,
+						   unsigned int gup_flags,
+						   size_t *offset0)
+{
+	unsigned long addr;
+	size_t offset;
+	int res;
+
+	if (WARN_ON_ONCE(!iov_iter_is_source(i)))
+		return -EFAULT;
+
+	gup_flags |= FOLL_GET;
+	if (i->nofault)
+		gup_flags |= FOLL_NOFAULT;
+
+	addr = iov_iter_extract_first_user_segment(i, &maxsize);
+	*offset0 = offset = addr % PAGE_SIZE;
+	addr &= PAGE_MASK;
+	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
+	if (!maxpages)
+		return -ENOMEM;
+	res = get_user_pages_fast(addr, maxpages, gup_flags, *pages);
+	if (unlikely(res <= 0))
+		return res;
+	maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset);
+	iov_iter_advance(i, maxsize);
+	return maxsize;
+}
+
+/*
+ * Extract a list of contiguous pages from a user iterator and get a pin on
+ * each of them.  This should only be used if the iterator is user-backed
+ * (IOBUF/UBUF) and data is being transferred into the buffer described by the
+ * iterator (ie. this is the destination).
+ *
+ * It does not get refs on the pages, but the pages must be unpinned by the
+ * caller once the transfer is complete.
+ *
+ * This is safe to be used where background IO/DMA *is* going to be modifying
+ * the buffer; using a pin rather than a ref makes sure that CoW happens
+ * correctly in the parent during fork.
+ */
+static ssize_t iov_iter_extract_user_pages_and_pin(struct iov_iter *i,
+						   struct page ***pages,
+						   size_t maxsize,
+						   unsigned int maxpages,
+						   unsigned int gup_flags,
+						   size_t *offset0)
+{
+	unsigned long addr;
+	size_t offset;
+	int res;
+
+	if (WARN_ON_ONCE(!iov_iter_is_dest(i)))
+		return -EFAULT;
+
+	gup_flags |= FOLL_PIN | FOLL_WRITE;
+	if (i->nofault)
+		gup_flags |= FOLL_NOFAULT;
+
+	addr = first_iovec_segment(i, &maxsize);
+	*offset0 = offset = addr % PAGE_SIZE;
+	addr &= PAGE_MASK;
+	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
+	if (!maxpages)
+		return -ENOMEM;
+	res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages);
+	if (unlikely(res <= 0))
+		return res;
+	maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset);
+	iov_iter_advance(i, maxsize);
+	return maxsize;
+}
+
+static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
+					   struct page ***pages, size_t maxsize,
+					   unsigned int maxpages,
+					   unsigned int gup_flags,
+					   size_t *offset0)
+{
+	if (iov_iter_extract_mode(i, gup_flags) == FOLL_GET)
+		return iov_iter_extract_user_pages_and_get(i, pages, maxsize,
+							   maxpages, gup_flags,
+							   offset0);
+	else
+		return iov_iter_extract_user_pages_and_pin(i, pages, maxsize,
+							   maxpages, gup_flags,
+							   offset0);
+}
+
+/**
+ * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
+ * @i: The iterator to extract from
+ * @pages: Where to return the list of pages
+ * @maxsize: The maximum amount of iterator to extract
+ * @maxpages: The maximum size of the list of pages
+ * @gup_flags: Direction indicator and additional flags
+ * @offset0: Where to return the starting offset into (*@pages)[0]
+ *
+ * Extract a list of contiguous pages from the current point of the iterator,
+ * advancing the iterator.  The maximum number of pages and the maximum amount
+ * of page contents can be set.
+ *
+ * If *@pages is NULL, a page list will be allocated to the required size and
+ * *@pages will be set to its base.  If *@pages is not NULL, it will be
+ * assumed that the caller allocated a page list at least @maxpages in size
+ * and this will be filled in.
+ *
+ * @gup_flags can be set to either FOLL_SOURCE_BUF or FOLL_DEST_BUF, indicating
+ * how the buffer is to be used, and can have FOLL_PCI_P2PDMA OR'd with that.
+ *
+ * The iov_iter_extract_mode() function can be used to query how cleanup should
+ * be performed.
+ *
+ * Extra refs or pins on the pages may be obtained as follows:
+ *
+ *  (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF) and data is to be
+ *      transferred /OUT OF/ the buffer (@gup_flags |= FOLL_SOURCE_BUF), refs
+ *      will be taken on the pages, but pins will not be added.  This can be
+ *      used for DMA from a page; it cannot be used for DMA to a page, as it
+ *      may cause page-COW problems in fork.  iov_iter_extract_mode() will
+ *      return FOLL_GET.
+ *
+ *  (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF) and data is to be
+ *      transferred /INTO/ the described buffer (@gup_flags |= FOLL_DEST_BUF),
+ *      pins will be added to the pages, but refs will not be taken.  This must
+ *      be used for DMA to a page.  iov_iter_extract_mode() will return
+ *      FOLL_PIN.
+ *
+ *  (*) If the iterator is ITER_PIPE, this must describe a destination for the
+ *      data.  Additional pages may be allocated and added to the pipe (which
+ *      will hold the refs), but neither refs nor pins will be obtained for the
+ *      caller.  The caller must hold the pipe lock.  iov_iter_extract_mode()
+ *      will return 0.
+ *
+ *  (*) If the iterator is ITER_BVEC or ITER_XARRAY, the pages are merely
+ *      listed; no extra refs or pins are obtained.  iov_iter_extract_mode()
+ *      will return 0.
+ *
+ * Note also:
+ *
+ *  (*) Use with ITER_KVEC is not supported as that may refer to memory that
+ *      doesn't have associated page structs.
+ *
+ *  (*) Use with ITER_DISCARD is not supported as that has no content.
+ *
+ * On success, the function sets *@pages to the new pagelist, if allocated, and
+ * sets *@offset0 to the offset into the first page.
+ *
+ * It may also return -ENOMEM or -EFAULT.
+ */
+ssize_t iov_iter_extract_pages(struct iov_iter *i,
+			       struct page ***pages,
+			       size_t maxsize,
+			       unsigned int maxpages,
+			       unsigned int gup_flags,
+			       size_t *offset0)
+{
+	if (WARN_ON_ONCE((gup_flags & FOLL_BUF_MASK) == FOLL_SOURCE_BUF &&
+			 iov_iter_is_dest(i)))
+		return -EIO;
+	if (WARN_ON_ONCE((gup_flags & FOLL_BUF_MASK) == FOLL_DEST_BUF &&
+			 iov_iter_is_source(i)))
+		return -EIO;
+
+	maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT);
+	if (!maxsize)
+		return 0;
+
+	if (likely(user_backed_iter(i)))
+		return iov_iter_extract_user_pages(i, pages, maxsize,
+						   maxpages, gup_flags,
+						   offset0);
+	if (iov_iter_is_bvec(i))
+		return iov_iter_extract_bvec_pages(i, pages, maxsize,
+						   maxpages, gup_flags,
+						   offset0);
+	if (iov_iter_is_pipe(i))
+		return iov_iter_extract_pipe_pages(i, pages, maxsize,
+						   maxpages, gup_flags,
+						   offset0);
+	if (iov_iter_is_xarray(i))
+		return iov_iter_extract_xarray_pages(i, pages, maxsize,
+						     maxpages, gup_flags,
+						     offset0);
+	return -EFAULT;
+}
+EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
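
[Usage sketch to go with the kernel-doc above; reviewer note, not part of the
patch.  It shows extraction from a source iterator followed by cleanup chosen
via iov_iter_extract_mode(), per the rules documented above.  do_transfer()
is a hypothetical stand-in for the caller's copy or DMA step, and the
iterator is assumed to have been initialised with ITER_SOURCE.]

	static ssize_t example_extract(struct iov_iter *iter)
	{
		struct page **pages = NULL;	/* let the API allocate the list */
		unsigned int gup_flags = FOLL_SOURCE_BUF;
		unsigned int cleanup, npages, p;
		size_t offset0;
		ssize_t got;

		got = iov_iter_extract_pages(iter, &pages, SZ_1M, INT_MAX,
					     gup_flags, &offset0);
		if (got <= 0)
			return got;

		do_transfer(pages, got, offset0);	/* hypothetical */

		/*
		 * Undo whatever the extraction took on the pages:
		 * FOLL_GET -> drop refs; FOLL_PIN -> unpin; 0 -> nothing.
		 */
		cleanup = iov_iter_extract_mode(iter, gup_flags);
		npages = DIV_ROUND_UP(offset0 + got, PAGE_SIZE);
		for (p = 0; p < npages; p++) {
			if (cleanup == FOLL_GET)
				put_page(pages[p]);
			else if (cleanup == FOLL_PIN)
				unpin_user_page(pages[p]);
		}
		kvfree(pages);	/* the list itself is always caller-owned */
		return got;
	}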