Add an iterator, ITER_MAPPING, that walks through a set of pages attached to an address_space, starting at a given file position and length. The caller must guarantee that the pages are all present and they must be locked is some way[*] (eg. ref, PG_locked, PG_writeback or PG_fscache) to prevent them from going away or being migrated whilst they're being accessed. [*] Note that I'm assuming that a page cannot get detached from or replaced in a mapping if it is so 'locked' without permission from the appropriate address_space operation. This is useful for copying data from socket buffers to inodes in network filesystems and for transferring data between those inodes and the cache using direct I/O. Whilst it is true that ITER_BVEC could be used instead, the following issues arise: (1) A bio_vec array would need to be allocated to list to all the pages - which is redundant if inode->i_pages *also* points to all these pages. (2) A bio_vec array might exceed the size of a page (a page-sized bio_vec array would correspond to a 1M payload on a 64-bit machine with 4K pages). (3) The offset and length fields in the bio_vec elements are superfluous in this situation. An alternative could be to have an "ITER_ARRAY" that just has the page pointers and not the offset/length info. This decreases the redundancy and increases the max payload-per-array-page to 2M. I've been using this in my rewrite of fscache/cachefiles, which can be found here: https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git/log/?h=fscache-iter Signed-off-by: David Howells <dhowells@xxxxxxxxxx> --- include/linux/uio.h | 11 ++ lib/iov_iter.c | 282 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 270 insertions(+), 23 deletions(-) diff --git a/include/linux/uio.h b/include/linux/uio.h index 9576fd8158d7..a0321a740f51 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -11,6 +11,7 @@ #include <uapi/linux/uio.h> struct page; +struct address_space; struct pipe_inode_info; struct kvec { @@ -25,6 +26,7 @@ enum iter_type { ITER_BVEC = 16, ITER_PIPE = 32, ITER_DISCARD = 64, + ITER_MAPPING = 128, }; struct iov_iter { @@ -40,6 +42,7 @@ struct iov_iter { const struct iovec *iov; const struct kvec *kvec; const struct bio_vec *bvec; + struct address_space *mapping; struct pipe_inode_info *pipe; }; union { @@ -48,6 +51,7 @@ struct iov_iter { unsigned int head; unsigned int start_head; }; + loff_t mapping_start; }; }; @@ -81,6 +85,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter *i) return iov_iter_type(i) == ITER_DISCARD; } +static inline bool iov_iter_is_mapping(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_MAPPING; +} + static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->type & (READ | WRITE); @@ -222,6 +231,8 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_ void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); +void iov_iter_mapping(struct iov_iter *i, unsigned int direction, struct address_space *mapping, + loff_t start, size_t count); ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 51595bf3af85..7c108824414c 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -74,7 +74,40 @@ } \ } -#define iterate_all_kinds(i, n, v, I, B, K) { \ +#define iterate_mapping(i, n, __v, skip, STEP) { \ + struct page *page; \ + size_t wanted = n, seg, offset; \ + loff_t start = i->mapping_start + skip; \ + pgoff_t index = start >> PAGE_SHIFT; \ + \ + XA_STATE(xas, &i->mapping->i_pages, index); \ + \ + rcu_read_lock(); \ + for (page = xas_load(&xas); page; page = xas_next(&xas)) { \ + if (xas_retry(&xas, page)) \ + continue; \ + if (xa_is_value(page)) \ + break; \ + if (PageCompound(page)) \ + break; \ + if (page_to_pgoff(page) != xas.xa_index) \ + break; \ + __v.bv_page = page; \ + offset = (i->mapping_start + skip) & ~PAGE_MASK; \ + seg = PAGE_SIZE - offset; \ + __v.bv_offset = offset; \ + __v.bv_len = min(n, seg); \ + (void)(STEP); \ + n -= __v.bv_len; \ + skip += __v.bv_len; \ + if (n == 0) \ + break; \ + } \ + rcu_read_unlock(); \ + n = wanted - n; \ +} + +#define iterate_all_kinds(i, n, v, I, B, K, M) { \ if (likely(n)) { \ size_t skip = i->iov_offset; \ if (unlikely(i->type & ITER_BVEC)) { \ @@ -86,6 +119,9 @@ struct kvec v; \ iterate_kvec(i, n, v, kvec, skip, (K)) \ } else if (unlikely(i->type & ITER_DISCARD)) { \ + } else if (unlikely(i->type & ITER_MAPPING)) { \ + struct bio_vec v; \ + iterate_mapping(i, n, v, skip, (M)); \ } else { \ const struct iovec *iov; \ struct iovec v; \ @@ -94,7 +130,7 @@ } \ } -#define iterate_and_advance(i, n, v, I, B, K) { \ +#define iterate_and_advance(i, n, v, I, B, K, M) { \ if (unlikely(i->count < n)) \ n = i->count; \ if (i->count) { \ @@ -119,6 +155,9 @@ i->kvec = kvec; \ } else if (unlikely(i->type & ITER_DISCARD)) { \ skip += n; \ + } else if (unlikely(i->type & ITER_MAPPING)) { \ + struct bio_vec v; \ + iterate_mapping(i, n, v, skip, (M)) \ } else { \ const struct iovec *iov; \ struct iovec v; \ @@ -628,7 +667,9 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len), memcpy_to_page(v.bv_page, v.bv_offset, (from += v.bv_len) - v.bv_len, v.bv_len), - memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len) + memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len), + memcpy_to_page(v.bv_page, v.bv_offset, + (from += v.bv_len) - v.bv_len, v.bv_len) ) return bytes; @@ -746,6 +787,15 @@ size_t _copy_to_iter_mcsafe(const void *addr, size_t bytes, struct iov_iter *i) bytes = curr_addr - s_addr - rem; return bytes; } + }), + ({ + rem = memcpy_mcsafe_to_page(v.bv_page, v.bv_offset, + (from += v.bv_len) - v.bv_len, v.bv_len); + if (rem) { + curr_addr = (unsigned long) from; + bytes = curr_addr - s_addr - rem; + return bytes; + } }) ) @@ -767,7 +817,9 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) copyin((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, v.bv_offset, v.bv_len), - memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len) ) return bytes; @@ -793,7 +845,9 @@ bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) 0;}), memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, v.bv_offset, v.bv_len), - memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len) ) iov_iter_advance(i, bytes); @@ -813,7 +867,9 @@ size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) v.iov_base, v.iov_len), memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, v.bv_offset, v.bv_len), - memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len) ) return bytes; @@ -848,7 +904,9 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page, v.bv_offset, v.bv_len), memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base, - v.iov_len) + v.iov_len), + memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len) ) return bytes; @@ -872,7 +930,9 @@ bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) 0;}), memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, v.bv_offset, v.bv_len), - memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len) ) iov_iter_advance(i, bytes); @@ -909,7 +969,7 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, { if (unlikely(!page_copy_sane(page, offset, bytes))) return 0; - if (i->type & (ITER_BVEC|ITER_KVEC)) { + if (i->type & (ITER_BVEC | ITER_KVEC | ITER_MAPPING)) { void *kaddr = kmap_atomic(page); size_t wanted = copy_to_iter(kaddr + offset, bytes, i); kunmap_atomic(kaddr); @@ -932,7 +992,7 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, WARN_ON(1); return 0; } - if (i->type & (ITER_BVEC|ITER_KVEC)) { + if (i->type & (ITER_BVEC | ITER_KVEC | ITER_MAPPING)) { void *kaddr = kmap_atomic(page); size_t wanted = _copy_from_iter(kaddr + offset, bytes, i); kunmap_atomic(kaddr); @@ -976,7 +1036,8 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i) iterate_and_advance(i, bytes, v, clear_user(v.iov_base, v.iov_len), memzero_page(v.bv_page, v.bv_offset, v.bv_len), - memset(v.iov_base, 0, v.iov_len) + memset(v.iov_base, 0, v.iov_len), + memzero_page(v.bv_page, v.bv_offset, v.bv_len) ) return bytes; @@ -1000,7 +1061,9 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, v.bv_offset, v.bv_len), - memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), + memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len) ) kunmap_atomic(kaddr); return bytes; @@ -1071,7 +1134,13 @@ void iov_iter_advance(struct iov_iter *i, size_t size) i->count -= size; return; } - iterate_and_advance(i, size, v, 0, 0, 0) + if (unlikely(iov_iter_is_mapping(i))) { + /* We really don't want to fetch pages if we can avoid it */ + i->iov_offset += size; + i->count -= size; + return; + } + iterate_and_advance(i, size, v, 0, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); @@ -1115,7 +1184,12 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) return; } unroll -= i->iov_offset; - if (iov_iter_is_bvec(i)) { + if (iov_iter_is_mapping(i)) { + BUG(); /* We should never go beyond the start of the specified + * range since we might then be straying into pages that + * aren't pinned. + */ + } else if (iov_iter_is_bvec(i)) { const struct bio_vec *bvec = i->bvec; while (1) { size_t n = (--bvec)->bv_len; @@ -1152,9 +1226,9 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) return i->count; // it is a silly place, anyway if (i->nr_segs == 1) return i->count; - if (unlikely(iov_iter_is_discard(i))) + if (unlikely(iov_iter_is_discard(i) || iov_iter_is_mapping(i))) return i->count; - else if (iov_iter_is_bvec(i)) + if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); else return min(i->count, i->iov->iov_len - i->iov_offset); @@ -1202,6 +1276,32 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_pipe); +/** + * iov_iter_mapping - Initialise an I/O iterator to use the pages in a mapping + * @i: The iterator to initialise. + * @direction: The direction of the transfer. + * @mapping: The mapping to access. + * @start: The start file position. + * @count: The size of the I/O buffer in bytes. + * + * Set up an I/O iterator to either draw data out of the pages attached to an + * inode or to inject data into those pages. The pages *must* be prevented + * from evaporation, either by taking a ref on them or locking them by the + * caller. + */ +void iov_iter_mapping(struct iov_iter *i, unsigned int direction, + struct address_space *mapping, + loff_t start, size_t count) +{ + BUG_ON(direction & ~1); + i->type = ITER_MAPPING | (direction & (READ | WRITE)); + i->mapping = mapping; + i->mapping_start = start; + i->count = count; + i->iov_offset = 0; +} +EXPORT_SYMBOL(iov_iter_mapping); + /** * iov_iter_discard - Initialise an I/O iterator that discards data * @i: The iterator to initialise. @@ -1235,7 +1335,8 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) iterate_all_kinds(i, size, v, (res |= (unsigned long)v.iov_base | v.iov_len, 0), res |= v.bv_offset | v.bv_len, - res |= (unsigned long)v.iov_base | v.iov_len + res |= (unsigned long)v.iov_base | v.iov_len, + res |= v.bv_offset | v.bv_len ) return res; } @@ -1257,7 +1358,9 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) (res |= (!res ? 0 : (unsigned long)v.bv_offset) | (size != v.bv_len ? size : 0)), (res |= (!res ? 0 : (unsigned long)v.iov_base) | - (size != v.iov_len ? size : 0)) + (size != v.iov_len ? size : 0)), + (res |= (!res ? 0 : (unsigned long)v.bv_offset) | + (size != v.bv_len ? size : 0)) ); return res; } @@ -1307,6 +1410,46 @@ static ssize_t pipe_get_pages(struct iov_iter *i, return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start); } +static ssize_t iter_mapping_get_pages(struct iov_iter *i, + struct page **pages, size_t maxsize, + unsigned maxpages, size_t *_start_offset) +{ + unsigned nr, offset; + pgoff_t index, count; + size_t size = maxsize, actual; + loff_t pos; + + if (!size || !maxpages) + return 0; + + pos = i->mapping_start + i->iov_offset; + index = pos >> PAGE_SHIFT; + offset = pos & ~PAGE_MASK; + *_start_offset = offset; + + count = 1; + if (size > PAGE_SIZE - offset) { + size -= PAGE_SIZE - offset; + count += size >> PAGE_SHIFT; + size &= ~PAGE_MASK; + if (size) + count++; + } + + if (count > maxpages) + count = maxpages; + + nr = find_get_pages_contig(i->mapping, index, count, pages); + if (nr == 0) + return 0; + + actual = PAGE_SIZE * nr; + actual -= offset; + if (nr == count && size > 0) + actual -= PAGE_SIZE - size; + return actual; +} + ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) @@ -1316,6 +1459,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (unlikely(iov_iter_is_pipe(i))) return pipe_get_pages(i, pages, maxsize, maxpages, start); + if (unlikely(iov_iter_is_mapping(i))) + return iter_mapping_get_pages(i, pages, maxsize, maxpages, start); if (unlikely(iov_iter_is_discard(i))) return -EFAULT; @@ -1342,7 +1487,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, return v.bv_len; }),({ return -EFAULT; - }) + }), + 0 ) return 0; } @@ -1386,6 +1532,49 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i, return n; } +static ssize_t iter_mapping_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *_start_offset) +{ + struct page **p; + unsigned nr, offset; + pgoff_t index, count; + size_t size = maxsize, actual; + loff_t pos; + + if (!size) + return 0; + + pos = i->mapping_start + i->iov_offset; + index = pos >> PAGE_SHIFT; + offset = pos & ~PAGE_MASK; + *_start_offset = offset; + + count = 1; + if (size > PAGE_SIZE - offset) { + size -= PAGE_SIZE - offset; + count += size >> PAGE_SHIFT; + size &= ~PAGE_MASK; + if (size) + count++; + } + + p = get_pages_array(count); + if (!p) + return -ENOMEM; + *pages = p; + + nr = find_get_pages_contig(i->mapping, index, count, p); + if (nr == 0) + return 0; + + actual = PAGE_SIZE * nr; + actual -= offset; + if (nr == count && size > 0) + actual -= PAGE_SIZE - size; + return actual; +} + ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) @@ -1397,6 +1586,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (unlikely(iov_iter_is_pipe(i))) return pipe_get_pages_alloc(i, pages, maxsize, start); + if (unlikely(iov_iter_is_mapping(i))) + return iter_mapping_get_pages_alloc(i, pages, maxsize, start); if (unlikely(iov_iter_is_discard(i))) return -EFAULT; @@ -1429,7 +1620,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, return v.bv_len; }),({ return -EFAULT; - }) + }), 0 ) return 0; } @@ -1468,6 +1659,14 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, v.iov_base, v.iov_len, sum, off); off += v.iov_len; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck(p + v.bv_offset, + (to += v.bv_len) - v.bv_len, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; }) ) *csum = sum; @@ -1510,6 +1709,14 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, v.iov_base, v.iov_len, sum, off); off += v.iov_len; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck(p + v.bv_offset, + (to += v.bv_len) - v.bv_len, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; }) ) *csum = sum; @@ -1556,6 +1763,14 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, (from += v.iov_len) - v.iov_len, v.iov_len, sum, off); off += v.iov_len; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len, + p + v.bv_offset, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; }) ) *csum = sum; @@ -1605,6 +1820,21 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) npages = pipe_space_for_user(iter_head, pipe->tail, pipe); if (npages >= maxpages) return maxpages; + } else if (unlikely(iov_iter_is_mapping(i))) { + unsigned offset; + + offset = (i->mapping_start + i->iov_offset) & ~PAGE_MASK; + + npages = 1; + if (size > PAGE_SIZE - offset) { + size -= PAGE_SIZE - offset; + npages += size >> PAGE_SHIFT; + size &= ~PAGE_MASK; + if (size) + npages++; + } + if (npages >= maxpages) + return maxpages; } else iterate_all_kinds(i, size, v, ({ unsigned long p = (unsigned long)v.iov_base; npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) @@ -1621,7 +1851,8 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) - p / PAGE_SIZE; if (npages >= maxpages) return maxpages; - }) + }), + 0 ) return npages; } @@ -1634,7 +1865,7 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) WARN_ON(1); return NULL; } - if (unlikely(iov_iter_is_discard(new))) + if (unlikely(iov_iter_is_discard(new) || iov_iter_is_mapping(new))) return NULL; if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, @@ -1746,7 +1977,12 @@ int iov_iter_for_each_range(struct iov_iter *i, size_t bytes, kunmap(v.bv_page); err;}), ({ w = v; - err = f(&w, context);}) + err = f(&w, context);}), ({ + w.iov_base = kmap(v.bv_page) + v.bv_offset; + w.iov_len = v.bv_len; + err = f(&w, context); + kunmap(v.bv_page); + err;}) ) return err; }