Reverting this is a dependency for reverting commit d9722a475711 ("splice: Do splice read from a buffered file without using ITER_PIPE") This reverts commit 82cf0207bed44feb0b3b8b17a4c351fdde34a97b. Signed-off-by: Konrad Dybcio <konrad.dybcio@xxxxxxxxxx> --- fs/cifs/file.c | 8 +- include/linux/uio.h | 14 ++ lib/iov_iter.c | 435 +++++++++++++++++++++++++++++++++++++++++++++++++++- mm/filemap.c | 3 +- 4 files changed, 455 insertions(+), 5 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9e0c03be032b..052609172690 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3833,7 +3833,13 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) size_t copy = min_t(size_t, remaining, PAGE_SIZE); size_t written; - written = copy_page_to_iter(page, 0, copy, iter); + if (unlikely(iov_iter_is_pipe(iter))) { + void *addr = kmap_atomic(page); + + written = copy_to_iter(addr, copy, iter); + kunmap_atomic(addr); + } else + written = copy_page_to_iter(page, 0, copy, iter); remaining -= written; if (written < copy && iov_iter_count(iter) > 0) break; diff --git a/include/linux/uio.h b/include/linux/uio.h index 74598426edb4..27e3fd942960 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -11,6 +11,7 @@ #include <uapi/linux/uio.h> struct page; +struct pipe_inode_info; typedef unsigned int __bitwise iov_iter_extraction_t; @@ -24,6 +25,7 @@ enum iter_type { ITER_IOVEC, ITER_KVEC, ITER_BVEC, + ITER_PIPE, ITER_XARRAY, ITER_DISCARD, ITER_UBUF, @@ -53,10 +55,15 @@ struct iov_iter { const struct kvec *kvec; const struct bio_vec *bvec; struct xarray *xarray; + struct pipe_inode_info *pipe; void __user *ubuf; }; union { unsigned long nr_segs; + struct { + unsigned int head; + unsigned int start_head; + }; loff_t xarray_start; }; }; @@ -94,6 +101,11 @@ static inline bool iov_iter_is_bvec(const struct iov_iter *i) return iov_iter_type(i) == ITER_BVEC; } +static inline bool iov_iter_is_pipe(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_PIPE; +} + static inline bool iov_iter_is_discard(const struct iov_iter *i) { return iov_iter_type(i) == ITER_DISCARD; @@ -235,6 +247,8 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); +void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe, + size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index fad95e4cf372..02a73ca52207 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -14,6 +14,8 @@ #include <linux/scatterlist.h> #include <linux/instrumented.h> +#define PIPE_PARANOIA /* for now */ + /* covers ubuf and kbuf alike */ #define iterate_buf(i, n, base, len, off, __p, STEP) { \ size_t __maybe_unused off = 0; \ @@ -184,6 +186,156 @@ static int copyin(void *to, const void __user *from, size_t n) return res; } +static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe, + unsigned int slot) +{ + return &pipe->bufs[slot & (pipe->ring_size - 1)]; +} + +#ifdef PIPE_PARANOIA +static bool sanity(const struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + unsigned int p_head = pipe->head; + unsigned int p_tail = pipe->tail; + unsigned int p_occupancy = pipe_occupancy(p_head, p_tail); + unsigned int i_head = i->head; + unsigned int idx; + + if (i->last_offset) { + struct pipe_buffer *p; + if (unlikely(p_occupancy == 0)) + goto Bad; // pipe must be non-empty + if (unlikely(i_head != p_head - 1)) + goto Bad; // must be at the last buffer... + + p = pipe_buf(pipe, i_head); + if (unlikely(p->offset + p->len != abs(i->last_offset))) + goto Bad; // ... at the end of segment + } else { + if (i_head != p_head) + goto Bad; // must be right after the last buffer + } + return true; +Bad: + printk(KERN_ERR "idx = %d, offset = %d\n", i_head, i->last_offset); + printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n", + p_head, p_tail, pipe->ring_size); + for (idx = 0; idx < pipe->ring_size; idx++) + printk(KERN_ERR "[%p %p %d %d]\n", + pipe->bufs[idx].ops, + pipe->bufs[idx].page, + pipe->bufs[idx].offset, + pipe->bufs[idx].len); + WARN_ON(1); + return false; +} +#else +#define sanity(i) true +#endif + +static struct page *push_anon(struct pipe_inode_info *pipe, unsigned size) +{ + struct page *page = alloc_page(GFP_USER); + if (page) { + struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++); + *buf = (struct pipe_buffer) { + .ops = &default_pipe_buf_ops, + .page = page, + .offset = 0, + .len = size + }; + } + return page; +} + +static void push_page(struct pipe_inode_info *pipe, struct page *page, + unsigned int offset, unsigned int size) +{ + struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++); + *buf = (struct pipe_buffer) { + .ops = &page_cache_pipe_buf_ops, + .page = page, + .offset = offset, + .len = size + }; + get_page(page); +} + +static inline int last_offset(const struct pipe_buffer *buf) +{ + if (buf->ops == &default_pipe_buf_ops) + return buf->len; // buf->offset is 0 for those + else + return -(buf->offset + buf->len); +} + +static struct page *append_pipe(struct iov_iter *i, size_t size, + unsigned int *off) +{ + struct pipe_inode_info *pipe = i->pipe; + int offset = i->last_offset; + struct pipe_buffer *buf; + struct page *page; + + if (offset > 0 && offset < PAGE_SIZE) { + // some space in the last buffer; add to it + buf = pipe_buf(pipe, pipe->head - 1); + size = min_t(size_t, size, PAGE_SIZE - offset); + buf->len += size; + i->last_offset += size; + i->count -= size; + *off = offset; + return buf->page; + } + // OK, we need a new buffer + *off = 0; + size = min_t(size_t, size, PAGE_SIZE); + if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + return NULL; + page = push_anon(pipe, size); + if (!page) + return NULL; + i->head = pipe->head - 1; + i->last_offset = size; + i->count -= size; + return page; +} + +static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + unsigned int head = pipe->head; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + if (!sanity(i)) + return 0; + + if (offset && i->last_offset == -offset) { // could we merge it? + struct pipe_buffer *buf = pipe_buf(pipe, head - 1); + if (buf->page == page) { + buf->len += bytes; + i->last_offset -= bytes; + i->count -= bytes; + return bytes; + } + } + if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + return 0; + + push_page(pipe, page, offset, bytes); + i->last_offset = -(offset + bytes); + i->head = head; + i->count -= bytes; + return bytes; +} + /* * fault_in_iov_iter_readable - fault in iov iterator for reading * @i: iterator @@ -287,6 +439,46 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_init); +// returns the offset in partial buffer (if any) +static inline unsigned int pipe_npages(const struct iov_iter *i, int *npages) +{ + struct pipe_inode_info *pipe = i->pipe; + int used = pipe->head - pipe->tail; + int off = i->last_offset; + + *npages = max((int)pipe->max_usage - used, 0); + + if (off > 0 && off < PAGE_SIZE) { // anon and not full + (*npages)++; + return off; + } + return 0; +} + +static size_t copy_pipe_to_iter(const void *addr, size_t bytes, + struct iov_iter *i) +{ + unsigned int off, chunk; + + if (unlikely(bytes > i->count)) + bytes = i->count; + if (unlikely(!bytes)) + return 0; + + if (!sanity(i)) + return 0; + + for (size_t n = bytes; n; n -= chunk) { + struct page *page = append_pipe(i, n, &off); + chunk = min_t(size_t, n, PAGE_SIZE - off); + if (!page) + return bytes - n; + memcpy_to_page(page, off, addr, chunk); + addr += chunk; + } + return bytes; +} + static __wsum csum_and_memcpy(void *to, const void *from, size_t len, __wsum sum, size_t off) { @@ -294,10 +486,44 @@ static __wsum csum_and_memcpy(void *to, const void *from, size_t len, return csum_block_add(sum, next, off); } +static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes, + struct iov_iter *i, __wsum *sump) +{ + __wsum sum = *sump; + size_t off = 0; + unsigned int chunk, r; + + if (unlikely(bytes > i->count)) + bytes = i->count; + if (unlikely(!bytes)) + return 0; + + if (!sanity(i)) + return 0; + + while (bytes) { + struct page *page = append_pipe(i, bytes, &r); + char *p; + + if (!page) + break; + chunk = min_t(size_t, bytes, PAGE_SIZE - r); + p = kmap_local_page(page); + sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off); + kunmap_local(p); + off += chunk; + bytes -= chunk; + } + *sump = sum; + return off; +} + size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; + if (unlikely(iov_iter_is_pipe(i))) + return copy_pipe_to_iter(addr, bytes, i); if (user_backed_iter(i)) might_fault(); iterate_and_advance(i, bytes, base, len, off, @@ -319,6 +545,42 @@ static int copyout_mc(void __user *to, const void *from, size_t n) return n; } +static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, + struct iov_iter *i) +{ + size_t xfer = 0; + unsigned int off, chunk; + + if (unlikely(bytes > i->count)) + bytes = i->count; + if (unlikely(!bytes)) + return 0; + + if (!sanity(i)) + return 0; + + while (bytes) { + struct page *page = append_pipe(i, bytes, &off); + unsigned long rem; + char *p; + + if (!page) + break; + chunk = min_t(size_t, bytes, PAGE_SIZE - off); + p = kmap_local_page(page); + rem = copy_mc_to_kernel(p + off, addr + xfer, chunk); + chunk -= rem; + kunmap_local(p); + xfer += chunk; + bytes -= chunk; + if (rem) { + iov_iter_revert(i, rem); + break; + } + } + return xfer; +} + /** * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address @@ -338,8 +600,9 @@ static int copyout_mc(void __user *to, const void *from, size_t n) * alignment and poison alignment assumptions to avoid re-triggering * hardware exceptions. * - * * ITER_KVEC and ITER_BVEC can return short copies. Compare to - * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. + * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies. + * Compare to copy_to_iter() where only ITER_IOVEC attempts might return + * a short copy. * * Return: number of bytes copied (may be %0) */ @@ -347,6 +610,8 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; + if (unlikely(iov_iter_is_pipe(i))) + return copy_mc_pipe_to_iter(addr, bytes, i); if (user_backed_iter(i)) might_fault(); __iterate_and_advance(i, bytes, base, len, off, @@ -452,6 +717,8 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, return 0; if (WARN_ON_ONCE(i->data_source)) return 0; + if (unlikely(iov_iter_is_pipe(i))) + return copy_page_to_iter_pipe(page, offset, bytes, i); page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { @@ -500,8 +767,36 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, } EXPORT_SYMBOL(copy_page_from_iter); +static size_t pipe_zero(size_t bytes, struct iov_iter *i) +{ + unsigned int chunk, off; + + if (unlikely(bytes > i->count)) + bytes = i->count; + if (unlikely(!bytes)) + return 0; + + if (!sanity(i)) + return 0; + + for (size_t n = bytes; n; n -= chunk) { + struct page *page = append_pipe(i, n, &off); + char *p; + + if (!page) + return bytes - n; + chunk = min_t(size_t, n, PAGE_SIZE - off); + p = kmap_local_page(page); + memset(p + off, 0, chunk); + kunmap_local(p); + } + return bytes; +} + size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { + if (unlikely(iov_iter_is_pipe(i))) + return pipe_zero(bytes, i); iterate_and_advance(i, bytes, base, len, count, clear_user(base, len), memset(base, 0, len) @@ -532,6 +827,32 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt } EXPORT_SYMBOL(copy_page_from_iter_atomic); +static void pipe_advance(struct iov_iter *i, size_t size) +{ + struct pipe_inode_info *pipe = i->pipe; + int off = i->last_offset; + + if (!off && !size) { + pipe_discard_from(pipe, i->start_head); // discard everything + return; + } + i->count -= size; + while (1) { + struct pipe_buffer *buf = pipe_buf(pipe, i->head); + if (off) /* make it relative to the beginning of buffer */ + size += abs(off) - buf->offset; + if (size <= buf->len) { + buf->len = size; + i->last_offset = last_offset(buf); + break; + } + size -= buf->len; + i->head++; + off = 0; + } + pipe_discard_from(pipe, i->head + 1); // discard everything past this one +} + static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { const struct bio_vec *bvec, *end; @@ -583,6 +904,8 @@ void iov_iter_advance(struct iov_iter *i, size_t size) iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); + } else if (iov_iter_is_pipe(i)) { + pipe_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } @@ -596,6 +919,26 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; + if (unlikely(iov_iter_is_pipe(i))) { + struct pipe_inode_info *pipe = i->pipe; + unsigned int head = pipe->head; + + while (head > i->start_head) { + struct pipe_buffer *b = pipe_buf(pipe, --head); + if (unroll < b->len) { + b->len -= unroll; + i->last_offset = last_offset(b); + i->head = head; + return; + } + unroll -= b->len; + pipe_buf_release(pipe, b); + pipe->head--; + } + i->last_offset = 0; + i->head = head; + return; + } if (unlikely(iov_iter_is_discard(i))) return; if (unroll <= i->iov_offset) { @@ -683,6 +1026,24 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_bvec); +void iov_iter_pipe(struct iov_iter *i, unsigned int direction, + struct pipe_inode_info *pipe, + size_t count) +{ + BUG_ON(direction != READ); + WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size)); + *i = (struct iov_iter){ + .iter_type = ITER_PIPE, + .data_source = false, + .pipe = pipe, + .head = pipe->head, + .start_head = pipe->head, + .last_offset = 0, + .count = count + }; +} +EXPORT_SYMBOL(iov_iter_pipe); + /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. @@ -807,6 +1168,19 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, if (iov_iter_is_bvec(i)) return iov_iter_aligned_bvec(i, addr_mask, len_mask); + if (iov_iter_is_pipe(i)) { + size_t size = i->count; + + if (size & len_mask) + return false; + if (size && i->last_offset > 0) { + if (i->last_offset & addr_mask) + return false; + } + + return true; + } + if (iov_iter_is_xarray(i)) { if (i->count & len_mask) return false; @@ -876,6 +1250,14 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); + if (iov_iter_is_pipe(i)) { + size_t size = i->count; + + if (size && i->last_offset > 0) + return size | i->last_offset; + return size; + } + if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; @@ -927,6 +1309,36 @@ static int want_pages_array(struct page ***res, size_t size, return count; } +static ssize_t pipe_get_pages(struct iov_iter *i, + struct page ***pages, size_t maxsize, unsigned maxpages, + size_t *start) +{ + unsigned int npages, count, off, chunk; + struct page **p; + size_t left; + + if (!sanity(i)) + return -EFAULT; + + *start = off = pipe_npages(i, &npages); + if (!npages) + return -EFAULT; + count = want_pages_array(pages, maxsize, off, min(npages, maxpages)); + if (!count) + return -ENOMEM; + p = *pages; + for (npages = 0, left = maxsize ; npages < count; npages++, left -= chunk) { + struct page *page = append_pipe(i, left, &off); + if (!page) + break; + chunk = min_t(size_t, left, PAGE_SIZE - off); + get_page(*p++ = page); + } + if (!npages) + return -EFAULT; + return maxsize - left; +} + static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { @@ -1076,6 +1488,8 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, } return maxsize; } + if (iov_iter_is_pipe(i)) + return pipe_get_pages(i, pages, maxsize, maxpages, start); if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; @@ -1165,7 +1579,9 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, } sum = csum_shift(csstate->csum, csstate->off); - iterate_and_advance(i, bytes, base, len, off, ({ + if (unlikely(iov_iter_is_pipe(i))) + bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum); + else iterate_and_advance(i, bytes, base, len, off, ({ next = csum_and_copy_to_user(addr + off, base, len); sum = csum_block_add(sum, next, off); next ? 0 : len; @@ -1250,6 +1666,15 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); + if (iov_iter_is_pipe(i)) { + int npages; + + if (!sanity(i)) + return 0; + + pipe_npages(i, &npages); + return min(npages, maxpages); + } if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); @@ -1262,6 +1687,10 @@ EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; + if (unlikely(iov_iter_is_pipe(new))) { + WARN_ON(1); + return NULL; + } if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), diff --git a/mm/filemap.c b/mm/filemap.c index d647b2e7cd51..9cf984f7c25e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2692,7 +2692,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter->count, &fbatch, false); + error = filemap_get_pages(iocb, iter->count, &fbatch, + iov_iter_is_pipe(iter)); if (error < 0) break; -- 2.39.1