On 25-Sep-2014 04:33:23 PM, Matthew Wilcox wrote: > From: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> > > For DAX, we want to be able to copy between iovecs and kernel addresses > that don't necessarily have a struct page. This is a fairly simple > rearrangement for bvec iters to kmap the pages outside and pass them in, > but for user iovecs it gets more complicated because we might try various > different ways to kmap the memory. Duplicating the existing logic works > out best in this case. > > We need to be able to write zeroes to an iovec for reads from unwritten > ranges in a file. This is performed by the new iov_iter_zero() function, > again patterned after the existing code that handles iovec iterators. > > Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> > --- > include/linux/uio.h | 3 + > mm/iov_iter.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++++---- > 2 files changed, 226 insertions(+), 14 deletions(-) > > diff --git a/include/linux/uio.h b/include/linux/uio.h > index 48d64e6..1863ddd 100644 > --- a/include/linux/uio.h > +++ b/include/linux/uio.h > @@ -80,6 +80,9 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, > struct iov_iter *i); > size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, > struct iov_iter *i); > +size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i); > +size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); > +size_t iov_iter_zero(size_t bytes, struct iov_iter *); > unsigned long iov_iter_alignment(const struct iov_iter *i); > void iov_iter_init(struct iov_iter *i, int direction, const struct iovec *iov, > unsigned long nr_segs, size_t count); > diff --git a/mm/iov_iter.c b/mm/iov_iter.c > index ab88dc0..d481fd8 100644 > --- a/mm/iov_iter.c > +++ b/mm/iov_iter.c > @@ -4,6 +4,96 @@ > #include <linux/slab.h> > #include <linux/vmalloc.h> > > +static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i) > +{ > + size_t skip, copy, left, wanted; > + const struct iovec *iov; > + char __user *buf; > + > + if (unlikely(bytes > i->count)) > + bytes = i->count; > + > + if (unlikely(!bytes)) > + return 0; > + > + wanted = bytes; > + iov = i->iov; > + skip = i->iov_offset; > + buf = iov->iov_base + skip; > + copy = min(bytes, iov->iov_len - skip); > + > + left = __copy_to_user(buf, from, copy); How comes this function uses __copy_to_user without any access_ok() check ? This has security implications. > + copy -= left; > + skip += copy; > + from += copy; > + bytes -= copy; > + while (unlikely(!left && bytes)) { > + iov++; > + buf = iov->iov_base; > + copy = min(bytes, iov->iov_len); > + left = __copy_to_user(buf, from, copy); same here. > + copy -= left; > + skip = copy; > + from += copy; > + bytes -= copy; > + } > + > + if (skip == iov->iov_len) { > + iov++; > + skip = 0; > + } > + i->count -= wanted - bytes; > + i->nr_segs -= iov - i->iov; > + i->iov = iov; > + i->iov_offset = skip; > + return wanted - bytes; > +} > + > +static size_t copy_from_iter_iovec(void *to, size_t bytes, struct iov_iter *i) > +{ > + size_t skip, copy, left, wanted; > + const struct iovec *iov; > + char __user *buf; > + > + if (unlikely(bytes > i->count)) > + bytes = i->count; > + > + if (unlikely(!bytes)) > + return 0; > + > + wanted = bytes; > + iov = i->iov; > + skip = i->iov_offset; > + buf = iov->iov_base + skip; > + copy = min(bytes, iov->iov_len - skip); > + > + left = __copy_from_user(to, buf, copy); same here. > + copy -= left; > + skip += copy; > + to += copy; > + bytes -= copy; > + while (unlikely(!left && bytes)) { > + iov++; > + buf = iov->iov_base; > + copy = min(bytes, iov->iov_len); > + left = __copy_from_user(to, buf, copy); same. > + copy -= left; > + skip = copy; > + to += copy; > + bytes -= copy; > + } > + > + if (skip == iov->iov_len) { > + iov++; > + skip = 0; > + } > + i->count -= wanted - bytes; > + i->nr_segs -= iov - i->iov; > + i->iov = iov; > + i->iov_offset = skip; > + return wanted - bytes; > +} > + > static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, > struct iov_iter *i) > { > @@ -166,6 +256,50 @@ done: > return wanted - bytes; > } > > +static size_t zero_iovec(size_t bytes, struct iov_iter *i) > +{ > + size_t skip, copy, left, wanted; > + const struct iovec *iov; > + char __user *buf; > + > + if (unlikely(bytes > i->count)) > + bytes = i->count; > + > + if (unlikely(!bytes)) > + return 0; > + > + wanted = bytes; > + iov = i->iov; > + skip = i->iov_offset; > + buf = iov->iov_base + skip; > + copy = min(bytes, iov->iov_len - skip); > + > + left = __clear_user(buf, copy); I would guess an access_ok() would be needed here too. > + copy -= left; > + skip += copy; > + bytes -= copy; > + > + while (unlikely(!left && bytes)) { > + iov++; > + buf = iov->iov_base; > + copy = min(bytes, iov->iov_len); > + left = __clear_user(buf, copy); Same. > + copy -= left; > + skip = copy; > + bytes -= copy; > + } > + > + if (skip == iov->iov_len) { > + iov++; > + skip = 0; > + } > + i->count -= wanted - bytes; > + i->nr_segs -= iov - i->iov; > + i->iov = iov; > + i->iov_offset = skip; > + return wanted - bytes; > +} > + > static size_t __iovec_copy_from_user_inatomic(char *vaddr, > const struct iovec *iov, size_t base, size_t bytes) > { > @@ -412,12 +546,17 @@ static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t > kunmap_atomic(to); > } > > -static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t bytes, > - struct iov_iter *i) > +static void memzero_page(struct page *page, size_t offset, size_t len) > +{ > + char *addr = kmap_atomic(page); > + memset(addr + offset, 0, len); > + kunmap_atomic(addr); > +} > + > +static size_t copy_to_iter_bvec(void *from, size_t bytes, struct iov_iter *i) > { > size_t skip, copy, wanted; > const struct bio_vec *bvec; > - void *kaddr, *from; > > if (unlikely(bytes > i->count)) > bytes = i->count; > @@ -430,8 +569,6 @@ static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t by > skip = i->iov_offset; > copy = min_t(size_t, bytes, bvec->bv_len - skip); > > - kaddr = kmap_atomic(page); > - from = kaddr + offset; > memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy); > skip += copy; > from += copy; > @@ -444,7 +581,6 @@ static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t by > from += copy; > bytes -= copy; > } > - kunmap_atomic(kaddr); > if (skip == bvec->bv_len) { > bvec++; > skip = 0; > @@ -456,12 +592,10 @@ static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t by > return wanted - bytes; > } > > -static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t bytes, > - struct iov_iter *i) > +static size_t copy_from_iter_bvec(void *to, size_t bytes, struct iov_iter *i) > { > size_t skip, copy, wanted; > const struct bio_vec *bvec; > - void *kaddr, *to; > > if (unlikely(bytes > i->count)) > bytes = i->count; > @@ -473,10 +607,6 @@ static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t > bvec = i->bvec; > skip = i->iov_offset; > > - kaddr = kmap_atomic(page); > - > - to = kaddr + offset; > - > copy = min(bytes, bvec->bv_len - skip); > > memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy); > @@ -493,7 +623,6 @@ static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t > to += copy; > bytes -= copy; > } > - kunmap_atomic(kaddr); > if (skip == bvec->bv_len) { > bvec++; > skip = 0; > @@ -505,6 +634,61 @@ static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t > return wanted; > } > > +static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, > + size_t bytes, struct iov_iter *i) > +{ > + void *kaddr = kmap_atomic(page); > + size_t wanted = copy_to_iter_bvec(kaddr + offset, bytes, i); missing newline. > + kunmap_atomic(kaddr); > + return wanted; > +} > + > +static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, > + size_t bytes, struct iov_iter *i) > +{ > + void *kaddr = kmap_atomic(page); > + size_t wanted = copy_from_iter_bvec(kaddr + offset, bytes, i); missing newline. Thanks, Mathieu > + kunmap_atomic(kaddr); > + return wanted; > +} > + > +static size_t zero_bvec(size_t bytes, struct iov_iter *i) > +{ > + size_t skip, copy, wanted; > + const struct bio_vec *bvec; > + > + if (unlikely(bytes > i->count)) > + bytes = i->count; > + > + if (unlikely(!bytes)) > + return 0; > + > + wanted = bytes; > + bvec = i->bvec; > + skip = i->iov_offset; > + copy = min_t(size_t, bytes, bvec->bv_len - skip); > + > + memzero_page(bvec->bv_page, skip + bvec->bv_offset, copy); > + skip += copy; > + bytes -= copy; > + while (bytes) { > + bvec++; > + copy = min(bytes, (size_t)bvec->bv_len); > + memzero_page(bvec->bv_page, bvec->bv_offset, copy); > + skip = copy; > + bytes -= copy; > + } > + if (skip == bvec->bv_len) { > + bvec++; > + skip = 0; > + } > + i->count -= wanted - bytes; > + i->nr_segs -= bvec - i->bvec; > + i->bvec = bvec; > + i->iov_offset = skip; > + return wanted - bytes; > +} > + > static size_t copy_from_user_bvec(struct page *page, > struct iov_iter *i, unsigned long offset, size_t bytes) > { > @@ -668,6 +852,31 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, > } > EXPORT_SYMBOL(copy_page_from_iter); > > +size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) > +{ > + if (i->type & ITER_BVEC) > + return copy_to_iter_bvec(addr, bytes, i); > + else > + return copy_to_iter_iovec(addr, bytes, i); > +} > + > +size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) > +{ > + if (i->type & ITER_BVEC) > + return copy_from_iter_bvec(addr, bytes, i); > + else > + return copy_from_iter_iovec(addr, bytes, i); > +} > + > +size_t iov_iter_zero(size_t bytes, struct iov_iter *i) > +{ > + if (i->type & ITER_BVEC) { > + return zero_bvec(bytes, i); > + } else { > + return zero_iovec(bytes, i); > + } > +} > + > size_t iov_iter_copy_from_user_atomic(struct page *page, > struct iov_iter *i, unsigned long offset, size_t bytes) > { > -- > 2.1.0 > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@xxxxxxxxx. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a> > > -- Mathieu Desnoyers EfficiOS Inc. http://www.efficios.com Key fingerprint: 2A0B 4ED9 15F2 D3FA 45F5 B162 1728 0A97 8118 6ACF -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>