On 5/22/22 6:48 AM, Al Viro wrote: > On Sun, May 22, 2022 at 06:39:39AM -0600, Jens Axboe wrote: >> On 5/22/22 5:45 AM, Christoph Hellwig wrote: >>> On Sun, May 22, 2022 at 12:15:59PM +0100, Matthew Wilcox wrote: >>>>> Direct kernel pointer, surely? And from a quick look, >>>>> iov_iter_is_kaddr() checks for the wrong value... >>>> >>>> Indeed. I didn't test it; it was a quick patch to see if the idea was >>>> worth pursuing. Neither you nor Christoph thought so at the time, so >>>> I dropped it. if there are performance improvements to be had from >>>> doing something like that, it's a more compelling idea than just "Hey, >>>> this removes a few lines of code and a bit of stack space from every >>>> caller". >>> >>> Oh, right I actually misremembered what the series did. But something >>> similar except for user pointers might help with the performance issues >>> that Jens sees, and if it does it might be worth it to avoid having >>> both the legacy read/write path and the iter path in various drivers. >> >> Right, ITER_UADDR or something would useful. I'll try and test that, >> should be easy to wire up. > > Careful - it's not just iov_iter_advance() and __iterate_and_advance() (that > one should use the same "callback" argument as iovec case). /dev/random is > not the only thing we use read(2) and write(2) on... > > I can cook a patch doing that, just let me get some caffeine into the > bloodstream first... Sure, if you want to cook it up. Here's what I did so far, no caffeine and not even compiled yet :-). It also doesn't do the page copy, separate advancing, etc. Was going to check all the parts in iov_iter.c that checks the type and has separate handlers. The read vs write const and not is not great... diff --git a/fs/read_write.c b/fs/read_write.c index e643aec2b0ef..862ec6c549c6 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -389,14 +389,13 @@ EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { - struct iovec iov = { .iov_base = buf, .iov_len = len }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); - iov_iter_init(&iter, READ, &iov, 1, len); + uaddr_iter_init(&iter, READ, buf, len); ret = call_read_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); @@ -492,14 +491,13 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { - struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); - iov_iter_init(&iter, WRITE, &iov, 1, len); + uaddr_iter_init(&iter, WRITE, (void __user *) buf, len); ret = call_write_iter(filp, &kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); diff --git a/include/linux/uio.h b/include/linux/uio.h index 739285fe5a2f..58e20e83745e 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -21,6 +21,7 @@ struct kvec { enum iter_type { /* iter types */ ITER_IOVEC, + ITER_UADDR, ITER_KVEC, ITER_BVEC, ITER_PIPE, @@ -42,6 +43,7 @@ struct iov_iter { size_t count; union { const struct iovec *iov; + void __user *uaddr; const struct kvec *kvec; const struct bio_vec *bvec; struct xarray *xarray; @@ -75,6 +77,11 @@ static inline bool iter_is_iovec(const struct iov_iter *i) return iov_iter_type(i) == ITER_IOVEC; } +static inline bool iter_is_uaddr(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_UADDR; +} + static inline bool iov_iter_is_kvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_KVEC; @@ -241,6 +248,18 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags); +static inline void uaddr_iter_init(struct iov_iter *iter, int rw, + void __user *uaddr, size_t len) +{ + iter->iter_type = ITER_UADDR; + iter->nofault = false; + iter->data_source = rw; + iter->iov_offset = 0; + iter->count = len; + iter->uaddr = uaddr; + iter->nr_segs = 0; +} + static inline size_t iov_iter_count(const struct iov_iter *i) { return i->count; diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 6dd5330f7a99..4239fffd14cb 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -38,6 +38,22 @@ n = off; \ } +#define iterate_uaddr(i, n, base, len, off, STEP) { \ + void __user *base; \ + size_t len; \ + size_t off = 0; \ + size_t skip = i->iov_offset; \ + len = min(n, i->count - skip); \ + if (likely(len)) { \ + base = i->uaddr + skip; \ + len -= (STEP); \ + off += len; \ + skip += len; \ + } \ + i->iov_offset = skip; \ + n = off; \ +} + #define iterate_bvec(i, n, base, len, off, p, STEP) { \ size_t off = 0; \ unsigned skip = i->iov_offset; \ @@ -118,6 +134,8 @@ __out: \ iov, (I)) \ i->nr_segs -= iov - i->iov; \ i->iov = iov; \ + } else if (iter_is_uaddr(i)) { \ + iterate_uaddr(i, n, base, len, off, (I)) \ } else if (iov_iter_is_bvec(i)) { \ const struct bio_vec *bvec = i->bvec; \ void *base; \ @@ -662,7 +680,7 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(iov_iter_is_pipe(i))) return copy_pipe_to_iter(addr, bytes, i); - if (iter_is_iovec(i)) + if (iter_is_iovec(i) || iter_is_uaddr(i)) might_fault(); iterate_and_advance(i, bytes, base, len, off, copyout(base, addr + off, len), @@ -744,7 +762,7 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(iov_iter_is_pipe(i))) return copy_mc_pipe_to_iter(addr, bytes, i); - if (iter_is_iovec(i)) + if (iter_is_iovec(i) || iter_is_uaddr(i)) might_fault(); __iterate_and_advance(i, bytes, base, len, off, copyout_mc(base, addr + off, len), @@ -1061,6 +1079,10 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) i->iov = iov; } +static void iter_uaddr_advance(struct iov_iter *i, size_t size) +{ +} + void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(i->count < size)) @@ -1068,6 +1090,8 @@ void iov_iter_advance(struct iov_iter *i, size_t size) if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { /* iovec and kvec have identical layouts */ iov_iter_iovec_advance(i, size); + } else if (iter_is_uaddr(i)) { + iter_uaddr_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); } else if (iov_iter_is_pipe(i)) { -- Jens Axboe