Re: [RFC] what to do with IOCB_DSYNC?

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 5/22/22 6:48 AM, Al Viro wrote:
> On Sun, May 22, 2022 at 06:39:39AM -0600, Jens Axboe wrote:
>> On 5/22/22 5:45 AM, Christoph Hellwig wrote:
>>> On Sun, May 22, 2022 at 12:15:59PM +0100, Matthew Wilcox wrote:
>>>>> 	Direct kernel pointer, surely?  And from a quick look,
>>>>> iov_iter_is_kaddr() checks for the wrong value...
>>>>
>>>> Indeed.  I didn't test it; it was a quick patch to see if the idea was
>>>> worth pursuing.  Neither you nor Christoph thought so at the time, so
>>>> I dropped it.  if there are performance improvements to be had from
>>>> doing something like that, it's a more compelling idea than just "Hey,
>>>> this removes a few lines of code and a bit of stack space from every
>>>> caller".
>>>
>>> Oh, right I actually misremembered what the series did.  But something
>>> similar except for user pointers might help with the performance issues
>>> that Jens sees, and if it does it might be worth it to avoid having
>>> both the legacy read/write path and the iter path in various drivers.
>>
>> Right, ITER_UADDR or something would useful. I'll try and test that,
>> should be easy to wire up.
> 
> Careful - it's not just iov_iter_advance() and __iterate_and_advance() (that
> one should use the same "callback" argument as iovec case).  /dev/random is
> not the only thing we use read(2) and write(2) on...
> 
> I can cook a patch doing that, just let me get some caffeine into the
> bloodstream first...

Sure, if you want to cook it up. Here's what I did so far, no caffeine
and not even compiled yet :-). It also doesn't do the page copy,
separate advancing, etc. Was going to check all the parts in iov_iter.c
that checks the type and has separate handlers.

The read vs write const and not is not great...


diff --git a/fs/read_write.c b/fs/read_write.c
index e643aec2b0ef..862ec6c549c6 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -389,14 +389,13 @@ EXPORT_SYMBOL(rw_verify_area);
 
 static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 {
-	struct iovec iov = { .iov_base = buf, .iov_len = len };
 	struct kiocb kiocb;
 	struct iov_iter iter;
 	ssize_t ret;
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = (ppos ? *ppos : 0);
-	iov_iter_init(&iter, READ, &iov, 1, len);
+	uaddr_iter_init(&iter, READ, buf, len);
 
 	ret = call_read_iter(filp, &kiocb, &iter);
 	BUG_ON(ret == -EIOCBQUEUED);
@@ -492,14 +491,13 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 
 static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 {
-	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 	struct kiocb kiocb;
 	struct iov_iter iter;
 	ssize_t ret;
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = (ppos ? *ppos : 0);
-	iov_iter_init(&iter, WRITE, &iov, 1, len);
+	uaddr_iter_init(&iter, WRITE, (void __user *) buf, len);
 
 	ret = call_write_iter(filp, &kiocb, &iter);
 	BUG_ON(ret == -EIOCBQUEUED);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 739285fe5a2f..58e20e83745e 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -21,6 +21,7 @@ struct kvec {
 enum iter_type {
 	/* iter types */
 	ITER_IOVEC,
+	ITER_UADDR,
 	ITER_KVEC,
 	ITER_BVEC,
 	ITER_PIPE,
@@ -42,6 +43,7 @@ struct iov_iter {
 	size_t count;
 	union {
 		const struct iovec *iov;
+		void __user *uaddr;
 		const struct kvec *kvec;
 		const struct bio_vec *bvec;
 		struct xarray *xarray;
@@ -75,6 +77,11 @@ static inline bool iter_is_iovec(const struct iov_iter *i)
 	return iov_iter_type(i) == ITER_IOVEC;
 }
 
+static inline bool iter_is_uaddr(const struct iov_iter *i)
+{
+	return iov_iter_type(i) == ITER_UADDR;
+}
+
 static inline bool iov_iter_is_kvec(const struct iov_iter *i)
 {
 	return iov_iter_type(i) == ITER_KVEC;
@@ -241,6 +248,18 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);
 
 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
 
+static inline void uaddr_iter_init(struct iov_iter *iter, int rw,
+				   void __user *uaddr, size_t len)
+{
+	iter->iter_type = ITER_UADDR;
+	iter->nofault = false;
+	iter->data_source = rw;
+	iter->iov_offset = 0;
+	iter->count = len;
+	iter->uaddr = uaddr;
+	iter->nr_segs = 0;
+}
+
 static inline size_t iov_iter_count(const struct iov_iter *i)
 {
 	return i->count;
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 6dd5330f7a99..4239fffd14cb 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -38,6 +38,22 @@
 	n = off;						\
 }
 
+#define iterate_uaddr(i, n, base, len, off, STEP) {		\
+	void __user *base;					\
+	size_t len;						\
+	size_t off = 0;						\
+	size_t skip = i->iov_offset;				\
+	len = min(n, i->count - skip);				\
+	if (likely(len)) {					\
+		base = i->uaddr + skip;				\
+		len -= (STEP);					\
+		off += len;					\
+		skip += len;					\
+	}							\
+	i->iov_offset = skip;					\
+	n = off;						\
+}
+
 #define iterate_bvec(i, n, base, len, off, p, STEP) {		\
 	size_t off = 0;						\
 	unsigned skip = i->iov_offset;				\
@@ -118,6 +134,8 @@ __out:								\
 						iov, (I))	\
 			i->nr_segs -= iov - i->iov;		\
 			i->iov = iov;				\
+		} else if (iter_is_uaddr(i)) {			\
+			iterate_uaddr(i, n, base, len, off, (I)) \
 		} else if (iov_iter_is_bvec(i)) {		\
 			const struct bio_vec *bvec = i->bvec;	\
 			void *base;				\
@@ -662,7 +680,7 @@ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (unlikely(iov_iter_is_pipe(i)))
 		return copy_pipe_to_iter(addr, bytes, i);
-	if (iter_is_iovec(i))
+	if (iter_is_iovec(i) || iter_is_uaddr(i))
 		might_fault();
 	iterate_and_advance(i, bytes, base, len, off,
 		copyout(base, addr + off, len),
@@ -744,7 +762,7 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
 	if (unlikely(iov_iter_is_pipe(i)))
 		return copy_mc_pipe_to_iter(addr, bytes, i);
-	if (iter_is_iovec(i))
+	if (iter_is_iovec(i) || iter_is_uaddr(i))
 		might_fault();
 	__iterate_and_advance(i, bytes, base, len, off,
 		copyout_mc(base, addr + off, len),
@@ -1061,6 +1079,10 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
 	i->iov = iov;
 }
 
+static void iter_uaddr_advance(struct iov_iter *i, size_t size)
+{
+}
+
 void iov_iter_advance(struct iov_iter *i, size_t size)
 {
 	if (unlikely(i->count < size))
@@ -1068,6 +1090,8 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
 		/* iovec and kvec have identical layouts */
 		iov_iter_iovec_advance(i, size);
+	} else if (iter_is_uaddr(i)) {
+		iter_uaddr_advance(i, size);
 	} else if (iov_iter_is_bvec(i)) {
 		iov_iter_bvec_advance(i, size);
 	} else if (iov_iter_is_pipe(i)) {

-- 
Jens Axboe




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux