[RFC PATCH 2/4] iov_iter: Add an iterator-of-iterators

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Add a new I/O iterator type, ITER_ITERLIST, that allows iteration over a
series of I/O iterators, provided the iterators are all the same direction
(all ITER_SOURCE or all ITER_DEST) and none of them are themselves
ITER_ITERLIST (this function is recursive).

To make reversion possible, I've added an 'orig_count' member into the
iov_iter struct so that reversion of an ITER_ITERLIST can know when to
move backwards through the iter list.  It might make more sense to make the
iterator list element, say:

	struct itervec {
		struct iov_iter iter;
		size_t orig_count;
	};

rather than expanding struct iov_iter itself, and have iov_iter_iterlist()
set vec[i].orig_count from vec[i].iter.count.

Also, for the moment, I've only permitted its use with source iterators
(eg. sendmsg).

To use this, you allocate an array of iterators and point the list iterator
at it, e.g.:

	struct iov_iterlist iters[3];
	struct msghdr msg;

	iov_iter_bvec(&iters[0].iter, ITER_SOURCE, &head_bv, 1,
		      sizeof(marker) + head->iov_len);
	iov_iter_xarray(&iters[1].iter, ITER_SOURCE, xdr->pages,
			xdr->page_fpos, xdr->page_len);
	iov_iter_kvec(&iters[2].iter, ITER_SOURCE, &tail_kv, 1,
		      tail->iov_len);
	iov_iter_iterlist(&msg.msg_iter, ITER_SOURCE, iters, 3, size);

This can be used by network filesystem protocols, such as sunrpc, to glue a
header and a trailer on to some data to form a message and then dump the
entire message onto the socket in a single go.

[!] Note: I'm not entirely sure that this is a good idea: the problem is
    that it's reasonably common practice to copy an iterator by direct
    assignment - and that works for the existing iterators... but not this
    one.  With the iterator-of-iterators, the list of iterators has to be
    modified if we recurse.  It's probably fine just for calling sendmsg()
    from network filesystems, but I'm not 100% sure of that.

Suggested-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
---
 include/linux/uio.h |  15 +++++
 lib/iov_iter.c      | 158 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 172 insertions(+), 1 deletion(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 8ada84e85447..59a586333e1b 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -29,6 +29,7 @@ enum iter_type {
 	ITER_FOLIOQ,
 	ITER_XARRAY,
 	ITER_DISCARD,
+	ITER_ITERLIST,
 };
 
 #define ITER_SOURCE	1	// == WRITE
@@ -71,6 +72,7 @@ struct iov_iter {
 				const struct folio_queue *folioq;
 				struct xarray *xarray;
 				void __user *ubuf;
+				struct iov_iterlist *iterlist;
 			};
 			size_t count;
 		};
@@ -82,6 +84,11 @@ struct iov_iter {
 	};
 };
 
+struct iov_iterlist {
+	struct iov_iter	iter;		/* Sub-iterator for this list element */
+	size_t		orig_count;	/* iter.count as recorded by iov_iter_iterlist() */
+};
+
 typedef __u16 uio_meta_flags_t;
 
 struct uio_meta {
@@ -149,6 +156,11 @@ static inline bool iov_iter_is_xarray(const struct iov_iter *i)
 	return iov_iter_type(i) == ITER_XARRAY;
 }
 
+static inline bool iov_iter_is_iterlist(const struct iov_iter *i)
+{
+	return iov_iter_type(i) == ITER_ITERLIST; /* i.e. an iterator-of-iterators */
+}
+
 static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 {
 	return i->data_source ? WRITE : READ;
@@ -302,6 +314,9 @@ void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
 			  unsigned int first_slot, unsigned int offset, size_t count);
 void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
 		     loff_t start, size_t count);
+void iov_iter_iterlist(struct iov_iter *i, unsigned int direction,
+		       struct iov_iterlist *iterlist, unsigned long nr_segs,
+		       size_t count);
 ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
 			size_t maxsize, unsigned maxpages, size_t *start);
 ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 33a8746e593e..1d9190abfeb5 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -578,6 +578,19 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 		iov_iter_folioq_advance(i, size);
 	} else if (iov_iter_is_discard(i)) {
 		i->count -= size;
+	} else if (iov_iter_is_iterlist(i)) {
+		i->count -= size;
+		for (;;) {
+			size_t part = umin(size, i->iterlist->iter.count);
+
+			if (part > 0)
+				iov_iter_advance(&i->iterlist->iter, part);
+			size -= part;
+			if (!size)
+				break;
+			i->iterlist++;
+			i->nr_segs--;
+		}
 	}
 }
 EXPORT_SYMBOL(iov_iter_advance);
@@ -608,6 +621,35 @@ static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll)
 	i->folioq = folioq;
 }
 
+/*
+ * Wind an ITER_ITERLIST iterator back by @unroll bytes.  Each list element is
+ * reverted by at most the amount consumed from it (orig_count - iter.count),
+ * stepping back to the preceding element while more remains to be undone.
+ * The caller, iov_iter_revert(), has already added @unroll back to i->count.
+ */
+static void iov_iter_revert_iterlist(struct iov_iter *i, size_t unroll)
+{
+	while (unroll) {
+		struct iov_iterlist *il = i->iterlist;
+		size_t part = 0;
+
+		/*
+		 * If nr_segs is 0, the list was consumed entirely and ->iterlist
+		 * points one past the last element, so there's nothing to read
+		 * there; just step back onto the last element.
+		 */
+		if (i->nr_segs)
+			part = umin(unroll, il->orig_count - il->iter.count);
+		if (part > 0)
+			iov_iter_revert(&il->iter, part);
+		unroll -= part;
+		if (!unroll)
+			break;
+		i->iterlist--;
+		i->nr_segs++;
+	}
+}
+
 void iov_iter_revert(struct iov_iter *i, size_t unroll)
 {
 	if (!unroll)
@@ -617,6 +647,8 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
 	i->count += unroll;
 	if (unlikely(iov_iter_is_discard(i)))
 		return;
+	if (unlikely(iov_iter_is_iterlist(i)))
+		return iov_iter_revert_iterlist(i, unroll);
 	if (unroll <= i->iov_offset) {
 		i->iov_offset -= unroll;
 		return;
@@ -663,6 +695,8 @@ EXPORT_SYMBOL(iov_iter_revert);
  */
 size_t iov_iter_single_seg_count(const struct iov_iter *i)
 {
+	if (iov_iter_is_iterlist(i))
+		i = &i->iterlist->iter;
 	if (i->nr_segs > 1) {
 		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
 			return min(i->count, iter_iov(i)->iov_len - i->iov_offset);
@@ -787,6 +821,41 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
 }
 EXPORT_SYMBOL(iov_iter_discard);
 
+/**
+ * iov_iter_iterlist - Initialise an I/O iterator that is a list of iterators
+ * @iter: The iterator to initialise.
+ * @direction: The direction of the transfer; anything but WRITE is a BUG.
+ * @iterlist: Array of sub-iterators; each entry's orig_count is set here.
+ * @nr_segs: The number of elements in @iterlist.
+ * @count: The size of the I/O buffer in bytes; stored as given.
+ *
+ * Point @iter at an array of other iterators to be walked in order.  Only
+ * source (WRITE) iteration is supported, every element must be a source too,
+ * and no element may itself be ITER_ITERLIST, which rules out recursion.
+ */
+void iov_iter_iterlist(struct iov_iter *iter, unsigned int direction,
+		       struct iov_iterlist *iterlist, unsigned long nr_segs,
+		       size_t count)
+{
+	struct iov_iterlist *il = iterlist, *end = iterlist + nr_segs;
+
+	BUG_ON(direction != WRITE);
+	for (; il < end; il++) {
+		BUG_ON(il->iter.iter_type == ITER_ITERLIST);
+		BUG_ON(il->iter.data_source != direction);
+		il->orig_count = il->iter.count;	/* baseline for revert */
+	}
+
+	*iter = (struct iov_iter){
+		.iter_type	= ITER_ITERLIST,
+		.data_source	= true,
+		.count		= count,
+		.iterlist	= iterlist,
+		.nr_segs	= nr_segs,
+	};
+}
+EXPORT_SYMBOL(iov_iter_iterlist);
+
 static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
 				   unsigned len_mask)
 {
@@ -947,6 +1016,15 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
 	if (iov_iter_is_xarray(i))
 		return (i->xarray_start + i->iov_offset) | i->count;
 
+	if (iov_iter_is_iterlist(i)) {
+		unsigned long align = 0;
+		unsigned int j;
+
+		for (j = 0; j < i->nr_segs; j++)
+			align |= iov_iter_alignment(&i->iterlist[j].iter);
+		return align;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL(iov_iter_alignment);
@@ -1206,6 +1284,18 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
 		return iter_folioq_get_pages(i, pages, maxsize, maxpages, start);
 	if (iov_iter_is_xarray(i))
 		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
+	if (iov_iter_is_iterlist(i)) {
+		ssize_t size;
+
+		while (!i->iterlist->iter.count) {
+			i->iterlist++;
+			i->nr_segs--;
+		}
+		size = __iov_iter_get_pages_alloc(&i->iterlist->iter,
+						  pages, maxsize, maxpages, start);
+		i->count -= size;
+		return size;
+	}
 	return -EFAULT;
 }
 
@@ -1274,6 +1364,34 @@ static int bvec_npages(const struct iov_iter *i, int maxpages)
 	return npages;
 }
 
+/*
+ * Estimate how many pages the remaining sub-iterators of an ITER_ITERLIST
+ * span, capped at @maxpages.
+ *
+ * The walk stops once i->count bytes have been accounted for or the list runs
+ * out, whichever comes first.  i->count is supplied separately from the
+ * sub-iterators' counts, so it may not be exactly equal to their sum; the
+ * running total is therefore signed and tested with "> 0", and the walk is
+ * bounded by nr_segs so it cannot run off the end of the array.  A final
+ * element that extends past i->count is counted in full, so the result may be
+ * an overestimate.
+ */
+static int iterlist_npages(const struct iov_iter *i, int maxpages)
+{
+	const struct iov_iterlist *p = i->iterlist;
+	unsigned long nr = i->nr_segs;
+	ssize_t size = i->count;
+	int npages = 0;
+
+	for (; nr && size > 0; p++, nr--) {
+		size -= p->iter.count;
+		npages += iov_iter_npages(&p->iter, maxpages - npages);
+		if (unlikely(npages >= maxpages))
+			return maxpages;
+	}
+	return npages;
+}
+
 int iov_iter_npages(const struct iov_iter *i, int maxpages)
 {
 	if (unlikely(!i->count))
@@ -1298,6 +1403,8 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
 		return min(npages, maxpages);
 	}
+	if (iov_iter_is_iterlist(i))
+		return iterlist_npages(i, maxpages);
 	return 0;
 }
 EXPORT_SYMBOL(iov_iter_npages);
@@ -1309,11 +1416,14 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
 		return new->bvec = kmemdup(new->bvec,
 				    new->nr_segs * sizeof(struct bio_vec),
 				    flags);
-	else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
+	if (iov_iter_is_kvec(new) || iter_is_iovec(new))
 		/* iovec and kvec have identical layout */
 		return new->__iov = kmemdup(new->__iov,
 				   new->nr_segs * sizeof(struct iovec),
 				   flags);
+	if (WARN_ON_ONCE(iov_iter_is_iterlist(old)))
+		/* Don't allow dup'ing of iterlist as the cleanup is complicated */
+		return NULL;
 	return NULL;
 }
 EXPORT_SYMBOL(dup_iter);
@@ -1924,6 +2034,23 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i,
 		return iov_iter_extract_xarray_pages(i, pages, maxsize,
 						     maxpages, extraction_flags,
 						     offset0);
+	if (iov_iter_is_iterlist(i)) {
+		ssize_t size;
+
+		while (i->nr_segs && !i->iterlist->iter.count) {
+			i->iterlist++;
+			i->nr_segs--;
+		}
+		if (!i->nr_segs) {
+			WARN_ON_ONCE(i->count);
+			return 0;
+		}
+		size = iov_iter_extract_pages(&i->iterlist->iter,
+					      pages, maxsize, maxpages,
+					      extraction_flags, offset0);
+		i->count -= size;
+		return size;
+	}
 	return -EFAULT;
 }
 EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
@@ -1994,6 +2121,43 @@ size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv
 	return progress;
 }
 
+/*
+ * Handle iteration over ITER_ITERLIST.
+ *
+ * Feed up to @len bytes through the sub-iterators in list order by calling
+ * iterate_and_advance2() on each.  Move on to the next element only once the
+ * current one is empty; if it still has data left after the call, stop there.
+ * The walk never goes beyond the nr_segs elements that remain, even if @len
+ * has not been used up.  Returns the number of bytes processed, having
+ * advanced ->iterlist, ->nr_segs and ->count to match.
+ */
+static size_t iterate_iterlist(struct iov_iter *iter, size_t len, void *priv, void *priv2,
+			       iov_ustep_f ustep, iov_step_f step)
+{
+	struct iov_iterlist *p = iter->iterlist;
+	struct iov_iterlist *end = p + iter->nr_segs;
+	size_t progress = 0;
+
+	while (p < end) {
+		size_t consumed;
+
+		consumed = iterate_and_advance2(&p->iter, len, priv, priv2, ustep, step);
+
+		len -= consumed;
+		progress += consumed;
+		if (p->iter.count)
+			break;
+		p++;
+		if (!len)
+			break;
+	}
+
+	iter->nr_segs -= p - iter->iterlist;
+	iter->iterlist = p;
+	iter->count -= progress;
+	return progress;
+}
+
 /*
  * Out of line iteration for iterator types that don't need such fast handling.
  */
@@ -2004,6 +2158,8 @@ size_t __iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
 		return iterate_discard(iter, len, priv, priv2, step);
 	if (iov_iter_is_xarray(iter))
 		return iterate_xarray(iter, len, priv, priv2, step);
+	if (iov_iter_is_iterlist(iter))
+		return iterate_iterlist(iter, len, priv, priv2, ustep, step);
 	WARN_ON(1);
 	return 0;
 }





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux