On Mon, 2023-05-15 at 10:33 +0100, David Howells wrote: > Add a function to handle MSG_SPLICE_PAGES being passed internally to > sendmsg(). Pages are spliced into the given socket buffer if possible and > copied in if not (e.g. they're slab pages or have a zero refcount). > > Signed-off-by: David Howells <dhowells@xxxxxxxxxx> > cc: Eric Dumazet <edumazet@xxxxxxxxxx> > cc: "David S. Miller" <davem@xxxxxxxxxxxxx> > cc: David Ahern <dsahern@xxxxxxxxxx> > cc: Jakub Kicinski <kuba@xxxxxxxxxx> > cc: Paolo Abeni <pabeni@xxxxxxxxxx> > cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx> > cc: Jens Axboe <axboe@xxxxxxxxx> > cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> > cc: netdev@xxxxxxxxxxxxxxx > --- > > Notes: > ver #7) > - Export function. > - Never copy data, return -EIO if sendpage_ok() returns false. > > include/linux/skbuff.h | 3 ++ > net/core/skbuff.c | 95 ++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 98 insertions(+) > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 4c0ad48e38ca..1c5f0ac6f8c3 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -5097,5 +5097,8 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb) > #endif > } > > +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, > + ssize_t maxsize, gfp_t gfp); > + > #endif /* __KERNEL__ */ > #endif /* _LINUX_SKBUFF_H */ > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 7f53dcb26ad3..56d629ea2f3d 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -6892,3 +6892,98 @@ nodefer: __kfree_skb(skb); > if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) > smp_call_function_single_async(cpu, &sd->defer_csd); > } > + > +static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, > + size_t offset, size_t len) > +{ > + const char *kaddr; > + __wsum csum; > + > + kaddr = kmap_local_page(page); > + csum = csum_partial(kaddr + offset, len, 0); > + kunmap_local(kaddr); > + skb->csum = 
csum_block_add(skb->csum, csum, skb->len); > +} > + > +/** > + * skb_splice_from_iter - Splice (or copy) pages to skbuff > + * @skb: The buffer to add pages to > + * @iter: Iterator representing the pages to be added > + * @maxsize: Maximum amount of pages to be added > + * @gfp: Allocation flags > + * > + * This is a common helper function for supporting MSG_SPLICE_PAGES. It > + * extracts pages from an iterator and adds them to the socket buffer if > + * possible, copying them to fragments if not possible (such as if they're slab > + * pages). > + * > + * Returns the amount of data spliced/copied or -EMSGSIZE if there's > + * insufficient space in the buffer to transfer anything. > + */ > +ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, > + ssize_t maxsize, gfp_t gfp) > +{ > + struct page *pages[8], **ppages = pages; > + unsigned int i; > + ssize_t spliced = 0, ret = 0; > + size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); Minor nit: please respect the reverse x-mas tree order (there are a few other occurrences around) > + > + while (iter->count > 0) { > + ssize_t space, nr; > + size_t off, len; > + > + ret = -EMSGSIZE; > + space = frag_limit - skb_shinfo(skb)->nr_frags; > + if (space < 0) > + break; > + > + /* We might be able to coalesce without increasing nr_frags */ > + nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); > + > + len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); > + if (len <= 0) { > + ret = len ?: -EIO; > + break; > + } > + > + if (space == 0 && > + !skb_can_coalesce(skb, skb_shinfo(skb)->nr_frags, > + pages[0], off)) { > + iov_iter_revert(iter, len); > + break; > + } It looks like the above condition/checks duplicate what the later skb_append_pagefrags() will perform below. I guess the above chunk could be removed? 
> + > + i = 0; > + do { > + struct page *page = pages[i++]; > + size_t part = min_t(size_t, PAGE_SIZE - off, len); > + > + ret = -EIO; > + if (!sendpage_ok(page)) > + goto out; My (limited) understanding is that the current sendpage code assumes that the caller provides/uses pages suitable for such use. The existing sendpage_ok() check is in place as a way to try to catch possible code bugs - via the WARN_ONCE(). I think the same could be done here? Thanks! Paolo > + > + ret = skb_append_pagefrags(skb, page, off, part, > + frag_limit); > + if (ret < 0) { > + iov_iter_revert(iter, len); > + goto out; > + } > + > + if (skb->ip_summed == CHECKSUM_NONE) > + skb_splice_csum_page(skb, page, off, part); > + > + off = 0; > + spliced += part; > + maxsize -= part; > + len -= part; > + } while (len > 0); > + > + if (maxsize <= 0) > + break; > + } > + > +out: > + skb_len_add(skb, spliced); > + return spliced ?: ret; > +} > +EXPORT_SYMBOL(skb_splice_from_iter); >