On Tue, Jun 28, 2022 at 07:56:27PM +0100, Pavel Begunkov wrote: > Add an bvec specialised and optimised path in zerocopy_sg_from_iter. > It'll be used later for {get,put}_page() optimisations. > > Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx> > --- > net/core/datagram.c | 47 +++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 47 insertions(+) > Rather than propagating iter functions, I have been using the attached patch for a few months now. It leverages your ubuf_info in msghdr to allow in-kernel users to pass in their own iter handler.
>From 1101177acb64832df2bb2b44d9305a8ebc4ca648 Mon Sep 17 00:00:00 2001 From: David Ahern <dsahern@xxxxxxxxxx> Date: Tue, 19 Apr 2022 10:39:59 -0600 Subject: [PATCH] net: Allow custom iter handler in uarg Add support for custom iov_iter handling to ubuf. The idea is that in-kernel subsystems want control over how an SG is split. The custom iterator is a union with mmpin to keep the size of ubuf_info <= sizeof(skb->cb) which is 48B. Signed-off-by: David Ahern <dsahern@xxxxxxxxxx> --- include/linux/skbuff.h | 21 ++++++++++++++++----- net/core/datagram.c | 11 ++++++++--- net/core/datagram.h | 3 ++- net/core/skbuff.c | 19 +++++++++++++++---- net/ipv4/ip_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- 6 files changed, 43 insertions(+), 15 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dbf820a50a39..71161f65dedd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -482,11 +482,21 @@ struct ubuf_info { }; refcount_t refcnt; u8 flags; + u8 has_sg_from_iter; - struct mmpin { - struct user_struct *user; - unsigned int num_pg; - } mmp; + /* sg_from_iter is expected to be used with ubuf in + * msghdr and is only referenced at the transport + * layer segmenting an iov into packets. mmpin is used + * by in-tree ubuf_info {re,}alloc at L3 layer. 
+ */ + union { + int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length); + struct mmpin { + struct user_struct *user; + unsigned int num_pg; + } mmp; + }; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) @@ -503,7 +513,8 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, bool success); -int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len); +int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len, + struct ubuf_info *uarg); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg); diff --git a/net/core/datagram.c b/net/core/datagram.c index 15ab9ffb27fe..9ca61a0a400d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -617,10 +617,15 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, EXPORT_SYMBOL(skb_copy_datagram_from_iter); int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, - struct iov_iter *from, size_t length) + struct iov_iter *from, size_t length, + struct ubuf_info *uarg) { - int frag = skb_shinfo(skb)->nr_frags; + int frag; + if (unlikely(uarg && uarg->has_sg_from_iter)) + return uarg->sg_from_iter(sk, skb, from, length); + + frag = skb_shinfo(skb)->nr_frags; while (length && iov_iter_count(from)) { struct page *pages[MAX_SKB_FRAGS]; struct page *last_head = NULL; @@ -704,7 +709,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - return __zerocopy_sg_from_iter(NULL, skb, from, ~0U); + return __zerocopy_sg_from_iter(NULL, skb, from, ~0U, NULL); } EXPORT_SYMBOL(zerocopy_sg_from_iter); diff --git a/net/core/datagram.h b/net/core/datagram.h index bcfb75bfa3b2..65027fcf3322 100644 --- a/net/core/datagram.h +++ b/net/core/datagram.h @@ -10,6 +10,7 @@ struct 
sk_buff; struct iov_iter; int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, - struct iov_iter *from, size_t length); + struct iov_iter *from, size_t length, + struct ubuf_info *uarg); #endif /* _NET_CORE_DATAGRAM_H_ */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 17b93177a68f..9acb43e5a779 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1158,6 +1158,7 @@ struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); uarg = (void *)skb->cb; + uarg->has_sg_from_iter = 0; uarg->mmp.user = NULL; if (mm_account_pinned_pages(&uarg->mmp, size)) { @@ -1206,6 +1207,12 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, return NULL; } + if (WARN_ON(uarg->has_sg_from_iter)) { + uarg->has_sg_from_iter = 0; + uarg->mmp.user = NULL; + uarg->mmp.num_pg = 0; + } + next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg->id + uarg->len) == next) { if (mm_account_pinned_pages(&uarg->mmp, size)) @@ -1258,7 +1265,10 @@ static void __msg_zerocopy_callback(struct ubuf_info *uarg) u32 lo, hi; u16 len; - mm_unaccount_pinned_pages(&uarg->mmp); + + WARN_ON(uarg->has_sg_from_iter); + if (!uarg->has_sg_from_iter) + mm_unaccount_pinned_pages(&uarg->mmp); /* if !len, there was only 1 call, and it was aborted * so do not queue a completion notification @@ -1319,9 +1329,10 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) } EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); -int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) +int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len, + struct ubuf_info *uarg) { - return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len); + return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len, uarg); } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram); @@ -1339,7 +1350,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, if (orig_uarg && uarg != orig_uarg) return 
-EEXIST; - err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len); + err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len, uarg); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1b6a64b19c76..1ff403c2dcb0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1238,7 +1238,7 @@ static int __ip_append_data(struct sock *sk, skb->truesize += copy; wmem_alloc_delta += copy; } else { - err = skb_zerocopy_iter_dgram(skb, from, copy); + err = skb_zerocopy_iter_dgram(skb, from, copy, uarg); if (err < 0) goto error; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 63a217128f8b..6795144653ac 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1791,7 +1791,7 @@ static int __ip6_append_data(struct sock *sk, skb->truesize += copy; wmem_alloc_delta += copy; } else { - err = skb_zerocopy_iter_dgram(skb, from, copy); + err = skb_zerocopy_iter_dgram(skb, from, copy, uarg); if (err < 0) goto error; } -- 2.25.1