David Howells wrote: > Make IP/UDP sendmsg() support MSG_SPLICE_PAGES. This causes pages to be > spliced from the source iterator if possible (the iterator must be > ITER_BVEC and the pages must be spliceable). > > This allows ->sendpage() to be replaced by something that can handle > multiple multipage folios in a single transaction. > > Signed-off-by: David Howells <dhowells@xxxxxxxxxx> > cc: Willem de Bruijn <willemdebruijn.kernel@xxxxxxxxx> > cc: "David S. Miller" <davem@xxxxxxxxxxxxx> > cc: Eric Dumazet <edumazet@xxxxxxxxxx> > cc: Jakub Kicinski <kuba@xxxxxxxxxx> > cc: Paolo Abeni <pabeni@xxxxxxxxxx> > cc: Jens Axboe <axboe@xxxxxxxxx> > cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> > cc: netdev@xxxxxxxxxxxxxxx > --- > net/ipv4/ip_output.c | 85 +++++++++++++++++++++++++++++++++++++++++--- > 1 file changed, 81 insertions(+), 4 deletions(-) > > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c A non-RFC version would require the same for ipv6, of course. > index 4e4e308c3230..07736da70eab 100644 > --- a/net/ipv4/ip_output.c > +++ b/net/ipv4/ip_output.c > @@ -973,11 +973,11 @@ static int __ip_append_data(struct sock *sk, > int hh_len; > int exthdrlen; > int mtu; > - int copy; > + ssize_t copy; > int err; > int offset = 0; > bool zc = false; > - unsigned int maxfraglen, fragheaderlen, maxnonfragsize; > + unsigned int maxfraglen, fragheaderlen, maxnonfragsize, xlength; Does x here stand for anything? > int csummode = CHECKSUM_NONE; > struct rtable *rt = (struct rtable *)cork->dst; > unsigned int wmem_alloc_delta = 0; > @@ -1017,6 +1017,7 @@ static int __ip_append_data(struct sock *sk, > (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) > csummode = CHECKSUM_PARTIAL; > > + xlength = length; > if ((flags & MSG_ZEROCOPY) && length) { > struct msghdr *msg = from; > > @@ -1047,6 +1048,14 @@ static int __ip_append_data(struct sock *sk, > skb_zcopy_set(skb, uarg, &extra_uref); > } > } > + } else if ((flags & MSG_SPLICE_PAGES) && length) { > + struct msghdr *msg = from; > + > + if (inet->hdrincl) > + return -EPERM; > + if (!(rt->dst.dev->features & NETIF_F_SG)) > + return -EOPNOTSUPP; > + xlength = transhdrlen; /* We need an empty buffer to attach stuff to */ > } > > cork->length += length; > @@ -1074,6 +1083,50 @@ static int __ip_append_data(struct sock *sk, > unsigned int alloclen, alloc_extra; > unsigned int pagedlen; > struct sk_buff *skb_prev; > + > + if (unlikely(flags & MSG_SPLICE_PAGES)) { > + skb_prev = skb; > + fraggap = skb_prev->len - maxfraglen; > + > + alloclen = fragheaderlen + hh_len + fraggap + 15; > + skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); > + if (unlikely(!skb)) { > + err = -ENOBUFS; > + goto error; > + } > + > + /* > + * Fill in the control structures > + */ > + skb->ip_summed = CHECKSUM_NONE; > + skb->csum = 0; > + skb_reserve(skb, hh_len); > + > + /* > + * Find where to start putting bytes. > + */ > + skb_put(skb, fragheaderlen + fraggap); > + skb_reset_network_header(skb); > + skb->transport_header = (skb->network_header + > + fragheaderlen); > + if (fraggap) { > + skb->csum = skb_copy_and_csum_bits( > + skb_prev, maxfraglen, > + skb_transport_header(skb), > + fraggap); > + skb_prev->csum = csum_sub(skb_prev->csum, > + skb->csum); > + pskb_trim_unique(skb_prev, maxfraglen); > + } > + > + /* > + * Put the packet on the pending queue. > + */ > + __skb_queue_tail(&sk->sk_write_queue, skb); > + continue; > + } > + xlength = length; > + > alloc_new_skb: > skb_prev = skb; > if (skb_prev) > @@ -1085,7 +1138,7 @@ static int __ip_append_data(struct sock *sk, > * If remaining data exceeds the mtu, > * we know we need more fragment(s). > */ > - datalen = length + fraggap; > + datalen = xlength + fraggap; > if (datalen > mtu - fragheaderlen) > datalen = maxfraglen - fragheaderlen; > fraglen = datalen + fragheaderlen; > @@ -1099,7 +1152,7 @@ static int __ip_append_data(struct sock *sk, > * because we have no idea what fragment will be > * the last. > */ > - if (datalen == length + fraggap) > + if (datalen == xlength + fraggap) > alloc_extra += rt->dst.trailer_len; > > if ((flags & MSG_MORE) && > @@ -1206,6 +1259,30 @@ static int __ip_append_data(struct sock *sk, > err = -EFAULT; > goto error; > } > + } else if (flags & MSG_SPLICE_PAGES) { > + struct msghdr *msg = from; > + struct page *page = NULL, **pages = &page; > + size_t off; > + > + copy = iov_iter_extract_pages(&msg->msg_iter, &pages, > + copy, 1, 0, &off); > + if (copy <= 0) { > + err = copy ?: -EIO; > + goto error; > + } > + > + err = skb_append_pagefrags(skb, page, off, copy); > + if (err < 0) > + goto error; > + > + if (skb->ip_summed == CHECKSUM_NONE) { > + __wsum csum; > + csum = csum_page(page, off, copy); > + skb->csum = csum_block_add(skb->csum, csum, skb->len); > + } > + > + skb_len_add(skb, copy); > + refcount_add(copy, &sk->sk_wmem_alloc); > } else if (!zc) { > int i = skb_shinfo(skb)->nr_frags; > > This does add a lot of code to two functions that are already unwieldy. It may be unavoidable, but it if can use helpers, that would be preferable.