David Howells wrote: > Make IP/UDP sendmsg() support MSG_SPLICE_PAGES. This causes pages to be > spliced from the source iterator. > > This allows ->sendpage() to be replaced by something that can handle > multiple multipage folios in a single transaction. > > Signed-off-by: David Howells <dhowells@xxxxxxxxxx> > cc: Willem de Bruijn <willemdebruijn.kernel@xxxxxxxxx> > cc: "David S. Miller" <davem@xxxxxxxxxxxxx> > cc: Eric Dumazet <edumazet@xxxxxxxxxx> > cc: Jakub Kicinski <kuba@xxxxxxxxxx> > cc: Paolo Abeni <pabeni@xxxxxxxxxx> > cc: Jens Axboe <axboe@xxxxxxxxx> > cc: Matthew Wilcox <willy@xxxxxxxxxxxxx> > cc: netdev@xxxxxxxxxxxxxxx > --- > net/ipv4/ip_output.c | 102 +++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 99 insertions(+), 3 deletions(-) > > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c > index 4e4e308c3230..e2eaba817c1f 100644 > --- a/net/ipv4/ip_output.c > +++ b/net/ipv4/ip_output.c > @@ -956,6 +956,79 @@ csum_page(struct page *page, int offset, int copy) > return csum; > } > > +/* > + * Allocate a packet for MSG_SPLICE_PAGES. > + */ > +static int __ip_splice_alloc(struct sock *sk, struct sk_buff **pskb, > + unsigned int fragheaderlen, unsigned int maxfraglen, > + unsigned int hh_len) > +{ > + struct sk_buff *skb_prev = *pskb, *skb; > + unsigned int fraggap = skb_prev->len - maxfraglen; > + unsigned int alloclen = fragheaderlen + hh_len + fraggap + 15; > + > + skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); > + if (unlikely(!skb)) > + return -ENOBUFS; > + > + /* Fill in the control structures */ > + skb->ip_summed = CHECKSUM_NONE; > + skb->csum = 0; > + skb_reserve(skb, hh_len); > + > + /* Find where to start putting bytes. */ > + skb_put(skb, fragheaderlen + fraggap); > + skb_reset_network_header(skb); > + skb->transport_header = skb->network_header + fragheaderlen; > + if (fraggap) { > + skb->csum = skb_copy_and_csum_bits(skb_prev, maxfraglen, > + skb_transport_header(skb), > + fraggap); > + skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); > + pskb_trim_unique(skb_prev, maxfraglen); > + } > + > + /* Put the packet on the pending queue. */ > + __skb_queue_tail(&sk->sk_write_queue, skb); > + *pskb = skb; > + return 0; > +} > + > +/* > + * Add (or copy) data pages for MSG_SPLICE_PAGES. > + */ > +static int __ip_splice_pages(struct sock *sk, struct sk_buff *skb, > + void *from, int *pcopy) > +{ > + struct msghdr *msg = from; > + struct page *page = NULL, **pages = &page; > + ssize_t copy = *pcopy; > + size_t off; > + int err; > + > + copy = iov_iter_extract_pages(&msg->msg_iter, &pages, copy, 1, 0, &off); > + if (copy <= 0) > + return copy ?: -EIO; > + > + err = skb_append_pagefrags(skb, page, off, copy); > + if (err < 0) { > + iov_iter_revert(&msg->msg_iter, copy); > + return err; > + } > + > + if (skb->ip_summed == CHECKSUM_NONE) { > + __wsum csum; > + > + csum = csum_page(page, off, copy); > + skb->csum = csum_block_add(skb->csum, csum, skb->len); > + } > + > + skb_len_add(skb, copy); > + refcount_add(copy, &sk->sk_wmem_alloc); > + *pcopy = copy; > + return 0; > +} These functions are derived from and replace ip_append_page. That can be removed once udp_sendpage is converted? > > static int __ip_append_data(struct sock *sk, > struct flowi4 *fl4, > struct sk_buff_head *queue, > @@ -977,7 +1050,7 @@ static int __ip_append_data(struct sock *sk, > int err; > int offset = 0; > bool zc = false; > - unsigned int maxfraglen, fragheaderlen, maxnonfragsize; > + unsigned int maxfraglen, fragheaderlen, maxnonfragsize, initial_length; > int csummode = CHECKSUM_NONE; > struct rtable *rt = (struct rtable *)cork->dst; > unsigned int wmem_alloc_delta = 0; > @@ -1017,6 +1090,7 @@ static int __ip_append_data(struct sock *sk, > (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) > csummode = CHECKSUM_PARTIAL; > > + initial_length = length; > if ((flags & MSG_ZEROCOPY) && length) { > struct msghdr *msg = from; > > @@ -1047,6 +1121,14 @@ static int __ip_append_data(struct sock *sk, > skb_zcopy_set(skb, uarg, &extra_uref); > } > } > + } else if ((flags & MSG_SPLICE_PAGES) && length) { > + if (inet->hdrincl) > + return -EPERM; > + if (rt->dst.dev->features & NETIF_F_SG) > + /* We need an empty buffer to attach stuff to */ > + initial_length = transhdrlen; I still don't entirely understand what initial_length means. More importantly, transhdrlen can be zero. If not called for UDP but for RAW. Or if this is a subsequent call to a packet that is being held with MSG_MORE. This works fine for existing use-cases, which go to alloc_new_skb. Not sure how this case would be different. But the comment alludes that it does. > + else > + flags &= ~MSG_SPLICE_PAGES; > } > > cork->length += length; > @@ -1074,6 +1156,16 @@ static int __ip_append_data(struct sock *sk, > unsigned int alloclen, alloc_extra; > unsigned int pagedlen; > struct sk_buff *skb_prev; > + > + if (unlikely(flags & MSG_SPLICE_PAGES)) { > + err = __ip_splice_alloc(sk, &skb, fragheaderlen, > + maxfraglen, hh_len); > + if (err < 0) > + goto error; > + continue; > + } > + initial_length = length; > + > alloc_new_skb: > skb_prev = skb; > if (skb_prev) > @@ -1085,7 +1177,7 @@ static int __ip_append_data(struct sock *sk, > * If remaining data exceeds the mtu, > * we know we need more fragment(s). > */ > - datalen = length + fraggap; > + datalen = initial_length + fraggap; > if (datalen > mtu - fragheaderlen) > datalen = maxfraglen - fragheaderlen; > fraglen = datalen + fragheaderlen; > @@ -1099,7 +1191,7 @@ static int __ip_append_data(struct sock *sk, > * because we have no idea what fragment will be > * the last. > */ > - if (datalen == length + fraggap) > + if (datalen == initial_length + fraggap) > alloc_extra += rt->dst.trailer_len; > > if ((flags & MSG_MORE) && > @@ -1206,6 +1298,10 @@ static int __ip_append_data(struct sock *sk, > err = -EFAULT; > goto error; > } > + } else if (flags & MSG_SPLICE_PAGES) { > + err = __ip_splice_pages(sk, skb, from, ©); > + if (err < 0) > + goto error; > } else if (!zc) { > int i = skb_shinfo(skb)->nr_frags; > >