From: Willem de Bruijn <willemb@xxxxxxxxxx> Add MSG_ZEROCOPY support to INET(6). This includes UDP, but also RAW sockets that do not take the raw_send_hdrinc() path. Zerocopy is only effective when payload is not touched at all. Limit it to paths that support both checksum offload and scatter-gather. When a caller passes MSG_ZEROCOPY to send and it returns a positive result, the caller must always receive a completion notification. Therefore, attach the structure even when zerocopy is not possible. Also in edge cases, such as corking with mixed zc/non-zc calls. Tested: msg_zerocopy.sh 4 udp: without zerocopy tx=146127 (9118 MB) txc=0 zc=n rx=146127 (9118 MB) with zerocopy tx=335789 (20954 MB) txc=335789 zc=y rx=335789 (20954 MB) msg_zerocopy.sh 4 raw: without zerocopy tx=106461 (6643 MB) txc=0 zc=n rx=106461 (6643 MB) with zerocopy tx=296082 (18476 MB) txc=296082 zc=y rx=296082 (18476 MB) Signed-off-by: Willem de Bruijn <willemb@xxxxxxxxxx> --- net/core/skbuff.c | 4 ++++ net/ipv4/ip_output.c | 37 ++++++++++++++++++++++++++++++------- net/ipv6/ip6_output.c | 40 +++++++++++++++++++++++++++++++++------- 3 files changed, 67 insertions(+), 14 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0beaf961f79c..7d4c12316df6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1145,6 +1145,10 @@ extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, int skb_zerocopy_iter(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len) { + /* raw has extra indirection in raw_frag_vec */ + if (sk->sk_type == SOCK_RAW && sk->sk_family != PF_PACKET) + msg = *(struct msghdr **)msg; + return __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len); } EXPORT_SYMBOL_GPL(skb_zerocopy_iter); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7a3fd25e8913..3ff425f7ded6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -919,7 +919,7 @@ static int __ip_append_data(struct sock *sk, { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; - + struct ubuf_info *uarg = NULL; struct ip_options *opt = cork->opt; int hh_len; int exthdrlen; @@ -963,9 +963,21 @@ static int __ip_append_data(struct sock *sk, !exthdrlen) csummode = CHECKSUM_PARTIAL; + if (flags & MSG_ZEROCOPY && length) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + + if (!(rt->dst.dev->features & NETIF_F_SG) || + (sk->sk_type == SOCK_DGRAM && csummode == CHECKSUM_NONE)) { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg); + } + } + cork->length += length; if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && - (sk->sk_protocol == IPPROTO_UDP) && + (sk->sk_protocol == IPPROTO_UDP) && !uarg && (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, @@ -997,6 +1009,7 @@ static int __ip_append_data(struct sock *sk, unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; + unsigned int zcopylen = 0; struct sk_buff *skb_prev; alloc_new_skb: skb_prev = skb; @@ -1017,8 +1030,12 @@ static int __ip_append_data(struct sock *sk, if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; - else + else if (!uarg || !uarg->zerocopy) alloclen = fraglen; + else { + alloclen = min_t(int, fraglen, MAX_HEADER); + zcopylen = fraglen - alloclen; + } alloclen += exthdrlen; @@ -1059,11 +1076,12 @@ static int __ip_append_data(struct sock *sk, cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; + skb_zcopy_set(skb, uarg); /* * Find where to start putting bytes. */ - data = skb_put(skb, fraglen + exthdrlen); + data = skb_put(skb, fraglen + exthdrlen - zcopylen); skb_set_network_header(skb, exthdrlen); skb->transport_header = (skb->network_header + fragheaderlen); @@ -1079,7 +1097,7 @@ static int __ip_append_data(struct sock *sk, pskb_trim_unique(skb_prev, maxfraglen); } - copy = datalen - transhdrlen - fraggap; + copy = datalen - transhdrlen - fraggap - zcopylen; if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); @@ -1087,7 +1105,7 @@ static int __ip_append_data(struct sock *sk, } offset += copy; - length -= datalen - fraggap; + length -= copy + transhdrlen; transhdrlen = 0; exthdrlen = 0; csummode = CHECKSUM_NONE; @@ -1115,7 +1133,7 @@ static int __ip_append_data(struct sock *sk, err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1145,6 +1163,10 @@ static int __ip_append_data(struct sock *sk, skb->data_len += copy; skb->truesize += copy; atomic_add(copy, &sk->sk_wmem_alloc); + } else { + err = skb_zerocopy_iter(sk, skb, from, copy); + if (err) + goto error; } offset += copy; length -= copy; @@ -1155,6 +1177,7 @@ static int __ip_append_data(struct sock *sk, error_efault: err = -EFAULT; error: + sock_zerocopy_put_abort(uarg); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5baa6fab4b97..38d9722d4e3c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1307,6 +1307,7 @@ static int __ip6_append_data(struct sock *sk, struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; + struct ubuf_info *uarg = NULL; skb = skb_peek_tail(queue); if (!skb) { @@ -1368,6 +1369,18 @@ static int __ip6_append_data(struct sock *sk, tskey = sk->sk_tskey++; } + if (flags & MSG_ZEROCOPY && length) { + uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); + if (!uarg) + return -ENOBUFS; + + if (!(rt->dst.dev->features & NETIF_F_SG) || + (sk->sk_type == SOCK_DGRAM && csummode == CHECKSUM_NONE)) { + uarg->zerocopy = 0; + skb_zcopy_set(skb, uarg); + } + } + /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1387,7 +1400,7 @@ static int __ip6_append_data(struct sock *sk, cork->length += length; if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && - (sk->sk_protocol == IPPROTO_UDP) && + (sk->sk_protocol == IPPROTO_UDP) && !uarg && (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, @@ -1413,6 +1426,7 @@ static int __ip6_append_data(struct sock *sk, unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; + unsigned int zcopylen = 0; alloc_new_skb: /* There's no room in the current skb */ if (skb) @@ -1435,11 +1449,17 @@ static int __ip6_append_data(struct sock *sk, if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; + fraglen = datalen + fragheaderlen; + if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; - else - alloclen = datalen + fragheaderlen; + else if (!uarg || !uarg->zerocopy) + alloclen = fraglen; + else { + alloclen = min_t(int, fraglen, MAX_HEADER); + zcopylen = fraglen - alloclen; + } alloclen += dst_exthdrlen; @@ -1461,7 +1481,7 @@ static int __ip6_append_data(struct sock *sk, */ alloclen += sizeof(struct frag_hdr); - copy = datalen - transhdrlen - fraggap; + copy = datalen - transhdrlen - fraggap - zcopylen; if (copy < 0) { err = -EINVAL; goto error; @@ -1497,11 +1517,12 @@ static int __ip6_append_data(struct sock *sk, tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; + skb_zcopy_set(skb, uarg); /* * Find where to start putting bytes */ - data = skb_put(skb, fraglen); + data = skb_put(skb, fraglen - zcopylen); skb_set_network_header(skb, exthdrlen); data += fragheaderlen; skb->transport_header = (skb->network_header + @@ -1524,7 +1545,7 @@ static int __ip6_append_data(struct sock *sk, } offset += copy; - length -= datalen - fraggap; + length -= copy + transhdrlen; transhdrlen = 0; exthdrlen = 0; dst_exthdrlen = 0; @@ -1552,7 +1573,7 @@ static int __ip6_append_data(struct sock *sk, err = -EFAULT; goto error; } - } else { + } else if (!uarg || !uarg->zerocopy) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; @@ -1582,6 +1603,10 @@ static int __ip6_append_data(struct sock *sk, skb->data_len += copy; skb->truesize += copy; atomic_add(copy, &sk->sk_wmem_alloc); + } else { + err = skb_zerocopy_iter(sk, skb, from, copy); + if (err) + goto error; } offset += copy; length -= copy; @@ -1592,6 +1617,7 @@ static int __ip6_append_data(struct sock *sk, error_efault: err = -EFAULT; error: + sock_zerocopy_put_abort(uarg); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); return err; -- 2.13.1.611.g7e3b11ae1-goog -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html