On 12/18/2014 02:54 AM, Michael S. Tsirkin wrote: > Cc Dave, pls remember to do this next time otherwise > your patches won't get merged :) > > On Wed, Dec 17, 2014 at 06:31:50PM -0500, Vlad Yasevich wrote: >> On 12/17/2014 05:45 PM, Michael S. Tsirkin wrote: >>> On Wed, Dec 17, 2014 at 01:20:46PM -0500, Vladislav Yasevich wrote: >>>> Split IPv6 support for UFO into its own feature similiar to TSO. >>>> This will later allow us to re-enable UFO support for virtio-net >>>> devices. >>>> >>>> Signed-off-by: Vladislav Yasevich <vyasevic@xxxxxxxxxx> >>>> --- >>>> include/linux/netdev_features.h | 7 +++++-- >>>> include/linux/netdevice.h | 1 + >>>> include/linux/skbuff.h | 1 + >>>> net/core/dev.c | 35 +++++++++++++++++++---------------- >>>> net/core/ethtool.c | 2 +- >>>> 5 files changed, 27 insertions(+), 19 deletions(-) >>>> >>>> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h >>>> index dcfdecb..a078945 100644 >>>> --- a/include/linux/netdev_features.h >>>> +++ b/include/linux/netdev_features.h >>>> @@ -48,8 +48,9 @@ enum { >>>> NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ >>>> NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT,/* ... UDP TUNNEL with TSO & CSUM */ >>>> NETIF_F_GSO_MPLS_BIT, /* ... MPLS segmentation */ >>>> + NETIF_F_UFO6_BIT, /* ... UDPv6 fragmentation */ >>>> /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ >>>> - NETIF_F_GSO_MPLS_BIT, >>>> + NETIF_F_UFO6_BIT, >>>> >>>> NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ >>>> NETIF_F_SCTP_CSUM_BIT, /* SCTP checksum offload */ >>>> @@ -109,6 +110,7 @@ enum { >>>> #define NETIF_F_TSO_ECN __NETIF_F(TSO_ECN) >>>> #define NETIF_F_TSO __NETIF_F(TSO) >>>> #define NETIF_F_UFO __NETIF_F(UFO) >>>> +#define NETIF_F_UFO6 __NETIF_F(UFO6) >>>> #define NETIF_F_VLAN_CHALLENGED __NETIF_F(VLAN_CHALLENGED) >>>> #define NETIF_F_RXFCS __NETIF_F(RXFCS) >>>> #define NETIF_F_RXALL __NETIF_F(RXALL) >>>> @@ -141,7 +143,7 @@ enum { >>>> >>>> /* List of features with software fallbacks. */ >>>> #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | \ >>>> - NETIF_F_TSO6 | NETIF_F_UFO) >>>> + NETIF_F_TSO6 | NETIF_F_UFO | NETIF_F_UFO6) >>>> >>>> #define NETIF_F_GEN_CSUM NETIF_F_HW_CSUM >>>> #define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM) >>>> @@ -149,6 +151,7 @@ enum { >>>> #define NETIF_F_ALL_CSUM (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM) >>>> >>>> #define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN) >>>> +#define NETIF_F_ALL_UFO (NETIF_F_UFO | NETIF_F_UFO6) >>>> >>>> #define NETIF_F_ALL_FCOE (NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \ >>>> NETIF_F_FSO) >>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h >>>> index 74fd5d3..86af10a 100644 >>>> --- a/include/linux/netdevice.h >>>> +++ b/include/linux/netdevice.h >>>> @@ -3559,6 +3559,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) >>>> /* check flags correspondence */ >>>> BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT)); >>>> BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_UFO >> NETIF_F_GSO_SHIFT)); >>>> + BUILD_BUG_ON(SKB_GSO_UDP6 != (NETIF_F_UFO6 >> NETIF_F_GSO_SHIFT)); >>>> BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT)); >>>> BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT)); >>>> BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT)); >>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >>>> index 6c8b6f6..8538b67 100644 >>>> --- a/include/linux/skbuff.h >>>> +++ b/include/linux/skbuff.h >>>> @@ -372,6 +372,7 @@ enum { >>>> >>>> SKB_GSO_MPLS = 1 << 12, >>>> >>>> + SKB_GSO_UDP6 = 1 << 13 >>>> }; >>>> >>>> #if BITS_PER_LONG > 32 >>> >>> So this implies anything getting GSO packets e.g. >>> from userspace now needs to check IP version to >>> set GSO type correctly. >>> >>> I think you missed some places that do this, e.g. af_packet >>> sockets. >>> >> >> I looked at af_packet sockets and they set this only in the event >> vnet header has been used with a GSO type. In this case, the user >> already knows the the type. > > Imagine you are receiving a packet: > > if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { > switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { > case VIRTIO_NET_HDR_GSO_TCPV4: > gso_type = SKB_GSO_TCPV4; > break; > case VIRTIO_NET_HDR_GSO_TCPV6: > gso_type = SKB_GSO_TCPV6; > break; > case VIRTIO_NET_HDR_GSO_UDP: > gso_type = SKB_GSO_UDP; > break; > default: > goto out_unlock; > } > > if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) > gso_type |= SKB_GSO_TCP_ECN; > > if (vnet_hdr.gso_size == 0) > goto out_unlock; > > } > > we used to report UFO6 as SKB_GSO_UDP, we probably > should keep doing this, with your patch we select the > goto out_unlock path. > > No. The vnet_hdr.gso_type is still going to be VIRTIO_NET_HDR_GSO_UDP since the UDP6 version isn't defined yet. So, it will be marked as GSO_UDP. This code most likely needs the same workaround as exists in tap and macvtap, i.e select the proxy fragment id and decide what to do with gso_type. > >> It is true that with this series af_packets now can't do IPv6 UFO >> since there is no VIRTIO_NET_HDR_GSO_UDPV6 yet. > > What do you mean by "do". What I mean is that AF_PACKET sockets currently do not do IPv6 UFO correctly, even after Ben's fixes to tap/macvtap. There is no proxy fragment id selection in af_packet case and we have the same problem Ben was trying address for tap/macvtap. > Are we talking about sending or receiving packets? I am talking about sending, see above. > You seem to conflate the two. > > We always discarded ID on RX. > > For tun, this is xmit, so just by saying "this device can > not do UFO" you will never get short packets. > You must mean long packets. This is actually an issue I've been thinking about. With with your suggestion of switching the GSO type for legacy applications we end up with fragments for IPv6 traffic. As a result, legacy VMs will see a regression for large IPv6 datagrams. > >> >> I suppose we could do something similar there as we do in tun code/macvtap code. >> If that's the case, it currently broken as well. >> >> -vlad > > > Broken is a big word. > > Let's stop conflating two directions. I am not and was talking only about af_packet as that is what you asked about. There is no tun/macvtap in play here. They are handled separately in their respective drivers. > > Here's the way I look at it: > > 1. Userspace doesn't have a way to get fragment IDs > from tun/macvtap/packet sockets. > Presumably, not all users need these IDs. > E.g. old guests clearly don't. > > We should either give them packets stripping the ID, > like we always did, or make sure they never get these packets. > Second option seems achievable for tun if we > never tell linux it can do UFO, but I don't see > how to do it for macvtap and packet socket. > macvtap is slightly problematic, but doable with some tricks. packet socket is an interesting problem. The only way an af_packet socket can receive an skb marked SKB_GSO_UDPV6 is if someone else on the host sent it (like another guest). Since there is are no feature or vnet header length negotiations, it is impossible to tell if an application using this af_packet socket is capable of processing VIRTIO_NET_HDR_GSO_UDPV6 type (yet to be defined). So, we can either use existing VIRTIO_NET_HDR_GSO_UDP on receive path, add some kind of negotiation logic to packet socket (like storing the application expected vnet header size), or perform IPv6 fragmentation somehow. Options 1 or 2 are doable. > > 2. Userspace doesn't have a way to set fragment IDs > for tun/macvtap/packet sockets. > Presumably, not all users have these IDs. E.g. old > guests clearly don't. > > We should either generate our own ID, > like we always did, or make sure we don't accept > these packets. > Second option is almost sure to break userspace, > so it seems we must do the first one. > Right. This was missing from packet sockets. I can fix it. -vlad > > 3. > Exactly the same two things if you replace userspace > with hypervisor and tun/macvtap/packet socket with > virtio device. > > > 4. > As a next step, we should add a way for userspace > to tell us that it has ids and can handle them. > > > > > >>> >>>> diff --git a/net/core/dev.c b/net/core/dev.c >>>> index 945bbd0..fa4d2ee 100644 >>>> --- a/net/core/dev.c >>>> +++ b/net/core/dev.c >>>> @@ -5929,6 +5929,12 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, >>>> features &= ~NETIF_F_ALL_TSO; >>>> } >>>> >>>> + /* UFO requires that SG is present as well */ >>>> + if ((features & NETIF_F_ALL_UFO) && !(features & NETIF_F_SG)) { >>>> + netdev_dbg(dev, "Dropping UFO features since no SG feature.\n"); >>>> + features &= ~NETIF_F_ALL_UFO; >>>> + } >>>> + >>>> if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && >>>> !(features & NETIF_F_IP_CSUM)) { >>>> netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); >>>> @@ -5952,24 +5958,21 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, >>>> features &= ~NETIF_F_GSO; >>>> } >>>> >>>> - /* UFO needs SG and checksumming */ >>>> - if (features & NETIF_F_UFO) { >>>> - /* maybe split UFO into V4 and V6? */ >>>> - if (!((features & NETIF_F_GEN_CSUM) || >>>> - (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) >>>> - == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { >>>> - netdev_dbg(dev, >>>> - "Dropping NETIF_F_UFO since no checksum offload features.\n"); >>>> - features &= ~NETIF_F_UFO; >>>> - } >>>> - >>>> - if (!(features & NETIF_F_SG)) { >>>> - netdev_dbg(dev, >>>> - "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); >>>> - features &= ~NETIF_F_UFO; >>>> - } >>>> + /* UFO also needs checksumming */ >>>> + if ((features & NETIF_F_UFO) && !(features & NETIF_F_GEN_CSUM) && >>>> + !(features & NETIF_F_IP_CSUM)) { >>>> + netdev_dbg(dev, >>>> + "Dropping NETIF_F_UFO since no checksum offload features.\n"); >>>> + features &= ~NETIF_F_UFO; >>>> + } >>>> + if ((features & NETIF_F_UFO6) && !(features & NETIF_F_GEN_CSUM) && >>>> + !(features & NETIF_F_IPV6_CSUM)) { >>>> + netdev_dbg(dev, >>>> + "Dropping NETIF_F_UFO6 since no checksum offload features.\n"); >>>> + features &= ~NETIF_F_UFO6; >>>> } >>>> >>>> + >>>> #ifdef CONFIG_NET_RX_BUSY_POLL >>>> if (dev->netdev_ops->ndo_busy_poll) >>>> features |= NETIF_F_BUSY_POLL; >>>> diff --git a/net/core/ethtool.c b/net/core/ethtool.c >>>> index 06dfb29..93eff41 100644 >>>> --- a/net/core/ethtool.c >>>> +++ b/net/core/ethtool.c >>>> @@ -223,7 +223,7 @@ static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) >>>> return NETIF_F_ALL_TSO; >>>> case ETHTOOL_GUFO: >>>> case ETHTOOL_SUFO: >>>> - return NETIF_F_UFO; >>>> + return NETIF_F_ALL_UFO; >>>> case ETHTOOL_GGSO: >>>> case ETHTOOL_SGSO: >>>> return NETIF_F_GSO; >>>> -- >>>> 1.9.3 _______________________________________________ Virtualization mailing list Virtualization@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/virtualization