From: Steffen Klassert <steffen.klassert@xxxxxxxxxxx>

This patch adds the custom GSO and GRO logic for the IPv6 early ingress
hook. Layer 4 support covers UDP and TCP at this stage.

Signed-off-by: Steffen Klassert <steffen.klassert@xxxxxxxxxxx>
Signed-off-by: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>
---
 include/net/netfilter/early_ingress.h |   2 +
 net/ipv6/netfilter/Makefile           |   1 +
 net/ipv6/netfilter/early_ingress.c    | 307 ++++++++++++++++++++++++++++++++++
 net/netfilter/early_ingress.c         |   2 +
 4 files changed, 312 insertions(+)
 create mode 100644 net/ipv6/netfilter/early_ingress.c

diff --git a/include/net/netfilter/early_ingress.h b/include/net/netfilter/early_ingress.h
index caaef9fe619f..9ba8e2875345 100644
--- a/include/net/netfilter/early_ingress.h
+++ b/include/net/netfilter/early_ingress.h
@@ -13,6 +13,8 @@ int nf_hook_early_ingress(struct sk_buff *skb);
 
 void nf_early_ingress_ip_enable(void);
 void nf_early_ingress_ip_disable(void);
+void nf_early_ingress_ip6_enable(void);
+void nf_early_ingress_ip6_disable(void);
 
 void nf_early_ingress_enable(void);
 void nf_early_ingress_disable(void);
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 10a5a1c87320..445dfcf51ca8 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -2,6 +2,7 @@
 #
 # Makefile for the netfilter modules on top of IPv6.
 #
+obj-$(CONFIG_NETFILTER_EARLY_INGRESS) += early_ingress.o
 
 # Link order matters here.
 obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o
diff --git a/net/ipv6/netfilter/early_ingress.c b/net/ipv6/netfilter/early_ingress.c
new file mode 100644
index 000000000000..026d2814530a
--- /dev/null
+++ b/net/ipv6/netfilter/early_ingress.c
@@ -0,0 +1,307 @@
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/types.h>
+#include <net/xfrm.h>
+#include <net/ndisc.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/protocol.h>
+#include <net/netfilter/early_ingress.h>
+#include <net/ip6_route.h>
+
+static const struct net_offload __rcu *nft_ip6_offloads[MAX_INET_PROTOS] __read_mostly;
+
+static struct sk_buff *nft_udp6_gso_segment(struct sk_buff *skb,
+					    netdev_features_t features)
+{
+	/* Include the IPv6 header in the data handed to the segmenter. */
+	skb_push(skb, sizeof(struct ipv6hdr));
+
+	return nft_skb_segment(skb);
+}
+
+static struct sk_buff *nft_tcp6_gso_segment(struct sk_buff *skb,
+					    netdev_features_t features)
+{
+	/* Include the IPv6 header in the data handed to the segmenter. */
+	skb_push(skb, sizeof(struct ipv6hdr));
+
+	return nft_skb_segment(skb);
+}
+
+static struct sk_buff *nft_ipv6_gso_segment(struct sk_buff *skb,
+					    netdev_features_t features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	const struct net_offload *ops;
+	struct packet_offload *ptype;
+	struct ipv6hdr *iph;
+	int proto;
+
+	/* Not one of ours: hand the skb to the standard IPv6 offload. */
+	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_NFT)) {
+		ptype = dev_get_packet_offload(skb->protocol, 1);
+		if (ptype)
+			return ptype->callbacks.gso_segment(skb, features);
+
+		return ERR_PTR(-EPROTONOSUPPORT);
+	}
+
+	/* pskb_may_pull() may reallocate the header, so validate the
+	 * header before taking any pointers into it.
+	 */
+	if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
+		goto out;
+
+	if (SKB_GSO_CB(skb)->encap_level == 0) {
+		skb_reset_network_header(skb);
+		iph = ipv6_hdr(skb);
+	} else {
+		iph = (struct ipv6hdr *)skb->data;
+	}
+
+	SKB_GSO_CB(skb)->encap_level += sizeof(*iph);
+
+	proto = iph->nexthdr;
+	__skb_pull(skb, sizeof(*iph));
+
+	segs = ERR_PTR(-EPROTONOSUPPORT);
+
+	ops = rcu_dereference(nft_ip6_offloads[proto]);
+	if (likely(ops && ops->callbacks.gso_segment))
+		segs = ops->callbacks.gso_segment(skb, features);
+
+out:
+	return segs;
+}
+
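+/* GRO complete callback: runs once an aggregated super-packet is ready.
+ * For flows that the receive path below marked as is_ffwd, the packet
+ * is pushed straight out through the neighbour layer instead of
+ * travelling up the stack; returning -EINPROGRESS tells the GRO core
+ * that the skb has been consumed here.
+ */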
+static int nft_ipv6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
+	struct dst_entry *dst = skb_dst(skb);
+	struct rt6_info *rt = (struct rt6_info *)dst;
+	const struct net_offload *ops;
+	struct packet_offload *ptype;
+	int proto = iph->nexthdr;
+	struct in6_addr *nexthop;
+	struct neighbour *neigh;
+	struct net_device *dev;
+	unsigned int hh_len;
+	int err = 0;
+	u16 count;
+
+	count = NAPI_GRO_CB(skb)->count;
+
+	if (!NAPI_GRO_CB(skb)->is_ffwd) {
+		ptype = dev_get_packet_offload(skb->protocol, 1);
+		if (ptype)
+			return ptype->callbacks.gro_complete(skb, nhoff);
+
+		return 0;
+	}
+
+	rcu_read_lock();
+	ops = rcu_dereference(nft_ip6_offloads[proto]);
+	if (!ops || !ops->callbacks.gro_complete)
+		goto out_unlock;
+
+	/* Only need to add sizeof(*iph) to get to the next hdr below
+	 * because any packet carrying extension headers will have been
+	 * flushed in nft_ipv6_gro_receive().
+	 */
+	err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
+
+out_unlock:
+	rcu_read_unlock();
+
+	if (err)
+		return err;
+
+	skb_shinfo(skb)->gso_type |= SKB_GSO_NFT;
+	skb_shinfo(skb)->gso_segs = count;
+
+	dev = dst->dev;
+	dev_hold(dev);
+	skb->dev = dev;
+
+	if (skb_dst(skb)->xfrm) {
+		err = dst_output(dev_net(dev), NULL, skb);
+		if (err != -EREMOTE)
+			return -EINPROGRESS;
+	}
+
+	if (count <= 1)
+		skb_gso_reset(skb);
+
+	hh_len = LL_RESERVED_SPACE(dev);
+
+	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+		struct sk_buff *skb2;
+
+		skb2 = skb_realloc_headroom(skb, hh_len);
+		if (!skb2) {
+			kfree_skb(skb);
+			return -ENOMEM;
+		}
+		consume_skb(skb);
+		skb = skb2;
+	}
+
+	rcu_read_lock();
+	nexthop = rt6_nexthop(rt, &iph->daddr);
+	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
+	if (unlikely(!neigh))
+		neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
+	if (!IS_ERR(neigh))
+		neigh_output(neigh, skb);
+	rcu_read_unlock();
+
+	return -EINPROGRESS;
+}
+
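+/* GRO receive callback: every packet is offered to the early ingress
+ * hook first. NF_STOLEN means the flow is forwarded from here; packets
+ * that share a flow with an is_ffwd entry on the gro_list inherit its
+ * cached dst and are aggregated for fast forwarding.
+ */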
+static struct sk_buff **nft_ipv6_gro_receive(struct sk_buff **head,
+					     struct sk_buff *skb)
+{
+	const struct net_offload *ops;
+	struct packet_offload *ptype;
+	struct sk_buff **pp = NULL;
+	struct sk_buff *p;
+	struct ipv6hdr *iph;
+	unsigned int nlen;
+	unsigned int hlen;
+	unsigned int off;
+	int proto, ret;
+
+	off = skb_gro_offset(skb);
+	hlen = off + sizeof(*iph);
+
+	iph = skb_gro_header_slow(skb, hlen, off);
+	if (unlikely(!iph))
+		goto out;
+
+	proto = iph->nexthdr;
+
+	rcu_read_lock();
+
+	if (iph->version != 6)
+		goto out_unlock;
+
+	nlen = skb_network_header_len(skb);
+
+	ret = nf_hook_early_ingress(skb);
+	switch (ret) {
+	case NF_STOLEN:
+		break;
+	case NF_ACCEPT:
+		ptype = dev_get_packet_offload(skb->protocol, 1);
+		if (ptype)
+			pp = ptype->callbacks.gro_receive(head, skb);
+
+		goto out_unlock;
+	case NF_DROP:
+		pp = ERR_PTR(-EPERM);
+		goto out_unlock;
+	}
+
+	ops = rcu_dereference(nft_ip6_offloads[proto]);
+	if (!ops || !ops->callbacks.gro_receive)
+		goto out_unlock;
+
+	if (iph->hop_limit <= 1)
+		goto out_unlock;
+
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	for (p = *head; p; p = p->next) {
+		struct ipv6hdr *iph2;
+		__be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */
+
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		if (!NAPI_GRO_CB(p)->is_ffwd) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		if (!skb_dst(p)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		iph2 = ipv6_hdr(p);
+		first_word = *(__be32 *)iph ^ *(__be32 *)iph2;
+
+		/* All fields must match except length and Traffic Class.
+		 * The skbs on the gro_list have all been parsed and pulled
+		 * already, so there is no need to compare nlen
+		 * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops)));
+		 * the memcmp() below is sufficient.
+		 */
+		if ((first_word & htonl(0xF00FFFFF)) ||
+		    memcmp(&iph->nexthdr, &iph2->nexthdr,
+			   nlen - offsetof(struct ipv6hdr, nexthdr))) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		/* flush if Traffic Class fields are different */
+		NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
+
+		NAPI_GRO_CB(skb)->is_ffwd = 1;
+		skb_dst_set_noref(skb, skb_dst(p));
+		pp = &p;
+
+		break;
+	}
+
+	NAPI_GRO_CB(skb)->is_atomic = true;
+
+	iph->hop_limit--;
+
+	skb_pull(skb, off);
+	NAPI_GRO_CB(skb)->data_offset = sizeof(*iph);
+	skb_reset_network_header(skb);
+	skb_set_transport_header(skb, sizeof(*iph));
+
+	pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
+
+out_unlock:
+	rcu_read_unlock();
+
+out:
+	NAPI_GRO_CB(skb)->data_offset = 0;
+	return pp;
+}
+
+static struct packet_offload nft_ip6_packet_offload __read_mostly = {
+	.type		= cpu_to_be16(ETH_P_IPV6),
+	.priority	= 0,
+	.callbacks	= {
+		.gro_receive	= nft_ipv6_gro_receive,
+		.gro_complete	= nft_ipv6_gro_complete,
+		.gso_segment	= nft_ipv6_gso_segment,
+	},
+};
+
+static const struct net_offload nft_udp6_offload = {
+	.callbacks = {
+		.gso_segment	= nft_udp6_gso_segment,
+		.gro_receive	= nft_udp_gro_receive,
+	},
+};
+
+static const struct net_offload nft_tcp6_offload = {
+	.callbacks = {
+		.gso_segment	= nft_tcp6_gso_segment,
+		.gro_receive	= nft_tcp_gro_receive,
+	},
+};
+
+static const struct net_offload __rcu *nft_ip6_offloads[MAX_INET_PROTOS] __read_mostly = {
+	[IPPROTO_UDP]	= &nft_udp6_offload,
+	[IPPROTO_TCP]	= &nft_tcp6_offload,
+};
+
+void nf_early_ingress_ip6_enable(void)
+{
+	dev_add_offload(&nft_ip6_packet_offload);
+}
+
+void nf_early_ingress_ip6_disable(void)
+{
+	dev_remove_offload(&nft_ip6_packet_offload);
+}
diff --git a/net/netfilter/early_ingress.c b/net/netfilter/early_ingress.c
index bf31aa8b3721..4daf6cfea304 100644
--- a/net/netfilter/early_ingress.c
+++ b/net/netfilter/early_ingress.c
@@ -312,6 +312,7 @@ void nf_early_ingress_enable(void)
 	if (nf_early_ingress_use++ == 0) {
 		nf_early_ingress_use++;
 		nf_early_ingress_ip_enable();
+		nf_early_ingress_ip6_enable();
 	}
 }
 
@@ -319,5 +320,6 @@ void nf_early_ingress_disable(void)
 {
 	if (--nf_early_ingress_use == 0) {
 		nf_early_ingress_ip_disable();
+		nf_early_ingress_ip6_disable();
 	}
 }
-- 
2.11.0
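As a usage note: nf_early_ingress_enable() and nf_early_ingress_disable()
are refcounted, so the IPv6 offloads registered here come and go with the
first and last user of the early ingress hooks. A minimal sketch of a
hypothetical in-kernel user follows; the module name is illustrative and
not part of this series, and it assumes the two symbols are exported to
modules:

	#include <linux/module.h>
	#include <net/netfilter/early_ingress.h>

	static int __init ei_demo_init(void)
	{
		/* First enable registers the IPv4 and IPv6 GRO/GSO offloads. */
		nf_early_ingress_enable();
		return 0;
	}

	static void __exit ei_demo_exit(void)
	{
		/* Last disable unregisters both packet offloads again. */
		nf_early_ingress_disable();
	}

	module_init(ei_demo_init);
	module_exit(ei_demo_exit);
	MODULE_LICENSE("GPL");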