Re: [PATCH v2 nf-next 1/6] net: untangle ip_fragment and bridge netfilter

Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> · Mon, 16 Mar 2015 23:55:45 +0100

Hi David,

This patch changes the interface ip_fragment(). If this sounds
reasonable to you, please ack it and I'll route it to you in the next
nf-next batch.

Thanks.

On Thu, Mar 12, 2015 at 06:05:20PM +0100, Florian Westphal wrote:
> Long time ago it was possible for the netfilter ip_conntrack
> core to call ip_fragment in POST_ROUTING hook.
> 
> This is no longer the case, so the only case where bridge netfilter
> ends up calling ip_fragment is the direct call site in br_netfilter.c.
> 
> Add ll and mtu arguments for ip_fragment and then get rid of the bridge
> netfilter specific helpers from ip_fragment.
> 
> Cc: Andy Zhou <azhou@xxxxxxxxxx>
> Signed-off-by: Florian Westphal <fw@xxxxxxxxx>
> ---
>  include/linux/netfilter_bridge.h | 17 -----------------
>  include/net/ip.h                 |  4 ++--
>  net/bridge/br_netfilter.c        | 23 ++++++++++++++++++++---
>  net/ipv4/ip_output.c             | 37 +++++++++++++++++++++----------------
>  4 files changed, 43 insertions(+), 38 deletions(-)
> 
> diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
> index ed0d3bf..fbbd5de 100644
> --- a/include/linux/netfilter_bridge.h
> +++ b/include/linux/netfilter_bridge.h
> @@ -35,24 +35,8 @@ static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
>  	}
>  }
>  
> -static inline unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
> -{
> -	if (unlikely(skb->nf_bridge->mask & BRNF_PPPoE))
> -		return PPPOE_SES_HLEN;
> -	return 0;
> -}
> -
>  int br_handle_frame_finish(struct sk_buff *skb);
>  
> -/* This is called by the IP fragmenting code and it ensures there is
> - * enough room for the encapsulating header (if there is one). */
> -static inline unsigned int nf_bridge_pad(const struct sk_buff *skb)
> -{
> -	if (skb->nf_bridge)
> -		return nf_bridge_encap_header_len(skb);
> -	return 0;
> -}
> -
>  static inline void br_drop_fake_rtable(struct sk_buff *skb)
>  {
>  	struct dst_entry *dst = skb_dst(skb);
> @@ -62,7 +46,6 @@ static inline void br_drop_fake_rtable(struct sk_buff *skb)
>  }
>  
>  #else
> -#define nf_bridge_pad(skb)			(0)
>  #define br_drop_fake_rtable(skb)	        do { } while (0)
>  #endif /* CONFIG_BRIDGE_NETFILTER */
>  
> diff --git a/include/net/ip.h b/include/net/ip.h
> index 025c61c..2905a4b 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -108,8 +108,8 @@ int ip_local_deliver(struct sk_buff *skb);
>  int ip_mr_input(struct sk_buff *skb);
>  int ip_output(struct sock *sk, struct sk_buff *skb);
>  int ip_mc_output(struct sock *sk, struct sk_buff *skb);
> -int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
> -int ip_do_nat(struct sk_buff *skb);
> +int ip_fragment(struct sk_buff *skb, unsigned int mtu,
> +		unsigned int ll_rs, int (*output)(struct sk_buff *));
>  void ip_send_check(struct iphdr *ip);
>  int __ip_local_out(struct sk_buff *skb);
>  int ip_local_out_sk(struct sock *sk, struct sk_buff *skb);
> diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
> index bd2d24d..550ee19 100644
> --- a/net/bridge/br_netfilter.c
> +++ b/net/bridge/br_netfilter.c
> @@ -812,26 +812,43 @@ static int br_nf_push_frag_xmit(struct sk_buff *skb)
>  	return br_dev_queue_push_xmit(skb);
>  }
>  
> +static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
> +{
> +	if (skb->nf_bridge->mask & BRNF_PPPoE)
> +		return PPPOE_SES_HLEN;
> +	return 0;
> +}
> +
>  static int br_nf_dev_queue_xmit(struct sk_buff *skb)
>  {
>  	int ret;
>  	int frag_max_size;
> -	unsigned int mtu_reserved;
> +	unsigned int mtu_reserved, mtu;
>  
>  	if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP))
>  		return br_dev_queue_push_xmit(skb);
>  
>  	mtu_reserved = nf_bridge_mtu_reduction(skb);
> +	mtu = min(skb->dev->mtu, IP_MAX_MTU);
>  	/* This is wrong! We should preserve the original fragment
>  	 * boundaries by preserving frag_list rather than refragmenting.
>  	 */
> -	if (skb->len + mtu_reserved > skb->dev->mtu) {
> +	if (skb->len + mtu_reserved > mtu) {
> +		unsigned int llrs;
> +
>  		frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
>  		if (br_parse_ip_options(skb))
>  			/* Drop invalid packet */
>  			return NF_DROP;
>  		IPCB(skb)->frag_max_size = frag_max_size;
> -		ret = ip_fragment(skb, br_nf_push_frag_xmit);
> +
> +		/* for bridged IP traffic encapsulated inside f.e. a vlan header,
> +		 * we need to make room for the encapsulating header
> +		 */
> +		llrs = nf_bridge_encap_header_len(skb);
> +
> +		mtu -= mtu_reserved;
> +		ret = ip_fragment(skb, mtu, llrs, br_nf_push_frag_xmit);
>  	} else
>  		ret = br_dev_queue_push_xmit(skb);
>  
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index a7aea20..fe5ec3f 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -216,6 +216,7 @@ static int ip_finish_output_gso(struct sk_buff *skb)
>  	netdev_features_t features;
>  	struct sk_buff *segs;
>  	int ret = 0;
> +	unsigned int mtu;
>  
>  	/* common case: locally created skb or seglen is <= mtu */
>  	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
> @@ -236,6 +237,7 @@ static int ip_finish_output_gso(struct sk_buff *skb)
>  		return -ENOMEM;
>  	}
>  
> +	mtu = ip_skb_dst_mtu(skb);
>  	consume_skb(skb);
>  
>  	do {
> @@ -243,7 +245,7 @@ static int ip_finish_output_gso(struct sk_buff *skb)
>  		int err;
>  
>  		segs->next = NULL;
> -		err = ip_fragment(segs, ip_finish_output2);
> +		err = ip_fragment(segs, mtu, 0, ip_finish_output2);
>  
>  		if (err && ret == 0)
>  			ret = err;
> @@ -255,6 +257,8 @@ static int ip_finish_output_gso(struct sk_buff *skb)
>  
>  static int ip_finish_output(struct sk_buff *skb)
>  {
> +	unsigned int mtu;
> +
>  #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
>  	/* Policy lookup after SNAT yielded a new policy */
>  	if (skb_dst(skb)->xfrm != NULL) {
> @@ -265,8 +269,9 @@ static int ip_finish_output(struct sk_buff *skb)
>  	if (skb_is_gso(skb))
>  		return ip_finish_output_gso(skb);
>  
> -	if (skb->len > ip_skb_dst_mtu(skb))
> -		return ip_fragment(skb, ip_finish_output2);
> +	mtu = ip_skb_dst_mtu(skb);
> +	if (skb->len > mtu)
> +		return ip_fragment(skb, mtu, 0, ip_finish_output2);
>  
>  	return ip_finish_output2(skb);
>  }
> @@ -472,20 +477,28 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
>  	skb_copy_secmark(to, from);
>  }
>  
> -/*
> +/**
> + *	ip_fragment - fragment IP datagram or send ICMP error
> + *
> + *	@skb: the skb to fragment
> + *	@mtu: mtu to use for fragmentation
> + *	@ll_rs: extra linklayer space required
> + *	@output: transmit function used to send fragments
> + *
>   *	This IP datagram is too large to be sent in one piece.  Break it up into
>   *	smaller pieces (each of size equal to IP header plus
>   *	a block of the data of the original IP data part) that will yet fit in a
>   *	single device frame, and queue such a frame for sending.
>   */
> -
> -int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
> +int ip_fragment(struct sk_buff *skb,
> +		unsigned int mtu, unsigned int ll_rs,
> +		int (*output)(struct sk_buff *))
>  {
>  	struct iphdr *iph;
>  	int ptr;
>  	struct net_device *dev;
>  	struct sk_buff *skb2;
> -	unsigned int mtu, hlen, left, len, ll_rs;
> +	unsigned int hlen, left, len;
>  	int offset;
>  	__be16 not_last_frag;
>  	struct rtable *rt = skb_rtable(skb);
> @@ -499,7 +512,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
>  
>  	iph = ip_hdr(skb);
>  
> -	mtu = ip_skb_dst_mtu(skb);
>  	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
>  		     (IPCB(skb)->frag_max_size &&
>  		      IPCB(skb)->frag_max_size > mtu))) {
> @@ -516,10 +528,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
>  
>  	hlen = iph->ihl * 4;
>  	mtu = mtu - hlen;	/* Size of data space */
> -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
> -	if (skb->nf_bridge)
> -		mtu -= nf_bridge_mtu_reduction(skb);
> -#endif
>  	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
>  
>  	/* When frag_list is given, use it. First, check its validity:
> @@ -636,10 +644,7 @@ slow_path:
>  	left = skb->len - hlen;		/* Space per frame */
>  	ptr = hlen;		/* Where to start from */
>  
> -	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
> -	 * we need to make room for the encapsulating header
> -	 */
> -	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
> +	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, ll_rs);
>  
>  	/*
>  	 *	Fragment the datagram.
> -- 
> 2.0.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html