A long time ago it was possible for the netfilter ip_conntrack core to
call ip_fragment in the POST_ROUTING hook.  This is no longer the case,
so the only place where bridge netfilter ends up calling ip_fragment is
the direct call site in br_netfilter.c.

Add mtu and ll_rs arguments to ip_fragment and then get rid of the
bridge netfilter specific helpers from ip_fragment.

Cc: Andy Zhou <azhou@xxxxxxxxxx>
Signed-off-by: Florian Westphal <fw@xxxxxxxxx>
---
 include/linux/netfilter_bridge.h | 17 -----------------
 include/net/ip.h                 |  4 ++--
 net/bridge/br_netfilter.c        | 23 ++++++++++++++++++++---
 net/ipv4/ip_output.c             | 37 +++++++++++++++++++++----------------
 4 files changed, 43 insertions(+), 38 deletions(-)

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index ed0d3bf..fbbd5de 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -35,24 +35,8 @@ static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
 	}
 }
 
-static inline unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
-{
-	if (unlikely(skb->nf_bridge->mask & BRNF_PPPoE))
-		return PPPOE_SES_HLEN;
-	return 0;
-}
-
 int br_handle_frame_finish(struct sk_buff *skb);
 
-/* This is called by the IP fragmenting code and it ensures there is
- * enough room for the encapsulating header (if there is one). */
-static inline unsigned int nf_bridge_pad(const struct sk_buff *skb)
-{
-	if (skb->nf_bridge)
-		return nf_bridge_encap_header_len(skb);
-	return 0;
-}
-
 static inline void br_drop_fake_rtable(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -62,7 +46,6 @@ static inline void br_drop_fake_rtable(struct sk_buff *skb)
 }
 
 #else
-#define nf_bridge_pad(skb)	(0)
 #define br_drop_fake_rtable(skb)	do { } while (0)
 #endif /* CONFIG_BRIDGE_NETFILTER */
 
diff --git a/include/net/ip.h b/include/net/ip.h
index 025c61c..2905a4b 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -108,8 +108,8 @@ int ip_local_deliver(struct sk_buff *skb);
 int ip_mr_input(struct sk_buff *skb);
 int ip_output(struct sock *sk, struct sk_buff *skb);
 int ip_mc_output(struct sock *sk, struct sk_buff *skb);
-int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
-int ip_do_nat(struct sk_buff *skb);
+int ip_fragment(struct sk_buff *skb, unsigned int mtu,
+		unsigned int ll_rs, int (*output)(struct sk_buff *));
 void ip_send_check(struct iphdr *ip);
 int __ip_local_out(struct sk_buff *skb);
 int ip_local_out_sk(struct sock *sk, struct sk_buff *skb);
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index bd2d24d..550ee19 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -812,26 +812,43 @@ static int br_nf_push_frag_xmit(struct sk_buff *skb)
 	return br_dev_queue_push_xmit(skb);
 }
 
+static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
+{
+	if (skb->nf_bridge->mask & BRNF_PPPoE)
+		return PPPOE_SES_HLEN;
+	return 0;
+}
+
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
 	int ret;
 	int frag_max_size;
-	unsigned int mtu_reserved;
+	unsigned int mtu_reserved, mtu;
 
 	if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP))
 		return br_dev_queue_push_xmit(skb);
 
 	mtu_reserved = nf_bridge_mtu_reduction(skb);
+	mtu = min(skb->dev->mtu, IP_MAX_MTU);
 	/* This is wrong! We should preserve the original fragment
 	 * boundaries by preserving frag_list rather than refragmenting.
 	 */
-	if (skb->len + mtu_reserved > skb->dev->mtu) {
+	if (skb->len + mtu_reserved > mtu) {
+		unsigned int llrs;
+
 		frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
 		if (br_parse_ip_options(skb))
 			/* Drop invalid packet */
 			return NF_DROP;
 		IPCB(skb)->frag_max_size = frag_max_size;
-		ret = ip_fragment(skb, br_nf_push_frag_xmit);
+
+		/* for bridged IP traffic encapsulated inside f.e. a vlan header,
+		 * we need to make room for the encapsulating header
+		 */
+		llrs = nf_bridge_encap_header_len(skb);
+
+		mtu -= mtu_reserved;
+		ret = ip_fragment(skb, mtu, llrs, br_nf_push_frag_xmit);
 	} else
 		ret = br_dev_queue_push_xmit(skb);
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a7aea20..fe5ec3f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -216,6 +216,7 @@ static int ip_finish_output_gso(struct sk_buff *skb)
 	netdev_features_t features;
 	struct sk_buff *segs;
 	int ret = 0;
+	unsigned int mtu;
 
 	/* common case: locally created skb or seglen is <= mtu */
 	if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
@@ -236,6 +237,7 @@ static int ip_finish_output_gso(struct sk_buff *skb)
 		return -ENOMEM;
 	}
 
+	mtu = ip_skb_dst_mtu(skb);
 	consume_skb(skb);
 
 	do {
@@ -243,7 +245,7 @@ static int ip_finish_output_gso(struct sk_buff *skb)
 		int err;
 
 		segs->next = NULL;
-		err = ip_fragment(segs, ip_finish_output2);
+		err = ip_fragment(segs, mtu, 0, ip_finish_output2);
 
 		if (err && ret == 0)
 			ret = err;
@@ -255,6 +257,8 @@ static int ip_finish_output_gso(struct sk_buff *skb)
 
 static int ip_finish_output(struct sk_buff *skb)
 {
+	unsigned int mtu;
+
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 	/* Policy lookup after SNAT yielded a new policy */
 	if (skb_dst(skb)->xfrm != NULL) {
@@ -265,8 +269,9 @@ static int ip_finish_output(struct sk_buff *skb)
 	if (skb_is_gso(skb))
 		return ip_finish_output_gso(skb);
 
-	if (skb->len > ip_skb_dst_mtu(skb))
-		return ip_fragment(skb, ip_finish_output2);
+	mtu = ip_skb_dst_mtu(skb);
+	if (skb->len > mtu)
+		return ip_fragment(skb, mtu, 0, ip_finish_output2);
 
 	return ip_finish_output2(skb);
 }
@@ -472,20 +477,28 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	skb_copy_secmark(to, from);
 }
 
-/*
+/**
+ * ip_fragment - fragment IP datagram or send ICMP error
+ *
+ * @skb: the skb to fragment
+ * @mtu: mtu to use for fragmentation
+ * @ll_rs: extra linklayer space required
+ * @output: transmit function used to send fragments
+ *
  * This IP datagram is too large to be sent in one piece. Break it up into
  * smaller pieces (each of size equal to IP header plus
  * a block of the data of the original IP data part) that will yet fit in a
  * single device frame, and queue such a frame for sending.
  */
-
-int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
+int ip_fragment(struct sk_buff *skb,
+		unsigned int mtu, unsigned int ll_rs,
+		int (*output)(struct sk_buff *))
 {
 	struct iphdr *iph;
 	int ptr;
 	struct net_device *dev;
 	struct sk_buff *skb2;
-	unsigned int mtu, hlen, left, len, ll_rs;
+	unsigned int hlen, left, len;
 	int offset;
 	__be16 not_last_frag;
 	struct rtable *rt = skb_rtable(skb);
@@ -499,7 +512,6 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 	iph = ip_hdr(skb);
 
-	mtu = ip_skb_dst_mtu(skb);
 	if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) ||
 		     (IPCB(skb)->frag_max_size &&
 		      IPCB(skb)->frag_max_size > mtu))) {
@@ -516,10 +528,6 @@
 	hlen = iph->ihl * 4;
 	mtu = mtu - hlen;	/* Size of data space */
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	if (skb->nf_bridge)
-		mtu -= nf_bridge_mtu_reduction(skb);
-#endif
 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 
 	/* When frag_list is given, use it. First, check its validity:
@@ -636,10 +644,7 @@ slow_path:
 	left = skb->len - hlen;		/* Space per frame */
 	ptr = hlen;		/* Where to start from */
 
-	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
-	 * we need to make room for the encapsulating header
-	 */
-	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
+	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, ll_rs);
 
 	/*
 	 *	Fragment the datagram.
-- 
2.0.5
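
P.S. (not part of the patch): for readers who want to see the mtu/ll_rs
computation in isolation, below is a minimal standalone C sketch of the
decision the new br_nf_dev_queue_xmit() caller makes before handing the
skb to ip_fragment(). The struct and constants (fake_skb, PPPOE_SES_HLEN,
VLAN_HLEN, IP_MAX_MTU) are local stand-ins for illustration only, not the
kernel definitions, and the printf stands in for the actual call.

#include <stdio.h>

#define PPPOE_SES_HLEN 8       /* stand-in for the kernel constant */
#define VLAN_HLEN      4       /* stand-in for the kernel constant */
#define IP_MAX_MTU     0xFFFFU /* stand-in: max IPv4 packet size */

/* stand-in for the few skb fields the decision depends on */
struct fake_skb {
	unsigned int len;      /* total packet length */
	unsigned int dev_mtu;  /* skb->dev->mtu */
	int is_pppoe;          /* BRNF_PPPoE-style flag */
	int is_vlan;           /* 802.1q encapsulated? */
};

/* PPPoE session traffic loses PPPOE_SES_HLEN bytes of payload room */
static unsigned int mtu_reduction(const struct fake_skb *skb)
{
	return skb->is_pppoe ? PPPOE_SES_HLEN : 0;
}

/* extra link-layer room each fragment must reserve for the encap header */
static unsigned int encap_header_len(const struct fake_skb *skb)
{
	if (skb->is_vlan)
		return VLAN_HLEN;
	if (skb->is_pppoe)
		return PPPOE_SES_HLEN;
	return 0;
}

int main(void)
{
	struct fake_skb skb = { .len = 2000, .dev_mtu = 1500, .is_pppoe = 1 };

	/* mtu = min(skb->dev->mtu, IP_MAX_MTU) as in the patched caller */
	unsigned int mtu = skb.dev_mtu < IP_MAX_MTU ? skb.dev_mtu : IP_MAX_MTU;
	unsigned int reserved = mtu_reduction(&skb);

	if (skb.len + reserved > mtu) {
		unsigned int ll_rs = encap_header_len(&skb);

		mtu -= reserved;
		/* the kernel caller would now do:
		 *   ip_fragment(skb, mtu, ll_rs, br_nf_push_frag_xmit);
		 */
		printf("fragment with mtu=%u ll_rs=%u\n", mtu, ll_rs);
	} else {
		printf("no fragmentation needed\n");
	}
	return 0;
}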