From: Peter Nørlund <pch@xxxxxxxxxxxx> When doing L3 based multipath, ICMP packets are inspected to let them route over the same path as the flow they relate to, allowing anycast environments to work with ECMP. Signed-off-by: Peter Nørlund <pch@xxxxxxxxxxxx> --- include/net/ip_fib.h | 2 +- include/net/route.h | 12 ++++++- net/ipv4/fib_semantics.c | 2 +- net/ipv4/icmp.c | 34 +++++++++++++++++++- net/ipv4/route.c | 82 ++++++++++++++++++++++++++++++++++++++---------- 5 files changed, 112 insertions(+), 20 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 21e74b5..3e5d4ed 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -328,7 +328,7 @@ struct multipath_flow4 { }; typedef void (*multipath_flow4_func_t)(struct multipath_flow4 *flow, - void *ctx); + enum rt_mp_alg_t algo, void *ctx); void fib_select_multipath(struct fib_result *res, multipath_flow4_func_t flow_func, diff --git a/include/net/route.h b/include/net/route.h index 395d79b..ccb85fc 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -28,6 +28,7 @@ #include <net/inetpeer.h> #include <net/flow.h> #include <net/inet_sock.h> +#include <net/ip_fib.h> #include <linux/in_route.h> #include <linux/rtnetlink.h> #include <linux/rcupdate.h> @@ -110,7 +111,16 @@ struct in_device; int ip_rt_init(void); void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); -struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); +struct rtable *__ip_route_output_key_flow(struct net *, struct flowi4 *flp, + multipath_flow4_func_t flow_func, + void *ctx); + +static inline struct rtable *__ip_route_output_key(struct net *net, + struct flowi4 *flp) +{ + return __ip_route_output_key_flow(net, flp, NULL, NULL); /* NULL flow_func selects the default flowi4-based flow key */ +} + struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, struct sock *sk); struct dst_entry *ipv4_blackhole_route(struct net *net, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3a80b1a..000c535 100644 --- 
a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1536,7 +1536,7 @@ static int fib_multipath_hash(const struct fib_result *res, { struct multipath_flow4 flow; - flow_func(&flow, ctx); + flow_func(&flow, res->fi->fib_mp_alg, ctx); if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH) return jhash_3words(flow.saddr, flow.daddr, flow.ports, 0) >> 1; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f16488e..0e25fe4 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -439,6 +439,38 @@ out_unlock: icmp_xmit_unlock(sk); } +/* Reversed flow key of the triggering packet. See ip_multipath_flow_skb */ +static void icmp_multipath_flow(struct multipath_flow4 *flow, + enum rt_mp_alg_t algo, void *ctx) +{ + const struct sk_buff *skb = (const struct sk_buff *)ctx; + const struct iphdr *iph = ip_hdr(skb); + + flow->saddr = iph->daddr; + flow->daddr = iph->saddr; + flow->ports = 0; + + if (algo != RT_MP_ALG_L4_HASH) + return; /* ports are left 0 unless hashing on L4 */ + + if (unlikely(!(iph->frag_off & htons(IP_DF)))) + return; /* DF clear: ports stay 0 */ + + if (iph->protocol == IPPROTO_TCP || + iph->protocol == IPPROTO_UDP || + iph->protocol == IPPROTO_SCTP) { + __be16 _ports[2]; + const __be16 *ports; + + ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports), + &_ports); + if (ports) { + flow->sport = ports[1]; /* swapped like the addresses */ + flow->dport = ports[0]; + } + } +} + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -463,7 +495,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev) ? 
: skb_in->dev->ifindex; security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key_flow(net, fl4, icmp_multipath_flow, skb_in); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f50f84f..edbeb56 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1646,37 +1646,82 @@ out: #ifdef CONFIG_IP_ROUTE_MULTIPATH /* Fill multipath flow key data based on socket buffer */ -static void ip_multipath_flow_skb(struct multipath_flow4 *flow, void *ctx) +static void ip_multipath_flow_skb(struct multipath_flow4 *flow, + enum rt_mp_alg_t algo, void *ctx) { const struct sk_buff *skb = (const struct sk_buff *)ctx; - const struct iphdr *iph; + struct icmphdr _icmph; + struct iphdr _inner_iph; + const struct iphdr *outer_iph; + const struct icmphdr *icmph; + const struct iphdr *inner_iph; + unsigned int offset; - iph = ip_hdr(skb); + outer_iph = ip_hdr(skb); - flow->saddr = iph->saddr; - flow->daddr = iph->daddr; + flow->saddr = outer_iph->saddr; + flow->daddr = outer_iph->daddr; flow->ports = 0; - if (unlikely(!(iph->frag_off & htons(IP_DF)))) - return; + offset = outer_iph->ihl * 4; /* length of the outer IP header */ - if (iph->protocol == IPPROTO_TCP || - iph->protocol == IPPROTO_UDP || - iph->protocol == IPPROTO_SCTP) { + if (algo == RT_MP_ALG_L4_HASH) { __be16 _ports[2]; const __be16 *ports; - ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports), + if (unlikely(!(outer_iph->frag_off & htons(IP_DF)))) + return; + + if (outer_iph->protocol != IPPROTO_TCP && + outer_iph->protocol != IPPROTO_UDP && + outer_iph->protocol != IPPROTO_SCTP) { + return; + } + + ports = skb_header_pointer(skb, offset, sizeof(_ports), &_ports); if (ports) { flow->sport = ports[0]; flow->dport = ports[1]; } + + return; /* the L4 hash never inspects ICMP payloads */ + } + + if (outer_iph->protocol != IPPROTO_ICMP) + return; + + if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) + return; /* not the first fragment */ + + icmph = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (!icmph) + 
return; + + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_SOURCE_QUENCH && + icmph->type != ICMP_REDIRECT && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB) { + return; /* only these types are searched for an inner header */ } + + offset += sizeof(_icmph); + inner_iph = skb_header_pointer(skb, offset, sizeof(_inner_iph), + &_inner_iph); + if (!inner_iph) + return; + + /* Since the ICMP payload contains a packet sent from the current + * recipient, we swap source and destination addresses + */ + flow->saddr = inner_iph->daddr; + flow->daddr = inner_iph->saddr; } /* Fill multipath flow key data based on flowi4 */ -static void ip_multipath_flow_fl4(struct multipath_flow4 *flow, void *ctx) +static void ip_multipath_flow_fl4(struct multipath_flow4 *flow, + enum rt_mp_alg_t algo, void *ctx) { const struct flowi4 *fl4 = (const struct flowi4 *)ctx; @@ -2086,7 +2131,9 @@ add: * Major route resolver routine. */ -struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) +struct rtable *__ip_route_output_key_flow(struct net *net, struct flowi4 *fl4, + multipath_flow4_func_t flow_func, + void *ctx) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); @@ -2248,9 +2295,12 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) - fib_select_multipath(&res, ip_multipath_flow_fl4, fl4); - else + if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) { + if (flow_func) + fib_select_multipath(&res, flow_func, ctx); + else + fib_select_multipath(&res, ip_multipath_flow_fl4, fl4); /* default key from fl4 */ + } else #endif if (!res.prefixlen && res.table->tb_num_default > 1 && -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html