ICMP packets are inspected to let them route together with the flow they
belong to, allowing anycast environments to work with ECMP.

For ICMP error messages the multipath flow key is built from the IP header
embedded in the ICMP payload, with source and destination addresses and
ports swapped, so that the error follows the same path as the flow that
triggered it.

Signed-off-by: Peter Nørlund <pch@xxxxxxxxxxxx>
---
 net/ipv4/icmp.c  | 27 ++++++++++++++++++-
 net/ipv4/route.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 92 insertions(+), 15 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 3abcfea..20f1d5e 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -447,6 +447,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 {
 	struct rtable *rt, *rt2;
 	struct flowi4 fl4_dec;
+	struct flowi4 mp_flow;
 	int err;
 
 	memset(fl4, 0, sizeof(*fl4));
@@ -459,7 +460,31 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	fl4->fl4_icmp_type = type;
 	fl4->fl4_icmp_code = code;
 	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
-	rt = __ip_route_output_key(net, fl4, NULL);
+
+	/* Source and destination are swapped. See ip_multipath_flow */
+	mp_flow.saddr = iph->daddr;
+	mp_flow.daddr = iph->saddr;
+	mp_flow.flowi4_proto = iph->protocol;
+	mp_flow.fl4_sport = 0;
+	mp_flow.fl4_dport = 0;
+	if (!ip_is_fragment(iph)) {
+		if (iph->protocol == IPPROTO_TCP ||
+		    iph->protocol == IPPROTO_UDP ||
+		    iph->protocol == IPPROTO_SCTP) {
+			__be16 _ports[2];
+			const __be16 *ports;
+
+			ports = skb_header_pointer(skb_in, iph->ihl * 4,
+						   sizeof(_ports),
+						   &_ports);
+			if (ports) {
+				mp_flow.fl4_sport = ports[1];
+				mp_flow.fl4_dport = ports[0];
+			}
+		}
+	}
+
+	rt = __ip_route_output_key(net, fl4, &mp_flow);
 	if (IS_ERR(rt))
 		return rt;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a1ec62c..bab4318 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1635,31 +1635,83 @@ out:
 /* Fill flow key data based on packet for use in multipath routing. */
 static void ip_multipath_flow(const struct sk_buff *skb, struct flowi4 *flow)
 {
-	const struct iphdr *iph;
-
-	iph = ip_hdr(skb);
-
-	flow->saddr = iph->saddr;
-	flow->daddr = iph->daddr;
-	flow->flowi4_proto = iph->protocol;
+	struct icmphdr _icmph;
+	struct iphdr _inner_iph;
+	const struct iphdr *outer_iph;
+	const struct icmphdr *icmph;
+	const struct iphdr *inner_iph;
+	unsigned int offset;
+	__be16 _ports[2];
+	const __be16 *ports;
+
+	outer_iph = ip_hdr(skb);
+
+	flow->saddr = outer_iph->saddr;
+	flow->daddr = outer_iph->daddr;
+	flow->flowi4_proto = outer_iph->protocol;
 	flow->fl4_sport = 0;
 	flow->fl4_dport = 0;
 
-	if (unlikely(ip_is_fragment(iph)))
+	if (unlikely(ip_is_fragment(outer_iph)))
 		return;
 
-	if (iph->protocol == IPPROTO_TCP ||
-	    iph->protocol == IPPROTO_UDP ||
-	    iph->protocol == IPPROTO_SCTP) {
-		__be16 _ports;
-		const __be16 *ports;
+	offset = outer_iph->ihl * 4;
 
-		ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports),
+	if (outer_iph->protocol == IPPROTO_TCP ||
+	    outer_iph->protocol == IPPROTO_UDP ||
+	    outer_iph->protocol == IPPROTO_SCTP) {
+		ports = skb_header_pointer(skb, offset, sizeof(_ports),
 					   &_ports);
 		if (ports) {
 			flow->fl4_sport = ports[0];
 			flow->fl4_dport = ports[1];
 		}
+
+		return;
+	}
+
+	if (outer_iph->protocol != IPPROTO_ICMP)
+		return;
+
+	icmph = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (!icmph)
+		return;
+
+	if (icmph->type != ICMP_DEST_UNREACH &&
+	    icmph->type != ICMP_SOURCE_QUENCH &&
+	    icmph->type != ICMP_REDIRECT &&
+	    icmph->type != ICMP_TIME_EXCEEDED &&
+	    icmph->type != ICMP_PARAMETERPROB) {
+		return;
+	}
+
+	offset += sizeof(_icmph);
+	inner_iph = skb_header_pointer(skb, offset, sizeof(_inner_iph),
+				       &_inner_iph);
+	if (!inner_iph)
+		return;
+
+	/* Since the ICMP payload contains a packet sent from the current
+	 * recipient, we swap source and destination addresses and ports
+	 */
+	flow->saddr = inner_iph->daddr;
+	flow->daddr = inner_iph->saddr;
+	flow->flowi4_proto = inner_iph->protocol;
+
+	if (unlikely(ip_is_fragment(inner_iph)))
+		return;
+
+	if (inner_iph->protocol != IPPROTO_TCP &&
+	    inner_iph->protocol != IPPROTO_UDP &&
+	    inner_iph->protocol != IPPROTO_SCTP) {
+		return;
+	}
+
+	offset += inner_iph->ihl * 4;
+	ports = skb_header_pointer(skb, offset, sizeof(_ports), &_ports);
+	if (ports) {
+		flow->fl4_sport = ports[1];
+		flow->fl4_dport = ports[0];
 	}
 }
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html