On Thu, 18 Jun 2015 15:52:22 -0700 Alexander Duyck <alexander.h.duyck@xxxxxxxxxx> wrote: > > > On 06/17/2015 01:08 PM, Peter Nørlund wrote: > > This patch adds L3 and L4 hash-based multipath routing, selectable > > on a per-route basis with the reintroduced RTA_MP_ALGO attribute. > > The default is now RT_MP_ALG_L3_HASH. > > > > Signed-off-by: Peter Nørlund <pch@xxxxxxxxxxxx> > > --- > > include/net/ip_fib.h | 4 ++- > > include/net/route.h | 5 ++-- > > include/uapi/linux/rtnetlink.h | 14 ++++++++++- > > net/ipv4/fib_frontend.c | 4 +++ > > net/ipv4/fib_semantics.c | 34 ++++++++++++++++++++++--- > > net/ipv4/icmp.c | 4 +-- > > net/ipv4/route.c | 56 > > +++++++++++++++++++++++++++++++++++------- > > net/ipv4/xfrm4_policy.c | 2 +- 8 files changed, 103 > > insertions(+), 20 deletions(-) > > > > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h > > index 4be4f25..250d98e 100644 > > --- a/include/net/ip_fib.h > > +++ b/include/net/ip_fib.h > > @@ -37,6 +37,7 @@ struct fib_config { > > u32 fc_flags; > > u32 fc_priority; > > __be32 fc_prefsrc; > > + int fc_mp_alg; > > struct nlattr *fc_mx; > > struct rtnexthop *fc_mp; > > int fc_mx_len; > > @@ -116,6 +117,7 @@ struct fib_info { > > int fib_nhs; > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > int fib_mp_weight; > > + int fib_mp_alg; > > #endif > > struct rcu_head rcu; > > struct fib_nh fib_nh[0]; > > @@ -308,7 +310,7 @@ int ip_fib_check_default(__be32 gw, struct > > net_device *dev); int fib_sync_down_dev(struct net_device *dev, int > > force); int fib_sync_down_addr(struct net *net, __be32 local); > > int fib_sync_up(struct net_device *dev); > > -void fib_select_multipath(struct fib_result *res); > > +void fib_select_multipath(struct fib_result *res, const struct > > flowi4 *flow); > > > > /* Exported by fib_trie.c */ > > void fib_trie_init(void); > > diff --git a/include/net/route.h b/include/net/route.h > > index fe22d03..1fc7deb 100644 > > --- a/include/net/route.h > > +++ b/include/net/route.h > > @@ -110,7 +110,8 @@ 
struct in_device; > > int ip_rt_init(void); > > void rt_cache_flush(struct net *net); > > void rt_flush_dev(struct net_device *dev); > > -struct rtable *__ip_route_output_key(struct net *, struct flowi4 > > *flp); +struct rtable *__ip_route_output_key(struct net *, struct > > flowi4 *flp, > > + const struct flowi4 *mp_flow); > > struct rtable *ip_route_output_flow(struct net *, struct flowi4 > > *flp, struct sock *sk); > > struct dst_entry *ipv4_blackhole_route(struct net *net, > > @@ -267,7 +268,7 @@ static inline struct rtable > > *ip_route_connect(struct flowi4 *fl4, sport, dport, sk); > > > > if (!dst || !src) { > > - rt = __ip_route_output_key(net, fl4); > > + rt = __ip_route_output_key(net, fl4, NULL); > > if (IS_ERR(rt)) > > return rt; > > ip_rt_put(rt); > > diff --git a/include/uapi/linux/rtnetlink.h > > b/include/uapi/linux/rtnetlink.h index 17fb02f..dff4a72 100644 > > --- a/include/uapi/linux/rtnetlink.h > > +++ b/include/uapi/linux/rtnetlink.h > > @@ -271,6 +271,18 @@ enum rt_scope_t { > > #define RTM_F_EQUALIZE 0x400 /* Multipath > > equalizer: NI */ #define RTM_F_PREFIX > > 0x800 /* Prefix addresses */ > > > > +/* Multipath algorithms */ > > + > > +enum rt_mp_alg_t { > > + RT_MP_ALG_L3_HASH, /* Was IP_MP_ALG_NONE */ > > + RT_MP_ALG_PER_PACKET, /* Was IP_MP_ALG_RR */ > > + RT_MP_ALG_DRR, /* not used */ > > + RT_MP_ALG_RANDOM, /* not used */ > > + RT_MP_ALG_WRANDOM, /* not used */ > > + RT_MP_ALG_L4_HASH, > > + __RT_MP_ALG_MAX > > +}; > > + > > /* Reserved table identifiers */ > > > > enum rt_class_t { > > @@ -301,7 +313,7 @@ enum rtattr_type_t { > > RTA_FLOW, > > RTA_CACHEINFO, > > RTA_SESSION, /* no longer used */ > > - RTA_MP_ALGO, /* no longer used */ > > + RTA_MP_ALGO, > > RTA_TABLE, > > RTA_MARK, > > RTA_MFC_STATS, > > diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c > > index 872494e..376e8c1 100644 > > --- a/net/ipv4/fib_frontend.c > > +++ b/net/ipv4/fib_frontend.c > > @@ -590,6 +590,7 @@ const struct nla_policy 
rtm_ipv4_policy[RTA_MAX > > + 1] = { [RTA_PREFSRC] = { .type = NLA_U32 }, > > [RTA_METRICS] = { .type = NLA_NESTED }, > > [RTA_MULTIPATH] = { .len = sizeof(struct > > rtnexthop) }, > > + [RTA_MP_ALGO] = { .type = NLA_U32 }, > > [RTA_FLOW] = { .type = NLA_U32 }, > > }; > > > > @@ -650,6 +651,9 @@ static int rtm_to_fib_config(struct net *net, > > struct sk_buff *skb, cfg->fc_mp = nla_data(attr); > > cfg->fc_mp_len = nla_len(attr); > > break; > > + case RTA_MP_ALGO: > > + cfg->fc_mp_alg = nla_get_u32(attr); > > + break; > > case RTA_FLOW: > > cfg->fc_flow = nla_get_u32(attr); > > break; > > diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c > > index 8c8df80..da06e88 100644 > > --- a/net/ipv4/fib_semantics.c > > +++ b/net/ipv4/fib_semantics.c > > @@ -257,6 +257,11 @@ static inline int nh_comp(const struct > > fib_info *fi, const struct fib_info *ofi) { > > const struct fib_nh *onh = ofi->fib_nh; > > > > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > > + if (fi->fib_mp_alg != ofi->fib_mp_alg) > > + return -1; > > +#endif > > + > > for_nexthops(fi) { > > if (nh->nh_oif != onh->nh_oif || > > nh->nh_gw != onh->nh_gw || > > @@ -896,6 +901,7 @@ struct fib_info *fib_create_info(struct > > fib_config *cfg) > > > > if (cfg->fc_mp) { > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > + fi->fib_mp_alg = cfg->fc_mp_alg; > > err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, > > cfg); if (err != 0) > > goto failure; > > @@ -1085,6 +1091,10 @@ int fib_dump_info(struct sk_buff *skb, u32 > > portid, u32 seq, int event, struct rtnexthop *rtnh; > > struct nlattr *mp; > > > > + if (fi->fib_mp_alg && > > + nla_put_u32(skb, RTA_MP_ALGO, fi->fib_mp_alg)) > > + goto nla_put_failure; > > + > > mp = nla_nest_start(skb, RTA_MULTIPATH); > > if (!mp) > > goto nla_put_failure; > > @@ -1312,15 +1322,31 @@ int fib_sync_up(struct net_device *dev) > > } > > > > /* > > - * The algorithm is suboptimal, but it provides really > > - * fair weighted route distribution. 
> > + * Compute multipath hash based on 3- or 5-tuple > > */ > > -void fib_select_multipath(struct fib_result *res) > > +static int fib_multipath_hash(const struct fib_result *res, > > + const struct flowi4 *flow) > > +{ > > + u32 hash = flow->saddr ^ flow->daddr; > > + > > + if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH && > > flow->flowi4_proto != 0) > > + hash ^= flow->flowi4_proto ^ flow->fl4_sport ^ > > flow->fl4_dport; + > > + hash ^= hash >> 16; > > + hash ^= hash >> 8; > > + return hash & 0xFF; > > +} > > + > > This hash is still far from optimal. Really I think you should look > at something such as jhash_3words or the like for mixing up the > addresses. Right now just XORing the values together like you are > will end up with a fairly high collision rate since for example in > the case of two endpoints on the same subnet you would lose the > subnet as a result of XORing the source and destination addresses. > Also you would lose the port data in the case of a protocol using > something such as UDP where the source and destination ports might be > the same value. > When I began to work on the multipath code, I took a look at what the hardware routers did, and found that you could typically choose between XOR, CRC16, and CRC32. CRC32 produces excellent hashes but is slow (unless you take advantage of SSE 4.2), while XOR is fast but less optimal. When I tested the XOR hash on the routers of ordbogen.com (my work place), I was actually surprised to see that the XOR hash was sufficient as long as I stuck with 8 bits. But you are probably right. While HTTP-traffic will likely balance well enough with XOR, other kinds of traffic might not. I didn't know the Jenkins hash function, so I haven't tested it, but by the looks of it, I too think it is a good candidate. It also adds the possibility of extending the key space to 16 bits or more. 
> > +void fib_select_multipath(struct fib_result *res, const struct > > flowi4 *flow) { > > struct fib_info *fi = res->fi; > > u8 w; > > > > - w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); > > + if (res->fi->fib_mp_alg == RT_MP_ALG_PER_PACKET) { > > + w = > > bitrev8(this_cpu_inc_return(fib_mp_rr_counter)); > > + } else { > > + w = fib_multipath_hash(res, flow); > > + } > > > > for_nexthops(fi) { > > if (w >= atomic_read(&nh->nh_mp_upper_bound)) > > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c > > index f5203fb..3abcfea 100644 > > --- a/net/ipv4/icmp.c > > +++ b/net/ipv4/icmp.c > > @@ -459,7 +459,7 @@ static struct rtable *icmp_route_lookup(struct > > net *net, fl4->fl4_icmp_type = type; > > fl4->fl4_icmp_code = code; > > security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); > > - rt = __ip_route_output_key(net, fl4); > > + rt = __ip_route_output_key(net, fl4, NULL); > > if (IS_ERR(rt)) > > return rt; > > > > @@ -481,7 +481,7 @@ static struct rtable *icmp_route_lookup(struct > > net *net, goto relookup_failed; > > > > if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { > > - rt2 = __ip_route_output_key(net, &fl4_dec); > > + rt2 = __ip_route_output_key(net, &fl4_dec, NULL); > > if (IS_ERR(rt2)) > > err = PTR_ERR(rt2); > > } else { > > diff --git a/net/ipv4/route.c b/net/ipv4/route.c > > index f605598..a1ec62c 100644 > > --- a/net/ipv4/route.c > > +++ b/net/ipv4/route.c > > @@ -1006,7 +1006,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, > > struct net *net, u32 mtu, > > > > __build_flow_key(&fl4, NULL, iph, oif, > > RT_TOS(iph->tos), protocol, mark, > > flow_flags); > > - rt = __ip_route_output_key(net, &fl4); > > + rt = __ip_route_output_key(net, &fl4, NULL); > > if (!IS_ERR(rt)) { > > __ip_rt_update_pmtu(rt, &fl4, mtu); > > ip_rt_put(rt); > > @@ -1025,7 +1025,7 @@ static void __ipv4_sk_update_pmtu(struct > > sk_buff *skb, struct sock *sk, u32 mtu) if (!fl4.flowi4_mark) > > fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), > > skb->mark); > > > > - rt 
= __ip_route_output_key(sock_net(sk), &fl4); > > + rt = __ip_route_output_key(sock_net(sk), &fl4, NULL); > > if (!IS_ERR(rt)) { > > __ip_rt_update_pmtu(rt, &fl4, mtu); > > ip_rt_put(rt); > > @@ -1094,7 +1094,7 @@ void ipv4_redirect(struct sk_buff *skb, > > struct net *net, > > > > __build_flow_key(&fl4, NULL, iph, oif, > > RT_TOS(iph->tos), protocol, mark, > > flow_flags); > > - rt = __ip_route_output_key(net, &fl4); > > + rt = __ip_route_output_key(net, &fl4, NULL); > > if (!IS_ERR(rt)) { > > __ip_do_redirect(rt, skb, &fl4, false); > > ip_rt_put(rt); > > @@ -1109,7 +1109,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, > > struct sock *sk) struct rtable *rt; > > > > __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0); > > - rt = __ip_route_output_key(sock_net(sk), &fl4); > > + rt = __ip_route_output_key(sock_net(sk), &fl4, NULL); > > if (!IS_ERR(rt)) { > > __ip_do_redirect(rt, skb, &fl4, false); > > ip_rt_put(rt); > > @@ -1631,6 +1631,39 @@ out: > > return err; > > } > > > > +#ifdef CONFIG_IP_ROUTE_MULTIPATH > > +/* Fill flow key data based on packet for use in multipath > > routing. */ +static void ip_multipath_flow(const struct sk_buff > > *skb, struct flowi4 *flow) +{ > > + const struct iphdr *iph; > > + > > + iph = ip_hdr(skb); > > + > > + flow->saddr = iph->saddr; > > + flow->daddr = iph->daddr; > > + flow->flowi4_proto = iph->protocol; > > + flow->fl4_sport = 0; > > + flow->fl4_dport = 0; > > + > > + if (unlikely(ip_is_fragment(iph))) > > + return; > > + > > I'm not sure if checking for fragmentation is enough. For example if > this system is routing and received a flow of UDP packets, some > fragmented some not then it might end up mixing them over 2 separate > next hops since some will include L4 header data and some won't. > > As such you may want to have the option to exclude UDP from the > protocols listed below. > Just like the hash algorithms, I looked at the hardware routers for inspiration, and they typically do like this. 
It is a known problem, and it can just as well happen with TCP, so I don't think adding a special case for UDP is the way to go. But what about looking at the don't-fragment-bit? If the DF-bit is set on a packet, it is usually set on all packets in that flow, so one can assume that that flow will never fragment. If it is clear, you can't be sure and should probably avoid L4 hashing altogether. The interesting question is how many flows will be excluded from L4 hashing with this approach. > > + if (iph->protocol == IPPROTO_TCP || > > + iph->protocol == IPPROTO_UDP || > > + iph->protocol == IPPROTO_SCTP) { > > + __be16 _ports; > > + const __be16 *ports; > > + > > + ports = skb_header_pointer(skb, iph->ihl * 4, > > sizeof(_ports), > > + &_ports); > > + if (ports) { > > + flow->fl4_sport = ports[0]; > > + flow->fl4_dport = ports[1]; > > + } > > + } > > +} > > +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ > > + > > static int ip_mkroute_input(struct sk_buff *skb, > > struct fib_result *res, > > const struct flowi4 *fl4, > > @@ -1638,8 +1671,12 @@ static int ip_mkroute_input(struct sk_buff > > *skb, __be32 daddr, __be32 saddr, u32 tos) > > { > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > - if (res->fi && res->fi->fib_nhs > 1) > > - fib_select_multipath(res); > > + if (res->fi && res->fi->fib_nhs > 1) { > > + struct flowi4 mp_flow; > > + > > + ip_multipath_flow(skb, &mp_flow); > > + fib_select_multipath(res, &mp_flow); > > + } > > What is the point in populating the mp_flow if you don't know if it > is going to be used? You are populating it in ip_multipath_flow, and > then you might completely ignore it in fib_select_multipath. > > Maybe instead of passing the mp_flow you could instead look at > passing a function pointer that would alter the flow for the > multipath case instead. > Definitely not a bad idea. It would remove unnecessary overhead when running per-packet multipath. 
> > #endif > > > > /* create a routing cache entry */ > > @@ -2012,7 +2049,8 @@ add: > > * Major route resolver routine. > > */ > > > > -struct rtable *__ip_route_output_key(struct net *net, struct > > flowi4 *fl4) +struct rtable *__ip_route_output_key(struct net *net, > > struct flowi4 *fl4, > > + const struct flowi4 *mp_flow) > > { > > struct net_device *dev_out = NULL; > > __u8 tos = RT_FL_TOS(fl4); > > @@ -2170,7 +2208,7 @@ struct rtable *__ip_route_output_key(struct > > net *net, struct flowi4 *fl4) > > > > #ifdef CONFIG_IP_ROUTE_MULTIPATH > > if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) > > - fib_select_multipath(&res); > > + fib_select_multipath(&res, (mp_flow ? mp_flow : > > fl4)); else > > #endif > > if (!res.prefixlen && > > @@ -2273,7 +2311,7 @@ struct dst_entry *ipv4_blackhole_route(struct > > net *net, struct dst_entry *dst_or struct rtable > > *ip_route_output_flow(struct net *net, struct flowi4 *flp4, struct > > sock *sk) { > > - struct rtable *rt = __ip_route_output_key(net, flp4); > > + struct rtable *rt = __ip_route_output_key(net, flp4, NULL); > > > > if (IS_ERR(rt)) > > return rt; > > diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c > > index bff6974..7eae158 100644 > > --- a/net/ipv4/xfrm4_policy.c > > +++ b/net/ipv4/xfrm4_policy.c > > @@ -31,7 +31,7 @@ static struct dst_entry > > *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, if (saddr) > > fl4->saddr = saddr->a4; > > > > - rt = __ip_route_output_key(net, fl4); > > + rt = __ip_route_output_key(net, fl4, NULL); > > if (!IS_ERR(rt)) > > return &rt->dst; > > > > -- To unsubscribe from this list: send the line "unsubscribe linux-api" in