Le lundi 08 novembre 2010 à 14:44 +0800, Changli Gao a écrit : > On Mon, Nov 8, 2010 at 2:39 PM, Patrick McHardy <kaber@xxxxxxxxx> wrote: > > [resend with netfilter-devel address fixed] > > > > On 08.11.2010 07:36, Patrick McHardy wrote: > >> On 08.11.2010 07:32, Changli Gao wrote: > >>> When conntrack is enabled, we can cache dst_entry into the > >>> corresponding conntrack to eliminate the subsequent > >>> ip_route_input_noref() calls for the same connection. The current > >>> implementation is a standalone module. If this idea is welcomed, I'll > >>> try to push it up. Thanks. > >>> > >>> Here it is: https://github.com/xiaosuo/xiaosuo/tree/master/nf_rtcache/ > >> > >> Please send the patch as attachment so it can be discussed. > > > > > > Here is a trivial patch to add RTCACHE NF extension. > > > diff --git a/include/net/netfilter/nf_conntrack_extend.h > b/include/net/netfilter/nf_conntrack_extend.h > index 0772d29..a5077ef 100644 > --- a/include/net/netfilter/nf_conntrack_extend.h > +++ b/include/net/netfilter/nf_conntrack_extend.h > @@ -11,6 +11,7 @@ enum nf_ct_ext_id { > NF_CT_EXT_ACCT, > NF_CT_EXT_ECACHE, > NF_CT_EXT_ZONE, > + NF_CT_EXT_RTCACHE, > NF_CT_EXT_NUM, > }; > > @@ -19,6 +20,7 @@ enum nf_ct_ext_id { > #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter > #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache > #define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone > +#define NF_CT_EXT_RTCACHE_TYPE struct nf_rtcache > > /* Extensions: optional stuff which isn't permanently in struct. 
*/ > struct nf_ct_ext { > > And here is main code: > > > #define DEBUG > #define pr_fmt(fmt) "nf_rtcache: " fmt > #include <linux/module.h> > #include <linux/ip.h> > #include <net/ip.h> > #include <net/dst.h> > #include <net/route.h> > #include <net/netfilter/nf_conntrack_extend.h> > > MODULE_AUTHOR("Changli Gao <xiaosuo@xxxxxxxxx>"); > MODULE_LICENSE("GPL"); > > struct nf_rtcache { > struct dst_entry *dst[IP_CT_DIR_MAX]; > }; > > static void nf_rtcache_destroy(struct nf_conn *ct) > { > struct nf_rtcache *cache = nf_ct_ext_find(ct, NF_CT_EXT_RTCACHE); > struct dst_entry *dst; > > /* rcu_read_lock is held by __nf_ct_ext_destroy() */ > dst = rcu_dereference(cache->dst[IP_CT_DIR_ORIGINAL]); > if (dst) > dst_release(dst); > dst = rcu_dereference(cache->dst[IP_CT_DIR_REPLY]); > if (dst) > dst_release(dst); > } > > static struct nf_ct_ext_type nf_rtcache_ext __read_mostly = { > .len = sizeof(struct nf_rtcache), > .align = __alignof__(struct nf_rtcache), > .id = NF_CT_EXT_RTCACHE, > .destroy = nf_rtcache_destroy, > }; > > static unsigned int nf_rtcache_hook(unsigned int hooknum, struct sk_buff *skb, > const struct net_device *in, > const struct net_device *out, > int (*okfn)(struct sk_buff *)) > { > struct nf_conn *ct; > enum ip_conntrack_info ctinfo; > struct nf_rtcache *cache; > enum ip_conntrack_dir dir; > struct dst_entry *dst; > struct iphdr *iph; > int err; > > dst = skb_dst(skb); > if (dst) > return NF_ACCEPT; > /* rcu_read_lock is held by nf_hook_slow() */ > ct = nf_ct_get(skb, &ctinfo); > if (!ct) > return NF_ACCEPT; > cache = nf_ct_ext_find(ct, NF_CT_EXT_RTCACHE); > if (!cache) { > cache = nf_ct_ext_add(ct, NF_CT_EXT_RTCACHE, GFP_ATOMIC); > if (!cache) > return NF_ACCEPT; > } > dir = CTINFO2DIR(ctinfo); > dst = rcu_dereference(cache->dst[dir]); > iph = ip_hdr(skb); > if (dst && dst->obsolete <= 0) { > struct rtable *rth; > > rth = (struct rtable *)dst; > if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)iph->daddr) | > ((__force u32)rth->fl.fl4_src ^ (__force 
u32)iph->saddr) | > (rth->fl.iif ^ skb->dev->ifindex) | > rth->fl.oif | > (rth->fl.fl4_tos ^ (iph->tos & IPTOS_RT_MASK))) == 0 && > rth->fl.mark == skb->mark && > net_eq(dev_net(rth->dst.dev), dev_net(skb->dev)) && > rth->dst.ops->check(&rth->dst, 0)) { > dst_use_noref(dst, jiffies); > skb_dst_set_noref(skb, dst); > pr_debug("hit: %p\n", cache); > > return NF_ACCEPT; > } > } > > err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, > skb->dev); > if (unlikely(err)) { > if (err == -EHOSTUNREACH) > IP_INC_STATS(dev_net(skb->dev), > IPSTATS_MIB_INADDRERRORS); > else if (err == -ENETUNREACH) > IP_INC_STATS(dev_net(skb->dev), > IPSTATS_MIB_INNOROUTES); > else if (err == -EXDEV) > NET_INC_STATS(dev_net(skb->dev), > LINUX_MIB_IPRPFILTER); > return NF_DROP; > } > > dst = skb_dst(skb); > if (dst->flags & DST_NOCACHE) > dst = NULL; > else > dst_hold(dst); > dst = xchg(&cache->dst[dir], dst); > if (dst) > dst_release(dst); > pr_debug("miss: %p\n", cache); > > return NF_ACCEPT; > } > > static struct nf_hook_ops nf_rtcache_ops __read_mostly = { > .hook = nf_rtcache_hook, > .owner = THIS_MODULE, > .pf = PF_INET, > .hooknum = NF_INET_PRE_ROUTING, > .priority = NF_IP_PRI_LAST, > }; > > static __init int init(void) > { > int err; > > need_ipv4_conntrack(); > > err = nf_ct_extend_register(&nf_rtcache_ext); > if (err) > return err; > > err = nf_register_hook(&nf_rtcache_ops); > if (err) { > nf_ct_extend_unregister(&nf_rtcache_ext); > return err; > } > > return 0; > } > > static __exit void fini(void) > { > nf_unregister_hook(&nf_rtcache_ops); > nf_ct_extend_unregister(&nf_rtcache_ext); > } > > module_init(init); > module_exit(fini); > Adding yet another level of cache seems wrong to me. Either we fix the first level, or we don't use it at all. If ip_route_input_noref() is slow, we can work on it to make it faster. I know dst_use_noref() is the real problem and can be optimized if needed. 
-- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html