Re: RFC: netfilter: cache dst_entry in conntrack

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Le lundi 08 novembre 2010 à 14:44 +0800, Changli Gao a écrit :
> On Mon, Nov 8, 2010 at 2:39 PM, Patrick McHardy <kaber@xxxxxxxxx> wrote:
> > [resend with netfilter-devel address fixed]
> >
> > On 08.11.2010 07:36, Patrick McHardy wrote:
> >> On 08.11.2010 07:32, Changli Gao wrote:
> >>> When conntrack is enabled, we can cache dst_entry into the
> >>> corresponding conntrack to eliminate the subsequent
> >>> ip_route_input_noref() calls for the same connection. The current
> >>> implementation is a standalone module. If this idea is welcomed, I'll
> >>> try to push it up. Thanks.
> >>>
> >>> Here it is: https://github.com/xiaosuo/xiaosuo/tree/master/nf_rtcache/
> >>
> >> Please send the patch as attachment so it can be discussed.
> >
> >
> 
> Here is a trivial patch to add RTCACHE NF extension.
> 
> 
> diff --git a/include/net/netfilter/nf_conntrack_extend.h
> b/include/net/netfilter/nf_conntrack_extend.h
> index 0772d29..a5077ef 100644
> --- a/include/net/netfilter/nf_conntrack_extend.h
> +++ b/include/net/netfilter/nf_conntrack_extend.h
> @@ -11,6 +11,7 @@ enum nf_ct_ext_id {
>  	NF_CT_EXT_ACCT,
>  	NF_CT_EXT_ECACHE,
>  	NF_CT_EXT_ZONE,
> +	NF_CT_EXT_RTCACHE,
>  	NF_CT_EXT_NUM,
>  };
> 
> @@ -19,6 +20,7 @@ enum nf_ct_ext_id {
>  #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter
>  #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
>  #define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone
> +#define NF_CT_EXT_RTCACHE_TYPE struct nf_rtcache
> 
>  /* Extensions: optional stuff which isn't permanently in struct. */
>  struct nf_ct_ext {
> 
> And here is main code:
> 
> 
> #define DEBUG
> #define pr_fmt(fmt) "nf_rtcache: " fmt
> #include <linux/module.h>
> #include <linux/ip.h>
> #include <net/ip.h>
> #include <net/dst.h>
> #include <net/route.h>
> #include <net/netfilter/nf_conntrack_extend.h>
> 
> MODULE_AUTHOR("Changli Gao <xiaosuo@xxxxxxxxx>");
> MODULE_LICENSE("GPL");
> 
> struct nf_rtcache {
> 	struct dst_entry	*dst[IP_CT_DIR_MAX];
> };
> 
> static void nf_rtcache_destroy(struct nf_conn *ct)
> {
> 	struct nf_rtcache *cache = nf_ct_ext_find(ct, NF_CT_EXT_RTCACHE);
> 	struct dst_entry *dst;
> 
> 	/* rcu_read_lock is held by __nf_ct_ext_destroy() */
> 	dst = rcu_dereference(cache->dst[IP_CT_DIR_ORIGINAL]);
> 	if (dst)
> 		dst_release(dst);
> 	dst = rcu_dereference(cache->dst[IP_CT_DIR_REPLY]);
> 	if (dst)
> 		dst_release(dst);
> }
> 
> static struct nf_ct_ext_type nf_rtcache_ext __read_mostly = {
> 	.len		= sizeof(struct nf_rtcache),
> 	.align		= __alignof__(struct nf_rtcache),
> 	.id		= NF_CT_EXT_RTCACHE,
> 	.destroy	= nf_rtcache_destroy,
> };
> 
> static unsigned int nf_rtcache_hook(unsigned int hooknum, struct sk_buff *skb,
> 				    const struct net_device *in,
> 				    const struct net_device *out,
> 				    int (*okfn)(struct sk_buff *))
> {
> 	struct nf_conn *ct;
> 	enum ip_conntrack_info ctinfo;
> 	struct nf_rtcache *cache;
> 	enum ip_conntrack_dir dir;
> 	struct dst_entry *dst;
> 	struct iphdr *iph;
> 	int err;
> 
> 	dst = skb_dst(skb);
> 	if (dst)
> 		return NF_ACCEPT;
> 	/* rcu_read_lock is held by nf_hook_slow() */
> 	ct = nf_ct_get(skb, &ctinfo);
> 	if (!ct)
> 		return NF_ACCEPT;
> 	cache = nf_ct_ext_find(ct, NF_CT_EXT_RTCACHE);
> 	if (!cache) {
> 		cache = nf_ct_ext_add(ct, NF_CT_EXT_RTCACHE, GFP_ATOMIC);
> 		if (!cache)
> 			return NF_ACCEPT;
> 	}
> 	dir = CTINFO2DIR(ctinfo);
> 	dst = rcu_dereference(cache->dst[dir]);
> 	iph = ip_hdr(skb);
> 	if (dst && dst->obsolete <= 0) {
> 		struct rtable *rth;
> 
> 		rth = (struct rtable *)dst;
> 		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)iph->daddr) |
> 		     ((__force u32)rth->fl.fl4_src ^ (__force u32)iph->saddr) |
> 		     (rth->fl.iif ^ skb->dev->ifindex) |
> 		     rth->fl.oif |
> 		     (rth->fl.fl4_tos ^ (iph->tos & IPTOS_RT_MASK))) == 0 &&
> 		    rth->fl.mark == skb->mark &&
> 		    net_eq(dev_net(rth->dst.dev), dev_net(skb->dev)) &&
> 		    rth->dst.ops->check(&rth->dst, 0)) {
> 			dst_use_noref(dst, jiffies);
> 			skb_dst_set_noref(skb, dst);
> 			pr_debug("hit: %p\n", cache);
> 
> 			return NF_ACCEPT;
> 		}
> 	}
> 
> 	err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos,
> 				   skb->dev);
> 	if (unlikely(err)) {
> 		if (err == -EHOSTUNREACH)
> 			IP_INC_STATS(dev_net(skb->dev),
> 				     IPSTATS_MIB_INADDRERRORS);
> 		else if (err == -ENETUNREACH)
> 			IP_INC_STATS(dev_net(skb->dev),
> 				     IPSTATS_MIB_INNOROUTES);
> 		else if (err == -EXDEV)
> 			NET_INC_STATS(dev_net(skb->dev),
> 				      LINUX_MIB_IPRPFILTER);
> 		return NF_DROP;
> 	}
> 
> 	dst = skb_dst(skb);
> 	if (dst->flags & DST_NOCACHE)
> 		dst = NULL;
> 	else
> 		dst_hold(dst);
> 	dst = xchg(&cache->dst[dir], dst);
> 	if (dst)
> 		dst_release(dst);
> 	pr_debug("miss: %p\n", cache);
> 
> 	return NF_ACCEPT;
> }
> 
> static struct nf_hook_ops nf_rtcache_ops __read_mostly = {
> 	.hook		= nf_rtcache_hook,
> 	.owner		= THIS_MODULE,
> 	.pf		= PF_INET,
> 	.hooknum	= NF_INET_PRE_ROUTING,
> 	.priority	= NF_IP_PRI_LAST,
> };
> 
> static __init int init(void)
> {
> 	int err;
> 
> 	need_ipv4_conntrack();
> 
> 	err = nf_ct_extend_register(&nf_rtcache_ext);
> 	if (err)
> 		return err;
> 
> 	err = nf_register_hook(&nf_rtcache_ops);
> 	if (err) {
> 		nf_ct_extend_unregister(&nf_rtcache_ext);
> 		return err;
> 	}
> 
> 	return 0;
> }
> 
> static __exit void fini(void)
> {
> 	nf_unregister_hook(&nf_rtcache_ops);
> 	nf_ct_extend_unregister(&nf_rtcache_ext);
> }
> 
> module_init(init);
> module_exit(fini);
> 

Adding yet another level of cache seems wrong to me.

Either we fix the first level, or we don't use it at all.

If ip_route_input_noref() is slow, we can work on it to make it faster.

I know dst_use_noref() is the real problem and can be optimized if
needed.



--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Netfilter Users]     [LARTC]     [Bugtraq]     [Yosemite Forum]

  Powered by Linux