This brings the (per-conntrack) ecache extension back to 24 bytes in size (was 112 byte on x86_64 with lockdep on). Instead we use a per-ns timer to re-trigger event delivery. When we enqueue a ct entry into the dying list, the timer will be scheduled. The timer will then deliver up to 20 entries. If not all pending entries could be delivered, it will re-add itself to run again after 2 jiffies. This gives userspace consumers time to drain their sockets. When userspace listeners don't accept events at all, re-try is done after 10ms. If an event was sucessfully delivered via the normal delivery path, we take this as a hint that userspace has processed its backlog and re-try pending entries on the next timer tick. This speeds up re-delivery without adding too much overhead. While at it, dying list handling is moved into ecache.c, since its only revlevant if ct events are enabled. Signed-off-by: Florian Westphal <fw@xxxxxxxxx> --- Pablo, it would be great if you could give this one a spin in your conntrackd testsetup. It avoids the "perpertual-busy-retry" of the previous tasklet-based approach. include/net/netfilter/nf_conntrack.h | 1 - include/net/netfilter/nf_conntrack_ecache.h | 15 ++++++- include/net/netns/conntrack.h | 5 ++- net/netfilter/nf_conntrack_core.c | 55 +---------------------- net/netfilter/nf_conntrack_ecache.c | 64 ++++++++++++++++++++++---- net/netfilter/nf_conntrack_netlink.c | 2 +- 6 files changed, 76 insertions(+), 66 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index caca0c4..e1cc862 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -182,7 +182,6 @@ __nf_conntrack_find(struct net *net, u16 zone, extern int nf_conntrack_hash_check_insert(struct nf_conn *ct); extern void nf_ct_delete_from_lists(struct nf_conn *ct); -extern void nf_ct_dying_timeout(struct nf_conn *ct); extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report); diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 5654d29..cb46cfc 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -18,7 +18,6 @@ struct nf_conntrack_ecache { u16 ctmask; /* bitmask of ct events to be delivered */ u16 expmask; /* bitmask of expect events to be delivered */ u32 portid; /* netlink portid of destroyer */ - struct timer_list timeout; }; static inline struct nf_conntrack_ecache * @@ -210,6 +209,17 @@ nf_ct_expect_event(enum ip_conntrack_expect_events event, extern int nf_conntrack_ecache_init(struct net *net); extern void nf_conntrack_ecache_fini(struct net *net); +static inline void nf_ct_dying_timer_set(struct net *net) +{ + if (!net->ct.dying_backlogged) + mod_timer(&net->ct.dying_timer, jiffies); +} + +static inline void nf_ct_dying_timer_retry(struct net *net) +{ + if (net->ct.dying_backlogged) + mod_timer(&net->ct.dying_timer, jiffies); +} #else /* CONFIG_NF_CONNTRACK_EVENTS */ static inline void nf_conntrack_event_cache(enum ip_conntrack_events event, @@ -232,6 +242,9 @@ static inline void nf_ct_expect_event_report(enum ip_conntrack_expect_events e, u32 portid, int report) {} +static inline void nf_ct_dying_timer_set(struct net *net) { } +static inline void nf_ct_dying_timer_retry(struct net *net) { } + static inline int nf_conntrack_ecache_init(struct net *net) { return 0; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index a1d83cc..cc39bd8 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -71,11 +71,14 @@ struct netns_ct { struct hlist_head *expect_hash; struct hlist_nulls_head unconfirmed; struct hlist_nulls_head dying; +#ifdef CONFIG_NF_CONNTRACK_EVENTS + struct timer_list dying_timer; + bool dying_backlogged; +#endif struct ip_conntrack_stat __percpu *stat; struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb; struct nf_exp_event_notifier __rcu *nf_expect_event_cb; int sysctl_events; - unsigned int sysctl_events_retry_timeout; int sysctl_acct; int sysctl_tstamp; int sysctl_checksum; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 08cdc71..03a275c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -252,41 +252,6 @@ void nf_ct_delete_from_lists(struct nf_conn *ct) } EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists); -static void death_by_event(unsigned long ul_conntrack) -{ - struct nf_conn *ct = (void *)ul_conntrack; - struct net *net = nf_ct_net(ct); - struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); - - BUG_ON(ecache == NULL); - - if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { - /* bad luck, let's retry again */ - ecache->timeout.expires = jiffies + - (random32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ecache->timeout); - return; - } - /* we've got the event delivered, now it's dying */ - set_bit(IPS_DYING_BIT, &ct->status); - nf_ct_put(ct); -} - -void nf_ct_dying_timeout(struct nf_conn *ct) -{ - struct net *net = nf_ct_net(ct); - struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); - - BUG_ON(ecache == NULL); - - /* set a new timer to retry event delivery */ - setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); - ecache->timeout.expires = jiffies + - (random32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ecache->timeout); -} -EXPORT_SYMBOL_GPL(nf_ct_dying_timeout); - static void death_by_timeout(unsigned long ul_conntrack) { struct nf_conn *ct = (void *)ul_conntrack; @@ -300,12 +265,13 @@ static void death_by_timeout(unsigned long ul_conntrack) unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { /* destroy event was not delivered */ nf_ct_delete_from_lists(ct); - nf_ct_dying_timeout(ct); + nf_ct_dying_timer_set(nf_ct_net(ct)); return; } set_bit(IPS_DYING_BIT, &ct->status); nf_ct_delete_from_lists(ct); nf_ct_put(ct); + nf_ct_dying_timer_retry(nf_ct_net(ct)); } /* @@ -1304,21 +1270,6 @@ void nf_conntrack_flush_report(struct net *net, u32 pid, int report) } EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); -static void nf_ct_release_dying_list(struct net *net) -{ - struct nf_conntrack_tuple_hash *h; - struct nf_conn *ct; - struct hlist_nulls_node *n; - - spin_lock_bh(&nf_conntrack_lock); - hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); - /* never fails to remove them, no listeners at this point */ - nf_ct_kill(ct); - } - spin_unlock_bh(&nf_conntrack_lock); -} - static int untrack_refs(void) { int cnt = 0, cpu; @@ -1345,7 +1296,7 @@ static void nf_conntrack_cleanup_net(struct net *net) { i_see_dead_people: nf_ct_iterate_cleanup(net, kill_all, NULL); - nf_ct_release_dying_list(net); + nf_ct_dying_timer_set(net); if (atomic_read(&net->ct.count) != 0) { schedule(); goto i_see_dead_people; diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index faa978f..714d0bd 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -25,8 +25,58 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> +#define RETRY_INTERVAL (HZ/10) +#define RETRY_BUDGET 20 + static DEFINE_MUTEX(nf_ct_ecache_mutex); +static void dying_timer_retry_events(unsigned long ctnetns) +{ + struct netns_ct *ctnet = (void *) ctnetns; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct nf_conn *ct; + int err = 0; + int budget = RETRY_BUDGET; + unsigned long now = jiffies; + + rcu_read_lock(); + + hlist_nulls_for_each_entry(h, n, &ctnet->dying, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (test_bit(IPS_DYING_BIT, &ct->status)) + continue; + err = nf_conntrack_event(IPCT_DESTROY, ct); + if (err) + break; + /* we've got the event delivered, now it's dying */ + set_bit(IPS_DYING_BIT, &ct->status); + nf_ct_put(ct); + if (--budget == 0) + break; + } + + rcu_read_unlock(); + + if (err) { + if (!ctnet->dying_backlogged) + ctnet->dying_backlogged = true; + + if (budget == RETRY_BUDGET) + now += RETRY_INTERVAL; + else + now += 2; + + mod_timer(&ctnet->dying_timer, now); + } else if (budget == 0) { /* might be able to deliver more */ + mod_timer(&ctnet->dying_timer, now); + } else if (ctnet->dying_backlogged) { + ctnet->dying_backlogged = false; + /* All entries delivered, but other cpu might have added more */ + mod_timer(&ctnet->dying_timer, now + RETRY_INTERVAL); + } +} + /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) @@ -155,7 +205,6 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); #define NF_CT_EVENTS_DEFAULT 1 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; -static int nf_ct_events_retry_timeout __read_mostly = 15*HZ; #ifdef CONFIG_SYSCTL static struct ctl_table event_sysctl_table[] = { @@ -166,13 +215,6 @@ static struct ctl_table event_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "nf_conntrack_events_retry_timeout", - .data = &init_net.ct.sysctl_events_retry_timeout, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, {} }; #endif /* CONFIG_SYSCTL */ @@ -194,7 +236,6 @@ static int nf_conntrack_event_init_sysctl(struct net *net) goto out; table[0].data = &net->ct.sysctl_events; - table[1].data = &net->ct.sysctl_events_retry_timeout; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) @@ -238,8 +279,9 @@ int nf_conntrack_ecache_init(struct net *net) int ret; net->ct.sysctl_events = nf_ct_events; - net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout; + setup_timer(&net->ct.dying_timer, dying_timer_retry_events, + (unsigned long) &net->ct); if (net_eq(net, &init_net)) { ret = nf_ct_extend_register(&event_extend); if (ret < 0) { @@ -264,6 +306,8 @@ out_extend_register: void nf_conntrack_ecache_fini(struct net *net) { + del_timer(&net->ct.dying_timer); + nf_conntrack_event_fini_sysctl(net); if (net_eq(net, &init_net)) nf_ct_extend_unregister(&event_extend); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 4e078cd..31d685d 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -992,7 +992,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, nlmsg_report(nlh)) < 0) { nf_ct_delete_from_lists(ct); /* we failed to report the event, try later */ - nf_ct_dying_timeout(ct); + nf_ct_dying_timer_set(net); nf_ct_put(ct); return 0; } -- 1.7.8.6 -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html