Eric Dumazet wrote:
Le mercredi 09 juin 2010 à 18:30 +0200, Patrick McHardy a écrit :
Eric Dumazet wrote:
@@ -121,16 +128,14 @@ clusterip_config_find_get(__be32 clusterip, int entry)
{
struct clusterip_config *c;
- read_lock_bh(&clusterip_lock);
+ rcu_read_lock_bh();
c = __clusterip_config_find(clusterip);
- if (!c) {
- read_unlock_bh(&clusterip_lock);
- return NULL;
+ if (c) {
+ atomic_inc(&c->refcount);
+ if (entry)
+ atomic_inc(&c->entries);
Shouldn't this be using atomic_inc_not_zero() to avoid races with
clusterip_config_entry_put()?
Ah yes, I should stop doing patches today !
Hmm, I am not sure current code is correct ?
clusterip_config_find_get() can return a struct clusterip_config pointer
to an entry just that was given to clusterip_config_entry_put()
(c->entries == 0)
Then we access c->dev, while clusterip_config_entry_put() did a
dev_put(c->dev) on it ?
In the clusterip_tg_check() case we hold a "entries" reference, so
this can't happen. In arp_mangle() the device pointer is only used
for comparison (except for the pr_debug statement), so at least it
won't crash.
I wonder why this isn't simply using the refcount for both rule
entries and other references.
So following 'standard rcu lookup' works, but race is still there ?
static inline struct clusterip_config *
clusterip_config_find_get(__be32 clusterip, int entry)
{
struct clusterip_config *c;
rcu_read_lock_bh();
c = __clusterip_config_find(clusterip);
if (c) {
if (unlikely(!atomic_inc_not_zero(&c->refcount)))
c = NULL;
else if (entry)
atomic_inc(&c->entries);
}
rcu_read_unlock_bh();
return c;
}
updated patch for reference :
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index f91c94b..ecd77b1 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -53,12 +53,13 @@ struct clusterip_config {
#endif
enum clusterip_hashmode hash_mode; /* which hashing mode */
u_int32_t hash_initval; /* hash initialization */
+ struct rcu_head rcu;
};
static LIST_HEAD(clusterip_configs);
/* clusterip_lock protects the clusterip_configs list */
-static DEFINE_RWLOCK(clusterip_lock);
+static DEFINE_SPINLOCK(clusterip_lock);
#ifdef CONFIG_PROC_FS
static const struct file_operations clusterip_proc_fops;
@@ -71,11 +72,17 @@ clusterip_config_get(struct clusterip_config *c)
atomic_inc(&c->refcount);
}
+
+static void clusterip_config_rcu_free(struct rcu_head *head)
+{
+ kfree(container_of(head, struct clusterip_config, rcu));
+}
+
static inline void
clusterip_config_put(struct clusterip_config *c)
{
if (atomic_dec_and_test(&c->refcount))
- kfree(c);
+ call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
}
/* decrease the count of entries using/referencing this config. If last
@@ -84,10 +91,10 @@ clusterip_config_put(struct clusterip_config *c)
static inline void
clusterip_config_entry_put(struct clusterip_config *c)
{
- write_lock_bh(&clusterip_lock);
+ spin_lock_bh(&clusterip_lock);
if (atomic_dec_and_test(&c->entries)) {
list_del(&c->list);
- write_unlock_bh(&clusterip_lock);
+ spin_unlock_bh(&clusterip_lock);
dev_mc_del(c->dev, c->clustermac);
dev_put(c->dev);
@@ -100,7 +107,7 @@ clusterip_config_entry_put(struct clusterip_config *c)
#endif
return;
}
- write_unlock_bh(&clusterip_lock);
+ spin_unlock_bh(&clusterip_lock);
}
static struct clusterip_config *
@@ -108,7 +115,7 @@ __clusterip_config_find(__be32 clusterip)
{
struct clusterip_config *c;
- list_for_each_entry(c, &clusterip_configs, list) {
+ list_for_each_entry_rcu(c, &clusterip_configs, list) {
if (c->clusterip == clusterip)
return c;
}
@@ -121,16 +128,15 @@ clusterip_config_find_get(__be32 clusterip, int entry)
{
struct clusterip_config *c;
- read_lock_bh(&clusterip_lock);
+ rcu_read_lock_bh();
c = __clusterip_config_find(clusterip);
- if (!c) {
- read_unlock_bh(&clusterip_lock);
- return NULL;
+ if (c) {
+ if (unlikely(!atomic_inc_not_zero(&c->refcount)))
+ c = NULL;
+ else if (entry)
+ atomic_inc(&c->entries);
}
- atomic_inc(&c->refcount);
- if (entry)
- atomic_inc(&c->entries);
- read_unlock_bh(&clusterip_lock);
+ rcu_read_unlock_bh();
return c;
}
@@ -181,9 +187,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
}
#endif
- write_lock_bh(&clusterip_lock);
+ spin_lock_bh(&clusterip_lock);
list_add(&c->list, &clusterip_configs);
- write_unlock_bh(&clusterip_lock);
+ spin_unlock_bh(&clusterip_lock);
return c;
}
@@ -733,6 +739,9 @@ static void __exit clusterip_tg_exit(void)
#endif
nf_unregister_hook(&cip_arp_ops);
xt_unregister_target(&clusterip_tg_reg);
+
+ /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
+ rcu_barrier_bh();
}
module_init(clusterip_tg_init);
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html