Linkui Xiao reported that there's a race condition when ipset swap and destroy is called, which can lead to crash in add/del/test element operations. Swap then destroy are usual operations to replace a set with another one in a production system. The issue can in some cases be reproduced with the script: ipset create hash_ip1 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip1 172.20.0.0/16 ipset add hash_ip1 192.168.0.0/16 iptables -A INPUT -m set --match-set hash_ip1 src -j ACCEPT while [ 1 ] do # ... Ongoing traffic... ipset create hash_ip2 hash:net family inet hashsize 1024 maxelem 1048576 ipset add hash_ip2 172.20.0.0/16 ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 sleep 0.05 done In the race case the possible order of the operations are CPU0 CPU1 ip_set_test ipset swap hash_ip1 hash_ip2 ipset destroy hash_ip2 hash_net_kadt Swap replaces hash_ip1 with hash_ip2 and then destroy removes hash_ip2 which is the original hash_ip1. ip_set_test was called on hash_ip1 and because destroy removed it, hash_net_kadt crashes. The fix is to protect both the list of the sets and the set pointers in an extended RCU region and before exiting ip_set_swap(), wait to finish all started rcu_read_lock(). The first version of the patch was written by Linkui Xiao <xiaolinkui@xxxxxxxxxx>. v2: synchronize_rcu() is moved into ip_set_swap() in order not to burden ip_set_destroy() unnecessarily when all sets are destroyed. Closes: https://lore.kernel.org/all/69e7963b-e7f8-3ad0-210-7b86eebf7f78@xxxxxxxxxxxxx/ Reported by: Linkui Xiao <xiaolinkui@xxxxxxxxxx> Signed-off-by: Jozsef Kadlecsik <kadlec@xxxxxxxxxxxxx> --- net/netfilter/ipset/ip_set_core.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e564b5174261..a50ded1ee66d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -704,13 +704,18 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index) struct ip_set_net *inst = ip_set_pernet(net); rcu_read_lock(); - /* ip_set_list itself needs to be protected */ + /* ip_set_list and the set pointer need to be protected */ set = rcu_dereference(inst->ip_set_list)[index]; - rcu_read_unlock(); return set; } +static inline void +ip_set_rcu_put(struct ip_set *set __always_unused) +{ + rcu_read_unlock(); +} + static inline void ip_set_lock(struct ip_set *set) { @@ -736,8 +741,10 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || - !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) { + ip_set_rcu_put(set); return 0; + } ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); @@ -756,6 +763,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, ret = -ret; } + ip_set_rcu_put(set); /* Convert error codes to nomatch */ return (ret < 0 ? 0 : ret); } @@ -772,12 +780,15 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || - !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) { + ip_set_rcu_put(set); return -IPSET_ERR_TYPE_MISMATCH; + } ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); ip_set_unlock(set); + ip_set_rcu_put(set); return ret; } @@ -794,12 +805,15 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, pr_debug("set %s, index %u\n", set->name, index); if (opt->dim < set->type->dimension || - !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) { + ip_set_rcu_put(set); return -IPSET_ERR_TYPE_MISMATCH; + } ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); ip_set_unlock(set); + ip_set_rcu_put(set); return ret; } @@ -874,6 +888,7 @@ ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) read_lock_bh(&ip_set_ref_lock); strscpy_pad(name, set->name, IPSET_MAXNAMELEN); read_unlock_bh(&ip_set_ref_lock); + ip_set_rcu_put(set); } EXPORT_SYMBOL_GPL(ip_set_name_byindex); @@ -1389,6 +1404,9 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, ip_set(inst, to_id) = from; write_unlock_bh(&ip_set_ref_lock); + /* Make sure all readers of the old set pointers are completed. */ + synchronize_rcu(); + return 0; } -- 2.30.2