Hi Stefano, On Sun, 8 Jul 2018, Stefano Brivio wrote: > Commit 45040978c899 ("netfilter: ipset: Fix set:list type crash > when flush/dump set in parallel") postponed decreasing set > reference counters to the RCU callback. > > An 'ipset del' command can terminate before the RCU grace period > is elapsed, and if sets are listed before then, the reference > counter shown in userspace will be wrong: > > # ipset create h hash:ip; ipset create l list:set; ipset add l > # ipset del l h; ipset list h > Name: h > Type: hash:ip > Revision: 4 > Header: family inet hashsize 1024 maxelem 65536 > Size in memory: 88 > References: 1 > Number of entries: 0 > Members: > # sleep 1; ipset list h > Name: h > Type: hash:ip > Revision: 4 > Header: family inet hashsize 1024 maxelem 65536 > Size in memory: 88 > References: 0 > Number of entries: 0 > Members: > > Fix this by making the reference count update synchronous again. > > As a result, when sets are listed, ip_set_name_byindex() might > now fetch a set whose reference count is already zero. Instead > of relying on the reference count to protect against concurrent > set renaming, grab the netlink refcount while copying the name, > and avoid renaming if the netlink refcount is taken, in a > similar way as it's already done for swap and delete operations. > > This also reverts commit db268d4dfdb9 ("ipset: remove unused > function __ip_set_get_netlink") and restores that function > since we now need it. > > Reported-by: Li Shuang <shuali@xxxxxxxxxx> > Fixes: 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel") > Signed-off-by: Stefano Brivio <sbrivio@xxxxxxxxxx> > --- > v2: As pointed out by Joszef, we can't assume that the nfnl mutex > is taken while dumping, so we need some explicit locking > to protect names of list:set members against changes > > include/linux/netfilter/ipset/ip_set.h | 2 +- > net/netfilter/ipset/ip_set_core.c | 31 +++++++++++++++++++++---------- > net/netfilter/ipset/ip_set_list_set.c | 17 +++++++++++------ > 3 files changed, 33 insertions(+), 17 deletions(-) > > diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h > index 34fc80f3eb90..1d100efe74ec 100644 > --- a/include/linux/netfilter/ipset/ip_set.h > +++ b/include/linux/netfilter/ipset/ip_set.h > @@ -314,7 +314,7 @@ enum { > extern ip_set_id_t ip_set_get_byname(struct net *net, > const char *name, struct ip_set **set); > extern void ip_set_put_byindex(struct net *net, ip_set_id_t index); > -extern const char *ip_set_name_byindex(struct net *net, ip_set_id_t index); > +extern void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name); > extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index); > extern void ip_set_nfnl_put(struct net *net, ip_set_id_t index); > > diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c > index bc4bd247bb7d..4062c0396a02 100644 > --- a/net/netfilter/ipset/ip_set_core.c > +++ b/net/netfilter/ipset/ip_set_core.c > @@ -527,6 +527,15 @@ __ip_set_put(struct ip_set *set) > /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need > * a separate reference counter > */ > +static inline void > +__ip_set_get_netlink(struct ip_set *set) > +{ > + write_lock_bh(&ip_set_ref_lock); > + BUG_ON(set->ref_netlink != 0); > + set->ref_netlink++; > + write_unlock_bh(&ip_set_ref_lock); > +} > + > static inline void > __ip_set_put_netlink(struct ip_set *set) > { > @@ -693,21 +702,19 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index) > EXPORT_SYMBOL_GPL(ip_set_put_byindex); > > /* Get the name of a set behind a set index. > - * We assume the set is referenced, so it does exist and > - * can't be destroyed. The set cannot be renamed due to > - * the referencing either. > - * > + * Set itself is protected by RCU, but its name isn't: to protect against > + * renaming, grab its netlink refcount (see ip_set_rename()) and copy it. > */ > -const char * > -ip_set_name_byindex(struct net *net, ip_set_id_t index) > +void > +ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name) > { > - const struct ip_set *set = ip_set_rcu_get(net, index); > + struct ip_set *set = ip_set_rcu_get(net, index); > > BUG_ON(!set); > - BUG_ON(set->ref == 0); > > - /* Referenced, so it's safe */ > - return set->name; > + __ip_set_get_netlink(set); > + strncpy(name, set->name, IPSET_MAXNAMELEN); > + __ip_set_put_netlink(set); > } > EXPORT_SYMBOL_GPL(ip_set_name_byindex); This is my main concern about the patch: it boils down to *four* locking operations, for every single list members in a list type of set. It costs too much! ip_set_name_byindex() is used in ip_set_list_set.c only, so please use write_lock_bh()/write_unlock_bh() directly, that should be sufficient. > @@ -1158,6 +1165,10 @@ static int ip_set_rename(struct net *net, struct sock *ctnl, > ret = -IPSET_ERR_REFERENCED; > goto out; > } > + if (set->ref_netlink != 0) { > + ret = -EBUSY; > + goto out; > + } Please return -IPSET_ERR_REFERENCED instead, which is then translated into a proper textual error message to the users by ipset. > name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); > for (i = 0; i < inst->ip_set_max; i++) { > diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c > index 072a658fde04..3a39f20ba6fd 100644 > --- a/net/netfilter/ipset/ip_set_list_set.c > +++ b/net/netfilter/ipset/ip_set_list_set.c > @@ -148,9 +148,7 @@ __list_set_del_rcu(struct rcu_head * rcu) > { > struct set_elem *e = container_of(rcu, struct set_elem, rcu); > struct ip_set *set = e->set; > - struct list_set *map = set->data; > > - ip_set_put_byindex(map->net, e->id); > ip_set_ext_destroy(set, e); > kfree(e); > } > > @@ -158,14 +156,20 @@ __list_set_del_rcu(struct rcu_head * rcu) > static inline void > list_set_del(struct ip_set *set, struct set_elem *e) > { > + struct list_set *map = set->data; > + > set->elements--; > + ip_set_put_byindex(map->net, e->id); > list_del_rcu(&e->list); > call_rcu(&e->rcu, __list_set_del_rcu); > } Above and below please move ip_set_put_byindex() just before call_rcu(). > static inline void > -list_set_replace(struct set_elem *e, struct set_elem *old) > +list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) > { > + struct list_set *map = set->data; > + > + ip_set_put_byindex(map->net, old->id); > list_replace_rcu(&old->list, &e->list); > call_rcu(&old->rcu, __list_set_del_rcu); > } > @@ -298,7 +302,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, > INIT_LIST_HEAD(&e->list); > list_set_init_extensions(set, ext, e); > if (n) > - list_set_replace(e, n); > + list_set_replace(set, e, n); > else if (next) > list_add_tail_rcu(&e->list, &next->list); > else if (prev) > @@ -486,6 +490,7 @@ list_set_list(const struct ip_set *set, > const struct list_set *map = set->data; > struct nlattr *atd, *nested; > u32 i = 0, first = cb->args[IPSET_CB_ARG0]; > + char name[IPSET_MAXNAMELEN]; > struct set_elem *e; > int ret = 0; > > @@ -504,8 +509,8 @@ list_set_list(const struct ip_set *set, > nested = ipset_nest_start(skb, IPSET_ATTR_DATA); > if (!nested) > goto nla_put_failure; > - if (nla_put_string(skb, IPSET_ATTR_NAME, > - ip_set_name_byindex(map->net, e->id))) > + ip_set_name_byindex(map->net, e->id, name); > + if (nla_put_string(skb, IPSET_ATTR_NAME, name)) > goto nla_put_failure; > if (ip_set_put_extensions(skb, set, e, true)) > goto nla_put_failure; Thanks! Best regards, Jozsef - E-mail : kadlec@xxxxxxxxxxxxxxxxx, kadlecsik.jozsef@xxxxxxxxxxxxx PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences H-1525 Budapest 114, POB. 49, Hungary -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html