Re: [PATCH v2] ipset: list:set: Decrease refcount synchronously on deletion and replace

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Stefano,

On Sun, 8 Jul 2018, Stefano Brivio wrote:

> Commit 45040978c899 ("netfilter: ipset: Fix set:list type crash
> when flush/dump set in parallel") postponed decreasing set
> reference counters to the RCU callback.
> 
> An 'ipset del' command can terminate before the RCU grace period
> is elapsed, and if sets are listed before then, the reference
> counter shown in userspace will be wrong:
> 
>  # ipset create h hash:ip; ipset create l list:set; ipset add l
>  # ipset del l h; ipset list h
>  Name: h
>  Type: hash:ip
>  Revision: 4
>  Header: family inet hashsize 1024 maxelem 65536
>  Size in memory: 88
>  References: 1
>  Number of entries: 0
>  Members:
>  # sleep 1; ipset list h
>  Name: h
>  Type: hash:ip
>  Revision: 4
>  Header: family inet hashsize 1024 maxelem 65536
>  Size in memory: 88
>  References: 0
>  Number of entries: 0
>  Members:
> 
> Fix this by making the reference count update synchronous again.
> 
> As a result, when sets are listed, ip_set_name_byindex() might
> now fetch a set whose reference count is already zero. Instead
> of relying on the reference count to protect against concurrent
> set renaming, grab the netlink refcount while copying the name,
> and avoid renaming if the netlink refcount is taken, in a
> similar way as it's already done for swap and delete operations.
> 
> This also reverts commit db268d4dfdb9 ("ipset: remove unused
> function __ip_set_get_netlink") and restores that function
> since we now need it.
> 
> Reported-by: Li Shuang <shuali@xxxxxxxxxx>
> Fixes: 45040978c899 ("netfilter: ipset: Fix set:list type crash when flush/dump set in parallel")
> Signed-off-by: Stefano Brivio <sbrivio@xxxxxxxxxx>
> ---
> v2: As pointed out by Joszef, we can't assume that the nfnl mutex
>     is taken while dumping, so we need some explicit locking
>     to protect names of list:set members against changes
> 
>  include/linux/netfilter/ipset/ip_set.h |  2 +-
>  net/netfilter/ipset/ip_set_core.c      | 31 +++++++++++++++++++++----------
>  net/netfilter/ipset/ip_set_list_set.c  | 17 +++++++++++------
>  3 files changed, 33 insertions(+), 17 deletions(-)
> 
> diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
> index 34fc80f3eb90..1d100efe74ec 100644
> --- a/include/linux/netfilter/ipset/ip_set.h
> +++ b/include/linux/netfilter/ipset/ip_set.h
> @@ -314,7 +314,7 @@ enum {
>  extern ip_set_id_t ip_set_get_byname(struct net *net,
>  				     const char *name, struct ip_set **set);
>  extern void ip_set_put_byindex(struct net *net, ip_set_id_t index);
> -extern const char *ip_set_name_byindex(struct net *net, ip_set_id_t index);
> +extern void ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name);
>  extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index);
>  extern void ip_set_nfnl_put(struct net *net, ip_set_id_t index);
>  
> diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
> index bc4bd247bb7d..4062c0396a02 100644
> --- a/net/netfilter/ipset/ip_set_core.c
> +++ b/net/netfilter/ipset/ip_set_core.c
> @@ -527,6 +527,15 @@ __ip_set_put(struct ip_set *set)
>  /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need
>   * a separate reference counter
>   */
> +static inline void
> +__ip_set_get_netlink(struct ip_set *set)
> +{
> +	write_lock_bh(&ip_set_ref_lock);
> +	BUG_ON(set->ref_netlink != 0);
> +	set->ref_netlink++;
> +	write_unlock_bh(&ip_set_ref_lock);
> +}
> +
>  static inline void
>  __ip_set_put_netlink(struct ip_set *set)
>  {
> @@ -693,21 +702,19 @@ ip_set_put_byindex(struct net *net, ip_set_id_t index)
>  EXPORT_SYMBOL_GPL(ip_set_put_byindex);
>  
>  /* Get the name of a set behind a set index.
> - * We assume the set is referenced, so it does exist and
> - * can't be destroyed. The set cannot be renamed due to
> - * the referencing either.
> - *
> + * Set itself is protected by RCU, but its name isn't: to protect against
> + * renaming, grab its netlink refcount (see ip_set_rename()) and copy it.
>   */
> -const char *
> -ip_set_name_byindex(struct net *net, ip_set_id_t index)
> +void
> +ip_set_name_byindex(struct net *net, ip_set_id_t index, char *name)
>  {
> -	const struct ip_set *set = ip_set_rcu_get(net, index);
> +	struct ip_set *set = ip_set_rcu_get(net, index);
>  
>  	BUG_ON(!set);
> -	BUG_ON(set->ref == 0);
>  
> -	/* Referenced, so it's safe */
> -	return set->name;
> +	__ip_set_get_netlink(set);
> +	strncpy(name, set->name, IPSET_MAXNAMELEN);
> +	__ip_set_put_netlink(set);
>  }
>  EXPORT_SYMBOL_GPL(ip_set_name_byindex);

This is my main concern about the patch: it boils down to *four* locking 
operations, for every single list members in a list type of set. It costs 
too much!

ip_set_name_byindex() is used in ip_set_list_set.c only, so please use 
write_lock_bh()/write_unlock_bh() directly, that should be sufficient.
 
> @@ -1158,6 +1165,10 @@ static int ip_set_rename(struct net *net, struct sock *ctnl,
>  		ret = -IPSET_ERR_REFERENCED;
>  		goto out;
>  	}
> +	if (set->ref_netlink != 0) {
> +		ret = -EBUSY;
> +		goto out;
> +	}

Please return -IPSET_ERR_REFERENCED instead, which is then translated into 
a proper textual error message to the users by ipset.
  
>  	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
>  	for (i = 0; i < inst->ip_set_max; i++) {
> diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
> index 072a658fde04..3a39f20ba6fd 100644
> --- a/net/netfilter/ipset/ip_set_list_set.c
> +++ b/net/netfilter/ipset/ip_set_list_set.c
> @@ -148,9 +148,7 @@ __list_set_del_rcu(struct rcu_head * rcu)
>  {
>  	struct set_elem *e = container_of(rcu, struct set_elem, rcu);
>  	struct ip_set *set = e->set;
> -	struct list_set *map = set->data;
>  
> -	ip_set_put_byindex(map->net, e->id);
>  	ip_set_ext_destroy(set, e);
>  	kfree(e);
>  }
>
> @@ -158,14 +156,20 @@ __list_set_del_rcu(struct rcu_head * rcu)
>  static inline void
>  list_set_del(struct ip_set *set, struct set_elem *e)
>  {
> +	struct list_set *map = set->data;
> +
>  	set->elements--;
> +	ip_set_put_byindex(map->net, e->id);
>  	list_del_rcu(&e->list);
>  	call_rcu(&e->rcu, __list_set_del_rcu);
>  }

Above and below please move ip_set_put_byindex() just before call_rcu().
  
>  static inline void
> -list_set_replace(struct set_elem *e, struct set_elem *old)
> +list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
>  {
> +	struct list_set *map = set->data;
> +
> +	ip_set_put_byindex(map->net, old->id);
>  	list_replace_rcu(&old->list, &e->list);
>  	call_rcu(&old->rcu, __list_set_del_rcu);
>  }
> @@ -298,7 +302,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
>  	INIT_LIST_HEAD(&e->list);
>  	list_set_init_extensions(set, ext, e);
>  	if (n)
> -		list_set_replace(e, n);
> +		list_set_replace(set, e, n);
>  	else if (next)
>  		list_add_tail_rcu(&e->list, &next->list);
>  	else if (prev)
> @@ -486,6 +490,7 @@ list_set_list(const struct ip_set *set,
>  	const struct list_set *map = set->data;
>  	struct nlattr *atd, *nested;
>  	u32 i = 0, first = cb->args[IPSET_CB_ARG0];
> +	char name[IPSET_MAXNAMELEN];
>  	struct set_elem *e;
>  	int ret = 0;
>  
> @@ -504,8 +509,8 @@ list_set_list(const struct ip_set *set,
>  		nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
>  		if (!nested)
>  			goto nla_put_failure;
> -		if (nla_put_string(skb, IPSET_ATTR_NAME,
> -				   ip_set_name_byindex(map->net, e->id)))
> +		ip_set_name_byindex(map->net, e->id, name);
> +		if (nla_put_string(skb, IPSET_ATTR_NAME, name))
>  			goto nla_put_failure;
>  		if (ip_set_put_extensions(skb, set, e, true))
>  			goto nla_put_failure;

Thanks!

Best regards,
Jozsef
-
E-mail  : kadlec@xxxxxxxxxxxxxxxxx, kadlecsik.jozsef@xxxxxxxxxxxxx
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences
          H-1525 Budapest 114, POB. 49, Hungary
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Netfitler Users]     [LARTC]     [Bugtraq]     [Yosemite Forum]

  Powered by Linux