At 2024-01-23 17:03:42, "Jozsef Kadlecsik" <kadlec@xxxxxxxxxxxxxxxxx> wrote: >Hi, > >On Thu, 18 Jan 2024, Jozsef Kadlecsik wrote: > >> On Thu, 18 Jan 2024, Eric Dumazet wrote: >> >> > On Wed, Jan 17, 2024 at 5:00 PM Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> wrote: >> > > >> > > The patch "netfilter: ipset: fix race condition between swap/destroy >> > > and kernel side add/del/test", commit 28628fa9 fixes a race condition. >> > > But the synchronize_rcu() added to the swap function unnecessarily slows >> > > it down: it can safely be moved to destroy and use call_rcu() instead. >> > > Thus we can get back the same performance and preventing the race condition >> > > at the same time. >[...] >> > > +static void >> > > +ip_set_destroy_set_rcu(struct rcu_head *head) >> > > +{ >> > > + struct ip_set *set = container_of(head, struct ip_set, rcu); >> > > + >> > > + ip_set_destroy_set(set); >> > >> > Calling ip_set_destroy_set() from BH (rcu callbacks) is not working. >> >> Yeah, it calls cancel_delayed_work_sync() to handle the garbage collector >> and that can wait. The call can be moved into the main destroy function >> and let the rcu callback do just the minimal job, however it needs a >> restructuring. So please skip this patch now. > >I reworked the patch to work safely with using call_rcu() and still >preventing the race condition. According to my tests the patch works as >intended, but it'd be good to receive feedback that it indeed fixes the >issue properly. I apply the patch on 6.8.0-rc1, stress create-add-swap-destroy sequence for 1000000 rounds, and no performance regression observed. $ time sudo ./a.out real 0m19.717s user 0m0.849s sys 0m17.060s I also test with two processes create-add-swap-destroy disjoint set of ipset parallelly on a 2-Core VM, no abnormality noticed other than some slower, I guess this is expected, right? $ time sudo ./a.out real 0m23.625s user 0m0.827s sys 0m20.222s $ time sudo ./a2.out real 0m23.838s user 0m0.835s sys 0m20.401s No obverse slab memory increment can be confirmed after several round of stress. cat /proc/meminfo | grep Slab: 39248 kB - 39368 kB -- 39340 kB - 39388 kB ----- 39308 kB FYI David > >From 23e85f453da573dd4265e7d1fffd2aeab3369e0d Mon Sep 17 00:00:00 2001 >From: Jozsef Kadlecsik <kadlec@xxxxxxxxxxxxx> >Date: Tue, 16 Jan 2024 17:10:45 +0100 >Subject: [PATCH 1/1] netfilter: ipset: fix performance regression in swap > operation > >The patch "netfilter: ipset: fix race condition between swap/destroy >and kernel side add/del/test", commit 28628fa9 fixes a race condition. >But the synchronize_rcu() added to the swap function unnecessarily slows >it down: it can safely be moved to destroy and use call_rcu() instead. > >Eric Dumazet pointed out that simply calling the destroy functions as >rcu callback does not work: sets with timeout use garbage collectors >which need cancelling at destroy which can wait. Therefore the destroy >functions are split into two: cancelling garbage collectors safely at >executing the command received by netlink and moving the remaining >part only into the rcu callback. > >Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@xxxxxxxxxxxxxx/ >Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test") >Reported-by: Ale Crismani <ale.crismani@xxxxxxxxxxxxxx> >Reported-by: David Wang <00107082@xxxxxxx >Signed-off-by: Jozsef Kadlecsik <kadlec@xxxxxxxxxxxxx> >--- > include/linux/netfilter/ipset/ip_set.h | 4 +++ > net/netfilter/ipset/ip_set_bitmap_gen.h | 14 ++++++++-- > net/netfilter/ipset/ip_set_core.c | 37 +++++++++++++++++++------ > net/netfilter/ipset/ip_set_hash_gen.h | 15 ++++++++-- > net/netfilter/ipset/ip_set_list_set.c | 13 +++++++-- > 5 files changed, 65 insertions(+), 18 deletions(-) > >diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h >index e8c350a3ade1..e9f4f845d760 100644 >--- a/include/linux/netfilter/ipset/ip_set.h >+++ b/include/linux/netfilter/ipset/ip_set.h >@@ -186,6 +186,8 @@ struct ip_set_type_variant { > /* Return true if "b" set is the same as "a" > * according to the create set parameters */ > bool (*same_set)(const struct ip_set *a, const struct ip_set *b); >+ /* Cancel ongoing garbage collectors before destroying the set*/ >+ void (*cancel_gc)(struct ip_set *set); > /* Region-locking is used */ > bool region_lock; > }; >@@ -242,6 +244,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type); > > /* A generic IP set */ > struct ip_set { >+ /* For call_cru in destroy */ >+ struct rcu_head rcu; > /* The name of the set */ > char name[IPSET_MAXNAMELEN]; > /* Lock protecting the set data */ >diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h >index 26ab0e9612d8..9523104a90da 100644 >--- a/net/netfilter/ipset/ip_set_bitmap_gen.h >+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h >@@ -28,6 +28,7 @@ > #define mtype_del IPSET_TOKEN(MTYPE, _del) > #define mtype_list IPSET_TOKEN(MTYPE, _list) > #define mtype_gc IPSET_TOKEN(MTYPE, _gc) >+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) > #define mtype MTYPE > > #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) >@@ -57,9 +58,6 @@ mtype_destroy(struct ip_set *set) > { > struct mtype *map = set->data; > >- if (SET_WITH_TIMEOUT(set)) >- del_timer_sync(&map->gc); >- > if (set->dsize && set->extensions & IPSET_EXT_DESTROY) > mtype_ext_cleanup(set); > ip_set_free(map->members); >@@ -288,6 +286,15 @@ mtype_gc(struct timer_list *t) > add_timer(&map->gc); > } > >+static void >+mtype_cancel_gc(struct ip_set *set) >+{ >+ struct mtype *map = set->data; >+ >+ if (SET_WITH_TIMEOUT(set)) >+ del_timer_sync(&map->gc); >+} >+ > static const struct ip_set_type_variant mtype = { > .kadt = mtype_kadt, > .uadt = mtype_uadt, >@@ -301,6 +308,7 @@ static const struct ip_set_type_variant mtype = { > .head = mtype_head, > .list = mtype_list, > .same_set = mtype_same_set, >+ .cancel_gc = mtype_cancel_gc, > }; > > #endif /* __IP_SET_BITMAP_IP_GEN_H */ >diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c >index 4c133e06be1d..bcaad9c009fe 100644 >--- a/net/netfilter/ipset/ip_set_core.c >+++ b/net/netfilter/ipset/ip_set_core.c >@@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set) > kfree(set); > } > >+static void >+ip_set_destroy_set_rcu(struct rcu_head *head) >+{ >+ struct ip_set *set = container_of(head, struct ip_set, rcu); >+ >+ ip_set_destroy_set(set); >+} >+ > static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, > const struct nlattr * const attr[]) > { >@@ -1193,8 +1201,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, > if (unlikely(protocol_min_failed(attr))) > return -IPSET_ERR_PROTOCOL; > >- /* Must wait for flush to be really finished in list:set */ >- rcu_barrier(); > > /* Commands are serialized and references are > * protected by the ip_set_ref_lock. >@@ -1206,8 +1212,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, > * counter, so if it's already zero, we can proceed > * without holding the lock. > */ >- read_lock_bh(&ip_set_ref_lock); > if (!attr[IPSET_ATTR_SETNAME]) { >+ /* Must wait for flush to be really finished in list:set */ >+ rcu_barrier(); >+ read_lock_bh(&ip_set_ref_lock); > for (i = 0; i < inst->ip_set_max; i++) { > s = ip_set(inst, i); > if (s && (s->ref || s->ref_netlink)) { >@@ -1221,6 +1229,8 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, > s = ip_set(inst, i); > if (s) { > ip_set(inst, i) = NULL; >+ /* Must cancel garbage collectors */ >+ s->variant->cancel_gc(s); > ip_set_destroy_set(s); > } > } >@@ -1228,6 +1238,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, > inst->is_destroyed = false; > } else { > u32 flags = flag_exist(info->nlh); >+ u16 features = 0; >+ >+ read_lock_bh(&ip_set_ref_lock); > s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), > &i); > if (!s) { >@@ -1238,10 +1251,16 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, > ret = -IPSET_ERR_BUSY; > goto out; > } >+ features = s->type->features; > ip_set(inst, i) = NULL; > read_unlock_bh(&ip_set_ref_lock); >- >- ip_set_destroy_set(s); >+ if (features & IPSET_TYPE_NAME) { >+ /* Must wait for flush to be really finished */ >+ rcu_barrier(); >+ } >+ /* Must cancel garbage collectors */ >+ s->variant->cancel_gc(s); >+ call_rcu(&s->rcu, ip_set_destroy_set_rcu); > } > return 0; > out: >@@ -1394,9 +1413,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, > ip_set(inst, to_id) = from; > write_unlock_bh(&ip_set_ref_lock); > >- /* Make sure all readers of the old set pointers are completed. */ >- synchronize_rcu(); >- > return 0; > } > >@@ -2409,8 +2425,11 @@ ip_set_fini(void) > { > nf_unregister_sockopt(&so_set); > nfnetlink_subsys_unregister(&ip_set_netlink_subsys); >- > unregister_pernet_subsys(&ip_set_net_ops); >+ >+ /* Wait for call_rcu() in destroy */ >+ rcu_barrier(); >+ > pr_debug("these are the famous last words\n"); > } > >diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h >index 7c2399541771..c62998b46f00 100644 >--- a/net/netfilter/ipset/ip_set_hash_gen.h >+++ b/net/netfilter/ipset/ip_set_hash_gen.h >@@ -221,6 +221,7 @@ static const union nf_inet_addr zeromask = {}; > #undef mtype_gc_do > #undef mtype_gc > #undef mtype_gc_init >+#undef mtype_cancel_gc > #undef mtype_variant > #undef mtype_data_match > >@@ -265,6 +266,7 @@ static const union nf_inet_addr zeromask = {}; > #define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) > #define mtype_gc IPSET_TOKEN(MTYPE, _gc) > #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) >+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) > #define mtype_variant IPSET_TOKEN(MTYPE, _variant) > #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) > >@@ -449,9 +451,6 @@ mtype_destroy(struct ip_set *set) > struct htype *h = set->data; > struct list_head *l, *lt; > >- if (SET_WITH_TIMEOUT(set)) >- cancel_delayed_work_sync(&h->gc.dwork); >- > mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); > list_for_each_safe(l, lt, &h->ad) { > list_del(l); >@@ -598,6 +597,15 @@ mtype_gc_init(struct htable_gc *gc) > queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); > } > >+static void >+mtype_cancel_gc(struct ip_set *set) >+{ >+ struct htype *h = set->data; >+ >+ if (SET_WITH_TIMEOUT(set)) >+ cancel_delayed_work_sync(&h->gc.dwork); >+} >+ > static int > mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, > struct ip_set_ext *mext, u32 flags); >@@ -1440,6 +1448,7 @@ static const struct ip_set_type_variant mtype_variant = { > .uref = mtype_uref, > .resize = mtype_resize, > .same_set = mtype_same_set, >+ .cancel_gc = mtype_cancel_gc, > .region_lock = true, > }; > >diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c >index e162636525cf..6c3f28bc59b3 100644 >--- a/net/netfilter/ipset/ip_set_list_set.c >+++ b/net/netfilter/ipset/ip_set_list_set.c >@@ -426,9 +426,6 @@ list_set_destroy(struct ip_set *set) > struct list_set *map = set->data; > struct set_elem *e, *n; > >- if (SET_WITH_TIMEOUT(set)) >- timer_shutdown_sync(&map->gc); >- > list_for_each_entry_safe(e, n, &map->members, list) { > list_del(&e->list); > ip_set_put_byindex(map->net, e->id); >@@ -545,6 +542,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b) > a->extensions == b->extensions; > } > >+static void >+list_set_cancel_gc(struct ip_set *set) >+{ >+ struct list_set *map = set->data; >+ >+ if (SET_WITH_TIMEOUT(set)) >+ timer_shutdown_sync(&map->gc); >+} >+ > static const struct ip_set_type_variant set_variant = { > .kadt = list_set_kadt, > .uadt = list_set_uadt, >@@ -558,6 +564,7 @@ static const struct ip_set_type_variant set_variant = { > .head = list_set_head, > .list = list_set_list, > .same_set = list_set_same_set, >+ .cancel_gc = list_set_cancel_gc, > }; > > static void >-- >2.39.2 > >Best regards, >Jozsef >-- >E-mail : kadlec@xxxxxxxxxxxxxxxxx, kadlecsik.jozsef@xxxxxxxxx >PGP key : https://wigner.hu/~kadlec/pgp_public_key.txt >Address : Wigner Research Centre for Physics > H-1525 Budapest 114, POB. 49, Hungary