[PATCH RFC 5/6] mm, slab: asynchronously destroy caches with outstanding objects

Vlastimil Babka <vbabka@xxxxxxx> · Mon, 15 Jul 2024 22:29:31 +0200

We would like to replace call_rcu() users with kfree_rcu() where the
existing callback is just a kmem_cache_free(). However this causes
issues when the cache can be destroyed (such as due to module unload).

Currently such modules should be issuing rcu_barrier() before
kmem_cache_destroy() to have their call_rcu() callbacks processed first.
This barrier is however not sufficient for kfree_rcu() in flight due
to the batching introduced by a35d16905efc ("rcu: Add basic support for
kfree_rcu() batching").

This is not a problem for kmalloc caches, which are never destroyed, but
since the removal of SLOB, kfree_rcu() is also allowed for any other
cache, which might be destroyed.

In order not to complicate the API, put the responsibility for handling
outstanding kfree_rcu() in kmem_cache_destroy() itself. Use the result
of __kmem_cache_shutdown() to determine if there are still allocated
objects in the cache, and if there are, assume it's due to kfree_rcu().
In that case schedule a work item that will use the appropriate barrier
and then attempt __kmem_cache_shutdown() again. Only if that fails as
well, produce the usual warning about non-freed objects.

Sysfs and debugfs directories are removed immediately, so the cache can
be recreated with the same name without issues, while the previous
instance is still pending removal.

Users of call_rcu() with arbitrary callbacks should still perform their
own synchronous barrier before destroying the cache and unloading the
module, as the callbacks may invoke module code or perform other
actions that are necessary for a successful unload.

Note that another non-bug reason why there might be objects outstanding
is the kasan quarantine. In that case the cleanup also becomes
asynchronous, and flushing the quarantine by kasan_cache_shutdown(s) is
only done in the workfn.

Signed-off-by: Vlastimil Babka <vbabka@xxxxxxx>
---
 mm/slab.h        |  4 +++-
 mm/slab_common.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++----
 mm/slub.c        |  9 +++++----
 3 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
index ece18ef5dd04..390a4e265f03 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -279,6 +279,8 @@ struct kmem_cache {
 	unsigned int red_left_pad;	/* Left redzone padding size */
 	const char *name;		/* Name (only for display!) */
 	struct list_head list;		/* List of slab caches */
+	struct work_struct async_destroy_work;
+
 #ifdef CONFIG_SYSFS
 	struct kobject kobj;		/* For sysfs */
 #endif
@@ -478,7 +480,7 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s)
 			      SLAB_NO_USER_FLAGS)
 
 bool __kmem_cache_empty(struct kmem_cache *);
-int __kmem_cache_shutdown(struct kmem_cache *);
+int __kmem_cache_shutdown(struct kmem_cache *, bool);
 void __kmem_cache_release(struct kmem_cache *);
 int __kmem_cache_shrink(struct kmem_cache *);
 void slab_kmem_cache_release(struct kmem_cache *);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 57962e1a5a86..3e15525819b6 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -44,6 +44,8 @@ static LIST_HEAD(slab_caches_to_rcu_destroy);
 static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
 static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
 		    slab_caches_to_rcu_destroy_workfn);
+static void kmem_cache_kfree_rcu_destroy_workfn(struct work_struct *work);
+
 
 /*
  * Set of flags that will prevent slab merging
@@ -235,6 +237,7 @@ static struct kmem_cache *create_cache(const char *name,
 
 	s->refcount = 1;
 	list_add(&s->list, &slab_caches);
+	INIT_WORK(&s->async_destroy_work, kmem_cache_kfree_rcu_destroy_workfn);
 	return s;
 
 out_free_cache:
@@ -535,6 +538,47 @@ void slab_kmem_cache_release(struct kmem_cache *s)
 	kmem_cache_free(kmem_cache, s);
 }
 
+static void kmem_cache_kfree_rcu_destroy_workfn(struct work_struct *work)
+{
+	struct kmem_cache *s;
+	bool rcu_set;
+	int err;
+
+	s = container_of(work, struct kmem_cache, async_destroy_work);
+
+	// XXX use the real kmem_cache_free_barrier() or similar thing here
+	rcu_barrier();
+
+	cpus_read_lock();
+	mutex_lock(&slab_mutex);
+
+	rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
+
+	/* free asan quarantined objects */
+	kasan_cache_shutdown(s);
+
+	err = __kmem_cache_shutdown(s, true);
+	WARN(err, "kmem_cache_destroy %s: Slab cache still has objects",
+	     s->name);
+
+	if (err)
+		goto out_unlock;
+
+	list_del(&s->list);
+
+	if (rcu_set) {
+		list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
+		schedule_work(&slab_caches_to_rcu_destroy_work);
+	}
+
+out_unlock:
+	mutex_unlock(&slab_mutex);
+	cpus_read_unlock();
+
+	if (!err && !rcu_set)
+		kmem_cache_release(s);
+}
+
 void kmem_cache_destroy(struct kmem_cache *s)
 {
 	bool rcu_set;
@@ -558,9 +602,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
 	/* free asan quarantined objects */
 	kasan_cache_shutdown(s);
 
-	err = __kmem_cache_shutdown(s);
-	WARN(err, "%s %s: Slab cache still has objects when called from %pS",
-	     __func__, s->name, (void *)_RET_IP_);
+	err = __kmem_cache_shutdown(s, false);
 
 	if (!err)
 		list_del(&s->list);
@@ -573,8 +615,10 @@ void kmem_cache_destroy(struct kmem_cache *s)
 	}
 	debugfs_slab_release(s);
 
-	if (err)
+	if (err) {
+		schedule_work(&s->async_destroy_work);
 		return;
+	}
 
 	if (rcu_set) {
 		mutex_lock(&slab_mutex);
diff --git a/mm/slub.c b/mm/slub.c
index aa4d80109c49..c1222467c346 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5352,7 +5352,8 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
  * This is called from __kmem_cache_shutdown(). We must take list_lock
  * because sysfs file might still access partial list after the shutdowning.
  */
-static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
+static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n,
+			 bool warn_inuse)
 {
 	LIST_HEAD(discard);
 	struct slab *slab, *h;
@@ -5363,7 +5364,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 		if (!slab->inuse) {
 			remove_partial(n, slab);
 			list_add(&slab->slab_list, &discard);
-		} else {
+		} else if (warn_inuse) {
 			list_slab_objects(s, slab,
 			  "Objects remaining in %s on __kmem_cache_shutdown()");
 		}
@@ -5388,7 +5389,7 @@ bool __kmem_cache_empty(struct kmem_cache *s)
 /*
  * Release all resources used by a slab cache.
  */
-int __kmem_cache_shutdown(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s, bool warn_inuse)
 {
 	int node;
 	struct kmem_cache_node *n;
@@ -5396,7 +5397,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
 	flush_all_cpus_locked(s);
 	/* Attempt to free all objects */
 	for_each_kmem_cache_node(s, node, n) {
-		free_partial(s, n);
+		free_partial(s, n, warn_inuse);
 		if (n->nr_partial || node_nr_slabs(n))
 			return 1;
 	}

-- 
2.45.2