> On Jun 25, 2024, at 01:53, Kairui Song <ryncsn@xxxxxxxxx> wrote:
> 
> From: Kairui Song <kasong@xxxxxxxxxxx>
> 
> Currently, there is a lot of code for detecting reparent racing
> using kmemcg_id as the synchronization flag. And an intermediate
> table is required to record and compare the kmemcg_id.
> 
> We can simplify this by just checking the cgroup css status, skip
> if cgroup is being offlined. On the reparenting side, ensure no
> more allocation is on going and no further allocation will occur
> by using the XArray lock as barrier.
> 
> Combined with a O(n^2) top-down walk for the allocation, we get rid
> of the intermediate table allocation completely. Despite being O(n^2),
> it should be actually faster because it's not practical to have a very
> deep cgroup level.

I kind of agree with you, but just instinctively.

> 
> This also avoided changing kmemcg_id before reparenting, making
> cgroups have a stable index for list_lru_memcg. After this change
> it's possible that a dying cgroup will see a NULL value in XArray
> corresponding to the kmemcg_id, because the kmemcg_id will point
> to an empty slot. In such case, just fallback to use its parent.
> 
> As a result the code is simpler, following test also showed a
> performance gain (6 test runs):
> 
> mkdir /tmp/test-fs
> modprobe brd rd_nr=1 rd_size=16777216
> mkfs.xfs /dev/ram0
> mount -t xfs /dev/ram0 /tmp/test-fs
> worker() {
>         echo TEST-CONTENT > "/tmp/test-fs/$1"
> }
> do_test() {
>         for i in $(seq 1 2048); do
>                 (exec sh -c 'echo "$PPID"') > "/sys/fs/cgroup/benchmark/$i/cgroup.procs"
>                 worker "$i" &
>         done; wait
>         echo 1 > /proc/sys/vm/drop_caches
> }
> mkdir -p /sys/fs/cgroup/benchmark
> echo +memory > /sys/fs/cgroup/benchmark/cgroup.subtree_control
> for i in $(seq 1 2048); do
>         rmdir "/sys/fs/cgroup/benchmark/$i" &>/dev/null
>         mkdir -p "/sys/fs/cgroup/benchmark/$i"
> done
> time do_test
> 
> Before:
> real 0m5.932s user 0m2.366s sys 0m5.583s
> real 0m5.939s user 0m2.347s sys 0m5.597s
> real 0m6.149s user 0m2.398s sys 0m5.761s
> real 0m5.945s user 0m2.403s sys 0m5.547s
> real 0m5.925s user 0m2.293s sys 0m5.651s
> real 0m6.017s user 0m2.367s sys 0m5.686s
> 
> After:
> real 0m5.712s user 0m2.343s sys 0m5.307s
> real 0m5.885s user 0m2.326s sys 0m5.518s
> real 0m5.694s user 0m2.347s sys 0m5.264s
> real 0m5.865s user 0m2.300s sys 0m5.545s
> real 0m5.748s user 0m2.273s sys 0m5.424s
> real 0m5.756s user 0m2.318s sys 0m5.398s
> 
> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> ---
>  mm/list_lru.c | 182 ++++++++++++++++++++++----------------------
>  mm/zswap.c    |   7 +-
>  2 files changed, 81 insertions(+), 108 deletions(-)
> 
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index 4c619857e916..ac8aec8451dd 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -59,6 +59,20 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
>         }
>         return &lru->node[nid].lru;
> }
> +
> +static inline struct list_lru_one *
> +list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg)
> +{
> +        struct list_lru_one *l;
> +again:
> +        l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
> +        if (likely(l))
> +                return l;
> +
> +        memcg = parent_mem_cgroup(memcg);
> +        WARN_ON(!css_is_dying(&memcg->css));
> +        goto again;

Just want to confirm: this will loop at most twice, right?
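If it may in fact have to walk more than one level in rare cases (e.g. when a
chain of ancestors is already dying), an explicit walk towards the root might
make the termination condition easier to see. A rough, untested sketch of what
I mean, with the WARN moved onto the memcg whose slot is missing, since I
assume that is the invariant being asserted (please correct me if the current
placement on the parent is intentional):

```c
static inline struct list_lru_one *
list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg)
{
        struct list_lru_one *l;

        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
                l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
                if (likely(l))
                        return l;
                /*
                 * A missing slot is only expected when this memcg has
                 * already been reparented, i.e. it is dying.
                 */
                WARN_ON(!css_is_dying(&memcg->css));
        }
        /* Walked past the root: fall back to the per-node lru. */
        return &lru->node[nid].lru;
}
```

The behaviour should be the same as the goto version apart from the WARN
placement and the explicit fallback when we run out of ancestors, so feel free
to ignore this if the current shape is preferred.
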
> +}
> #else
> static void list_lru_register(struct list_lru *lru)
> {
> @@ -83,6 +97,12 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
> {
>         return &lru->node[nid].lru;
> }
> +
> +static inline struct list_lru_one *
> +list_lru_from_memcg(struct list_lru *lru, int nid, int idx)
> +{
> +        return &lru->node[nid].lru;
> +}
> #endif /* CONFIG_MEMCG_KMEM */
> 
> bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
> @@ -93,7 +113,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
> 
>         spin_lock(&nlru->lock);
>         if (list_empty(item)) {
> -                l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
> +                l = list_lru_from_memcg(lru, nid, memcg);
>                 list_add_tail(item, &l->list);
>                 /* Set shrinker bit if the first element was added */
>                 if (!l->nr_items++)
> @@ -124,7 +144,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
> 
>         spin_lock(&nlru->lock);
>         if (!list_empty(item)) {
> -                l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
> +                l = list_lru_from_memcg(lru, nid, memcg);
>                 list_del_init(item);
>                 l->nr_items--;
>                 nlru->nr_items--;
> @@ -339,20 +359,6 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
>         return mlru;
> }
> 
> -static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
> -{
> -        struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx);
> -
> -        /*
> -         * The __list_lru_walk_one() can walk the list of this node.
> -         * We need kvfree_rcu() here. And the walking of the list
> -         * is under lru->node[nid]->lock, which can serve as a RCU
> -         * read-side critical section.
> -         */
> -        if (mlru)
> -                kvfree_rcu(mlru, rcu);
> -}
> -
> static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
> {
>         if (memcg_aware)
> @@ -377,22 +383,18 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
> }
> 
> static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
> -                int src_idx, struct mem_cgroup *dst_memcg)
> +                struct list_lru_one *src,
> +                struct mem_cgroup *dst_memcg)
> {
>         struct list_lru_node *nlru = &lru->node[nid];
> -        int dst_idx = dst_memcg->kmemcg_id;
> -        struct list_lru_one *src, *dst;
> +        struct list_lru_one *dst;
> 
>         /*
>          * Since list_lru_{add,del} may be called under an IRQ-safe lock,
>          * we have to use IRQ-safe primitives here to avoid deadlock.
>          */
>         spin_lock_irq(&nlru->lock);
> -
> -        src = list_lru_from_memcg_idx(lru, nid, src_idx);
> -        if (!src)
> -                goto out;
> -        dst = list_lru_from_memcg_idx(lru, nid, dst_idx);
> +        dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg));
> 
>         list_splice_init(&src->list, &dst->list);
> 
> @@ -401,46 +403,45 @@ static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
>                 set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
>                 src->nr_items = 0;
>         }
> -out:
>         spin_unlock_irq(&nlru->lock);
> }
> 
> void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> {
> -        struct cgroup_subsys_state *css;
>         struct list_lru *lru;
> -        int src_idx = memcg->kmemcg_id, i;
> -
> -        /*
> -         * Change kmemcg_id of this cgroup and all its descendants to the
> -         * parent's id, and then move all entries from this cgroup's list_lrus
> -         * to ones of the parent.
> -         */
> -        rcu_read_lock();
> -        css_for_each_descendant_pre(css, &memcg->css) {
> -                struct mem_cgroup *child;
> -
> -                child = mem_cgroup_from_css(css);
> -                WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id);
> -        }
> -        rcu_read_unlock();
> +        int i;
> 
> -        /*
> -         * With kmemcg_id set to parent, holding the lru lock below can
> -         * prevent list_lru_{add,del,isolate} from touching the lru, safe
> -         * to reparent.
> -         */
>         mutex_lock(&list_lrus_mutex);
>         list_for_each_entry(lru, &memcg_list_lrus, list) {
> +                struct list_lru_memcg *mlru;
> +                XA_STATE(xas, &lru->xa, memcg->kmemcg_id);
> +
> +                /*
> +                 * Lock the Xarray to ensure no on going allocation and

ongoing?

Also, I'd like to reword the comment here to something like:

The XArray lock is used to make sure future observers will see that this
memory cgroup is dying (i.e. css_is_dying() will return true), which is
significant for memcg_list_lru_alloc(): it makes sure no new mlru will be
allocated for this dying memory cgroup, whose mlru we free just below.

> +                 * further allocation will see css_is_dying().
> +                 */
> +                xas_lock_irq(&xas);
> +                mlru = xas_load(&xas);
> +                if (mlru)
> +                        xas_store(&xas, NULL);
> +                xas_unlock_irq(&xas);

The above block could be replaced with xa_erase_irq(), which is simpler, right?

> +                if (!mlru)
> +                        continue;
> +
> +                /*
> +                 * With Xarray value set to NULL, holding the lru lock below
> +                 * prevents list_lru_{add,del,isolate} from touching the lru,
> +                 * safe to reparent.
> +                 */
>                 for_each_node(i)
> -                        memcg_reparent_list_lru_node(lru, i, src_idx, parent);
> +                        memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent);
> 
>                 /*
>                  * Here all list_lrus corresponding to the cgroup are guaranteed
>                  * to remain empty, we can safely free this lru, any further
>                  * memcg_list_lru_alloc() call will simply bail out.
>                  */
> -                memcg_list_lru_free(lru, src_idx);
> +                kvfree_rcu(mlru, rcu);
>         }
>         mutex_unlock(&list_lrus_mutex);
> }
> @@ -456,78 +457,51 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
> int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
>                 gfp_t gfp)
> {
> -        int i;
>         unsigned long flags;
> -        struct list_lru_memcg_table {
> -                struct list_lru_memcg *mlru;
> -                struct mem_cgroup *memcg;
> -        } *table;
> +        struct list_lru_memcg *mlru;
> +        struct mem_cgroup *pos, *parent;
>         XA_STATE(xas, &lru->xa, 0);
> 
>         if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
>                 return 0;
> 
>         gfp &= GFP_RECLAIM_MASK;
> -        table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp);
> -        if (!table)
> -                return -ENOMEM;
> -
>         /*
>          * Because the list_lru can be reparented to the parent cgroup's
>          * list_lru, we should make sure that this cgroup and all its
>          * ancestors have allocated list_lru_memcg.
>          */
> -        for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) {
> -                if (memcg_list_lru_allocated(memcg, lru))
> -                        break;
> -
> -                table[i].memcg = memcg;
> -                table[i].mlru = memcg_init_list_lru_one(gfp);
> -                if (!table[i].mlru) {
> -                        while (i--)
> -                                kfree(table[i].mlru);
> -                        kfree(table);
> -                        return -ENOMEM;
> +        do {
> +                /*
> +                 * Keep finding the farest parent that wasn't populated
> +                 * until found memcg itself.
> +                 */
> +                pos = memcg;
> +                parent = parent_mem_cgroup(pos);
> +                while (parent && !memcg_list_lru_allocated(parent, lru)) {

The check of @parent is unnecessary since memcg_list_lru_allocated() always
returns true for the root memory cgroup, right?
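If that holds (and the memcg_list_lru_allocated() check at the top of the
function should already keep the root itself out of this walk), the loop could
then be reduced to something like:

```c
        pos = memcg;
        parent = parent_mem_cgroup(pos);
        while (!memcg_list_lru_allocated(parent, lru)) {
                pos = parent;
                parent = parent_mem_cgroup(pos);
        }
```

Just a hypothetical simplification on my side, only worth doing if the
assumption above is really guaranteed.
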
> +                        pos = parent;
> +                        parent = parent_mem_cgroup(pos);
> +                }
> -        }
> 
> -        xas_lock_irqsave(&xas, flags);
> -        while (i--) {
> -                int index = READ_ONCE(table[i].memcg->kmemcg_id);
> -                struct list_lru_memcg *mlru = table[i].mlru;
> -
> -                xas_set(&xas, index);
> -retry:
> -                if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
> -                        kfree(mlru);
> -                } else {
> -                        xas_store(&xas, mlru);
> -                        if (xas_error(&xas) == -ENOMEM) {
> +                mlru = memcg_init_list_lru_one(gfp);
> +                do {
> +                        bool alloced = false;
> +
> +                        xas_set(&xas, pos->kmemcg_id);
> +                        xas_lock_irqsave(&xas, flags);
> +                        if (!css_is_dying(&pos->css) && !xas_load(&xas)) {
> +                                xas_store(&xas, mlru);
> +                                alloced = true;
> +                        }
> +                        if (!alloced || xas_error(&xas)) {
> +                                xas_unlock_irqrestore(&xas, flags);
> -                                if (xas_nomem(&xas, gfp))
> -                                        xas_set_err(&xas, 0);
> -                                xas_lock_irqsave(&xas, flags);
> -                                /*
> -                                 * The xas lock has been released, this memcg
> -                                 * can be reparented before us. So reload
> -                                 * memcg id. More details see the comments
> -                                 * in memcg_reparent_list_lrus().
> -                                 */
> -                                index = READ_ONCE(table[i].memcg->kmemcg_id);
> -                                if (index < 0)
> -                                        xas_set_err(&xas, 0);
> -                                else if (!xas_error(&xas) && index != xas.xa_index)
> -                                        xas_set(&xas, index);
> -                                goto retry;
> +                                kfree(mlru);
> +                                goto out;

1) If xas_error() returns an error, we should continue instead, since
xas_nomem() will handle the -ENOMEM case for us.

2) If xas_load() returns a non-NULL pointer, meaning a concurrent thread has
already assigned an mlru to this slot, we should also continue, since we might
not have assigned an mlru for the leaf memory cgroup yet. Suppose the
following hierarchy:

      A
     / \
    B   C

If two threads allocate an mlru for A at the same time (one on the B path and
one on the C path), the loser sees a non-NULL entry for A and bails out via
"goto out", so either B or C ends up without an mlru.

Here is a code snippet FYI. Thanks.

```c
struct list_lru_memcg *mlru = NULL;

do {
        /* ... */
        if (!mlru)
                mlru = memcg_init_list_lru_one(gfp);

        xas_set(&xas, pos->kmemcg_id);
        do {
                xas_lock_irqsave(&xas, flags);
                if (css_is_dying(&pos->css) || xas_load(&xas))
                        goto unlock;
                xas_store(&xas, mlru);
                if (!xas_error(&xas))
                        mlru = NULL;
unlock:
                xas_unlock_irqrestore(&xas, flags);
        } while (xas_nomem(&xas, gfp));
} while (pos != memcg);

if (mlru)
        kfree(mlru);

/* ... */
```

>                         }
> -                }
> -        }
> -        /* xas_nomem() is used to free memory instead of memory allocation. */
> -        if (xas.xa_alloc)
> -                xas_nomem(&xas, gfp);
> -        xas_unlock_irqrestore(&xas, flags);
> -        kfree(table);
> -
> +                        xas_unlock_irqrestore(&xas, flags);
> +                } while (xas_nomem(&xas, gfp));
> +        } while (pos != memcg);
> +out:
>         return xas_error(&xas);
> }
> #else
> diff --git a/mm/zswap.c b/mm/zswap.c
> index a50e2986cd2f..c6e2256347ff 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -718,12 +718,11 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> 
>         /*
>          * Note that it is safe to use rcu_read_lock() here, even in the face of
> -         * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
> -         * used in list_lru lookup, only two scenarios are possible:
> +         * concurrent memcg offlining:
>          *
> -         * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
> +         * 1. list_lru_add() is called before list_lru_memcg is erased. The
>          *    new entry will be reparented to memcg's parent's list_lru.
> -         * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
> +         * 2. list_lru_add() is called after list_lru_memcg is erased. The
>          *    new entry will be added directly to memcg's parent's list_lru.
>          *
>          * Similar reasoning holds for list_lru_del().
> -- 
> 2.45.2
> 