> On Jul 18, 2024, at 19:49, Kairui Song <ryncsn@xxxxxxxxx> wrote:
>
> On Wed, Jul 17, 2024 at 11:05 AM Muchun Song <muchun.song@xxxxxxxxx> wrote:
>>> On Jun 25, 2024, at 01:53, Kairui Song <ryncsn@xxxxxxxxx> wrote:
>>>
>>> From: Kairui Song <kasong@xxxxxxxxxxx>
>>>
>>> Currently, there is a lot of code for detecting reparent racing
>>> using kmemcg_id as the synchronization flag. And an intermediate
>>> table is required to record and compare the kmemcg_id.
>>>
>>> We can simplify this by just checking the cgroup css status, skip
>>> if cgroup is being offlined. On the reparenting side, ensure no
>>> more allocation is on going and no further allocation will occur
>>> by using the XArray lock as barrier.
>>>
>>> Combined with a O(n^2) top-down walk for the allocation, we get rid
>>> of the intermediate table allocation completely. Despite being O(n^2),
>>> it should be actually faster because it's not practical to have a very
>>> deep cgroup level.
>>
>> I kind of agree with you. But just instinctually.
>>
>>>
>>> This also avoided changing kmemcg_id before reparenting, making
>>> cgroups have a stable index for list_lru_memcg. After this change
>>> it's possible that a dying cgroup will see a NULL value in XArray
>>> corresponding to the kmemcg_id, because the kmemcg_id will point
>>> to an empty slot. In such case, just fallback to use its parent.
>>>
>>> As a result the code is simpler, following test also showed a
>>> performance gain (6 test runs):
>>>
>>> mkdir /tmp/test-fs
>>> modprobe brd rd_nr=1 rd_size=16777216
>>> mkfs.xfs /dev/ram0
>>> mount -t xfs /dev/ram0 /tmp/test-fs
>>> worker() {
>>>     echo TEST-CONTENT > "/tmp/test-fs/$1"
>>> }
>>> do_test() {
>>>     for i in $(seq 1 2048); do
>>>         (exec sh -c 'echo "$PPID"') > "/sys/fs/cgroup/benchmark/$i/cgroup.procs"
>>>         worker "$i" &
>>>     done; wait
>>>     echo 1 > /proc/sys/vm/drop_caches
>>> }
>>> mkdir -p /sys/fs/cgroup/benchmark
>>> echo +memory > /sys/fs/cgroup/benchmark/cgroup.subtree_control
>>> for i in $(seq 1 2048); do
>>>     rmdir "/sys/fs/cgroup/benchmark/$i" &>/dev/null
>>>     mkdir -p "/sys/fs/cgroup/benchmark/$i"
>>> done
>>> time do_test
>>>
>>> Before:
>>> real 0m5.932s user 0m2.366s sys 0m5.583s
>>> real 0m5.939s user 0m2.347s sys 0m5.597s
>>> real 0m6.149s user 0m2.398s sys 0m5.761s
>>> real 0m5.945s user 0m2.403s sys 0m5.547s
>>> real 0m5.925s user 0m2.293s sys 0m5.651s
>>> real 0m6.017s user 0m2.367s sys 0m5.686s
>>>
>>> After:
>>> real 0m5.712s user 0m2.343s sys 0m5.307s
>>> real 0m5.885s user 0m2.326s sys 0m5.518s
>>> real 0m5.694s user 0m2.347s sys 0m5.264s
>>> real 0m5.865s user 0m2.300s sys 0m5.545s
>>> real 0m5.748s user 0m2.273s sys 0m5.424s
>>> real 0m5.756s user 0m2.318s sys 0m5.398s
>>>
>>> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
>>> ---
>>> mm/list_lru.c | 182 ++++++++++++++++++++++----------------------
>>> mm/zswap.c    |   7 +-
>>> 2 files changed, 81 insertions(+), 108 deletions(-)
>>>
>>> diff --git a/mm/list_lru.c b/mm/list_lru.c
>>> index 4c619857e916..ac8aec8451dd 100644
>>> --- a/mm/list_lru.c
>>> +++ b/mm/list_lru.c
>>> @@ -59,6 +59,20 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
>>> 	}
>>> 	return &lru->node[nid].lru;
>>> }
>>> +
>>> +static inline struct list_lru_one *
>>> +list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg)
>>> +{
>>> +	struct list_lru_one *l;
>>> +again:
>>> +	l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
>>> +	if (likely(l))
>>> +		return l;
>>> +
>>> +	memcg = parent_mem_cgroup(memcg);
>>> +	WARN_ON(!css_is_dying(&memcg->css));
>>> +	goto again;
>>
>> Just want to confirm this will only loop max twice, right?
>
> If the memcg is dying, it finds the nearest parent that is still
> active and returns the parent's lru. If the parent is also dying, try
> the parent's parent. It may repeat multiple times until it reaches the
> root cgroup, but that is very unlikely.

Right.

>
>>
>>> +}
>>> #else
>>> static void list_lru_register(struct list_lru *lru)
>>> {
>>> @@ -83,6 +97,12 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
>>> {
>>> 	return &lru->node[nid].lru;
>>> }
>>> +
>>> +static inline struct list_lru_one *
>>> +list_lru_from_memcg(struct list_lru *lru, int nid, int idx)
>>> +{
>>> +	return &lru->node[nid].lru;
>>> +}
>>> #endif /* CONFIG_MEMCG_KMEM */
>>>
>>> bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
>>> @@ -93,7 +113,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
>>>
>>> 	spin_lock(&nlru->lock);
>>> 	if (list_empty(item)) {
>>> -		l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
>>> +		l = list_lru_from_memcg(lru, nid, memcg);
>>> 		list_add_tail(item, &l->list);
>>> 		/* Set shrinker bit if the first element was added */
>>> 		if (!l->nr_items++)
>>> @@ -124,7 +144,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
>>>
>>> 	spin_lock(&nlru->lock);
>>> 	if (!list_empty(item)) {
>>> -		l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
>>> +		l = list_lru_from_memcg(lru, nid, memcg);
>>> 		list_del_init(item);
>>> 		l->nr_items--;
>>> 		nlru->nr_items--;
>>> @@ -339,20 +359,6 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
>>> 	return mlru;
>>> }
>>>
>>> -static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
>>> -{
>>> -	struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx);
>>> -
>>> -	/*
>>> -	 * The __list_lru_walk_one() can walk the list of this node.
>>> -	 * We need kvfree_rcu() here. And the walking of the list
>>> -	 * is under lru->node[nid]->lock, which can serve as a RCU
>>> -	 * read-side critical section.
>>> -	 */
>>> -	if (mlru)
>>> -		kvfree_rcu(mlru, rcu);
>>> -}
>>> -
>>> static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
>>> {
>>> 	if (memcg_aware)
>>> @@ -377,22 +383,18 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
>>> }
>>>
>>> static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
>>> -					 int src_idx, struct mem_cgroup *dst_memcg)
>>> +					 struct list_lru_one *src,
>>> +					 struct mem_cgroup *dst_memcg)
>>> {
>>> 	struct list_lru_node *nlru = &lru->node[nid];
>>> -	int dst_idx = dst_memcg->kmemcg_id;
>>> -	struct list_lru_one *src, *dst;
>>> +	struct list_lru_one *dst;
>>>
>>> 	/*
>>> 	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
>>> 	 * we have to use IRQ-safe primitives here to avoid deadlock.
>>> 	 */
>>> 	spin_lock_irq(&nlru->lock);
>>> -
>>> -	src = list_lru_from_memcg_idx(lru, nid, src_idx);
>>> -	if (!src)
>>> -		goto out;
>>> -	dst = list_lru_from_memcg_idx(lru, nid, dst_idx);
>>> +	dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg));
>>>
>>> 	list_splice_init(&src->list, &dst->list);
>>>
>>> @@ -401,46 +403,45 @@ static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
>>> 		set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
>>> 		src->nr_items = 0;
>>> 	}
>>> -out:
>>> 	spin_unlock_irq(&nlru->lock);
>>> }
>>>
>>> void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent)
>>> {
>>> -	struct cgroup_subsys_state *css;
>>> 	struct list_lru *lru;
>>> -	int src_idx = memcg->kmemcg_id, i;
>>> -
>>> -	/*
>>> -	 * Change kmemcg_id of this cgroup and all its descendants to the
>>> -	 * parent's id, and then move all entries from this cgroup's list_lrus
>>> -	 * to ones of the parent.
>>> -	 */
>>> -	rcu_read_lock();
>>> -	css_for_each_descendant_pre(css, &memcg->css) {
>>> -		struct mem_cgroup *child;
>>> -
>>> -		child = mem_cgroup_from_css(css);
>>> -		WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id);
>>> -	}
>>> -	rcu_read_unlock();
>>> +	int i;
>>>
>>> -	/*
>>> -	 * With kmemcg_id set to parent, holding the lru lock below can
>>> -	 * prevent list_lru_{add,del,isolate} from touching the lru, safe
>>> -	 * to reparent.
>>> -	 */
>>> 	mutex_lock(&list_lrus_mutex);
>>> 	list_for_each_entry(lru, &memcg_list_lrus, list) {
>>> +		struct list_lru_memcg *mlru;
>>> +		XA_STATE(xas, &lru->xa, memcg->kmemcg_id);
>>> +
>>> +		/*
>>> +		 * Lock the Xarray to ensure no on going allocation and
>>
>> ongoing? I'd like to rewrite the comment here, like:
>>
>> The XArray lock is used to make sure future observers will see that the
>> current memory cgroup is dying (i.e. css_is_dying() will return true),
>> which is significant for memcg_list_lru_alloc() to make sure it will not
>> allocate a new mlru for this dying memory cgroup, which we just free below.
>
> Good suggestion.
>
>>
>>> +		 * further allocation will see css_is_dying().
>>> +		 */
>>> +		xas_lock_irq(&xas);
>>> +		mlru = xas_load(&xas);
>>> +		if (mlru)
>>> +			xas_store(&xas, NULL);
>>> +		xas_unlock_irq(&xas);
>>
>> The above block could be replaced with xa_erase_irq(), simpler, right?
>
> I wanted to reuse the `xas` here; it will be used by later commits, and
> this helps reduce total stack usage. The saving might be trivial, though,
> so I can use xa_erase_irq() here for simpler code and expand it back to
> the xas form again in a later commit.

Yes.

>
>>
>>> +		if (!mlru)
>>> +			continue;
>>> +
>>> +		/*
>>> +		 * With Xarray value set to NULL, holding the lru lock below
>>> +		 * prevents list_lru_{add,del,isolate} from touching the lru,
>>> +		 * safe to reparent.
>>> +		 */
>>> 		for_each_node(i)
>>> -			memcg_reparent_list_lru_node(lru, i, src_idx, parent);
>>> +			memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent);
>>>
>>> 		/*
>>> 		 * Here all list_lrus corresponding to the cgroup are guaranteed
>>> 		 * to remain empty, we can safely free this lru, any further
>>> 		 * memcg_list_lru_alloc() call will simply bail out.
>>> 		 */
>>> -		memcg_list_lru_free(lru, src_idx);
>>> +		kvfree_rcu(mlru, rcu);
>>> 	}
>>> 	mutex_unlock(&list_lrus_mutex);
>>> }
>>> @@ -456,78 +457,51 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
>>> int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
>>> 			 gfp_t gfp)
>>> {
>>> -	int i;
>>> 	unsigned long flags;
>>> -	struct list_lru_memcg_table {
>>> -		struct list_lru_memcg *mlru;
>>> -		struct mem_cgroup *memcg;
>>> -	} *table;
>>> +	struct list_lru_memcg *mlru;
>>> +	struct mem_cgroup *pos, *parent;
>>> 	XA_STATE(xas, &lru->xa, 0);
>>>
>>> 	if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
>>> 		return 0;
>>>
>>> 	gfp &= GFP_RECLAIM_MASK;
>>> -	table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp);
>>> -	if (!table)
>>> -		return -ENOMEM;
>>> -
>>> 	/*
>>> 	 * Because the list_lru can be reparented to the parent cgroup's
>>> 	 * list_lru, we should make sure that this cgroup and all its
>>> 	 * ancestors have allocated list_lru_memcg.
>>> 	 */
>>> -	for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) {
>>> -		if (memcg_list_lru_allocated(memcg, lru))
>>> -			break;
>>> -
>>> -		table[i].memcg = memcg;
>>> -		table[i].mlru = memcg_init_list_lru_one(gfp);
>>> -		if (!table[i].mlru) {
>>> -			while (i--)
>>> -				kfree(table[i].mlru);
>>> -			kfree(table);
>>> -			return -ENOMEM;
>>> +	do {
>>> +		/*
>>> +		 * Keep finding the farest parent that wasn't populated
>>> +		 * until found memcg itself.
>>> +		 */
>>> +		pos = memcg;
>>> +		parent = parent_mem_cgroup(pos);
>>> +		while (parent && !memcg_list_lru_allocated(parent, lru)) {
>>
>> The check of @parent is unnecessary since memcg_list_lru_allocated() always
>> returns true for the root memory cgroup. Right?
>
> Right, I just wrote it this way to be less error prone, as
> memcg_list_lru_allocated() can't handle NULL. But yeah, a NULL parent
> shouldn't happen here, so the check can be removed.
>
>>
>>> +			pos = parent;
>>> +			parent = parent_mem_cgroup(pos);
>>> 		}
>>> -	}
>>>
>>> -	xas_lock_irqsave(&xas, flags);
>>> -	while (i--) {
>>> -		int index = READ_ONCE(table[i].memcg->kmemcg_id);
>>> -		struct list_lru_memcg *mlru = table[i].mlru;
>>> -
>>> -		xas_set(&xas, index);
>>> -retry:
>>> -		if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
>>> -			kfree(mlru);
>>> -		} else {
>>> -			xas_store(&xas, mlru);
>>> -			if (xas_error(&xas) == -ENOMEM) {
>>> +		mlru = memcg_init_list_lru_one(gfp);
>>> +		do {
>>> +			bool alloced = false;
>>> +
>>> +			xas_set(&xas, pos->kmemcg_id);
>>> +			xas_lock_irqsave(&xas, flags);
>>> +			if (!css_is_dying(&pos->css) && !xas_load(&xas)) {
>>> +				xas_store(&xas, mlru);
>>> +				alloced = true;
>>> +			}
>>> +			if (!alloced || xas_error(&xas)) {
>>> 				xas_unlock_irqrestore(&xas, flags);
>>> -				if (xas_nomem(&xas, gfp))
>>> -					xas_set_err(&xas, 0);
>>> -				xas_lock_irqsave(&xas, flags);
>>> -				/*
>>> -				 * The xas lock has been released, this memcg
>>> -				 * can be reparented before us. So reload
>>> -				 * memcg id. More details see the comments
>>> -				 * in memcg_reparent_list_lrus().
>>> -				 */
>>> -				index = READ_ONCE(table[i].memcg->kmemcg_id);
>>> -				if (index < 0)
>>> -					xas_set_err(&xas, 0);
>>> -				else if (!xas_error(&xas) && index != xas.xa_index)
>>> -					xas_set(&xas, index);
>>> -				goto retry;
>>> +				kfree(mlru);
>>> +				goto out;
>>
>> 1) If xas_error() returns true, we should continue since xas_nomem() will handle the NOMEM
>> case for us.
>
> Oh, my bad. I think I lost an intermediate commit for this part and
> ended up exiting too early in the error case; it also needs to check
> the !mlru case above and return -ENOMEM. I will update this part.
>
>>
>> 2) If xas_load() returns a non-NULL pointer, meaning a concurrent thread has
>> assigned a new mlru for us, we should continue since we might not have
>> assigned a mlru for the leaf memory cgroup yet. Suppose the following case:
>>
>>        A
>>       / \
>>      B   C
>>
>> If there are two threads allocating mlru for A, then either B or C will not allocate a mlru.
>> Here is a code snippet FYI.
>
> Nice find, I forgot the case where multiple children could race for one parent.
>
>>
>> Thanks.
>>
>> ```c
>> struct list_lru_memcg *mlru = NULL;
>>
>> do {
>> 	/* ... */
>> 	if (!mlru)
>> 		mlru = memcg_init_list_lru_one(gfp);
>>
>> 	xas_set(&xas, pos->kmemcg_id);
>> 	do {
>> 		xas_lock_irqsave(&xas, flags);
>> 		if (css_is_dying(&pos->css) || xas_load(&xas))
>> 			goto unlock;
>> 		xas_store(&xas, mlru);
>> 		if (!xas_error(&xas))
>> 			mlru = NULL;
>> unlock:
>> 		xas_unlock_irqrestore(&xas, flags);
>> 	} while (xas_nomem(&xas, gfp));
>> } while (pos != memcg);
>>
>> if (mlru)
>> 	kfree(mlru);
>> ```
>
> Good suggestion. One more thing: should it abort the iteration if the
> memcg is dead? Considering the handling of the memcg_init_list_lru_one()
> failure as well, the loop could be something like this?

Agree.

>
> +	mlru = NULL;
> +	do {
> +		pos = memcg;
> +		parent = parent_mem_cgroup(pos);
> +		while (!memcg_list_lru_allocated(parent, lru)) {
> +			pos = parent;
> +			parent = parent_mem_cgroup(pos);
> +		}
> +
> +		mlru = mlru ?: memcg_init_list_lru_one(gfp);
> +		if (!mlru)
> +			return -ENOMEM;
> +		xas_set(&xas, pos->kmemcg_id);
> +		do {
> +			xas_lock_irqsave(&xas, flags);
> +			if (!xas_load(&xas) && !css_is_dying(&pos->css)) {
> +				xas_store(&xas, mlru);
> +				if (!xas_error(&xas))
> +					mlru = NULL;
> +			}
> +			xas_unlock_irqrestore(&xas, flags);
> +		} while (xas_nomem(&xas, gfp));
> +	} while (!css_is_dying(&pos->css) && pos != memcg);
> +
> +	if (mlru)
> +		kfree(mlru);
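For reference, the fallback walk in the new list_lru_from_memcg() is easy to
model outside the kernel. Below is a minimal userspace sketch, assuming
stand-in types (model_memcg, model_lru) and a flat slot array in place of
mem_cgroup and the XArray; it is not kernel API, and only the
walk-to-parent logic mirrors the patch:

```c
/*
 * Userspace model only: model_memcg/model_lru and the slot array are
 * stand-ins for mem_cgroup and lru->xa, not kernel structures.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_IDS 8

struct model_memcg {
	int kmemcg_id;
	bool dying;			/* models css_is_dying() */
	struct model_memcg *parent;	/* models parent_mem_cgroup() */
};

struct model_lru {
	const char *slot[NR_IDS];	/* models lru->xa: kmemcg_id -> list_lru_one */
};

/* Mirrors list_lru_from_memcg(): fall back to the nearest live ancestor. */
static const char *lru_from_memcg(struct model_lru *lru, struct model_memcg *memcg)
{
	const char *l;
again:
	l = lru->slot[memcg->kmemcg_id];
	if (l)
		return l;
	/* A NULL slot is only expected for a dying (already reparented) cgroup. */
	assert(memcg->dying);
	memcg = memcg->parent;
	goto again;
}

int main(void)
{
	struct model_memcg root = { .kmemcg_id = 0 };
	struct model_memcg child = { .kmemcg_id = 1, .parent = &root };
	struct model_lru lru = { .slot = { [0] = "root lru", [1] = "child lru" } };

	printf("before reparent: %s\n", lru_from_memcg(&lru, &child));

	/* Models memcg_reparent_list_lrus(): clear the slot, mark the css dying. */
	child.dying = true;
	lru.slot[1] = NULL;

	printf("after reparent:  %s\n", lru_from_memcg(&lru, &child));
	return 0;
}
```

Built with a plain C compiler, it prints the child's own lru before
reparenting and the parent's lru afterwards, which is the behaviour the
WARN_ON(!css_is_dying()) in the patch relies on.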