> On Jun 25, 2024, at 01:53, Kairui Song <ryncsn@xxxxxxxxx> wrote:
> 
> From: Kairui Song <kasong@xxxxxxxxxxx>
> 
> Currently, there is a lot of code for detecting reparent racing
> using kmemcg_id as the synchronization flag. And an intermediate
> table is required to record and compare the kmemcg_id.
> 
> We can simplify this by just checking the cgroup css status, skip
> if cgroup is being offlined. On the reparenting side, ensure no
> more allocation is on going and no further allocation will occur
> by using the XArray lock as barrier.
> 
> Combined with a O(n^2) top-down walk for the allocation, we get rid
> of the intermediate table allocation completely. Despite being O(n^2),
> it should be actually faster because it's not practical to have a very
> deep cgroup level.

I kind of agree with you, but just instinctively.

> 
> This also avoided changing kmemcg_id before reparenting, making
> cgroups have a stable index for list_lru_memcg. After this change
> it's possible that a dying cgroup will see a NULL value in XArray
> corresponding to the kmemcg_id, because the kmemcg_id will point
> to an empty slot. In such case, just fallback to use its parent.
> 
> As a result the code is simpler, following test also showed a
> performance gain (6 test runs):
> 
> mkdir /tmp/test-fs
> modprobe brd rd_nr=1 rd_size=16777216
> mkfs.xfs /dev/ram0
> mount -t xfs /dev/ram0 /tmp/test-fs
> worker() {
>         echo TEST-CONTENT > "/tmp/test-fs/$1"
> }
> do_test() {
>         for i in $(seq 1 2048); do
>                 (exec sh -c 'echo "$PPID"') > "/sys/fs/cgroup/benchmark/$i/cgroup.procs"
>                 worker "$i" &
>         done; wait
>         echo 1 > /proc/sys/vm/drop_caches
> }
> mkdir -p /sys/fs/cgroup/benchmark
> echo +memory > /sys/fs/cgroup/benchmark/cgroup.subtree_control
> for i in $(seq 1 2048); do
>         rmdir "/sys/fs/cgroup/benchmark/$i" &>/dev/null
>         mkdir -p "/sys/fs/cgroup/benchmark/$i"
> done
> time do_test
> 
> Before:
> real 0m5.932s user 0m2.366s sys 0m5.583s
> real 0m5.939s user 0m2.347s sys 0m5.597s
> real 0m6.149s user 0m2.398s sys 0m5.761s
> real 0m5.945s user 0m2.403s sys 0m5.547s
> real 0m5.925s user 0m2.293s sys 0m5.651s
> real 0m6.017s user 0m2.367s sys 0m5.686s
> 
> After:
> real 0m5.712s user 0m2.343s sys 0m5.307s
> real 0m5.885s user 0m2.326s sys 0m5.518s
> real 0m5.694s user 0m2.347s sys 0m5.264s
> real 0m5.865s user 0m2.300s sys 0m5.545s
> real 0m5.748s user 0m2.273s sys 0m5.424s
> real 0m5.756s user 0m2.318s sys 0m5.398s
> 
> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> ---
>  mm/list_lru.c | 182 ++++++++++++++++++++++----------------------
>  mm/zswap.c    |   7 +-
>  2 files changed, 81 insertions(+), 108 deletions(-)
> 
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index 4c619857e916..ac8aec8451dd 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -59,6 +59,20 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
>         }
>         return &lru->node[nid].lru;
> }
> +
> +static inline struct list_lru_one *
> +list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg)
> +{
> +        struct list_lru_one *l;
> +again:
> +        l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
> +        if (likely(l))
> +                return l;
> +
> +        memcg = parent_mem_cgroup(memcg);
> +        WARN_ON(!css_is_dying(&memcg->css));
> +        goto again;

Just want to confirm: this will loop at most twice, right?
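If it may in fact have to walk more than one level in rare cases (e.g. when a
chain of ancestors is already dying), an explicit walk towards the root might
make the termination condition easier to see. A rough, untested sketch of what
I mean, with the WARN moved onto the memcg whose slot is missing, since I
assume that is the invariant being asserted (please correct me if the current
placement on the parent is intentional):

```c
static inline struct list_lru_one *
list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg)
{
        struct list_lru_one *l;

        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
                l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
                if (likely(l))
                        return l;
                /*
                 * A missing slot is only expected when this memcg has
                 * already been reparented, i.e. it is dying.
                 */
                WARN_ON(!css_is_dying(&memcg->css));
        }
        /* Walked past the root: fall back to the per-node lru. */
        return &lru->node[nid].lru;
}
```

The behaviour should be the same as the goto version apart from the WARN
placement and the explicit fallback when we run out of ancestors, so feel free
to ignore this if the current shape is preferred.
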
> +}
> #else
> static void list_lru_register(struct list_lru *lru)
> {
> @@ -83,6 +97,12 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
> {
>         return &lru->node[nid].lru;
> }
> +
> +static inline struct list_lru_one *
> +list_lru_from_memcg(struct list_lru *lru, int nid, int idx)
> +{
> +        return &lru->node[nid].lru;
> +}
> #endif /* CONFIG_MEMCG_KMEM */
> 
> bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
> @@ -93,7 +113,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
> 
>         spin_lock(&nlru->lock);
>         if (list_empty(item)) {
> -                l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
> +                l = list_lru_from_memcg(lru, nid, memcg);
>                 list_add_tail(item, &l->list);
>                 /* Set shrinker bit if the first element was added */
>                 if (!l->nr_items++)
> @@ -124,7 +144,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
> 
>         spin_lock(&nlru->lock);
>         if (!list_empty(item)) {
> -                l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
> +                l = list_lru_from_memcg(lru, nid, memcg);
>                 list_del_init(item);
>                 l->nr_items--;
>                 nlru->nr_items--;
> @@ -339,20 +359,6 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
>         return mlru;
> }
> 
> -static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
> -{
> -        struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx);
> -
> -        /*
> -         * The __list_lru_walk_one() can walk the list of this node.
> -         * We need kvfree_rcu() here. And the walking of the list
> -         * is under lru->node[nid]->lock, which can serve as a RCU
> -         * read-side critical section.
> -         */
> -        if (mlru)
> -                kvfree_rcu(mlru, rcu);
> -}
> -
> static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
> {
>         if (memcg_aware)
> @@ -377,22 +383,18 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
> }
> 
> static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
> -                int src_idx, struct mem_cgroup *dst_memcg)
> +                struct list_lru_one *src,
> +                struct mem_cgroup *dst_memcg)
> {
>         struct list_lru_node *nlru = &lru->node[nid];
> -        int dst_idx = dst_memcg->kmemcg_id;
> -        struct list_lru_one *src, *dst;
> +        struct list_lru_one *dst;
> 
>         /*
>          * Since list_lru_{add,del} may be called under an IRQ-safe lock,
>          * we have to use IRQ-safe primitives here to avoid deadlock.
>          */
>         spin_lock_irq(&nlru->lock);
> -
> -        src = list_lru_from_memcg_idx(lru, nid, src_idx);
> -        if (!src)
> -                goto out;
> -        dst = list_lru_from_memcg_idx(lru, nid, dst_idx);
> +        dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg));
> 
>         list_splice_init(&src->list, &dst->list);
> 
> @@ -401,46 +403,45 @@ static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
>                 set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
>                 src->nr_items = 0;
>         }
> -out:
>         spin_unlock_irq(&nlru->lock);
> }
> 
> void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent)
> {
> -        struct cgroup_subsys_state *css;
>         struct list_lru *lru;
> -        int src_idx = memcg->kmemcg_id, i;
> -
> -        /*
> -         * Change kmemcg_id of this cgroup and all its descendants to the
> -         * parent's id, and then move all entries from this cgroup's list_lrus
> -         * to ones of the parent.
> -         */
> -        rcu_read_lock();
> -        css_for_each_descendant_pre(css, &memcg->css) {
> -                struct mem_cgroup *child;
> -
> -                child = mem_cgroup_from_css(css);
> -                WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id);
> -        }
> -        rcu_read_unlock();
> +        int i;
> 
> -        /*
> -         * With kmemcg_id set to parent, holding the lru lock below can
> -         * prevent list_lru_{add,del,isolate} from touching the lru, safe
> -         * to reparent.
> -         */
>         mutex_lock(&list_lrus_mutex);
>         list_for_each_entry(lru, &memcg_list_lrus, list) {
> +                struct list_lru_memcg *mlru;
> +                XA_STATE(xas, &lru->xa, memcg->kmemcg_id);
> +
> +                /*
> +                 * Lock the Xarray to ensure no on going allocation and

ongoing?

Also, I'd like to reword the comment here to something like:

The XArray lock is used to make sure future observers will see that this
memory cgroup is dying (i.e. css_is_dying() will return true), which is
significant for memcg_list_lru_alloc(): it makes sure no new mlru will be
allocated for this dying memory cgroup, whose mlru we free just below.

> +                 * further allocation will see css_is_dying().
> +                 */
> +                xas_lock_irq(&xas);
> +                mlru = xas_load(&xas);
> +                if (mlru)
> +                        xas_store(&xas, NULL);
> +                xas_unlock_irq(&xas);

The above block could be replaced with xa_erase_irq(), which is simpler, right?

> +                if (!mlru)
> +                        continue;
> +
> +                /*
> +                 * With Xarray value set to NULL, holding the lru lock below
> +                 * prevents list_lru_{add,del,isolate} from touching the lru,
> +                 * safe to reparent.
> +                 */
>                 for_each_node(i)
> -                        memcg_reparent_list_lru_node(lru, i, src_idx, parent);
> +                        memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent);
> 
>                 /*
>                  * Here all list_lrus corresponding to the cgroup are guaranteed
>                  * to remain empty, we can safely free this lru, any further
>                  * memcg_list_lru_alloc() call will simply bail out.
>                  */
> -                memcg_list_lru_free(lru, src_idx);
> +                kvfree_rcu(mlru, rcu);
>         }
>         mutex_unlock(&list_lrus_mutex);
> }
> @@ -456,78 +457,51 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
> int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
>                 gfp_t gfp)
> {
> -        int i;
>         unsigned long flags;
> -        struct list_lru_memcg_table {
> -                struct list_lru_memcg *mlru;
> -                struct mem_cgroup *memcg;
> -        } *table;
> +        struct list_lru_memcg *mlru;
> +        struct mem_cgroup *pos, *parent;
>         XA_STATE(xas, &lru->xa, 0);
> 
>         if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
>                 return 0;
> 
>         gfp &= GFP_RECLAIM_MASK;
> -        table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp);
> -        if (!table)
> -                return -ENOMEM;
> -
>         /*
>          * Because the list_lru can be reparented to the parent cgroup's
>          * list_lru, we should make sure that this cgroup and all its
>          * ancestors have allocated list_lru_memcg.
>          */
> -        for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) {
> -                if (memcg_list_lru_allocated(memcg, lru))
> -                        break;
> -
> -                table[i].memcg = memcg;
> -                table[i].mlru = memcg_init_list_lru_one(gfp);
> -                if (!table[i].mlru) {
> -                        while (i--)
> -                                kfree(table[i].mlru);
> -                        kfree(table);
> -                        return -ENOMEM;
> +        do {
> +                /*
> +                 * Keep finding the farest parent that wasn't populated
> +                 * until found memcg itself.
> +                 */
> +                pos = memcg;
> +                parent = parent_mem_cgroup(pos);
> +                while (parent && !memcg_list_lru_allocated(parent, lru)) {

The check of @parent is unnecessary since memcg_list_lru_allocated() always
returns true for the root memory cgroup, right?
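If that holds (and the memcg_list_lru_allocated() check at the top of the
function should already keep the root itself out of this walk), the loop could
then be reduced to something like:

```c
        pos = memcg;
        parent = parent_mem_cgroup(pos);
        while (!memcg_list_lru_allocated(parent, lru)) {
                pos = parent;
                parent = parent_mem_cgroup(pos);
        }
```

Just a hypothetical simplification on my side, only worth doing if the
assumption above is really guaranteed.
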
> +                        pos = parent;
> +                        parent = parent_mem_cgroup(pos);
> +                }
> -        }
> 
> -        xas_lock_irqsave(&xas, flags);
> -        while (i--) {
> -                int index = READ_ONCE(table[i].memcg->kmemcg_id);
> -                struct list_lru_memcg *mlru = table[i].mlru;
> -
> -                xas_set(&xas, index);
> -retry:
> -                if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
> -                        kfree(mlru);
> -                } else {
> -                        xas_store(&xas, mlru);
> -                        if (xas_error(&xas) == -ENOMEM) {
> +                mlru = memcg_init_list_lru_one(gfp);
> +                do {
> +                        bool alloced = false;
> +
> +                        xas_set(&xas, pos->kmemcg_id);
> +                        xas_lock_irqsave(&xas, flags);
> +                        if (!css_is_dying(&pos->css) && !xas_load(&xas)) {
> +                                xas_store(&xas, mlru);
> +                                alloced = true;
> +                        }
> +                        if (!alloced || xas_error(&xas)) {
> +                                xas_unlock_irqrestore(&xas, flags);
> -                                if (xas_nomem(&xas, gfp))
> -                                        xas_set_err(&xas, 0);
> -                                xas_lock_irqsave(&xas, flags);
> -                                /*
> -                                 * The xas lock has been released, this memcg
> -                                 * can be reparented before us. So reload
> -                                 * memcg id. More details see the comments
> -                                 * in memcg_reparent_list_lrus().
> -                                 */
> -                                index = READ_ONCE(table[i].memcg->kmemcg_id);
> -                                if (index < 0)
> -                                        xas_set_err(&xas, 0);
> -                                else if (!xas_error(&xas) && index != xas.xa_index)
> -                                        xas_set(&xas, index);
> -                                goto retry;
> +                                kfree(mlru);
> +                                goto out;

1) If xas_error() returns an error, we should continue instead, since
xas_nomem() will handle the -ENOMEM case for us.

2) If xas_load() returns a non-NULL pointer, meaning a concurrent thread has
already assigned an mlru to this slot, we should also continue, since we might
not have assigned an mlru for the leaf memory cgroup yet. Suppose the
following hierarchy:

      A
     / \
    B   C

If two threads allocate an mlru for A at the same time (one on the B path and
one on the C path), the loser sees a non-NULL entry for A and bails out via
"goto out", so either B or C ends up without an mlru.

Here is a code snippet FYI. Thanks.

```c
struct list_lru_memcg *mlru = NULL;

do {
        /* ... */
        if (!mlru)
                mlru = memcg_init_list_lru_one(gfp);

        xas_set(&xas, pos->kmemcg_id);
        do {
                xas_lock_irqsave(&xas, flags);
                if (css_is_dying(&pos->css) || xas_load(&xas))
                        goto unlock;
                xas_store(&xas, mlru);
                if (!xas_error(&xas))
                        mlru = NULL;
unlock:
                xas_unlock_irqrestore(&xas, flags);
        } while (xas_nomem(&xas, gfp));
} while (pos != memcg);

if (mlru)
        kfree(mlru);

/* ... */
```

>                         }
> -                }
> -        }
> -        /* xas_nomem() is used to free memory instead of memory allocation. */
> -        if (xas.xa_alloc)
> -                xas_nomem(&xas, gfp);
> -        xas_unlock_irqrestore(&xas, flags);
> -        kfree(table);
> -
> +                        xas_unlock_irqrestore(&xas, flags);
> +                } while (xas_nomem(&xas, gfp));
> +        } while (pos != memcg);
> +out:
>         return xas_error(&xas);
> }
> #else
> diff --git a/mm/zswap.c b/mm/zswap.c
> index a50e2986cd2f..c6e2256347ff 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -718,12 +718,11 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> 
>         /*
>          * Note that it is safe to use rcu_read_lock() here, even in the face of
> -         * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
> -         * used in list_lru lookup, only two scenarios are possible:
> +         * concurrent memcg offlining:
>          *
> -         * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
> +         * 1. list_lru_add() is called before list_lru_memcg is erased. The
>          *    new entry will be reparented to memcg's parent's list_lru.
> -         * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
> +         * 2. list_lru_add() is called after list_lru_memcg is erased. The
>          *    new entry will be added directly to memcg's parent's list_lru.
>          *
>          * Similar reasoning holds for list_lru_del().
> -- 
> 2.45.2
> 