On Wed 02-10-19 06:16:43, Yang Shi wrote: > The commit 87eaceb3faa59b9b4d940ec9554ce251325d83fe ("mm: thp: make > deferred split shrinker memcg aware") makes deferred split queue per > memcg to resolve memcg pre-mature OOM problem. But, all nodes end up > sharing the same queue instead of one queue per-node before the commit. > It is not a big deal for memcg limit reclaim, but it may cause global > kswapd shrink THPs from a different node. > > And, 0-day testing reported -19.6% regression of stress-ng's madvise > test [1]. I didn't see that much regression on my test box (24 threads, > 48GB memory, 2 nodes), with the same test (stress-ng --timeout 1 > --metrics-brief --sequential 72 --class vm --exclude spawn,exec), I saw > average -3% (run the same test 10 times then calculate the average since > the test itself may have most 15% variation according to my test) > regression sometimes (not every time, sometimes I didn't see regression > at all). > > This might be caused by deferred split queue lock contention. With some > configuration (i.e. just one root memcg) the lock contention my be worse > than before (given 2 nodes, two locks are reduced to one lock). > > So, moving deferred split queue to memcg's nodeinfo to make it NUMA > aware again. > > With this change stress-ng's madvise test shows average 4% improvement > sometimes and I didn't see degradation anymore. My concern about this getting more and more complex (http://lkml.kernel.org/r/20191002084014.GH15624@xxxxxxxxxxxxxx) holds here even more. Can we step back and reconsider the whole thing please? > [1]: https://lore.kernel.org/lkml/20190930084604.GC17687@shao2-debian/ > > Cc: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> > Cc: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx> > Cc: Johannes Weiner <hannes@xxxxxxxxxxx> > Cc: Michal Hocko <mhocko@xxxxxxxx> > Cc: Hugh Dickins <hughd@xxxxxxxxxx> > Cc: Shakeel Butt <shakeelb@xxxxxxxxxx> > Cc: David Rientjes <rientjes@xxxxxxxxxx> > Signed-off-by: Yang Shi <yang.shi@xxxxxxxxxxxxxxxxx> > --- > include/linux/memcontrol.h | 8 ++++---- > mm/huge_memory.c | 15 +++++++++------ > mm/memcontrol.c | 29 +++++++++++++++++------------ > 3 files changed, 30 insertions(+), 22 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 9b60863..4b5c791 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -137,6 +137,10 @@ struct mem_cgroup_per_node { > bool congested; /* memcg has many dirty pages */ > /* backed by a congested BDI */ > > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + struct deferred_split deferred_split_queue; > +#endif > + > struct mem_cgroup *memcg; /* Back pointer, we cannot */ > /* use container_of */ > }; > @@ -330,10 +334,6 @@ struct mem_cgroup { > struct list_head event_list; > spinlock_t event_list_lock; > > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE > - struct deferred_split deferred_split_queue; > -#endif > - > struct mem_cgroup_per_node *nodeinfo[0]; > /* WARNING: nodeinfo must be the last member here */ > }; > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index c5cb6dc..3b78910 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -500,10 +500,11 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) > static inline struct deferred_split *get_deferred_split_queue(struct page *page) > { > struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; > - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); > + int nid = page_to_nid(page); > + struct pglist_data *pgdat = NODE_DATA(nid); > > if (memcg) > - return &memcg->deferred_split_queue; > + return &memcg->nodeinfo[nid]->deferred_split_queue; > else > return &pgdat->deferred_split_queue; > } > @@ -2882,12 +2883,13 @@ void deferred_split_huge_page(struct page *page) > static unsigned long deferred_split_count(struct shrinker *shrink, > struct shrink_control *sc) > { > - struct pglist_data *pgdata = NODE_DATA(sc->nid); > + int nid = sc->nid; > + struct pglist_data *pgdata = NODE_DATA(nid); > struct deferred_split *ds_queue = &pgdata->deferred_split_queue; > > #ifdef CONFIG_MEMCG > if (sc->memcg) > - ds_queue = &sc->memcg->deferred_split_queue; > + ds_queue = &sc->memcg->nodeinfo[nid]->deferred_split_queue; > #endif > return READ_ONCE(ds_queue->split_queue_len); > } > @@ -2895,7 +2897,8 @@ static unsigned long deferred_split_count(struct shrinker *shrink, > static unsigned long deferred_split_scan(struct shrinker *shrink, > struct shrink_control *sc) > { > - struct pglist_data *pgdata = NODE_DATA(sc->nid); > + int nid = sc->nid; > + struct pglist_data *pgdata = NODE_DATA(nid); > struct deferred_split *ds_queue = &pgdata->deferred_split_queue; > unsigned long flags; > LIST_HEAD(list), *pos, *next; > @@ -2904,7 +2907,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, > > #ifdef CONFIG_MEMCG > if (sc->memcg) > - ds_queue = &sc->memcg->deferred_split_queue; > + ds_queue = &sc->memcg->nodeinfo[nid]->deferred_split_queue; > #endif > > spin_lock_irqsave(&ds_queue->split_queue_lock, flags); > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index c313c49..19d4295 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -4989,6 +4989,12 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) > pn->on_tree = false; > pn->memcg = memcg; > > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + spin_lock_init(&pn->deferred_split_queue.split_queue_lock); > + INIT_LIST_HEAD(&pn->deferred_split_queue.split_queue); > + pn->deferred_split_queue.split_queue_len = 0; > +#endif > + > memcg->nodeinfo[node] = pn; > return 0; > } > @@ -5081,11 +5087,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) > memcg->cgwb_frn[i].done = > __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); > #endif > -#ifdef CONFIG_TRANSPARENT_HUGEPAGE > - spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); > - INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); > - memcg->deferred_split_queue.split_queue_len = 0; > -#endif > idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); > return memcg; > fail: > @@ -5419,6 +5420,8 @@ static int mem_cgroup_move_account(struct page *page, > unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; > int ret; > bool anon; > + struct deferred_split *ds_queue; > + int nid = page_to_nid(page); > > VM_BUG_ON(from == to); > VM_BUG_ON_PAGE(PageLRU(page), page); > @@ -5466,10 +5469,11 @@ static int mem_cgroup_move_account(struct page *page, > > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > if (compound && !list_empty(page_deferred_list(page))) { > - spin_lock(&from->deferred_split_queue.split_queue_lock); > + ds_queue = &from->nodeinfo[nid]->deferred_split_queue; > + spin_lock(&ds_queue->split_queue_lock); > list_del_init(page_deferred_list(page)); > - from->deferred_split_queue.split_queue_len--; > - spin_unlock(&from->deferred_split_queue.split_queue_lock); > + ds_queue->split_queue_len--; > + spin_unlock(&ds_queue->split_queue_lock); > } > #endif > /* > @@ -5483,11 +5487,12 @@ static int mem_cgroup_move_account(struct page *page, > > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > if (compound && list_empty(page_deferred_list(page))) { > - spin_lock(&to->deferred_split_queue.split_queue_lock); > + ds_queue = &to->nodeinfo[nid]->deferred_split_queue; > + spin_lock(&ds_queue->split_queue_lock); > list_add_tail(page_deferred_list(page), > - &to->deferred_split_queue.split_queue); > - to->deferred_split_queue.split_queue_len++; > - spin_unlock(&to->deferred_split_queue.split_queue_lock); > + &ds_queue->split_queue); > + ds_queue->split_queue_len++; > + spin_unlock(&ds_queue->split_queue_lock); > } > #endif > > -- > 1.8.3.1 -- Michal Hocko SUSE Labs