On 6/16/20 8:16 AM, js1304@xxxxxxxxx wrote: > From: Johannes Weiner <hannes@xxxxxxxxxxx> > > After ("mm: workingset: let cache workingset challenge anon fix"), we This could now be updated to: After commit 34e58cac6d8f ("mm: workingset: let cache workingset challenge anon"), we ... > compare refault distances to active_file + anon. But age of the > non-resident information is only driven by the file LRU. As a result, > we may overestimate the recency of any incoming refaults and activate > them too eagerly, causing unnecessary LRU churn in certain situations. > > Make anon aging drive nonresident age as well to address that. Fixes: 34e58cac6d8f ("mm: workingset: let cache workingset challenge anon") > Reported-by: Joonsoo Kim <js1304@xxxxxxxxx> > Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx> > Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@xxxxxxx> Acked-by: Vlastimil Babka <vbabka@xxxxxxx> > --- > include/linux/mmzone.h | 4 ++-- > include/linux/swap.h | 1 + > mm/vmscan.c | 3 +++ > mm/workingset.c | 46 +++++++++++++++++++++++++++------------------- > 4 files changed, 33 insertions(+), 21 deletions(-) > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index c4c37fd..f6f8849 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -257,8 +257,8 @@ struct lruvec { > */ > unsigned long anon_cost; > unsigned long file_cost; > - /* Evictions & activations on the inactive file list */ > - atomic_long_t inactive_age; > + /* Non-resident age, driven by LRU movement */ > + atomic_long_t nonresident_age; > /* Refaults at the time of last reclaim cycle */ > unsigned long refaults; > /* Various lruvec state flags (enum lruvec_flags) */ > diff --git a/include/linux/swap.h b/include/linux/swap.h > index 4c5974b..5b3216b 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -313,6 +313,7 @@ struct vma_swap_readahead { > }; > > /* linux/mm/workingset.c */ > +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); > void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); > void workingset_refault(struct page *page, void *shadow); > void workingset_activation(struct page *page); > diff --git a/mm/vmscan.c b/mm/vmscan.c > index b6d8432..749d239 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -904,6 +904,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, > __delete_from_swap_cache(page, swap); > xa_unlock_irqrestore(&mapping->i_pages, flags); > put_swap_page(page, swap); > + workingset_eviction(page, target_memcg); > } else { > void (*freepage)(struct page *); > void *shadow = NULL; > @@ -1884,6 +1885,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, > list_add(&page->lru, &pages_to_free); > } else { > nr_moved += nr_pages; > + if (PageActive(page)) > + workingset_age_nonresident(lruvec, nr_pages); > } > } > > diff --git a/mm/workingset.c b/mm/workingset.c > index d481ea4..50b7937 100644 > --- a/mm/workingset.c > +++ b/mm/workingset.c > @@ -156,8 +156,8 @@ > * > * Implementation > * > - * For each node's file LRU lists, a counter for inactive evictions > - * and activations is maintained (node->inactive_age). > + * For each node's LRU lists, a counter for inactive evictions and > + * activations is maintained (node->nonresident_age). > * > * On eviction, a snapshot of this counter (along with some bits to > * identify the node) is stored in the now empty page cache > @@ -213,7 +213,17 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, > *workingsetp = workingset; > } > > -static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) > +/** > + * workingset_age_nonresident - age non-resident entries as LRU ages > + * @memcg: the lruvec that was aged > + * @nr_pages: the number of pages to count > + * > + * As in-memory pages are aged, non-resident pages need to be aged as > + * well, in order for the refault distances later on to be comparable > + * to the in-memory dimensions. This function allows reclaim and LRU > + * operations to drive the non-resident aging along in parallel. > + */ > +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) > { > /* > * Reclaiming a cgroup means reclaiming all its children in a > @@ -227,11 +237,8 @@ static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) > * the root cgroup's, age as well. > */ > do { > - struct lruvec *lruvec; > - > - lruvec = mem_cgroup_lruvec(memcg, pgdat); > - atomic_long_inc(&lruvec->inactive_age); > - } while (memcg && (memcg = parent_mem_cgroup(memcg))); > + atomic_long_add(nr_pages, &lruvec->nonresident_age); > + } while ((lruvec = parent_lruvec(lruvec))); > } > > /** > @@ -254,12 +261,11 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) > VM_BUG_ON_PAGE(page_count(page), page); > VM_BUG_ON_PAGE(!PageLocked(page), page); > > - advance_inactive_age(page_memcg(page), pgdat); > - > lruvec = mem_cgroup_lruvec(target_memcg, pgdat); > + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); > /* XXX: target_memcg can be NULL, go through lruvec */ > memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); > - eviction = atomic_long_read(&lruvec->inactive_age); > + eviction = atomic_long_read(&lruvec->nonresident_age); > return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); > } > > @@ -309,20 +315,20 @@ void workingset_refault(struct page *page, void *shadow) > if (!mem_cgroup_disabled() && !eviction_memcg) > goto out; > eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); > - refault = atomic_long_read(&eviction_lruvec->inactive_age); > + refault = atomic_long_read(&eviction_lruvec->nonresident_age); > > /* > * Calculate the refault distance > * > * The unsigned subtraction here gives an accurate distance > - * across inactive_age overflows in most cases. There is a > + * across nonresident_age overflows in most cases. There is a > * special case: usually, shadow entries have a short lifetime > * and are either refaulted or reclaimed along with the inode > * before they get too old. But it is not impossible for the > - * inactive_age to lap a shadow entry in the field, which can > - * then result in a false small refault distance, leading to a > - * false activation should this old entry actually refault > - * again. However, earlier kernels used to deactivate > + * nonresident_age to lap a shadow entry in the field, which > + * can then result in a false small refault distance, leading > + * to a false activation should this old entry actually > + * refault again. However, earlier kernels used to deactivate > * unconditionally with *every* reclaim invocation for the > * longest time, so the occasional inappropriate activation > * leading to pressure on the active list is not a problem. > @@ -359,7 +365,7 @@ void workingset_refault(struct page *page, void *shadow) > goto out; > > SetPageActive(page); > - advance_inactive_age(memcg, pgdat); > + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); > inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); > > /* Page was active prior to eviction */ > @@ -382,6 +388,7 @@ void workingset_refault(struct page *page, void *shadow) > void workingset_activation(struct page *page) > { > struct mem_cgroup *memcg; > + struct lruvec *lruvec; > > rcu_read_lock(); > /* > @@ -394,7 +401,8 @@ void workingset_activation(struct page *page) > memcg = page_memcg_rcu(page); > if (!mem_cgroup_disabled() && !memcg) > goto out; > - advance_inactive_age(memcg, page_pgdat(page)); > + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); > + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); > out: > rcu_read_unlock(); > } >