On Wed, May 18, 2022 at 4:49 PM Yu Zhao <yuzhao@xxxxxxxxxx> wrote:
>
> Searching the rmap for PTEs mapping each page on an LRU list (to test
> and clear the accessed bit) can be expensive because pages from
> different VMAs (PA space) are not cache friendly to the rmap (VA
> space). For workloads mostly using mapped pages, the rmap has a high
> CPU cost in the reclaim path.
>
> This patch exploits spatial locality to reduce the trips into the
> rmap. When shrink_page_list() walks the rmap and finds a young PTE, a
> new function lru_gen_look_around() scans at most BITS_PER_LONG-1
> adjacent PTEs. On finding another young PTE, it clears the accessed
> bit and updates the gen counter of the page mapped by this PTE to
> (max_seq%MAX_NR_GENS)+1.
>
> Server benchmark results:
>   Single workload:
>     fio (buffered I/O): no change
>
>   Single workload:
>     memcached (anon): +[5.5, 7.5]%
>                 Ops/sec      KB/sec
>       patch1-6: 1120643.70   43588.06
>       patch1-7: 1193918.93   46438.15
>
>   Configurations:
>     no change
>
> Client benchmark results:
>   kswapd profiles:
>     patch1-6
>       35.99%  lzo1x_1_do_compress (real work)
>       19.40%  page_vma_mapped_walk
>        6.31%  _raw_spin_unlock_irq
>        3.95%  do_raw_spin_lock
>        2.39%  anon_vma_interval_tree_iter_first
>        2.25%  ptep_clear_flush
>        1.92%  __anon_vma_interval_tree_subtree_search
>        1.70%  folio_referenced_one
>        1.68%  __zram_bvec_write
>        1.43%  anon_vma_interval_tree_iter_next
>
>     patch1-7
>       45.90%  lzo1x_1_do_compress (real work)
>        9.14%  page_vma_mapped_walk
>        6.81%  _raw_spin_unlock_irq
>        2.80%  ptep_clear_flush
>        2.34%  __zram_bvec_write
>        2.29%  do_raw_spin_lock
>        1.84%  lru_gen_look_around
>        1.78%  memmove
>        1.74%  obj_malloc
>        1.50%  free_unref_page_list
>
>   Configurations:
>     no change
>
> Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx>
> Acked-by: Brian Geffon <bgeffon@xxxxxxxxxx>
> Acked-by: Jan Alexander Steffens (heftig) <heftig@xxxxxxxxxxxxx>
> Acked-by: Oleksandr Natalenko <oleksandr@xxxxxxxxxxxxxx>
> Acked-by: Steven Barrett <steven@xxxxxxxxxxxx>
> Acked-by: Suleiman Souhlal <suleiman@xxxxxxxxxx>
> Tested-by: Daniel Byrne <djbyrne@xxxxxxx>
> Tested-by: Donald Carr <d@xxxxxxxxxxxxxxx>
> Tested-by: Holger Hoffstätte <holger@xxxxxxxxxxxxxxxxxxxxxx>
> Tested-by: Konstantin Kharlamov <Hi-Angel@xxxxxxxxx>
> Tested-by: Shuang Zhai <szhai2@xxxxxxxxxxxxxxxx>
> Tested-by: Sofia Trinh <sofia.trinh@edi.works>
> Tested-by: Vaibhav Jain <vaibhav@xxxxxxxxxxxxx>
> ---
>  include/linux/memcontrol.h |  31 ++++++++
>  include/linux/mm.h         |   5 ++
>  include/linux/mmzone.h     |   6 ++
>  mm/internal.h              |   1 +
>  mm/memcontrol.c            |   1 +
>  mm/rmap.c                  |   7 ++
>  mm/swap.c                  |   4 +-
>  mm/vmscan.c                | 157 +++++++++++++++++++++++++++++++++++++
>  8 files changed, 210 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 89b14729d59f..2bfdcc77648a 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -438,6 +438,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
>   * - LRU isolation
>   * - lock_page_memcg()
>   * - exclusive reference
> + * - mem_cgroup_trylock_pages()
>   *
>   * For a kmem folio a caller should hold an rcu read lock to protect memcg
>   * associated with a kmem folio from being released.
> @@ -499,6 +500,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
>   * - LRU isolation
>   * - lock_page_memcg()
>   * - exclusive reference
> + * - mem_cgroup_trylock_pages()
>   *
>   * For a kmem page a caller should hold an rcu read lock to protect memcg
>   * associated with a kmem page from being released.
> @@ -948,6 +950,23 @@ void unlock_page_memcg(struct page *page);
>
>  void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
>
> +/* try to stablize folio_memcg() for all the pages in a memcg */
> +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
> +{
> +        rcu_read_lock();
> +
> +        if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
> +                return true;
> +
> +        rcu_read_unlock();
> +        return false;
> +}
> +
> +static inline void mem_cgroup_unlock_pages(void)
> +{
> +        rcu_read_unlock();
> +}
> +
>  /* idx can be of type enum memcg_stat_item or node_stat_item */
>  static inline void mod_memcg_state(struct mem_cgroup *memcg,
>                                     int idx, int val)
> @@ -1386,6 +1405,18 @@ static inline void folio_memcg_unlock(struct folio *folio)
>  {
>  }
>
> +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
> +{
> +        /* to match folio_memcg_rcu() */
> +        rcu_read_lock();
> +        return true;
> +}
> +
> +static inline void mem_cgroup_unlock_pages(void)
> +{
> +        rcu_read_unlock();
> +}
> +
>  static inline void mem_cgroup_handle_over_high(void)
>  {
>  }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 894c289c2c06..4e8ab4ad4473 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1523,6 +1523,11 @@ static inline unsigned long folio_pfn(struct folio *folio)
>          return page_to_pfn(&folio->page);
>  }
>
> +static inline struct folio *pfn_folio(unsigned long pfn)
> +{
> +        return page_folio(pfn_to_page(pfn));
> +}
> +
>  static inline atomic_t *folio_pincount_ptr(struct folio *folio)
>  {
>          return &folio_page(folio, 1)->compound_pincount;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 2d023d243e73..f0b980362186 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -374,6 +374,7 @@ enum lruvec_flags {
>  #ifndef __GENERATING_BOUNDS_H
>
>  struct lruvec;
> +struct page_vma_mapped_walk;
>
>  #define LRU_GEN_MASK    ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
>  #define LRU_REFS_MASK   ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
> @@ -429,6 +430,7 @@ struct lru_gen_struct {
>  };
>
>  void lru_gen_init_lruvec(struct lruvec *lruvec);
> +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
>
>  #ifdef CONFIG_MEMCG
>  void lru_gen_init_memcg(struct mem_cgroup *memcg);
> @@ -441,6 +443,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
>  {
>  }
>
> +static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
> +{
> +}
> +
>  #ifdef CONFIG_MEMCG
>  static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
>  {
> diff --git a/mm/internal.h b/mm/internal.h
> index cf16280ce132..59d2422b647d 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -68,6 +68,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
>  void folio_rotate_reclaimable(struct folio *folio);
>  bool __folio_end_writeback(struct folio *folio);
>  void deactivate_file_folio(struct folio *folio);
> +void folio_activate(struct folio *folio);
>
>  void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
>                     unsigned long floor, unsigned long ceiling);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 2ee074f80e72..98aa720ac639 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2769,6 +2769,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
>           * - LRU isolation
>           * - lock_page_memcg()
>           * - exclusive reference
> +         * - mem_cgroup_trylock_pages()
>           */
>          folio->memcg_data = (unsigned long)memcg;
>  }
> diff --git a/mm/rmap.c b/mm/rmap.c
> index fedb82371efe..7cb7ef29088a 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -73,6 +73,7 @@
>  #include <linux/page_idle.h>
>  #include <linux/memremap.h>
>  #include <linux/userfaultfd_k.h>
> +#include <linux/mm_inline.h>
>
>  #include <asm/tlbflush.h>
>
> @@ -821,6 +822,12 @@ static bool folio_referenced_one(struct folio *folio,
>          }
>
>          if (pvmw.pte) {
> +                if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
> +                    !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
> +                        lru_gen_look_around(&pvmw);
> +                        referenced++;
> +                }
> +
>                  if (ptep_clear_flush_young_notify(vma, address,

Hello, Yu.

lru_gen_look_around() calls ptep_test_and_clear_young(pvmw->vma, addr, pte + i)
only, without the flush and the notify. For the flush, arm64 has a TLB
operation:

static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
                                         unsigned long address, pte_t *ptep)
{
        int young = ptep_test_and_clear_young(vma, address, ptep);

        if (young) {
                /*
                 * We can elide the trailing DSB here since the worst that can
                 * happen is that a CPU continues to use the young entry in its
                 * TLB and we mistakenly reclaim the associated page. The
                 * window for such an event is bounded by the next
                 * context-switch, which provides a DSB to complete the TLB
                 * invalidation.
                 */
                flush_tlb_page_nosync(vma, address);
        }

        return young;
}

Does this mean the current kernel is overly cautious? Is it safe to call
ptep_test_and_clear_young() only?

BTW, lru_gen_look_around() already covers 'address', so are we doing the
PTE check for 'address' twice here?

>                                            pvmw.pte)) {
>                  /*
> diff --git a/mm/swap.c b/mm/swap.c
> index a99d22308f28..0aa1d0b33d42 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -342,7 +342,7 @@ static bool need_activate_page_drain(int cpu)
>          return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
>  }
>
> -static void folio_activate(struct folio *folio)
> +void folio_activate(struct folio *folio)
>  {
>          if (folio_test_lru(folio) && !folio_test_active(folio) &&
>              !folio_test_unevictable(folio)) {
> @@ -362,7 +362,7 @@ static inline void activate_page_drain(int cpu)
>  {
>  }
>
> -static void folio_activate(struct folio *folio)
> +void folio_activate(struct folio *folio)
>  {
>          struct lruvec *lruvec;
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 891f0ab69b3a..cf89a28c3b0e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1554,6 +1554,11 @@ static unsigned int shrink_page_list(struct list_head *page_list,
>                  if (!sc->may_unmap && page_mapped(page))
>                          goto keep_locked;
>
> +                /* folio_update_gen() tried to promote this page? */
> +                if (lru_gen_enabled() && !ignore_references &&
> +                    page_mapped(page) && PageReferenced(page))
> +                        goto keep_locked;
> +
>                  may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
>                          (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
>
> @@ -3137,6 +3142,28 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
>   *                          the aging
>   ******************************************************************************/
>
> +static int folio_update_gen(struct folio *folio, int gen)
> +{
> +        unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
> +
> +        VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
> +        VM_WARN_ON_ONCE(!rcu_read_lock_held());
> +
> +        do {
> +                /* lru_gen_del_folio() has isolated this page? */
> +                if (!(old_flags & LRU_GEN_MASK)) {
> +                        /* for shrink_page_list() */
> +                        new_flags = old_flags | BIT(PG_referenced);
> +                        continue;
> +                }
> +
> +                new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
> +                new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
> +        } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
> +
> +        return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
> +}
> +
>  static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
>  {
>          int type = folio_is_file_lru(folio);
> @@ -3147,6 +3174,11 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
>          VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
>
>          do {
> +                new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
> +                /* folio_update_gen() has promoted this page? */
> +                if (new_gen >= 0 && new_gen != old_gen)
> +                        return new_gen;
> +
>                  new_gen = (old_gen + 1) % MAX_NR_GENS;
>
>                  new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
> @@ -3365,6 +3397,125 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
>          } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
>  }
>
> +/*
> + * This function exploits spatial locality when shrink_page_list() walks the
> + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
> + */
> +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
> +{
> +        int i;
> +        pte_t *pte;
> +        unsigned long start;
> +        unsigned long end;
> +        unsigned long addr;
> +        unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
> +        struct folio *folio = pfn_folio(pvmw->pfn);
> +        struct mem_cgroup *memcg = folio_memcg(folio);
> +        struct pglist_data *pgdat = folio_pgdat(folio);
> +        struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
> +        DEFINE_MAX_SEQ(lruvec);
> +        int old_gen, new_gen = lru_gen_from_seq(max_seq);
> +
> +        lockdep_assert_held(pvmw->ptl);
> +        VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
> +
> +        if (spin_is_contended(pvmw->ptl))
> +                return;
> +
> +        start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
> +        end = pmd_addr_end(pvmw->address, pvmw->vma->vm_end);
> +
> +        if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
> +                if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
> +                        end = start + MIN_LRU_BATCH * PAGE_SIZE;
> +                else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
> +                        start = end - MIN_LRU_BATCH * PAGE_SIZE;
> +                else {
> +                        start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
> +                        end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
> +                }
> +        }
> +
> +        pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
> +
> +        rcu_read_lock();
> +        arch_enter_lazy_mmu_mode();
> +
> +        for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
> +                unsigned long pfn = pte_pfn(pte[i]);
> +
> +                VM_WARN_ON_ONCE(addr < pvmw->vma->vm_start || addr >= pvmw->vma->vm_end);
> +
> +                if (!pte_present(pte[i]) || is_zero_pfn(pfn))
> +                        continue;
> +
> +                if (WARN_ON_ONCE(pte_devmap(pte[i]) || pte_special(pte[i])))
> +                        continue;
> +
> +                if (!pte_young(pte[i]))
> +                        continue;
> +
> +                VM_WARN_ON_ONCE(!pfn_valid(pfn));
> +                if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
> +                        continue;
> +
> +                folio = pfn_folio(pfn);
> +                if (folio_nid(folio) != pgdat->node_id)
> +                        continue;
> +
> +                if (folio_memcg_rcu(folio) != memcg)
> +                        continue;
> +
> +                if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
> +                        continue;
> +
> +                if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
> +                    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
> +                      !folio_test_swapcache(folio)))
> +                        folio_mark_dirty(folio);
> +
> +                old_gen = folio_lru_gen(folio);
> +                if (old_gen < 0)
> +                        folio_set_referenced(folio);
> +                else if (old_gen != new_gen)
> +                        __set_bit(i, bitmap);
> +        }
> +
> +        arch_leave_lazy_mmu_mode();
> +        rcu_read_unlock();
> +
> +        if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
> +                for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
> +                        folio = pfn_folio(pte_pfn(pte[i]));
> +                        folio_activate(folio);
> +                }
> +                return;
> +        }
> +
> +        /* folio_update_gen() requires stable folio_memcg() */
> +        if (!mem_cgroup_trylock_pages(memcg))
> +                return;
> +
> +        spin_lock_irq(&lruvec->lru_lock);
> +        new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
> +
> +        for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
> +                folio = pfn_folio(pte_pfn(pte[i]));
> +                if (folio_memcg_rcu(folio) != memcg)
> +                        continue;
> +
> +                old_gen = folio_update_gen(folio, new_gen);
> +                if (old_gen < 0 || old_gen == new_gen)
> +                        continue;
> +
> +                lru_gen_update_size(lruvec, folio, old_gen, new_gen);
> +        }
> +
> +        spin_unlock_irq(&lruvec->lru_lock);
> +
> +        mem_cgroup_unlock_pages();
> +}
> +
>  /******************************************************************************
>   *                          the eviction
>   ******************************************************************************/
> @@ -3401,6 +3552,12 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
>                  return true;
>          }
>
> +        /* promoted */
> +        if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
> +                list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
> +                return true;
> +        }
> +
>          /* protected */
>          if (tier > tier_idx) {
>                  int hist = lru_hist_from_seq(lrugen->min_seq[type]);
> --
> 2.36.0.550.gb090851708-goog
>

Thanks
Barry
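
P.S. To make the second question concrete, this is roughly how I read the
control flow for the PTE at 'address' after this patch. It is only a
simplified sketch of the quoted hunks, not the actual kernel code: the real
folio_referenced_one() takes the caller's arg structure, builds pvmw locally
from (folio, vma, address), and has more checks than shown here.

/*
 * Simplified sketch only; names follow the quoted hunks, the function
 * itself and its signature are hypothetical.
 */
static int folio_referenced_one_sketch(struct folio *folio,
                                       struct vm_area_struct *vma,
                                       unsigned long address,
                                       struct page_vma_mapped_walk *pvmw)
{
        int referenced = 0;

        if (pvmw->pte) {
                if (lru_gen_enabled() && pte_young(*pvmw->pte) &&
                    !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
                        /*
                         * Scans up to MIN_LRU_BATCH PTEs around 'address',
                         * including the PTE at 'address' itself, and clears
                         * their accessed bits with ptep_test_and_clear_young()
                         * only: no TLB flush and no mmu notifier.
                         */
                        lru_gen_look_around(pvmw);
                        referenced++;
                }

                /*
                 * The PTE at 'address' is tested a second time here. Its
                 * accessed bit was just cleared above, so the primary-MMU
                 * test should now come back clear; what this call still adds
                 * is the TLB flush (skipped when the bit is already clear)
                 * and the clear_flush_young mmu notifier for secondary MMUs.
                 */
                if (ptep_clear_flush_young_notify(vma, address, pvmw->pte))
                        referenced++;
        }

        return referenced;
}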