The patch titled Subject: mm/mglru: add CONFIG_LRU_GEN_WALKS_MMU has been added to the -mm mm-unstable branch. Its filename is mm-mglru-add-config_lru_gen_walks_mmu.patch This patch will shortly appear at https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-mglru-add-config_lru_gen_walks_mmu.patch This patch will later appear in the mm-unstable branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next via the mm-everything branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm and is updated there every 2-3 working days ------------------------------------------------------ From: Kinsey Ho <kinseyho@xxxxxxxxxx> Subject: mm/mglru: add CONFIG_LRU_GEN_WALKS_MMU Date: Wed, 27 Dec 2023 14:12:02 +0000 Add CONFIG_LRU_GEN_WALKS_MMU so that, when it is disabled, the code that walks page tables to promote pages into the youngest generation is not built. Also improve code readability by adding two helper functions, get_mm_state() and get_next_mm(). 
Link: https://lkml.kernel.org/r/20231227141205.2200125-3-kinseyho@xxxxxxxxxx Signed-off-by: Kinsey Ho <kinseyho@xxxxxxxxxx> Co-developed-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> Tested-by: Donet Tom <donettom@xxxxxxxxxxxxxxxxxx> Acked-by: Yu Zhao <yuzhao@xxxxxxxxxx> Cc: kernel test robot <lkp@xxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- include/linux/memcontrol.h | 2 include/linux/mm_types.h | 12 +- include/linux/mmzone.h | 2 kernel/fork.c | 2 mm/Kconfig | 4 mm/vmscan.c | 192 ++++++++++++++++++++++------------- 6 files changed, 139 insertions(+), 75 deletions(-) --- a/include/linux/memcontrol.h~mm-mglru-add-config_lru_gen_walks_mmu +++ a/include/linux/memcontrol.h @@ -330,7 +330,7 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif -#ifdef CONFIG_LRU_GEN +#ifdef CONFIG_LRU_GEN_WALKS_MMU /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif --- a/include/linux/mm_types.h~mm-mglru-add-config_lru_gen_walks_mmu +++ a/include/linux/mm_types.h @@ -958,7 +958,7 @@ struct mm_struct { */ unsigned long ksm_zero_pages; #endif /* CONFIG_KSM */ -#ifdef CONFIG_LRU_GEN +#ifdef CONFIG_LRU_GEN_WALKS_MMU struct { /* this mm_struct is on lru_gen_mm_list */ struct list_head list; @@ -973,7 +973,7 @@ struct mm_struct { struct mem_cgroup *memcg; #endif } lru_gen; -#endif /* CONFIG_LRU_GEN */ +#endif /* CONFIG_LRU_GEN_WALKS_MMU */ } __randomize_layout; /* @@ -1011,6 +1011,10 @@ struct lru_gen_mm_list { spinlock_t lock; }; +#endif /* CONFIG_LRU_GEN */ + +#ifdef CONFIG_LRU_GEN_WALKS_MMU + void lru_gen_add_mm(struct mm_struct *mm); void lru_gen_del_mm(struct mm_struct *mm); #ifdef CONFIG_MEMCG @@ -1036,7 +1040,7 @@ static inline void lru_gen_use_mm(struct WRITE_ONCE(mm->lru_gen.bitmap, -1); } -#else /* !CONFIG_LRU_GEN */ +#else /* !CONFIG_LRU_GEN_WALKS_MMU */ static inline void lru_gen_add_mm(struct mm_struct *mm) { @@ -1060,7 +1064,7 @@ static inline void 
lru_gen_use_mm(struct { } -#endif /* CONFIG_LRU_GEN */ +#endif /* CONFIG_LRU_GEN_WALKS_MMU */ struct vma_iterator { struct ma_state mas; --- a/include/linux/mmzone.h~mm-mglru-add-config_lru_gen_walks_mmu +++ a/include/linux/mmzone.h @@ -640,9 +640,11 @@ struct lruvec { #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_folio lrugen; +#ifdef CONFIG_LRU_GEN_WALKS_MMU /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif +#endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif --- a/kernel/fork.c~mm-mglru-add-config_lru_gen_walks_mmu +++ a/kernel/fork.c @@ -2946,7 +2946,7 @@ pid_t kernel_clone(struct kernel_clone_a get_task_struct(p); } - if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { + if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) { /* lock the task to synchronize with memcg migration */ task_lock(p); lru_gen_add_mm(p->mm); --- a/mm/Kconfig~mm-mglru-add-config_lru_gen_walks_mmu +++ a/mm/Kconfig @@ -1274,6 +1274,10 @@ config LRU_GEN_STATS from evicted generations for debugging purpose. This option has a per-memcg and per-node memory overhead. 
+ +config LRU_GEN_WALKS_MMU + def_bool y + depends on LRU_GEN && ARCH_HAS_HW_PTE_YOUNG # } config ARCH_SUPPORTS_PER_VMA_LOCK --- a/mm/vmscan.c~mm-mglru-add-config_lru_gen_walks_mmu +++ a/mm/vmscan.c @@ -2671,13 +2671,14 @@ static void get_item_key(void *item, int key[1] = hash >> BLOOM_FILTER_SHIFT; } -static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +static bool test_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, + void *item) { int key[2]; unsigned long *filter; int gen = filter_gen_from_seq(seq); - filter = READ_ONCE(lruvec->mm_state.filters[gen]); + filter = READ_ONCE(mm_state->filters[gen]); if (!filter) return true; @@ -2686,13 +2687,14 @@ static bool test_bloom_filter(struct lru return test_bit(key[0], filter) && test_bit(key[1], filter); } -static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +static void update_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq, + void *item) { int key[2]; unsigned long *filter; int gen = filter_gen_from_seq(seq); - filter = READ_ONCE(lruvec->mm_state.filters[gen]); + filter = READ_ONCE(mm_state->filters[gen]); if (!filter) return; @@ -2704,12 +2706,12 @@ static void update_bloom_filter(struct l set_bit(key[1], filter); } -static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +static void reset_bloom_filter(struct lru_gen_mm_state *mm_state, unsigned long seq) { unsigned long *filter; int gen = filter_gen_from_seq(seq); - filter = lruvec->mm_state.filters[gen]; + filter = mm_state->filters[gen]; if (filter) { bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); return; @@ -2717,13 +2719,15 @@ static void reset_bloom_filter(struct lr filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); - WRITE_ONCE(lruvec->mm_state.filters[gen], filter); + WRITE_ONCE(mm_state->filters[gen], filter); } /****************************************************************************** * 
mm_struct list ******************************************************************************/ +#ifdef CONFIG_LRU_GEN_WALKS_MMU + static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) { static struct lru_gen_mm_list mm_list = { @@ -2740,6 +2744,29 @@ static struct lru_gen_mm_list *get_mm_li return &mm_list; } +static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) +{ + return &lruvec->mm_state; +} + +static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) +{ + int key; + struct mm_struct *mm; + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); + + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); + + if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) + return NULL; + + clear_bit(key, &mm->lru_gen.bitmap); + + return mmget_not_zero(mm) ? mm : NULL; +} + void lru_gen_add_mm(struct mm_struct *mm) { int nid; @@ -2755,10 +2782,11 @@ void lru_gen_add_mm(struct mm_struct *mm for_each_node_state(nid, N_MEMORY) { struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* the first addition since the last iteration */ - if (lruvec->mm_state.tail == &mm_list->fifo) - lruvec->mm_state.tail = &mm->lru_gen.list; + if (mm_state->tail == &mm_list->fifo) + mm_state->tail = &mm->lru_gen.list; } list_add_tail(&mm->lru_gen.list, &mm_list->fifo); @@ -2784,14 +2812,15 @@ void lru_gen_del_mm(struct mm_struct *mm for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* where the current iteration continues after */ - if (lruvec->mm_state.head == &mm->lru_gen.list) - lruvec->mm_state.head = lruvec->mm_state.head->prev; + if (mm_state->head == &mm->lru_gen.list) + mm_state->head = mm_state->head->prev; /* where the last iteration ended before */ - if (lruvec->mm_state.tail 
== &mm->lru_gen.list) - lruvec->mm_state.tail = lruvec->mm_state.tail->next; + if (mm_state->tail == &mm->lru_gen.list) + mm_state->tail = mm_state->tail->next; } list_del_init(&mm->lru_gen.list); @@ -2834,10 +2863,30 @@ void lru_gen_migrate_mm(struct mm_struct } #endif +#else /* !CONFIG_LRU_GEN_WALKS_MMU */ + +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ + return NULL; +} + +static struct lru_gen_mm_state *get_mm_state(struct lruvec *lruvec) +{ + return NULL; +} + +static struct mm_struct *get_next_mm(struct lru_gen_mm_walk *walk) +{ + return NULL; +} + +#endif + static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) { int i; int hist; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); @@ -2845,42 +2894,18 @@ static void reset_mm_stats(struct lruvec hist = lru_hist_from_seq(walk->max_seq); for (i = 0; i < NR_MM_STATS; i++) { - WRITE_ONCE(lruvec->mm_state.stats[hist][i], - lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); + WRITE_ONCE(mm_state->stats[hist][i], + mm_state->stats[hist][i] + walk->mm_stats[i]); walk->mm_stats[i] = 0; } } if (NR_HIST_GENS > 1 && last) { - hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); + hist = lru_hist_from_seq(mm_state->seq + 1); for (i = 0; i < NR_MM_STATS; i++) - WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); - } -} - -static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) -{ - int type; - unsigned long size = 0; - struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); - int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); - - if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) - return true; - - clear_bit(key, &mm->lru_gen.bitmap); - - for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { - size += type ? 
get_mm_counter(mm, MM_FILEPAGES) : - get_mm_counter(mm, MM_ANONPAGES) + - get_mm_counter(mm, MM_SHMEMPAGES); + WRITE_ONCE(mm_state->stats[hist][i], 0); } - - if (size < MIN_LRU_BATCH) - return true; - - return !mmget_not_zero(mm); } static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, @@ -2891,7 +2916,7 @@ static bool iterate_mm_list(struct lruve struct mm_struct *mm = NULL; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); /* * mm_state->seq is incremented after each iteration of mm_list. There @@ -2929,11 +2954,7 @@ static bool iterate_mm_list(struct lruve mm_state->tail = mm_state->head->next; walk->force_scan = true; } - - mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); - if (should_skip_mm(mm, walk)) - mm = NULL; - } while (!mm); + } while (!(mm = get_next_mm(walk))); done: if (*iter || last) reset_mm_stats(lruvec, walk, last); @@ -2941,7 +2962,7 @@ done: spin_unlock(&mm_list->lock); if (mm && first) - reset_bloom_filter(lruvec, walk->max_seq + 1); + reset_bloom_filter(mm_state, walk->max_seq + 1); if (*iter) mmput_async(*iter); @@ -2956,7 +2977,7 @@ static bool iterate_mm_list_nowalk(struc bool success = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); spin_lock(&mm_list->lock); @@ -3469,6 +3490,7 @@ static void walk_pmd_range(pud_t *pud, u DECLARE_BITMAP(bitmap, MIN_LRU_BATCH); unsigned long first = -1; struct lru_gen_mm_walk *walk = args->private; + struct lru_gen_mm_state *mm_state = get_mm_state(walk->lruvec); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -3520,7 +3542,7 @@ restart: walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first); } - if (!walk->force_scan && 
!test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) + if (!walk->force_scan && !test_bloom_filter(mm_state, walk->max_seq, pmd + i)) continue; walk->mm_stats[MM_NONLEAF_FOUND]++; @@ -3531,7 +3553,7 @@ restart: walk->mm_stats[MM_NONLEAF_ADDED]++; /* carry over to the next generation */ - update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); + update_bloom_filter(mm_state, walk->max_seq + 1, pmd + i); } walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first); @@ -3738,16 +3760,25 @@ next: return success; } -static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) +static bool inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + bool can_swap, bool force_scan) { + bool success; int prev, next; int type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; restart: + if (max_seq < READ_ONCE(lrugen->max_seq)) + return false; + spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + success = max_seq == lrugen->max_seq; + if (!success) + goto unlock; + for (type = ANON_AND_FILE - 1; type >= 0; type--) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) continue; @@ -3791,8 +3822,10 @@ restart: WRITE_ONCE(lrugen->timestamps[next], jiffies); /* make sure preceding modifications appear */ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); - +unlock: spin_unlock_irq(&lruvec->lru_lock); + + return success; } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, @@ -3802,14 +3835,16 @@ static bool try_to_inc_max_seq(struct lr struct lru_gen_mm_walk *walk; struct mm_struct *mm = NULL; struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + if (!mm_state) + return inc_max_seq(lruvec, max_seq, can_swap, force_scan); + /* see the comment in iterate_mm_list() */ - if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { - success = false; - goto done; - } + if (max_seq <= 
READ_ONCE(mm_state->seq)) + return false; /* * If the hardware doesn't automatically set the accessed bit, fallback @@ -3839,8 +3874,10 @@ static bool try_to_inc_max_seq(struct lr walk_mm(lruvec, mm, walk); } while (mm); done: - if (success) - inc_max_seq(lruvec, can_swap, force_scan); + if (success) { + success = inc_max_seq(lruvec, max_seq, can_swap, force_scan); + WARN_ON_ONCE(!success); + } return success; } @@ -3965,6 +4002,7 @@ void lru_gen_look_around(struct page_vma struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); DEFINE_MAX_SEQ(lruvec); int old_gen, new_gen = lru_gen_from_seq(max_seq); @@ -4047,8 +4085,8 @@ void lru_gen_look_around(struct page_vma mem_cgroup_unlock_pages(); /* feedback from rmap walkers to page table walkers */ - if (suitable_to_scan(i, young)) - update_bloom_filter(lruvec, max_seq, pvmw->pmd); + if (mm_state && suitable_to_scan(i, young)) + update_bloom_filter(mm_state, max_seq, pvmw->pmd); } /****************************************************************************** @@ -5224,6 +5262,7 @@ static void lru_gen_seq_show_full(struct int type, tier; int hist = lru_hist_from_seq(seq); struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); @@ -5249,6 +5288,9 @@ static void lru_gen_seq_show_full(struct seq_putc(m, '\n'); } + if (!mm_state) + return; + seq_puts(m, " "); for (i = 0; i < NR_MM_STATS; i++) { const char *s = " "; @@ -5256,10 +5298,10 @@ static void lru_gen_seq_show_full(struct if (seq == max_seq && NR_HIST_GENS == 1) { s = "LOYNFA"; - n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + n = READ_ONCE(mm_state->stats[hist][i]); } else if (seq != max_seq && NR_HIST_GENS > 1) { s = "loynfa"; - n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + n = 
READ_ONCE(mm_state->stats[hist][i]); } seq_printf(m, " %10lu%c", n, s[i]); @@ -5528,6 +5570,7 @@ void lru_gen_init_lruvec(struct lruvec * int i; int gen, type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); @@ -5538,7 +5581,8 @@ void lru_gen_init_lruvec(struct lruvec * for_each_gen_type_zone(gen, type, zone) INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); - lruvec->mm_state.seq = MIN_NR_GENS; + if (mm_state) + mm_state->seq = MIN_NR_GENS; } #ifdef CONFIG_MEMCG @@ -5557,28 +5601,38 @@ void lru_gen_init_pgdat(struct pglist_da void lru_gen_init_memcg(struct mem_cgroup *memcg) { - INIT_LIST_HEAD(&memcg->mm_list.fifo); - spin_lock_init(&memcg->mm_list.lock); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + if (!mm_list) + return; + + INIT_LIST_HEAD(&mm_list->fifo); + spin_lock_init(&mm_list->lock); } void lru_gen_exit_memcg(struct mem_cgroup *memcg) { int i; int nid; + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); - VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo)); + VM_WARN_ON_ONCE(mm_list && !list_empty(&mm_list->fifo)); for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); + struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, sizeof(lruvec->lrugen.nr_pages))); lruvec->lrugen.list.next = LIST_POISON1; + if (!mm_state) + continue; + for (i = 0; i < NR_BLOOM_FILTERS; i++) { - bitmap_free(lruvec->mm_state.filters[i]); - lruvec->mm_state.filters[i] = NULL; + bitmap_free(mm_state->filters[i]); + mm_state->filters[i] = NULL; } } } _ Patches currently in -mm which might be from kinseyho@xxxxxxxxxx are mm-mglru-add-config_arch_has_hw_pte_young.patch mm-mglru-add-config_lru_gen_walks_mmu.patch mm-mglru-remove-config_memcg.patch mm-mglru-add-dummy-pmd_dirty.patch mm-mglru-remove-config_transparent_hugepage.patch