Not all architectures support hardware atomic updates of access bits. On
such architectures we don't use the page table walk to classify pages
into generations. Add a kernel config option and avoid building the page
table walk code on such architectures.

No performance change was observed with the MongoDB YCSB test:

Patch details		Throughput(Ops/sec)
without patch		93278
with patch		93400

Without patch:
$ size mm/vmscan.o
   text	   data	    bss	    dec	    hex	filename
 112102	  42721	     40	 154863	  25cef	mm/vmscan.o

With patch:
$ size mm/vmscan.o
   text	   data	    bss	    dec	    hex	filename
 105430	  41333	     24	 146787	  23d63	mm/vmscan.o

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx>
---
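[ Note: with CONFIG_LRU_TASK_PAGE_AGING disabled, aging is serialized by
the new lruvec->seq_update_progress flag instead of the mm_state walker
machinery, because inc_max_seq() can drop lru_lock while it runs. The
snippet below is a minimal userspace analogue of that check-and-set
pattern; a pthread mutex stands in for lru_lock, and every name in it is
illustrative rather than kernel API. ]

/*
 * Minimal userspace sketch of the seq_update_progress scheme used for
 * !CONFIG_LRU_TASK_PAGE_AGING. A pthread mutex stands in for lru_lock;
 * all names are illustrative, not kernel API.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long max_seq = 4;
static bool seq_update_progress;

static void inc_max_seq(void)
{
	pthread_mutex_lock(&lru_lock);
	/* the real inc_max_seq() can drop and retake the lock here */
	max_seq++;
	/* clearing the flag lets the next caller start an aging pass */
	seq_update_progress = false;
	pthread_mutex_unlock(&lru_lock);
}

static bool try_to_inc_max_seq(unsigned long seq)
{
	bool success = false;

	/* unlocked fast path: someone else is already aging */
	if (seq_update_progress)
		return false;

	pthread_mutex_lock(&lru_lock);
	/* recheck under the lock: both seq and the flag may have changed */
	if (seq == max_seq && !seq_update_progress) {
		seq_update_progress = true;
		success = true;
	}
	pthread_mutex_unlock(&lru_lock);

	if (success)
		inc_max_seq();

	return success;
}

int main(void)
{
	bool first = try_to_inc_max_seq(4);	/* advances max_seq to 5 */
	bool stale = try_to_inc_max_seq(4);	/* fails: seq 4 is stale now */

	printf("first: %d, stale retry: %d\n", first, stale);
	return 0;
}

[ The unlocked read of the flag is only a fast path that skips taking the
lock when another thread is already aging; the authoritative checks happen
under the lock, mirroring __try_to_inc_max_seq() in the patch below. ]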
 arch/Kconfig               |   3 +
 arch/arm64/Kconfig         |   1 +
 arch/x86/Kconfig           |   1 +
 include/linux/memcontrol.h |   2 +-
 include/linux/mm_types.h   |   8 +--
 include/linux/mmzone.h     |   8 +++
 include/linux/swap.h       |   2 +-
 kernel/fork.c              |   2 +-
 mm/memcontrol.c            |   2 +-
 mm/vmscan.c                | 128 +++++++++++++++++++++++++++++++++----
 10 files changed, 137 insertions(+), 20 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 205fd23e0cad..5cdd98731298 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1458,6 +1458,9 @@ config DYNAMIC_SIGFRAME
 config HAVE_ARCH_NODE_DEV_GROUP
 	bool
 
+config LRU_TASK_PAGE_AGING
+	bool
+
 config ARCH_HAS_NONLEAF_PMD_YOUNG
 	bool
 	help
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b1201d25a8a4..e0994fb3504b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -225,6 +225,7 @@ config ARM64
 	select IRQ_DOMAIN
 	select IRQ_FORCED_THREADING
 	select KASAN_VMALLOC if KASAN
+	select LRU_TASK_PAGE_AGING if LRU_GEN
 	select MODULES_USE_ELF_RELA
 	select NEED_DMA_MAP_STATE
 	select NEED_SG_DMA_LENGTH
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 53bab123a8ee..bde9e6f33b22 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -276,6 +276,7 @@ config X86
 	select HAVE_GENERIC_VDSO
 	select HOTPLUG_SMT if SMP
 	select IRQ_FORCED_THREADING
+	select LRU_TASK_PAGE_AGING if LRU_GEN
 	select NEED_PER_CPU_EMBED_FIRST_CHUNK
 	select NEED_PER_CPU_PAGE_FIRST_CHUNK
 	select NEED_SG_DMA_LENGTH
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 43d4ec8445d4..ea5d1d7bfb8b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -321,7 +321,7 @@ struct mem_cgroup {
 	struct deferred_split deferred_split_queue;
 #endif
 
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 	/* per-memcg mm_struct list */
 	struct lru_gen_mm_list mm_list;
 #endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 306a3d1a0fa6..f90a4860a792 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -786,7 +786,7 @@ struct mm_struct {
 		 */
 		unsigned long ksm_rmap_items;
 #endif
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 		struct {
 			/* this mm_struct is on lru_gen_mm_list */
 			struct list_head list;
@@ -801,7 +801,7 @@ struct mm_struct {
 			struct mem_cgroup *memcg;
 #endif
 		} lru_gen;
-#endif /* CONFIG_LRU_GEN */
+#endif /* CONFIG_LRU_TASK_PAGE_AGING */
 	} __randomize_layout;
 
 	/*
@@ -830,7 +830,7 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 	return (struct cpumask *)&mm->cpu_bitmap;
 }
 
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 
 struct lru_gen_mm_list {
 	/* mm_struct list for page table walkers */
@@ -864,7 +864,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm)
 	WRITE_ONCE(mm->lru_gen.bitmap, -1);
 }
 
-#else /* !CONFIG_LRU_GEN */
+#else /* !CONFIG_LRU_TASK_PAGE_AGING */
 
 static inline void lru_gen_add_mm(struct mm_struct *mm)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a4889c9d4055..b35698148d3c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -427,6 +427,7 @@ struct lru_gen_folio {
 #endif
 };
 
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 enum {
 	MM_LEAF_TOTAL,		/* total leaf entries */
 	MM_LEAF_OLD,		/* old leaf entries */
@@ -469,6 +470,7 @@ struct lru_gen_mm_walk {
 	bool can_swap;
 	bool force_scan;
 };
+#endif
 
 void lru_gen_init_lruvec(struct lruvec *lruvec);
 void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
@@ -613,8 +615,12 @@ struct lruvec {
 #ifdef CONFIG_LRU_GEN
 	/* evictable pages divided into generations */
 	struct lru_gen_folio lrugen;
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 	/* to concurrently iterate lru_gen_mm_list */
 	struct lru_gen_mm_state mm_state;
+#else
+	bool seq_update_progress;
+#endif
 #endif
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
@@ -1354,8 +1360,10 @@ typedef struct pglist_data {
 	unsigned long flags;
 
 #ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 	/* kswap mm walk data */
 	struct lru_gen_mm_walk mm_walk;
+#endif
 	/* lru_gen_folio list */
 	struct lru_gen_memcg memcg_lru;
 #endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3c69cb653cb9..ce09b1e44275 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -155,7 +155,7 @@ union swap_header {
 struct reclaim_state {
 	/* pages reclaimed outside of LRU-based reclaim */
 	unsigned long reclaimed;
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 	/* per-thread mm walk data */
 	struct lru_gen_mm_walk *mm_walk;
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index ed4e01daccaa..2c9e21e39f84 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2939,7 +2939,7 @@ pid_t kernel_clone(struct kernel_clone_args *args)
 		get_task_struct(p);
 	}
 
-	if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+	if (IS_ENABLED(CONFIG_LRU_TASK_PAGE_AGING) && !(clone_flags & CLONE_VM)) {
 		/* lock the task to synchronize with memcg migration */
 		task_lock(p);
 		lru_gen_add_mm(p->mm);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 959d6a27e23d..d8fe30d880c6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6404,7 +6404,7 @@ static void mem_cgroup_move_task(void)
 }
 #endif
 
-#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 static void mem_cgroup_attach(struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f277beba556c..207e62d42888 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3304,6 +3304,7 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
 	       get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
 }
 
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 /******************************************************************************
  *                          Bloom filters
  ******************************************************************************/
@@ -3650,6 +3651,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
 
 	return success;
 }
+#endif
 
 /******************************************************************************
  *                          PID controller
@@ -3819,6 +3821,8 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
 	return folio;
 }
 
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
+
 /* promote pages accessed through page tables */
 static int folio_update_gen(struct folio *folio, int gen)
 {
@@ -3882,6 +3886,16 @@ static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk
 	}
 }
 
+static void reset_current_reclaim_batch_size(struct lruvec *lruvec)
+{
+	struct lru_gen_mm_walk *walk;
+
+	walk = current->reclaim_state->mm_walk;
+	if (walk && walk->batched)
+		return reset_batch_size(lruvec, walk);
+
+}
+
 static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
 {
 	struct address_space *mapping;
@@ -4304,7 +4318,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
 	} while (err == -EAGAIN);
 }
 
-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
+static void *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
 {
 	struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
 
@@ -4335,6 +4349,23 @@ static void clear_mm_walk(void)
 	if (!current_is_kswapd())
 		kfree(walk);
 }
+#else
+
+static void reset_current_reclaim_batch_size(struct lruvec *lruvec)
+{
+
+}
+
+static inline void *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
+{
+	return NULL;
+}
+
+static inline void clear_mm_walk(void)
+{
+
+}
+#endif
 
 static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
 {
@@ -4468,11 +4499,15 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
 	/* make sure preceding modifications appear */
 	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
 
+#ifndef CONFIG_LRU_TASK_PAGE_AGING
+	lruvec->seq_update_progress = false;
+#endif
 	spin_unlock_irq(&lruvec->lru_lock);
 }
 
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
-			       struct scan_control *sc, bool can_swap, bool force_scan)
+			       bool can_swap, bool force_scan)
 {
 	bool success;
 	struct lru_gen_mm_walk *walk;
@@ -4498,7 +4533,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 		goto done;
 	}
-	walk = set_mm_walk(NULL, true);
+	walk = (struct lru_gen_mm_walk *)set_mm_walk(NULL, true);
 	if (!walk) {
 		success = iterate_mm_list_nowalk(lruvec, max_seq);
 		goto done;
 	}
@@ -4520,6 +4555,51 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 
 	return success;
 }
+#else
+
+/*
+ * inc_max_seq() can drop lru_lock while it runs, so use the
+ * seq_update_progress flag to prevent concurrent updates of max_seq.
+ */
+bool __try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+			  bool can_swap, bool force_scan)
+{
+	bool success = false;
+	struct lru_gen_folio *lrugen = &lruvec->lrugen;
+
+	VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+
+	/* see the comment in iterate_mm_list() */
+	if (lruvec->seq_update_progress)
+		success = false;
+	else {
+		spin_lock_irq(&lruvec->lru_lock);
+
+		if (max_seq != lrugen->max_seq)
+			goto done;
+
+		if (lruvec->seq_update_progress)
+			goto done;
+
+		success = true;
+		lruvec->seq_update_progress = true;
+done:
+		spin_unlock_irq(&lruvec->lru_lock);
+	}
+
+	if (success)
+		inc_max_seq(lruvec, can_swap, force_scan);
+
+	return success;
+}
+
+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+			       bool can_swap, bool force_scan)
+{
+	return __try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
+}
+#endif
+
 /******************************************************************************
  *                          working set protection
@@ -4630,6 +4710,7 @@ static void __look_around_gen_update(struct folio *folio, int new_gen)
 		folio_activate(folio);
 }
 
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 static inline bool current_reclaim_state_can_swap(void)
 {
 	if (current->reclaim_state)
@@ -4651,6 +4732,18 @@ static void look_around_gen_update(struct folio *folio, int new_gen)
 	}
 	return __look_around_gen_update(folio, new_gen);
 }
+#else
+
+static inline bool current_reclaim_state_can_swap(void)
+{
+	return true;
+}
+
+static inline void look_around_gen_update(struct folio *folio, int new_gen)
+{
+	return __look_around_gen_update(folio, new_gen);
+}
+#endif
 
 /*
  * This function exploits spatial locality when shrink_folio_list() walks the
@@ -4714,7 +4807,6 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 
 		folio = get_pfn_folio(pfn, memcg, pgdat,
 				      current_reclaim_state_can_swap());
-
 		if (!folio)
 			continue;
 
@@ -4734,9 +4826,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	arch_leave_lazy_mmu_mode();
 	mem_cgroup_unlock_pages();
 
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 	/* feedback from rmap walkers to page table walkers */
 	if (suitable_to_scan(i, young))
 		update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+#endif
 }
 
 /******************************************************************************
@@ -5156,7 +5250,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 	struct folio *next;
 	enum vm_event_item item;
 	struct reclaim_stat stat;
-	struct lru_gen_mm_walk *walk;
 	bool skip_retry = false;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -5211,9 +5304,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 
 	move_folios_to_lru(lruvec, &list);
 
-	walk = current->reclaim_state->mm_walk;
-	if (walk && walk->batched)
-		reset_batch_size(lruvec, walk);
+	reset_current_reclaim_batch_size(lruvec);
 
 	item = PGSTEAL_KSWAPD + reclaimer_offset();
 	if (!cgroup_reclaim(sc))
@@ -5321,7 +5412,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool
 		return nr_to_scan;
 
 	/* skip this lruvec as it's low on cold folios */
-	return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
+	return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0;
 }
 
 static unsigned long get_nr_to_reclaim(struct scan_control *sc)
@@ -5929,6 +6020,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 		seq_putc(m, '\n');
 	}
 
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 	seq_puts(m, "                      ");
 	for (i = 0; i < NR_MM_STATS; i++) {
 		const char *s = "      ";
@@ -5945,6 +6037,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 		seq_printf(m, " %10lu%c", n, s[i]);
 	}
 	seq_putc(m, '\n');
+#endif
 }
 
 /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
@@ -6026,7 +6119,7 @@ static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_contr
 	if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
 		return -ERANGE;
 
-	try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+	try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan);
 
 	return 0;
 }
@@ -6218,7 +6311,12 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 	for_each_gen_type_zone(gen, type, zone)
 		INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
 
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 	lruvec->mm_state.seq = MIN_NR_GENS;
+#else
+	lruvec->seq_update_progress = false;
+#endif
+
 }
 
 #ifdef CONFIG_MEMCG
@@ -6237,16 +6335,20 @@ void lru_gen_init_pgdat(struct pglist_data *pgdat)
 
 void lru_gen_init_memcg(struct mem_cgroup *memcg)
 {
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 	INIT_LIST_HEAD(&memcg->mm_list.fifo);
 	spin_lock_init(&memcg->mm_list.lock);
+
+#endif
 }
 
 void lru_gen_exit_memcg(struct mem_cgroup *memcg)
 {
-	int i;
 	int nid;
 
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
 	VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo));
+#endif
 
 	for_each_node(nid) {
 		struct lruvec *lruvec = get_lruvec(memcg, nid);
@@ -6256,10 +6358,12 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
 
 		lruvec->lrugen.list.next = LIST_POISON1;
 
-		for (i = 0; i < NR_BLOOM_FILTERS; i++) {
+#ifdef CONFIG_LRU_TASK_PAGE_AGING
+		for (int i = 0; i < NR_BLOOM_FILTERS; i++) {
 			bitmap_free(lruvec->mm_state.filters[i]);
			lruvec->mm_state.filters[i] = NULL;
 		}
+#endif
 	}
 }
-- 
2.40.1