Add an infrastructure that maintains either a system-wide mm_struct list or per-memcg mm_struct lists. Multiple threads can concurrently work on the same mm_struct list, and each of them will be given a different mm_struct. Those who finish early can optionally wait on the rest after the iterator has reached the end of the list. This infrastructure also tracks whether an mm_struct is being used on any CPUs or has been used since the last time a worker looked at it. In other words, workers will not be given an mm_struct that belongs to a process that has been sleeping. Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx> --- fs/exec.c | 2 + include/linux/memcontrol.h | 4 + include/linux/mm_types.h | 135 +++++++++++++++++++ include/linux/mmzone.h | 2 - kernel/exit.c | 1 + kernel/fork.c | 10 ++ kernel/kthread.c | 1 + kernel/sched/core.c | 2 + mm/memcontrol.c | 28 ++++ mm/vmscan.c | 263 +++++++++++++++++++++++++++++++++++++ 10 files changed, 446 insertions(+), 2 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 18594f11c31f..c691d4d7720c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1008,6 +1008,7 @@ static int exec_mmap(struct mm_struct *mm) active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; + lru_gen_add_mm(mm); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for @@ -1018,6 +1019,7 @@ static int exec_mmap(struct mm_struct *mm) if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); + lru_gen_switch_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); tsk->mm->vmacache_seqnum = 0; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f325aeb4b4e8..591557c5b7e2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -335,6 +335,10 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif +#ifdef CONFIG_LRU_GEN + struct lru_gen_mm_list *mm_list; +#endif + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0974ad501a47..b8a038a016f2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -15,6 +15,8 @@ #include <linux/page-flags-layout.h> #include <linux/workqueue.h> #include <linux/seqlock.h> +#include <linux/nodemask.h> +#include <linux/mmdebug.h> #include <asm/mmu.h> @@ -382,6 +384,8 @@ struct core_state { struct completion startup; }; +#define ANON_AND_FILE 2 + struct kioctx_table; struct mm_struct { struct { @@ -560,6 +564,22 @@ struct mm_struct { #ifdef CONFIG_IOMMU_SUPPORT u32 pasid; +#endif +#ifdef CONFIG_LRU_GEN + struct { + /* node of a global or per-memcg mm list */ + struct list_head list; +#ifdef CONFIG_MEMCG + /* points to memcg of the owner task above */ + struct mem_cgroup *memcg; +#endif + /* indicates this mm has been used since last walk */ + nodemask_t nodes[ANON_AND_FILE]; +#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + /* number of cpus that are using this mm */ + atomic_t nr_cpus; +#endif + } lru_gen; #endif } __randomize_layout; @@ -587,6 +607,121 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return (struct cpumask *)&mm->cpu_bitmap; } +#ifdef CONFIG_LRU_GEN + +struct lru_gen_mm_list { + /* head of a global or per-memcg mm list */ + struct list_head head; + /* protects the list */ + spinlock_t lock; + struct { + /* set to max_seq after each round of walk */ + unsigned long cur_seq; + /* next mm on the list to walk */ + struct list_head *iter; + /* to wait for last worker to finish */ + struct wait_queue_head wait; + /* number of concurrent workers */ + int nr_workers; + } nodes[0]; +}; + +void lru_gen_init_mm(struct mm_struct *mm); +void lru_gen_add_mm(struct mm_struct *mm); +void lru_gen_del_mm(struct mm_struct *mm); +#ifdef CONFIG_MEMCG +int lru_gen_alloc_mm_list(struct mem_cgroup *memcg); +void lru_gen_free_mm_list(struct mem_cgroup *memcg); +void lru_gen_migrate_mm(struct mm_struct *mm); +#endif + +/* + * Track usage so mms that haven't been used since last walk can be skipped. + * + * This function introduces a theoretical overhead for each mm switch, but it + * hasn't been measurable. + */ +static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) +{ + int file; + + /* exclude init_mm, efi_mm, etc. */ + if (!core_kernel_data((unsigned long)old)) { + VM_BUG_ON(old == &init_mm); + + for (file = 0; file < ANON_AND_FILE; file++) + nodes_setall(old->lru_gen.nodes[file]); + +#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + atomic_dec(&old->lru_gen.nr_cpus); + VM_BUG_ON_MM(atomic_read(&old->lru_gen.nr_cpus) < 0, old); +#endif + } else + VM_BUG_ON_MM(READ_ONCE(old->lru_gen.list.prev) || + READ_ONCE(old->lru_gen.list.next), old); + + if (!core_kernel_data((unsigned long)new)) { + VM_BUG_ON(new == &init_mm); + +#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + atomic_inc(&new->lru_gen.nr_cpus); + VM_BUG_ON_MM(atomic_read(&new->lru_gen.nr_cpus) < 0, new); +#endif + } else + VM_BUG_ON_MM(READ_ONCE(new->lru_gen.list.prev) || + READ_ONCE(new->lru_gen.list.next), new); +} + +/* Returns whether the mm is being used on any cpus. */ +static inline bool lru_gen_mm_is_active(struct mm_struct *mm) +{ +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + return !cpumask_empty(mm_cpumask(mm)); +#else + return atomic_read(&mm->lru_gen.nr_cpus); +#endif +} + +#else /* CONFIG_LRU_GEN */ + +static inline void lru_gen_init_mm(struct mm_struct *mm) +{ +} + +static inline void lru_gen_add_mm(struct mm_struct *mm) +{ +} + +static inline void lru_gen_del_mm(struct mm_struct *mm) +{ +} + +#ifdef CONFIG_MEMCG +static inline int lru_gen_alloc_mm_list(struct mem_cgroup *memcg) +{ + return 0; +} + +static inline void lru_gen_free_mm_list(struct mem_cgroup *memcg) +{ +} + +static inline void lru_gen_migrate_mm(struct mm_struct *mm) +{ +} +#endif + +static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new) +{ +} + +static inline bool lru_gen_mm_is_active(struct mm_struct *mm) +{ + return false; +} + +#endif /* CONFIG_LRU_GEN */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 47946cec7584..a99a1050565a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -285,8 +285,6 @@ static inline bool is_active_lru(enum lru_list lru) return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } -#define ANON_AND_FILE 2 - enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI diff --git a/kernel/exit.c b/kernel/exit.c index 04029e35e69a..e4292717ce37 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -422,6 +422,7 @@ void mm_update_next_owner(struct mm_struct *mm) goto retry; } WRITE_ONCE(mm->owner, c); + lru_gen_migrate_mm(mm); task_unlock(c); put_task_struct(c); } diff --git a/kernel/fork.c b/kernel/fork.c index d3171e8e88e5..e261b797955d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -665,6 +665,7 @@ static void check_mm(struct mm_struct *mm) #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif + VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm); } #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) @@ -1047,6 +1048,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, goto fail_nocontext; mm->user_ns = get_user_ns(user_ns); + lru_gen_init_mm(mm); return mm; fail_nocontext: @@ -1089,6 +1091,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + lru_gen_del_mm(mm); mmdrop(mm); } @@ -2513,6 +2516,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) get_task_struct(p); } + if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { + /* lock p to synchronize with memcg migration */ + task_lock(p); + lru_gen_add_mm(p->mm); + task_unlock(p); + } + wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 1578973c5740..8da7767bb06a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1303,6 +1303,7 @@ void kthread_use_mm(struct mm_struct *mm) tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); + lru_gen_switch_mm(active_mm, mm); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca2bb629595f..56274a14ce09 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4308,6 +4308,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * finish_task_switch()'s mmdrop(). */ switch_mm_irqs_off(prev->active_mm, next->mm, next); + lru_gen_switch_mm(prev->active_mm, next->mm); if (!prev->mm) { // from kernel /* will mmdrop() in finish_task_switch(). */ @@ -7599,6 +7600,7 @@ void idle_task_exit(void) if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + lru_gen_switch_mm(mm, &init_mm); finish_arch_post_lock_switch(); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 845eec01ef9d..5836780fe138 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5209,6 +5209,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); free_percpu(memcg->vmstats_local); + lru_gen_free_mm_list(memcg); kfree(memcg); } @@ -5261,6 +5262,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (alloc_mem_cgroup_per_node_info(memcg, node)) goto fail; + if (lru_gen_alloc_mm_list(memcg)) + goto fail; + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) goto fail; @@ -6165,6 +6169,29 @@ static void mem_cgroup_move_task(void) } #endif +#ifdef CONFIG_LRU_GEN +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *task = NULL; + + cgroup_taskset_for_each_leader(task, css, tset) + ; + + if (!task) + return; + + task_lock(task); + if (task->mm && task->mm->owner == task) + lru_gen_migrate_mm(task->mm); + task_unlock(task); +} +#else +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ +} +#endif + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { if (value == PAGE_COUNTER_MAX) @@ -6505,6 +6532,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_free = mem_cgroup_css_free, .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, + .attach = mem_cgroup_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, .dfl_cftypes = memory_files, diff --git a/mm/vmscan.c b/mm/vmscan.c index 1a24d2e0a4cb..f7657ab0d4b7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4314,3 +4314,266 @@ void check_move_unevictable_pages(struct pagevec *pvec) } } EXPORT_SYMBOL_GPL(check_move_unevictable_pages); + +#ifdef CONFIG_LRU_GEN + +/****************************************************************************** + * global and per-memcg mm list + ******************************************************************************/ + +/* + * After pages are faulted in, they become the youngest generation. They must + * go through aging process twice before they can be evicted. After first scan, + * their accessed bit set during initial faults are cleared and they become the + * second youngest generation. And second scan makes sure they haven't been used + * since the first. + */ +#define MIN_NR_GENS 2 + +static struct lru_gen_mm_list *global_mm_list; + +static struct lru_gen_mm_list *alloc_mm_list(void) +{ + int nid; + struct lru_gen_mm_list *mm_list; + + mm_list = kzalloc(struct_size(mm_list, nodes, nr_node_ids), GFP_KERNEL); + if (!mm_list) + return NULL; + + INIT_LIST_HEAD(&mm_list->head); + spin_lock_init(&mm_list->lock); + + for_each_node(nid) { + mm_list->nodes[nid].cur_seq = MIN_NR_GENS - 1; + mm_list->nodes[nid].iter = &mm_list->head; + init_waitqueue_head(&mm_list->nodes[nid].wait); + } + + return mm_list; +} + +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ +#ifdef CONFIG_MEMCG + if (!mem_cgroup_disabled()) + return memcg ? memcg->mm_list : root_mem_cgroup->mm_list; +#endif + VM_BUG_ON(memcg); + + return global_mm_list; +} + +void lru_gen_init_mm(struct mm_struct *mm) +{ + int file; + + INIT_LIST_HEAD(&mm->lru_gen.list); +#ifdef CONFIG_MEMCG + mm->lru_gen.memcg = NULL; +#endif +#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + atomic_set(&mm->lru_gen.nr_cpus, 0); +#endif + for (file = 0; file < ANON_AND_FILE; file++) + nodes_clear(mm->lru_gen.nodes[file]); +} + +void lru_gen_add_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + VM_BUG_ON_MM(!list_empty(&mm->lru_gen.list), mm); +#ifdef CONFIG_MEMCG + VM_BUG_ON_MM(mm->lru_gen.memcg, mm); + WRITE_ONCE(mm->lru_gen.memcg, memcg); +#endif + spin_lock(&mm_list->lock); + list_add_tail(&mm->lru_gen.list, &mm_list->head); + spin_unlock(&mm_list->lock); +} + +void lru_gen_del_mm(struct mm_struct *mm) +{ + int nid; +#ifdef CONFIG_MEMCG + struct lru_gen_mm_list *mm_list = get_mm_list(mm->lru_gen.memcg); +#else + struct lru_gen_mm_list *mm_list = get_mm_list(NULL); +#endif + + spin_lock(&mm_list->lock); + + for_each_node(nid) { + if (mm_list->nodes[nid].iter != &mm->lru_gen.list) + continue; + + mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next; + if (mm_list->nodes[nid].iter == &mm_list->head) + WRITE_ONCE(mm_list->nodes[nid].cur_seq, + mm_list->nodes[nid].cur_seq + 1); + } + + list_del_init(&mm->lru_gen.list); + + spin_unlock(&mm_list->lock); + +#ifdef CONFIG_MEMCG + mem_cgroup_put(mm->lru_gen.memcg); + WRITE_ONCE(mm->lru_gen.memcg, NULL); +#endif +} + +#ifdef CONFIG_MEMCG +int lru_gen_alloc_mm_list(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return 0; + + memcg->mm_list = alloc_mm_list(); + + return memcg->mm_list ? 0 : -ENOMEM; +} + +void lru_gen_free_mm_list(struct mem_cgroup *memcg) +{ + kfree(memcg->mm_list); + memcg->mm_list = NULL; +} + +void lru_gen_migrate_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + + lockdep_assert_held(&mm->owner->alloc_lock); + + if (mem_cgroup_disabled()) + return; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(mm->owner); + rcu_read_unlock(); + if (memcg == mm->lru_gen.memcg) + return; + + VM_BUG_ON_MM(!mm->lru_gen.memcg, mm); + VM_BUG_ON_MM(list_empty(&mm->lru_gen.list), mm); + + lru_gen_del_mm(mm); + lru_gen_add_mm(mm); +} + +static bool mm_has_migrated(struct mm_struct *mm, struct mem_cgroup *memcg) +{ + return READ_ONCE(mm->lru_gen.memcg) != memcg; +} +#else +static bool mm_has_migrated(struct mm_struct *mm, struct mem_cgroup *memcg) +{ + return false; +} +#endif + +static bool should_skip_mm(struct mm_struct *mm, int nid, int swappiness) +{ + int file; + unsigned long size = 0; + + if (mm_is_oom_victim(mm)) + return true; + + for (file = !swappiness; file < ANON_AND_FILE; file++) { + if (lru_gen_mm_is_active(mm) || node_isset(nid, mm->lru_gen.nodes[file])) + size += file ? get_mm_counter(mm, MM_FILEPAGES) : + get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + } + + if (size < SWAP_CLUSTER_MAX) + return true; + + return !mmget_not_zero(mm); +} + +/* To support multiple workers that concurrently walk mm list. */ +static bool get_next_mm(struct lruvec *lruvec, unsigned long next_seq, + int swappiness, struct mm_struct **iter) +{ + bool last = true; + struct mm_struct *mm = NULL; + int nid = lruvec_pgdat(lruvec)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + if (*iter) + mmput_async(*iter); + else if (next_seq <= READ_ONCE(mm_list->nodes[nid].cur_seq)) + return false; + + spin_lock(&mm_list->lock); + + VM_BUG_ON(next_seq > mm_list->nodes[nid].cur_seq + 1); + VM_BUG_ON(*iter && next_seq < mm_list->nodes[nid].cur_seq); + VM_BUG_ON(*iter && !mm_list->nodes[nid].nr_workers); + + if (next_seq <= mm_list->nodes[nid].cur_seq) { + last = *iter; + goto done; + } + + if (mm_list->nodes[nid].iter == &mm_list->head) { + VM_BUG_ON(*iter || mm_list->nodes[nid].nr_workers); + mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next; + } + + while (!mm && mm_list->nodes[nid].iter != &mm_list->head) { + mm = list_entry(mm_list->nodes[nid].iter, struct mm_struct, lru_gen.list); + mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next; + if (should_skip_mm(mm, nid, swappiness)) + mm = NULL; + } + + if (mm_list->nodes[nid].iter == &mm_list->head) + WRITE_ONCE(mm_list->nodes[nid].cur_seq, + mm_list->nodes[nid].cur_seq + 1); +done: + if (*iter && !mm) + mm_list->nodes[nid].nr_workers--; + if (!*iter && mm) + mm_list->nodes[nid].nr_workers++; + + last = last && !mm_list->nodes[nid].nr_workers && + mm_list->nodes[nid].iter == &mm_list->head; + + spin_unlock(&mm_list->lock); + + *iter = mm; + + return last; +} + +/****************************************************************************** + * initialization + ******************************************************************************/ + +static int __init init_lru_gen(void) +{ + if (mem_cgroup_disabled()) { + global_mm_list = alloc_mm_list(); + if (!global_mm_list) { + pr_err("lru_gen: failed to allocate global mm list\n"); + return -ENOMEM; + } + } + + return 0; +}; +/* + * We want to run as early as possible because some debug code, e.g., + * dma_resv_lockdep(), calls mm_alloc() and mmput(). We only depend on mm_kobj, + * which is initialized one stage earlier by postcore_initcall(). + */ +arch_initcall(init_lru_gen); + +#endif /* CONFIG_LRU_GEN */ -- 2.31.0.rc2.261.g7f71774620-goog