On Mon 25-09-17 10:39:13, Kirill A. Shutemov wrote: > On machine with 5-level paging support a process can allocate > significant amount of memory and stay unnoticed by oom-killer and > memory cgroup. The trick is to allocate a lot of PUD page tables. > We don't account PUD page tables, only PMD and PTE. > > We already addressed the same issue for PMD page tables, see > dc6c9a35b66b ("mm: account pmd page tables to the process"). > Introduction 5-level paging bring the same issue for PUD page tables. > > The patch expands accounting to PUD level. > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> > Cc: Michal Hocko <mhocko@xxxxxxxx> > Cc: Vlastimil Babka <vbabka@xxxxxxx> So just for the reference. You can assume my Acked-by: Michal Hocko <mhocko@xxxxxxxx> it seems that no arch has PUD_ORDER > 0 so the oom part works correctly. As mentioned in other email I think we should actually simplify the whole thing and use a single counter for all pte levels. This will remove some code and make this whole thing less error prone. > --- > Documentation/sysctl/vm.txt | 8 ++++---- > arch/powerpc/mm/hugetlbpage.c | 1 + > arch/sparc/mm/hugetlbpage.c | 1 + > fs/proc/task_mmu.c | 5 ++++- > include/linux/mm.h | 34 ++++++++++++++++++++++++++++++++-- > include/linux/mm_types.h | 3 +++ > kernel/fork.c | 4 ++++ > mm/debug.c | 6 ++++-- > mm/memory.c | 15 +++++++++------ > mm/oom_kill.c | 8 +++++--- > 10 files changed, 67 insertions(+), 18 deletions(-) > > diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt > index 9baf66a9ef4e..2717b6f2d706 100644 > --- a/Documentation/sysctl/vm.txt > +++ b/Documentation/sysctl/vm.txt > @@ -622,10 +622,10 @@ oom_dump_tasks > > Enables a system-wide task dump (excluding kernel threads) to be produced > when the kernel performs an OOM-killing and includes such information as > -pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj > -score, and name. 
This is helpful to determine why the OOM killer was > -invoked, to identify the rogue task that caused it, and to determine why > -the OOM killer chose the task it did to kill. > +pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents, > +oom_score_adj score, and name. This is helpful to determine why the OOM > +killer was invoked, to identify the rogue task that caused it, and to > +determine why the OOM killer chose the task it did to kill. > > If this is set to zero, this information is suppressed. On very > large systems with thousands of tasks it may not be feasible to dump > diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c > index 1571a498a33f..a9b9083c5e49 100644 > --- a/arch/powerpc/mm/hugetlbpage.c > +++ b/arch/powerpc/mm/hugetlbpage.c > @@ -433,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, > pud = pud_offset(pgd, start); > pgd_clear(pgd); > pud_free_tlb(tlb, pud, start); > + mm_dec_nr_puds(tlb->mm); > } > > /* > diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c > index bcd8cdbc377f..fd0d85808828 100644 > --- a/arch/sparc/mm/hugetlbpage.c > +++ b/arch/sparc/mm/hugetlbpage.c > @@ -471,6 +471,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, > pud = pud_offset(pgd, start); > pgd_clear(pgd); > pud_free_tlb(tlb, pud, start); > + mm_dec_nr_puds(tlb->mm); > } > > void hugetlb_free_pgd_range(struct mmu_gather *tlb, > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c > index 5589b4bd4b85..0bf9e423aa99 100644 > --- a/fs/proc/task_mmu.c > +++ b/fs/proc/task_mmu.c > @@ -25,7 +25,7 @@ > > void task_mem(struct seq_file *m, struct mm_struct *mm) > { > - unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; > + unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem; > unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; > > anon = get_mm_counter(mm, MM_ANONPAGES); > @@ -51,6 +51,7 @@ void task_mem(struct seq_file *m, 
struct mm_struct *mm) > swap = get_mm_counter(mm, MM_SWAPENTS); > ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); > pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); > + puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); > seq_printf(m, > "VmPeak:\t%8lu kB\n" > "VmSize:\t%8lu kB\n" > @@ -67,6 +68,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) > "VmLib:\t%8lu kB\n" > "VmPTE:\t%8lu kB\n" > "VmPMD:\t%8lu kB\n" > + "VmPUD:\t%8lu kB\n" > "VmSwap:\t%8lu kB\n", > hiwater_vm << (PAGE_SHIFT-10), > total_vm << (PAGE_SHIFT-10), > @@ -81,6 +83,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) > mm->stack_vm << (PAGE_SHIFT-10), text, lib, > ptes >> 10, > pmds >> 10, > + puds >> 10, > swap << (PAGE_SHIFT-10)); > hugetlb_report_usage(m, mm); > } > diff --git a/include/linux/mm.h b/include/linux/mm.h > index f8c10d336e42..c5eb8c609599 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1604,8 +1604,38 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, > { > return 0; > } > + > +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) > +{ > + return 0; > +} > + > +static inline void mm_nr_puds_init(struct mm_struct *mm) {} > +static inline void mm_inc_nr_puds(struct mm_struct *mm) {} > +static inline void mm_dec_nr_puds(struct mm_struct *mm) {} > + > #else > int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); > + > +static inline void mm_nr_puds_init(struct mm_struct *mm) > +{ > + atomic_long_set(&mm->nr_puds, 0); > +} > + > +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) > +{ > + return atomic_long_read(&mm->nr_puds); > +} > + > +static inline void mm_inc_nr_puds(struct mm_struct *mm) > +{ > + atomic_long_inc(&mm->nr_puds); > +} > + > +static inline void mm_dec_nr_puds(struct mm_struct *mm) > +{ > + atomic_long_dec(&mm->nr_puds); > +} > #endif > > #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) > @@ -1617,7 +1647,7 @@ static inline 
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, > > static inline void mm_nr_pmds_init(struct mm_struct *mm) {} > > -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) > +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) > { > return 0; > } > @@ -1633,7 +1663,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm) > atomic_long_set(&mm->nr_pmds, 0); > } > > -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) > +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) > { > return atomic_long_read(&mm->nr_pmds); > } > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 46f4ecf5479a..6c8c2bb9e5a1 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -401,6 +401,9 @@ struct mm_struct { > atomic_long_t nr_ptes; /* PTE page table pages */ > #if CONFIG_PGTABLE_LEVELS > 2 > atomic_long_t nr_pmds; /* PMD page table pages */ > +#endif > +#if CONFIG_PGTABLE_LEVELS > 3 > + atomic_long_t nr_puds; /* PUD page table pages */ > #endif > int map_count; /* number of VMAs */ > > diff --git a/kernel/fork.c b/kernel/fork.c > index 10646182440f..5624918154db 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -815,6 +815,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, > mm->core_state = NULL; > atomic_long_set(&mm->nr_ptes, 0); > mm_nr_pmds_init(mm); > + mm_nr_puds_init(mm); > mm->map_count = 0; > mm->locked_vm = 0; > mm->pinned_vm = 0; > @@ -874,6 +875,9 @@ static void check_mm(struct mm_struct *mm) > if (mm_nr_pmds(mm)) > pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", > mm_nr_pmds(mm)); > + if (mm_nr_puds(mm)) > + pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", > + mm_nr_puds(mm)); > > #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS > VM_BUG_ON_MM(mm->pmd_huge_pte, mm); > diff --git a/mm/debug.c b/mm/debug.c > index 5715448ab0b5..afccb2565269 100644 > --- a/mm/debug.c > +++ b/mm/debug.c > @@ -104,7 +104,8 @@ void 
dump_mm(const struct mm_struct *mm) > "get_unmapped_area %p\n" > #endif > "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" > - "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" > + "pgd %p mm_users %d mm_count %d\n" > + "nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n" > "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" > "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" > "start_code %lx end_code %lx start_data %lx end_data %lx\n" > @@ -135,7 +136,8 @@ void dump_mm(const struct mm_struct *mm) > mm->pgd, atomic_read(&mm->mm_users), > atomic_read(&mm->mm_count), > atomic_long_read((atomic_long_t *)&mm->nr_ptes), > - mm_nr_pmds((struct mm_struct *)mm), > + mm_nr_pmds(mm), > + mm_nr_puds(mm), > mm->map_count, > mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, > mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, > diff --git a/mm/memory.c b/mm/memory.c > index ec4e15494901..8f49fdafac56 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, > pud = pud_offset(p4d, start); > p4d_clear(p4d); > pud_free_tlb(tlb, pud, start); > + mm_dec_nr_puds(tlb->mm); > } > > static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, > @@ -4124,15 +4125,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) > > spin_lock(&mm->page_table_lock); > #ifndef __ARCH_HAS_5LEVEL_HACK > - if (p4d_present(*p4d)) /* Another has populated it */ > - pud_free(mm, new); > - else > + if (!p4d_present(*p4d)) { > + mm_inc_nr_puds(mm); > p4d_populate(mm, p4d, new); > -#else > - if (pgd_present(*p4d)) /* Another has populated it */ > + } else /* Another has populated it */ > pud_free(mm, new); > - else > +#else > + if (!pgd_present(*p4d)) { > + mm_inc_nr_puds(mm); > pgd_populate(mm, p4d, new); > + } else /* Another has populated it */ > + pud_free(mm, new); > #endif /* __ARCH_HAS_5LEVEL_HACK */ > 
spin_unlock(&mm->page_table_lock); > return 0; > diff --git a/mm/oom_kill.c b/mm/oom_kill.c > index 99736e026712..4bee6968885d 100644 > --- a/mm/oom_kill.c > +++ b/mm/oom_kill.c > @@ -200,7 +200,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, > * task's rss, pagetable and swap space use. > */ > points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + > - atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); > + atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) + > + mm_nr_puds(p->mm); > task_unlock(p); > > /* > @@ -376,7 +377,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) > struct task_struct *p; > struct task_struct *task; > > - pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); > + pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n"); > rcu_read_lock(); > for_each_process(p) { > if (oom_unkillable_task(p, memcg, nodemask)) > @@ -392,11 +393,12 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) > continue; > } > > - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", > + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n", > task->pid, from_kuid(&init_user_ns, task_uid(task)), > task->tgid, task->mm->total_vm, get_mm_rss(task->mm), > atomic_long_read(&task->mm->nr_ptes), > mm_nr_pmds(task->mm), > + mm_nr_puds(task->mm), > get_mm_counter(task->mm, MM_SWAPENTS), > task->signal->oom_score_adj, task->comm); > task_unlock(task); > -- > 2.14.1 > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@xxxxxxxxx. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a> -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>