Re: [PATCH v2] smaps should deal with huge zero page exactly same as normal zero page.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Oct 28, 2014 at 03:18:10PM +0200, Kirill A. Shutemov wrote:
> On Tue, Oct 28, 2014 at 11:18:38PM +0800, Fengwei Yin wrote:
> > On Mon, Oct 27, 2014 at 03:17:48PM -0700, Andrew Morton wrote:
> > > On Mon, 27 Oct 2014 23:02:13 +0800 Fengwei Yin <yfw.kernel@xxxxxxxxx> wrote:
> > > 
> > > > We could see following memory info in /proc/xxxx/smaps with THP enabled.
> > > >   7bea458b3000-7fea458b3000 r--p 00000000 00:13 39989  /dev/zero
> > > >   Size:           4294967296 kB
> > > >   Rss:            10612736 kB
> > > >   Pss:            10612736 kB
> > > >   Shared_Clean:          0 kB
> > > >   Shared_Dirty:          0 kB
> > > >   Private_Clean:  10612736 kB
> > > >   Private_Dirty:         0 kB
> > > >   Referenced:     10612736 kB
> > > >   Anonymous:             0 kB
> > > >   AnonHugePages:  10612736 kB
> > > >   Swap:                  0 kB
> > > >   KernelPageSize:        4 kB
> > > >   MMUPageSize:           4 kB
> > > >   Locked:                0 kB
> > > >   VmFlags: rd mr mw me
> > > > which is wrong becuase just huge_zero_page/normal_zero_page is used for
> > > > /dev/zero. Most of the value should be 0.
> > > > 
> > > > This patch detects huge_zero_page (original implementation just detect
> > > > normal_zero_page) and avoids to update the wrong value for huge_zero_page.
> > > > 
> > > > ...
> > > >
> > > > --- a/mm/memory.c
> > > > +++ b/mm/memory.c
> > > > @@ -41,6 +41,7 @@
> > > >  #include <linux/kernel_stat.h>
> > > >  #include <linux/mm.h>
> > > >  #include <linux/hugetlb.h>
> > > > +#include <linux/huge_mm.h>
> > > >  #include <linux/mman.h>
> > > >  #include <linux/swap.h>
> > > >  #include <linux/highmem.h>
> > > > @@ -787,6 +788,9 @@ check_pfn:
> > > >  		return NULL;
> > > >  	}
> > > >  
> > > > +	if (is_huge_zero_pfn(pfn))
> > > > +		return NULL;
> > > > +
> > > 
> > > Why this change?
> > > 
> > I suppose the huge zero page should have same behavior as normal zero
> > page. vm_normal_page will return NULL if the pte is for normal zero
> > page. This change make it return NULL for huge zero page.
> > 
> > > What effect does it have upon vm_normal_page()'s many existing callers?
> > This is good question. I suppose it will not impact existing caller.
> 
> vm_normal_page() is designed to handle pte. We only get there due hack
> with pmd to pte cast in smaps_pte_range(). Let's try to get rid of it
> instead.
> 
> Could you test the patch below? I think it's a better fix.
I will test the patch tomorrow and let you know the result. Thanks.

> 
> From 592a7e789128d92e7f5165b583558443e82c88fd Mon Sep 17 00:00:00 2001
> From: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx>
> Date: Tue, 28 Oct 2014 14:51:31 +0200
> Subject: [PATCH] mm: fix huge zero page accounting in smaps report
> 
> As a small zero page, huge zero page should not be accounted in smaps
> report as normal page.
> 
> For small pages we rely on vm_normal_page() to filter out zero page, but
> vm_normal_page() is not designed to handle pmds. We only get here due
> hackish cast pmd to pte in smaps_pte_range() -- pte and pmd format is
> not necessary compatible on each and every architecture.
> 
> Let's add separate codepath to handle pmds. follow_trans_huge_pmd() will
> detect huge zero page for us.
> 
> We would need pmd_dirty() helper to do this properly. The patch adds it
> to THP-enabled architectures which don't yet have one.
> 
> Reported-by: Fengguang Wu <fengguang.wu@xxxxxxxxx>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> ---
>  arch/arm64/include/asm/pgtable.h         |   1 +
>  arch/powerpc/include/asm/pgtable-ppc64.h |   1 +
>  arch/sparc/include/asm/pgtable_64.h      |   7 +++
>  arch/x86/include/asm/pgtable.h           |   5 ++
>  fs/proc/task_mmu.c                       | 101 ++++++++++++++++++++-----------
>  5 files changed, 79 insertions(+), 36 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 41a43bf26492..df22314f57cf 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -279,6 +279,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
>  #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
> +#define pmd_dirty(pmd)		pte_dirty(pmd_pte(pmd))
>  #define pmd_young(pmd)		pte_young(pmd_pte(pmd))
>  #define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
>  #define pmd_mksplitting(pmd)	pte_pmd(pte_mkspecial(pmd_pte(pmd)))
> diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
> index ae153c40ab7c..9b4b1904efae 100644
> --- a/arch/powerpc/include/asm/pgtable-ppc64.h
> +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
> @@ -467,6 +467,7 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
>  }
>  
>  #define pmd_pfn(pmd)		pte_pfn(pmd_pte(pmd))
> +#define pmd_dirty(pmd)		pte_dirty(pmd_pte(pmd))
>  #define pmd_young(pmd)		pte_young(pmd_pte(pmd))
>  #define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
>  #define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
> diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
> index bfeb626085ac..90af17ee6184 100644
> --- a/arch/sparc/include/asm/pgtable_64.h
> +++ b/arch/sparc/include/asm/pgtable_64.h
> @@ -667,6 +667,13 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
>  }
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +static inline pmd_t pmd_dirty(pmd_t pmd)
> +{
> +	pte_t pte = __pte(pmd_val(pmd));
> +
> +	return pte_dirty(pte);
> +}
> +
>  static inline unsigned long pmd_young(pmd_t pmd)
>  {
>  	pte_t pte = __pte(pmd_val(pmd));
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index aa97a070f09f..081d6f45e006 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -99,6 +99,11 @@ static inline int pte_young(pte_t pte)
>  	return pte_flags(pte) & _PAGE_ACCESSED;
>  }
>  
> +static inline int pmd_dirty(pmd_t pmd)
> +{
> +	return pmd_flags(pmd) & _PAGE_DIRTY;
> +}
> +
>  static inline int pmd_young(pmd_t pmd)
>  {
>  	return pmd_flags(pmd) & _PAGE_ACCESSED;
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 4e0388cffe3d..2ab200d429be 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -447,58 +447,88 @@ struct mem_size_stats {
>  	u64 pss;
>  };
>  
> +static void smaps_account(struct mem_size_stats *mss, struct page *page,
> +		unsigned long size, bool young, bool dirty)
> +{
> +	int mapcount;
> +
> +	if (PageAnon(page))
> +		mss->anonymous += size;
>  
> -static void smaps_pte_entry(pte_t ptent, unsigned long addr,
> -		unsigned long ptent_size, struct mm_walk *walk)
> +	mss->resident += size;
> +	/* Accumulate the size in pages that have been accessed. */
> +	if (young || PageReferenced(page))
> +		mss->referenced += size;
> +	mapcount = page_mapcount(page);
> +	if (mapcount >= 2) {
> +		if (dirty || PageDirty(page))
> +			mss->shared_dirty += size;
> +		else
> +			mss->shared_clean += size;
> +		mss->pss += (size << PSS_SHIFT) / mapcount;
> +	} else {
> +		if (dirty || PageDirty(page))
> +			mss->private_dirty += size;
> +		else
> +			mss->private_clean += size;
> +		mss->pss += (size << PSS_SHIFT);
> +	}
> +}
> +
> +
> +static void smaps_pte_entry(pte_t *pte, unsigned long addr,
> +		struct mm_walk *walk)
>  {
>  	struct mem_size_stats *mss = walk->private;
>  	struct vm_area_struct *vma = mss->vma;
>  	pgoff_t pgoff = linear_page_index(vma, addr);
>  	struct page *page = NULL;
> -	int mapcount;
>  
> -	if (pte_present(ptent)) {
> -		page = vm_normal_page(vma, addr, ptent);
> -	} else if (is_swap_pte(ptent)) {
> -		swp_entry_t swpent = pte_to_swp_entry(ptent);
> +	if (pte_present(*pte)) {
> +		page = vm_normal_page(vma, addr, *pte);
> +	} else if (is_swap_pte(*pte)) {
> +		swp_entry_t swpent = pte_to_swp_entry(*pte);
>  
>  		if (!non_swap_entry(swpent))
> -			mss->swap += ptent_size;
> +			mss->swap += PAGE_SIZE;
>  		else if (is_migration_entry(swpent))
>  			page = migration_entry_to_page(swpent);
> -	} else if (pte_file(ptent)) {
> -		if (pte_to_pgoff(ptent) != pgoff)
> -			mss->nonlinear += ptent_size;
> +	} else if (pte_file(*pte)) {
> +		if (pte_to_pgoff(*pte) != pgoff)
> +			mss->nonlinear += PAGE_SIZE;
>  	}
>  
>  	if (!page)
>  		return;
>  
> -	if (PageAnon(page))
> -		mss->anonymous += ptent_size;
> -
>  	if (page->index != pgoff)
> -		mss->nonlinear += ptent_size;
> +		mss->nonlinear += PAGE_SIZE;
>  
> -	mss->resident += ptent_size;
> -	/* Accumulate the size in pages that have been accessed. */
> -	if (pte_young(ptent) || PageReferenced(page))
> -		mss->referenced += ptent_size;
> -	mapcount = page_mapcount(page);
> -	if (mapcount >= 2) {
> -		if (pte_dirty(ptent) || PageDirty(page))
> -			mss->shared_dirty += ptent_size;
> -		else
> -			mss->shared_clean += ptent_size;
> -		mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
> -	} else {
> -		if (pte_dirty(ptent) || PageDirty(page))
> -			mss->private_dirty += ptent_size;
> -		else
> -			mss->private_clean += ptent_size;
> -		mss->pss += (ptent_size << PSS_SHIFT);
> -	}
> +	smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
> +}
> +
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
> +		struct mm_walk *walk)
> +{
> +	struct mem_size_stats *mss = walk->private;
> +	struct vm_area_struct *vma = mss->vma;
> +	struct page *page;
> +
> +	/* FOLL_DUMP will return -EFAULT on huge zero page */
> +	page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
> +	if (IS_ERR_OR_NULL(page))
> +		return;
> +	mss->anonymous_thp += HPAGE_PMD_SIZE;
> +	smaps_account(mss, page, HPAGE_PMD_SIZE,
> +			pmd_young(*pmd), pmd_dirty(*pmd));
>  }
> +#else
> +static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
> +		struct mm_walk *walk)
> +{
> +}
> +#endif
>  
>  static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  			   struct mm_walk *walk)
> @@ -509,9 +539,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  	spinlock_t *ptl;
>  
>  	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
> -		smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
> +		smaps_pmd_entry(pmd, addr, walk);
>  		spin_unlock(ptl);
> -		mss->anonymous_thp += HPAGE_PMD_SIZE;
>  		return 0;
>  	}
>  
> @@ -524,7 +553,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  	 */
>  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>  	for (; addr != end; pte++, addr += PAGE_SIZE)
> -		smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
> +		smaps_pte_entry(pte, addr, walk);
>  	pte_unmap_unlock(pte - 1, ptl);
>  	cond_resched();
>  	return 0;
> -- 
>  Kirill A. Shutemov
--
To unsubscribe from this list: send the line "unsubscribe linux-arch" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Kernel]     [Kernel Newbies]     [x86 Platform Driver]     [Netdev]     [Linux Wireless]     [Netfilter]     [Bugtraq]     [Linux Filesystems]     [Yosemite Discussion]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]

  Powered by Linux