On Thu, Apr 04, 2013 at 11:27:44AM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
>
> We allocate one page for the last level of linux page table. With THP and
> large page size of 16MB, that would mean we are wasting large part
> of that page. To map 16MB area, we only need a PTE space of 2K with 64K
> page size. This patch reduce the space wastage by sharing the page
> allocated for the last level of linux page table with multiple pmd
> entries. We call these smaller chunks PTE page fragments and allocated
> page, PTE page.
>
> In order to support systems which doesn't have 64K HPTE support, we also
> add another 2K to PTE page fragment. The second half of the PTE fragments
> is used for storing slot and secondary bit information of an HPTE. With this
> we now have a 4K PTE fragment.
>
> We use a simple approach to share the PTE page. On allocation, we bump the
> PTE page refcount to 16 and share the PTE page with the next 16 pte alloc
> request. This should help in the node locality of the PTE page fragment,
> assuming that the immediate pte alloc request will mostly come from the
> same NUMA node. We don't try to reuse the freed PTE page fragment. Hence
> we could be waisting some space.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
> ---
>  arch/powerpc/include/asm/mmu-book3e.h |    4 +
>  arch/powerpc/include/asm/mmu-hash64.h |    4 +
>  arch/powerpc/include/asm/page.h       |    4 +
>  arch/powerpc/include/asm/pgalloc-64.h |   72 ++++-------------
>  arch/powerpc/kernel/setup_64.c        |    4 +-
>  arch/powerpc/mm/mmu_context_hash64.c  |   35 +++++++++
>  arch/powerpc/mm/pgtable_64.c          |  137 +++++++++++++++++++++++++++++++++
>  7 files changed, 202 insertions(+), 58 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
> index 99d43e0..affbd68 100644
> --- a/arch/powerpc/include/asm/mmu-book3e.h
> +++ b/arch/powerpc/include/asm/mmu-book3e.h
> @@ -231,6 +231,10 @@ typedef struct {
>  	u64 high_slices_psize;	/* 4 bits per slice for now */
>  	u16 user_psize;		/* page size index */
>  #endif
> +#ifdef CONFIG_PPC_64K_PAGES
> +	/* for 4K PTE fragment support */
> +	struct page *pgtable_page;
> +#endif
>  } mm_context_t;
>
>  /* Page size definitions, common between 32 and 64-bit
> diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
> index 35bb51e..300ac3c 100644
> --- a/arch/powerpc/include/asm/mmu-hash64.h
> +++ b/arch/powerpc/include/asm/mmu-hash64.h
> @@ -498,6 +498,10 @@ typedef struct {
>  	unsigned long acop;	/* mask of enabled coprocessor types */
>  	unsigned int cop_pid;	/* pid value used with coprocessors */
>  #endif /* CONFIG_PPC_ICSWX */
> +#ifdef CONFIG_PPC_64K_PAGES
> +	/* for 4K PTE fragment support */
> +	struct page *pgtable_page;
> +#endif
>  } mm_context_t;
>
>
> diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
> index f072e97..38e7ff6 100644
> --- a/arch/powerpc/include/asm/page.h
> +++ b/arch/powerpc/include/asm/page.h
> @@ -378,7 +378,11 @@ void arch_free_page(struct page *page, int order);
>
>  struct vm_area_struct;
>
> +#ifdef CONFIG_PPC_64K_PAGES
> +typedef pte_t *pgtable_t;
> +#else
>  typedef struct page *pgtable_t;
> +#endif

Ugh, that's pretty horrible, though I don't see an easy way around it.
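It does follow from the changelog arithmetic, though: with 64K pages a
PMD entry maps 16M, so a PTE table is only 256 entries (2K), plus the 2K
of hash slot/secondary information, and a pgtable_t now has to name one
of the sixteen 4K fragments inside a 64K page rather than a whole page.
Just to spell the numbers out (illustration only, not part of the patch;
the names below are made up, the patch's own constants are the
PTE_FRAG_* defines added to pgtable_64.c further down):

/* Illustration only: the arithmetic behind the 4K fragments, assuming
 * 64K linux pages and 8-byte PTEs.  All names here are invented.
 */
#define FRAG_PTES	((16UL << 20) >> 16)	/* 256 PTEs map 16M with 64K pages */
#define FRAG_PTE_BYTES	(FRAG_PTES * 8)		/* 2K of actual PTEs */
#define FRAG_BYTES	(2 * FRAG_PTE_BYTES)	/* plus 2K of hash index: 4K */
#define FRAGS_PER_PAGE	((64UL << 10) / FRAG_BYTES)	/* 16, i.e. PTE_FRAG_NR */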
>  #include <asm-generic/memory_model.h>
>  #endif /* __ASSEMBLY__ */
> diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
> index cdbf555..3418989 100644
> --- a/arch/powerpc/include/asm/pgalloc-64.h
> +++ b/arch/powerpc/include/asm/pgalloc-64.h
> @@ -150,6 +150,13 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
>
>  #else /* if CONFIG_PPC_64K_PAGES */
>
> +extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int);
> +extern void page_table_free(struct mm_struct *, unsigned long *, int);
> +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
> +#ifdef CONFIG_SMP
> +extern void __tlb_remove_table(void *_table);
> +#endif
> +
>  #define pud_populate(mm, pud, pmd)	pud_set(pud, (unsigned long)pmd)
>
>  static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
> @@ -161,90 +168,42 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
>  static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
>  				pgtable_t pte_page)
>  {
> -	pmd_populate_kernel(mm, pmd, page_address(pte_page));
> +	pmd_set(pmd, (unsigned long)pte_page);
>  }
>
>  static inline pgtable_t pmd_pgtable(pmd_t pmd)
>  {
> -	return pmd_page(pmd);
> +	return (pgtable_t)(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE);
>  }
>
>  static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
>  					  unsigned long address)
>  {
> -	return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
> +	return (pte_t *)page_table_alloc(mm, address, 1);
>  }
>
>  static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
> -					unsigned long address)
> +				      unsigned long address)
>  {
> -	struct page *page;
> -	pte_t *pte;
> -
> -	pte = pte_alloc_one_kernel(mm, address);
> -	if (!pte)
> -		return NULL;
> -	page = virt_to_page(pte);
> -	pgtable_page_ctor(page);
> -	return page;
> +	return (pgtable_t)page_table_alloc(mm, address, 0);
>  }
>
>  static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
>  {
> -	free_page((unsigned long)pte);
> +	page_table_free(mm, (unsigned long *)pte, 1);
>  }
>
>  static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
>  {
> -	pgtable_page_dtor(ptepage);
> -	__free_page(ptepage);
> -}
> -
> -static inline void pgtable_free(void *table, unsigned index_size)
> -{
> -	if (!index_size)
> -		free_page((unsigned long)table);
> -	else {
> -		BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
> -		kmem_cache_free(PGT_CACHE(index_size), table);
> -	}
> +	page_table_free(mm, (unsigned long *)ptepage, 0);
>  }
>
> -#ifdef CONFIG_SMP
> -static inline void pgtable_free_tlb(struct mmu_gather *tlb,
> -				    void *table, int shift)
> -{
> -	unsigned long pgf = (unsigned long)table;
> -	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
> -	pgf |= shift;
> -	tlb_remove_table(tlb, (void *)pgf);
> -}
> -
> -static inline void __tlb_remove_table(void *_table)
> -{
> -	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
> -	unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
> -
> -	pgtable_free(table, shift);
> -}
> -#else /* !CONFIG_SMP */
> -static inline void pgtable_free_tlb(struct mmu_gather *tlb,
> -				    void *table, int shift)
> -{
> -	pgtable_free(table, shift);
> -}
> -#endif /* CONFIG_SMP */
> -
>  static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
>  				  unsigned long address)
>  {
> -	struct page *page = page_address(table);
> -
>  	tlb_flush_pgtable(tlb, address);
> -	pgtable_page_dtor(page);
> -	pgtable_free_tlb(tlb, page, 0);
> +	pgtable_free_tlb(tlb, table, 0);
>  }
> -
>  #endif /* CONFIG_PPC_64K_PAGES */
>
>  static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
> @@ -258,7 +217,6 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
>  	kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd);
>  }
>
> -
>  #define __pmd_free_tlb(tlb, pmd, addr)		      \
>  	pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE)
>  #ifndef CONFIG_PPC_64K_PAGES
> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 6da881b..04d833c 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -575,7 +575,9 @@ void __init setup_arch(char **cmdline_p)
>  	init_mm.end_code = (unsigned long) _etext;
>  	init_mm.end_data = (unsigned long) _edata;
>  	init_mm.brk = klimit;
> -
> +#ifdef CONFIG_PPC_64K_PAGES
> +	init_mm.context.pgtable_page = NULL;
> +#endif
>  	irqstack_early_init();
>  	exc_lvl_early_init();
>  	emergency_stack_init();
> diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
> index 59cd773..fbfdca2 100644
> --- a/arch/powerpc/mm/mmu_context_hash64.c
> +++ b/arch/powerpc/mm/mmu_context_hash64.c
> @@ -86,6 +86,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>  	spin_lock_init(mm->context.cop_lockp);
>  #endif /* CONFIG_PPC_ICSWX */
>
> +#ifdef CONFIG_PPC_64K_PAGES
> +	mm->context.pgtable_page = NULL;
> +#endif
>  	return 0;
>  }
>
> @@ -97,13 +100,45 @@ void __destroy_context(int context_id)
>  }
>  EXPORT_SYMBOL_GPL(__destroy_context);
>
> +#ifdef CONFIG_PPC_64K_PAGES
> +static void destroy_pagetable_page(struct mm_struct *mm)
> +{
> +	int count;
> +	struct page *page;
> +
> +	page = mm->context.pgtable_page;
> +	if (!page)
> +		return;
> +
> +	/* drop all the pending references */
> +	count = atomic_read(&page->_mapcount) + 1;
> +	/* We allow PTE_FRAG_NR(16) fragments from a PTE page */
> +	count = atomic_sub_return(16 - count, &page->_count);

You should really move PTE_FRAG_NR to a header so you can actually use
it here rather than hard coding 16.

It took me a fair while to convince myself that there is no race here
with something altering mapcount and count between the atomic_read()
and the atomic_sub_return().  It could do with a comment to explain why
that is safe.

Re-using the mapcount field for your index also seems odd, and it took
me a while to convince myself that that's safe too.  Wouldn't it be
simpler to store a pointer to the next sub-page in the mm_context
instead?  You can get from that to the struct page easily enough with a
shift and pfn_to_page().

> +	if (!count) {
> +		pgtable_page_dtor(page);
> +		reset_page_mapcount(page);
> +		free_hot_cold_page(page, 0);

It would be nice to use put_page() somehow instead of duplicating its
logic, though I realise the sparc code you've based this on does the
same thing.
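To make the mm_context suggestion above a bit more concrete, this is
roughly what I mean - a completely untested sketch, with the pte_frag
field name invented for the purpose.  It leaves page->_mapcount alone
and only goes back to the struct page when a reference count is needed:

/* Untested sketch: cache the next free 4K fragment in the context
 * (mm->context.pte_frag is an invented field) rather than packing a
 * fragment index into page->_mapcount.
 */
static pte_t *get_from_cache(struct mm_struct *mm)
{
	void *pte_frag, *ret;

	spin_lock(&mm->page_table_lock);
	ret = mm->context.pte_frag;
	if (ret) {
		pte_frag = ret + PTE_FRAG_SIZE;
		/*
		 * If we have handed out the last fragment, pte_frag is
		 * now page aligned: drop the cached page.
		 */
		if (!((unsigned long)pte_frag & ~PAGE_MASK))
			pte_frag = NULL;
		mm->context.pte_frag = pte_frag;
	}
	spin_unlock(&mm->page_table_lock);
	return (pte_t *)ret;
}

The refcounting in page_table_free() and here in
destroy_pagetable_page() still needs the struct page, but
virt_to_page() on the fragment gets you there, and the pending
reference count could be derived from the offset of pte_frag within the
page (with a little care for the exhausted/NULL case) instead of
reading _mapcount.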
> +	}
> +}
> +
> +#else
> +static inline void destroy_pagetable_page(struct mm_struct *mm)
> +{
> +	return;
> +}
> +#endif
> +
> +
>  void destroy_context(struct mm_struct *mm)
>  {
> +
>  #ifdef CONFIG_PPC_ICSWX
>  	drop_cop(mm->context.acop, mm);
>  	kfree(mm->context.cop_lockp);
>  	mm->context.cop_lockp = NULL;
>  #endif /* CONFIG_PPC_ICSWX */
> +
> +	destroy_pagetable_page(mm);
>  	__destroy_context(mm->context.id);
>  	subpage_prot_free(mm);
>  	mm->context.id = MMU_NO_CONTEXT;
> diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
> index e212a27..e79840b 100644
> --- a/arch/powerpc/mm/pgtable_64.c
> +++ b/arch/powerpc/mm/pgtable_64.c
> @@ -337,3 +337,140 @@ EXPORT_SYMBOL(__ioremap_at);
>  EXPORT_SYMBOL(iounmap);
>  EXPORT_SYMBOL(__iounmap);
>  EXPORT_SYMBOL(__iounmap_at);
> +
> +#ifdef CONFIG_PPC_64K_PAGES
> +/*
> + * we support 16 fragments per PTE page. This is limited by how many
> + * bits we can pack in page->_mapcount. We use the first half for
> + * tracking the usage for rcu page table free.
> + */
> +#define PTE_FRAG_NR	16
> +/*
> + * We use a 2K PTE page fragment and another 2K for storing
> + * real_pte_t hash index
> + */
> +#define PTE_FRAG_SIZE (2 * PTRS_PER_PTE * sizeof(pte_t))
> +
> +static pte_t *get_from_cache(struct mm_struct *mm)
> +{
> +	int index;
> +	pte_t *ret = NULL;
> +	struct page *page;
> +
> +	spin_lock(&mm->page_table_lock);
> +	page = mm->context.pgtable_page;
> +	if (page) {
> +		void *p = page_address(page);
> +		index = atomic_add_return(1, &page->_mapcount);
> +		ret = (pte_t *) (p + (index * PTE_FRAG_SIZE));
> +		/*
> +		 * If we have taken up all the fragments mark PTE page NULL
> +		 */
> +		if (index == PTE_FRAG_NR - 1)
> +			mm->context.pgtable_page = NULL;
> +	}
> +	spin_unlock(&mm->page_table_lock);
> +	return ret;
> +}
> +
> +static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
> +{
> +	pte_t *ret = NULL;
> +	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
> +				       __GFP_REPEAT | __GFP_ZERO);
> +	if (!page)
> +		return NULL;
> +
> +	spin_lock(&mm->page_table_lock);
> +	/*
> +	 * If we find pgtable_page set, we return
> +	 * the allocated page with single fragement
> +	 * count.
> +	 */
> +	if (likely(!mm->context.pgtable_page)) {
> +		atomic_set(&page->_count, PTE_FRAG_NR);
> +		atomic_set(&page->_mapcount, 0);
> +		mm->context.pgtable_page = page;
> +	}

.. and in the unlikely case where there *is* a pgtable_page already
set, what then?  Seems like you should BUG_ON, or at least return NULL
- as it is you will return the first sub-page of that page again, which
is very likely in use.
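Something along these lines would close that hole - again only an
untested sketch on top of the function as posted, but the loser of the
race gives its freshly allocated page back and takes a fragment from
the page the winner installed instead:

static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
{
	pte_t *ret = NULL;
	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
				       __GFP_REPEAT | __GFP_ZERO);
	if (!page)
		return NULL;

	spin_lock(&mm->page_table_lock);
	/*
	 * Only install our page if nobody set up a PTE page while we
	 * were allocating; otherwise fall through and give it back.
	 */
	if (likely(!mm->context.pgtable_page)) {
		atomic_set(&page->_count, PTE_FRAG_NR);
		atomic_set(&page->_mapcount, 0);
		mm->context.pgtable_page = page;
		ret = (pte_t *)page_address(page);
	}
	spin_unlock(&mm->page_table_lock);

	if (!ret) {
		/*
		 * Lost the race: drop our page and take a fragment from
		 * the page the winner installed.
		 */
		__free_page(page);
		return get_from_cache(mm);
	}

	if (!kernel)
		pgtable_page_ctor(page);
	return ret;
}

get_from_cache() can still come back empty here if the winner's page is
already used up; returning NULL in that case (or looping) still seems
better than silently reusing fragment 0.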
> +	spin_unlock(&mm->page_table_lock);
>
> +	ret = (unsigned long *)page_address(page);
> +	if (!kernel)
> +		pgtable_page_ctor(page);
>
> +	return ret;
> +}
> +
> +pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
> +{
> +	pte_t *pte;
> +
> +	pte = get_from_cache(mm);
> +	if (pte)
> +		return pte;
> +
> +	return __alloc_for_cache(mm, kernel);
> +}
> +
> +void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
> +{
> +	struct page *page = virt_to_page(table);
> +	if (put_page_testzero(page)) {
> +		if (!kernel)
> +			pgtable_page_dtor(page);
> +		reset_page_mapcount(page);
> +		free_hot_cold_page(page, 0);
> +	}
> +}
> +
> +#ifdef CONFIG_SMP
> +static void page_table_free_rcu(void *table)
> +{
> +	struct page *page = virt_to_page(table);
> +	if (put_page_testzero(page)) {
> +		pgtable_page_dtor(page);
> +		reset_page_mapcount(page);
> +		free_hot_cold_page(page, 0);
> +	}
> +}
> +
> +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
> +{
> +	unsigned long pgf = (unsigned long)table;
> +
> +	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
> +	pgf |= shift;
> +	tlb_remove_table(tlb, (void *)pgf);
> +}
> +
> +void __tlb_remove_table(void *_table)
> +{
> +	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
> +	unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
> +
> +	if (!shift)
> +		/* PTE page needs special handling */
> +		page_table_free_rcu(table);
> +	else {
> +		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
> +		kmem_cache_free(PGT_CACHE(shift), table);
> +	}
> +}
> +#else
> +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
> +{
> +	if (!shift) {
> +		/* PTE page needs special handling */
> +		struct page *page = virt_to_page(table);
> +		if (put_page_testzero(page)) {
> +			pgtable_page_dtor(page);
> +			reset_page_mapcount(page);
> +			free_hot_cold_page(page, 0);
> +		}
> +	} else {
> +		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
> +		kmem_cache_free(PGT_CACHE(shift), table);
> +	}
> +}
> +#endif
> +#endif /* CONFIG_PPC_64K_PAGES */

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson