On Thu, Apr 04, 2013 at 11:27:44AM +0530, Aneesh Kumar K.V wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
>
> We allocate one page for the last level of linux page table. With THP and
> large page size of 16MB, that would mean we are wasting large part
> of that page. To map 16MB area, we only need a PTE space of 2K with 64K
> page size. This patch reduce the space wastage by sharing the page
> allocated for the last level of linux page table with multiple pmd
> entries. We call these smaller chunks PTE page fragments and allocated
> page, PTE page.
>
> In order to support systems which doesn't have 64K HPTE support, we also
> add another 2K to PTE page fragment. The second half of the PTE fragments
> is used for storing slot and secondary bit information of an HPTE. With this
> we now have a 4K PTE fragment.
>
> We use a simple approach to share the PTE page. On allocation, we bump the
> PTE page refcount to 16 and share the PTE page with the next 16 pte alloc
> request. This should help in the node locality of the PTE page fragment,
> assuming that the immediate pte alloc request will mostly come from the
> same NUMA node. We don't try to reuse the freed PTE page fragment. Hence
> we could be waisting some space.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
> ---
>  arch/powerpc/include/asm/mmu-book3e.h |    4 +
>  arch/powerpc/include/asm/mmu-hash64.h |    4 +
>  arch/powerpc/include/asm/page.h       |    4 +
>  arch/powerpc/include/asm/pgalloc-64.h |   72 ++++-------------
>  arch/powerpc/kernel/setup_64.c        |    4 +-
>  arch/powerpc/mm/mmu_context_hash64.c  |   35 +++++++++
>  arch/powerpc/mm/pgtable_64.c          |  137 +++++++++++++++++++++++++++++++++
>  7 files changed, 202 insertions(+), 58 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
> index 99d43e0..affbd68 100644
> --- a/arch/powerpc/include/asm/mmu-book3e.h
> +++ b/arch/powerpc/include/asm/mmu-book3e.h
> @@ -231,6 +231,10 @@ typedef struct {
>  	u64 high_slices_psize;	/* 4 bits per slice for now */
>  	u16 user_psize;		/* page size index */
>  #endif
> +#ifdef CONFIG_PPC_64K_PAGES
> +	/* for 4K PTE fragment support */
> +	struct page *pgtable_page;
> +#endif
>  } mm_context_t;
>
>  /* Page size definitions, common between 32 and 64-bit
> diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
> index 35bb51e..300ac3c 100644
> --- a/arch/powerpc/include/asm/mmu-hash64.h
> +++ b/arch/powerpc/include/asm/mmu-hash64.h
> @@ -498,6 +498,10 @@ typedef struct {
>  	unsigned long acop;	/* mask of enabled coprocessor types */
>  	unsigned int cop_pid;	/* pid value used with coprocessors */
>  #endif /* CONFIG_PPC_ICSWX */
> +#ifdef CONFIG_PPC_64K_PAGES
> +	/* for 4K PTE fragment support */
> +	struct page *pgtable_page;
> +#endif
>  } mm_context_t;
>
>
> diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
> index f072e97..38e7ff6 100644
> --- a/arch/powerpc/include/asm/page.h
> +++ b/arch/powerpc/include/asm/page.h
> @@ -378,7 +378,11 @@ void arch_free_page(struct page *page, int order);
>
>  struct vm_area_struct;
>
> +#ifdef CONFIG_PPC_64K_PAGES
> +typedef pte_t *pgtable_t;
> +#else
>  typedef struct page *pgtable_t;
> +#endif

Ugh, that's pretty horrible, though I don't see an easy way around it.
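It does follow from the changelog arithmetic, though: with 64K pages a
PMD entry maps 16M, so a PTE table is only 256 entries (2K), plus the 2K
of hash slot/secondary information, and a pgtable_t now has to name one
of the sixteen 4K fragments inside a 64K page rather than a whole page.
Just to spell the numbers out (illustration only, not part of the patch;
the names below are made up, the patch's own constants are the
PTE_FRAG_* defines added to pgtable_64.c further down):

/* Illustration only: the arithmetic behind the 4K fragments, assuming
 * 64K linux pages and 8-byte PTEs.  All names here are invented.
 */
#define FRAG_PTES	((16UL << 20) >> 16)	/* 256 PTEs map 16M with 64K pages */
#define FRAG_PTE_BYTES	(FRAG_PTES * 8)		/* 2K of actual PTEs */
#define FRAG_BYTES	(2 * FRAG_PTE_BYTES)	/* plus 2K of hash index: 4K */
#define FRAGS_PER_PAGE	((64UL << 10) / FRAG_BYTES)	/* 16, i.e. PTE_FRAG_NR */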
>  #include <asm-generic/memory_model.h>
>  #endif /* __ASSEMBLY__ */
> diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
> index cdbf555..3418989 100644
> --- a/arch/powerpc/include/asm/pgalloc-64.h
> +++ b/arch/powerpc/include/asm/pgalloc-64.h
> @@ -150,6 +150,13 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
>
>  #else /* if CONFIG_PPC_64K_PAGES */
>
> +extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int);
> +extern void page_table_free(struct mm_struct *, unsigned long *, int);
> +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
> +#ifdef CONFIG_SMP
> +extern void __tlb_remove_table(void *_table);
> +#endif
> +
>  #define pud_populate(mm, pud, pmd)	pud_set(pud, (unsigned long)pmd)
>
>  static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
> @@ -161,90 +168,42 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
>  static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
>  				pgtable_t pte_page)
>  {
> -	pmd_populate_kernel(mm, pmd, page_address(pte_page));
> +	pmd_set(pmd, (unsigned long)pte_page);
>  }
>
>  static inline pgtable_t pmd_pgtable(pmd_t pmd)
>  {
> -	return pmd_page(pmd);
> +	return (pgtable_t)(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE);
>  }
>
>  static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
>  					  unsigned long address)
>  {
> -	return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
> +	return (pte_t *)page_table_alloc(mm, address, 1);
>  }
>
>  static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
> -					unsigned long address)
> +				      unsigned long address)
>  {
> -	struct page *page;
> -	pte_t *pte;
> -
> -	pte = pte_alloc_one_kernel(mm, address);
> -	if (!pte)
> -		return NULL;
> -	page = virt_to_page(pte);
> -	pgtable_page_ctor(page);
> -	return page;
> +	return (pgtable_t)page_table_alloc(mm, address, 0);
>  }
>
>  static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
>  {
> -	free_page((unsigned long)pte);
> +	page_table_free(mm, (unsigned long *)pte, 1);
>  }
>
>  static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
>  {
> -	pgtable_page_dtor(ptepage);
> -	__free_page(ptepage);
> -}
> -
> -static inline void pgtable_free(void *table, unsigned index_size)
> -{
> -	if (!index_size)
> -		free_page((unsigned long)table);
> -	else {
> -		BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
> -		kmem_cache_free(PGT_CACHE(index_size), table);
> -	}
> +	page_table_free(mm, (unsigned long *)ptepage, 0);
>  }
>
> -#ifdef CONFIG_SMP
> -static inline void pgtable_free_tlb(struct mmu_gather *tlb,
> -				    void *table, int shift)
> -{
> -	unsigned long pgf = (unsigned long)table;
> -	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
> -	pgf |= shift;
> -	tlb_remove_table(tlb, (void *)pgf);
> -}
> -
> -static inline void __tlb_remove_table(void *_table)
> -{
> -	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
> -	unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
> -
> -	pgtable_free(table, shift);
> -}
> -#else /* !CONFIG_SMP */
> -static inline void pgtable_free_tlb(struct mmu_gather *tlb,
> -				    void *table, int shift)
> -{
> -	pgtable_free(table, shift);
> -}
> -#endif /* CONFIG_SMP */
> -
>  static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
>  				  unsigned long address)
>  {
> -	struct page *page = page_address(table);
> -
>  	tlb_flush_pgtable(tlb, address);
> -	pgtable_page_dtor(page);
> -	pgtable_free_tlb(tlb, page, 0);
> +	pgtable_free_tlb(tlb, table, 0);
>  }
> -
>  #endif /* CONFIG_PPC_64K_PAGES */
>
>  static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
> @@ -258,7 +217,6 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
>  	kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd);
>  }
>
> -
>  #define __pmd_free_tlb(tlb, pmd, addr)		      \
>  	pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE)
>  #ifndef CONFIG_PPC_64K_PAGES
> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 6da881b..04d833c 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -575,7 +575,9 @@ void __init setup_arch(char **cmdline_p)
>  	init_mm.end_code = (unsigned long) _etext;
>  	init_mm.end_data = (unsigned long) _edata;
>  	init_mm.brk = klimit;
> -
> +#ifdef CONFIG_PPC_64K_PAGES
> +	init_mm.context.pgtable_page = NULL;
> +#endif
>  	irqstack_early_init();
>  	exc_lvl_early_init();
>  	emergency_stack_init();
> diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
> index 59cd773..fbfdca2 100644
> --- a/arch/powerpc/mm/mmu_context_hash64.c
> +++ b/arch/powerpc/mm/mmu_context_hash64.c
> @@ -86,6 +86,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>  	spin_lock_init(mm->context.cop_lockp);
>  #endif /* CONFIG_PPC_ICSWX */
>
> +#ifdef CONFIG_PPC_64K_PAGES
> +	mm->context.pgtable_page = NULL;
> +#endif
>  	return 0;
>  }
>
> @@ -97,13 +100,45 @@ void __destroy_context(int context_id)
>  }
>  EXPORT_SYMBOL_GPL(__destroy_context);
>
> +#ifdef CONFIG_PPC_64K_PAGES
> +static void destroy_pagetable_page(struct mm_struct *mm)
> +{
> +	int count;
> +	struct page *page;
> +
> +	page = mm->context.pgtable_page;
> +	if (!page)
> +		return;
> +
> +	/* drop all the pending references */
> +	count = atomic_read(&page->_mapcount) + 1;
> +	/* We allow PTE_FRAG_NR(16) fragments from a PTE page */
> +	count = atomic_sub_return(16 - count, &page->_count);

You should really move PTE_FRAG_NR to a header so you can actually use
it here rather than hard coding 16.

It took me a fair while to convince myself that there is no race here
with something altering mapcount and count between the atomic_read()
and the atomic_sub_return().  It could do with a comment to explain why
that is safe.

Re-using the mapcount field for your index also seems odd, and it took
me a while to convince myself that that's safe too.  Wouldn't it be
simpler to store a pointer to the next sub-page in the mm_context
instead?  You can get from that to the struct page easily enough with a
shift and pfn_to_page().

> +	if (!count) {
> +		pgtable_page_dtor(page);
> +		reset_page_mapcount(page);
> +		free_hot_cold_page(page, 0);

It would be nice to use put_page() somehow instead of duplicating its
logic, though I realise the sparc code you've based this on does the
same thing.
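To make the mm_context suggestion above a bit more concrete, this is
roughly what I mean - a completely untested sketch, with the pte_frag
field name invented for the purpose.  It leaves page->_mapcount alone
and only goes back to the struct page when a reference count is needed:

/* Untested sketch: cache the next free 4K fragment in the context
 * (mm->context.pte_frag is an invented field) rather than packing a
 * fragment index into page->_mapcount.
 */
static pte_t *get_from_cache(struct mm_struct *mm)
{
	void *pte_frag, *ret;

	spin_lock(&mm->page_table_lock);
	ret = mm->context.pte_frag;
	if (ret) {
		pte_frag = ret + PTE_FRAG_SIZE;
		/*
		 * If we have handed out the last fragment, pte_frag is
		 * now page aligned: drop the cached page.
		 */
		if (!((unsigned long)pte_frag & ~PAGE_MASK))
			pte_frag = NULL;
		mm->context.pte_frag = pte_frag;
	}
	spin_unlock(&mm->page_table_lock);
	return (pte_t *)ret;
}

The refcounting in page_table_free() and here in
destroy_pagetable_page() still needs the struct page, but
virt_to_page() on the fragment gets you there, and the pending
reference count could be derived from the offset of pte_frag within the
page (with a little care for the exhausted/NULL case) instead of
reading _mapcount.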
> +	}
> +}
> +
> +#else
> +static inline void destroy_pagetable_page(struct mm_struct *mm)
> +{
> +	return;
> +}
> +#endif
> +
> +
>  void destroy_context(struct mm_struct *mm)
>  {
> +
>  #ifdef CONFIG_PPC_ICSWX
>  	drop_cop(mm->context.acop, mm);
>  	kfree(mm->context.cop_lockp);
>  	mm->context.cop_lockp = NULL;
>  #endif /* CONFIG_PPC_ICSWX */
> +
> +	destroy_pagetable_page(mm);
>  	__destroy_context(mm->context.id);
>  	subpage_prot_free(mm);
>  	mm->context.id = MMU_NO_CONTEXT;
> diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
> index e212a27..e79840b 100644
> --- a/arch/powerpc/mm/pgtable_64.c
> +++ b/arch/powerpc/mm/pgtable_64.c
> @@ -337,3 +337,140 @@ EXPORT_SYMBOL(__ioremap_at);
>  EXPORT_SYMBOL(iounmap);
>  EXPORT_SYMBOL(__iounmap);
>  EXPORT_SYMBOL(__iounmap_at);
> +
> +#ifdef CONFIG_PPC_64K_PAGES
> +/*
> + * we support 16 fragments per PTE page. This is limited by how many
> + * bits we can pack in page->_mapcount. We use the first half for
> + * tracking the usage for rcu page table free.
> + */
> +#define PTE_FRAG_NR	16
> +/*
> + * We use a 2K PTE page fragment and another 2K for storing
> + * real_pte_t hash index
> + */
> +#define PTE_FRAG_SIZE (2 * PTRS_PER_PTE * sizeof(pte_t))
> +
> +static pte_t *get_from_cache(struct mm_struct *mm)
> +{
> +	int index;
> +	pte_t *ret = NULL;
> +	struct page *page;
> +
> +	spin_lock(&mm->page_table_lock);
> +	page = mm->context.pgtable_page;
> +	if (page) {
> +		void *p = page_address(page);
> +		index = atomic_add_return(1, &page->_mapcount);
> +		ret = (pte_t *) (p + (index * PTE_FRAG_SIZE));
> +		/*
> +		 * If we have taken up all the fragments mark PTE page NULL
> +		 */
> +		if (index == PTE_FRAG_NR - 1)
> +			mm->context.pgtable_page = NULL;
> +	}
> +	spin_unlock(&mm->page_table_lock);
> +	return ret;
> +}
> +
> +static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
> +{
> +	pte_t *ret = NULL;
> +	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
> +				       __GFP_REPEAT | __GFP_ZERO);
> +	if (!page)
> +		return NULL;
> +
> +	spin_lock(&mm->page_table_lock);
> +	/*
> +	 * If we find pgtable_page set, we return
> +	 * the allocated page with single fragement
> +	 * count.
> +	 */
> +	if (likely(!mm->context.pgtable_page)) {
> +		atomic_set(&page->_count, PTE_FRAG_NR);
> +		atomic_set(&page->_mapcount, 0);
> +		mm->context.pgtable_page = page;
> +	}

.. and in the unlikely case where there *is* a pgtable_page already
set, what then?  Seems like you should BUG_ON, or at least return NULL
- as it is you will return the first sub-page of that page again, which
is very likely in use.
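Something along these lines would close that hole - again only an
untested sketch on top of the function as posted, but the loser of the
race gives its freshly allocated page back and takes a fragment from
the page the winner installed instead:

static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
{
	pte_t *ret = NULL;
	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
				       __GFP_REPEAT | __GFP_ZERO);
	if (!page)
		return NULL;

	spin_lock(&mm->page_table_lock);
	/*
	 * Only install our page if nobody set up a PTE page while we
	 * were allocating; otherwise fall through and give it back.
	 */
	if (likely(!mm->context.pgtable_page)) {
		atomic_set(&page->_count, PTE_FRAG_NR);
		atomic_set(&page->_mapcount, 0);
		mm->context.pgtable_page = page;
		ret = (pte_t *)page_address(page);
	}
	spin_unlock(&mm->page_table_lock);

	if (!ret) {
		/*
		 * Lost the race: drop our page and take a fragment from
		 * the page the winner installed.
		 */
		__free_page(page);
		return get_from_cache(mm);
	}

	if (!kernel)
		pgtable_page_ctor(page);
	return ret;
}

get_from_cache() can still come back empty here if the winner's page is
already used up; returning NULL in that case (or looping) still seems
better than silently reusing fragment 0.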
> +	spin_unlock(&mm->page_table_lock);
>
> +	ret = (unsigned long *)page_address(page);
> +	if (!kernel)
> +		pgtable_page_ctor(page);
>
> +	return ret;
> +}
> +
> +pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
> +{
> +	pte_t *pte;
> +
> +	pte = get_from_cache(mm);
> +	if (pte)
> +		return pte;
> +
> +	return __alloc_for_cache(mm, kernel);
> +}
> +
> +void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
> +{
> +	struct page *page = virt_to_page(table);
> +	if (put_page_testzero(page)) {
> +		if (!kernel)
> +			pgtable_page_dtor(page);
> +		reset_page_mapcount(page);
> +		free_hot_cold_page(page, 0);
> +	}
> +}
> +
> +#ifdef CONFIG_SMP
> +static void page_table_free_rcu(void *table)
> +{
> +	struct page *page = virt_to_page(table);
> +	if (put_page_testzero(page)) {
> +		pgtable_page_dtor(page);
> +		reset_page_mapcount(page);
> +		free_hot_cold_page(page, 0);
> +	}
> +}
> +
> +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
> +{
> +	unsigned long pgf = (unsigned long)table;
> +
> +	BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
> +	pgf |= shift;
> +	tlb_remove_table(tlb, (void *)pgf);
> +}
> +
> +void __tlb_remove_table(void *_table)
> +{
> +	void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
> +	unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
> +
> +	if (!shift)
> +		/* PTE page needs special handling */
> +		page_table_free_rcu(table);
> +	else {
> +		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
> +		kmem_cache_free(PGT_CACHE(shift), table);
> +	}
> +}
> +#else
> +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
> +{
> +	if (!shift) {
> +		/* PTE page needs special handling */
> +		struct page *page = virt_to_page(table);
> +		if (put_page_testzero(page)) {
> +			pgtable_page_dtor(page);
> +			reset_page_mapcount(page);
> +			free_hot_cold_page(page, 0);
> +		}
> +	} else {
> +		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
> +		kmem_cache_free(PGT_CACHE(shift), table);
> +	}
> +}
> +#endif
> +#endif /* CONFIG_PPC_64K_PAGES */

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson