Hi David,

Some initial comments below.

Cheers,
--
Steve

On Tue, Sep 15, 2015 at 02:01:57PM -0400, David Woods wrote:
> The arm64 MMU supports a Contiguous bit which is a hint that the TTE
> is one of a set of contiguous entries which can be cached in a single
> TLB entry.  Supporting this bit adds new intermediate huge page sizes.
>
> The set of huge page sizes available depends on the base page size.
> Without using contiguous pages the huge page sizes are as follows.
>
>    4KB:   2MB   1GB
>   64KB: 512MB   4TB
>
> With 4KB pages, the contiguous bit groups together sets of 16 pages
> and with 64KB pages it groups sets of 32 pages.  This enables two new
> huge page sizes in each case, so that the full set of available sizes
> is as follows.
>
>    4KB:  64KB    2MB   32MB   1GB
>   64KB:   2MB  512MB   16GB   4TB
>
> If the base page size is set to 64KB then 2MB pages are enabled by
> default.  It is possible in the future to make 2MB the default huge
> page size for both 4KB and 64KB pages.
>
> Signed-off-by: David Woods <dwoods@xxxxxxxxxx>
> Reviewed-by: Chris Metcalf <cmetcalf@xxxxxxxxxx>
> ---
>  arch/arm64/Kconfig                     |   3 -
>  arch/arm64/include/asm/hugetlb.h       |   4 +
>  arch/arm64/include/asm/pgtable-hwdef.h |  15 +++
>  arch/arm64/include/asm/pgtable.h       |  30 +++++-
>  arch/arm64/mm/hugetlbpage.c            | 165 ++++++++++++++++++++++++++++++++-
>  5 files changed, 210 insertions(+), 7 deletions(-)
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 7d95663..8310e38 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -447,9 +447,6 @@ config HW_PERF_EVENTS
>  config SYS_SUPPORTS_HUGETLBFS
>  	def_bool y
>
> -config ARCH_WANT_GENERAL_HUGETLB
> -	def_bool y
> -
>  config ARCH_WANT_HUGE_PMD_SHARE
>  	def_bool y if !ARM64_64K_PAGES
>
> diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
> index bb4052e..e5af553 100644
> --- a/arch/arm64/include/asm/hugetlb.h
> +++ b/arch/arm64/include/asm/hugetlb.h
> @@ -97,4 +97,8 @@ static inline void arch_clear_hugepage_flags(struct page *page)
>  	clear_bit(PG_dcache_clean, &page->flags);
>  }
>
> +extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
> +				struct page *page, int writable);
> +#define arch_make_huge_pte arch_make_huge_pte
> +
>  #endif /* __ASM_HUGETLB_H */
> diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
> index 24154b0..da73243 100644
> --- a/arch/arm64/include/asm/pgtable-hwdef.h
> +++ b/arch/arm64/include/asm/pgtable-hwdef.h
> @@ -55,6 +55,19 @@
>  #define SECTION_MASK		(~(SECTION_SIZE-1))
>
>  /*
> + * Contiguous large page definitions.
> + */
> +#ifdef CONFIG_ARM64_64K_PAGES
> +#define CONTIG_SHIFT		5
> +#define CONTIG_PAGES		32
> +#else
> +#define CONTIG_SHIFT		4
> +#define CONTIG_PAGES		16
> +#endif
> +#define CONTIG_PTE_SIZE		(CONTIG_PAGES * PAGE_SIZE)
> +#define CONTIG_PTE_MASK		(~(CONTIG_PTE_SIZE - 1))

Careful here, CONTIG_PAGES should really be CONTIG_PTES.

If support is added for the 16KB granule case we are allowed:
  128 x 16KB pages (ptes)  to make a 2MB huge page, or
   32 x 32MB blocks (pmds) to make a 1GB huge page.
i.e. CONTIG_PTES != CONTIG_PMDS.

For 4KB or 64KB pages we are only allowed contiguous ptes, so
CONTIG_PMDS == 0 in those cases.
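Purely for illustration, splitting the two counts could look something
like the sketch below.  The names, the 16KB values and the
CONFIG_ARM64_16K_PAGES symbol are all mine (there is no 16KB granule
support in mainline yet), so please treat it as a shape suggestion
rather than code to lift:

	/* Sketch only -- illustrative names/values, not part of this patch. */
	#ifdef CONFIG_ARM64_64K_PAGES
	#define CONTIG_PTES	32	/*  32 x 64KB ptes -> 2MB huge page */
	#define CONTIG_PMDS	0	/* no contiguous pmds for this granule */
	#elif defined(CONFIG_ARM64_16K_PAGES)
	#define CONTIG_PTES	128	/* 128 x 16KB ptes -> 2MB huge page */
	#define CONTIG_PMDS	32	/*  32 x 32MB pmds -> 1GB huge page */
	#else
	#define CONTIG_PTES	16	/*  16 x 4KB ptes -> 64KB huge page */
	#define CONTIG_PMDS	0	/* no contiguous pmds for this granule */
	#endif
	#define CONTIG_PTE_SIZE	(CONTIG_PTES * PAGE_SIZE)
	#define CONTIG_PTE_MASK	(~(CONTIG_PTE_SIZE - 1))

CONTIG_SHIFT could then either go away or be split into pte/pmd
variants in the same way.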
> +
> +/*
>   * Hardware page table definitions.
>   *
>   * Level 1 descriptor (PUD).
> @@ -83,6 +96,7 @@
>  #define PMD_SECT_S		(_AT(pmdval_t, 3) << 8)
>  #define PMD_SECT_AF		(_AT(pmdval_t, 1) << 10)
>  #define PMD_SECT_NG		(_AT(pmdval_t, 1) << 11)
> +#define PMD_SECT_CONTIG		(_AT(pmdval_t, 1) << 52)
>  #define PMD_SECT_PXN		(_AT(pmdval_t, 1) << 53)
>  #define PMD_SECT_UXN		(_AT(pmdval_t, 1) << 54)
>
> @@ -105,6 +119,7 @@
>  #define PTE_AF			(_AT(pteval_t, 1) << 10)	/* Access Flag */
>  #define PTE_NG			(_AT(pteval_t, 1) << 11)	/* nG */
>  #define PTE_DBM			(_AT(pteval_t, 1) << 51)	/* Dirty Bit Management */
> +#define PTE_CONTIG		(_AT(pteval_t, 1) << 52)	/* Contiguous */
>  #define PTE_PXN			(_AT(pteval_t, 1) << 53)	/* Privileged XN */
>  #define PTE_UXN			(_AT(pteval_t, 1) << 54)	/* User XN */
>
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 6900b2d9..df5ec64 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -144,6 +144,7 @@ extern struct page *empty_zero_page;
>  #define pte_special(pte)	(!!(pte_val(pte) & PTE_SPECIAL))
>  #define pte_write(pte)		(!!(pte_val(pte) & PTE_WRITE))
>  #define pte_exec(pte)		(!(pte_val(pte) & PTE_UXN))
> +#define pte_contig(pte)		(!!(pte_val(pte) & PTE_CONTIG))
>
>  #ifdef CONFIG_ARM64_HW_AFDBM
>  #define pte_hw_dirty(pte)	(!(pte_val(pte) & PTE_RDONLY))
> @@ -206,6 +207,9 @@ static inline pte_t pte_mkspecial(pte_t pte)
>  	return set_pte_bit(pte, __pgprot(PTE_SPECIAL));
>  }
>
> +extern pte_t pte_mkcontig(pte_t pte);
> +extern pmd_t pmd_mkcontig(pmd_t pmd);
> +
>  static inline void set_pte(pte_t *ptep, pte_t pte)
>  {
>  	*ptep = pte;
> @@ -275,7 +279,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
>  /*
>   * Hugetlb definitions.
>   */
> -#define HUGE_MAX_HSTATE		2
> +#define HUGE_MAX_HSTATE		((2 * CONFIG_PGTABLE_LEVELS) - 1)
>  #define HPAGE_SHIFT		PMD_SHIFT
>  #define HPAGE_SIZE		(_AC(1, UL) << HPAGE_SHIFT)
>  #define HPAGE_MASK		(~(HPAGE_SIZE - 1))
> @@ -372,7 +376,8 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>  #define pmd_none(pmd)		(!pmd_val(pmd))
>  #define pmd_present(pmd)	(pmd_val(pmd))
>
> -#define pmd_bad(pmd)		(!(pmd_val(pmd) & 2))
> +#define pmd_bad(pmd)		(!(pmd_val(pmd) & \
> +				   (PMD_TABLE_BIT | PMD_SECT_CONTIG)))

I'm not sure about this.  A contiguous pmd (which will be a block
descriptor) will no longer be bad?

>
>  #define pmd_table(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
>  				 PMD_TYPE_TABLE)
> @@ -500,7 +505,8 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long addr)
>  static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
>  {
>  	const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY |
> -			      PTE_PROT_NONE | PTE_WRITE | PTE_TYPE_MASK;
> +			      PTE_PROT_NONE | PTE_WRITE | PTE_TYPE_MASK |
> +			      PTE_CONTIG;
>  	/* preserve the hardware dirty information */
>  	if (pte_hw_dirty(pte))
>  		newprot |= PTE_DIRTY;
> @@ -513,6 +519,24 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
>  	return pte_pmd(pte_modify(pmd_pte(pmd), newprot));
>  }
>
> +static inline pte_t pte_modify_pfn(pte_t pte, unsigned long newpfn)
> +{
> +	const pteval_t mask = PHYS_MASK & PAGE_MASK;
> +
> +	pte_val(pte) = pfn_pte(newpfn, (pte_val(pte) & ~mask));
> +	return pte;
> +}
> +
> +#if CONFIG_PGTABLE_LEVELS > 2
> +static inline pmd_t pmd_modify_pfn(pmd_t pmd, unsigned long newpfn)
> +{
> +	const pmdval_t mask = PHYS_MASK & PAGE_MASK;
> +
> +	pmd = pfn_pmd(newpfn, (pmd_val(pmd) & ~mask));
> +	return pmd;
> +}
> +#endif

We can probably get rid of these two functions, please see below.
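For the pmd flavour, the same pgprot-extraction trick applies, which is
why I think pmd_modify_pfn can go as well.  Rough, untested sketch only;
the helper name is made up and CONTIG_PMDS is the count I mentioned
above:

	/* Sketch only: write out one contiguous range of pmds. */
	static void set_contig_pmds(pmd_t *pmdp, pmd_t pmd)
	{
		int loop;
		unsigned long pfn = pmd_pfn(pmd);
		pgprot_t hugeprot =
			__pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd));

		for (loop = 0; loop < CONTIG_PMDS; loop++) {
			set_pmd(pmdp++, pfn_pmd(pfn, hugeprot));
			pfn += PMD_SIZE >> PAGE_SHIFT;
		}
	}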
> +
>  #ifdef CONFIG_ARM64_HW_AFDBM
>  /*
>   * Atomic pte/pmd modifications.
> diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
> index 383b03f..f5bbbbc 100644
> --- a/arch/arm64/mm/hugetlbpage.c
> +++ b/arch/arm64/mm/hugetlbpage.c
> @@ -41,6 +41,155 @@ int pud_huge(pud_t pud)
>  #endif
>  }
>
> +pte_t *huge_pte_alloc(struct mm_struct *mm,
> +		      unsigned long addr, unsigned long sz)
> +{
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	pte_t *pte = NULL;
> +	int i;
> +
> +	pgd = pgd_offset(mm, addr);
> +	pud = pud_alloc(mm, pgd, addr);
> +	if (pud) {
> +		if (sz == PUD_SIZE) {
> +			pte = (pte_t *)pud;
> +		} else if (sz == PMD_SIZE) {
> +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
> +			if (pud_none(*pud))
> +				pte = huge_pmd_share(mm, addr, pud);
> +			else
> +#endif
> +				pte = (pte_t *)pmd_alloc(mm, pud, addr);
> +		} else if (sz == (PAGE_SIZE * CONTIG_PAGES)) {
> +			pmd_t *pmd = pmd_alloc(mm, pud, addr);
> +
> +			WARN_ON(addr & (sz - 1));
> +			pte = pte_alloc_map(mm, NULL, pmd, addr);
> +			if (pte_present(*pte)) {
> +				unsigned long pfn;
> +				*pte = pte_mkcontig(*pte);
> +				pfn = pte_pfn(*pte);
> +				for (i = 0; i < CONTIG_PAGES; i++) {
> +					set_pte(&pte[i],
> +						pte_modify_pfn(*pte, pfn + i));
> +				}
> +			}
> +#if CONFIG_PGTABLE_LEVELS > 2
> +		} else if (sz == (PMD_SIZE * CONTIG_PAGES)) {
> +			pmd_t *pmd;
> +
> +			pmd = pmd_alloc(mm, pud, addr);
> +			WARN_ON(addr & (sz - 1));
> +			if (pmd && pmd_present(*pmd)) {
> +				unsigned long pfn;
> +				pmd_t pmdval;
> +
> +				pmdval = *pmd = pmd_mkcontig(*pmd);
> +				pfn = pmd_pfn(*pmd);
> +				for (i = 0; i < CONTIG_PAGES; i++) {
> +					unsigned long newpfn = pfn +
> +						(i << (PMD_SHIFT - PAGE_SHIFT));
> +					if (!pmd_present(pmd[i]))
> +						atomic_long_inc(&mm->nr_ptes);
> +					set_pmd(&pmd[i],
> +						pmd_modify_pfn(pmdval, newpfn));
> +				}
> +			}
> +			return pmd;
> +#endif
> +		}
> +	}
> +
> +	return pte;
> +}

Why are we writing ptes/pmds in the huge_pte_alloc function?  What
happened to set_huge_pte_at?

Also, rather than call pte_modify_pfn, I would recommend something like:

	int loop;
	unsigned long pfn = pte_pfn(pte);
	pgprot_t hugeprot =
		__pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));

	for (loop = 0; loop < CONTIG_PTES; loop++) {
		set_pte_at(mm, addr, ptep++, pfn_pte(pfn++, hugeprot));
		addr += PAGE_SIZE;
	}

i.e. extract a pgprot_t and combine with the pfn in the loop rather than
calling out.
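To flesh that out a little, a contiguous-aware set_huge_pte_at could
look roughly like the sketch below.  It's untested, the shape is mine,
and it only handles the contiguous-pte case (contiguous pmds would want
the same treatment); I think the existing inline in asm/hugetlb.h would
also need to move out of line for this:

	void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
			     pte_t *ptep, pte_t pte)
	{
		int loop;
		unsigned long pfn;
		pgprot_t hugeprot;

		/* Non-contiguous huge pages are a single block entry. */
		if (!pte_contig(pte)) {
			set_pte_at(mm, addr, ptep, pte);
			return;
		}

		/* Same pgprot for each pte; the pfn advances a page at a time. */
		pfn = pte_pfn(pte);
		hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^
				    pte_val(pte));
		for (loop = 0; loop < CONTIG_PTES; loop++) {
			set_pte_at(mm, addr, ptep++, pfn_pte(pfn++, hugeprot));
			addr += PAGE_SIZE;
		}
	}

huge_pte_alloc would then only need to walk/allocate down to the right
level and hand back the pte/pmd pointer.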
> +
> +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
> +{
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	pmd_t *pmd = NULL;
> +	pte_t *pte = NULL;
> +
> +	pgd = pgd_offset(mm, addr);
> +	if (pgd_present(*pgd)) {
> +		pud = pud_offset(pgd, addr);
> +		if (pud_present(*pud)) {
> +			if (pud_huge(*pud))
> +				return (pte_t *)pud;
> +			pmd = pmd_offset(pud, addr);
> +			if (pmd_present(*pmd)) {
> +				if (pmd_huge(*pmd))
> +					return (pte_t *)pmd;
> +				pte = pte_offset_kernel(pmd, addr);
> +				if (pte_present(*pte) && pte_contig(*pte)) {
> +					pte = pte_offset_kernel(
> +						pmd, (addr & CONTIG_PTE_MASK));
> +					return pte;
> +				}
> +			}
> +		}
> +	}
> +	return (pte_t *) NULL;
> +}
> +
> +pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
> +			 struct page *page, int writable)
> +{
> +	size_t pagesize = huge_page_size(hstate_vma(vma));
> +	pte_t nent = {0};
> +
> +	if (pagesize == PUD_SIZE || pagesize == PMD_SIZE)
> +		nent = entry;
> +	else if (pagesize == (PAGE_SIZE * CONTIG_PAGES))
> +		nent = pte_mkcontig(entry);
> +#if CONFIG_PGTABLE_LEVELS > 2
> +	else if (pagesize == (PMD_SIZE * CONTIG_PAGES) ||
> +		 pagesize == (PUD_SIZE * CONTIG_PAGES))
> +		nent = pmd_mkcontig(entry);
> +#endif
> +	else {
> +		pr_warn("%s: unrecognized huge page size 0x%lx\n",
> +			__func__, pagesize);
> +	}
> +	return nent;
> +}
> +
> +pte_t pte_mkcontig(pte_t pte)
> +{
> +	pte = set_pte_bit(pte, __pgprot(PTE_CONTIG));
> +	pte = set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE));
> +	return pte;
> +}
> +
> +pmd_t pmd_mkcontig(pmd_t pmd)
> +{
> +	pmd = __pmd(pmd_val(pmd) | PMD_SECT_CONTIG);
> +	return pmd;
> +}

Can these be folded into arch_make_huge_pte?

> +
> +struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> +			     pmd_t *pmd, int write)
> +{
> +	struct page *page;
> +
> +	page = pte_page(*(pte_t *)pmd);
> +	if (page)
> +		page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
> +	return page;
> +}

Do we need to think about contiguous pmds here?

It may be worth implementing follow_huge_addr?

> +
> +struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
> +			     pud_t *pud, int write)
> +{
> +	struct page *page;
> +
> +	page = pte_page(*(pte_t *)pud);
> +	if (page)
> +		page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
> +	return page;
> +}
> +
>  static __init int setup_hugepagesz(char *opt)
>  {
>  	unsigned long ps = memparse(opt, &opt);
> @@ -48,10 +197,24 @@ static __init int setup_hugepagesz(char *opt)
>  		hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
>  	} else if (ps == PUD_SIZE) {
>  		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
> +	} else if (ps == (PAGE_SIZE * CONTIG_PAGES)) {
> +		hugetlb_add_hstate(CONTIG_SHIFT);
> +	} else if (ps == (PMD_SIZE * CONTIG_PAGES)) {

We need to distinguish between CONTIG_PTES and CONTIG_PMDS.

> +		hugetlb_add_hstate((PMD_SHIFT + CONTIG_SHIFT) - PAGE_SHIFT);
>  	} else {
> -		pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20);
> +		pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
>  		return 0;
>  	}
>  	return 1;
>  }
>  __setup("hugepagesz=", setup_hugepagesz);
> +
> +#ifdef CONFIG_ARM64_64K_PAGES
> +static __init int add_default_hugepagesz(void)
> +{
> +	if (size_to_hstate(CONTIG_PAGES * PAGE_SIZE) == NULL)
> +		hugetlb_add_hstate(CONTIG_SHIFT);
> +	return 0;
> +}
> +arch_initcall(add_default_hugepagesz);
> +#endif
> --
> 2.1.2
>