From: "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxxxxxxx> We now have pmd entries covering a 16MB range. To implement THP on powerpc, we double the size of PMD. The second half is used to deposit the pgtable (PTE page). We also use the deposited PTE page for tracking the HPTE information. The information includes [ secondary group | 3 bit hidx | valid ]. We use one byte per HPTE entry. With 16MB huge page and 64K HPTE we need 256 entries and with 4K HPTE we need 4096 entries. Both will fit in a 4K PTE page. TODO: ppc64 KVM bits not yet done Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx> --- arch/powerpc/include/asm/kvm_book3s_64.h | 1 + arch/powerpc/include/asm/mmu-hash64.h | 5 + arch/powerpc/include/asm/pgtable-ppc64.h | 31 +----- arch/powerpc/kernel/io-workarounds.c | 3 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 3 +- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 6 +- arch/powerpc/mm/Makefile | 1 + arch/powerpc/mm/hash_utils_64.c | 16 ++- arch/powerpc/mm/hugepage-hash64.c | 178 ++++++++++++++++++++++++++++++ arch/powerpc/mm/hugetlbpage.c | 31 +++++- arch/powerpc/mm/pgtable.c | 38 +++++++ arch/powerpc/mm/tlb_hash64.c | 5 +- arch/powerpc/perf/callchain.c | 2 +- arch/powerpc/platforms/pseries/eeh.c | 5 +- 14 files changed, 283 insertions(+), 42 deletions(-) create mode 100644 arch/powerpc/mm/hugepage-hash64.c diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 38bec1d..29d1228 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -110,6 +110,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, return rb; } +/* FIXME !! should we use hpte_actual_psize ? 
*/ static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) { /* only handle 4k, 64k and 16M pages for now */ diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 46c14a2..6c02a7c 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -325,6 +325,11 @@ extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap) int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, int local, int ssize, unsigned int shift, unsigned int mmu_psize); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern int __hash_page_thp(unsigned long ea, unsigned long access, + unsigned long vsid, pmd_t *pmdp, unsigned long trap, + int local, int ssize, unsigned int psize); +#endif extern void hash_failure_debug(unsigned long ea, unsigned long access, unsigned long vsid, unsigned long trap, int ssize, int psize, int lpsize, diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 5d5b937..3ebe57d 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -346,39 +346,18 @@ static inline void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) return __pgtable_cache_add(shift, sizeof(void *) << shift, ctor); } -/* - * find_linux_pte returns the address of a linux pte for a given - * effective address and directory. If not found, it returns zero. 
- */ -static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - pte_t *pt = NULL; - - pg = pgdir + pgd_index(ea); - if (!pgd_none(*pg)) { - pu = pud_offset(pg, ea); - if (!pud_none(*pu)) { - pm = pmd_offset(pu, ea); - if (pmd_present(*pm)) - pt = pte_offset_kernel(pm, ea); - } - } - return pt; -} - +pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea, unsigned int *thp); #ifdef CONFIG_HUGETLB_PAGE pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - unsigned *shift); + unsigned *shift, unsigned int *hugepage); #else static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, - unsigned *shift) + unsigned *shift, + unsigned int *hugepage) { if (shift) *shift = 0; - return find_linux_pte(pgdir, ea); + return find_linux_pte(pgdir, ea, hugepage); } #endif /* !CONFIG_HUGETLB_PAGE */ diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index 50e90b7..c41998c 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -70,7 +70,8 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END) return NULL; - ptep = find_linux_pte(init_mm.pgd, vaddr); + /* we won't find huge pages here */ + ptep = find_linux_pte(init_mm.pgd, vaddr, NULL); if (ptep == NULL) paddr = 0; else diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8cc18ab..c37f3f2 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -682,8 +682,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * while looking up and updating the pte. */ rcu_read_lock_sched(); + /* FIXME!! 
*/ ptep = find_linux_pte_or_hugepte(current->mm->pgd, - hva, NULL); + hva, NULL, NULL); if (ptep && pte_present(*ptep)) { pte = kvmppc_read_update_linux_pte(ptep, 1); if (pte_write(pte)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 19c93ba..598db86 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -27,7 +27,8 @@ static void *real_vmalloc_addr(void *x) unsigned long addr = (unsigned long) x; pte_t *p; - p = find_linux_pte(swapper_pg_dir, addr); +/* FIXME!! */ + p = find_linux_pte(swapper_pg_dir, addr, NULL); if (!p || !pte_present(*p)) return NULL; /* assume we don't have huge pages in vmalloc space... */ @@ -145,6 +146,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unlock_rmap(rmap); } +/* FIXME!! check */ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, int writing, unsigned long *pte_sizep) { @@ -152,7 +154,7 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, unsigned long ps = *pte_sizep; unsigned int shift; - ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); + ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift, NULL); if (!ptep) return __pte(0); if (shift) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3787b61..997deb4 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -33,6 +33,7 @@ obj-y += hugetlbpage.o obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index e55c40b..a87f1ff 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -939,7 +939,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned 
long trap) unsigned long vsid; struct mm_struct *mm; pte_t *ptep; - unsigned hugeshift; + unsigned hugeshift, hugepage; const struct cpumask *tmp; int rc, user_region = 0, local = 0; int psize, ssize; @@ -1005,7 +1005,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) #endif /* CONFIG_PPC_64K_PAGES */ /* Get PTE and page size from page tables */ - ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); + ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift, &hugepage); if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW(" no PTE !\n"); return 1; @@ -1028,6 +1028,12 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) ssize, hugeshift, psize); #endif /* CONFIG_HUGETLB_PAGE */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (hugepage) + return __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, + trap, local, ssize, psize); +#endif + #ifndef CONFIG_PPC_64K_PAGES DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); #else @@ -1133,7 +1139,11 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, pgdir = mm->pgd; if (pgdir == NULL) return; - ptep = find_linux_pte(pgdir, ea); + /* + * We haven't implemented update_mmu_cache_pmd yet. We get called + * only for non huge pages. Hence can ignore THP here + */ + ptep = find_linux_pte(pgdir, ea, NULL); if (!ptep) return; diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c new file mode 100644 index 0000000..c9c2640 --- /dev/null +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -0,0 +1,178 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +/* + * PPC64 THP Support for hash based MMUs + */ +#include <linux/mm.h> +#include <asm/machdep.h> + +/* + * A linux huge page PMD was changed and the corresponding hash table entry + * needs to be flushed. FIXME!! there is no batching support yet. + * + * The linux huge page PMD now includes the pmd entries followed by the address + * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits. + * [ secondary group | 3 bit hidx | valid ]. We use one byte per HPTE entry. + * With 16MB huge page and 64K HPTE we need 256 entries and with 4K HPTE we need + * 4096 entries. Both will fit in a 4K pgtable_t. + */ +int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, + pmd_t *pmdp, unsigned long trap, int local, int ssize, + unsigned int psize) +{ + unsigned int index, valid; + unsigned char *hpte_slot_array; + unsigned long rflags, pa, hidx; + unsigned long old_pmd, new_pmd; + int ret, lpsize = MMU_PAGE_16M; + unsigned long vpn, hash, shift, slot; + + /* + * atomically mark the linux large page PMD busy and dirty + */ + do { + old_pmd = pmd_val(*pmdp); + /* If PMD busy, retry the access */ + if (unlikely(old_pmd & PMD_HUGE_BUSY)) + return 0; + /* If PMD permissions don't match, take page fault */ + if (unlikely(access & ~old_pmd)) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access + */ + new_pmd = old_pmd | PMD_HUGE_BUSY | PMD_HUGE_ACCESSED; + if (access & _PAGE_RW) + new_pmd |= PMD_HUGE_DIRTY; + } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp, + old_pmd, new_pmd)); + /* + * derive the rflags. Default enable read (0x2) + */ + rflags = 0x2 | (!(new_pmd & PMD_HUGE_RW)); + /* PMD_HUGE_EXEC -> HW_NO_EXEC since it's inverted */ + rflags |= ((new_pmd & PMD_HUGE_EXEC) ? 
0 : HPTE_R_N); + +#if 0 /* FIXME!! */ + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + } +#endif + /* + * Find the slot index details for this ea, using base page size. + */ + shift = mmu_psize_defs[psize].shift; + index = (ea & (HUGE_PAGE_SIZE - 1)) >> shift; + BUG_ON(index > 4096); + + vpn = hpt_vpn(ea, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + /* + * The hpte hindex are stored in the pgtable whose address is in the + * second half of the PMD + */ + hpte_slot_array = *(char **)(pmdp + PTRS_PER_PMD); + + valid = hpte_slot_array[index] & 0x1; + if (unlikely(valid)) { + /* update the hpte bits */ + hidx = hpte_slot_array[index] >> 1; + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + ret = ppc_md.hpte_updatepp(slot, rflags, vpn, + psize, ssize, local); + /* + * We failed to update, try to insert a new entry. + */ + if (ret == -1) { + /* + * large pte is marked busy, so we can be sure + * nobody is looking at hpte_slot_array. hence we can + * safely update this here. + */ + hpte_slot_array[index] = 0; + valid = 0; + } + } + + if (likely(!valid)) { + unsigned long hpte_group; + + /* insert new entry */ + pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; +repeat: + hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; + + /* clear the busy bits and set the hash pte bits */ + new_pmd = (new_pmd & ~PMD_HUGE_HPTEFLAGS) | PMD_HUGE_HASHPTE; + +#if 0 + /* Add in WIMG bits. FIXME!! 
enabled by default */ + rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | + _PAGE_COHERENT | _PAGE_GUARDED)); +#endif + /* Insert into the hash table, primary slot */ + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, + psize, lpsize, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = ((~hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, + rflags, HPTE_V_SECONDARY, + psize, lpsize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + + ppc_md.hpte_remove(hpte_group); + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pmd and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *pmdp = __pmd(old_pmd); + hash_failure_debug(ea, access, vsid, trap, ssize, + psize, lpsize, old_pmd); + return -1; + } + /* + * large pte is marked busy, so we can be sure + * nobody is looking at hpte_slot_array. hence we can + * safely update this here. 
+ */ + hpte_slot_array[index] = slot << 1 | 0x1; + } + /* + * No need to use ldarx/stdcx here + */ + *pmdp = __pmd(new_pmd & ~PMD_HUGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1a6de0a..364eae6 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -67,7 +67,8 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) #define hugepd_none(hpd) ((hpd).pd == 0) -pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift, unsigned int *hugepage) { pgd_t *pg; pud_t *pu; @@ -77,6 +78,8 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift if (shift) *shift = 0; + if (hugepage) + *hugepage = 0; pg = pgdir + pgd_index(ea); if (is_hugepd(pg)) { @@ -91,12 +94,24 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift pm = pmd_offset(pu, ea); if (is_hugepd(pm)) hpdp = (hugepd_t *)pm; - else if (!pmd_none(*pm)) { + else if (pmd_large(*pm)) { + /* THP page */ + if (hugepage) { + *hugepage = 1; + /* + * This should be ok, except for few + * flags. Most of the pte and huge page + * pmd bits overlap. We don't use the + * returned value as pte_t in the caller. 
+ */ + return (pte_t *)pm; + } else + return NULL; + } else if (!pmd_none(*pm)) { return pte_offset_kernel(pm, ea); } } } - if (!hpdp) return NULL; @@ -108,7 +123,8 @@ EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { - return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); + /* Only called for HugeTLB pages, hence can ignore THP */ + return find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); } static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, @@ -613,8 +629,11 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) struct page *page; unsigned shift; unsigned long mask; - - ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); + /* + * Transparent huge pages are handled by generic code. We can skip them + * here. + */ + ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift, NULL); /* Verify it is a huge page else bail. */ if (!ptep || !shift) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 84e7b71..ac4ca9a 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -564,3 +564,41 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* + * find_linux_pte returns the address of a linux pte for a given + * effective address and directory. If not found, it returns zero. + */ +pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea, unsigned int *hugepage) +{ + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + pte_t *pt = NULL; + + if (hugepage) + *hugepage = 0; + pg = pgdir + pgd_index(ea); + if (!pgd_none(*pg)) { + pu = pud_offset(pg, ea); + if (!pud_none(*pu)) { + pm = pmd_offset(pu, ea); + if (pmd_large(*pm)) { + /* THP page */ + if (hugepage) { + *hugepage = 1; + /* + * This should be ok, except for few + * flags. Most of the pte and huge page + * pmd bits overlap. We don't use the + * returned value as pte_t in the caller. 
+ */ + return (pte_t *)pm; + } else + return NULL; + } else if (pmd_present(*pm)) + pt = pte_offset_kernel(pm, ea); + } + } + return pt; +} diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 023ec8a..5608738 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -206,7 +206,10 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { - pte_t *ptep = find_linux_pte(mm->pgd, start); + /* + * We won't find huge pages here. + */ + pte_t *ptep = find_linux_pte(mm->pgd, start, NULL); unsigned long pte; if (ptep == NULL) diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 74d1e78..578cac7 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -125,7 +125,7 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb) if (!pgdir) return -EFAULT; - ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift); + ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift, NULL); if (!shift) shift = PAGE_SHIFT; diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index 9a04322..221043f 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -261,7 +261,10 @@ static inline unsigned long eeh_token_to_phys(unsigned long token) pte_t *ptep; unsigned long pa; - ptep = find_linux_pte(init_mm.pgd, token); + /* + * We won't find huge pages here + */ + ptep = find_linux_pte(init_mm.pgd, token, NULL); if (!ptep) return token; pa = pte_pfn(*ptep) << PAGE_SHIFT; -- 1.7.10 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>