pmd_present() and pmd_trans_huge() are expected to behave in the following manner during various phases of a PMD entry. This table is derived from a previous discussion on this topic [1] and available THP documentation [2]. pmd_present(pmd): - True if PMD has a mapped huge page i.e valid pmd_page(pmd) - False if PMD does not have a mapped huge page i.e invalid pmd_page(pmd) pmd_trans_huge(pmd): - True if PMD has a mapped huge page and is a THP ------------------------------------------------------------------------- | PMD states | pmd_present | pmd_trans_huge | ------------------------------------------------------------------------- | Mapped | Yes | Yes | ------------------------------------------------------------------------- | Splitting | Yes | Yes | ------------------------------------------------------------------------- | Migration/Swap | No | No | ------------------------------------------------------------------------- Current Problem:- PMD is invalidated with pmdp_invalidate() before it's splitting. It clears PMD_SECT_VALID as below. PMD Split -> pmdp_invalidate() -> pmd_mkinvalid() -> Clears PMD_SECT_VALID Once PMD_SECT_VALID gets cleared, pmd_present() returns false. It will need a separate page table bit apart from PMD_SECT_VALID, in order to reaffirm pmd_present() as true during this THP split process. To comply with above mentioned semantics, pmd_trans_huge() should also check pmd_present() first before testing presence of an actual transparent huge page mapping. PMD_TYPE_SECT should have been used here. But it shares bit position with PMD_SECT_VALID which gets cleared during THP invalidation. Hence, it cannot be used for pmd_present() after pmdp_invalidate(). Proposed Solution:- PMD_TABLE_BIT can be set during pmdp_invalidate() process which will make pmd_present() return true as required. But then, PMD_TABLE_BIT needs to be cleared when PMD gets mapped again. This replaces pmd_present() with pte_present() in huge_pte_offset() just to preserve the existing semantics. [1]: https://lore.kernel.org/linux-mm/20181017020930.GN30832@xxxxxxxxxx/ [2]: https://www.kernel.org/doc/Documentation/vm/transhuge.txt Cc: Catalin Marinas <catalin.marinas@xxxxxxx> Cc: Will Deacon <will@xxxxxxxxxx> Cc: Mark Rutland <mark.rutland@xxxxxxx> Cc: Marc Zyngier <maz@xxxxxxxxxx> Cc: Suzuki Poulose <suzuki.poulose@xxxxxxx> Cc: linux-arm-kernel@xxxxxxxxxxxxxxxxxxx Cc: linux-kernel@xxxxxxxxxxxxxxx Signed-off-by: Anshuman Khandual <anshuman.khandual@xxxxxxx> --- arch/arm64/include/asm/pgtable.h | 92 ++++++++++++++++++++++++++++---- arch/arm64/mm/hugetlbpage.c | 2 +- arch/arm64/mm/mmu.c | 20 +++++++ 3 files changed, 102 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6dbd267ab931..560be593a8dc 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -353,15 +353,92 @@ static inline int pmd_protnone(pmd_t pmd) } #endif +#define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_TABLE) +#define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_SECT) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* - * THP definitions. + * PMD Level Encoding (THP Enabled) + * + * 0b00 - Not valid Not present NA + * 0b10 - Not valid Present Huge (Splitting) + * 0b01 - Valid Present Huge (Mapped) + * 0b11 - Valid Present Table (Mapped) */ +static inline pmd_t pmd_mksplitting(pmd_t pmd) +{ + unsigned long val = pmd_val(pmd); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT)) + return __pmd((val & ~PMD_TYPE_MASK) | PMD_TABLE_BIT); +} + +static inline pmd_t pmd_clrsplitting(pmd_t pmd) +{ + unsigned long val = pmd_val(pmd); + + return __pmd((val & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); +} + +static inline bool pmd_splitting(pmd_t pmd) +{ + unsigned long val = pmd_val(pmd); + + if ((val & PMD_TYPE_MASK) == PMD_TABLE_BIT) + return true; + return false; +} + +static inline bool pmd_mapped(pmd_t pmd) +{ + return pmd_sect(pmd); +} + +static inline pmd_t pmd_mkinvalid(pmd_t pmd) +{ + /* + * Invalidation should not have been invoked on + * a PMD table entry. Just warn here otherwise. + */ + WARN_ON(pmd_table(pmd)); + return pmd_mksplitting(pmd); +} + +static inline int pmd_present(pmd_t pmd); + +static inline int pmd_trans_huge(pmd_t pmd) +{ + if (!pmd_present(pmd)) + return 0; + + if (!pmd_val(pmd)) + return 0; + + if (pmd_mapped(pmd)) + return 1; + + if (pmd_splitting(pmd)) + return 1; + return 0; +} + +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#define pmd_present(pmd) pte_present(pmd_pte(pmd)) +static inline int pmd_present(pmd_t pmd) +{ + pte_t pte = pmd_pte(pmd); + + if (pte_present(pte)) + return 1; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_splitting(pmd)) + return 1; +#endif + return 0; +} + #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) #define pmd_valid(pmd) pte_valid(pmd_pte(pmd)) @@ -371,7 +448,6 @@ static inline int pmd_protnone(pmd_t pmd) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) -#define pmd_mkinvalid(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID)) #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) @@ -404,8 +480,6 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) #define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) - #define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d)) #define __phys_to_p4d_val(phys) __phys_to_pte_val(phys) @@ -448,10 +522,6 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define pmd_bad(pmd) (!(pmd_val(pmd) & PMD_TABLE_BIT)) -#define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ - PMD_TYPE_TABLE) -#define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ - PMD_TYPE_SECT) #define pmd_leaf(pmd) pmd_sect(pmd) #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 0a52ce46f020..79a8d5c8d4f4 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -294,7 +294,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, if (!(sz == PMD_SIZE || sz == CONT_PMD_SIZE) && pmd_none(pmd)) return NULL; - if (pmd_huge(pmd) || !pmd_present(pmd)) + if (pmd_huge(pmd) || !pte_present(pmd_pte(pmd))) return (pte_t *)pmdp; if (sz == CONT_PTE_SIZE) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 990929c8837e..337519031115 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -22,6 +22,8 @@ #include <linux/io.h> #include <linux/mm.h> #include <linux/vmalloc.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <asm/barrier.h> #include <asm/cputype.h> @@ -1483,3 +1485,21 @@ static int __init prevent_bootmem_remove_init(void) } device_initcall(prevent_bootmem_remove_init); #endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + /* + * PMD migration entries need to retain splitting PMD + * representation created with pmdp_invalidate(). But + * any non-migration entry which just might have been + * invalidated previously, still need be a normal huge + * page. Hence selectively clear splitting entries. + */ + if (!is_migration_entry(pmd_to_swp_entry(pmd))) + pmd = pmd_clrsplitting(pmd); + + set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); +} +#endif -- 2.20.1