This commit introduces the CONFIG_FREE_USER_PTE option and fills in the
actual logic behind the previously dummy pte_ref helpers. With this
option enabled, each user PTE page table page carries a reference
count; once the count drops to zero, the page is unhooked from its pmd
entry and freed via RCU.

Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
---
 include/linux/mm.h      |  2 ++
 include/linux/pgtable.h |  3 +-
 include/linux/pte_ref.h | 53 ++++++++++++++++++++++++----
 mm/Kconfig              |  4 +++
 mm/debug_vm_pgtable.c   |  2 ++
 mm/memory.c             | 15 ++++++++
 mm/pte_ref.c            | 91 +++++++++++++++++++++++++++++++++++++++++++++----
 7 files changed, 156 insertions(+), 14 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 753a9435e0d0..18fbf9e0996a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -437,6 +437,7 @@ extern pgprot_t protection_map[16];
  * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
  * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
  * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
+ * @FAULT_FLAG_PTE_GET: A reference on the PTE page table has been taken.
  *
  * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
  * whether we would allow page faults to retry by specifying these two
@@ -468,6 +469,7 @@ enum fault_flag {
 	FAULT_FLAG_REMOTE =		1 << 7,
 	FAULT_FLAG_INSTRUCTION =	1 << 8,
 	FAULT_FLAG_INTERRUPTIBLE =	1 << 9,
+	FAULT_FLAG_PTE_GET =		1 << 10,
 };
 
 /*
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c8f045705c1e..6ac51d58f11a 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -480,7 +480,6 @@ static inline pte_t ptep_get_lockless(pte_t *ptep)
 }
 #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address,
@@ -491,6 +490,8 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 	return pmd;
 }
 #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
 static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address,
diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index b6d8335bdc59..8a26eaba83ef 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -8,20 +8,58 @@
 #define _LINUX_PTE_REF_H
 
 #include <linux/pgtable.h>
+#include <linux/page-flags.h>
 
 enum pte_tryget_type {
 	TRYGET_SUCCESSED,
 	TRYGET_FAILED_ZERO,
 	TRYGET_FAILED_NONE,
 	TRYGET_FAILED_HUGE_PMD,
 };
 
-bool pte_get_unless_zero(pmd_t *pmd);
-enum pte_tryget_type pte_try_get(pmd_t *pmd);
 void pte_put_vmf(struct vm_fault *vmf);
+enum pte_tryget_type pte_try_get(pmd_t *pmd);
+bool pte_get_unless_zero(pmd_t *pmd);
+
+#ifdef CONFIG_FREE_USER_PTE
+void free_user_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr);
 
 static inline void pte_ref_init(pgtable_t pte, pmd_t *pmd, int count)
 {
+	pte->pmd = pmd;
+	atomic_set(&pte->pte_refcount, count);
+}
+
+static inline pmd_t *pte_to_pmd(pte_t *pte)
+{
+	return virt_to_page(pte)->pmd;
+}
+
+static inline void pte_update_pmd(pmd_t old_pmd, pmd_t *new_pmd)
+{
+	pmd_pgtable(old_pmd)->pmd = new_pmd;
+}
+
+static inline void pte_get_many(pmd_t *pmd, unsigned int nr)
+{
+	pgtable_t pte = pmd_pgtable(*pmd);
+
+	VM_BUG_ON(!PageTable(pte));
+	atomic_add(nr, &pte->pte_refcount);
+}
+
+static inline void pte_put_many(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long addr, unsigned int nr)
+{
+	pgtable_t pte = pmd_pgtable(*pmd);
+
+	VM_BUG_ON(!PageTable(pte));
+	if (atomic_sub_and_test(nr, &pte->pte_refcount))
+		free_user_pte_table(mm, pmd, addr & PMD_MASK);
+}
+#else
+static inline void pte_ref_init(pgtable_t pte, pmd_t *pmd, int count)
+{
 }
 
 static inline pmd_t *pte_to_pmd(pte_t *pte)
@@ -37,6 +75,12 @@ static inline void pte_get_many(pmd_t *pmd, unsigned int nr)
 {
 }
 
+static inline void pte_put_many(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long addr, unsigned int nr)
+{
+}
+#endif /* CONFIG_FREE_USER_PTE */
+
 /*
  * pte_get - Increment refcount for the PTE page table.
  * @pmd: a pointer to the pmd entry corresponding to the PTE page table.
@@ -66,11 +110,6 @@ static inline pte_t *pte_tryget_map_lock(struct mm_struct *mm, pmd_t *pmd,
 	return pte_offset_map_lock(mm, pmd, address, ptlp);
 }
 
-static inline void pte_put_many(struct mm_struct *mm, pmd_t *pmd,
-				unsigned long addr, unsigned int nr)
-{
-}
-
 /*
  * pte_put - Decrement refcount for the PTE page table.
  * @mm: the mm_struct of the target address space.
diff --git a/mm/Kconfig b/mm/Kconfig
index 5c5508fafcec..44549d287869 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -898,6 +898,10 @@ config IO_MAPPING
 config SECRETMEM
 	def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
 
+config FREE_USER_PTE
+	def_bool y
+	depends on X86_64
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 52f006654664..757dd84ee254 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1049,8 +1049,10 @@ static void __init destroy_args(struct pgtable_debug_args *args)
 	/* Free page table entries */
 	if (args->start_ptep) {
 		pte_put(args->mm, args->start_pmdp, args->vaddr);
+#ifndef CONFIG_FREE_USER_PTE
 		pte_free(args->mm, args->start_ptep);
 		mm_dec_nr_ptes(args->mm);
+#endif
 	}
 
 	if (args->start_pmdp) {
diff --git a/mm/memory.c b/mm/memory.c
index e360ecd37a71..4d1ede78d1b0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -219,6 +219,17 @@ static void check_sync_rss_stat(struct task_struct *task)
 
 #endif /* SPLIT_RSS_COUNTING */
 
+#ifdef CONFIG_FREE_USER_PTE
+static inline void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+				  unsigned long addr)
+{
+	/*
+	 * We should never reach here since the PTE page tables are
+	 * dynamically freed.
+	 */
+	BUG();
+}
+#else
 /*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
@@ -231,6 +242,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 	pte_free_tlb(tlb, token, addr);
 	mm_dec_nr_ptes(tlb->mm);
 }
+#endif /* CONFIG_FREE_USER_PTE */
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 				unsigned long addr, unsigned long end,
@@ -4631,6 +4643,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 			goto retry;
 		vmf->orig_pte = *vmf->pte;
 
+		if (IS_ENABLED(CONFIG_FREE_USER_PTE))
+			vmf->flags |= FAULT_FLAG_PTE_GET;
+
 		/*
 		 * some architectures can have larger ptes than wordsize,
 		 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index de109905bc8f..728e61cea25e 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -7,7 +7,10 @@
 
 #include <linux/pte_ref.h>
 #include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/tlbflush.h>
 
+#ifdef CONFIG_FREE_USER_PTE
 /*
  * pte_get_unless_zero - Increment refcount for the PTE page table
  * unless it is zero.
@@ -15,7 +18,10 @@
  */
 bool pte_get_unless_zero(pmd_t *pmd)
 {
-	return true;
+	pgtable_t pte = pmd_pgtable(*pmd);
+
+	VM_BUG_ON(!PageTable(pte));
+	return atomic_inc_not_zero(&pte->pte_refcount);
 }
 
 /*
@@ -32,12 +38,20 @@ bool pte_get_unless_zero(pmd_t *pmd)
  */
 enum pte_tryget_type pte_try_get(pmd_t *pmd)
 {
-	if (unlikely(pmd_none(*pmd)))
-		return TRYGET_FAILED_NONE;
-	if (unlikely(is_huge_pmd(*pmd)))
-		return TRYGET_FAILED_HUGE_PMD;
+	int retval = TRYGET_SUCCESSED;
+	pmd_t pmdval;
 
-	return TRYGET_SUCCESSED;
+	rcu_read_lock();
+	pmdval = READ_ONCE(*pmd);
+	if (unlikely(pmd_none(pmdval)))
+		retval = TRYGET_FAILED_NONE;
+	else if (unlikely(is_huge_pmd(pmdval)))
+		retval = TRYGET_FAILED_HUGE_PMD;
+	else if (!pte_get_unless_zero(&pmdval))
+		retval = TRYGET_FAILED_ZERO;
+	rcu_read_unlock();
+
+	return retval;
 }
 
 /*
@@ -52,4 +66,69 @@ enum pte_tryget_type pte_try_get(pmd_t *pmd)
  */
 void pte_put_vmf(struct vm_fault *vmf)
 {
+	if (!(vmf->flags & FAULT_FLAG_PTE_GET))
+		return;
+	vmf->flags &= ~FAULT_FLAG_PTE_GET;
+
+	pte_put(vmf->vma->vm_mm, vmf->pmd, vmf->address);
+}
+#else
+bool pte_get_unless_zero(pmd_t *pmd)
+{
+	return true;
+}
+
+enum pte_tryget_type pte_try_get(pmd_t *pmd)
+{
+	if (unlikely(pmd_none(*pmd)))
+		return TRYGET_FAILED_NONE;
+
+	if (unlikely(is_huge_pmd(*pmd)))
+		return TRYGET_FAILED_HUGE_PMD;
+
+	return TRYGET_SUCCESSED;
+}
+
+void pte_put_vmf(struct vm_fault *vmf)
+{
+}
+#endif /* CONFIG_FREE_USER_PTE */
+
+#ifdef CONFIG_DEBUG_VM
+static void pte_free_debug(pmd_t pmd)
+{
+	pte_t *ptep = (pte_t *)pmd_page_vaddr(pmd);
+	int i = 0;
+
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		BUG_ON(!pte_none(*ptep++));
+}
+#else
+static inline void pte_free_debug(pmd_t pmd)
+{
+}
+#endif
+
+static void pte_free_rcu(struct rcu_head *rcu)
+{
+	struct page *page = container_of(rcu, struct page, rcu_head);
+
+	pgtable_pte_page_dtor(page);
+	__free_page(page);
+}
+
+void free_user_pte_table(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+	spinlock_t *ptl;
+	pmd_t pmdval;
+
+	ptl = pmd_lock(mm, pmd);
+	pmdval = pmdp_huge_get_and_clear(mm, addr, pmd);
+	flush_tlb_range(&vma, addr, addr + PMD_SIZE);
+	spin_unlock(ptl);
+
+	pte_free_debug(pmdval);
+	mm_dec_nr_ptes(mm);
+	call_rcu(&pmd_pgtable(pmdval)->rcu_head, pte_free_rcu);
 }
-- 
2.11.0
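
P.S.: for readers new to the pte_ref API, here is an illustrative sketch
(not part of the patch) of the calling pattern the helpers above are
designed for. The function example_pte_walk() and its return-code
choices are invented for this example; pte_try_get(), pte_put(), the
TRYGET_* values, and pte_offset_map_lock()/pte_unmap_unlock() are the
real interfaces used in this series.

	/*
	 * Illustrative only: pin the PTE page table with pte_try_get()
	 * before mapping it, release the pin with pte_put() afterwards.
	 */
	static int example_pte_walk(struct mm_struct *mm, pmd_t *pmd,
				    unsigned long addr)
	{
		spinlock_t *ptl;
		pte_t *pte;

	retry:
		switch (pte_try_get(pmd)) {
		case TRYGET_SUCCESSED:
			/* Reference held: the table cannot be freed under us. */
			break;
		case TRYGET_FAILED_ZERO:
			/* Raced with the last pte_put(); the pmd entry will
			 * be cleared shortly, so look at it again. */
			goto retry;
		case TRYGET_FAILED_NONE:
			/* No PTE page table here: allocate one or bail out. */
			return -ENOENT;
		case TRYGET_FAILED_HUGE_PMD:
			/* A huge pmd is mapped here; no PTE level to walk. */
			return -EEXIST;
		}

		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
		/* ... inspect or modify *pte under the page table lock ... */
		pte_unmap_unlock(pte, ptl);

		/* Drop the pin; the last pte_put() frees the table. */
		pte_put(mm, pmd, addr);
		return 0;
	}

The retry on TRYGET_FAILED_ZERO terminates because free_user_pte_table()
clears the pmd entry under pmd_lock before handing the page to RCU, so a
later pte_try_get() observes pmd_none(). The pte_tryget_map_lock() helper
in pte_ref.h wraps part of this pattern.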
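
Similarly, a sketch of the FAULT_FLAG_PTE_GET lifecycle in the fault
path. example_fault_exit() is an invented name standing in for the exit
paths of handle_pte_fault(), which in this patch sets the flag after the
PTE page has been pinned; pte_put_vmf() is the real helper that consumes
it.

	/*
	 * Illustrative only: the flag records that a reference was taken,
	 * so every exit path can call pte_put_vmf() unconditionally and
	 * the reference is still dropped exactly once.
	 */
	static void example_fault_exit(struct vm_fault *vmf)
	{
		/* In handle_pte_fault(), after the PTE page was pinned:
		 *	if (IS_ENABLED(CONFIG_FREE_USER_PTE))
		 *		vmf->flags |= FAULT_FLAG_PTE_GET;
		 */

		/* On any exit path; a no-op unless the flag is set. */
		pte_put_vmf(vmf);
	}

pte_put_vmf() clears the flag before calling pte_put(), which is what
makes repeated calls on nested exit paths safe.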