Re: [RFC PATCH 3/3] mm/hugetlb: Remove pmd_huge_split_prepare

Michal Hocko <mhocko@xxxxxxxxxx> · Thu, 27 Jul 2017 14:57:56 +0200

On Thu 27-07-17 14:07:56, Aneesh Kumar K.V wrote:
> Instead of marking the pmd ready for split, invalidate the pmd. This should
> take care of powerpc requirement.

which is?

> Only side effect is that we mark the pmd
> invalid early. This can result in us blocking access to the page a bit longer
> if we race against a thp split.

Again, this doesn't tell me what is the problem and why do we care.

> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
> ---
>  arch/powerpc/include/asm/book3s/64/hash-4k.h  |  2 -
>  arch/powerpc/include/asm/book3s/64/hash-64k.h |  2 -
>  arch/powerpc/include/asm/book3s/64/pgtable.h  |  9 ----
>  arch/powerpc/include/asm/book3s/64/radix.h    |  6 ---
>  arch/powerpc/mm/pgtable-hash64.c              | 22 --------
>  include/asm-generic/pgtable.h                 |  8 ---
>  mm/huge_memory.c                              | 73 +++++++++++++--------------
>  7 files changed, 35 insertions(+), 87 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> index 0c4e470571ca..7d914f4fc534 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> @@ -100,8 +100,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
>  extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
>  					 pgtable_t pgtable);
>  extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
> -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
> -				      unsigned long address, pmd_t *pmdp);
>  extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
>  				       unsigned long addr, pmd_t *pmdp);
>  extern int hash__has_transparent_hugepage(void);
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> index 8c8fb6fbdabe..b856e130c678 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> @@ -164,8 +164,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
>  extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
>  					 pgtable_t pgtable);
>  extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
> -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
> -				      unsigned long address, pmd_t *pmdp);
>  extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
>  				       unsigned long addr, pmd_t *pmdp);
>  extern int hash__has_transparent_hugepage(void);
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index ece6912fae8e..557915792214 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -1122,15 +1122,6 @@ static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm,
>  extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>  			     pmd_t *pmdp);
>  
> -#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
> -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
> -					   unsigned long address, pmd_t *pmdp)
> -{
> -	if (radix_enabled())
> -		return radix__pmdp_huge_split_prepare(vma, address, pmdp);
> -	return hash__pmdp_huge_split_prepare(vma, address, pmdp);
> -}
> -
>  #define pmd_move_must_withdraw pmd_move_must_withdraw
>  struct spinlock;
>  static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index 558fea3b2d22..a779a43b643b 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -270,12 +270,6 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
>  		return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE);
>  	return __pmd(pmd_val(pmd) | _PAGE_PTE);
>  }
> -static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
> -					    unsigned long address, pmd_t *pmdp)
> -{
> -	/* Nothing to do for radix. */
> -	return;
> -}
>  
>  extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
>  					  pmd_t *pmdp, unsigned long clr,
> diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
> index c0a7372bdaa6..00aee1485714 100644
> --- a/arch/powerpc/mm/pgtable-hash64.c
> +++ b/arch/powerpc/mm/pgtable-hash64.c
> @@ -296,28 +296,6 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
>  	return pgtable;
>  }
>  
> -void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
> -			       unsigned long address, pmd_t *pmdp)
> -{
> -	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> -	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
> -	VM_BUG_ON(pmd_devmap(*pmdp));
> -
> -	/*
> -	 * We can't mark the pmd none here, because that will cause a race
> -	 * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
> -	 * we spilt, but at the same time we wan't rest of the ppc64 code
> -	 * not to insert hash pte on this, because we will be modifying
> -	 * the deposited pgtable in the caller of this function. Hence
> -	 * clear the _PAGE_USER so that we move the fault handling to
> -	 * higher level function and that will serialize against ptl.
> -	 * We need to flush existing hash pte entries here even though,
> -	 * the translation is still valid, because we will withdraw
> -	 * pgtable_t after this.
> -	 */
> -	pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
> -}
> -
>  /*
>   * A linux hugepage PMD was changed and the corresponding hash table entries
>   * neesd to be flushed.
> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
> index ece5e399567a..b934e41277ac 100644
> --- a/include/asm-generic/pgtable.h
> +++ b/include/asm-generic/pgtable.h
> @@ -313,14 +313,6 @@ extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>  			    pmd_t *pmdp);
>  #endif
>  
> -#ifndef __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
> -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
> -					   unsigned long address, pmd_t *pmdp)
> -{
> -
> -}
> -#endif
> -
>  #ifndef __HAVE_ARCH_PTE_SAME
>  static inline int pte_same(pte_t pte_a, pte_t pte_b)
>  {
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index e654592f04e7..0bd9850b1d81 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1923,8 +1923,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	struct mm_struct *mm = vma->vm_mm;
>  	struct page *page;
>  	pgtable_t pgtable;
> -	pmd_t old, _pmd;
> -	bool young, write, soft_dirty;
> +	pmd_t old_pmd, _pmd;
> +	bool young, write, dirty, soft_dirty;
>  	unsigned long addr;
>  	int i;
>  
> @@ -1956,14 +1956,39 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  		return __split_huge_zero_page_pmd(vma, haddr, pmd);
>  	}
>  
> -	page = pmd_page(*pmd);
> +	/*
> +	 * Up to this point the pmd is present and huge and userland has the
> +	 * whole access to the hugepage during the split (which happens in
> +	 * place). If we overwrite the pmd with the not-huge version pointing
> +	 * to the pte here (which of course we could if all CPUs were bug
> +	 * free), userland could trigger a small page size TLB miss on the
> +	 * small sized TLB while the hugepage TLB entry is still established in
> +	 * the huge TLB. Some CPU doesn't like that.
> +	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
> +	 * 383 on page 93. Intel should be safe but is also warns that it's
> +	 * only safe if the permission and cache attributes of the two entries
> +	 * loaded in the two TLB is identical (which should be the case here).
> +	 * But it is generally safer to never allow small and huge TLB entries
> +	 * for the same virtual address to be loaded simultaneously. So instead
> +	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
> +	 * current pmd notpresent (atomically because here the pmd_trans_huge
> +	 * and pmd_trans_splitting must remain set at all times on the pmd
> +	 * until the split is complete for this pmd), then we flush the SMP TLB
> +	 * and finally we write the non-huge version of the pmd entry with
> +	 * pmd_populate.
> +	 */
> +	old_pmd = pmdp_invalidate(vma, haddr, pmd);
> +
> +	page = pmd_page(old_pmd);
>  	VM_BUG_ON_PAGE(!page_count(page), page);
>  	page_ref_add(page, HPAGE_PMD_NR - 1);
> -	write = pmd_write(*pmd);
> -	young = pmd_young(*pmd);
> -	soft_dirty = pmd_soft_dirty(*pmd);
> -
> -	pmdp_huge_split_prepare(vma, haddr, pmd);
> +	write = pmd_write(old_pmd);
> +	young = pmd_young(old_pmd);
> +	dirty = pmd_dirty(*pmd);
> +	soft_dirty = pmd_soft_dirty(old_pmd);
> +	/*
> +	 * withdraw the table only after we mark the pmd entry invalid
> +	 */
>  	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
>  	pmd_populate(mm, &_pmd, pgtable);
>  
> @@ -1990,6 +2015,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  			if (soft_dirty)
>  				entry = pte_mksoft_dirty(entry);
>  		}
> +		if (dirty)
> +			SetPageDirty(page + i);
>  		pte = pte_offset_map(&_pmd, addr);
>  		BUG_ON(!pte_none(*pte));
>  		set_pte_at(mm, addr, pte, entry);
> @@ -2017,36 +2044,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  	}
>  
>  	smp_wmb(); /* make pte visible before pmd */
> -	/*
> -	 * Up to this point the pmd is present and huge and userland has the
> -	 * whole access to the hugepage during the split (which happens in
> -	 * place). If we overwrite the pmd with the not-huge version pointing
> -	 * to the pte here (which of course we could if all CPUs were bug
> -	 * free), userland could trigger a small page size TLB miss on the
> -	 * small sized TLB while the hugepage TLB entry is still established in
> -	 * the huge TLB. Some CPU doesn't like that.
> -	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
> -	 * 383 on page 93. Intel should be safe but is also warns that it's
> -	 * only safe if the permission and cache attributes of the two entries
> -	 * loaded in the two TLB is identical (which should be the case here).
> -	 * But it is generally safer to never allow small and huge TLB entries
> -	 * for the same virtual address to be loaded simultaneously. So instead
> -	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
> -	 * current pmd notpresent (atomically because here the pmd_trans_huge
> -	 * and pmd_trans_splitting must remain set at all times on the pmd
> -	 * until the split is complete for this pmd), then we flush the SMP TLB
> -	 * and finally we write the non-huge version of the pmd entry with
> -	 * pmd_populate.
> -	 */
> -	old = pmdp_invalidate(vma, haddr, pmd);
> -
> -	/*
> -	 * Transfer dirty bit using value returned by pmd_invalidate() to be
> -	 * sure we don't race with CPU that can set the bit under us.
> -	 */
> -	if (pmd_dirty(old))
> -		SetPageDirty(page);
> -
>  	pmd_populate(mm, pmd, pgtable);
>  
>  	if (freeze) {
> -- 
> 2.13.3
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>