Re: [PATCH 03/10] mm: Convert p[te|md]_numa users to p[te|md]_protnone_numa

Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx> · Tue, 02 Dec 2014 09:42:42 +1100

On Fri, 2014-11-21 at 13:57 +0000, Mel Gorman wrote:
> Convert existing users of pte_numa and friends to the new helper. Note
> that the kernel is broken after this patch is applied until the other
> page table modifiers are also altered. This patch layout is to make
> review easier.

Aneesh, the removal of the DSISR_PROTFAULT checks, I wonder if we might
break something here ... (I know, I asked for them to be removed :-)

IE, we basically bounce all protection checks to the "normal" VMA
protection checking, so far so good...

But what about the subpage protection stuff ? Will that still work ?

Cheers,
Ben.

> Signed-off-by: Mel Gorman <mgorman@xxxxxxx>
> Acked-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Acked-by: Aneesh Kumar <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
> ---
>  arch/powerpc/kvm/book3s_hv_rm_mmu.c |  2 +-
>  arch/powerpc/mm/fault.c             |  5 -----
>  arch/powerpc/mm/pgtable.c           | 11 ++++++++---
>  arch/powerpc/mm/pgtable_64.c        |  3 ++-
>  arch/x86/mm/gup.c                   |  4 ++--
>  include/uapi/linux/mempolicy.h      |  2 +-
>  mm/gup.c                            | 10 +++++-----
>  mm/huge_memory.c                    | 16 +++++++--------
>  mm/memory.c                         |  4 ++--
>  mm/mprotect.c                       | 39 ++++++++++---------------------------
>  mm/pgtable-generic.c                |  2 +-
>  11 files changed, 40 insertions(+), 58 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> index 084ad54..3e6ad3f 100644
> --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> @@ -235,7 +235,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
>  		pte_size = psize;
>  		pte = lookup_linux_pte_and_update(pgdir, hva, writing,
>  						  &pte_size);
> -		if (pte_present(pte) && !pte_numa(pte)) {
> +		if (pte_present(pte) && !pte_protnone(pte)) {
>  			if (writing && !pte_write(pte))
>  				/* make the actual HPTE be read-only */
>  				ptel = hpte_make_readonly(ptel);
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index eb79907..b434153 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -398,8 +398,6 @@ good_area:
>  		 * processors use the same I/D cache coherency mechanism
>  		 * as embedded.
>  		 */
> -		if (error_code & DSISR_PROTFAULT)
> -			goto bad_area;
>  #endif /* CONFIG_PPC_STD_MMU */
>  
>  		/*
> @@ -423,9 +421,6 @@ good_area:
>  		flags |= FAULT_FLAG_WRITE;
>  	/* a read */
>  	} else {
> -		/* protection fault */
> -		if (error_code & 0x08000000)
> -			goto bad_area;
>  		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
>  			goto bad_area;
>  	}
> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> index c90e602..83dfcb5 100644
> --- a/arch/powerpc/mm/pgtable.c
> +++ b/arch/powerpc/mm/pgtable.c
> @@ -172,9 +172,14 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma,
>  void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
>  		pte_t pte)
>  {
> -#ifdef CONFIG_DEBUG_VM
> -	WARN_ON(pte_val(*ptep) & _PAGE_PRESENT);
> -#endif
> +	/*
> +	 * When handling numa faults, we already have the pte marked
> +	 * _PAGE_PRESENT, but we can be sure that it is not in hpte.
> +	 * Hence we can use set_pte_at for them.
> +	 */
> +	VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) ==
> +		(_PAGE_PRESENT | _PAGE_USER));
> +
>  	/* Note: mm->context.id might not yet have been assigned as
>  	 * this context might not have been activated yet when this
>  	 * is called.
> diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
> index 87ff0c1..435ebf7 100644
> --- a/arch/powerpc/mm/pgtable_64.c
> +++ b/arch/powerpc/mm/pgtable_64.c
> @@ -718,7 +718,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>  		pmd_t *pmdp, pmd_t pmd)
>  {
>  #ifdef CONFIG_DEBUG_VM
> -	WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT);
> +	WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) ==
> +		(_PAGE_PRESENT | _PAGE_USER));
>  	assert_spin_locked(&mm->page_table_lock);
>  	WARN_ON(!pmd_trans_huge(pmd));
>  #endif
> diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
> index 207d9aef..f32e12c 100644
> --- a/arch/x86/mm/gup.c
> +++ b/arch/x86/mm/gup.c
> @@ -84,7 +84,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
>  		struct page *page;
>  
>  		/* Similar to the PMD case, NUMA hinting must take slow path */
> -		if (pte_numa(pte)) {
> +		if (pte_protnone(pte)) {
>  			pte_unmap(ptep);
>  			return 0;
>  		}
> @@ -178,7 +178,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
>  			 * slowpath for accounting purposes and so that they
>  			 * can be serialised against THP migration.
>  			 */
> -			if (pmd_numa(pmd))
> +			if (pmd_protnone(pmd))
>  				return 0;
>  			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
>  				return 0;
> diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
> index 0d11c3d..9cd8b21 100644
> --- a/include/uapi/linux/mempolicy.h
> +++ b/include/uapi/linux/mempolicy.h
> @@ -67,7 +67,7 @@ enum mpol_rebind_step {
>  #define MPOL_F_LOCAL   (1 << 1)	/* preferred local allocation */
>  #define MPOL_F_REBINDING (1 << 2)	/* identify policies in rebinding */
>  #define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */
> -#define MPOL_F_MORON	(1 << 4) /* Migrate On pte_numa Reference On Node */
> +#define MPOL_F_MORON	(1 << 4) /* Migrate On protnone Reference On Node */
>  
> 
>  #endif /* _UAPI_LINUX_MEMPOLICY_H */
> diff --git a/mm/gup.c b/mm/gup.c
> index 0ca1df9..e5dab89 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -64,7 +64,7 @@ retry:
>  		migration_entry_wait(mm, pmd, address);
>  		goto retry;
>  	}
> -	if ((flags & FOLL_NUMA) && pte_numa(pte))
> +	if ((flags & FOLL_NUMA) && pte_protnone(pte))
>  		goto no_page;
>  	if ((flags & FOLL_WRITE) && !pte_write(pte)) {
>  		pte_unmap_unlock(ptep, ptl);
> @@ -193,7 +193,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
>  		}
>  		return page;
>  	}
> -	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
> +	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
>  		return no_page_table(vma, flags);
>  	if (pmd_trans_huge(*pmd)) {
>  		if (flags & FOLL_SPLIT) {
> @@ -740,10 +740,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
>  
>  		/*
>  		 * Similar to the PMD case below, NUMA hinting must take slow
> -		 * path
> +		 * path using the pte_protnone check.
>  		 */
>  		if (!pte_present(pte) || pte_special(pte) ||
> -			pte_numa(pte) || (write && !pte_write(pte)))
> +			pte_protnone(pte) || (write && !pte_write(pte)))
>  			goto pte_unmap;
>  
>  		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
> @@ -938,7 +938,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
>  			 * slowpath for accounting purposes and so that they
>  			 * can be serialised against THP migration.
>  			 */
> -			if (pmd_numa(pmd))
> +			if (pmd_protnone(pmd))
>  				return 0;
>  
>  			if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index a2cd021..f81fddf 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1222,7 +1222,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
>  		return ERR_PTR(-EFAULT);
>  
>  	/* Full NUMA hinting faults to serialise migration in fault paths */
> -	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
> +	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
>  		goto out;
>  
>  	page = pmd_page(*pmd);
> @@ -1353,7 +1353,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  
>  	/*
>  	 * Migrate the THP to the requested node, returns with page unlocked
> -	 * and pmd_numa cleared.
> +	 * and access rights restored.
>  	 */
>  	spin_unlock(ptl);
>  	migrated = migrate_misplaced_transhuge_page(mm, vma,
> @@ -1368,7 +1368,7 @@ clear_pmdnuma:
>  	BUG_ON(!PageLocked(page));
>  	pmd = pmd_mknonnuma(pmd);
>  	set_pmd_at(mm, haddr, pmdp, pmd);
> -	VM_BUG_ON(pmd_numa(*pmdp));
> +	VM_BUG_ON(pmd_protnone(*pmdp));
>  	update_mmu_cache_pmd(vma, addr, pmdp);
>  	unlock_page(page);
>  out_unlock:
> @@ -1514,7 +1514,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  		ret = 1;
>  		if (!prot_numa) {
>  			entry = pmdp_get_and_clear_notify(mm, addr, pmd);
> -			if (pmd_numa(entry))
> +			if (pmd_protnone(entry))
>  				entry = pmd_mknonnuma(entry);
>  			entry = pmd_modify(entry, newprot);
>  			ret = HPAGE_PMD_NR;
> @@ -1530,7 +1530,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  			 * local vs remote hits on the zero page.
>  			 */
>  			if (!is_huge_zero_page(page) &&
> -			    !pmd_numa(*pmd)) {
> +			    !pmd_protnone(*pmd)) {
>  				pmdp_set_numa(mm, addr, pmd);
>  				ret = HPAGE_PMD_NR;
>  			}
> @@ -1798,9 +1798,9 @@ static int __split_huge_page_map(struct page *page,
>  			pte_t *pte, entry;
>  			BUG_ON(PageCompound(page+i));
>  			/*
> -			 * Note that pmd_numa is not transferred deliberately
> -			 * to avoid any possibility that pte_numa leaks to
> -			 * a PROT_NONE VMA by accident.
> +			 * Note that NUMA hinting access restrictions are not
> +			 * transferred to avoid any possibility of altering
> +			 * permissions across VMAs.
>  			 */
>  			entry = mk_pte(page + i, vma->vm_page_prot);
>  			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> diff --git a/mm/memory.c b/mm/memory.c
> index ae923f5..eaa46f1 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3214,7 +3214,7 @@ static int handle_pte_fault(struct mm_struct *mm,
>  					pte, pmd, flags, entry);
>  	}
>  
> -	if (pte_numa(entry))
> +	if (pte_protnone(entry))
>  		return do_numa_page(mm, vma, address, entry, pte, pmd);
>  
>  	ptl = pte_lockptr(mm, pmd);
> @@ -3292,7 +3292,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>  			if (pmd_trans_splitting(orig_pmd))
>  				return 0;
>  
> -			if (pmd_numa(orig_pmd))
> +			if (pmd_protnone(orig_pmd))
>  				return do_huge_pmd_numa_page(mm, vma, address,
>  							     orig_pmd, pmd);
>  
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index ace9345..e93ddac 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -75,36 +75,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
>  		oldpte = *pte;
>  		if (pte_present(oldpte)) {
>  			pte_t ptent;
> -			bool updated = false;
> -
> -			if (!prot_numa) {
> -				ptent = ptep_modify_prot_start(mm, addr, pte);
> -				if (pte_numa(ptent))
> -					ptent = pte_mknonnuma(ptent);
> -				ptent = pte_modify(ptent, newprot);
> -				/*
> -				 * Avoid taking write faults for pages we
> -				 * know to be dirty.
> -				 */
> -				if (dirty_accountable && pte_dirty(ptent) &&
> -				    (pte_soft_dirty(ptent) ||
> -				     !(vma->vm_flags & VM_SOFTDIRTY)))
> -					ptent = pte_mkwrite(ptent);
> -				ptep_modify_prot_commit(mm, addr, pte, ptent);
> -				updated = true;
> -			} else {
> -				struct page *page;
> -
> -				page = vm_normal_page(vma, addr, oldpte);
> -				if (page && !PageKsm(page)) {
> -					if (!pte_numa(oldpte)) {
> -						ptep_set_numa(mm, addr, pte);
> -						updated = true;
> -					}
> -				}
> +			ptent = ptep_modify_prot_start(mm, addr, pte);
> +			ptent = pte_modify(ptent, newprot);
> +
> +			/* Avoid taking write faults for known dirty pages */
> +			if (dirty_accountable && pte_dirty(ptent) &&
> +					(pte_soft_dirty(ptent) ||
> +					 !(vma->vm_flags & VM_SOFTDIRTY))) {
> +				ptent = pte_mkwrite(ptent);
>  			}
> -			if (updated)
> -				pages++;
> +			ptep_modify_prot_commit(mm, addr, pte, ptent);
> +			pages++;
>  		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
>  			swp_entry_t entry = pte_to_swp_entry(oldpte);
>  
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index dfb79e0..4b8ad76 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -193,7 +193,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
>  		     pmd_t *pmdp)
>  {
>  	pmd_t entry = *pmdp;
> -	if (pmd_numa(entry))
> +	if (pmd_protnone(entry))
>  		entry = pmd_mknonnuma(entry);
>  	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
>  	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx";> email@xxxxxxxxx </a>