Re: [PATCH v3 3/3] mm: optimise pte dirty/accessed bit setting by demand based pte insertion

Hi Nick,

On Sun, 20 Dec 2020, Nicholas Piggin wrote:

> Similarly to the previous patch, this tries to optimise dirty/accessed
> bits in ptes to avoid access costs of hardware setting them.
> 
> This tidies up a few last cases where dirty/accessed faults can be seen,
> and subsumes the pte_sw_mkyoung helper -- it's not just architectures
> with explicit software dirty/accessed bits that take expensive faults to
> modify ptes.
> 
> The vast majority of the remaining dirty/accessed faults on kbuild
> workloads after this patch are from NUMA migration, due to
> remove_migration_pte inserting old/clean ptes.

Are you sure about this patch? It looks wrong to me: isn't
_PAGE_ACCESSED (young) already included in vm_page_prot, via __S001
and the other protection_map entries?

I haven't checked every instance below, but in general that's why the
existing code tends not to bother with mkyoung, yet does sometimes
mkold (admittedly confusing).
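
For example, fork deliberately hands the child old ptes -- a minimal
sketch, from memory, of copy_present_pte() in mm/memory.c (do check
the details against the tree):

	/* mark a shared mapping clean in the child */
	if (vm_flags & VM_SHARED)
		pte = pte_mkclean(pte);
	/* and deliberately make the child's pte old */
	pte = pte_mkold(pte);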

A quick check suggests that x86 and powerpc do have _PAGE_ACCESSED in
there; and IIRC that's true, or ought to be true, of all architectures.
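
To spell out what I mean, a simplified sketch from memory of the x86
side (arch/x86/include/asm/pgtable_types.h and mm/mmap.c -- do check
the details):

#define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
#define PAGE_READONLY	__pgprot(_PAGE_PRESENT | _PAGE_USER |	\
				 _PAGE_ACCESSED | _PAGE_NX)
#define PAGE_SHARED	__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
				 _PAGE_ACCESSED | _PAGE_NX)

#define __P001	PAGE_READONLY
#define __S001	PAGE_READONLY
/* ... the other __P/__S entries all carry _PAGE_ACCESSED too ... */

/* mm/mmap.c: vm_get_page_prot() derives vma->vm_page_prot from this */
pgprot_t protection_map[16] = {
	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};

So on x86, a pte built by mk_pte(page, vma->vm_page_prot) should
already come out pte_young(), and the pte_mkyoung()s added below would
be no-ops there.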

Maybe not mips, which I see you have singled out: I remember going round
this loop a few months ago with mips, where strange changes were being
proposed to compensate for not having that bit in their vm_page_prot.
I didn't follow through to see how that ended up, but I did suggest
mips needed to follow the same convention as the other architectures.
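
If I remember the mips side right (a heavily simplified sketch of
setup_protection_map() in arch/mips/mm/cache.c -- I may well have the
exact flags wrong), it fills protection_map at boot and leaves
_PAGE_ACCESSED out, which is what pte_sw_mkyoung was papering over:

static inline void setup_protection_map(void)
{
	/* note: no _PAGE_ACCESSED, unlike the other architectures */
	protection_map[0] = __pgprot(_page_cachable_default | _PAGE_PRESENT |
				     _PAGE_NO_EXEC | _PAGE_NO_READ);
	protection_map[1] = __pgprot(_page_cachable_default | _PAGE_PRESENT |
				     _PAGE_NO_EXEC);
	/* ... remaining __P/__S slots filled likewise ... */
}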

Hugh

> 
> Signed-off-by: Nicholas Piggin <npiggin@xxxxxxxxx>
> ---
>  arch/mips/include/asm/pgtable.h |  2 --
>  include/linux/pgtable.h         | 16 ----------------
>  mm/huge_memory.c                |  4 ++--
>  mm/memory.c                     | 14 +++++++-------
>  mm/migrate.c                    |  1 +
>  mm/shmem.c                      |  1 +
>  mm/userfaultfd.c                |  2 +-
>  7 files changed, 12 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
> index 4f9c37616d42..3275495adccb 100644
> --- a/arch/mips/include/asm/pgtable.h
> +++ b/arch/mips/include/asm/pgtable.h
> @@ -406,8 +406,6 @@ static inline pte_t pte_mkyoung(pte_t pte)
>  	return pte;
>  }
>  
> -#define pte_sw_mkyoung	pte_mkyoung
> -
>  #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
>  static inline int pte_huge(pte_t pte)	{ return pte_val(pte) & _PAGE_HUGE; }
>  
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index 8fcdfa52eb4b..70d04931dff4 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -424,22 +424,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
>  }
>  #endif
>  
> -/*
> - * On some architectures hardware does not set page access bit when accessing
> - * memory page, it is responsibilty of software setting this bit. It brings
> - * out extra page fault penalty to track page access bit. For optimization page
> - * access bit can be set during all page fault flow on these arches.
> - * To be differentiate with macro pte_mkyoung, this macro is used on platforms
> - * where software maintains page access bit.
> - */
> -#ifndef pte_sw_mkyoung
> -static inline pte_t pte_sw_mkyoung(pte_t pte)
> -{
> -	return pte;
> -}
> -#define pte_sw_mkyoung	pte_sw_mkyoung
> -#endif
> -
>  #ifndef pte_savedwrite
>  #define pte_savedwrite pte_write
>  #endif
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f2ca0326b5af..f6719312dc27 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2151,8 +2151,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>  			entry = maybe_mkwrite(entry, vma);
>  			if (!write)
>  				entry = pte_wrprotect(entry);
> -			if (!young)
> -				entry = pte_mkold(entry);
> +			if (young)
> +				entry = pte_mkyoung(entry);
>  			if (soft_dirty)
>  				entry = pte_mksoft_dirty(entry);
>  			if (uffd_wp)
> diff --git a/mm/memory.c b/mm/memory.c
> index dd1f364d8ca3..4cebba596660 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1639,7 +1639,7 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
>  	get_page(page);
>  	inc_mm_counter_fast(mm, mm_counter_file(page));
>  	page_add_file_rmap(page, false);
> -	set_pte_at(mm, addr, pte, mk_pte(page, prot));
> +	set_pte_at(mm, addr, pte, pte_mkyoung(mk_pte(page, prot)));
>  	return 0;
>  }
>  
> @@ -1954,10 +1954,9 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>  	else
>  		entry = pte_mkspecial(pfn_t_pte(pfn, prot));
>  
> -	if (mkwrite) {
> -		entry = pte_mkyoung(entry);
> +	entry = pte_mkyoung(entry);
> +	if (mkwrite)
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> -	}
>  
>  	set_pte_at(mm, addr, pte, entry);
>  	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
> @@ -2889,7 +2888,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>  		}
>  		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
>  		entry = mk_pte(new_page, vma->vm_page_prot);
> -		entry = pte_sw_mkyoung(entry);
> +		entry = pte_mkyoung(entry);
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>  		/*
>  		 * Clear the pte entry and flush it first, before updating the
> @@ -3402,6 +3401,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
>  	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
>  	pte = mk_pte(page, vma->vm_page_prot);
> +	pte = pte_mkyoung(pte);
>  	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
>  		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
>  		vmf->flags &= ~FAULT_FLAG_WRITE;
> @@ -3545,7 +3545,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
>  	__SetPageUptodate(page);
>  
>  	entry = mk_pte(page, vma->vm_page_prot);
> -	entry = pte_sw_mkyoung(entry);
> +	entry = pte_mkyoung(entry);
>  	if (vma->vm_flags & VM_WRITE)
>  		entry = pte_mkwrite(pte_mkdirty(entry));
>  
> @@ -3821,7 +3821,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
>  
>  	flush_icache_page(vma, page);
>  	entry = mk_pte(page, vma->vm_page_prot);
> -	entry = pte_sw_mkyoung(entry);
> +	entry = pte_mkyoung(entry);
>  	if (write)
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>  	/* copy-on-write page */
> diff --git a/mm/migrate.c b/mm/migrate.c
> index ee5e612b4cd8..d33b2bfc846b 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -2963,6 +2963,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
>  		}
>  	} else {
>  		entry = mk_pte(page, vma->vm_page_prot);
> +		entry = pte_mkyoung(entry);
>  		if (vma->vm_flags & VM_WRITE)
>  			entry = pte_mkwrite(pte_mkdirty(entry));
>  	}
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 7c6b6d8f6c39..4f23b16d6baf 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2420,6 +2420,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
>  		goto out_release;
>  
>  	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
> +	_dst_pte = pte_mkyoung(_dst_pte);
>  	if (dst_vma->vm_flags & VM_WRITE)
>  		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
>  	else {
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 9a3d451402d7..56c44aa06a7e 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -99,7 +99,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
>  	if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
>  		goto out_release;
>  
> -	_dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
> +	_dst_pte = pte_mkdirty(pte_mkyoung(mk_pte(page, dst_vma->vm_page_prot)));
>  	if (dst_vma->vm_flags & VM_WRITE) {
>  		if (wp_copy)
>  			_dst_pte = pte_mkuffd_wp(_dst_pte);
> -- 
> 2.23.0
