Re: [PATCH v2] mm: make folio_pte_batch available outside of mm/memory.c

On 27/02/2024 10:42, Barry Song wrote:
> From: Barry Song <v-songbaohua@xxxxxxxx>
> 
> madvise, mprotect and some other callers might need folio_pte_batch to check
> whether a range of PTEs is completely mapped to a large folio with contiguous
> physical addresses. Let's make it available in mm/internal.h.
> 
> Suggested-by: David Hildenbrand <david@xxxxxxxxxx>
> Cc: Lance Yang <ioworker0@xxxxxxxxx>
> Cc: Ryan Roberts <ryan.roberts@xxxxxxx>
> Cc: Yin Fengwei <fengwei.yin@xxxxxxxxx>
> [david@xxxxxxxxxx: improve the doc for the exported func]
> Signed-off-by: David Hildenbrand <david@xxxxxxxxxx>
> Signed-off-by: Barry Song <v-songbaohua@xxxxxxxx>

Reviewed-by: Ryan Roberts <ryan.roberts@xxxxxxx>

> ---
>  -v2:
>  * inline folio_pte_batch, as suggested by Ryan and David;
>  * improve the doc, thanks to David's work on it;
>  * fix David's tags and add David's s-o-b;
>  -v1:
>  https://lore.kernel.org/all/20240227024050.244567-1-21cnbao@xxxxxxxxx/
> 
>  mm/internal.h | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  mm/memory.c   | 76 -------------------------------------------
>  2 files changed, 90 insertions(+), 76 deletions(-)
> 
> diff --git a/mm/internal.h b/mm/internal.h
> index 13b59d384845..fa9e2f7db506 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -83,6 +83,96 @@ static inline void *folio_raw_mapping(struct folio *folio)
>  	return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
>  }
>  
> +/* Flags for folio_pte_batch(). */
> +typedef int __bitwise fpb_t;
> +
> +/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
> +#define FPB_IGNORE_DIRTY		((__force fpb_t)BIT(0))
> +
> +/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
> +#define FPB_IGNORE_SOFT_DIRTY		((__force fpb_t)BIT(1))
> +
> +static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
> +{
> +	if (flags & FPB_IGNORE_DIRTY)
> +		pte = pte_mkclean(pte);
> +	if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
> +		pte = pte_clear_soft_dirty(pte);
> +	return pte_wrprotect(pte_mkold(pte));
> +}
> +
> +/**
> + * folio_pte_batch - detect a PTE batch for a large folio
> + * @folio: The large folio to detect a PTE batch for.
> + * @addr: The user virtual address the first page is mapped at.
> + * @start_ptep: Page table pointer for the first entry.
> + * @pte: Page table entry for the first page.
> + * @max_nr: The maximum number of table entries to consider.
> + * @flags: Flags to modify the PTE batch semantics.
> + * @any_writable: Optional pointer to indicate whether any entry except the
> + *		  first one is writable.
> + *
> + * Detect a PTE batch: consecutive (present) PTEs that map consecutive
> + * pages of the same large folio.
> + *
> + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
> + * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
> + * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
> + *
> + * start_ptep must map any page of the folio. max_nr must be at least one and
> + * must be limited by the caller so scanning cannot exceed a single page table.
> + *
> + * Return: the number of table entries in the batch.
> + */
> +static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
> +		pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
> +		bool *any_writable)
> +{
> +	unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
> +	const pte_t *end_ptep = start_ptep + max_nr;
> +	pte_t expected_pte, *ptep;
> +	bool writable;
> +	int nr;
> +
> +	if (any_writable)
> +		*any_writable = false;
> +
> +	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
> +	VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
> +	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
> +
> +	nr = pte_batch_hint(start_ptep, pte);
> +	expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
> +	ptep = start_ptep + nr;
> +
> +	while (ptep < end_ptep) {
> +		pte = ptep_get(ptep);
> +		if (any_writable)
> +			writable = !!pte_write(pte);
> +		pte = __pte_batch_clear_ignored(pte, flags);
> +
> +		if (!pte_same(pte, expected_pte))
> +			break;
> +
> +		/*
> +		 * Stop immediately once we reached the end of the folio. In
> +		 * corner cases the next PFN might fall into a different
> +		 * folio.
> +		 */
> +		if (pte_pfn(pte) >= folio_end_pfn)
> +			break;
> +
> +		if (any_writable)
> +			*any_writable |= writable;
> +
> +		nr = pte_batch_hint(ptep, pte);
> +		expected_pte = pte_advance_pfn(expected_pte, nr);
> +		ptep += nr;
> +	}
> +
> +	return min(ptep - start_ptep, max_nr);
> +}
> +
>  void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
>  						int nr_throttled);
>  static inline void acct_reclaim_writeback(struct folio *folio)
> diff --git a/mm/memory.c b/mm/memory.c
> index 1c45b6a42a1b..a7bcc39de56b 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -953,82 +953,6 @@ static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
>  	set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
>  }
>  
> -/* Flags for folio_pte_batch(). */
> -typedef int __bitwise fpb_t;
> -
> -/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
> -#define FPB_IGNORE_DIRTY		((__force fpb_t)BIT(0))
> -
> -/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
> -#define FPB_IGNORE_SOFT_DIRTY		((__force fpb_t)BIT(1))
> -
> -static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
> -{
> -	if (flags & FPB_IGNORE_DIRTY)
> -		pte = pte_mkclean(pte);
> -	if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
> -		pte = pte_clear_soft_dirty(pte);
> -	return pte_wrprotect(pte_mkold(pte));
> -}
> -
> -/*
> - * Detect a PTE batch: consecutive (present) PTEs that map consecutive
> - * pages of the same folio.
> - *
> - * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
> - * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
> - * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
> - *
> - * If "any_writable" is set, it will indicate if any other PTE besides the
> - * first (given) PTE is writable.
> - */
> -static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
> -		pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
> -		bool *any_writable)
> -{
> -	unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
> -	const pte_t *end_ptep = start_ptep + max_nr;
> -	pte_t expected_pte, *ptep;
> -	bool writable;
> -	int nr;
> -
> -	if (any_writable)
> -		*any_writable = false;
> -
> -	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
> -
> -	nr = pte_batch_hint(start_ptep, pte);
> -	expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
> -	ptep = start_ptep + nr;
> -
> -	while (ptep < end_ptep) {
> -		pte = ptep_get(ptep);
> -		if (any_writable)
> -			writable = !!pte_write(pte);
> -		pte = __pte_batch_clear_ignored(pte, flags);
> -
> -		if (!pte_same(pte, expected_pte))
> -			break;
> -
> -		/*
> -		 * Stop immediately once we reached the end of the folio. In
> -		 * corner cases the next PFN might fall into a different
> -		 * folio.
> -		 */
> -		if (pte_pfn(pte) >= folio_end_pfn)
> -			break;
> -
> -		if (any_writable)
> -			*any_writable |= writable;
> -
> -		nr = pte_batch_hint(ptep, pte);
> -		expected_pte = pte_advance_pfn(expected_pte, nr);
> -		ptep += nr;
> -	}
> -
> -	return min(ptep - start_ptep, max_nr);
> -}
> -
>  /*
>   * Copy one present PTE, trying to batch-process subsequent PTEs that map
>   * consecutive pages of the same folio by copying them as well.
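
As an aside for anyone wanting to try the now-exported helper: below is a
rough, hypothetical sketch of how a PTE-walk caller (e.g. a future madvise
change) might use it. The surrounding walk, and names such as vma, addr, end
and ptep, are assumptions for illustration only, not taken from this patch.

	/*
	 * Hypothetical caller sketch: somewhere inside a PTE-mapped walk,
	 * with vma/addr/end/ptep set up and the page table lock held.
	 */
	pte_t ptent = ptep_get(ptep);
	struct folio *folio;
	int max_nr, nr = 1;

	if (pte_present(ptent)) {
		folio = vm_normal_folio(vma, addr, ptent);
		if (folio && folio_test_large(folio)) {
			/* Never let the batch cross this page table. */
			max_nr = (end - addr) >> PAGE_SHIFT;
			nr = folio_pte_batch(folio, addr, ptep, ptent, max_nr,
					     FPB_IGNORE_DIRTY |
					     FPB_IGNORE_SOFT_DIRTY, NULL);
			/*
			 * nr consecutive PTEs map consecutive pages of this
			 * large folio; the caller can process the batch and
			 * advance addr/ptep by nr entries in one go.
			 */
		}
	}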




