Re: [PATCH 06/12] huge_memory: Allow mappings of PUD sized pages

Alistair Popple <apopple@xxxxxxxxxx> · Mon, 14 Oct 2024 17:33:44 +1100

Dan Williams <dan.j.williams@xxxxxxxxx> writes:

> Alistair Popple wrote:
>> Currently DAX folio/page reference counts are managed differently to
>> normal pages. To allow these to be managed the same as normal pages
>> introduce dax_insert_pfn_pud. This will map the entire PUD-sized folio
>> and take references as it would for a normally mapped page.
>> 
>> This is distinct from the current mechanism, vmf_insert_pfn_pud, which
>> simply inserts a special devmap PUD entry into the page table without
>> holding a reference to the page for the mapping.
>
> This is missing some description or comment in the code about the
> differences. More questions below:
>
>> Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
>> ---
>>  include/linux/huge_mm.h |  4 ++-
>>  include/linux/rmap.h    | 15 +++++++-
>>  mm/huge_memory.c        | 93 ++++++++++++++++++++++++++++++++++++------
>>  mm/rmap.c               | 49 ++++++++++++++++++++++-
>>  4 files changed, 149 insertions(+), 12 deletions(-)
>> 
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 6370026..d3a1872 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -40,6 +40,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>  
>>  vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
>>  vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
>> +vm_fault_t dax_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
>>  
>>  enum transparent_hugepage_flag {
>>  	TRANSPARENT_HUGEPAGE_UNSUPPORTED,
>> @@ -114,6 +115,9 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr;
>>  #define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
>>  #define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
>>  
>> +#define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT)
>> +#define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER)
>> +
>>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>  
>>  extern unsigned long transparent_hugepage_flags;
>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>> index 91b5935..c465694 100644
>> --- a/include/linux/rmap.h
>> +++ b/include/linux/rmap.h
>> @@ -192,6 +192,7 @@ typedef int __bitwise rmap_t;
>>  enum rmap_level {
>>  	RMAP_LEVEL_PTE = 0,
>>  	RMAP_LEVEL_PMD,
>> +	RMAP_LEVEL_PUD,
>>  };
>>  
>>  static inline void __folio_rmap_sanity_checks(struct folio *folio,
>> @@ -228,6 +229,14 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio,
>>  		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
>>  		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
>>  		break;
>> +	case RMAP_LEVEL_PUD:
>> +		/*
>> +		 * Asume that we are creating * a single "entire" mapping of the
>> +		 * folio.
>> +		 */
>> +		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
>> +		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
>> +		break;
>>  	default:
>>  		VM_WARN_ON_ONCE(true);
>>  	}
>> @@ -251,12 +260,16 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
>>  	folio_add_file_rmap_ptes(folio, page, 1, vma)
>>  void folio_add_file_rmap_pmd(struct folio *, struct page *,
>>  		struct vm_area_struct *);
>> +void folio_add_file_rmap_pud(struct folio *, struct page *,
>> +		struct vm_area_struct *);
>>  void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
>>  		struct vm_area_struct *);
>>  #define folio_remove_rmap_pte(folio, page, vma) \
>>  	folio_remove_rmap_ptes(folio, page, 1, vma)
>>  void folio_remove_rmap_pmd(struct folio *, struct page *,
>>  		struct vm_area_struct *);
>> +void folio_remove_rmap_pud(struct folio *, struct page *,
>> +		struct vm_area_struct *);
>>  
>>  void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
>>  		unsigned long address, rmap_t flags);
>> @@ -341,6 +354,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
>>  		atomic_add(orig_nr_pages, &folio->_large_mapcount);
>>  		break;
>>  	case RMAP_LEVEL_PMD:
>> +	case RMAP_LEVEL_PUD:
>>  		atomic_inc(&folio->_entire_mapcount);
>>  		atomic_inc(&folio->_large_mapcount);
>>  		break;
>> @@ -437,6 +451,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
>>  		atomic_add(orig_nr_pages, &folio->_large_mapcount);
>>  		break;
>>  	case RMAP_LEVEL_PMD:
>> +	case RMAP_LEVEL_PUD:
>>  		if (PageAnonExclusive(page)) {
>>  			if (unlikely(maybe_pinned))
>>  				return -EBUSY;
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index c4b45ad..e8985a4 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -1336,21 +1336,19 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
>>  	struct mm_struct *mm = vma->vm_mm;
>>  	pgprot_t prot = vma->vm_page_prot;
>>  	pud_t entry;
>> -	spinlock_t *ptl;
>>  
>> -	ptl = pud_lock(mm, pud);
>>  	if (!pud_none(*pud)) {
>>  		if (write) {
>>  			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
>>  				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
>> -				goto out_unlock;
>> +				return;
>>  			}
>>  			entry = pud_mkyoung(*pud);
>>  			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
>>  			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
>>  				update_mmu_cache_pud(vma, addr, pud);
>>  		}
>> -		goto out_unlock;
>> +		return;
>>  	}
>>  
>>  	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
>> @@ -1362,9 +1360,6 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
>>  	}
>>  	set_pud_at(mm, addr, pud, entry);
>>  	update_mmu_cache_pud(vma, addr, pud);
>> -
>> -out_unlock:
>> -	spin_unlock(ptl);
>>  }
>>  
>>  /**
>> @@ -1382,6 +1377,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
>>  	unsigned long addr = vmf->address & PUD_MASK;
>>  	struct vm_area_struct *vma = vmf->vma;
>>  	pgprot_t pgprot = vma->vm_page_prot;
>> +	spinlock_t *ptl;
>>  
>>  	/*
>>  	 * If we had pud_special, we could avoid all these restrictions,
>> @@ -1399,10 +1395,52 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
>>  
>>  	track_pfn_insert(vma, &pgprot, pfn);
>>  
>> +	ptl = pud_lock(vma->vm_mm, vmf->pud);
>>  	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
>> +	spin_unlock(ptl);
>> +
>>  	return VM_FAULT_NOPAGE;
>>  }
>>  EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
>> +
>> +/**
>> + * dax_insert_pfn_pud - insert a pud size pfn backed by a normal page
>> + * @vmf: Structure describing the fault
>> + * @pfn: pfn of the page to insert
>> + * @write: whether it's a write fault
>
> It strikes me that this documentation is not useful for recalling why
> both vmf_insert_pfn_pud() and dax_insert_pfn_pud() exist. It looks like
> the only difference is that the "dax_" flavor takes a reference on the
> page. So maybe all these dax_insert_pfn{,_pmd,_pud} helpers should be
> unified in a common vmf_insert_page() entry point where the caller is
> responsible for initializing the compound page metadata before calling
> the helper?

Honestly I'm impressed I wrote documentation at all :-) Even if it was
just a terrible cut-and-paste job. What exactly do you mean by
"initializing the compound page metadata" though? I assume this bit:

	folio_get(folio);
	folio_add_file_rmap_pud(folio, page, vma);
	add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);

Problem with doing that in the caller is that at the very least
folio_add_file_rmap_*() calls need to happen under PTL. Of course the
point on lack of documentation still stands, and as an aside my original
plan was to remove the vmf_insert_pfn* variants as dead-code once DAX
stopped using them, but Peter pointed out this series which added some
callers:

https://lore.kernel.org/all/20240826204353.2228736-20-peterx@xxxxxxxxxx/T/

>> + *
>> + * Return: vm_fault_t value.
>> + */
>> +vm_fault_t dax_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	unsigned long addr = vmf->address & PUD_MASK;
>> +	pud_t *pud = vmf->pud;
>> +	pgprot_t prot = vma->vm_page_prot;
>> +	struct mm_struct *mm = vma->vm_mm;
>> +	spinlock_t *ptl;
>> +	struct folio *folio;
>> +	struct page *page;
>> +
>> +	if (addr < vma->vm_start || addr >= vma->vm_end)
>> +		return VM_FAULT_SIGBUS;
>> +
>> +	track_pfn_insert(vma, &prot, pfn);
>> +
>> +	ptl = pud_lock(mm, pud);
>> +	if (pud_none(*vmf->pud)) {
>> +		page = pfn_t_to_page(pfn);
>> +		folio = page_folio(page);
>> +		folio_get(folio);
>> +		folio_add_file_rmap_pud(folio, page, vma);
>> +		add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
>> +	}
>> +	insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
>> +	spin_unlock(ptl);
>> +
>> +	return VM_FAULT_NOPAGE;
>> +}
>> +EXPORT_SYMBOL_GPL(dax_insert_pfn_pud);
>>  #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>>  
>>  void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
>> @@ -1947,7 +1985,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>  			zap_deposited_table(tlb->mm, pmd);
>>  		spin_unlock(ptl);
>>  	} else if (is_huge_zero_pmd(orig_pmd)) {
>> -		zap_deposited_table(tlb->mm, pmd);
>> +		if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
>> +			zap_deposited_table(tlb->mm, pmd);
>
> This looks subtle to me. Why is it needed to skip zap_deposited_table()
> (I assume it is some THP assumption about the page being from the page
> allocator)? Why is it ok to to force the zap if the arch demands it?

We don't skip it - it happens in the "else" clause if required. Note
that vma_is_special_huge(dax_vma) == True. So the checks are to maintain
existing behaviour - previously a DAX VMA would take this path:

        if (vma_is_special_huge(vma)) {
                if (arch_needs_pgtable_deposit())
                        zap_deposited_table(tlb->mm, pmd);
                spin_unlock(ptl);

Now that DAX pages are treated normally we need to take the "else"
clause. So in the case of a zero pmd we only zap_deposited_table() if it
is not a DAX VMA or if it is a DAX VMA and the arch requires it, which is
the same as what would happen previously.

Of course that's not to say the previous behaviour was correct, I am far
from an expert on the pgtable deposit/withdraw code.

>>  		spin_unlock(ptl);
>>  	} else {
>>  		struct folio *folio = NULL;
>> @@ -2435,12 +2474,24 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>  	orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
>>  	arch_check_zapped_pud(vma, orig_pud);
>>  	tlb_remove_pud_tlb_entry(tlb, pud, addr);
>> -	if (vma_is_special_huge(vma)) {
>> +	if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
>
> If vma_is_special_huge() is true vma_is_dax() will always be false, so
> not clear to me why this check is combined?

Because that's not true:

static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
{
	return vma_is_dax(vma) || (vma->vm_file &&
				   (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
}

I suppose there is a good argument to change that now though. Thoughts?

>>  		spin_unlock(ptl);
>>  		/* No zero page support yet */
>>  	} else {
>> -		/* No support for anonymous PUD pages yet */
>> -		BUG();
>> +		struct page *page = NULL;
>> +		struct folio *folio;
>> +
>> +		/* No support for anonymous PUD pages or migration yet */
>> +		BUG_ON(vma_is_anonymous(vma) || !pud_present(orig_pud));
>> +
>> +		page = pud_page(orig_pud);
>> +		folio = page_folio(page);
>> +		folio_remove_rmap_pud(folio, page, vma);
>> +		VM_BUG_ON_PAGE(!PageHead(page), page);
>> +		add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
>> +
>> +		spin_unlock(ptl);
>> +		tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
>>  	}
>>  	return 1;
>>  }
>> @@ -2448,6 +2499,8 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>  static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
>>  		unsigned long haddr)
>>  {
>> +	pud_t old_pud;
>> +
>>  	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
>>  	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>>  	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
>> @@ -2455,7 +2508,23 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
>>  
>>  	count_vm_event(THP_SPLIT_PUD);
>>  
>> -	pudp_huge_clear_flush(vma, haddr, pud);
>> +	old_pud = pudp_huge_clear_flush(vma, haddr, pud);
>> +	if (is_huge_zero_pud(old_pud))
>> +		return;
>> +
>> +	if (vma_is_dax(vma)) {
>> +		struct page *page = pud_page(old_pud);
>> +		struct folio *folio = page_folio(page);
>> +
>> +		if (!folio_test_dirty(folio) && pud_dirty(old_pud))
>> +			folio_mark_dirty(folio);
>> +		if (!folio_test_referenced(folio) && pud_young(old_pud))
>> +			folio_set_referenced(folio);
>> +		folio_remove_rmap_pud(folio, page, vma);
>> +		folio_put(folio);
>> +		add_mm_counter(vma->vm_mm, mm_counter_file(folio),
>> +			-HPAGE_PUD_NR);
>> +	}
>
> So this does not split anything (no follow-on set_ptes()) it just clears
> and updates some folio metadata. Something is wrong if we get this far
> since the only dax mechanism that attempts PUD mappings is device-dax,
> and device-dax is not prepared for PUD mappings to be fractured.

Ok. Current upstream will just clear them though, and this just
maintains that behaviour along with updating metadata. So are you saying
that we shouldn't be able to get here? Or that just clearing the PUD is
wrong? Because unless I've missed something I think we have been able to
get here since support for transparent PUD with DAX was added.

> Peter Xu recently fixed mprotect() vs DAX PUD mappings, I need to check
> how that interacts with this.