Re: [PATCH 05/12] mm/memory: Add dax_insert_pfn

Dan Williams <dan.j.williams@xxxxxxxxx> · Sun, 22 Sep 2024 03:41:57 +0200

[ add s390 folks to comment on CONFIG_FS_DAX_LIMITED ]

Alistair Popple wrote:
> Currently to map a DAX page the DAX driver calls vmf_insert_pfn. This
> creates a special devmap PTE entry for the pfn but does not take a
> reference on the underlying struct page for the mapping. This is
> because DAX page refcounts are treated specially, as indicated by the
> presence of a devmap entry.
> 
> To allow DAX page refcounts to be managed the same as normal page
> refcounts introduce dax_insert_pfn. This will take a reference on the
> underlying page much the same as vmf_insert_page, except it also
> permits upgrading an existing mapping to be writable if
> requested/possible.
> 
> Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
> 
> ---
> 
> Updates from v1:
> 
>  - Re-arrange code in insert_page_into_pte_locked() based on comments
>    from Jan Kara.
> 
>  - Call mkdrity/mkyoung for the mkwrite case, also suggested by Jan.
> ---
>  include/linux/mm.h |  1 +-
>  mm/memory.c        | 83 ++++++++++++++++++++++++++++++++++++++++++-----
>  2 files changed, 76 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index b0ff06d..ae6d713 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3463,6 +3463,7 @@ int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
>  				unsigned long num);
>  int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
>  				unsigned long num);
> +vm_fault_t dax_insert_pfn(struct vm_fault *vmf, pfn_t pfn_t, bool write);
>  vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>  			unsigned long pfn);
>  vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
> diff --git a/mm/memory.c b/mm/memory.c
> index d2785fb..368e15d 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2039,19 +2039,47 @@ static int validate_page_before_insert(struct vm_area_struct *vma,
>  }
>  
>  static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
> -			unsigned long addr, struct page *page, pgprot_t prot)
> +				unsigned long addr, struct page *page,
> +				pgprot_t prot, bool mkwrite)

This upgrade of insert_page_into_pte_locked() to handle write faults
deserves to be its own patch with rationale along the lines of:

"In preparation for using insert_page() for DAX, enhance
insert_page_into_pte_locked() to handle establishing writable mappings.
Recall that DAX returns VM_FAULT_NOPAGE after installing a PTE which
bypasses the typical set_pte_range() in finish_fault."


>  {
>  	struct folio *folio = page_folio(page);
> +	pte_t entry = ptep_get(pte);
>  	pte_t pteval;
>  
> -	if (!pte_none(ptep_get(pte)))
> -		return -EBUSY;
> +	if (!pte_none(entry)) {
> +		if (!mkwrite)
> +			return -EBUSY;
> +
> +		/*
> +		 * For read faults on private mappings the PFN passed in may not
> +		 * match the PFN we have mapped if the mapped PFN is a writeable
> +		 * COW page.  In the mkwrite case we are creating a writable PTE
> +		 * for a shared mapping and we expect the PFNs to match. If they
> +		 * don't match, we are likely racing with block allocation and
> +		 * mapping invalidation so just skip the update.
> +		 */
> +		if (pte_pfn(entry) != page_to_pfn(page)) {
> +			WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
> +			return -EFAULT;
> +		}
> +		entry = maybe_mkwrite(entry, vma);
> +		entry = pte_mkyoung(entry);
> +		if (ptep_set_access_flags(vma, addr, pte, entry, 1))
> +			update_mmu_cache(vma, addr, pte);

I was going to say that this should be creating a shared helper with
insert_pfn(), but on closer look the mkwrite case in insert_pfn() is now
dead code (almost, *grumbles about dcssblk*). So I would just mention
that in the changelog for this standalone patch and then we can follow
on with a cleanup like the patch at the bottom of this mail (untested).

> +		return 0;
> +	}
> +
>  	/* Ok, finally just insert the thing.. */
>  	pteval = mk_pte(page, prot);
>  	if (unlikely(is_zero_folio(folio))) {
>  		pteval = pte_mkspecial(pteval);
>  	} else {
>  		folio_get(folio);
> +		entry = mk_pte(page, prot);
> +		if (mkwrite) {
> +			entry = pte_mkyoung(entry);
> +			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> +		}
>  		inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
>  		folio_add_file_rmap_pte(folio, page, vma);
>  	}
> @@ -2060,7 +2088,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
>  }
>  
>  static int insert_page(struct vm_area_struct *vma, unsigned long addr,
> -			struct page *page, pgprot_t prot)
> +			struct page *page, pgprot_t prot, bool mkwrite)
>  {
>  	int retval;
>  	pte_t *pte;
> @@ -2073,7 +2101,8 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
>  	pte = get_locked_pte(vma->vm_mm, addr, &ptl);
>  	if (!pte)
>  		goto out;
> -	retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
> +	retval = insert_page_into_pte_locked(vma, pte, addr, page, prot,
> +					mkwrite);
>  	pte_unmap_unlock(pte, ptl);
>  out:
>  	return retval;
> @@ -2087,7 +2116,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
>  	err = validate_page_before_insert(vma, page);
>  	if (err)
>  		return err;
> -	return insert_page_into_pte_locked(vma, pte, addr, page, prot);
> +	return insert_page_into_pte_locked(vma, pte, addr, page, prot, false);
>  }
>  
>  /* insert_pages() amortizes the cost of spinlock operations
> @@ -2223,7 +2252,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
>  		BUG_ON(vma->vm_flags & VM_PFNMAP);
>  		vm_flags_set(vma, VM_MIXEDMAP);
>  	}
> -	return insert_page(vma, addr, page, vma->vm_page_prot);
> +	return insert_page(vma, addr, page, vma->vm_page_prot, false);
>  }
>  EXPORT_SYMBOL(vm_insert_page);
>  
> @@ -2503,7 +2532,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
>  		 * result in pfn_t_has_page() == false.
>  		 */
>  		page = pfn_to_page(pfn_t_to_pfn(pfn));
> -		err = insert_page(vma, addr, page, pgprot);
> +		err = insert_page(vma, addr, page, pgprot, mkwrite);
>  	} else {
>  		return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
>  	}
> @@ -2516,6 +2545,44 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
>  	return VM_FAULT_NOPAGE;
>  }
>  
> +vm_fault_t dax_insert_pfn(struct vm_fault *vmf, pfn_t pfn_t, bool write)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	pgprot_t pgprot = vma->vm_page_prot;
> +	unsigned long pfn = pfn_t_to_pfn(pfn_t);
> +	struct page *page = pfn_to_page(pfn);

The problem here is that we stubbornly have __dcssblk_direct_access() to
worry about. That is the only dax driver that does not return
pfn_valid() pfns.

In fact, it looks like __dcssblk_direct_access() is the only thing
standing in the way of the removal of pfn_t.

It turns out it has been 3 years since the last time the question of
bringing s390 fully into the ZONE_DEVICE regime was raised:

https://lore.kernel.org/all/20210820210318.187742e8@thinkpad/

Given that this series removes PTE_DEVMAP which was a stumbling block,
would it be feasible to remove CONFIG_FS_DAX_LIMITED for a few kernel
cycles until someone from the s390 side can circle back to add full
ZONE_DEVICE support?

> +	unsigned long addr = vmf->address;
> +	int err;
> +
> +	if (addr < vma->vm_start || addr >= vma->vm_end)
> +		return VM_FAULT_SIGBUS;
> +
> +	track_pfn_insert(vma, &pgprot, pfn_t);
> +
> +	if (!pfn_modify_allowed(pfn, pgprot))
> +		return VM_FAULT_SIGBUS;
> +
> +	/*
> +	 * We refcount the page normally so make sure pfn_valid is true.
> +	 */
> +	if (!pfn_t_valid(pfn_t))
> +		return VM_FAULT_SIGBUS;
> +
> +	WARN_ON_ONCE(pfn_t_devmap(pfn_t));
> +
> +	if (WARN_ON(is_zero_pfn(pfn) && write))
> +		return VM_FAULT_SIGBUS;
> +
> +	err = insert_page(vma, addr, page, pgprot, write);
> +	if (err == -ENOMEM)
> +		return VM_FAULT_OOM;
> +	if (err < 0 && err != -EBUSY)
> +		return VM_FAULT_SIGBUS;
> +
> +	return VM_FAULT_NOPAGE;
> +}
> +EXPORT_SYMBOL_GPL(dax_insert_pfn);

With insert_page_into_pte_locked() split out into its own patch and the
dcssblk issue resolved to kill that special case, this patch looks good
to me.

> +
>  vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
>  		pfn_t pfn)
>  {
> -- 
> git-series 0.9.1

-- >8 --
Subject: mm: Remove vmf_insert_mixed_mkwrite()

From: Dan Williams <dan.j.williams@xxxxxxxxx>

Now that fsdax has switched to dax_insert_pfn() which uses
insert_page_into_pte_locked() internally, there are no more callers of
vmf_insert_mixed_mkwrite(). This also reveals that all remaining callers
of insert_pfn() never set @mkrite to true, so also cleanup insert_pfn()'s
@mkwrite argument.

Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
 include/linux/mm.h |    2 --
 mm/memory.c        |   60 +++++++---------------------------------------------
 2 files changed, 8 insertions(+), 54 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5976276d4494..d9517e109ac3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3444,8 +3444,6 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn, pgprot_t pgprot);
 vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
 			pfn_t pfn);
-vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
-		unsigned long addr, pfn_t pfn);
 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
 
 static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 000873596672..80b07dbd8304 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2327,7 +2327,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
 EXPORT_SYMBOL(vm_map_pages_zero);
 
 static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-			pfn_t pfn, pgprot_t prot, bool mkwrite)
+			     pfn_t pfn, pgprot_t prot)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte, entry;
@@ -2337,38 +2337,12 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	if (!pte)
 		return VM_FAULT_OOM;
 	entry = ptep_get(pte);
-	if (!pte_none(entry)) {
-		if (mkwrite) {
-			/*
-			 * For read faults on private mappings the PFN passed
-			 * in may not match the PFN we have mapped if the
-			 * mapped PFN is a writeable COW page.  In the mkwrite
-			 * case we are creating a writable PTE for a shared
-			 * mapping and we expect the PFNs to match. If they
-			 * don't match, we are likely racing with block
-			 * allocation and mapping invalidation so just skip the
-			 * update.
-			 */
-			if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
-				WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
-				goto out_unlock;
-			}
-			entry = pte_mkyoung(entry);
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-			if (ptep_set_access_flags(vma, addr, pte, entry, 1))
-				update_mmu_cache(vma, addr, pte);
-		}
+	if (!pte_none(entry))
 		goto out_unlock;
-	}
 
 	/* Ok, finally just insert the thing.. */
 	entry = pte_mkspecial(pfn_t_pte(pfn, prot));
 
-	if (mkwrite) {
-		entry = pte_mkyoung(entry);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-	}
-
 	set_pte_at(mm, addr, pte, entry);
 	update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
 
@@ -2433,8 +2407,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 
 	track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
 
-	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
-			false);
+	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
 }
 EXPORT_SYMBOL(vmf_insert_pfn_prot);
 
@@ -2480,8 +2453,8 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
 	return false;
 }
 
-static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
-		unsigned long addr, pfn_t pfn, bool mkwrite)
+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+			    pfn_t pfn)
 {
 	pgprot_t pgprot = vma->vm_page_prot;
 	int err;
@@ -2513,9 +2486,9 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
 		 * result in pfn_t_has_page() == false.
 		 */
 		page = pfn_to_page(pfn_t_to_pfn(pfn));
-		err = insert_page(vma, addr, page, pgprot, mkwrite);
+		err = insert_page(vma, addr, page, pgprot, false);
 	} else {
-		return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+		return insert_pfn(vma, addr, pfn, pgprot);
 	}
 
 	if (err == -ENOMEM)
@@ -2525,6 +2498,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
 
 	return VM_FAULT_NOPAGE;
 }
+EXPORT_SYMBOL(vmf_insert_mixed);
 
 vm_fault_t dax_insert_pfn(struct vm_fault *vmf, pfn_t pfn_t, bool write)
 {
@@ -2562,24 +2536,6 @@ vm_fault_t dax_insert_pfn(struct vm_fault *vmf, pfn_t pfn_t, bool write)
 }
 EXPORT_SYMBOL_GPL(dax_insert_pfn);
 
-vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-		pfn_t pfn)
-{
-	return __vm_insert_mixed(vma, addr, pfn, false);
-}
-EXPORT_SYMBOL(vmf_insert_mixed);
-
-/*
- *  If the insertion of PTE failed because someone else already added a
- *  different entry in the mean time, we treat that as success as we assume
- *  the same entry was actually inserted.
- */
-vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
-		unsigned long addr, pfn_t pfn)
-{
-	return __vm_insert_mixed(vma, addr, pfn, true);
-}
-
 /*
  * maps a range of physical memory into the requested pages. the old
  * mappings are removed. any references to nonexistent pages results