Re: [PATCH 10/12] fs/dax: Properly refcount fs dax pages

Dan Williams <dan.j.williams@xxxxxxxxx> · Fri, 27 Sep 2024 00:59:12 -0700

Alistair Popple wrote:
> Currently fs dax pages are considered free when the refcount drops to
> one and their refcounts are not increased when mapped via PTEs or
> decreased when unmapped. This requires special logic in mm paths to
> detect that these pages should not be properly refcounted, and to
> detect when the refcount drops to one instead of zero.
> 
> On the other hand get_user_pages(), etc. will properly refcount fs dax
> pages by taking a reference and dropping it when the page is
> unpinned.
> 
> Tracking this special behaviour requires extra PTE bits
> (eg. pte_devmap) and introduces rules that are potentially confusing
> and specific to FS DAX pages. To fix this, and to possibly allow
> removal of the special PTE bits in future, convert the fs dax page
> refcounts to be zero based and instead take a reference on the page
> each time it is mapped as is currently the case for normal pages.
> 
> This may also allow a future clean-up to remove the pgmap refcounting
> that is currently done in mm/gup.c.
> 
> Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
> ---
>  drivers/dax/device.c       |  12 +-
>  drivers/dax/super.c        |   2 +-
>  drivers/nvdimm/pmem.c      |   4 +-
>  fs/dax.c                   | 192 ++++++++++++++++++--------------------
>  fs/fuse/virtio_fs.c        |   3 +-
>  include/linux/dax.h        |   6 +-
>  include/linux/mm.h         |  27 +-----
>  include/linux/page-flags.h |   6 +-
>  mm/gup.c                   |   9 +--
>  mm/huge_memory.c           |   6 +-
>  mm/internal.h              |   2 +-
>  mm/memory-failure.c        |   6 +-
>  mm/memory.c                |   6 +-
>  mm/memremap.c              |  40 +++-----
>  mm/mlock.c                 |   2 +-
>  mm/mm_init.c               |   9 +--
>  mm/swap.c                  |   2 +-
>  17 files changed, 143 insertions(+), 191 deletions(-)
> 
> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> index 9c1a729..4d3ddd1 100644
> --- a/drivers/dax/device.c
> +++ b/drivers/dax/device.c
> @@ -126,11 +126,11 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
>  		return VM_FAULT_SIGBUS;
>  	}
>  
> -	pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
> +	pfn = phys_to_pfn_t(phys, 0);

BTW, this is part of what prompted me to do the pfn_t cleanup [1] that I
will rebase on top of your series:

[1]: http://lore.kernel.org/66f34a9caeb97_2a7f294fa@xxxxxxxxxxxxxxxxxxxxxxxxx.notmuch

[..]
> @@ -318,85 +323,58 @@ static unsigned long dax_end_pfn(void *entry)
>   */
>  #define for_each_mapped_pfn(entry, pfn) \
>  	for (pfn = dax_to_pfn(entry); \
> -			pfn < dax_end_pfn(entry); pfn++)
> +		pfn < dax_end_pfn(entry); pfn++)
>  
> -static inline bool dax_page_is_shared(struct page *page)
> +static void dax_device_folio_init(struct folio *folio, int order)
>  {
> -	return page->mapping == PAGE_MAPPING_DAX_SHARED;
> -}
> +	int orig_order = folio_order(folio);
> +	int i;
>  
> -/*
> - * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
> - * refcount.
> - */
> -static inline void dax_page_share_get(struct page *page)
> -{
> -	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
> -		/*
> -		 * Reset the index if the page was already mapped
> -		 * regularly before.
> -		 */
> -		if (page->mapping)
> -			page->share = 1;
> -		page->mapping = PAGE_MAPPING_DAX_SHARED;
> -	}
> -	page->share++;
> -}
> +	if (orig_order != order) {
> +		struct dev_pagemap *pgmap = page_dev_pagemap(&folio->page);

Was there a discussion I missed about why the conversion to typical
folios allows the page->share accounting to be dropped?

I assume this is because the page->mapping validation was dropped, which
I think might be useful to keep at least for one development cycle to
make sure this conversion is not triggering any of the old warnings.

Otherwise, the ->share field of 'struct page' can also be cleaned up.

> -static inline unsigned long dax_page_share_put(struct page *page)
> -{
> -	return --page->share;
> -}
> +		for (i = 0; i < (1UL << orig_order); i++) {
> +			struct page *page = folio_page(folio, i);
>  
> -/*
> - * When it is called in dax_insert_entry(), the shared flag will indicate that
> - * whether this entry is shared by multiple files.  If so, set the page->mapping
> - * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
> - */
> -static void dax_associate_entry(void *entry, struct address_space *mapping,
> -		struct vm_area_struct *vma, unsigned long address, bool shared)
> -{
> -	unsigned long size = dax_entry_size(entry), pfn, index;
> -	int i = 0;
> +			ClearPageHead(page);
> +			clear_compound_head(page);
>  
> -	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
> -		return;
> -
> -	index = linear_page_index(vma, address & ~(size - 1));
> -	for_each_mapped_pfn(entry, pfn) {
> -		struct page *page = pfn_to_page(pfn);
> +			/*
> +			 * Reset pgmap which was over-written by
> +			 * prep_compound_page().
> +			 */
> +			page_folio(page)->pgmap = pgmap;
>  
> -		if (shared) {
> -			dax_page_share_get(page);
> -		} else {
> -			WARN_ON_ONCE(page->mapping);
> -			page->mapping = mapping;
> -			page->index = index + i++;
> +			/* Make sure this isn't set to TAIL_MAPPING */
> +			page->mapping = NULL;
>  		}
>  	}
> +
> +	if (order > 0) {
> +		prep_compound_page(&folio->page, order);
> +		if (order > 1)
> +			INIT_LIST_HEAD(&folio->_deferred_list);
> +	}
>  }
>  
> -static void dax_disassociate_entry(void *entry, struct address_space *mapping,
> -		bool trunc)
> +static void dax_associate_new_entry(void *entry, struct address_space *mapping,
> +				pgoff_t index)

Let's call this dax_create_folio(), to mirror filemap_create_folio() and
have it transition the folio refcount from 0 to 1 to indicate that it is
allocated.

While I am not sure anything requires that, it seems odd that page cache
pages have an elevated refcount at map time and dax pages do not.

It does have implications for the dax dma-idle tracking though, see
below.

>  {
> -	unsigned long pfn;
> +	unsigned long order = dax_entry_order(entry);
> +	struct folio *folio = dax_to_folio(entry);
>  
> -	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
> +	if (!dax_entry_size(entry))
>  		return;
>  
> -	for_each_mapped_pfn(entry, pfn) {
> -		struct page *page = pfn_to_page(pfn);
> -
> -		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
> -		if (dax_page_is_shared(page)) {
> -			/* keep the shared flag if this page is still shared */
> -			if (dax_page_share_put(page) > 0)
> -				continue;
> -		} else
> -			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
> -		page->mapping = NULL;
> -		page->index = 0;
> -	}
> +	/*
> +	 * We don't hold a reference for the DAX pagecache entry for the
> +	 * page. But we need to initialise the folio so we can hand it
> +	 * out. Nothing else should have a reference either.
> +	 */
> +	WARN_ON_ONCE(folio_ref_count(folio));

Per above I would feel more comfortable if we kept the paranoia around
to ensure that all the pages in this folio have dropped all references
and cleared ->mapping and ->index.

That paranoia can be placed behind a CONFIG_DEBUG_VM check, and we can
delete it in a follow-on development cycle, but in the meantime it helps to
prove the correctness of the conversion.

[..]
> @@ -1189,11 +1165,14 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
>  	struct inode *inode = iter->inode;
>  	unsigned long vaddr = vmf->address;
>  	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
> +	struct page *page = pfn_t_to_page(pfn);
>  	vm_fault_t ret;
>  
>  	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
>  
> -	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
> +	page_ref_inc(page);
> +	ret = dax_insert_pfn(vmf, pfn, false);
> +	put_page(page);

Per above I think it is problematic to have pages live in the system
without a refcount.

One scenario where this might be needed is invalidate_inode_pages() vs
DMA. The invalidation should pause and wait for DMA pins to be dropped
before the mapping xarray is cleaned up and the dax folio is marked
free.

I think this may be a gap in the current code. I'll attempt to write a
test for this to check.

[..]
> @@ -1649,9 +1627,10 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
>  	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
>  	bool write = iter->flags & IOMAP_WRITE;
>  	unsigned long entry_flags = pmd ? DAX_PMD : 0;
> -	int err = 0;
> +	int ret, err = 0;
>  	pfn_t pfn;
>  	void *kaddr;
> +	struct page *page;
>  
>  	if (!pmd && vmf->cow_page)
>  		return dax_fault_cow_page(vmf, iter);
> @@ -1684,14 +1663,21 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
>  	if (dax_fault_is_synchronous(iter, vmf->vma))
>  		return dax_fault_synchronous_pfnp(pfnp, pfn);
>  
> -	/* insert PMD pfn */
> +	page = pfn_t_to_page(pfn);

I think this is clearer if dax_insert_entry() returns folios with an
elevated reference count that is dropped when the folio is invalidated
out of the mapping.

[..]
> @@ -519,21 +529,3 @@ void zone_device_page_init(struct page *page)
>  	lock_page(page);
>  }
>  EXPORT_SYMBOL_GPL(zone_device_page_init);
> -
> -#ifdef CONFIG_FS_DAX
> -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
> -{
> -	if (folio->pgmap->type != MEMORY_DEVICE_FS_DAX)
> -		return false;
> -
> -	/*
> -	 * fsdax page refcounts are 1-based, rather than 0-based: if
> -	 * refcount is 1, then the page is free and the refcount is
> -	 * stable because nobody holds a reference on the page.
> -	 */
> -	if (folio_ref_sub_return(folio, refs) == 1)
> -		wake_up_var(&folio->_refcount);
> -	return true;

It follows from the refcount discussion above that I think there is an
argument to still keep this wakeup based on the 2->1 transition.
Pagecache pages are refcount==1 when they are dma-idle but still
allocated. To keep the same semantics for dax a dax_folio would have an
elevated refcount whenever it is referenced by a mapping entry.