Re: [PATCH v6 21/26] fs/dax: Properly refcount fs dax pages

Dan Williams <dan.j.williams@xxxxxxxxx> · Mon, 13 Jan 2025 19:35:07 -0800

Alistair Popple wrote:
> Currently fs dax pages are considered free when the refcount drops to
> one and their refcounts are not increased when mapped via PTEs or
> decreased when unmapped. This requires special logic in mm paths to
> detect that these pages should not be properly refcounted, and to
> detect when the refcount drops to one instead of zero.
> 
> On the other hand get_user_pages(), etc. will properly refcount fs dax
> pages by taking a reference and dropping it when the page is
> unpinned.
> 
> Tracking this special behaviour requires extra PTE bits
> (eg. pte_devmap) and introduces rules that are potentially confusing
> and specific to FS DAX pages. To fix this, and to possibly allow
> removal of the special PTE bits in future, convert the fs dax page
> refcounts to be zero based and instead take a reference on the page
> each time it is mapped as is currently the case for normal pages.
> 
> This may also allow a future clean-up to remove the pgmap refcounting
> that is currently done in mm/gup.c.

This patch depends on FS_DAX_LIMITED being abandoned first, so do
include the patch at the bottom of this reply in your series before this
patch.

> Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
> 
> ---
> 
> Changes since v2:
> 
> Based on some questions from Dan I attempted to have the FS DAX page
> cache (ie. address space) hold a reference to the folio whilst it was
> mapped. However I came to the strong conclusion that this was not the
> right thing to do.
> 
> If the page refcount == 0 it means the page is:
> 
> 1. not mapped into user-space
> 2. not subject to other access via DMA/GUP/etc.
> 
> Ie. From the core MM perspective the page is not in use.
> 
> The fact a page may or may not be present in one or more address space
> mappings is irrelevant for core MM. It just means the page is still in
> use or valid from the file system perspective, and it's a
> responsiblity of the file system to remove these mappings if the pfn
> mapping becomes invalid (along with first making sure the MM state,
> ie. page->refcount, is idle). So we shouldn't be trying to track that
> lifetime with MM refcounts.
> 
> Doing so just makes DMA-idle tracking more complex because there is
> now another thing (one or more address spaces) which can hold
> references on a page. And FS DAX can't even keep track of all the
> address spaces which might contain a reference to the page in the
> XFS/reflink case anyway.
> 
> We could do this if we made file systems invalidate all address space
> mappings prior to calling dax_break_layouts(), but that isn't
> currently neccessary and would lead to increased faults just so we
> could do some superfluous refcounting which the file system already
> does.
> 
> I have however put the page sharing checks and WARN_ON's back which
> also turned out to be useful for figuring out when to re-initialising
> a folio.

I feel like these comments are a useful analysis that deserve not to be
lost to the sands of time on the list.

Perhaps capture a flavor of this relevant for future consideration in a
"DAX page Lifetime" section of Documentation/filesystems/dax.rst?

> ---
>  drivers/nvdimm/pmem.c    |   4 +-
>  fs/dax.c                 | 212 +++++++++++++++++++++++-----------------
>  fs/fuse/virtio_fs.c      |   3 +-
>  fs/xfs/xfs_inode.c       |   2 +-
>  include/linux/dax.h      |   6 +-
>  include/linux/mm.h       |  27 +-----
>  include/linux/mm_types.h |   7 +-
>  mm/gup.c                 |   9 +--
>  mm/huge_memory.c         |   6 +-
>  mm/internal.h            |   2 +-
>  mm/memory-failure.c      |   6 +-
>  mm/memory.c              |   6 +-
>  mm/memremap.c            |  47 ++++-----
>  mm/mm_init.c             |   9 +--
>  mm/swap.c                |   2 +-
>  15 files changed, 183 insertions(+), 165 deletions(-)
> 
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index d81faa9..785b2d2 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -513,7 +513,7 @@ static int pmem_attach_disk(struct device *dev,
>  
>  	pmem->disk = disk;
>  	pmem->pgmap.owner = pmem;
> -	pmem->pfn_flags = PFN_DEV;
> +	pmem->pfn_flags = 0;
>  	if (is_nd_pfn(dev)) {
>  		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
>  		pmem->pgmap.ops = &fsdax_pagemap_ops;
> @@ -522,7 +522,6 @@ static int pmem_attach_disk(struct device *dev,
>  		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
>  		pmem->pfn_pad = resource_size(res) -
>  			range_len(&pmem->pgmap.range);
> -		pmem->pfn_flags |= PFN_MAP;
>  		bb_range = pmem->pgmap.range;
>  		bb_range.start += pmem->data_offset;
>  	} else if (pmem_should_map_pages(dev)) {
> @@ -532,7 +531,6 @@ static int pmem_attach_disk(struct device *dev,
>  		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
>  		pmem->pgmap.ops = &fsdax_pagemap_ops;
>  		addr = devm_memremap_pages(dev, &pmem->pgmap);
> -		pmem->pfn_flags |= PFN_MAP;
>  		bb_range = pmem->pgmap.range;
>  	} else {
>  		addr = devm_memremap(dev, pmem->phys_addr,
> diff --git a/fs/dax.c b/fs/dax.c
> index d35dbe1..19f444e 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -71,6 +71,11 @@ static unsigned long dax_to_pfn(void *entry)
>  	return xa_to_value(entry) >> DAX_SHIFT;
>  }
>  
> +static struct folio *dax_to_folio(void *entry)
> +{
> +	return page_folio(pfn_to_page(dax_to_pfn(entry)));
> +}
> +
>  static void *dax_make_entry(pfn_t pfn, unsigned long flags)
>  {
>  	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
> @@ -338,44 +343,88 @@ static unsigned long dax_entry_size(void *entry)
>  		return PAGE_SIZE;
>  }
>  
> -static unsigned long dax_end_pfn(void *entry)
> -{
> -	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
> -}
> -
> -/*
> - * Iterate through all mapped pfns represented by an entry, i.e. skip
> - * 'empty' and 'zero' entries.
> - */
> -#define for_each_mapped_pfn(entry, pfn) \
> -	for (pfn = dax_to_pfn(entry); \
> -			pfn < dax_end_pfn(entry); pfn++)
> -
>  /*
>   * A DAX page is considered shared if it has no mapping set and ->share (which
>   * shares the ->index field) is non-zero. Note this may return false even if the
>   * page is shared between multiple files but has not yet actually been mapped
>   * into multiple address spaces.
>   */
> -static inline bool dax_page_is_shared(struct page *page)
> +static inline bool dax_folio_is_shared(struct folio *folio)
>  {
> -	return !page->mapping && page->share;
> +	return !folio->mapping && folio->share;
>  }
>  
>  /*
> - * Increase the page share refcount, warning if the page is not marked as shared.
> + * Increase the folio share refcount, warning if the folio is not marked as shared.
>   */
> -static inline void dax_page_share_get(struct page *page)
> +static inline void dax_folio_share_get(void *entry)
>  {
> -	WARN_ON_ONCE(!page->share);
> -	WARN_ON_ONCE(page->mapping);
> -	page->share++;
> +	struct folio *folio = dax_to_folio(entry);
> +
> +	WARN_ON_ONCE(!folio->share);
> +	WARN_ON_ONCE(folio->mapping);
> +	WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio));
> +	folio->share++;
> +}
> +
> +static inline unsigned long dax_folio_share_put(struct folio *folio)
> +{
> +	unsigned long ref;
> +
> +	if (!dax_folio_is_shared(folio))
> +		ref = 0;
> +	else
> +		ref = --folio->share;
> +
> +	WARN_ON_ONCE(ref < 0);
> +	if (!ref) {
> +		folio->mapping = NULL;
> +		if (folio_order(folio)) {
> +			struct dev_pagemap *pgmap = page_pgmap(&folio->page);
> +			unsigned int order = folio_order(folio);
> +			unsigned int i;
> +
> +			for (i = 0; i < (1UL << order); i++) {
> +				struct page *page = folio_page(folio, i);
> +
> +				ClearPageHead(page);
> +				clear_compound_head(page);
> +
> +				/*
> +				 * Reset pgmap which was over-written by
> +				 * prep_compound_page().
> +				 */
> +				page_folio(page)->pgmap = pgmap;
> +
> +				/* Make sure this isn't set to TAIL_MAPPING */
> +				page->mapping = NULL;
> +				page->share = 0;
> +				WARN_ON_ONCE(page_ref_count(page));
> +			}
> +		}
> +	}
> +
> +	return ref;
>  }
>  
> -static inline unsigned long dax_page_share_put(struct page *page)
> +static void dax_device_folio_init(void *entry)

s/dax_device_folio_init/dax_folio_init/

...otherwise I do not see any connection to a "device" concept in this
file.


>  {
> -	WARN_ON_ONCE(!page->share);
> -	return --page->share;
> +	struct folio *folio = dax_to_folio(entry);
> +	int order = dax_entry_order(entry);
> +
> +	/*
> +	 * Folio should have been split back to order-0 pages in
> +	 * dax_folio_share_put() when they were removed from their
> +	 * final mapping.
> +	 */
> +	WARN_ON_ONCE(folio_order(folio));
> +
> +	if (order > 0) {
> +		prep_compound_page(&folio->page, order);
> +		if (order > 1)
> +			INIT_LIST_HEAD(&folio->_deferred_list);
> +		WARN_ON_ONCE(folio_ref_count(folio));
> +	}
>  }
>  
>  /*
> @@ -388,72 +437,58 @@ static inline unsigned long dax_page_share_put(struct page *page)
>   * dax_holder_operations.
>   */
>  static void dax_associate_entry(void *entry, struct address_space *mapping,
> -		struct vm_area_struct *vma, unsigned long address, bool shared)
> +				struct vm_area_struct *vma, unsigned long address, bool shared)
>  {
> -	unsigned long size = dax_entry_size(entry), pfn, index;
> -	int i = 0;
> +	unsigned long size = dax_entry_size(entry), index;
> +	struct folio *folio = dax_to_folio(entry);
>  
>  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
>  		return;
>  
>  	index = linear_page_index(vma, address & ~(size - 1));
> -	for_each_mapped_pfn(entry, pfn) {
> -		struct page *page = pfn_to_page(pfn);
> -
> -		if (shared && page->mapping && page->share) {
> -			if (page->mapping) {
> -				page->mapping = NULL;
> +	if (shared && (folio->mapping || dax_folio_is_shared(folio))) {

This change in logic aligns with the previous feedback on the suspect

    "if (shared && page->mapping && page->share)"

...statememt, right?

...and maybe the dax_make_shared() suggestion makes the diff smaller
here.

> +		if (folio->mapping) {
> +			folio->mapping = NULL;
>  
> -				/*
> -				 * Page has already been mapped into one address
> -				 * space so set the share count.
> -				 */
> -				page->share = 1;
> -			}
> -
> -			dax_page_share_get(page);
> -		} else {
> -			WARN_ON_ONCE(page->mapping);
> -			page->mapping = mapping;
> -			page->index = index + i++;
> +			/*
> +			 * folio has already been mapped into one address
> +			 * space so set the share count.
> +			 */
> +			folio->share = 1;
>  		}
> +
> +		dax_folio_share_get(entry);
> +	} else {
> +		WARN_ON_ONCE(folio->mapping);
> +		dax_device_folio_init(entry);
> +		folio = dax_to_folio(entry);
> +		folio->mapping = mapping;
> +		folio->index = index;
>  	}
>  }
>  
>  static void dax_disassociate_entry(void *entry, struct address_space *mapping,
> -		bool trunc)
> +				bool trunc)
>  {
> -	unsigned long pfn;
> +	struct folio *folio = dax_to_folio(entry);
>  
>  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
>  		return;
>  
> -	for_each_mapped_pfn(entry, pfn) {
> -		struct page *page = pfn_to_page(pfn);
> -
> -		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
> -		if (dax_page_is_shared(page)) {
> -			/* keep the shared flag if this page is still shared */
> -			if (dax_page_share_put(page) > 0)
> -				continue;
> -		} else
> -			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
> -		page->mapping = NULL;
> -		page->index = 0;
> -	}
> +	dax_folio_share_put(folio);

Probably should not call this "share_put" anymore since it is handling
both the shared and non-shared case.

>  }
>  
>  static struct page *dax_busy_page(void *entry)

Hmm, will this ultimately become dax_busy_folio()?

[..]
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 54b59b8..e308cb9 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -295,6 +295,8 @@ typedef struct {
>   *    anonymous memory.
>   * @index: Offset within the file, in units of pages.  For anonymous memory,
>   *    this is the index from the beginning of the mmap.
> + * @share: number of DAX mappings that reference this folio. See
> + *    dax_associate_entry.
>   * @private: Filesystem per-folio data (see folio_attach_private()).
>   * @swap: Used for swp_entry_t if folio_test_swapcache().
>   * @_mapcount: Do not access this member directly.  Use folio_mapcount() to
> @@ -344,7 +346,10 @@ struct folio {
>  				struct dev_pagemap *pgmap;
>  			};
>  			struct address_space *mapping;
> -			pgoff_t index;
> +			union {
> +				pgoff_t index;
> +				unsigned long share;
> +			};

This feels like it should be an immediate follow-on change if only to
separate fsdax conversion bugs from ->index ->share aliasing bugs, and
due to the significance of touching 'struct page'.

[..]

As I only have cosmetic comments you can add:

Reviewed-by: Dan Williams <dan.j.williams@xxxxxxxxx>

...and here is that aformentioned patch:

-- 8< --
Subject: dcssblk: Mark DAX broken, remove FS_DAX_LIMITED support

From: Dan Williams <dan.j.williams@xxxxxxxxx>

The dcssblk driver has long needed special case supoprt to enable
limited dax operation, so called CONFIG_FS_DAX_LIMITED. This mode
works around the incomplete support for ZONE_DEVICE on s390 by forgoing
the ability of dax-mapped pages to support GUP.

Now, pending cleanups to fsdax that fix its reference counting [1] depend on
the ability of all dax drivers to supply ZONE_DEVICE pages.

To allow that work to move forward, dax support needs to be paused for
dcssblk until ZONE_DEVICE support arrives. That work has been known for
a few years [2], and the removal of "pte_devmap" requirements [3] makes the
conversion easier.

For now, place the support behind CONFIG_BROKEN, and remove PFN_SPECIAL
(dcssblk was the only user).

Link: http://lore.kernel.org/cover.9f0e45d52f5cff58807831b6b867084d0b14b61c.1725941415.git-series.apopple@xxxxxxxxxx [1]
Link: http://lore.kernel.org/20210820210318.187742e8@thinkpad/ [2]
Link: http://lore.kernel.org/4511465a4f8429f45e2ac70d2e65dc5e1df1eb47.1725941415.git-series.apopple@xxxxxxxxxx [3]
Reviewed-by: Gerald Schaefer <gerald.schaefer@xxxxxxxxxxxxx>
Tested-by: Alexander Gordeev <agordeev@xxxxxxxxxxxxx>
Acked-by: David Hildenbrand <david@xxxxxxxxxx>
Cc: Heiko Carstens <hca@xxxxxxxxxxxxx>
Cc: Vasily Gorbik <gor@xxxxxxxxxxxxx>
Cc: Christian Borntraeger <borntraeger@xxxxxxxxxxxxx>
Cc: Sven Schnelle <svens@xxxxxxxxxxxxx>
Cc: Jan Kara <jack@xxxxxxx>
Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Alistair Popple <apopple@xxxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
 Documentation/filesystems/dax.rst |    1 -
 drivers/s390/block/Kconfig        |   12 ++++++++++--
 drivers/s390/block/dcssblk.c      |   27 +++++++++++++++++----------
 3 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/Documentation/filesystems/dax.rst b/Documentation/filesystems/dax.rst
index 719e90f1988e..08dd5e254cc5 100644
--- a/Documentation/filesystems/dax.rst
+++ b/Documentation/filesystems/dax.rst
@@ -207,7 +207,6 @@ implement direct_access.
 
 These block devices may be used for inspiration:
 - brd: RAM backed block device driver
-- dcssblk: s390 dcss block device driver
 - pmem: NVDIMM persistent memory driver
 
 
diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index e3710a762aba..4bfe469c04aa 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -4,13 +4,21 @@ comment "S/390 block device drivers"
 
 config DCSSBLK
 	def_tristate m
-	select FS_DAX_LIMITED
-	select DAX
 	prompt "DCSSBLK support"
 	depends on S390 && BLOCK
 	help
 	  Support for dcss block device
 
+config DCSSBLK_DAX
+	def_bool y
+	depends on DCSSBLK
+	# requires S390 ZONE_DEVICE support
+	depends on BROKEN
+	select DAX
+	prompt "DCSSBLK DAX support"
+	help
+	  Enable DAX operation for the dcss block device
+
 config DASD
 	def_tristate y
 	prompt "Support for DASD devices"
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 0f14d279d30b..7248e547fefb 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -534,6 +534,21 @@ static const struct attribute_group *dcssblk_dev_attr_groups[] = {
 	NULL,
 };
 
+static int dcssblk_setup_dax(struct dcssblk_dev_info *dev_info)
+{
+	struct dax_device *dax_dev;
+
+	if (!IS_ENABLED(CONFIG_DCSSBLK_DAX))
+		return 0;
+
+	dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops);
+	if (IS_ERR(dax_dev))
+		return PTR_ERR(dax_dev);
+	set_dax_synchronous(dax_dev);
+	dev_info->dax_dev = dax_dev;
+	return dax_add_host(dev_info->dax_dev, dev_info->gd);
+}
+
 /*
  * device attribute for adding devices
  */
@@ -547,7 +562,6 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 	int rc, i, j, num_of_segments;
 	struct dcssblk_dev_info *dev_info;
 	struct segment_info *seg_info, *temp;
-	struct dax_device *dax_dev;
 	char *local_buf;
 	unsigned long seg_byte_size;
 
@@ -674,14 +688,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 	if (rc)
 		goto put_dev;
 
-	dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops);
-	if (IS_ERR(dax_dev)) {
-		rc = PTR_ERR(dax_dev);
-		goto put_dev;
-	}
-	set_dax_synchronous(dax_dev);
-	dev_info->dax_dev = dax_dev;
-	rc = dax_add_host(dev_info->dax_dev, dev_info->gd);
+	rc = dcssblk_setup_dax(dev_info);
 	if (rc)
 		goto out_dax;
 
@@ -917,7 +924,7 @@ __dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff,
 		*kaddr = __va(dev_info->start + offset);
 	if (pfn)
 		*pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset),
-				PFN_DEV|PFN_SPECIAL);
+				      PFN_DEV);
 
 	return (dev_sz - offset) / PAGE_SIZE;
 }