Alistair Popple wrote:
> Currently fs dax pages are considered free when the refcount drops to
> one and their refcounts are not increased when mapped via PTEs or
> decreased when unmapped. This requires special logic in mm paths to
> detect that these pages should not be properly refcounted, and to
> detect when the refcount drops to one instead of zero.
>
> On the other hand get_user_pages(), etc. will properly refcount fs dax
> pages by taking a reference and dropping it when the page is
> unpinned.
>
> Tracking this special behaviour requires extra PTE bits
> (eg. pte_devmap) and introduces rules that are potentially confusing
> and specific to FS DAX pages. To fix this, and to possibly allow
> removal of the special PTE bits in future, convert the fs dax page
> refcounts to be zero based and instead take a reference on the page
> each time it is mapped as is currently the case for normal pages.
>
> This may also allow a future clean-up to remove the pgmap refcounting
> that is currently done in mm/gup.c.

This patch depends on FS_DAX_LIMITED being abandoned first, so please
include the patch at the bottom of this reply in your series ahead of
this one.

> Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
>
> ---
>
> Changes since v2:
>
> Based on some questions from Dan I attempted to have the FS DAX page
> cache (ie. address space) hold a reference to the folio whilst it was
> mapped. However I came to the strong conclusion that this was not the
> right thing to do.
>
> If the page refcount == 0 it means the page is:
>
> 1. not mapped into user-space
> 2. not subject to other access via DMA/GUP/etc.
>
> Ie. From the core MM perspective the page is not in use.
>
> The fact a page may or may not be present in one or more address space
> mappings is irrelevant for core MM. It just means the page is still in
> use or valid from the file system perspective, and it's a
> responsibility of the file system to remove these mappings if the pfn
> mapping becomes invalid (along with first making sure the MM state,
> ie. page->refcount, is idle). So we shouldn't be trying to track that
> lifetime with MM refcounts.
>
> Doing so just makes DMA-idle tracking more complex because there is
> now another thing (one or more address spaces) which can hold
> references on a page. And FS DAX can't even keep track of all the
> address spaces which might contain a reference to the page in the
> XFS/reflink case anyway.
>
> We could do this if we made file systems invalidate all address space
> mappings prior to calling dax_break_layouts(), but that isn't
> currently necessary and would lead to increased faults just so we
> could do some superfluous refcounting which the file system already
> does.
>
> I have however put the page sharing checks and WARN_ON's back, which
> also turned out to be useful for figuring out when to re-initialise
> a folio.

I feel like these comments are a useful analysis that deserves not to
be lost to the sands of time on the list. Perhaps capture a flavor of
this for future consideration in a "DAX page Lifetime" section of
Documentation/filesystems/dax.rst?
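Something along these lines is what I have in mind; only a strawman
sketch distilled from your changelog above, so the section name and
wording below are not from the patch:

    DAX page lifetime
    -----------------

    A DAX folio's refcount tracks core-MM usage only: it is elevated
    while the folio is mapped into user-space or pinned via GUP/DMA,
    and drops to zero when neither holds. Presence in one or more file
    address_space mappings does not, by itself, hold a reference; it
    is the file system's responsibility to remove those mappings when
    a pfn mapping becomes invalid, after first making sure the MM
    state (page->refcount) is idle.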
> ---
>  drivers/nvdimm/pmem.c    |   4 +-
>  fs/dax.c                 | 212 +++++++++++++++++++++++-----------------
>  fs/fuse/virtio_fs.c      |   3 +-
>  fs/xfs/xfs_inode.c       |   2 +-
>  include/linux/dax.h      |   6 +-
>  include/linux/mm.h       |  27 +----
>  include/linux/mm_types.h |   7 +-
>  mm/gup.c                 |   9 +--
>  mm/huge_memory.c         |   6 +-
>  mm/internal.h            |   2 +-
>  mm/memory-failure.c      |   6 +-
>  mm/memory.c              |   6 +-
>  mm/memremap.c            |  47 ++++-----
>  mm/mm_init.c             |   9 +--
>  mm/swap.c                |   2 +-
>  15 files changed, 183 insertions(+), 165 deletions(-)
>
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index d81faa9..785b2d2 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -513,7 +513,7 @@ static int pmem_attach_disk(struct device *dev,
>
>  	pmem->disk = disk;
>  	pmem->pgmap.owner = pmem;
> -	pmem->pfn_flags = PFN_DEV;
> +	pmem->pfn_flags = 0;
>  	if (is_nd_pfn(dev)) {
>  		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
>  		pmem->pgmap.ops = &fsdax_pagemap_ops;
> @@ -522,7 +522,6 @@ static int pmem_attach_disk(struct device *dev,
>  		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
>  		pmem->pfn_pad = resource_size(res) -
>  			range_len(&pmem->pgmap.range);
> -		pmem->pfn_flags |= PFN_MAP;
>  		bb_range = pmem->pgmap.range;
>  		bb_range.start += pmem->data_offset;
>  	} else if (pmem_should_map_pages(dev)) {
> @@ -532,7 +531,6 @@ static int pmem_attach_disk(struct device *dev,
>  		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
>  		pmem->pgmap.ops = &fsdax_pagemap_ops;
>  		addr = devm_memremap_pages(dev, &pmem->pgmap);
> -		pmem->pfn_flags |= PFN_MAP;
>  		bb_range = pmem->pgmap.range;
>  	} else {
>  		addr = devm_memremap(dev, pmem->phys_addr,
> diff --git a/fs/dax.c b/fs/dax.c
> index d35dbe1..19f444e 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -71,6 +71,11 @@ static unsigned long dax_to_pfn(void *entry)
>  	return xa_to_value(entry) >> DAX_SHIFT;
>  }
>
> +static struct folio *dax_to_folio(void *entry)
> +{
> +	return page_folio(pfn_to_page(dax_to_pfn(entry)));
> +}
> +
>  static void *dax_make_entry(pfn_t pfn, unsigned long flags)
>  {
>  	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
> @@ -338,44 +343,88 @@ static unsigned long dax_entry_size(void *entry)
>  		return PAGE_SIZE;
>  }
>
> -static unsigned long dax_end_pfn(void *entry)
> -{
> -	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
> -}
> -
> -/*
> - * Iterate through all mapped pfns represented by an entry, i.e. skip
> - * 'empty' and 'zero' entries.
> - */
> -#define for_each_mapped_pfn(entry, pfn) \
> -	for (pfn = dax_to_pfn(entry); \
> -			pfn < dax_end_pfn(entry); pfn++)
> -
>  /*
>   * A DAX page is considered shared if it has no mapping set and ->share (which
>   * shares the ->index field) is non-zero. Note this may return false even if the
>   * page is shared between multiple files but has not yet actually been mapped
>   * into multiple address spaces.
>   */
> -static inline bool dax_page_is_shared(struct page *page)
> +static inline bool dax_folio_is_shared(struct folio *folio)
>  {
> -	return !page->mapping && page->share;
> +	return !folio->mapping && folio->share;
>  }
>
>  /*
> - * Increase the page share refcount, warning if the page is not marked as shared.
> + * Increase the folio share refcount, warning if the folio is not marked as shared.
>   */
> -static inline void dax_page_share_get(struct page *page)
> +static inline void dax_folio_share_get(void *entry)
>  {
> -	WARN_ON_ONCE(!page->share);
> -	WARN_ON_ONCE(page->mapping);
> -	page->share++;
> +	struct folio *folio = dax_to_folio(entry);
> +
> +	WARN_ON_ONCE(!folio->share);
> +	WARN_ON_ONCE(folio->mapping);
> +	WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio));
> +	folio->share++;
> +}
> +
> +static inline unsigned long dax_folio_share_put(struct folio *folio)
> +{
> +	unsigned long ref;
> +
> +	if (!dax_folio_is_shared(folio))
> +		ref = 0;
> +	else
> +		ref = --folio->share;
> +
> +	WARN_ON_ONCE(ref < 0);
> +	if (!ref) {
> +		folio->mapping = NULL;
> +		if (folio_order(folio)) {
> +			struct dev_pagemap *pgmap = page_pgmap(&folio->page);
> +			unsigned int order = folio_order(folio);
> +			unsigned int i;
> +
> +			for (i = 0; i < (1UL << order); i++) {
> +				struct page *page = folio_page(folio, i);
> +
> +				ClearPageHead(page);
> +				clear_compound_head(page);
> +
> +				/*
> +				 * Reset pgmap which was over-written by
> +				 * prep_compound_page().
> +				 */
> +				page_folio(page)->pgmap = pgmap;
> +
> +				/* Make sure this isn't set to TAIL_MAPPING */
> +				page->mapping = NULL;
> +				page->share = 0;
> +				WARN_ON_ONCE(page_ref_count(page));
> +			}
> +		}
> +	}
> +
> +	return ref;
>  }
>
> -static inline unsigned long dax_page_share_put(struct page *page)
> +static void dax_device_folio_init(void *entry)

s/dax_device_folio_init/dax_folio_init/

...otherwise I do not see any connection to a "device" concept in this
file.

>  {
> -	WARN_ON_ONCE(!page->share);
> -	return --page->share;
> +	struct folio *folio = dax_to_folio(entry);
> +	int order = dax_entry_order(entry);
> +
> +	/*
> +	 * Folio should have been split back to order-0 pages in
> +	 * dax_folio_share_put() when they were removed from their
> +	 * final mapping.
> +	 */
> +	WARN_ON_ONCE(folio_order(folio));
> +
> +	if (order > 0) {
> +		prep_compound_page(&folio->page, order);
> +		if (order > 1)
> +			INIT_LIST_HEAD(&folio->_deferred_list);
> +		WARN_ON_ONCE(folio_ref_count(folio));
> +	}
>  }
>
>  /*
> @@ -388,72 +437,58 @@ static inline unsigned long dax_page_share_put(struct page *page)
>   * dax_holder_operations.
>   */
>  static void dax_associate_entry(void *entry, struct address_space *mapping,
> -		struct vm_area_struct *vma, unsigned long address, bool shared)
> +		struct vm_area_struct *vma, unsigned long address, bool shared)
>  {
> -	unsigned long size = dax_entry_size(entry), pfn, index;
> -	int i = 0;
> +	unsigned long size = dax_entry_size(entry), index;
> +	struct folio *folio = dax_to_folio(entry);
>
>  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
>  		return;
>
>  	index = linear_page_index(vma, address & ~(size - 1));
> -	for_each_mapped_pfn(entry, pfn) {
> -		struct page *page = pfn_to_page(pfn);
> -
> -		if (shared && page->mapping && page->share) {
> -			if (page->mapping) {
> -				page->mapping = NULL;
> +	if (shared && (folio->mapping || dax_folio_is_shared(folio))) {

This change in logic aligns with the previous feedback on the suspect:

	if (shared && page->mapping && page->share)

...statement, right? ...and maybe the dax_make_shared() suggestion
makes the diff smaller here (see the sketch at the end of this mail).

> +		if (folio->mapping) {
> +			folio->mapping = NULL;
>
> -			/*
> -			 * Page has already been mapped into one address
> -			 * space so set the share count.
> -			 */
> -			page->share = 1;
> -		}
> -
> -		dax_page_share_get(page);
> -	} else {
> -		WARN_ON_ONCE(page->mapping);
> -		page->mapping = mapping;
> -		page->index = index + i++;
> +			/*
> +			 * folio has already been mapped into one address
> +			 * space so set the share count.
> +			 */
> +			folio->share = 1;
>  		}
> +
> +		dax_folio_share_get(entry);
> +	} else {
> +		WARN_ON_ONCE(folio->mapping);
> +		dax_device_folio_init(entry);
> +		folio = dax_to_folio(entry);
> +		folio->mapping = mapping;
> +		folio->index = index;
>  	}
>  }
>
>  static void dax_disassociate_entry(void *entry, struct address_space *mapping,
> -		bool trunc)
> +		bool trunc)
>  {
> -	unsigned long pfn;
> +	struct folio *folio = dax_to_folio(entry);
>
>  	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
>  		return;
>
> -	for_each_mapped_pfn(entry, pfn) {
> -		struct page *page = pfn_to_page(pfn);
> -
> -		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
> -		if (dax_page_is_shared(page)) {
> -			/* keep the shared flag if this page is still shared */
> -			if (dax_page_share_put(page) > 0)
> -				continue;
> -		} else
> -			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
> -		page->mapping = NULL;
> -		page->index = 0;
> -	}
> +	dax_folio_share_put(folio);

Probably should not call this "share_put" anymore since it is handling
both the shared and non-shared case.

>  }
>
>  static struct page *dax_busy_page(void *entry)

Hmm, will this ultimately become dax_busy_folio()?

[..]

> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 54b59b8..e308cb9 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -295,6 +295,8 @@ typedef struct {
>   *    anonymous memory.
>   * @index: Offset within the file, in units of pages. For anonymous memory,
>   *    this is the index from the beginning of the mmap.
> + * @share: number of DAX mappings that reference this folio. See
> + *    dax_associate_entry.
>   * @private: Filesystem per-folio data (see folio_attach_private()).
>   * @swap: Used for swp_entry_t if folio_test_swapcache().
>   * @_mapcount: Do not access this member directly. Use folio_mapcount() to
> @@ -344,7 +346,10 @@ struct folio {
>  			struct dev_pagemap *pgmap;
>  		};
>  		struct address_space *mapping;
> -		pgoff_t index;
> +		union {
> +			pgoff_t index;
> +			unsigned long share;
> +		};

This feels like it should be an immediate follow-on change, if only to
separate fsdax conversion bugs from ->index/->share aliasing bugs, and
due to the significance of touching 'struct page'.

[..]

As I only have cosmetic comments, you can add:

Reviewed-by: Dan Williams <dan.j.williams@xxxxxxxxx>

...and here is that aforementioned patch:

-- 8< --
Subject: dcssblk: Mark DAX broken, remove FS_DAX_LIMITED support

From: Dan Williams <dan.j.williams@xxxxxxxxx>

The dcssblk driver has long needed special-case support to enable
limited dax operation, the so-called CONFIG_FS_DAX_LIMITED mode. This
mode works around the incomplete support for ZONE_DEVICE on s390 by
forgoing the ability of dax-mapped pages to support GUP.

Now, pending cleanups to fsdax that fix its reference counting [1]
depend on the ability of all dax drivers to supply ZONE_DEVICE pages.
To allow that work to move forward, dax support needs to be paused for
dcssblk until ZONE_DEVICE support arrives. That work has been known
for a few years [2], and the removal of "pte_devmap" requirements [3]
makes the conversion easier.

For now, place the support behind CONFIG_BROKEN, and remove
PFN_SPECIAL (dcssblk was the only user).
Link: http://lore.kernel.org/cover.9f0e45d52f5cff58807831b6b867084d0b14b61c.1725941415.git-series.apopple@xxxxxxxxxx [1]
Link: http://lore.kernel.org/20210820210318.187742e8@thinkpad/ [2]
Link: http://lore.kernel.org/4511465a4f8429f45e2ac70d2e65dc5e1df1eb47.1725941415.git-series.apopple@xxxxxxxxxx [3]
Reviewed-by: Gerald Schaefer <gerald.schaefer@xxxxxxxxxxxxx>
Tested-by: Alexander Gordeev <agordeev@xxxxxxxxxxxxx>
Acked-by: David Hildenbrand <david@xxxxxxxxxx>
Cc: Heiko Carstens <hca@xxxxxxxxxxxxx>
Cc: Vasily Gorbik <gor@xxxxxxxxxxxxx>
Cc: Christian Borntraeger <borntraeger@xxxxxxxxxxxxx>
Cc: Sven Schnelle <svens@xxxxxxxxxxxxx>
Cc: Jan Kara <jack@xxxxxxx>
Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Alistair Popple <apopple@xxxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
 Documentation/filesystems/dax.rst |  1 -
 drivers/s390/block/Kconfig        | 12 ++++++++++--
 drivers/s390/block/dcssblk.c      | 27 +++++++++++++++++----------
 3 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/Documentation/filesystems/dax.rst b/Documentation/filesystems/dax.rst
index 719e90f1988e..08dd5e254cc5 100644
--- a/Documentation/filesystems/dax.rst
+++ b/Documentation/filesystems/dax.rst
@@ -207,7 +207,6 @@ implement direct_access.

 These block devices may be used for inspiration:

 - brd: RAM backed block device driver
-- dcssblk: s390 dcss block device driver
 - pmem: NVDIMM persistent memory driver

diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index e3710a762aba..4bfe469c04aa 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -4,13 +4,21 @@ comment "S/390 block device drivers"

 config DCSSBLK
 	def_tristate m
-	select FS_DAX_LIMITED
-	select DAX
 	prompt "DCSSBLK support"
 	depends on S390 && BLOCK
 	help
 	  Support for dcss block device

+config DCSSBLK_DAX
+	def_bool y
+	depends on DCSSBLK
+	# requires S390 ZONE_DEVICE support
+	depends on BROKEN
+	select DAX
+	prompt "DCSSBLK DAX support"
+	help
+	  Enable DAX operation for the dcss block device
+
 config DASD
 	def_tristate y
 	prompt "Support for DASD devices"

diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 0f14d279d30b..7248e547fefb 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -534,6 +534,21 @@ static const struct attribute_group *dcssblk_dev_attr_groups[] = {
 	NULL,
 };

+static int dcssblk_setup_dax(struct dcssblk_dev_info *dev_info)
+{
+	struct dax_device *dax_dev;
+
+	if (!IS_ENABLED(CONFIG_DCSSBLK_DAX))
+		return 0;
+
+	dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops);
+	if (IS_ERR(dax_dev))
+		return PTR_ERR(dax_dev);
+	set_dax_synchronous(dax_dev);
+	dev_info->dax_dev = dax_dev;
+	return dax_add_host(dev_info->dax_dev, dev_info->gd);
+}
+
 /*
  * device attribute for adding devices
  */
@@ -547,7 +562,6 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 	int rc, i, j, num_of_segments;
 	struct dcssblk_dev_info *dev_info;
 	struct segment_info *seg_info, *temp;
-	struct dax_device *dax_dev;
 	char *local_buf;
 	unsigned long seg_byte_size;

@@ -674,14 +688,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 	if (rc)
 		goto put_dev;

-	dax_dev = alloc_dax(dev_info, &dcssblk_dax_ops);
-	if (IS_ERR(dax_dev)) {
-		rc = PTR_ERR(dax_dev);
-		goto put_dev;
-	}
-	set_dax_synchronous(dax_dev);
-	dev_info->dax_dev = dax_dev;
-	rc = dax_add_host(dev_info->dax_dev, dev_info->gd);
+	rc = dcssblk_setup_dax(dev_info);
 	if (rc)
 		goto out_dax;

@@ -917,7 +924,7 @@ __dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff,
 	*kaddr = __va(dev_info->start + offset);
 	if (pfn)
 		*pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset),
-				PFN_DEV|PFN_SPECIAL);
+				PFN_DEV);

 	return (dev_sz - offset) / PAGE_SIZE;
 }
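For reference, the dax_make_shared() idea mentioned above could be as
small as the sketch below; the helper name and shape are only my
suggestion against the quoted dax_associate_entry() hunk, not
something from the series:

	/*
	 * The folio has already been associated with one file, so
	 * transition it to the shared state and convert the aliased
	 * ->mapping/->index fields into a share count.
	 */
	static inline void dax_make_shared(struct folio *folio)
	{
		folio->mapping = NULL;
		folio->share = 1;
	}

...which would reduce the branch in dax_associate_entry() to:

	if (shared && (folio->mapping || dax_folio_is_shared(folio))) {
		if (folio->mapping)
			dax_make_shared(folio);

		dax_folio_share_get(entry);
	} else {
		...
	}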