Alistair Popple wrote:
> Prior to any truncation operations file systems call
> dax_break_mapping() to ensure pages in the range are not undergoing
> DMA. Later DAX page-cache entries will be removed by
> truncate_folio_batch_exceptionals() in the generic page-cache code.
>
> However this makes it possible for folios to be removed from the
> page-cache even though they are still DMA busy if the file-system
> hasn't called dax_break_mapping(). It also means they can never be
> waited on in future because FS DAX will lose track of them once the
> page-cache entry has been deleted.
>
> Instead it is better to delete the FS DAX entry when the file-system
> calls dax_break_mapping() as part of its truncate operation. This
> ensures only idle pages can be removed from the FS DAX page-cache and
> makes it easy to detect if a file-system hasn't called
> dax_break_mapping() prior to a truncate operation.
>
> Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
>
> ---
>
> Ideally I think we would move the whole wait-for-idle logic directly
> into the truncate paths. However this is difficult for a few
> reasons. Each filesystem needs its own wait callback, although a new
> address space operation could address that. More problematic is that
> the wait-for-idle can fail as the wait is TASK_INTERRUPTIBLE, but none
> of the generic truncate paths allow for failure.
>
> So it ends up being easier to continue to let file systems call this
> and check that they behave as expected.
> ---
>  fs/dax.c            | 33 +++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_inode.c  |  6 ++++++
>  include/linux/dax.h |  2 ++
>  mm/truncate.c       | 16 +++++++++++++++-
>  4 files changed, 56 insertions(+), 1 deletion(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 9c3bd07..7008a73 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -845,6 +845,36 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
>  	return ret;
>  }
>  
> +void dax_delete_mapping_range(struct address_space *mapping,
> +		loff_t start, loff_t end)
> +{
> +	void *entry;
> +	pgoff_t start_idx = start >> PAGE_SHIFT;
> +	pgoff_t end_idx;
> +	XA_STATE(xas, &mapping->i_pages, start_idx);
> +
> +	/* If end == LLONG_MAX, all pages from start to till end of file */
> +	if (end == LLONG_MAX)
> +		end_idx = ULONG_MAX;
> +	else
> +		end_idx = end >> PAGE_SHIFT;
> +
> +	xas_lock_irq(&xas);
> +	xas_for_each(&xas, entry, end_idx) {
> +		if (!xa_is_value(entry))
> +			continue;
> +		entry = wait_entry_unlocked_exclusive(&xas, entry);
> +		if (!entry)
> +			continue;
> +		dax_disassociate_entry(entry, mapping, true);
> +		xas_store(&xas, NULL);
> +		mapping->nrpages -= 1UL << dax_entry_order(entry);
> +		put_unlocked_entry(&xas, entry, WAKE_ALL);
> +	}
> +	xas_unlock_irq(&xas);
> +}
> +EXPORT_SYMBOL_GPL(dax_delete_mapping_range);
> +
>  static int wait_page_idle(struct page *page,
>  		void (cb)(struct inode *),
>  		struct inode *inode)
> @@ -874,6 +904,9 @@ int dax_break_mapping(struct inode *inode, loff_t start, loff_t end,
>  		error = wait_page_idle(page, cb, inode);
>  	} while (error == 0);
>  
> +	if (!page)
> +		dax_delete_mapping_range(inode->i_mapping, start, end);
> +

Just reinforcing the rename comment on the last patch... I think this is
an example where the s/dax_break_mapping/dax_break_layout/ rename helps
disambiguate what is related to breaking the layout (waiting for DMA to
complete) and what is related to mapping cleanup, as dax_break_layout()
calls dax_delete_mapping_range().
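For illustration only, this is roughly the call ordering the commit
message expects from a filesystem truncate path once the DAX entries are
dropped here. It is a sketch: example_fs_truncate() and
example_fs_wait_cb() are made-up names, while truncate_setsize() is the
generic mm helper.

/* Sketch of the expected ordering, not code from the patch. */
static int example_fs_truncate(struct inode *inode, loff_t newsize)
{
	int error;

	/*
	 * With this patch the wait-for-DMA-idle step also deletes the
	 * DAX page-cache entries in the range, so the generic truncate
	 * below should never find one.
	 */
	error = dax_break_mapping(inode, newsize, LLONG_MAX,
				  example_fs_wait_cb);
	if (error)
		return error;

	/* Updates i_size and prunes the page cache past the new size. */
	truncate_setsize(inode, newsize);
	return 0;
}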
>  	return error;
>  }
>  EXPORT_SYMBOL_GPL(dax_break_mapping);
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 295730a..4410b42 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -2746,6 +2746,12 @@ xfs_mmaplock_two_inodes_and_break_dax_layout(
>  		goto again;
>  	}
>  
> +	/*
> +	 * Normally xfs_break_dax_layouts() would delete the mapping entries as well so
> +	 * do that here.
> +	 */
> +	dax_delete_mapping_range(VFS_I(ip2)->i_mapping, 0, LLONG_MAX);
> +

I think it is unfortunate that dax_break_mapping is so close to being
useful for this case... how about this incremental cleanup?

diff --git a/fs/dax.c b/fs/dax.c
index facddd6c6bbb..1fa5521e5a2e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -942,12 +942,15 @@ static void wait_page_idle_uninterruptible(struct page *page,
 /*
  * Unmaps the inode and waits for any DMA to complete prior to deleting the
  * DAX mapping entries for the range.
+ *
+ * For NOWAIT behavior, pass @cb as NULL to early-exit on first found
+ * busy page
  */
 int dax_break_mapping(struct inode *inode, loff_t start, loff_t end,
 		void (cb)(struct inode *))
 {
 	struct page *page;
-	int error;
+	int error = 0;
 
 	if (!dax_mapping(inode->i_mapping))
 		return 0;
@@ -956,6 +959,10 @@ int dax_break_mapping(struct inode *inode, loff_t start, loff_t end,
 		page = dax_layout_busy_page_range(inode->i_mapping, start, end);
 		if (!page)
 			break;
+		if (!cb) {
+			error = -ERESTARTSYS;
+			break;
+		}
 		error = wait_page_idle(page, cb, inode);
 	} while (error == 0);
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7bfb4eb387c6..0988a9088259 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2739,19 +2739,13 @@ xfs_mmaplock_two_inodes_and_break_dax_layout(
 	 * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
 	 * for this nested lock case.
 	 */
-	page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
-	if (page && page_ref_count(page) != 0) {
+	error = dax_break_layout(VFS_I(ip2), 0, -1, NULL);
+	if (error) {
 		xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
 		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
 		goto again;
 	}
 
-	/*
-	 * Normally xfs_break_dax_layouts() would delete the mapping entries as well so
-	 * do that here.
-	 */
-	dax_delete_mapping_range(VFS_I(ip2)->i_mapping, 0, LLONG_MAX);
-
 	return 0;
 }

This also addresses Darrick's feedback around introducing
dax_page_in_use(), which xfs does not really care about; it only cares
that no more pages are busy.
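For reference, here is a rough consolidation of what the function would
look like with the patch plus the incremental above applied. The
dax_break_layout name simply assumes the proposed rename; treat this as
a sketch rather than the final code.

int dax_break_layout(struct inode *inode, loff_t start, loff_t end,
		void (cb)(struct inode *))
{
	struct page *page;
	int error = 0;

	if (!dax_mapping(inode->i_mapping))
		return 0;

	do {
		page = dax_layout_busy_page_range(inode->i_mapping, start, end);
		if (!page)
			break;
		/* NOWAIT-style callers pass cb == NULL and bail on a busy page. */
		if (!cb) {
			error = -ERESTARTSYS;
			break;
		}
		error = wait_page_idle(page, cb, inode);
	} while (error == 0);

	/* Only once every page in the range is idle are the entries dropped. */
	if (!page)
		dax_delete_mapping_range(inode->i_mapping, start, end);

	return error;
}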
>  	return 0;
>  }
>  
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index f6583d3..ef9e02c 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -263,6 +263,8 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
>  vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
>  		unsigned int order, pfn_t pfn);
>  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
> +void dax_delete_mapping_range(struct address_space *mapping,
> +		loff_t start, loff_t end);
>  int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
>  		pgoff_t index);
>  int __must_check dax_break_mapping(struct inode *inode, loff_t start,
> diff --git a/mm/truncate.c b/mm/truncate.c
> index 7c304d2..b7f51a6 100644
> --- a/mm/truncate.c
> +++ b/mm/truncate.c
> @@ -78,8 +78,22 @@ static void truncate_folio_batch_exceptionals(struct address_space *mapping,
>  
>  	if (dax_mapping(mapping)) {
>  		for (i = j; i < nr; i++) {
> -			if (xa_is_value(fbatch->folios[i]))
> +			if (xa_is_value(fbatch->folios[i])) {
> +				/*
> +				 * File systems should already have called
> +				 * dax_break_mapping_entry() to remove all DAX
> +				 * entries while holding a lock to prevent
> +				 * establishing new entries. Therefore we
> +				 * shouldn't find any here.
> +				 */
> +				WARN_ON_ONCE(1);
> +
> +				/*
> +				 * Delete the mapping so truncate_pagecache()
> +				 * doesn't loop forever.
> +				 */
>  				dax_delete_mapping_entry(mapping, indices[i]);
> +			}

Looks good. With the above additional fixup you can add:

Reviewed-by: Dan Williams <dan.j.williams@xxxxxxxxx>