The pin_user_pages() + page_maybe_dma_pinned() infrastructure is a
framework for tackling the kernel's struggles with gup+DMA. DAX presents
a unique flavor of the gup+DMA problem since pinned pages are identical
to physical filesystem blocks. Unlike the page-cache case, a mapping of
a file cannot be truncated while DMA is in-flight because the DMA must
complete before the filesystem block is reclaimed.

DAX has a homegrown solution to this problem based on watching the
page->_refcount go idle. Beyond being awkward to catch that idle
transition in put_page(), it is overkill when only the
page_maybe_dma_pinned() transition needs to be captured.

Move the wakeup of filesystem-DAX truncate paths
({ext4,xfs,fuse_dax}_break_layouts()) to unpin_user_pages() with a new
wakeup_fsdax_pin_waiters() helper, and use !page_maybe_dma_pinned() as
the wake condition.

Cc: Jan Kara <jack@xxxxxxx>
Cc: "Darrick J. Wong" <djwong@xxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: John Hubbard <jhubbard@xxxxxxxxxx>
Reported-by: Jason Gunthorpe <jgg@xxxxxxxxxx>
Reported-by: Matthew Wilcox <willy@xxxxxxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
 fs/dax.c           |  4 ++--
 fs/ext4/inode.c    |  7 +++----
 fs/fuse/dax.c      |  6 +++---
 fs/xfs/xfs_file.c  |  6 +++---
 include/linux/mm.h | 28 ++++++++++++++++++++++++++++
 mm/gup.c           |  6 ++++--
 6 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 0f22f7b46de0..aceb587bc27e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -395,7 +395,7 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 	for_each_mapped_pfn(entry, pfn) {
 		struct page *page = pfn_to_page(pfn);
 
-		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
+		WARN_ON_ONCE(trunc && page_maybe_dma_pinned(page));
 		if (dax_mapping_is_cow(page->mapping)) {
 			/* keep the CoW flag if this page is still shared */
 			if (page->index-- > 0)
@@ -414,7 +414,7 @@ static struct page *dax_pinned_page(void *entry)
 	for_each_mapped_pfn(entry, pfn) {
 		struct page *page = pfn_to_page(pfn);
 
-		if (page_ref_count(page) > 1)
+		if (page_maybe_dma_pinned(page))
 			return page;
 	}
 	return NULL;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bf49bf506965..5e68e64f155a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3961,10 +3961,9 @@ int ext4_break_layouts(struct inode *inode)
 		if (!page)
 			return 0;
 
-		error = ___wait_var_event(&page->_refcount,
-				atomic_read(&page->_refcount) == 1,
-				TASK_INTERRUPTIBLE, 0, 0,
-				ext4_wait_dax_page(inode));
+		error = ___wait_var_event(page, !page_maybe_dma_pinned(page),
+				TASK_INTERRUPTIBLE, 0, 0,
+				ext4_wait_dax_page(inode));
 	} while (error == 0);
 
 	return error;
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index e0b846f16bc5..6419ca420c42 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -676,9 +676,9 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
 		return 0;
 
 	*retry = true;
-	return ___wait_var_event(&page->_refcount,
-			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
-			0, 0, fuse_wait_dax_page(inode));
+	return ___wait_var_event(page, !page_maybe_dma_pinned(page),
+			TASK_INTERRUPTIBLE, 0, 0,
+			fuse_wait_dax_page(inode));
 }
 
 /* dmap_end == 0 leads to unmapping of whole file */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 954bb6e83796..dbffb9481b71 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -827,9 +827,9 @@ xfs_break_dax_layouts(
 		return 0;
 
 	*retry = true;
-	return ___wait_var_event(&page->_refcount,
-			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
-			0, 0, xfs_wait_dax_page(inode));
+	return ___wait_var_event(page, !page_maybe_dma_pinned(page),
+			TASK_INTERRUPTIBLE, 0, 0,
+			xfs_wait_dax_page(inode));
 }
 
 int
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3bedc449c14d..557d5447ebec 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1517,6 +1517,34 @@ static inline bool page_maybe_dma_pinned(struct page *page)
 	return folio_maybe_dma_pinned(page_folio(page));
 }
 
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
+/*
+ * Unlike typical file backed pages that support truncating a page from
+ * a file while it is under active DMA, DAX pages need to hold off
+ * truncate operations until transient page pins are released.
+ *
+ * The filesystem (via dax_layout_pinned_page()) takes steps to make
+ * sure that any observation of the !page_maybe_dma_pinned() state is
+ * stable until the truncation completes.
+ */
+static inline void wakeup_fsdax_pin_waiters(struct folio *folio)
+{
+	struct page *page = &folio->page;
+
+	if (!folio_is_zone_device(folio))
+		return;
+	if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
+		return;
+	if (folio_maybe_dma_pinned(folio))
+		return;
+	wake_up_var(page);
+}
+#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
+static inline void wakeup_fsdax_pin_waiters(struct folio *folio)
+{
+}
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
+
 /*
  * This should most likely only be called during fork() to see whether we
  * should break the cow immediately for an anon page on the src mm.
diff --git a/mm/gup.c b/mm/gup.c
index 732825157430..499c46296fda 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -177,8 +177,10 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
 		refs *= GUP_PIN_COUNTING_BIAS;
 	}
 
-	if (!put_devmap_managed_page_refs(&folio->page, refs))
-		folio_put_refs(folio, refs);
+	folio_put_refs(folio, refs);
+
+	if (flags & FOLL_PIN)
+		wakeup_fsdax_pin_waiters(folio);
 }
 
 /**
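
For illustration only, not part of the patch: a minimal userspace sketch
of the wait/wake protocol the patch establishes, where the truncate path
blocks until the pin count observably drops to zero and the unpin path
issues the wakeup on that transition. Pthread condition variables stand
in for the kernel's ___wait_var_event()/wake_up_var() pair, and every
name below (pin_page(), unpin_page(), wait_for_unpin()) is hypothetical.

/* Build with: cc demo.c -lpthread */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t unpinned = PTHREAD_COND_INITIALIZER;
static int pin_count;	/* stands in for the folio's DMA pin count */

/* gup side: take a transient DMA pin (cf. pin_user_pages()) */
static void pin_page(void)
{
	pthread_mutex_lock(&lock);
	pin_count++;
	pthread_mutex_unlock(&lock);
}

/*
 * unpin side: drop the pin and, like unpin_user_pages() after this
 * patch, wake waiters only on the pinned -> unpinned transition
 */
static void unpin_page(void)
{
	pthread_mutex_lock(&lock);
	if (--pin_count == 0)
		pthread_cond_broadcast(&unpinned); /* cf. wake_up_var(page) */
	pthread_mutex_unlock(&lock);
}

/*
 * truncate side: cf. {ext4,xfs,fuse_dax}_break_layouts() sleeping on
 * the !page_maybe_dma_pinned(page) condition
 */
static void wait_for_unpin(void)
{
	pthread_mutex_lock(&lock);
	while (pin_count > 0)	/* cf. ___wait_var_event() condition */
		pthread_cond_wait(&unpinned, &lock);
	pthread_mutex_unlock(&lock);
	printf("no pins remain; safe to reclaim the block\n");
}

static void *truncate_thread(void *arg)
{
	(void)arg;
	wait_for_unpin();
	return NULL;
}

int main(void)
{
	pthread_t t;

	pin_page();				/* DMA begins against the page */
	pthread_create(&t, NULL, truncate_thread, NULL); /* truncate blocks */
	unpin_page();				/* DMA done; waiter is woken */
	pthread_join(t, NULL);
	return 0;
}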