The patch titled
     Subject: fs/dax: don't skip locked entries when scanning entries
has been added to the -mm mm-unstable branch.  Its filename is
     fs-dax-dont-skip-locked-entries-when-scanning-entries.patch

This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/fs-dax-dont-skip-locked-entries-when-scanning-entries.patch

This patch will later appear in the mm-unstable branch at
    git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days

------------------------------------------------------
From: Alistair Popple <apopple@xxxxxxxxxx>
Subject: fs/dax: don't skip locked entries when scanning entries
Date: Wed, 5 Feb 2025 09:48:00 +1100

Several functions internal to FS DAX use the following pattern when
trying to obtain an unlocked entry:

    xas_for_each(&xas, entry, end_idx) {
	if (dax_is_locked(entry))
		entry = get_unlocked_entry(&xas, 0);

This is problematic because get_unlocked_entry() will get the next
present entry in the range, and that next entry may not be the one that
was locked, so any processing of the original locked entry is skipped.
This can cause dax_layout_busy_page_range() to miss DMA-busy pages in
the range, leading file systems to free blocks whilst DMA operations
are still ongoing, which can result in file system corruption.

Instead, callers within an xas_for_each() loop should wait for the
current entry to become unlocked without advancing the XArray state, so
a new function is introduced to do that.

Also, while we are here, rename get_unlocked_entry() to
get_next_unlocked_entry() to make it clear that it may advance the
iterator state.
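The failure mode is easy to model outside the kernel.  The following
standalone userspace C sketch is an illustration only, not kernel code:
struct entry, get_next_unlocked() and wait_unlocked() are hypothetical
stand-ins for the XArray, get_unlocked_entry() and
wait_entry_unlocked_exclusive() respectively.

    #include <stdbool.h>
    #include <stdio.h>

    struct entry {
	bool locked;
	bool busy;
    };

    /*
     * Models the old behaviour: advance the scan cursor *i to the next
     * UNLOCKED entry, silently stepping over the locked one.
     */
    static struct entry *get_next_unlocked(struct entry *e, int n, int *i)
    {
	while (*i < n && e[*i].locked)
		(*i)++;
	return *i < n ? &e[*i] : NULL;
    }

    /*
     * Models the new behaviour: stay on the CURRENT entry; sleeping
     * until it is unlocked is reduced here to clearing a flag.
     */
    static struct entry *wait_unlocked(struct entry *e, int i)
    {
	e[i].locked = false;
	return &e[i];
    }

    int main(void)
    {
	/* Entry 1 is both locked and DMA-busy. */
	struct entry a[] = { {false, false}, {true, true}, {false, false} };
	struct entry b[] = { {false, false}, {true, true}, {false, false} };
	int busy = 0, i;

	/* Buggy pattern: a locked entry makes us fetch the NEXT entry. */
	for (i = 0; i < 3; i++) {
		struct entry *ent = &a[i];

		if (ent->locked)
			ent = get_next_unlocked(a, 3, &i);
		if (ent && ent->busy)
			busy++;
	}
	printf("buggy scan: %d busy entries found\n", busy);

	/* Fixed pattern: wait on the current entry, then process it. */
	busy = 0;
	for (i = 0; i < 3; i++) {
		struct entry *ent = wait_unlocked(b, i);

		if (ent->busy)
			busy++;
	}
	printf("fixed scan: %d busy entries found\n", busy);
	return 0;
    }

The buggy scan reports zero busy entries because entry 1, which is both
locked and busy, is stepped over; the fixed scan waits on it and
reports one, mirroring the dax_layout_busy_page_range() failure
described above.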
Wong" <djwong@xxxxxxxxxx> Cc: Dave Chinner <david@xxxxxxxxxxxxx> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx> Cc: Dave Jiang <dave.jiang@xxxxxxxxx> Cc: David Hildenbrand <david@xxxxxxxxxx> Cc: Gerald Schaefer <gerald.schaefer@xxxxxxxxxxxxx> Cc: Heiko Carstens <hca@xxxxxxxxxxxxx> Cc: Huacai Chen <chenhuacai@xxxxxxxxxx> Cc: Ira Weiny <ira.weiny@xxxxxxxxx> Cc: Jan Kara <jack@xxxxxxx> Cc: Jason Gunthorpe <jgg@xxxxxxxxxx> Cc: Jason Gunthorpe <jgg@xxxxxxxx> Cc: John Hubbard <jhubbard@xxxxxxxxxx> Cc: linmiaohe <linmiaohe@xxxxxxxxxx> Cc: Logan Gunthorpe <logang@xxxxxxxxxxxx> Cc: Mattew Wilcox <willy@xxxxxxxxxxxxx> Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx> Cc: Nicholas Piggin <npiggin@xxxxxxxxx> Cc: Peter Xu <peterx@xxxxxxxxxx> Cc: Sven Schnelle <svens@xxxxxxxxxxxxx> Cc: Ted Ts'o <tytso@xxxxxxx> Cc: Vasily Gorbik <gor@xxxxxxxxxxxxx> Cc: Vishal Verma <vishal.l.verma@xxxxxxxxx> Cc: Vivek Goyal <vgoyal@xxxxxxxxxx> Cc: WANG Xuerui <kernel@xxxxxxxxxx> Cc: Will Deacon <will@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- fs/dax.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) --- a/fs/dax.c~fs-dax-dont-skip-locked-entries-when-scanning-entries +++ a/fs/dax.c @@ -206,7 +206,7 @@ static void dax_wake_entry(struct xa_sta * * Must be called with the i_pages lock held. */ -static void *get_unlocked_entry(struct xa_state *xas, unsigned int order) +static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order) { void *entry; struct wait_exceptional_entry_queue ewait; @@ -236,6 +236,37 @@ static void *get_unlocked_entry(struct x } /* + * Wait for the given entry to become unlocked. Caller must hold the i_pages + * lock and call either put_unlocked_entry() if it did not lock the entry or + * dax_unlock_entry() if it did. Returns an unlocked entry if still present. + */ +static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry) +{ + struct wait_exceptional_entry_queue ewait; + wait_queue_head_t *wq; + + init_wait(&ewait.wait); + ewait.wait.func = wake_exceptional_entry_func; + + while (unlikely(dax_is_locked(entry))) { + wq = dax_entry_waitqueue(xas, entry, &ewait.key); + prepare_to_wait_exclusive(wq, &ewait.wait, + TASK_UNINTERRUPTIBLE); + xas_pause(xas); + xas_unlock_irq(xas); + schedule(); + finish_wait(wq, &ewait.wait); + xas_lock_irq(xas); + entry = xas_load(xas); + } + + if (xa_is_internal(entry)) + return NULL; + + return entry; +} + +/* * The only thing keeping the address space around is the i_pages lock * (it's cycled in clear_inode() after removing the entries from i_pages) * After we call xas_unlock_irq(), we cannot touch xas->xa. @@ -250,7 +281,7 @@ static void wait_entry_unlocked(struct x wq = dax_entry_waitqueue(xas, entry, &ewait.key); /* - * Unlike get_unlocked_entry() there is no guarantee that this + * Unlike get_next_unlocked_entry() there is no guarantee that this * path ever successfully retrieves an unlocked entry before an * inode dies. Perform a non-exclusive wait in case this path * never successfully performs its own wake up. 
@@ -580,7 +611,7 @@ static void *grab_mapping_entry(struct x
 retry:
 	pmd_downgrade = false;
 	xas_lock_irq(xas);
-	entry = get_unlocked_entry(xas, order);
+	entry = get_next_unlocked_entry(xas, order);
 
 	if (entry) {
 		if (dax_is_conflict(entry))
@@ -716,8 +747,7 @@ struct page *dax_layout_busy_page_range(
 	xas_for_each(&xas, entry, end_idx) {
 		if (WARN_ON_ONCE(!xa_is_value(entry)))
 			continue;
-		if (unlikely(dax_is_locked(entry)))
-			entry = get_unlocked_entry(&xas, 0);
+		entry = wait_entry_unlocked_exclusive(&xas, entry);
 		if (entry)
 			page = dax_busy_page(entry);
 		put_unlocked_entry(&xas, entry, WAKE_NEXT);
@@ -750,7 +780,7 @@ static int __dax_invalidate_entry(struct
 	void *entry;
 
 	xas_lock_irq(&xas);
-	entry = get_unlocked_entry(&xas, 0);
+	entry = get_next_unlocked_entry(&xas, 0);
 	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
 		goto out;
 	if (!trunc &&
@@ -776,7 +806,9 @@ static int __dax_clear_dirty_range(struc
 
 	xas_lock_irq(&xas);
 	xas_for_each(&xas, entry, end) {
-		entry = get_unlocked_entry(&xas, 0);
+		entry = wait_entry_unlocked_exclusive(&xas, entry);
+		if (!entry)
+			continue;
 		xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
 		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
 		put_unlocked_entry(&xas, entry, WAKE_NEXT);
@@ -940,7 +972,7 @@ static int dax_writeback_one(struct xa_s
 	if (unlikely(dax_is_locked(entry))) {
 		void *old_entry = entry;
 
-		entry = get_unlocked_entry(xas, 0);
+		entry = get_next_unlocked_entry(xas, 0);
 
 		/* Entry got punched out / reallocated? */
 		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
@@ -1949,7 +1981,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *
 	vm_fault_t ret;
 
 	xas_lock_irq(&xas);
-	entry = get_unlocked_entry(&xas, order);
+	entry = get_next_unlocked_entry(&xas, order);
 	/* Did we race with someone splitting entry or so? */
 	if (!entry || dax_is_conflict(entry) ||
 	    (order == 0 && !dax_is_pte_entry(entry))) {
_

Patches currently in -mm which might be from apopple@xxxxxxxxxx are

fuse-fix-dax-truncate-punch_hole-fault-path.patch
fs-dax-return-unmapped-busy-pages-from-dax_layout_busy_page_range.patch
fs-dax-dont-skip-locked-entries-when-scanning-entries.patch
fs-dax-refactor-wait-for-dax-idle-page.patch
fs-dax-create-a-common-implementation-to-break-dax-layouts.patch
fs-dax-always-remove-dax-page-cache-entries-when-breaking-layouts.patch
fs-dax-ensure-all-pages-are-idle-prior-to-filesystem-unmount.patch
fs-dax-remove-page_mapping_dax_shared-mapping-flag.patch
mm-gup-remove-redundant-check-for-pci-p2pdma-page.patch
mm-mm_init-move-p2pdma-page-refcount-initialisation-to-p2pdma.patch
mm-allow-compound-zone-device-pages.patch
mm-memory-enhance-insert_page_into_pte_locked-to-create-writable-mappings.patch
mm-memory-add-vmf_insert_page_mkwrite.patch
rmap-add-support-for-pud-sized-mappings-to-rmap.patch
huge_memory-add-vmf_insert_folio_pud.patch
huge_memory-add-vmf_insert_folio_pmd.patch
mm-gup-dont-allow-foll_longterm-pinning-of-fs-dax-pages.patch
fs-dax-properly-refcount-fs-dax-pages.patch
device-dax-properly-refcount-device-dax-pages-when-mapping.patch