Re: [PATCH v6 07/26] fs/dax: Ensure all pages are idle prior to filesystem unmount

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Jan 10, 2025 at 05:00:35PM +1100, Alistair Popple wrote:
> File systems call dax_break_mapping() prior to reallocating file
> system blocks to ensure the page is not undergoing any DMA or other
> accesses. Generally this is needed when a file is truncated to ensure
> that if a block is reallocated nothing is writing to it. However,
> file systems currently don't call this when an FS DAX inode is evicted.
> 
> This can cause problems when the file system is unmounted, as a page
> can continue to be undergoing DMA or other remote access after
> unmount. This means if the file system is remounted any truncate or
> other operation which requires the underlying file system block to be
> freed will not wait for the remote access to complete. Therefore a
> busy block may be reallocated to a new file leading to corruption.
> 
> Signed-off-by: Alistair Popple <apopple@xxxxxxxxxx>
> 
> ---
> 
> Changes for v5:
> 
>  - Don't wait for pages to be idle in non-DAX mappings
> ---
>  fs/dax.c            | 29 +++++++++++++++++++++++++++++
>  fs/ext4/inode.c     | 32 ++++++++++++++------------------
>  fs/xfs/xfs_inode.c  |  9 +++++++++
>  fs/xfs/xfs_inode.h  |  1 +
>  fs/xfs/xfs_super.c  | 18 ++++++++++++++++++
>  include/linux/dax.h |  2 ++
>  6 files changed, 73 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 7008a73..4e49cc4 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -883,6 +883,14 @@ static int wait_page_idle(struct page *page,
>  				TASK_INTERRUPTIBLE, 0, 0, cb(inode));
>  }
>  
> +static void wait_page_idle_uninterruptible(struct page *page,
> +					void (cb)(struct inode *),
> +					struct inode *inode)
> +{
> +	___wait_var_event(page, page_ref_count(page) == 1,
> +			TASK_UNINTERRUPTIBLE, 0, 0, cb(inode));
> +}
> +
>  /*
>   * Unmaps the inode and waits for any DMA to complete prior to deleting the
>   * DAX mapping entries for the range.
> @@ -911,6 +919,27 @@ int dax_break_mapping(struct inode *inode, loff_t start, loff_t end,
>  }
>  EXPORT_SYMBOL_GPL(dax_break_mapping);
>  
> +void dax_break_mapping_uninterruptible(struct inode *inode,
> +				void (cb)(struct inode *))
> +{
> +	struct page *page;
> +
> +	if (!dax_mapping(inode->i_mapping))
> +		return;
> +
> +	do {
> +		page = dax_layout_busy_page_range(inode->i_mapping, 0,
> +						LLONG_MAX);
> +		if (!page)
> +			break;
> +
> +		wait_page_idle_uninterruptible(page, cb, inode);
> +	} while (true);
> +
> +	dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
> +}
> +EXPORT_SYMBOL_GPL(dax_break_mapping_uninterruptible);
> +
>  /*
>   * Invalidate DAX entry if it is clean.
>   */
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index ee8e83f..fa35161 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -163,6 +163,18 @@ int ext4_inode_is_fast_symlink(struct inode *inode)
>  	       (inode->i_size < EXT4_N_BLOCKS * 4);
>  }
>  
> +static void ext4_wait_dax_page(struct inode *inode)
> +{
> +	filemap_invalidate_unlock(inode->i_mapping);
> +	schedule();
> +	filemap_invalidate_lock(inode->i_mapping);
> +}
> +
> +int ext4_break_layouts(struct inode *inode)
> +{
> +	return dax_break_mapping_inode(inode, ext4_wait_dax_page);
> +}
> +
>  /*
>   * Called at the last iput() if i_nlink is zero.
>   */
> @@ -181,6 +193,8 @@ void ext4_evict_inode(struct inode *inode)
>  
>  	trace_ext4_evict_inode(inode);
>  
> +	dax_break_mapping_uninterruptible(inode, ext4_wait_dax_page);
> +
>  	if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
>  		ext4_evict_ea_inode(inode);
>  	if (inode->i_nlink) {
> @@ -3902,24 +3916,6 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
>  	return ret;
>  }
>  
> -static void ext4_wait_dax_page(struct inode *inode)
> -{
> -	filemap_invalidate_unlock(inode->i_mapping);
> -	schedule();
> -	filemap_invalidate_lock(inode->i_mapping);
> -}
> -
> -int ext4_break_layouts(struct inode *inode)
> -{
> -	struct page *page;
> -	int error;
> -
> -	if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
> -		return -EINVAL;
> -
> -	return dax_break_mapping_inode(inode, ext4_wait_dax_page);
> -}
> -
>  /*
>   * ext4_punch_hole: punches a hole in a file by releasing the blocks
>   * associated with the given offset and length
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 4410b42..c7ec5ab 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -2997,6 +2997,15 @@ xfs_break_dax_layouts(
>  	return dax_break_mapping_inode(inode, xfs_wait_dax_page);
>  }
>  
> +void
> +xfs_break_dax_layouts_uninterruptible(
> +	struct inode		*inode)
> +{
> +	xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
> +
> +	dax_break_mapping_uninterruptible(inode, xfs_wait_dax_page);
> +}
> +
>  int
>  xfs_break_layouts(
>  	struct inode		*inode,
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index c4f03f6..613797a 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -594,6 +594,7 @@ xfs_itruncate_extents(
>  }
>  
>  int	xfs_break_dax_layouts(struct inode *inode);
> +void xfs_break_dax_layouts_uninterruptible(struct inode *inode);
>  int	xfs_break_layouts(struct inode *inode, uint *iolock,
>  		enum layout_break_reason reason);
>  
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 8524b9d..73ec060 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -751,6 +751,23 @@ xfs_fs_drop_inode(
>  	return generic_drop_inode(inode);
>  }
>  
> +STATIC void
> +xfs_fs_evict_inode(
> +	struct inode		*inode)
> +{
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
> +
> +	if (IS_DAX(inode)) {
> +		xfs_ilock(ip, iolock);
> +		xfs_break_dax_layouts_uninterruptible(inode);
> +		xfs_iunlock(ip, iolock);

If we're evicting the inode, why is it necessary to take i_rwsem and the
mmap invalidation lock?  Shouldn't the evicting thread be the only one
with access to this inode?

--D

> +	}
> +
> +	truncate_inode_pages_final(&inode->i_data);
> +	clear_inode(inode);
> +}
> +
>  static void
>  xfs_mount_free(
>  	struct xfs_mount	*mp)
> @@ -1189,6 +1206,7 @@ static const struct super_operations xfs_super_operations = {
>  	.destroy_inode		= xfs_fs_destroy_inode,
>  	.dirty_inode		= xfs_fs_dirty_inode,
>  	.drop_inode		= xfs_fs_drop_inode,
> +	.evict_inode		= xfs_fs_evict_inode,
>  	.put_super		= xfs_fs_put_super,
>  	.sync_fs		= xfs_fs_sync_fs,
>  	.freeze_fs		= xfs_fs_freeze,
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index ef9e02c..7c3773f 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -274,6 +274,8 @@ static inline int __must_check dax_break_mapping_inode(struct inode *inode,
>  {
>  	return dax_break_mapping(inode, 0, LLONG_MAX, cb);
>  }
> +void dax_break_mapping_uninterruptible(struct inode *inode,
> +				void (cb)(struct inode *));
>  int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
>  				  struct inode *dest, loff_t destoff,
>  				  loff_t len, bool *is_same,
> -- 
> git-series 0.9.1
> 




[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux