Re: [PATCH v3 12/13] dax: handle truncate of dma-busy pages

Jeff Layton <jlayton@xxxxxxxxxx> · Fri, 20 Oct 2017 09:05:26 -0400

On Thu, 2017-10-19 at 19:40 -0700, Dan Williams wrote:
> get_user_pages() pins file backed memory pages for access by dma
> devices. However, it only pins the memory pages not the page-to-file
> offset association. If a file is truncated the pages are mapped out of
> the file and dma may continue indefinitely into a page that is owned by
> a device driver. This breaks coherency of the file vs dma, but the
> assumption is that if userspace wants the file-space truncated it does
> not matter what data is inbound from the device, it is not relevant
> anymore.
> 
> The assumptions of the truncate-page-cache model are broken by DAX where
> the target DMA page *is* the filesystem block. Leaving the page pinned
> for DMA, but truncating the file block out of the file, means that the
> filesytem is free to reallocate a block under active DMA to another
> file!
> 
> Here are some possible options for fixing this situation ('truncate' and
> 'fallocate(punch hole)' are synonymous below):
> 
>     1/ Fail truncate while any file blocks might be under dma
> 
>     2/ Block (sleep-wait) truncate while any file blocks might be under
>        dma
> 
>     3/ Remap file blocks to a "lost+found"-like file-inode where
>        dma can continue and we might see what inbound data from DMA was
>        mapped out of the original file. Blocks in this file could be
>        freed back to the filesystem when dma eventually ends.
> 
>     4/ Disable dax until option 3 or another long term solution has been
>        implemented. However, filesystem-dax is still marked experimental
>        for concerns like this.
> 
> Option 1 will throw failures where userspace has never expected them
> before, option 2 might hang the truncating process indefinitely, and
> option 3 requires per filesystem enabling to remap blocks from one inode
> to another.  Option 2 is implemented in this patch for the DAX path with
> the expectation that non-transient users of get_user_pages() (RDMA) are
> disallowed from setting up dax mappings and that the potential delay
> introduced to the truncate path is acceptable compared to the response
> time of the page cache case. This can only be seen as a stop-gap until
> we can solve the problem of safely sequestering unallocated filesystem
> blocks under active dma.
> 

FWIW, I like #3 a lot more than #2 here. I get that it's quite a bit
more work though, so no objection to this as a stop-gap fix.

> The solution introduces a new FL_ALLOCATED lease to pin the allocated
> blocks in a dax file while dma might be accessing them. It behaves
> identically to an FL_LAYOUT lease save for the fact that it is
> immediately sheduled to be reaped, and that the only path that waits for
> its removal is the truncate path. We can not reuse FL_LAYOUT directly
> since that would deadlock in the case where userspace did a direct-I/O
> operation with a target buffer backed by an mmap range of the same file.
> 
> Credit / inspiration for option 3 goes to Dave Hansen, who proposed
> something similar as an alternative way to solve the problem that
> MAP_DIRECT was trying to solve.
> 
> Cc: Jan Kara <jack@xxxxxxx>
> Cc: Jeff Moyer <jmoyer@xxxxxxxxxx>
> Cc: Dave Chinner <david@xxxxxxxxxxxxx>
> Cc: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
> Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx>
> Cc: "Darrick J. Wong" <darrick.wong@xxxxxxxxxx>
> Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
> Cc: Jeff Layton <jlayton@xxxxxxxxxxxxxxx>
> Cc: "J. Bruce Fields" <bfields@xxxxxxxxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Reported-by: Christoph Hellwig <hch@xxxxxx>
> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
> ---
>  fs/Kconfig          |    1 
>  fs/dax.c            |  188 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/locks.c          |   17 ++++-
>  include/linux/dax.h |   23 ++++++
>  include/linux/fs.h  |   22 +++++-
>  mm/gup.c            |   27 ++++++-
>  6 files changed, 268 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/Kconfig b/fs/Kconfig
> index 7aee6d699fd6..a7b31a96a753 100644
> --- a/fs/Kconfig
> +++ b/fs/Kconfig
> @@ -37,6 +37,7 @@ source "fs/f2fs/Kconfig"
>  config FS_DAX
>  	bool "Direct Access (DAX) support"
>  	depends on MMU
> +	depends on FILE_LOCKING
>  	depends on !(ARM || MIPS || SPARC)
>  	select FS_IOMAP
>  	select DAX
> diff --git a/fs/dax.c b/fs/dax.c
> index b03f547b36e7..e0a3958fc5f2 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -22,6 +22,7 @@
>  #include <linux/genhd.h>
>  #include <linux/highmem.h>
>  #include <linux/memcontrol.h>
> +#include <linux/file.h>
>  #include <linux/mm.h>
>  #include <linux/mutex.h>
>  #include <linux/pagevec.h>
> @@ -1481,3 +1482,190 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
>  	}
>  }
>  EXPORT_SYMBOL_GPL(dax_iomap_fault);
> +
> +enum dax_lease_flags {
> +	DAX_LEASE_PAGES,
> +	DAX_LEASE_BREAK,
> +};
> +
> +struct dax_lease {
> +	struct page **dl_pages;
> +	unsigned long dl_nr_pages;
> +	unsigned long dl_state;
> +	struct file *dl_file;
> +	atomic_t dl_count;
> +	/*
> +	 * Once the lease is taken and the pages have references we
> +	 * start the reap_work to poll for lease release while acquiring
> +	 * fs locks that synchronize with truncate. So, either reap_work
> +	 * cleans up the dax_lease instances or truncate itself.
> +	 *
> +	 * The break_work sleepily polls for DMA completion and then
> +	 * unlocks/removes the lease.
> +	 */
> +	struct delayed_work dl_reap_work;
> +	struct delayed_work dl_break_work;
> +};
> +
> +static void put_dax_lease(struct dax_lease *dl)
> +{
> +	if (atomic_dec_and_test(&dl->dl_count)) {
> +		fput(dl->dl_file);
> +		kfree(dl);
> +	}
> +}

Any reason not to use the new refcount_t type for dl_count? Seems like a
good place for it.

> +
> +static void dax_lease_unlock_one(struct work_struct *work)
> +{
> +	struct dax_lease *dl = container_of(work, typeof(*dl),
> +			dl_break_work.work);
> +	unsigned long i;
> +
> +	/* wait for the gup path to finish recording pages in the lease */
> +	if (!test_bit(DAX_LEASE_PAGES, &dl->dl_state)) {
> +		schedule_delayed_work(&dl->dl_break_work, HZ);
> +		return;
> +	}
> +
> +	/* barrier pairs with dax_lease_set_pages() */
> +	smp_mb__after_atomic();
> +
> +	/*
> +	 * If we see all pages idle at least once we can remove the
> +	 * lease. If we happen to race with someone else taking a
> +	 * reference on a page they will have their own lease to protect
> +	 * against truncate.
> +	 */
> +	for (i = 0; i < dl->dl_nr_pages; i++)
> +		if (page_ref_count(dl->dl_pages[i]) > 1) {
> +			schedule_delayed_work(&dl->dl_break_work, HZ);
> +			return;
> +		}
> +	vfs_setlease(dl->dl_file, F_UNLCK, NULL, (void **) &dl);
> +	put_dax_lease(dl);
> +}
> +
> +static void dax_lease_reap_all(struct work_struct *work)
> +{
> +	struct dax_lease *dl = container_of(work, typeof(*dl),
> +			dl_reap_work.work);
> +	struct file *file = dl->dl_file;
> +	struct inode *inode = file_inode(file);
> +	struct address_space *mapping = inode->i_mapping;
> +
> +	if (mapping->a_ops->dax_flush_dma) {
> +		mapping->a_ops->dax_flush_dma(inode);
> +	} else {
> +		/* FIXME: dax-filesystem needs to add dax-dma support */
> +		break_allocated(inode, true);
> +	}
> +	put_dax_lease(dl);
> +}
> +
> +static bool dax_lease_lm_break(struct file_lock *fl)
> +{
> +	struct dax_lease *dl = fl->fl_owner;
> +
> +	if (!test_and_set_bit(DAX_LEASE_BREAK, &dl->dl_state)) {
> +		atomic_inc(&dl->dl_count);
> +		schedule_delayed_work(&dl->dl_break_work, HZ);
> +	}
> +

I haven't gone over this completely, but what prevents you from doing a
0->1 transition on the dl_count here, and possibly doing a use-after
free?

Ahh ok...I guess we know that we hold a reference since this is on the
flc_lease list? Fair enough. Still, might be worth a comment there as to
why that's safe.

> +	/* Tell the core lease code to wait for delayed work completion */
> +	fl->fl_break_time = 0;
> +
> +	return false;
> +}
> +
> +static int dax_lease_lm_change(struct file_lock *fl, int arg,
> +		struct list_head *dispose)
> +{
> +	struct dax_lease *dl;
> +	int rc;
> +
> +	WARN_ON(!(arg & F_UNLCK));
> +	dl = fl->fl_owner;
> +	rc = lease_modify(fl, arg, dispose);
> +	put_dax_lease(dl);
> +	return rc;
> +}
> +
> +static const struct lock_manager_operations dax_lease_lm_ops = {
> +	.lm_break = dax_lease_lm_break,
> +	.lm_change = dax_lease_lm_change,
> +};
> +
> +struct dax_lease *__dax_truncate_lease(struct vm_area_struct *vma,
> +		long nr_pages)
> +{
> +	struct file *file = vma->vm_file;
> +	struct inode *inode = file_inode(file);
> +	struct dax_lease *dl;
> +	struct file_lock *fl;
> +	int rc = -ENOMEM;
> +
> +	if (!vma_is_dax(vma))
> +		return NULL;
> +
> +	/* device-dax can not be truncated */
> +	if (!S_ISREG(inode->i_mode))
> +		return NULL;
> +
> +	dl = kzalloc(sizeof(*dl) + sizeof(struct page *) * nr_pages, GFP_KERNEL);
> +	if (!dl)
> +		return ERR_PTR(-ENOMEM);
> +
> +	fl = locks_alloc_lock();
> +	if (!fl)
> +		goto err_lock_alloc;
> +
> +	dl->dl_pages = (struct page **)(dl + 1);
> +	INIT_DELAYED_WORK(&dl->dl_break_work, dax_lease_unlock_one);
> +	INIT_DELAYED_WORK(&dl->dl_reap_work, dax_lease_reap_all);
> +	dl->dl_file = get_file(file);
> +	/* need dl alive until dax_lease_set_pages() and final put */
> +	atomic_set(&dl->dl_count, 2);
> +
> +	locks_init_lock(fl);
> +	fl->fl_lmops = &dax_lease_lm_ops;
> +	fl->fl_flags = FL_ALLOCATED;
> +	fl->fl_type = F_RDLCK;
> +	fl->fl_end = OFFSET_MAX;
> +	fl->fl_owner = dl;
> +	fl->fl_pid = current->tgid;
> +	fl->fl_file = file;
> +
> +	rc = vfs_setlease(fl->fl_file, fl->fl_type, &fl, (void **) &dl);
> +	if (rc)
> +		goto err_setlease;
> +	return dl;
> +err_setlease:
> +	locks_free_lock(fl);
> +err_lock_alloc:
> +	kfree(dl);
> +	return ERR_PTR(rc);
> +}
> +
> +void dax_lease_set_pages(struct dax_lease *dl, struct page **pages,
> +		long nr_pages)
> +{
> +	if (IS_ERR_OR_NULL(dl))
> +		return;
> +
> +	if (nr_pages <= 0) {
> +		dl->dl_nr_pages = 0;
> +		smp_mb__before_atomic();
> +		set_bit(DAX_LEASE_PAGES, &dl->dl_state);
> +		vfs_setlease(dl->dl_file, F_UNLCK, NULL, (void **) &dl);
> +		flush_delayed_work(&dl->dl_break_work);
> +		put_dax_lease(dl);
> +		return;
> +	}
> +
> +	dl->dl_nr_pages = nr_pages;
> +	memcpy(dl->dl_pages, pages, sizeof(struct page *) * nr_pages);
> +	smp_mb__before_atomic();
> +	set_bit(DAX_LEASE_PAGES, &dl->dl_state);
> +	queue_delayed_work(system_long_wq, &dl->dl_reap_work, HZ);
> +}
> +EXPORT_SYMBOL_GPL(dax_lease_set_pages);
> diff --git a/fs/locks.c b/fs/locks.c
> index 1bd71c4d663a..0a7841590b35 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -135,7 +135,7 @@
>  
>  #define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
>  #define IS_FLOCK(fl)	(fl->fl_flags & FL_FLOCK)
> -#define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
> +#define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT|FL_ALLOCATED))
>  #define IS_OFDLCK(fl)	(fl->fl_flags & FL_OFDLCK)
>  #define IS_REMOTELCK(fl)	(fl->fl_pid <= 0)
>  
> @@ -1414,7 +1414,9 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
>  
>  static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
>  {
> -	if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
> +	/* FL_LAYOUT and FL_ALLOCATED only conflict with each other */
> +	if (!!(breaker->fl_flags & (FL_LAYOUT|FL_ALLOCATED))
> +			!= !!(lease->fl_flags & (FL_LAYOUT|FL_ALLOCATED)))
>  		return false;
>  	if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
>  		return false;
> @@ -1653,7 +1655,7 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
>  	int ret = 0;
>  	struct inode *inode = dentry->d_inode;
>  
> -	if (flags & FL_LAYOUT)
> +	if (flags & (FL_LAYOUT|FL_ALLOCATED))
>  		return 0;
>  
>  	if ((arg == F_RDLCK) &&
> @@ -1733,6 +1735,15 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
>  		 */
>  		if (arg == F_WRLCK)
>  			goto out;
> +
> +		/*
> +		 * Taking out a new FL_ALLOCATED lease while a previous
> +		 * one is being locked is expected since each instance
> +		 * may be responsible for a distinct range of pages.
> +		 */
> +		if (fl->fl_flags & FL_ALLOCATED)
> +			continue;
> +
>  		/*
>  		 * Modifying our existing lease is OK, but no getting a
>  		 * new lease if someone else is opening for write:
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 122197124b9d..3ff61dc6241e 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -100,10 +100,15 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
>  int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
>  				      pgoff_t index);
>  
> +struct dax_lease;
>  #ifdef CONFIG_FS_DAX
>  int __dax_zero_page_range(struct block_device *bdev,
>  		struct dax_device *dax_dev, sector_t sector,
>  		unsigned int offset, unsigned int length);
> +struct dax_lease *__dax_truncate_lease(struct vm_area_struct *vma,
> +		long nr_pages);
> +void dax_lease_set_pages(struct dax_lease *dl, struct page **pages,
> +		long nr_pages);
>  #else
>  static inline int __dax_zero_page_range(struct block_device *bdev,
>  		struct dax_device *dax_dev, sector_t sector,
> @@ -111,8 +116,26 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
>  {
>  	return -ENXIO;
>  }
> +static inline struct dax_lease *__dax_truncate_lease(struct vm_area_struct *vma,
> +		long nr_pages)
> +{
> +	return NULL;
> +}
> +
> +static inline void dax_lease_set_pages(struct dax_lease *dl,
> +		struct page **pages, long nr_pages)
> +{
> +}
>  #endif
>  
> +static inline struct dax_lease *dax_truncate_lease(struct vm_area_struct *vma,
> +		long nr_pages)
> +{
> +	if (!vma_is_dax(vma))
> +		return NULL;
> +	return __dax_truncate_lease(vma, nr_pages);
> +}
> +
>  static inline bool dax_mapping(struct address_space *mapping)
>  {
>  	return mapping->host && IS_DAX(mapping->host);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index eace2c5396a7..a3ed74833919 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -371,6 +371,9 @@ struct address_space_operations {
>  	int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
>  				sector_t *span);
>  	void (*swap_deactivate)(struct file *file);
> +
> +	/* dax dma support */
> +	void (*dax_flush_dma)(struct inode *inode);
>  };
>  
>  extern const struct address_space_operations empty_aops;
> @@ -927,6 +930,7 @@ static inline struct file *get_file(struct file *f)
>  #define FL_UNLOCK_PENDING	512 /* Lease is being broken */
>  #define FL_OFDLCK	1024	/* lock is "owned" by struct file */
>  #define FL_LAYOUT	2048	/* outstanding pNFS layout */
> +#define FL_ALLOCATED	4096	/* pin allocated dax blocks against dma */
>  
>  #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
>  
> @@ -2324,17 +2328,27 @@ static inline int break_deleg_wait(struct inode **delegated_inode)
>  	return ret;
>  }
>  
> -static inline int break_layout(struct inode *inode, bool wait)
> +static inline int __break_layout(struct inode *inode, bool wait,
> +		unsigned int type)
>  {
>  	struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx);
>  
>  	if (ctx && !list_empty_careful(&ctx->flc_lease))
>  		return __break_lease(inode,
>  				wait ? O_WRONLY : O_WRONLY | O_NONBLOCK,
> -				FL_LAYOUT);
> +				type);
>  	return 0;
>  }
>  
> +static inline int break_layout(struct inode *inode, bool wait)
> +{
> +	return __break_layout(inode, wait, FL_LAYOUT);
> +}
> +
> +static inline int break_allocated(struct inode *inode, bool wait)
> +{
> +	return __break_layout(inode, wait, FL_LAYOUT|FL_ALLOCATED);
> +}
>  #else /* !CONFIG_FILE_LOCKING */
>  static inline int break_lease(struct inode *inode, unsigned int mode)
>  {
> @@ -2362,6 +2376,10 @@ static inline int break_layout(struct inode *inode, bool wait)
>  	return 0;
>  }
>  
> +static inline int break_allocated(struct inode *inode, bool wait)
> +{
> +	return 0;
> +}
>  #endif /* CONFIG_FILE_LOCKING */
>  
>  /* fs/open.c */
> diff --git a/mm/gup.c b/mm/gup.c
> index 308be897d22a..6a7cf371e656 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -9,6 +9,7 @@
>  #include <linux/rmap.h>
>  #include <linux/swap.h>
>  #include <linux/swapops.h>
> +#include <linux/dax.h>
>  
>  #include <linux/sched/signal.h>
>  #include <linux/rwsem.h>
> @@ -640,9 +641,11 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  		unsigned int gup_flags, struct page **pages,
>  		struct vm_area_struct **vmas, int *nonblocking)
>  {
> -	long i = 0;
> +	long i = 0, result = 0;
> +	int dax_lease_once = 0;
>  	unsigned int page_mask;
>  	struct vm_area_struct *vma = NULL;
> +	struct dax_lease *dax_lease = NULL;
>  
>  	if (!nr_pages)
>  		return 0;
> @@ -693,6 +696,14 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  		if (unlikely(fatal_signal_pending(current)))
>  			return i ? i : -ERESTARTSYS;
>  		cond_resched();
> +		if (pages && !dax_lease_once) {
> +			dax_lease_once = 1;
> +			dax_lease = dax_truncate_lease(vma, nr_pages);
> +			if (IS_ERR(dax_lease)) {
> +				result = PTR_ERR(dax_lease);
> +				goto out;
> +			}
> +		}
>  		page = follow_page_mask(vma, start, foll_flags, &page_mask);
>  		if (!page) {
>  			int ret;
> @@ -704,9 +715,11 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  			case -EFAULT:
>  			case -ENOMEM:
>  			case -EHWPOISON:
> -				return i ? i : ret;
> +				result = i ? i : ret;
> +				goto out;
>  			case -EBUSY:
> -				return i;
> +				result = i;
> +				goto out;
>  			case -ENOENT:
>  				goto next_page;
>  			}
> @@ -718,7 +731,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  			 */
>  			goto next_page;
>  		} else if (IS_ERR(page)) {
> -			return i ? i : PTR_ERR(page);
> +			result = i ? i : PTR_ERR(page);
> +			goto out;
>  		}
>  		if (pages) {
>  			pages[i] = page;
> @@ -738,7 +752,10 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>  		start += page_increm * PAGE_SIZE;
>  		nr_pages -= page_increm;
>  	} while (nr_pages);
> -	return i;
> +	result = i;
> +out:
> +	dax_lease_set_pages(dax_lease, pages, result);
> +	return result;
>  }
>  
>  static bool vma_permits_fault(struct vm_area_struct *vma,
>