[PATCH v3 12/13] dax: handle truncate of dma-busy pages

Dan Williams <dan.j.williams@xxxxxxxxx> · Thu, 19 Oct 2017 19:40:02 -0700

get_user_pages() pins file backed memory pages for access by dma
devices. However, it only pins the memory pages not the page-to-file
offset association. If a file is truncated the pages are mapped out of
the file and dma may continue indefinitely into a page that is owned by
a device driver. This breaks coherency of the file vs dma, but the
assumption is that if userspace wants the file-space truncated it does
not matter what data is inbound from the device, it is not relevant
anymore.

The assumptions of the truncate-page-cache model are broken by DAX where
the target DMA page *is* the filesystem block. Leaving the page pinned
for DMA, but truncating the file block out of the file, means that the
filesytem is free to reallocate a block under active DMA to another
file!

Here are some possible options for fixing this situation ('truncate' and
'fallocate(punch hole)' are synonymous below):

    1/ Fail truncate while any file blocks might be under dma

    2/ Block (sleep-wait) truncate while any file blocks might be under
       dma

    3/ Remap file blocks to a "lost+found"-like file-inode where
       dma can continue and we might see what inbound data from DMA was
       mapped out of the original file. Blocks in this file could be
       freed back to the filesystem when dma eventually ends.

    4/ Disable dax until option 3 or another long term solution has been
       implemented. However, filesystem-dax is still marked experimental
       for concerns like this.

Option 1 will throw failures where userspace has never expected them
before, option 2 might hang the truncating process indefinitely, and
option 3 requires per filesystem enabling to remap blocks from one inode
to another.  Option 2 is implemented in this patch for the DAX path with
the expectation that non-transient users of get_user_pages() (RDMA) are
disallowed from setting up dax mappings and that the potential delay
introduced to the truncate path is acceptable compared to the response
time of the page cache case. This can only be seen as a stop-gap until
we can solve the problem of safely sequestering unallocated filesystem
blocks under active dma.

The solution introduces a new FL_ALLOCATED lease to pin the allocated
blocks in a dax file while dma might be accessing them. It behaves
identically to an FL_LAYOUT lease save for the fact that it is
immediately sheduled to be reaped, and that the only path that waits for
its removal is the truncate path. We can not reuse FL_LAYOUT directly
since that would deadlock in the case where userspace did a direct-I/O
operation with a target buffer backed by an mmap range of the same file.

Credit / inspiration for option 3 goes to Dave Hansen, who proposed
something similar as an alternative way to solve the problem that
MAP_DIRECT was trying to solve.

Cc: Jan Kara <jack@xxxxxxx>
Cc: Jeff Moyer <jmoyer@xxxxxxxxxx>
Cc: Dave Chinner <david@xxxxxxxxxxxxx>
Cc: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: "Darrick J. Wong" <darrick.wong@xxxxxxxxxx>
Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
Cc: Jeff Layton <jlayton@xxxxxxxxxxxxxxx>
Cc: "J. Bruce Fields" <bfields@xxxxxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Reported-by: Christoph Hellwig <hch@xxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
 fs/Kconfig          |    1 
 fs/dax.c            |  188 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/locks.c          |   17 ++++-
 include/linux/dax.h |   23 ++++++
 include/linux/fs.h  |   22 +++++-
 mm/gup.c            |   27 ++++++-
 6 files changed, 268 insertions(+), 10 deletions(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index 7aee6d699fd6..a7b31a96a753 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -37,6 +37,7 @@ source "fs/f2fs/Kconfig"
 config FS_DAX
 	bool "Direct Access (DAX) support"
 	depends on MMU
+	depends on FILE_LOCKING
 	depends on !(ARM || MIPS || SPARC)
 	select FS_IOMAP
 	select DAX
diff --git a/fs/dax.c b/fs/dax.c
index b03f547b36e7..e0a3958fc5f2 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -22,6 +22,7 @@
 #include <linux/genhd.h>
 #include <linux/highmem.h>
 #include <linux/memcontrol.h>
+#include <linux/file.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/pagevec.h>
@@ -1481,3 +1482,190 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
 	}
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
+
+enum dax_lease_flags {
+	DAX_LEASE_PAGES,
+	DAX_LEASE_BREAK,
+};
+
+struct dax_lease {
+	struct page **dl_pages;
+	unsigned long dl_nr_pages;
+	unsigned long dl_state;
+	struct file *dl_file;
+	atomic_t dl_count;
+	/*
+	 * Once the lease is taken and the pages have references we
+	 * start the reap_work to poll for lease release while acquiring
+	 * fs locks that synchronize with truncate. So, either reap_work
+	 * cleans up the dax_lease instances or truncate itself.
+	 *
+	 * The break_work sleepily polls for DMA completion and then
+	 * unlocks/removes the lease.
+	 */
+	struct delayed_work dl_reap_work;
+	struct delayed_work dl_break_work;
+};
+
+static void put_dax_lease(struct dax_lease *dl)
+{
+	if (atomic_dec_and_test(&dl->dl_count)) {
+		fput(dl->dl_file);
+		kfree(dl);
+	}
+}
+
+static void dax_lease_unlock_one(struct work_struct *work)
+{
+	struct dax_lease *dl = container_of(work, typeof(*dl),
+			dl_break_work.work);
+	unsigned long i;
+
+	/* wait for the gup path to finish recording pages in the lease */
+	if (!test_bit(DAX_LEASE_PAGES, &dl->dl_state)) {
+		schedule_delayed_work(&dl->dl_break_work, HZ);
+		return;
+	}
+
+	/* barrier pairs with dax_lease_set_pages() */
+	smp_mb__after_atomic();
+
+	/*
+	 * If we see all pages idle at least once we can remove the
+	 * lease. If we happen to race with someone else taking a
+	 * reference on a page they will have their own lease to protect
+	 * against truncate.
+	 */
+	for (i = 0; i < dl->dl_nr_pages; i++)
+		if (page_ref_count(dl->dl_pages[i]) > 1) {
+			schedule_delayed_work(&dl->dl_break_work, HZ);
+			return;
+		}
+	vfs_setlease(dl->dl_file, F_UNLCK, NULL, (void **) &dl);
+	put_dax_lease(dl);
+}
+
+static void dax_lease_reap_all(struct work_struct *work)
+{
+	struct dax_lease *dl = container_of(work, typeof(*dl),
+			dl_reap_work.work);
+	struct file *file = dl->dl_file;
+	struct inode *inode = file_inode(file);
+	struct address_space *mapping = inode->i_mapping;
+
+	if (mapping->a_ops->dax_flush_dma) {
+		mapping->a_ops->dax_flush_dma(inode);
+	} else {
+		/* FIXME: dax-filesystem needs to add dax-dma support */
+		break_allocated(inode, true);
+	}
+	put_dax_lease(dl);
+}
+
+static bool dax_lease_lm_break(struct file_lock *fl)
+{
+	struct dax_lease *dl = fl->fl_owner;
+
+	if (!test_and_set_bit(DAX_LEASE_BREAK, &dl->dl_state)) {
+		atomic_inc(&dl->dl_count);
+		schedule_delayed_work(&dl->dl_break_work, HZ);
+	}
+
+	/* Tell the core lease code to wait for delayed work completion */
+	fl->fl_break_time = 0;
+
+	return false;
+}
+
+static int dax_lease_lm_change(struct file_lock *fl, int arg,
+		struct list_head *dispose)
+{
+	struct dax_lease *dl;
+	int rc;
+
+	WARN_ON(!(arg & F_UNLCK));
+	dl = fl->fl_owner;
+	rc = lease_modify(fl, arg, dispose);
+	put_dax_lease(dl);
+	return rc;
+}
+
+static const struct lock_manager_operations dax_lease_lm_ops = {
+	.lm_break = dax_lease_lm_break,
+	.lm_change = dax_lease_lm_change,
+};
+
+struct dax_lease *__dax_truncate_lease(struct vm_area_struct *vma,
+		long nr_pages)
+{
+	struct file *file = vma->vm_file;
+	struct inode *inode = file_inode(file);
+	struct dax_lease *dl;
+	struct file_lock *fl;
+	int rc = -ENOMEM;
+
+	if (!vma_is_dax(vma))
+		return NULL;
+
+	/* device-dax can not be truncated */
+	if (!S_ISREG(inode->i_mode))
+		return NULL;
+
+	dl = kzalloc(sizeof(*dl) + sizeof(struct page *) * nr_pages, GFP_KERNEL);
+	if (!dl)
+		return ERR_PTR(-ENOMEM);
+
+	fl = locks_alloc_lock();
+	if (!fl)
+		goto err_lock_alloc;
+
+	dl->dl_pages = (struct page **)(dl + 1);
+	INIT_DELAYED_WORK(&dl->dl_break_work, dax_lease_unlock_one);
+	INIT_DELAYED_WORK(&dl->dl_reap_work, dax_lease_reap_all);
+	dl->dl_file = get_file(file);
+	/* need dl alive until dax_lease_set_pages() and final put */
+	atomic_set(&dl->dl_count, 2);
+
+	locks_init_lock(fl);
+	fl->fl_lmops = &dax_lease_lm_ops;
+	fl->fl_flags = FL_ALLOCATED;
+	fl->fl_type = F_RDLCK;
+	fl->fl_end = OFFSET_MAX;
+	fl->fl_owner = dl;
+	fl->fl_pid = current->tgid;
+	fl->fl_file = file;
+
+	rc = vfs_setlease(fl->fl_file, fl->fl_type, &fl, (void **) &dl);
+	if (rc)
+		goto err_setlease;
+	return dl;
+err_setlease:
+	locks_free_lock(fl);
+err_lock_alloc:
+	kfree(dl);
+	return ERR_PTR(rc);
+}
+
+void dax_lease_set_pages(struct dax_lease *dl, struct page **pages,
+		long nr_pages)
+{
+	if (IS_ERR_OR_NULL(dl))
+		return;
+
+	if (nr_pages <= 0) {
+		dl->dl_nr_pages = 0;
+		smp_mb__before_atomic();
+		set_bit(DAX_LEASE_PAGES, &dl->dl_state);
+		vfs_setlease(dl->dl_file, F_UNLCK, NULL, (void **) &dl);
+		flush_delayed_work(&dl->dl_break_work);
+		put_dax_lease(dl);
+		return;
+	}
+
+	dl->dl_nr_pages = nr_pages;
+	memcpy(dl->dl_pages, pages, sizeof(struct page *) * nr_pages);
+	smp_mb__before_atomic();
+	set_bit(DAX_LEASE_PAGES, &dl->dl_state);
+	queue_delayed_work(system_long_wq, &dl->dl_reap_work, HZ);
+}
+EXPORT_SYMBOL_GPL(dax_lease_set_pages);
diff --git a/fs/locks.c b/fs/locks.c
index 1bd71c4d663a..0a7841590b35 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -135,7 +135,7 @@
 
 #define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
 #define IS_FLOCK(fl)	(fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
+#define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT|FL_ALLOCATED))
 #define IS_OFDLCK(fl)	(fl->fl_flags & FL_OFDLCK)
 #define IS_REMOTELCK(fl)	(fl->fl_pid <= 0)
 
@@ -1414,7 +1414,9 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
 
 static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
 {
-	if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
+	/* FL_LAYOUT and FL_ALLOCATED only conflict with each other */
+	if (!!(breaker->fl_flags & (FL_LAYOUT|FL_ALLOCATED))
+			!= !!(lease->fl_flags & (FL_LAYOUT|FL_ALLOCATED)))
 		return false;
 	if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
 		return false;
@@ -1653,7 +1655,7 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
 	int ret = 0;
 	struct inode *inode = dentry->d_inode;
 
-	if (flags & FL_LAYOUT)
+	if (flags & (FL_LAYOUT|FL_ALLOCATED))
 		return 0;
 
 	if ((arg == F_RDLCK) &&
@@ -1733,6 +1735,15 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 		 */
 		if (arg == F_WRLCK)
 			goto out;
+
+		/*
+		 * Taking out a new FL_ALLOCATED lease while a previous
+		 * one is being locked is expected since each instance
+		 * may be responsible for a distinct range of pages.
+		 */
+		if (fl->fl_flags & FL_ALLOCATED)
+			continue;
+
 		/*
 		 * Modifying our existing lease is OK, but no getting a
 		 * new lease if someone else is opening for write:
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 122197124b9d..3ff61dc6241e 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -100,10 +100,15 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);
 
+struct dax_lease;
 #ifdef CONFIG_FS_DAX
 int __dax_zero_page_range(struct block_device *bdev,
 		struct dax_device *dax_dev, sector_t sector,
 		unsigned int offset, unsigned int length);
+struct dax_lease *__dax_truncate_lease(struct vm_area_struct *vma,
+		long nr_pages);
+void dax_lease_set_pages(struct dax_lease *dl, struct page **pages,
+		long nr_pages);
 #else
 static inline int __dax_zero_page_range(struct block_device *bdev,
 		struct dax_device *dax_dev, sector_t sector,
@@ -111,8 +116,26 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
 {
 	return -ENXIO;
 }
+static inline struct dax_lease *__dax_truncate_lease(struct vm_area_struct *vma,
+		long nr_pages)
+{
+	return NULL;
+}
+
+static inline void dax_lease_set_pages(struct dax_lease *dl,
+		struct page **pages, long nr_pages)
+{
+}
 #endif
 
+static inline struct dax_lease *dax_truncate_lease(struct vm_area_struct *vma,
+		long nr_pages)
+{
+	if (!vma_is_dax(vma))
+		return NULL;
+	return __dax_truncate_lease(vma, nr_pages);
+}
+
 static inline bool dax_mapping(struct address_space *mapping)
 {
 	return mapping->host && IS_DAX(mapping->host);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index eace2c5396a7..a3ed74833919 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -371,6 +371,9 @@ struct address_space_operations {
 	int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
 				sector_t *span);
 	void (*swap_deactivate)(struct file *file);
+
+	/* dax dma support */
+	void (*dax_flush_dma)(struct inode *inode);
 };
 
 extern const struct address_space_operations empty_aops;
@@ -927,6 +930,7 @@ static inline struct file *get_file(struct file *f)
 #define FL_UNLOCK_PENDING	512 /* Lease is being broken */
 #define FL_OFDLCK	1024	/* lock is "owned" by struct file */
 #define FL_LAYOUT	2048	/* outstanding pNFS layout */
+#define FL_ALLOCATED	4096	/* pin allocated dax blocks against dma */
 
 #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
 
@@ -2324,17 +2328,27 @@ static inline int break_deleg_wait(struct inode **delegated_inode)
 	return ret;
 }
 
-static inline int break_layout(struct inode *inode, bool wait)
+static inline int __break_layout(struct inode *inode, bool wait,
+		unsigned int type)
 {
 	struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx);
 
 	if (ctx && !list_empty_careful(&ctx->flc_lease))
 		return __break_lease(inode,
 				wait ? O_WRONLY : O_WRONLY | O_NONBLOCK,
-				FL_LAYOUT);
+				type);
 	return 0;
 }
 
+static inline int break_layout(struct inode *inode, bool wait)
+{
+	return __break_layout(inode, wait, FL_LAYOUT);
+}
+
+static inline int break_allocated(struct inode *inode, bool wait)
+{
+	return __break_layout(inode, wait, FL_LAYOUT|FL_ALLOCATED);
+}
 #else /* !CONFIG_FILE_LOCKING */
 static inline int break_lease(struct inode *inode, unsigned int mode)
 {
@@ -2362,6 +2376,10 @@ static inline int break_layout(struct inode *inode, bool wait)
 	return 0;
 }
 
+static inline int break_allocated(struct inode *inode, bool wait)
+{
+	return 0;
+}
 #endif /* CONFIG_FILE_LOCKING */
 
 /* fs/open.c */
diff --git a/mm/gup.c b/mm/gup.c
index 308be897d22a..6a7cf371e656 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -9,6 +9,7 @@
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/dax.h>
 
 #include <linux/sched/signal.h>
 #include <linux/rwsem.h>
@@ -640,9 +641,11 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned int gup_flags, struct page **pages,
 		struct vm_area_struct **vmas, int *nonblocking)
 {
-	long i = 0;
+	long i = 0, result = 0;
+	int dax_lease_once = 0;
 	unsigned int page_mask;
 	struct vm_area_struct *vma = NULL;
+	struct dax_lease *dax_lease = NULL;
 
 	if (!nr_pages)
 		return 0;
@@ -693,6 +696,14 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		if (unlikely(fatal_signal_pending(current)))
 			return i ? i : -ERESTARTSYS;
 		cond_resched();
+		if (pages && !dax_lease_once) {
+			dax_lease_once = 1;
+			dax_lease = dax_truncate_lease(vma, nr_pages);
+			if (IS_ERR(dax_lease)) {
+				result = PTR_ERR(dax_lease);
+				goto out;
+			}
+		}
 		page = follow_page_mask(vma, start, foll_flags, &page_mask);
 		if (!page) {
 			int ret;
@@ -704,9 +715,11 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			case -EFAULT:
 			case -ENOMEM:
 			case -EHWPOISON:
-				return i ? i : ret;
+				result = i ? i : ret;
+				goto out;
 			case -EBUSY:
-				return i;
+				result = i;
+				goto out;
 			case -ENOENT:
 				goto next_page;
 			}
@@ -718,7 +731,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			 */
 			goto next_page;
 		} else if (IS_ERR(page)) {
-			return i ? i : PTR_ERR(page);
+			result = i ? i : PTR_ERR(page);
+			goto out;
 		}
 		if (pages) {
 			pages[i] = page;
@@ -738,7 +752,10 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		start += page_increm * PAGE_SIZE;
 		nr_pages -= page_increm;
 	} while (nr_pages);
-	return i;
+	result = i;
+out:
+	dax_lease_set_pages(dax_lease, pages, result);
+	return result;
 }
 
 static bool vma_permits_fault(struct vm_area_struct *vma,