On Thu, 2017-10-19 at 19:40 -0700, Dan Williams wrote: > get_user_pages() pins file backed memory pages for access by dma > devices. However, it only pins the memory pages not the page-to-file > offset association. If a file is truncated the pages are mapped out of > the file and dma may continue indefinitely into a page that is owned by > a device driver. This breaks coherency of the file vs dma, but the > assumption is that if userspace wants the file-space truncated it does > not matter what data is inbound from the device, it is not relevant > anymore. > > The assumptions of the truncate-page-cache model are broken by DAX where > the target DMA page *is* the filesystem block. Leaving the page pinned > for DMA, but truncating the file block out of the file, means that the > filesytem is free to reallocate a block under active DMA to another > file! > > Here are some possible options for fixing this situation ('truncate' and > 'fallocate(punch hole)' are synonymous below): > > 1/ Fail truncate while any file blocks might be under dma > > 2/ Block (sleep-wait) truncate while any file blocks might be under > dma > > 3/ Remap file blocks to a "lost+found"-like file-inode where > dma can continue and we might see what inbound data from DMA was > mapped out of the original file. Blocks in this file could be > freed back to the filesystem when dma eventually ends. > > 4/ Disable dax until option 3 or another long term solution has been > implemented. However, filesystem-dax is still marked experimental > for concerns like this. > > Option 1 will throw failures where userspace has never expected them > before, option 2 might hang the truncating process indefinitely, and > option 3 requires per filesystem enabling to remap blocks from one inode > to another. Option 2 is implemented in this patch for the DAX path with > the expectation that non-transient users of get_user_pages() (RDMA) are > disallowed from setting up dax mappings and that the potential delay > introduced to the truncate path is acceptable compared to the response > time of the page cache case. This can only be seen as a stop-gap until > we can solve the problem of safely sequestering unallocated filesystem > blocks under active dma. > FWIW, I like #3 a lot more than #2 here. I get that it's quite a bit more work though, so no objection to this as a stop-gap fix. > The solution introduces a new FL_ALLOCATED lease to pin the allocated > blocks in a dax file while dma might be accessing them. It behaves > identically to an FL_LAYOUT lease save for the fact that it is > immediately sheduled to be reaped, and that the only path that waits for > its removal is the truncate path. We can not reuse FL_LAYOUT directly > since that would deadlock in the case where userspace did a direct-I/O > operation with a target buffer backed by an mmap range of the same file. > > Credit / inspiration for option 3 goes to Dave Hansen, who proposed > something similar as an alternative way to solve the problem that > MAP_DIRECT was trying to solve. > > Cc: Jan Kara <jack@xxxxxxx> > Cc: Jeff Moyer <jmoyer@xxxxxxxxxx> > Cc: Dave Chinner <david@xxxxxxxxxxxxx> > Cc: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx> > Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx> > Cc: "Darrick J. Wong" <darrick.wong@xxxxxxxxxx> > Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx> > Cc: Jeff Layton <jlayton@xxxxxxxxxxxxxxx> > Cc: "J. Bruce Fields" <bfields@xxxxxxxxxxxx> > Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx> > Reported-by: Christoph Hellwig <hch@xxxxxx> > Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx> > --- > fs/Kconfig | 1 > fs/dax.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++ > fs/locks.c | 17 ++++- > include/linux/dax.h | 23 ++++++ > include/linux/fs.h | 22 +++++- > mm/gup.c | 27 ++++++- > 6 files changed, 268 insertions(+), 10 deletions(-) > > diff --git a/fs/Kconfig b/fs/Kconfig > index 7aee6d699fd6..a7b31a96a753 100644 > --- a/fs/Kconfig > +++ b/fs/Kconfig > @@ -37,6 +37,7 @@ source "fs/f2fs/Kconfig" > config FS_DAX > bool "Direct Access (DAX) support" > depends on MMU > + depends on FILE_LOCKING > depends on !(ARM || MIPS || SPARC) > select FS_IOMAP > select DAX > diff --git a/fs/dax.c b/fs/dax.c > index b03f547b36e7..e0a3958fc5f2 100644 > --- a/fs/dax.c > +++ b/fs/dax.c > @@ -22,6 +22,7 @@ > #include <linux/genhd.h> > #include <linux/highmem.h> > #include <linux/memcontrol.h> > +#include <linux/file.h> > #include <linux/mm.h> > #include <linux/mutex.h> > #include <linux/pagevec.h> > @@ -1481,3 +1482,190 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, > } > } > EXPORT_SYMBOL_GPL(dax_iomap_fault); > + > +enum dax_lease_flags { > + DAX_LEASE_PAGES, > + DAX_LEASE_BREAK, > +}; > + > +struct dax_lease { > + struct page **dl_pages; > + unsigned long dl_nr_pages; > + unsigned long dl_state; > + struct file *dl_file; > + atomic_t dl_count; > + /* > + * Once the lease is taken and the pages have references we > + * start the reap_work to poll for lease release while acquiring > + * fs locks that synchronize with truncate. So, either reap_work > + * cleans up the dax_lease instances or truncate itself. > + * > + * The break_work sleepily polls for DMA completion and then > + * unlocks/removes the lease. > + */ > + struct delayed_work dl_reap_work; > + struct delayed_work dl_break_work; > +}; > + > +static void put_dax_lease(struct dax_lease *dl) > +{ > + if (atomic_dec_and_test(&dl->dl_count)) { > + fput(dl->dl_file); > + kfree(dl); > + } > +} Any reason not to use the new refcount_t type for dl_count? Seems like a good place for it. > + > +static void dax_lease_unlock_one(struct work_struct *work) > +{ > + struct dax_lease *dl = container_of(work, typeof(*dl), > + dl_break_work.work); > + unsigned long i; > + > + /* wait for the gup path to finish recording pages in the lease */ > + if (!test_bit(DAX_LEASE_PAGES, &dl->dl_state)) { > + schedule_delayed_work(&dl->dl_break_work, HZ); > + return; > + } > + > + /* barrier pairs with dax_lease_set_pages() */ > + smp_mb__after_atomic(); > + > + /* > + * If we see all pages idle at least once we can remove the > + * lease. If we happen to race with someone else taking a > + * reference on a page they will have their own lease to protect > + * against truncate. > + */ > + for (i = 0; i < dl->dl_nr_pages; i++) > + if (page_ref_count(dl->dl_pages[i]) > 1) { > + schedule_delayed_work(&dl->dl_break_work, HZ); > + return; > + } > + vfs_setlease(dl->dl_file, F_UNLCK, NULL, (void **) &dl); > + put_dax_lease(dl); > +} > + > +static void dax_lease_reap_all(struct work_struct *work) > +{ > + struct dax_lease *dl = container_of(work, typeof(*dl), > + dl_reap_work.work); > + struct file *file = dl->dl_file; > + struct inode *inode = file_inode(file); > + struct address_space *mapping = inode->i_mapping; > + > + if (mapping->a_ops->dax_flush_dma) { > + mapping->a_ops->dax_flush_dma(inode); > + } else { > + /* FIXME: dax-filesystem needs to add dax-dma support */ > + break_allocated(inode, true); > + } > + put_dax_lease(dl); > +} > + > +static bool dax_lease_lm_break(struct file_lock *fl) > +{ > + struct dax_lease *dl = fl->fl_owner; > + > + if (!test_and_set_bit(DAX_LEASE_BREAK, &dl->dl_state)) { > + atomic_inc(&dl->dl_count); > + schedule_delayed_work(&dl->dl_break_work, HZ); > + } > + I haven't gone over this completely, but what prevents you from doing a 0->1 transition on the dl_count here, and possibly doing a use-after free? Ahh ok...I guess we know that we hold a reference since this is on the flc_lease list? Fair enough. Still, might be worth a comment there as to why that's safe. > + /* Tell the core lease code to wait for delayed work completion */ > + fl->fl_break_time = 0; > + > + return false; > +} > + > +static int dax_lease_lm_change(struct file_lock *fl, int arg, > + struct list_head *dispose) > +{ > + struct dax_lease *dl; > + int rc; > + > + WARN_ON(!(arg & F_UNLCK)); > + dl = fl->fl_owner; > + rc = lease_modify(fl, arg, dispose); > + put_dax_lease(dl); > + return rc; > +} > + > +static const struct lock_manager_operations dax_lease_lm_ops = { > + .lm_break = dax_lease_lm_break, > + .lm_change = dax_lease_lm_change, > +}; > + > +struct dax_lease *__dax_truncate_lease(struct vm_area_struct *vma, > + long nr_pages) > +{ > + struct file *file = vma->vm_file; > + struct inode *inode = file_inode(file); > + struct dax_lease *dl; > + struct file_lock *fl; > + int rc = -ENOMEM; > + > + if (!vma_is_dax(vma)) > + return NULL; > + > + /* device-dax can not be truncated */ > + if (!S_ISREG(inode->i_mode)) > + return NULL; > + > + dl = kzalloc(sizeof(*dl) + sizeof(struct page *) * nr_pages, GFP_KERNEL); > + if (!dl) > + return ERR_PTR(-ENOMEM); > + > + fl = locks_alloc_lock(); > + if (!fl) > + goto err_lock_alloc; > + > + dl->dl_pages = (struct page **)(dl + 1); > + INIT_DELAYED_WORK(&dl->dl_break_work, dax_lease_unlock_one); > + INIT_DELAYED_WORK(&dl->dl_reap_work, dax_lease_reap_all); > + dl->dl_file = get_file(file); > + /* need dl alive until dax_lease_set_pages() and final put */ > + atomic_set(&dl->dl_count, 2); > + > + locks_init_lock(fl); > + fl->fl_lmops = &dax_lease_lm_ops; > + fl->fl_flags = FL_ALLOCATED; > + fl->fl_type = F_RDLCK; > + fl->fl_end = OFFSET_MAX; > + fl->fl_owner = dl; > + fl->fl_pid = current->tgid; > + fl->fl_file = file; > + > + rc = vfs_setlease(fl->fl_file, fl->fl_type, &fl, (void **) &dl); > + if (rc) > + goto err_setlease; > + return dl; > +err_setlease: > + locks_free_lock(fl); > +err_lock_alloc: > + kfree(dl); > + return ERR_PTR(rc); > +} > + > +void dax_lease_set_pages(struct dax_lease *dl, struct page **pages, > + long nr_pages) > +{ > + if (IS_ERR_OR_NULL(dl)) > + return; > + > + if (nr_pages <= 0) { > + dl->dl_nr_pages = 0; > + smp_mb__before_atomic(); > + set_bit(DAX_LEASE_PAGES, &dl->dl_state); > + vfs_setlease(dl->dl_file, F_UNLCK, NULL, (void **) &dl); > + flush_delayed_work(&dl->dl_break_work); > + put_dax_lease(dl); > + return; > + } > + > + dl->dl_nr_pages = nr_pages; > + memcpy(dl->dl_pages, pages, sizeof(struct page *) * nr_pages); > + smp_mb__before_atomic(); > + set_bit(DAX_LEASE_PAGES, &dl->dl_state); > + queue_delayed_work(system_long_wq, &dl->dl_reap_work, HZ); > +} > +EXPORT_SYMBOL_GPL(dax_lease_set_pages); > diff --git a/fs/locks.c b/fs/locks.c > index 1bd71c4d663a..0a7841590b35 100644 > --- a/fs/locks.c > +++ b/fs/locks.c > @@ -135,7 +135,7 @@ > > #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) > #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) > -#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) > +#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT|FL_ALLOCATED)) > #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) > #define IS_REMOTELCK(fl) (fl->fl_pid <= 0) > > @@ -1414,7 +1414,9 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) > > static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) > { > - if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) > + /* FL_LAYOUT and FL_ALLOCATED only conflict with each other */ > + if (!!(breaker->fl_flags & (FL_LAYOUT|FL_ALLOCATED)) > + != !!(lease->fl_flags & (FL_LAYOUT|FL_ALLOCATED))) > return false; > if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) > return false; > @@ -1653,7 +1655,7 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags) > int ret = 0; > struct inode *inode = dentry->d_inode; > > - if (flags & FL_LAYOUT) > + if (flags & (FL_LAYOUT|FL_ALLOCATED)) > return 0; > > if ((arg == F_RDLCK) && > @@ -1733,6 +1735,15 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr > */ > if (arg == F_WRLCK) > goto out; > + > + /* > + * Taking out a new FL_ALLOCATED lease while a previous > + * one is being locked is expected since each instance > + * may be responsible for a distinct range of pages. > + */ > + if (fl->fl_flags & FL_ALLOCATED) > + continue; > + > /* > * Modifying our existing lease is OK, but no getting a > * new lease if someone else is opening for write: > diff --git a/include/linux/dax.h b/include/linux/dax.h > index 122197124b9d..3ff61dc6241e 100644 > --- a/include/linux/dax.h > +++ b/include/linux/dax.h > @@ -100,10 +100,15 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); > int dax_invalidate_mapping_entry_sync(struct address_space *mapping, > pgoff_t index); > > +struct dax_lease; > #ifdef CONFIG_FS_DAX > int __dax_zero_page_range(struct block_device *bdev, > struct dax_device *dax_dev, sector_t sector, > unsigned int offset, unsigned int length); > +struct dax_lease *__dax_truncate_lease(struct vm_area_struct *vma, > + long nr_pages); > +void dax_lease_set_pages(struct dax_lease *dl, struct page **pages, > + long nr_pages); > #else > static inline int __dax_zero_page_range(struct block_device *bdev, > struct dax_device *dax_dev, sector_t sector, > @@ -111,8 +116,26 @@ static inline int __dax_zero_page_range(struct block_device *bdev, > { > return -ENXIO; > } > +static inline struct dax_lease *__dax_truncate_lease(struct vm_area_struct *vma, > + long nr_pages) > +{ > + return NULL; > +} > + > +static inline void dax_lease_set_pages(struct dax_lease *dl, > + struct page **pages, long nr_pages) > +{ > +} > #endif > > +static inline struct dax_lease *dax_truncate_lease(struct vm_area_struct *vma, > + long nr_pages) > +{ > + if (!vma_is_dax(vma)) > + return NULL; > + return __dax_truncate_lease(vma, nr_pages); > +} > + > static inline bool dax_mapping(struct address_space *mapping) > { > return mapping->host && IS_DAX(mapping->host); > diff --git a/include/linux/fs.h b/include/linux/fs.h > index eace2c5396a7..a3ed74833919 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -371,6 +371,9 @@ struct address_space_operations { > int (*swap_activate)(struct swap_info_struct *sis, struct file *file, > sector_t *span); > void (*swap_deactivate)(struct file *file); > + > + /* dax dma support */ > + void (*dax_flush_dma)(struct inode *inode); > }; > > extern const struct address_space_operations empty_aops; > @@ -927,6 +930,7 @@ static inline struct file *get_file(struct file *f) > #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ > #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ > #define FL_LAYOUT 2048 /* outstanding pNFS layout */ > +#define FL_ALLOCATED 4096 /* pin allocated dax blocks against dma */ > > #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) > > @@ -2324,17 +2328,27 @@ static inline int break_deleg_wait(struct inode **delegated_inode) > return ret; > } > > -static inline int break_layout(struct inode *inode, bool wait) > +static inline int __break_layout(struct inode *inode, bool wait, > + unsigned int type) > { > struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx); > > if (ctx && !list_empty_careful(&ctx->flc_lease)) > return __break_lease(inode, > wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, > - FL_LAYOUT); > + type); > return 0; > } > > +static inline int break_layout(struct inode *inode, bool wait) > +{ > + return __break_layout(inode, wait, FL_LAYOUT); > +} > + > +static inline int break_allocated(struct inode *inode, bool wait) > +{ > + return __break_layout(inode, wait, FL_LAYOUT|FL_ALLOCATED); > +} > #else /* !CONFIG_FILE_LOCKING */ > static inline int break_lease(struct inode *inode, unsigned int mode) > { > @@ -2362,6 +2376,10 @@ static inline int break_layout(struct inode *inode, bool wait) > return 0; > } > > +static inline int break_allocated(struct inode *inode, bool wait) > +{ > + return 0; > +} > #endif /* CONFIG_FILE_LOCKING */ > > /* fs/open.c */ > diff --git a/mm/gup.c b/mm/gup.c > index 308be897d22a..6a7cf371e656 100644 > --- a/mm/gup.c > +++ b/mm/gup.c > @@ -9,6 +9,7 @@ > #include <linux/rmap.h> > #include <linux/swap.h> > #include <linux/swapops.h> > +#include <linux/dax.h> > > #include <linux/sched/signal.h> > #include <linux/rwsem.h> > @@ -640,9 +641,11 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, > unsigned int gup_flags, struct page **pages, > struct vm_area_struct **vmas, int *nonblocking) > { > - long i = 0; > + long i = 0, result = 0; > + int dax_lease_once = 0; > unsigned int page_mask; > struct vm_area_struct *vma = NULL; > + struct dax_lease *dax_lease = NULL; > > if (!nr_pages) > return 0; > @@ -693,6 +696,14 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, > if (unlikely(fatal_signal_pending(current))) > return i ? i : -ERESTARTSYS; > cond_resched(); > + if (pages && !dax_lease_once) { > + dax_lease_once = 1; > + dax_lease = dax_truncate_lease(vma, nr_pages); > + if (IS_ERR(dax_lease)) { > + result = PTR_ERR(dax_lease); > + goto out; > + } > + } > page = follow_page_mask(vma, start, foll_flags, &page_mask); > if (!page) { > int ret; > @@ -704,9 +715,11 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, > case -EFAULT: > case -ENOMEM: > case -EHWPOISON: > - return i ? i : ret; > + result = i ? i : ret; > + goto out; > case -EBUSY: > - return i; > + result = i; > + goto out; > case -ENOENT: > goto next_page; > } > @@ -718,7 +731,8 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, > */ > goto next_page; > } else if (IS_ERR(page)) { > - return i ? i : PTR_ERR(page); > + result = i ? i : PTR_ERR(page); > + goto out; > } > if (pages) { > pages[i] = page; > @@ -738,7 +752,10 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, > start += page_increm * PAGE_SIZE; > nr_pages -= page_increm; > } while (nr_pages); > - return i; > + result = i; > +out: > + dax_lease_set_pages(dax_lease, pages, result); > + return result; > } > > static bool vma_permits_fault(struct vm_area_struct *vma, >