Sometimes we run out of memory ranges. In that case, wait for memory ranges to become free instead of returning -EBUSY. The dax fault path holds fuse_inode->i_mmap_sem, and while that is held, memory reclaim can't be done. It's not safe to wait while holding fuse_inode->i_mmap_sem, for two reasons. - The worker thread that frees memory might block on fuse_inode->i_mmap_sem as well. - This inode may be holding all the memory, so no more memory can be freed. In both cases, a deadlock will ensue. So return -ENOSPC from iomap_begin() in the fault path if memory can't be allocated. Then drop fuse_inode->i_mmap_sem, wait for a free range to become available, and retry. The read/write path is a different story. We hold the inode lock, and lock ordering allows us to grab fuse_inode->i_mmap_sem if needed. That means we can do direct reclaim in that path. But if there is no memory allocated to this inode, then direct reclaim will not work and we need to wait for a memory range to become free. So try the following order. A. Try to get a free range. B. If not, try direct reclaim. C. If not, wait for a memory range to become free. Here, sleeping with locks held should be fine because in step B we made sure this inode is not holding any ranges. That means other inodes are holding ranges and somebody should be able to free memory. Also, the worker thread does a trylock() on the inode lock. That means the worker thread will not wait on this inode and will move on to the next memory range. Hence the above sequence should be deadlock free. 
Signed-off-by: Vivek Goyal <vgoyal@xxxxxxxxxx> --- fs/fuse/file.c | 60 +++++++++++++++++++++++++++++++++++++++++++------------- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 1 + 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 709747458335..d0942ce0a6c3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -220,6 +220,8 @@ static void __free_dax_mapping(struct fuse_conn *fc, { list_add_tail(&dmap->list, &fc->free_ranges); fc->nr_free_ranges++; + /* TODO: Wake up only when needed */ + wake_up(&fc->dax_range_waitq); } static void free_dax_mapping(struct fuse_conn *fc, @@ -1770,12 +1772,18 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, goto iomap_hole; /* Can't do reclaim in fault path yet due to lock ordering */ - if (flags & IOMAP_FAULT) + if (flags & IOMAP_FAULT) { alloc_dmap = alloc_dax_mapping(fc); - else + if (!alloc_dmap) + return -ENOSPC; + } else { alloc_dmap = alloc_dax_mapping_reclaim(fc, inode); + if (IS_ERR(alloc_dmap)) + return PTR_ERR(alloc_dmap); + } - if (!alloc_dmap) + /* If we are here, we should have memory allocated */ + if (WARN_ON(!alloc_dmap)) return -EBUSY; /* @@ -2596,14 +2604,24 @@ static ssize_t fuse_file_splice_read(struct file *in, loff_t *ppos, static int __fuse_dax_fault(struct vm_fault *vmf, enum page_entry_size pe_size, bool write) { - int ret; + int ret, error = 0; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; pfn_t pfn; + struct fuse_conn *fc = get_fuse_conn(inode); + bool retry = false; if (write) sb_start_pagefault(sb); +retry: + if (retry && !(fc->nr_free_ranges > 0)) { + ret = -EINTR; + if (wait_event_killable_exclusive(fc->dax_range_waitq, + (fc->nr_free_ranges > 0))) + goto out; + } + /* * We need to serialize against not only truncate but also against * fuse dax memory range reclaim. 
While a range is being reclaimed, @@ -2611,13 +2629,20 @@ static int __fuse_dax_fault(struct vm_fault *vmf, enum page_entry_size pe_size, * to populate page cache or access memory we are trying to free. */ down_read(&get_fuse_inode(inode)->i_mmap_sem); - ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &fuse_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); + if ((ret & VM_FAULT_ERROR) && error == -ENOSPC) { + error = 0; + retry = true; + up_read(&get_fuse_inode(inode)->i_mmap_sem); + goto retry; + } if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); up_read(&get_fuse_inode(inode)->i_mmap_sem); +out: if (write) sb_end_pagefault(sb); @@ -3828,16 +3853,23 @@ static struct fuse_dax_mapping *alloc_dax_mapping_reclaim(struct fuse_conn *fc, struct fuse_dax_mapping *dmap; struct fuse_inode *fi = get_fuse_inode(inode); - dmap = alloc_dax_mapping(fc); - if (dmap) - return dmap; - - /* There are no mappings which can be reclaimed */ - if (!fi->nr_dmaps) - return NULL; + while(1) { + dmap = alloc_dax_mapping(fc); + if (dmap) + return dmap; - /* Try reclaim a fuse dax memory range */ - return fuse_dax_reclaim_first_mapping(fc, inode); + if (fi->nr_dmaps) + return fuse_dax_reclaim_first_mapping(fc, inode); + /* + * There are no mappings which can be reclaimed. + * Wait for one. + */ + if (!(fc->nr_free_ranges > 0)) { + if (wait_event_killable_exclusive(fc->dax_range_waitq, + (fc->nr_free_ranges > 0))) + return ERR_PTR(-EINTR); + } + } } int fuse_dax_free_one_mapping_locked(struct fuse_conn *fc, struct inode *inode, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bbefa7c11078..7b2db87c6ead 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -886,6 +886,9 @@ struct fuse_conn { /* Worker to free up memory ranges */ struct delayed_work dax_free_work; + /* Wait queue for a dax range to become free */ + wait_queue_head_t dax_range_waitq; + /* * DAX Window Free Ranges. 
TODO: This might not be best place to store * this free list diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d31acb97eede..178ac3171564 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -695,6 +695,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); init_waitqueue_head(&fc->reserved_req_waitq); + init_waitqueue_head(&fc->dax_range_waitq); fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); -- 2.13.6