Add logic to free up a busy memory range. Freed memory range will be
returned to free pool. Add a worker which can be started to select
and free some busy memory ranges. Pending reclaim work is cancelled
when the connection is released.

Signed-off-by: Vivek Goyal <vgoyal@xxxxxxxxxx>
---
 fs/fuse/file.c   | 181 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/fuse/fuse_i.h |  10 +++
 fs/fuse/inode.c  |   2 +
 3 files changed, 192 insertions(+), 1 deletion(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 73068289f62e..17becdff3014 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -272,7 +272,15 @@ static int fuse_setup_one_mapping(struct inode *inode,
 	pr_debug("fuse_setup_one_mapping() succeeded. offset=0x%llx err=%zd\n",
 		 offset, err);
 
-	/* TODO: What locking is required here. For now, using fc->lock */
+	/*
+	 * We don't take a reference on inode. inode is valid right now and
+	 * when inode is going away, cleanup logic should first cleanup
+	 * dmap entries.
+	 *
+	 * TODO: Do we need to ensure that we are holding inode lock
+	 * as well.
+	 */
+	dmap->inode = inode;
 	dmap->start = offset;
 	dmap->end = offset + FUSE_DAX_MEM_RANGE_SZ - 1;
 	/* Protected by fi->i_dmap_sem */
@@ -347,6 +355,8 @@ void fuse_removemapping(struct inode *inode)
 			continue;
 		}
 
+		dmap->inode = NULL;
+
 		/* Add it back to free ranges list */
 		free_dax_mapping(fc, dmap);
 	}
@@ -3694,3 +3704,172 @@ void fuse_init_file_inode(struct inode *inode)
 		inode->i_data.a_ops = &fuse_dax_file_aops;
 	}
 }
+
+/*
+ * Free the dax mapping that covers file offset @dmap_start of @inode and
+ * return its memory range to the free pool of @fc.
+ *
+ * filemap_fdatawrite_range() and invalidate_inode_pages2_range() are run
+ * on the range first; if either fails the mapping is left in place and
+ * the error is returned. Returns 0 on success, or when no mapping exists
+ * at @dmap_start.
+ *
+ * Caller must hold the inode lock (see fuse_dax_free_one_mapping()).
+ */
+int fuse_dax_free_one_mapping_locked(struct fuse_conn *fc, struct inode *inode,
+				u64 dmap_start)
+{
+	int ret;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_dax_mapping *dmap;
+
+	WARN_ON(!inode_is_locked(inode));
+
+	/* Find the fuse dax mapping at this file offset of the inode. */
+	dmap = fuse_dax_interval_tree_iter_first(&fi->dmap_tree, dmap_start,
+							dmap_start);
+
+	/* Range already got cleaned up by somebody else */
+	if (!dmap)
+		return 0;
+
+	ret = filemap_fdatawrite_range(inode->i_mapping, dmap->start, dmap->end);
+	if (ret) {
+		pr_warn("filemap_fdatawrite_range() failed. err=%d start=0x%llx,"
+			" end=0x%llx\n", ret, dmap->start, dmap->end);
+		return ret;
+	}
+
+	ret = invalidate_inode_pages2_range(inode->i_mapping,
+					dmap->start >> PAGE_SHIFT,
+					dmap->end >> PAGE_SHIFT);
+	/* TODO: What to do if above fails? For now,
+	 * leave the range in place.
+	 */
+	if (ret) {
+		pr_warn("invalidate_inode_pages2_range() failed err=%d\n", ret);
+		return ret;
+	}
+
+	/* Remove dax mapping from inode interval tree now */
+	fuse_dax_interval_tree_remove(dmap, &fi->dmap_tree);
+	fi->nr_dmaps--;
+
+	/*
+	 * Log before the range goes back on the free list; after that dmap is
+	 * no longer ours to look at.
+	 */
+	pr_debug("fuse: freed memory range window_offset=0x%llx,"
+				" length=0x%llx\n", dmap->window_offset,
+				dmap->length);
+
+	/* Cleanup dmap entry and add back to free list */
+	spin_lock(&fc->lock);
+	list_del_init(&dmap->busy_list);
+	WARN_ON(fc->nr_busy_ranges == 0);
+	fc->nr_busy_ranges--;
+	dmap->inode = NULL;
+	dmap->start = dmap->end = 0;
+	__free_dax_mapping(fc, dmap);
+	spin_unlock(&fc->lock);
+
+	return 0;
+}
+
+/*
+ * Free a range of memory.
+ * Locking.
+ * 1. Take inode->i_rwsem to prevent further read/write.
+ * 2. Take fuse_inode->i_mmap_sem to block dax faults.
+ * 3. Take fuse_inode->i_dmap_sem to protect interval tree. It might not
+ *    be strictly necessary as lock 1 and 2 seem sufficient.
+ */
+int fuse_dax_free_one_mapping(struct fuse_conn *fc, struct inode *inode,
+				u64 dmap_start)
+{
+	int ret;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	inode_lock(inode);
+	down_write(&fi->i_mmap_sem);
+	down_write(&fi->i_dmap_sem);
+	ret = fuse_dax_free_one_mapping_locked(fc, inode, dmap_start);
+	up_write(&fi->i_dmap_sem);
+	up_write(&fi->i_mmap_sem);
+	inode_unlock(inode);
+	return ret;
+}
+
+/*
+ * Try to free up to @nr_to_free busy memory ranges of @fc.
+ *
+ * Returns 0 when the requested number of ranges was freed or no reclaimable
+ * range is left, otherwise the error from the first range that could not
+ * be freed.
+ */
+int fuse_dax_free_memory(struct fuse_conn *fc, unsigned long nr_to_free)
+{
+	struct fuse_dax_mapping *pos;
+	unsigned long i;
+	int ret;
+
+	/* Pick first busy range and free it for now */
+	for (i = 0; i < nr_to_free; i++) {
+		u64 dmap_start = 0, window_offset = 0;
+		struct inode *inode = NULL;
+
+		spin_lock(&fc->lock);
+		list_for_each_entry(pos, &fc->busy_ranges, busy_list) {
+			/* Released ranges have ->inode == NULL */
+			if (!pos->inode)
+				continue;
+			/*
+			 * If igrab() fails this inode is going away. That
+			 * will free up all its ranges anyway, continue to
+			 * next range.
+			 */
+			inode = igrab(pos->inode);
+			if (!inode)
+				continue;
+			dmap_start = pos->start;
+			window_offset = pos->window_offset;
+			break;
+		}
+		spin_unlock(&fc->lock);
+
+		/*
+		 * No busy range with a live inode. Test inode rather than the
+		 * list cursor: the cursor stays set when every igrab() fails.
+		 */
+		if (!inode)
+			return 0;
+
+		ret = fuse_dax_free_one_mapping(fc, inode, dmap_start);
+		iput(inode);
+		if (ret) {
+			pr_warn("%s(window_offset=0x%llx) failed. err=%d\n",
+				__func__, window_offset, ret);
+			return ret;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Delayed-work handler: reclaim up to FUSE_DAX_RECLAIM_CHUNK busy ranges.
+ *
+ * TODO: This probably should go in inode.c
+ */
+void fuse_dax_free_mem_worker(struct work_struct *work)
+{
+	int ret;
+	struct fuse_conn *fc = container_of(work, struct fuse_conn,
+						dax_free_work.work);
+
+	pr_debug("fuse: Worker to free memory called. nr_free_ranges=%lu"
+		" nr_busy_ranges=%lu\n", fc->nr_free_ranges,
+		fc->nr_busy_ranges);
+	ret = fuse_dax_free_memory(fc, FUSE_DAX_RECLAIM_CHUNK);
+	if (ret)
+		pr_debug("fuse: fuse_dax_free_memory() failed with err=%d\n", ret);
+}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 280f717deb57..383deaf0ecf1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -50,6 +50,9 @@
 #define FUSE_DAX_MEM_RANGE_SZ	(2*1024*1024)
 #define FUSE_DAX_MEM_RANGE_PAGES	(FUSE_DAX_MEM_RANGE_SZ/PAGE_SIZE)
 
+/* Number of ranges reclaimer will try to free in one invocation */
+#define FUSE_DAX_RECLAIM_CHUNK		(10)
+
 /** List of active connections */
 extern struct list_head fuse_conn_list;
 
@@ -102,6 +105,9 @@ struct fuse_forget_link {
 
 /** Translation information for file offsets to DAX window offsets */
 struct fuse_dax_mapping {
+	/* Pointer to inode where this memory range is mapped */
+	struct inode *inode;
+
 	/* Will connect in fc->free_ranges to keep track of free memory */
 	struct list_head list;
 
@@ -870,6 +876,9 @@ struct fuse_conn {
 	unsigned long nr_busy_ranges;
 	struct list_head busy_ranges;
 
+	/* Worker to free up memory ranges */
+	struct delayed_work dax_free_work;
+
 	/*
 	 * DAX Window Free Ranges. TODO: This might not be best place to store
 	 * this free list
@@ -1244,6 +1253,7 @@ unsigned fuse_len_args(unsigned numargs, struct fuse_arg *args);
  * Get the next unique ID for a request
  */
 u64 fuse_get_unique(struct fuse_iqueue *fiq);
+void fuse_dax_free_mem_worker(struct work_struct *work);
 void fuse_removemapping(struct inode *inode);
 
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 59fc5a7a18fc..44f7bc44e319 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -713,6 +713,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
 	fc->user_ns = get_user_ns(user_ns);
 	INIT_LIST_HEAD(&fc->free_ranges);
 	INIT_LIST_HEAD(&fc->busy_ranges);
+	INIT_DELAYED_WORK(&fc->dax_free_work, fuse_dax_free_mem_worker);
 }
 EXPORT_SYMBOL_GPL(fuse_conn_init);
 
@@ -721,6 +722,7 @@ void fuse_conn_put(struct fuse_conn *fc)
 	if (refcount_dec_and_test(&fc->count)) {
 		if (fc->destroy_req)
 			fuse_request_free(fc->destroy_req);
+		cancel_delayed_work_sync(&fc->dax_free_work);
 		if (fc->dax_dev)
 			fuse_free_dax_mem_ranges(&fc->free_ranges);
 		put_pid_ns(fc->pid_ns);
-- 
2.13.6