On 11/16/24 6:44 AM, Joanne Koong wrote:
> In the current FUSE writeback design (see commit 3be5a52b30aa
> ("fuse: support writable mmap")), a temp page is allocated for every
> dirty page to be written back, the contents of the dirty page are copied over
> to the temp page, and the temp page gets handed to the server to write back.
>
> This is done so that writeback may be immediately cleared on the dirty page,
> and this in turn is done for two reasons:
> a) in order to mitigate the following deadlock scenario that may arise
>    if reclaim waits on writeback on the dirty page to complete:
>    * single-threaded FUSE server is in the middle of handling a request
>      that needs a memory allocation
>    * memory allocation triggers direct reclaim
>    * direct reclaim waits on a folio under writeback
>    * the FUSE server can't write back the folio since it's stuck in
>      direct reclaim
> b) in order to unblock internal (eg sync, page compaction) waits on
>    writeback without needing the server to complete writing back to disk,
>    which may take an indeterminate amount of time.
>
> With a recent change that added AS_WRITEBACK_INDETERMINATE and mitigates
> the situations described above, FUSE writeback does not need to use
> temp pages if it sets AS_WRITEBACK_INDETERMINATE on its inode mappings.
>
> This commit sets AS_WRITEBACK_INDETERMINATE on the inode mappings
> and removes the temporary pages + extra copying and the internal rb
> tree.
>
> fio benchmarks --
> (using averages observed from 10 runs, throwing away outliers)
>
> Setup:
> sudo mount -t tmpfs -o size=30G tmpfs ~/tmp_mount
> ./libfuse/build/example/passthrough_ll -o writeback -o max_threads=4 -o source=~/tmp_mount ~/fuse_mount
>
> fio --name=writeback --ioengine=sync --rw=write --bs={1k,4k,1M} --size=2G
> --numjobs=2 --ramp_time=30 --group_reporting=1 --directory=/root/fuse_mount
>
> bs =     1k          4k           1M
> Before   351 MiB/s   1818 MiB/s   1851 MiB/s
> After    341 MiB/s   2246 MiB/s   2685 MiB/s
> % diff   -3%         23%          45%
>
> Signed-off-by: Joanne Koong <joannelkoong@xxxxxxxxx>
> ---
>  fs/fuse/file.c | 339 +++----------------------------------------------
>  1 file changed, 20 insertions(+), 319 deletions(-)
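(A side note for readers following this thread: the hunk that actually sets AS_WRITEBACK_INDETERMINATE isn't quoted in this reply. Judging from the usual AS_* mapping-flag pattern in include/linux/pagemap.h, it would presumably look something like the sketch below; the helper name is a guess for illustration, not necessarily what the patch series uses.)

static inline void
mapping_set_writeback_indeterminate(struct address_space *mapping)
{
	/* sketch only: same pattern as e.g. mapping_set_unevictable() */
	set_bit(AS_WRITEBACK_INDETERMINATE, &mapping->flags);
}

	/* ... and then once per fuse inode, e.g. at inode init time: */
	mapping_set_writeback_indeterminate(inode->i_mapping);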
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 88d0946b5bc9..56289ac58596 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -415,89 +415,11 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
>
>  struct fuse_writepage_args {
>  	struct fuse_io_args ia;
> -	struct rb_node writepages_entry;
>  	struct list_head queue_entry;
> -	struct fuse_writepage_args *next;
>  	struct inode *inode;
>  	struct fuse_sync_bucket *bucket;
>  };
>
> -static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
> -						pgoff_t idx_from, pgoff_t idx_to)
> -{
> -	struct rb_node *n;
> -
> -	n = fi->writepages.rb_node;
> -
> -	while (n) {
> -		struct fuse_writepage_args *wpa;
> -		pgoff_t curr_index;
> -
> -		wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
> -		WARN_ON(get_fuse_inode(wpa->inode) != fi);
> -		curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
> -		if (idx_from >= curr_index + wpa->ia.ap.num_folios)
> -			n = n->rb_right;
> -		else if (idx_to < curr_index)
> -			n = n->rb_left;
> -		else
> -			return wpa;
> -	}
> -	return NULL;
> -}
> -
> -/*
> - * Check if any page in a range is under writeback
> - */
> -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
> -				   pgoff_t idx_to)
> -{
> -	struct fuse_inode *fi = get_fuse_inode(inode);
> -	bool found;
> -
> -	if (RB_EMPTY_ROOT(&fi->writepages))
> -		return false;
> -
> -	spin_lock(&fi->lock);
> -	found = fuse_find_writeback(fi, idx_from, idx_to);
> -	spin_unlock(&fi->lock);
> -
> -	return found;
> -}
> -
> -static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
> -{
> -	return fuse_range_is_writeback(inode, index, index);
> -}
> -
> -/*
> - * Wait for page writeback to be completed.
> - *
> - * Since fuse doesn't rely on the VM writeback tracking, this has to
> - * use some other means.
> - */
> -static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
> -{
> -	struct fuse_inode *fi = get_fuse_inode(inode);
> -
> -	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
> -}
> -
> -static inline bool fuse_folio_is_writeback(struct inode *inode,
> -					   struct folio *folio)
> -{
> -	pgoff_t last = folio_next_index(folio) - 1;
> -	return fuse_range_is_writeback(inode, folio_index(folio), last);
> -}
> -
> -static void fuse_wait_on_folio_writeback(struct inode *inode,
> -					 struct folio *folio)
> -{
> -	struct fuse_inode *fi = get_fuse_inode(inode);
> -
> -	wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio));
> -}
> -
>  /*
>   * Wait for all pending writepages on the inode to finish.
>   *
> @@ -886,13 +808,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio)
>  	ssize_t res;
>  	u64 attr_ver;
>
> -	/*
> -	 * With the temporary pages that are used to complete writeback, we can
> -	 * have writeback that extends beyond the lifetime of the folio. So
> -	 * make sure we read a properly synced folio.
> -	 */
> -	fuse_wait_on_folio_writeback(inode, folio);
> -
>  	attr_ver = fuse_get_attr_version(fm->fc);
>
>  	/* Don't overflow end offset */
> @@ -1003,17 +918,12 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
>  static void fuse_readahead(struct readahead_control *rac)
>  {
>  	struct inode *inode = rac->mapping->host;
> -	struct fuse_inode *fi = get_fuse_inode(inode);
>  	struct fuse_conn *fc = get_fuse_conn(inode);
>  	unsigned int max_pages, nr_pages;
> -	pgoff_t first = readahead_index(rac);
> -	pgoff_t last = first + readahead_count(rac) - 1;
>
>  	if (fuse_is_bad(inode))
>  		return;
>
> -	wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last));
> -
>  	max_pages = min_t(unsigned int, fc->max_pages,
>  			  fc->max_read / PAGE_SIZE);
>
> @@ -1172,7 +1082,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
>  	int err;
>
>  	for (i = 0; i < ap->num_folios; i++)
> -		fuse_wait_on_folio_writeback(inode, ap->folios[i]);
> +		folio_wait_writeback(ap->folios[i]);
>
>  	fuse_write_args_fill(ia, ff, pos, count);
>  	ia->write.in.flags = fuse_write_flags(iocb);
> @@ -1622,7 +1532,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
>  			return res;
>  		}
>  	}
> -	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
> +	if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) {
>  		if (!write)
>  			inode_lock(inode);
>  		fuse_sync_writes(inode);
> @@ -1825,7 +1735,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
>  		fuse_sync_bucket_dec(wpa->bucket);
>
>  	for (i = 0; i < ap->num_folios; i++)
> -		folio_put(ap->folios[i]);
> +		folio_end_writeback(ap->folios[i]);

I noticed that if we call folio_end_writeback() in fuse_writepage_finish() (rather than in fuse_writepage_free()), there's a ~50% buffer write bandwidth gain (5500MB/s -> 8500MB/s) [*].

The fuse server is generally implemented in a multi-threaded style, so multiple (fuse server) worker threads may fetch and process FUSE_WRITE requests of one and the same fuse inode. There's then serious contention on the xarray lock (of the address space) when these worker threads call fuse_writepage_end()->folio_end_writeback() while sending replies to FUSE_WRITE requests. The contention is greatly alleviated when folio_end_writeback() is serialized under fi->lock.

IOW, in the current implementation (folio_end_writeback() in fuse_writepage_free()), each worker thread has to compete for the xarray lock up to 256 times (one fuse request can contain at most 256 pages if FUSE_MAX_MAX_PAGES is 256) when completing a single FUSE_WRITE request. After moving folio_end_writeback() to fuse_writepage_finish(), each worker thread competes for fi->lock only once, i.e. the locking granularity is coarser now.
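To make the suggestion concrete, here is a rough sketch of what I mean (illustration only: the bookkeeping that fuse_writepage_finish() already does under fi->lock is elided, and the surrounding code may look different after this patch):

static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
{
	struct fuse_args_pages *ap = &wpa->ia.ap;
	struct fuse_inode *fi = get_fuse_inode(wpa->inode);
	int i;

	spin_lock(&fi->lock);
	/* ... existing per-inode writeback bookkeeping, elided ... */

	/*
	 * End writeback on all folios of this request while still holding
	 * fi->lock, so that worker threads completing FUSE_WRITE requests
	 * on the same inode take turns on the xarray lock instead of
	 * racing for it folio by folio.
	 */
	for (i = 0; i < ap->num_folios; i++)
		folio_end_writeback(ap->folios[i]);
	spin_unlock(&fi->lock);
}

with the corresponding folio_end_writeback() loop dropped from fuse_writepage_free(). Note that folio_end_writeback() doesn't sleep, so calling it under the fi->lock spinlock is fine.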
> @@ -2367,54 +2111,23 @@ static int fuse_writepages_fill(struct folio *folio,
>  		data->wpa = NULL;
>  	}
>
> -	err = -ENOMEM;
> -	tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
> -	if (!tmp_folio)
> -		goto out_unlock;
> -
> -	/*
> -	 * The page must not be redirtied until the writeout is completed
> -	 * (i.e. userspace has sent a reply to the write request). Otherwise
> -	 * there could be more than one temporary page instance for each real
> -	 * page.
> -	 *
> -	 * This is ensured by holding the page lock in page_mkwrite() while
> -	 * checking fuse_page_is_writeback(). We already hold the page lock
> -	 * since clear_page_dirty_for_io() and keep it held until we add the
> -	 * request to the fi->writepages list and increment ap->num_folios.
> -	 * After this fuse_page_is_writeback() will indicate that the page is
> -	 * under writeback, so we can release the page lock.
> -	 */
>  	if (data->wpa == NULL) {
>  		err = -ENOMEM;
>  		wpa = fuse_writepage_args_setup(folio, data->ff);
> -		if (!wpa) {
> -			folio_put(tmp_folio);
> +		if (!wpa)
>  			goto out_unlock;
> -		}
>  		fuse_file_get(wpa->ia.ff);
>  		data->max_folios = 1;
>  		ap = &wpa->ia.ap;
>  	}
>  	folio_start_writeback(folio);

There's also contention on the xarray lock when calling folio_start_writeback().

I also noticed a strange thing: if we take fi->lock and release it immediately, the write bandwidth improves by ~5% (8500MB/s -> 9000MB/s). The place where the "lock fi->lock then unlock" pair is inserted actually doesn't matter. "perf lock contention" shows that contention on the xarray lock is greatly alleviated, though I can't quite explain how that happens... As the performance gain is not significant (~5%), I think we can leave this strange phenomenon aside for now.

[*] test case:
./passthrough_hp --bypass-rw 2 /tmp /mnt
(testbench mode from https://github.com/libfuse/libfuse/pull/807/commits/e83789cc6e83ca42ccc9899c4f7f8c69f31cbff9, which bypasses the buffer copy along with the persistence procedure)

fio -fallocate=0 -numjobs=32 -iodepth=1 -ioengine=sync -sync=0 --direct=0 -rw=write -bs=1M -size=100G --time_based --runtime=300 -directory=/mnt/ --group_reporting --name=Fio

--
Thanks,
Jingbo