Currently, if the writeback cache policy is enabled, fuse allocates
temporary pages with GFP_NOFS | __GFP_HIGHMEM for writeback io.
Commit 3be5a52b30aa ("fuse: support writable mmap") introduced this
behavior primarily for two reasons:

 1. A filesystem based on fuse may itself need additional memory to
    complete a WRITE request, so there is a real danger of deadlock
    if that allocation has to wait for the writepage to finish.

 2. A buggy or even malicious userspace filesystem may never complete
    WRITE requests, and then unrelated parts of the system may hang;
    for example, sync operations may hang.

As commit 3be5a52b30aa said, this approach is wasteful in both memory
and CPU bandwidth, but it is necessary for traditional fuse userspace
filesystems.

With the emergence of virtio-fs the situation is different: the
virtio-fs userspace backend is typically provided by a cloud provider
and, while it may have bugs, it should not be treated as malicious.
Also, the vm and the virtio-fs userspace backend normally run on
different operating systems, so the deadlock described above should
not happen, even though the vm hypervisor runs on the same os as the
virtio-fs userspace backend.

Nowadays, modern DPUs (Data Processing Units) have started to support
the virtio filesystem device, see the virtio-fs spec:
https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-49600011

DPUs are provided by cloud providers or hardware manufacturers and
should be treated as trusted hardware, just like nvme ssds, nics, etc.
A DPU also runs an os internally; the basic architecture looks like
this:

  ------------------------------
  |        host machine        |
  |                            |
  |----------------------------|
  |        os/virtio-fs        |
  |----------------------------|
  |          hardware          |
  ------------------------------
                |
                | (pci)
                v
  ------------------------------
  |            DPU             |
  |                            |
  | virtio-fs userspace backend|
  |----------------------------|
  |             os             |
  |----------------------------|
  |          hardware          |
  ------------------------------

Obviously, temporary pages are unnecessary in the virtio-fs case, so
this patch changes the write-back cache policy a bit: if a fuse
request is passed over the virtio_fs channel, no temporary pages are
allocated and the page cache pages are used for write-back io
directly.

Use fio to evaluate the performance improvement:

  [global]
  ioengine=psync
  fsync=1
  direct=0
  bs=1048576
  rw=randwrite
  randrepeat=0
  iodepth=1
  size=512M
  runtime=30
  time_based
  filename=/home/lege/mntpoint/testfile2

  [job1]
  cpus_allowed=1

Before patch:
  WRITE: bw=655MiB/s (686MB/s), 655MiB/s-655MiB/s (686MB/s-686MB/s), io=19.2GiB (20.6GB), run=30001-30001msec

After patch:
  WRITE: bw=754MiB/s (790MB/s), 754MiB/s-754MiB/s (790MB/s-790MB/s), io=22.1GiB (23.7GB), run=30001-30001msec

That is about a 15% throughput improvement.
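To make the intent concrete, below is a simplified sketch of the
single-page write-back decision after this patch (condensed from the
fuse_writepage_locked() hunks in the diff that follows; error handling
and locking are omitted, so this is illustration only, not the actual
code change):

	int needs_temporary_page = !fc->no_temporary_page;

	if (needs_temporary_page) {
		/* traditional fuse: copy the data into a temporary page */
		tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		copy_highpage(tmp_page, page);
		ap->pages[0] = tmp_page;
		inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
		/* the page cache page can end writeback immediately */
		end_page_writeback(page);
	} else {
		/* virtio-fs: send the page cache page itself */
		ap->pages[0] = page;
		/* writeback ends in fuse_writepage_finish() on completion */
	}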
Signed-off-by: Xiaoguang Wang <lege.wang@xxxxxxxxxxxxxxx>
---
 fs/fuse/file.c      | 109 +++++++++++++++++++++++++++++++-------------
 fs/fuse/fuse_i.h    |   6 +++
 fs/fuse/inode.c     |   5 +-
 fs/fuse/virtio_fs.c |   1 +
 4 files changed, 89 insertions(+), 32 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1cdb6327511e..3d14a291cd3e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -400,6 +400,7 @@ struct fuse_writepage_args {
 	struct fuse_writepage_args *next;
 	struct inode *inode;
 	struct fuse_sync_bucket *bucket;
+	bool has_temporary_page;
 };
 
 static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
@@ -1660,8 +1661,10 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
 	if (wpa->bucket)
 		fuse_sync_bucket_dec(wpa->bucket);
 
-	for (i = 0; i < ap->num_pages; i++)
-		__free_page(ap->pages[i]);
+	if (wpa->has_temporary_page) {
+		for (i = 0; i < ap->num_pages; i++)
+			__free_page(ap->pages[i]);
+	}
 
 	if (wpa->ia.ff)
 		fuse_file_put(wpa->ia.ff, false, false);
@@ -1681,10 +1684,14 @@ static void fuse_writepage_finish(struct fuse_mount *fm,
 
 	for (i = 0; i < ap->num_pages; i++) {
 		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-		dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
+		if (wpa->has_temporary_page)
+			dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
+		else
+			end_page_writeback(ap->pages[i]);
 		wb_writeout_inc(&bdi->wb);
 	}
-	wake_up(&fi->page_waitq);
+	if (wpa->has_temporary_page)
+		wake_up(&fi->page_waitq);
 }
 
 /* Called under fi->lock, may release and reacquire it */
@@ -1823,6 +1830,9 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
 	if (!fc->writeback_cache)
 		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
 	spin_lock(&fi->lock);
+	if (!wpa->has_temporary_page)
+		goto skip;
+
 	rb_erase(&wpa->writepages_entry, &fi->writepages);
 	while (wpa->next) {
 		struct fuse_mount *fm = get_fuse_mount(inode);
@@ -1859,6 +1869,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
 		 */
 		fuse_send_writepage(fm, next, inarg->offset + inarg->size);
 	}
+skip:
 	fi->writectr--;
 	fuse_writepage_finish(fm, wpa);
 	spin_unlock(&fi->lock);
@@ -1952,8 +1963,9 @@ static int fuse_writepage_locked(struct page *page)
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_writepage_args *wpa;
 	struct fuse_args_pages *ap;
-	struct page *tmp_page;
+	struct page *tmp_page = NULL;
 	int error = -ENOMEM;
+	int needs_temporary_page = !fc->no_temporary_page;
 
 	set_page_writeback(page);
 
@@ -1962,9 +1974,11 @@ static int fuse_writepage_locked(struct page *page)
 		goto err;
 
 	ap = &wpa->ia.ap;
-	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
-	if (!tmp_page)
-		goto err_free;
+	if (needs_temporary_page) {
+		tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!tmp_page)
+			goto err_free;
+	}
 
 	error = -EIO;
 	wpa->ia.ff = fuse_write_file_get(fi);
@@ -1974,19 +1988,24 @@ static int fuse_writepage_locked(struct page *page)
 	fuse_writepage_add_to_bucket(fc, wpa);
 	fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
 
-	copy_highpage(tmp_page, page);
+	if (needs_temporary_page) {
+		copy_highpage(tmp_page, page);
+		ap->pages[0] = tmp_page;
+		inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
+	} else {
+		ap->pages[0] = page;
+	}
 	wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
 	wpa->next = NULL;
 	ap->args.in_pages = true;
 	ap->num_pages = 1;
-	ap->pages[0] = tmp_page;
 	ap->descs[0].offset = 0;
 	ap->descs[0].length = PAGE_SIZE;
 	ap->args.end = fuse_writepage_end;
 	wpa->inode = inode;
+	wpa->has_temporary_page = needs_temporary_page;
 
 	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
-	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	spin_lock(&fi->lock);
 	tree_insert(&fi->writepages, wpa);
@@ -1994,12 +2013,14 @@ static int fuse_writepage_locked(struct page *page)
 	fuse_flush_writepages(inode);
 	spin_unlock(&fi->lock);
 
-	end_page_writeback(page);
+	if (needs_temporary_page)
+		end_page_writeback(page);
 
 	return 0;
 
 err_nofile:
-	__free_page(tmp_page);
+	if (needs_temporary_page)
+		__free_page(tmp_page);
 err_free:
 	kfree(wpa);
 err:
@@ -2013,7 +2034,7 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc)
 	struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
 	int err;
 
-	if (fuse_page_is_writeback(page->mapping->host, page->index)) {
+	if ((!fc->no_temporary_page) && fuse_page_is_writeback(page->mapping->host, page->index)) {
 		/*
 		 * ->writepages() should be called for sync() and friends. We
 		 * should only get here on direct reclaim and then we are
@@ -2084,8 +2105,11 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
 	fuse_flush_writepages(inode);
 	spin_unlock(&fi->lock);
 
-	for (i = 0; i < num_pages; i++)
-		end_page_writeback(data->orig_pages[i]);
+
+	if (wpa->has_temporary_page) {
+		for (i = 0; i < num_pages; i++)
+			end_page_writeback(data->orig_pages[i]);
+	}
 }
 
 /*
@@ -2156,7 +2180,8 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
 	 * the pages are faulted with get_user_pages(), and then after the read
 	 * completed.
 	 */
-	if (fuse_page_is_writeback(data->inode, page->index))
+	if (data->wpa->has_temporary_page &&
+	    fuse_page_is_writeback(data->inode, page->index))
 		return true;
 
 	/* Reached max pages */
@@ -2187,8 +2212,9 @@ static int fuse_writepages_fill(struct folio *folio,
 	struct inode *inode = data->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct page *tmp_page;
+	struct page *tmp_page = NULL;
 	int err;
+	int needs_temporary_page = !fc->no_temporary_page;
 
 	if (!data->ff) {
 		err = -EIO;
@@ -2202,10 +2228,12 @@ static int fuse_writepages_fill(struct folio *folio,
 		data->wpa = NULL;
 	}
 
-	err = -ENOMEM;
-	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
-	if (!tmp_page)
-		goto out_unlock;
+	if (needs_temporary_page) {
+		err = -ENOMEM;
+		tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (!tmp_page)
+			goto out_unlock;
+	}
 
 	/*
 	 * The page must not be redirtied until the writeout is completed
@@ -2223,7 +2251,7 @@ static int fuse_writepages_fill(struct folio *folio,
 	if (data->wpa == NULL) {
 		err = -ENOMEM;
 		wpa = fuse_writepage_args_alloc();
-		if (!wpa) {
+		if (!wpa && tmp_page) {
 			__free_page(tmp_page);
 			goto out_unlock;
 		}
@@ -2242,14 +2270,20 @@ static int fuse_writepages_fill(struct folio *folio,
 	}
 
 	folio_start_writeback(folio);
-	copy_highpage(tmp_page, &folio->page);
-	ap->pages[ap->num_pages] = tmp_page;
+	if (needs_temporary_page) {
+		copy_highpage(tmp_page, &folio->page);
+		ap->pages[ap->num_pages] = tmp_page;
+		data->orig_pages[ap->num_pages] = &folio->page;
+		inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
+	} else {
+		ap->pages[ap->num_pages] = &folio->page;
+		data->orig_pages[ap->num_pages] = &folio->page;
+	}
 	ap->descs[ap->num_pages].offset = 0;
 	ap->descs[ap->num_pages].length = PAGE_SIZE;
-	data->orig_pages[ap->num_pages] = &folio->page;
+	wpa->has_temporary_page = needs_temporary_page;
 
 	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
-	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
 	err = 0;
 	if (data->wpa) {
@@ -2260,10 +2294,16 @@ static int fuse_writepages_fill(struct folio *folio,
 		spin_lock(&fi->lock);
 		ap->num_pages++;
 		spin_unlock(&fi->lock);
-	} else if (fuse_writepage_add(wpa, &folio->page)) {
-		data->wpa = wpa;
+	} else if (needs_temporary_page) {
+		if (fuse_writepage_add(wpa, &folio->page))
+			data->wpa = wpa;
+		else
+			folio_end_writeback(folio);
 	} else {
-		folio_end_writeback(folio);
+		spin_lock(&fi->lock);
+		data->wpa = wpa;
+		ap->num_pages++;
+		spin_unlock(&fi->lock);
 	}
 out_unlock:
 	folio_unlock(folio);
@@ -2436,6 +2476,9 @@ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = file_inode(vmf->vma->vm_file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct folio *folio = page_folio(vmf->page);
+	int has_temporary_page = !fc->no_temporary_page;
 
 	file_update_time(vmf->vma->vm_file);
 	lock_page(page);
@@ -2444,7 +2487,11 @@ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
 		return VM_FAULT_NOPAGE;
 	}
 
-	fuse_wait_on_page_writeback(inode, page->index);
+	if (has_temporary_page)
+		fuse_wait_on_page_writeback(inode, page->index);
+	else
+		folio_wait_stable(folio);
+
 	return VM_FAULT_LOCKED;
 }
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6e6e721f421b..9958f672ba47 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -646,9 +646,15 @@ struct fuse_conn {
 	/** Filesystem supports NFS exporting. Only set in INIT */
 	unsigned export_support:1;
 
+	/** Indicate fuse req is passed on virtio_fs channel **/
+	unsigned virtio_fs_ch:1;
+
 	/** write-back cache policy (default is write-through) */
 	unsigned writeback_cache:1;
 
+	/** Indicate whether write-back cache policy should use temporary pages **/
+	unsigned no_temporary_page:1;
+
 	/** allow parallel lookups and readdir (default is serialized) */
 	unsigned parallel_dirops:1;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 74d4f09d5827..69da39728ae0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1191,8 +1191,11 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 		}
 		if (flags & FUSE_ASYNC_DIO)
 			fc->async_dio = 1;
-		if (flags & FUSE_WRITEBACK_CACHE)
+		if (flags & FUSE_WRITEBACK_CACHE) {
 			fc->writeback_cache = 1;
+			if (fc->virtio_fs_ch)
+				fc->no_temporary_page = 1;
+		}
 		if (flags & FUSE_PARALLEL_DIROPS)
 			fc->parallel_dirops = 1;
 		if (flags & FUSE_HANDLE_KILLPRIV)
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 5f1be1da92ce..dfa3806f979f 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -1346,6 +1346,7 @@ static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc)
 
 	/* Previous unmount will stop all queues. Start these again */
 	virtio_fs_start_all_queues(fs);
+	fc->virtio_fs_ch = 1;
 	fuse_send_init(fm);
 	mutex_unlock(&virtio_fs_mutex);
 	return 0;
-- 
2.40.0