I did a little benchmarking on fusexmp_fh over tmpfs.

First off, xattr support in the filesystem needs to be turned off, otherwise the kernel will do getxattr on each write(2) call. This can be achieved by e.g. commenting out the "#define HAVE_SETXATTR 1" line in include/config.h in the fuse tree and recompiling the examples.

Splice support was broken in libfuse git, fixed now (head is 0096c126aa45).

Added kernel support to splice directly into a tmpfs file (patch attached; has been tested, but please be careful, don't use on important data, etc). It is against:

  git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git for-next

I'm using iozone, as it is somewhat more suited to the task than dd. Tested two blocksizes, 4k and 128k:

  iozone -s 128M -r4k -i0 -+n -e -f $file
  iozone -s 128M -r128k -i0 -+n -e -f $file

Results are in MB/s.

    4k   128k   test                                  # of memcpy's
  ---------------------------------------------------
  1030   1140   native                                1
    80    750   fuse (direct_io,splice_read)          1
    80    600   fuse (direct_io)                      2
   500    520   fuse (writeback_cache,splice_read)    2
   320    320   fuse (writeback_cache)                4
    75    470   fuse (big_writes,splice_read)         2
    75    360   fuse (big_writes)                     3

So, things aren't looking all that bad. The "direct_io" option is the best performer if using a large blocksize. But "writeback_cache,splice_read" isn't doing badly at all, and is fast regardless of the blocksize (as Maxim mentioned, there's a performance bug in there that causes slowdown for <4k blocksizes, but that's easily fixable).

In case you're wondering why writeback_cache is doing 4 copies, not 3: the fuse kernel module uses a temporary buffer for cached writeback. With splice+writeback_cache this temporary page can be directly inserted into the target file's page cache, and so two copies can be saved (one copy from the temporary page to the userspace buffer and one copy from the userspace buffer to the target's page cache). 
Without the attached patch "writeback_cache,splice_read" would be 3 copies and performance somewhere in between the 4 and 2 copies. Also note: the above measurements are *sustained* write rates. With "writeback_cache" writes can be as quick as native. This can be demonstrated by omitting the "-e" option of iozone. To make better use of the writeback cache it is possible to enlarge the maximum size of the cache with: echo 100 > /sys/class/bdi/0:30/max_ratio Where "100" is a percent (default is 1 for fuse 100 for everything else). And "30" is the device number of the fuse filesystem. Can be determined by stat -c%d fuse-mountpoint Thanks, Miklos --- fs/fuse/dev.c | 31 +++++++++++-- fs/fuse/file.c | 8 ++- fs/fuse/fuse_i.h | 3 + mm/shmem.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 153 insertions(+), 14 deletions(-) --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -724,6 +724,7 @@ static int fuse_copy_fill(struct fuse_co buf->page = page; buf->offset = 0; buf->len = 0; + buf->ops = &nosteal_pipe_buf_ops; cs->currbuf = buf; cs->mapaddr = kmap(page); @@ -884,6 +885,26 @@ static int fuse_try_move_page(struct fus return 1; } +static int fuse_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + lock_page(page); + return 0; +} + + +/* HACK HACK HACK: This will Oops if buffer persist after module unload: */ +const struct pipe_buf_operations fuse_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = fuse_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, unsigned offset, unsigned count) { @@ -900,6 +921,10 @@ static int fuse_ref_page(struct fuse_cop buf->page = page; buf->offset = offset; buf->len = count; + if (cs->req->in.stealpages) + buf->ops = &fuse_pipe_buf_ops; + else + buf->ops 
= &nosteal_pipe_buf_ops; cs->pipebufs++; cs->nr_segs++; @@ -1342,11 +1367,7 @@ static ssize_t fuse_dev_splice_read(stru buf->page = bufs[page_nr].page; buf->offset = bufs[page_nr].offset; buf->len = bufs[page_nr].len; - /* - * Need to be careful about this. Having buf->ops in module - * code can Oops if the buffer persists after module unload. - */ - buf->ops = &nosteal_pipe_buf_ops; + buf->ops = bufs[page_nr].ops; pipe->nrbufs++; page_nr++; --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1530,7 +1530,7 @@ static void fuse_writepage_free(struct f int i; for (i = 0; i < req->num_pages; i++) - __free_page(req->pages[i]); + put_page(req->pages[i]); if (req->ff) fuse_file_put(req->ff, false); @@ -1704,6 +1704,7 @@ static int fuse_writepage_locked(struct req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; req->misc.write.next = NULL; req->in.argpages = 1; + req->in.stealpages = 1; req->num_pages = 1; req->pages[0] = tmp_page; req->page_descs[0].offset = 0; @@ -1897,7 +1898,7 @@ static int fuse_writepages_fill(struct p err = -ENOMEM; req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ); if (!req) { - __free_page(tmp_page); + put_page(tmp_page); goto out_unlock; } @@ -1905,6 +1906,7 @@ static int fuse_writepages_fill(struct p req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; req->misc.write.next = NULL; req->in.argpages = 1; + req->in.stealpages = 1; req->background = 1; req->num_pages = 0; req->end = fuse_writepage_end; @@ -2670,7 +2672,7 @@ long fuse_do_ioctl(struct file *file, un fuse_put_request(fc, req); free_page((unsigned long) iov_page); while (num_pages) - __free_page(pages[--num_pages]); + put_page(pages[--num_pages]); kfree(pages); return err ? 
err : outarg.result; --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -175,6 +175,9 @@ struct fuse_in { /** True if the data for the last argument is in req->pages */ unsigned argpages:1; + /** True if the pages can be stolen */ + unsigned stealpages:1; + /** Number of arguments */ unsigned numargs; --- a/mm/shmem.c +++ b/mm/shmem.c @@ -32,6 +32,7 @@ #include <linux/export.h> #include <linux/swap.h> #include <linux/aio.h> +#include <linux/pipe_fs_i.h> static struct vfsmount *shm_mnt; @@ -116,13 +117,14 @@ static bool shmem_should_replace_page(st static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); + struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type, + struct pipe_buffer *bufm, struct pipe_inode_info *pipe); static inline int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, int *fault_type) { return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), fault_type); + mapping_gfp_mask(inode->i_mapping), fault_type, NULL, NULL); } static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) @@ -1065,7 +1067,8 @@ static int shmem_replace_page(struct pag * entry since a page cannot live in both the swap and page cache */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) + struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type, + struct pipe_buffer *buf, struct pipe_inode_info *pipe) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info; @@ -1075,6 +1078,7 @@ static int shmem_getpage_gfp(struct inod int error; int once = 0; int alloced = 0; + int lru = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) return -EFBIG; @@ -1191,6 +1195,15 @@ static int 
shmem_getpage_gfp(struct inod percpu_counter_inc(&sbinfo->used_blocks); } + if (buf && buf->ops->steal(pipe, buf) == 0) { + page = buf->page; + ClearPageMappedToDisk(page); + page_cache_get(page); + SetPageSwapBacked(page); + if (buf->flags & PIPE_BUF_FLAG_LRU) + lru = 1; + goto skip_alloc; + } page = shmem_alloc_page(gfp, info, index); if (!page) { error = -ENOMEM; @@ -1199,6 +1212,7 @@ static int shmem_getpage_gfp(struct inod SetPageSwapBacked(page); __set_page_locked(page); +skip_alloc: error = mem_cgroup_cache_charge(page, current->mm, gfp & GFP_RECLAIM_MASK); if (error) @@ -1213,7 +1227,8 @@ static int shmem_getpage_gfp(struct inod mem_cgroup_uncharge_cache_page(page); goto decused; } - lru_cache_add_anon(page); + if (!lru) + lru_cache_add_anon(page); spin_lock(&info->lock); info->alloced++; @@ -1714,6 +1729,103 @@ static ssize_t shmem_file_splice_read(st return error; } +static int shmem_pipe_to_file(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, + struct splice_desc *sd) +{ + struct file *file = sd->u.file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + unsigned int offset, this_len; + pgoff_t index; + struct page *page; + int ret; + bool whole_page; + + index = sd->pos >> PAGE_CACHE_SHIFT; + offset = sd->pos & ~PAGE_CACHE_MASK; + + this_len = sd->len; + if (this_len + offset > PAGE_CACHE_SIZE) + this_len = PAGE_CACHE_SIZE - offset; + + whole_page = (offset == 0) && (this_len == PAGE_CACHE_SIZE); + + ret = shmem_getpage_gfp(inode, index, &page, SGP_WRITE, + mapping_gfp_mask(mapping), NULL, + whole_page ? 
buf : NULL, pipe); + if (unlikely(ret)) + goto out; + + if (buf->page != page) { + char *src = buf->ops->map(pipe, buf, 1); + char *dst = kmap_atomic(page); + + memcpy(dst + offset, src + buf->offset, this_len); + flush_dcache_page(page); + kunmap_atomic(dst); + buf->ops->unmap(pipe, buf, src); + } + mark_page_accessed(page); + ret = shmem_write_end(file, mapping, sd->pos, this_len, this_len, + page, NULL); +out: + return ret; +} + +static ssize_t shmem_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; + struct splice_desc sd = { + .total_len = len, + .flags = flags, + .pos = *ppos, + .u.file = out, + }; + ssize_t ret; + + pipe_lock(pipe); + + splice_from_pipe_begin(&sd); + do { + ret = splice_from_pipe_next(pipe, &sd); + if (ret <= 0) + break; + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + ret = file_remove_suid(out); + if (!ret) { + ret = file_update_time(out); + if (!ret) + ret = splice_from_pipe_feed(pipe, &sd, + shmem_pipe_to_file); + } + mutex_unlock(&inode->i_mutex); + } while (ret > 0); + splice_from_pipe_end(pipe, &sd); + + pipe_unlock(pipe); + + if (sd.num_spliced) + ret = sd.num_spliced; + + if (ret > 0) { + int err; + + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; + balance_dirty_pages_ratelimited(mapping); + } + + return ret; +} + /* * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. 
*/ @@ -2714,7 +2826,7 @@ static const struct file_operations shme .aio_write = generic_file_aio_write, .fsync = noop_fsync, .splice_read = shmem_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = shmem_file_splice_write, .fallocate = shmem_fallocate, #endif }; @@ -3034,7 +3146,8 @@ struct page *shmem_read_mapping_page_gfp int error; BUG_ON(mapping->a_ops != &shmem_aops); - error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); + error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, + NULL, NULL, NULL); if (error) page = ERR_PTR(error); else -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html