fuse write performance

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



I did a little benchmarking on fusexmp_fh over tmpfs.

First off, xattr support in the filesystem needs to be turned off; otherwise the
kernel will do a getxattr on each write(2) call.  This can be achieved by
e.g. commenting out the "#define HAVE_SETXATTR 1" line in include/config.h in
the fuse tree and recompiling the examples.

Splice support was broken in libfuse git, fixed now (head is 0096c126aa45).

I added kernel support to splice directly into a tmpfs file (patch attached; it
has been tested, but please be careful, don't use it on important data, etc).
The patch is against:

  git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git for-next

I'm using iozone, as it is somewhat more suited to the task than dd.

Tested two blocksizes 4k and 128k:

  iozone -s 128M -r4k -i0 -+n -e -f $file
  iozone -s 128M -r128k -i0 -+n -e -f $file

Results are in MB/s.

4k    128k  test                     # of memcpy's
---------------------------------------------------
1030  1140  native                               1
  80   750  fuse (direct_io,splice_read)         1
  80   600  fuse (direct_io)                     2
 500   520  fuse (writeback_cache,splice_read)   2
 320   320  fuse (writeback_cache)               4
  75   470  fuse (big_writes,splice_read)        2
  75   360  fuse (big_writes)                    3

So, things aren't looking all that bad.  The "direct_io" option is the best
performer if using a large blocksize.  But "writeback_cache,splice_read" isn't
doing badly at all, and is fast regardless of the blocksize (as Maxim mentioned,
there's a performance bug in there that causes a slowdown for <4k blocksizes,
but that's easily fixable).

In case you're wondering why writeback_cache is doing 4 copies, not 3: the fuse
kernel module uses a temporary buffer for cached writeback.  With
splice+writeback_cache this temporary page can be directly inserted into the
target file's page cache and so two copies can be saved (one copy from temporary
page to userspace buffer and one copy from userspace buffer to target's page
cache).  Without the attached patch "writeback_cache,splice_read" would be 3
copies and performance somewhere in between the 4 and 2 copies.

Also note: the above measurements are *sustained* write rates.  With
"writeback_cache" writes can be as quick as native.  This can be demonstrated by
omitting the "-e" option of iozone.  To make better use of the writeback cache
it is possible to enlarge the maximum size of the cache with:

  echo 100 > /sys/class/bdi/0:30/max_ratio

Where "100" is a percentage (the default is 1 for fuse, 100 for everything else).

And "30" is the device number of the fuse filesystem.  It can be determined with:

  stat -c%d fuse-mountpoint

Thanks,
Miklos


---
 fs/fuse/dev.c    |   31 +++++++++++--
 fs/fuse/file.c   |    8 ++-
 fs/fuse/fuse_i.h |    3 +
 mm/shmem.c       |  125 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 153 insertions(+), 14 deletions(-)

--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -724,6 +724,7 @@ static int fuse_copy_fill(struct fuse_co
 			buf->page = page;
 			buf->offset = 0;
 			buf->len = 0;
+			buf->ops = &nosteal_pipe_buf_ops;
 
 			cs->currbuf = buf;
 			cs->mapaddr = kmap(page);
@@ -884,6 +885,26 @@ static int fuse_try_move_page(struct fus
 	return 1;
 }
 
+static int fuse_pipe_buf_steal(struct pipe_inode_info *pipe,
+			       struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+	lock_page(page);
+	return 0;
+}
+
+
+/* HACK HACK HACK: This will Oops if buffer persist after module unload: */
+const struct pipe_buf_operations fuse_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = generic_pipe_buf_release,
+	.steal = fuse_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
 static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
 			 unsigned offset, unsigned count)
 {
@@ -900,6 +921,10 @@ static int fuse_ref_page(struct fuse_cop
 	buf->page = page;
 	buf->offset = offset;
 	buf->len = count;
+	if (cs->req->in.stealpages)
+		buf->ops = &fuse_pipe_buf_ops;
+	else
+		buf->ops = &nosteal_pipe_buf_ops;
 
 	cs->pipebufs++;
 	cs->nr_segs++;
@@ -1342,11 +1367,7 @@ static ssize_t fuse_dev_splice_read(stru
 		buf->page = bufs[page_nr].page;
 		buf->offset = bufs[page_nr].offset;
 		buf->len = bufs[page_nr].len;
-		/*
-		 * Need to be careful about this.  Having buf->ops in module
-		 * code can Oops if the buffer persists after module unload.
-		 */
-		buf->ops = &nosteal_pipe_buf_ops;
+		buf->ops = bufs[page_nr].ops;
 
 		pipe->nrbufs++;
 		page_nr++;
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1530,7 +1530,7 @@ static void fuse_writepage_free(struct f
 	int i;
 
 	for (i = 0; i < req->num_pages; i++)
-		__free_page(req->pages[i]);
+		put_page(req->pages[i]);
 
 	if (req->ff)
 		fuse_file_put(req->ff, false);
@@ -1704,6 +1704,7 @@ static int fuse_writepage_locked(struct
 	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
 	req->misc.write.next = NULL;
 	req->in.argpages = 1;
+	req->in.stealpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = tmp_page;
 	req->page_descs[0].offset = 0;
@@ -1897,7 +1898,7 @@ static int fuse_writepages_fill(struct p
 		err = -ENOMEM;
 		req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
 		if (!req) {
-			__free_page(tmp_page);
+			put_page(tmp_page);
 			goto out_unlock;
 		}
 
@@ -1905,6 +1906,7 @@ static int fuse_writepages_fill(struct p
 		req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
 		req->misc.write.next = NULL;
 		req->in.argpages = 1;
+		req->in.stealpages = 1;
 		req->background = 1;
 		req->num_pages = 0;
 		req->end = fuse_writepage_end;
@@ -2670,7 +2672,7 @@ long fuse_do_ioctl(struct file *file, un
 		fuse_put_request(fc, req);
 	free_page((unsigned long) iov_page);
 	while (num_pages)
-		__free_page(pages[--num_pages]);
+		put_page(pages[--num_pages]);
 	kfree(pages);
 
 	return err ? err : outarg.result;
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -175,6 +175,9 @@ struct fuse_in {
 	/** True if the data for the last argument is in req->pages */
 	unsigned argpages:1;
 
+	/** True if the pages can be stolen */
+	unsigned stealpages:1;
+
 	/** Number of arguments */
 	unsigned numargs;
 
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -32,6 +32,7 @@
 #include <linux/export.h>
 #include <linux/swap.h>
 #include <linux/aio.h>
+#include <linux/pipe_fs_i.h>
 
 static struct vfsmount *shm_mnt;
 
@@ -116,13 +117,14 @@ static bool shmem_should_replace_page(st
 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 				struct shmem_inode_info *info, pgoff_t index);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
-	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
+	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type,
+	struct pipe_buffer *bufm, struct pipe_inode_info *pipe);
 
 static inline int shmem_getpage(struct inode *inode, pgoff_t index,
 	struct page **pagep, enum sgp_type sgp, int *fault_type)
 {
 	return shmem_getpage_gfp(inode, index, pagep, sgp,
-			mapping_gfp_mask(inode->i_mapping), fault_type);
+		mapping_gfp_mask(inode->i_mapping), fault_type, NULL, NULL);
 }
 
 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -1065,7 +1067,8 @@ static int shmem_replace_page(struct pag
  * entry since a page cannot live in both the swap and page cache
  */
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
-	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
+	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type,
+	struct pipe_buffer *buf, struct pipe_inode_info *pipe)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info;
@@ -1075,6 +1078,7 @@ static int shmem_getpage_gfp(struct inod
 	int error;
 	int once = 0;
 	int alloced = 0;
+	int lru = 0;
 
 	if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
 		return -EFBIG;
@@ -1191,6 +1195,15 @@ static int shmem_getpage_gfp(struct inod
 			percpu_counter_inc(&sbinfo->used_blocks);
 		}
 
+		if (buf && buf->ops->steal(pipe, buf) == 0) {
+			page = buf->page;
+			ClearPageMappedToDisk(page);
+			page_cache_get(page);
+			SetPageSwapBacked(page);
+			if (buf->flags & PIPE_BUF_FLAG_LRU)
+				lru = 1;
+			goto skip_alloc;
+		}
 		page = shmem_alloc_page(gfp, info, index);
 		if (!page) {
 			error = -ENOMEM;
@@ -1199,6 +1212,7 @@ static int shmem_getpage_gfp(struct inod
 
 		SetPageSwapBacked(page);
 		__set_page_locked(page);
+skip_alloc:
 		error = mem_cgroup_cache_charge(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
 		if (error)
@@ -1213,7 +1227,8 @@ static int shmem_getpage_gfp(struct inod
 			mem_cgroup_uncharge_cache_page(page);
 			goto decused;
 		}
-		lru_cache_add_anon(page);
+		if (!lru)
+			lru_cache_add_anon(page);
 
 		spin_lock(&info->lock);
 		info->alloced++;
@@ -1714,6 +1729,103 @@ static ssize_t shmem_file_splice_read(st
 	return error;
 }
 
+static int shmem_pipe_to_file(struct pipe_inode_info *pipe,
+			      struct pipe_buffer *buf,
+			      struct splice_desc *sd)
+{
+	struct file *file = sd->u.file;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned int offset, this_len;
+	pgoff_t index;
+	struct page *page;
+	int ret;
+	bool whole_page;
+
+	index = sd->pos >> PAGE_CACHE_SHIFT;
+	offset = sd->pos & ~PAGE_CACHE_MASK;
+
+	this_len = sd->len;
+	if (this_len + offset > PAGE_CACHE_SIZE)
+		this_len = PAGE_CACHE_SIZE - offset;
+
+	whole_page = (offset == 0) && (this_len == PAGE_CACHE_SIZE);
+
+	ret = shmem_getpage_gfp(inode, index, &page, SGP_WRITE,
+				mapping_gfp_mask(mapping), NULL,
+				whole_page ? buf : NULL, pipe);
+	if (unlikely(ret))
+		goto out;
+
+	if (buf->page != page) {
+		char *src = buf->ops->map(pipe, buf, 1);
+		char *dst = kmap_atomic(page);
+
+		memcpy(dst + offset, src + buf->offset, this_len);
+		flush_dcache_page(page);
+		kunmap_atomic(dst);
+		buf->ops->unmap(pipe, buf, src);
+	}
+	mark_page_accessed(page);
+	ret = shmem_write_end(file, mapping, sd->pos, this_len, this_len,
+			      page, NULL);
+out:
+	return ret;
+}
+
+static ssize_t shmem_file_splice_write(struct pipe_inode_info *pipe,
+				       struct file *out, loff_t *ppos,
+				       size_t len, unsigned int flags)
+{
+	struct address_space *mapping = out->f_mapping;
+	struct inode *inode = mapping->host;
+	struct splice_desc sd = {
+		.total_len = len,
+		.flags = flags,
+		.pos = *ppos,
+		.u.file = out,
+	};
+	ssize_t ret;
+
+	pipe_lock(pipe);
+
+	splice_from_pipe_begin(&sd);
+	do {
+		ret = splice_from_pipe_next(pipe, &sd);
+		if (ret <= 0)
+			break;
+
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		ret = file_remove_suid(out);
+		if (!ret) {
+			ret = file_update_time(out);
+			if (!ret)
+				ret = splice_from_pipe_feed(pipe, &sd,
+							    shmem_pipe_to_file);
+		}
+		mutex_unlock(&inode->i_mutex);
+	} while (ret > 0);
+	splice_from_pipe_end(pipe, &sd);
+
+	pipe_unlock(pipe);
+
+	if (sd.num_spliced)
+		ret = sd.num_spliced;
+
+	if (ret > 0) {
+		int err;
+
+		err = generic_write_sync(out, *ppos, ret);
+		if (err)
+			ret = err;
+		else
+			*ppos += ret;
+		balance_dirty_pages_ratelimited(mapping);
+	}
+
+	return ret;
+}
+
 /*
  * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
  */
@@ -2714,7 +2826,7 @@ static const struct file_operations shme
 	.aio_write	= generic_file_aio_write,
 	.fsync		= noop_fsync,
 	.splice_read	= shmem_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= shmem_file_splice_write,
 	.fallocate	= shmem_fallocate,
 #endif
 };
@@ -3034,7 +3146,8 @@ struct page *shmem_read_mapping_page_gfp
 	int error;
 
 	BUG_ON(mapping->a_ops != &shmem_aops);
-	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
+	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp,
+				  NULL, NULL, NULL);
 	if (error)
 		page = ERR_PTR(error);
 	else
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux