[RFC][PATCH] Basic pre-allocation support for nfsd write

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi All

Please see the attached patches for adding
pre-allocation support into nfsd writes. Comments follow.

Patches:
a. 01_vfs_fallocate.patch
Adds vfs_fallocate. Basically, encapsulates the call to
inode->i_op->fallocate, which is currently called directly from
sys_fallocate, which takes a file descriptor as argument, but nfsd
needs to operate on struct file's.

b. 02_init_file_prealloc_limit.patch
Adds a new member to struct file, to keep track of how much has been
preallocated for this file. For now, adding to struct file seemed an
easy way to keep per-file state about preallocation but this can be
changed to use a nfsd-specific hash table that maps (dev, ino) to
per-file pre-allocation state.

c. 03_nfsd_fallocate.patch
Wires in the call to vfs_fallocate into nfsd_vfs_write.
For now, the function nfsd_get_prealloc_len uses a very simple
method to determine when and how much to pre-allocate. This can change
if needed.
This patch also adds two module_params that control pre-allocation:

 1. /sys/module/nfsd/parameters/nfsd_prealloc
  Determine whether to pre-allocate.

 2. /sys/module/nfsd/parameters/nfsd_prealloc_len
  How much to pre-allocate. Default is 5Megs.

The patches are based against 2.6.25.11.

See the following two plots for read and write performance, with and
without pre-allocation support. Tests were run using iozone. The
filesystem was ext4 with extents enabled. The testbed used two Itanium
machines as client and server, connected through a Gbit network with
jumbo frames enabled. The filesystem was aged with various iozone and
kernel compilation workloads that consumed 45G of a 64G disk.

Server side mount options:
rw,sync,insecure,no_root_squash,no_subtree_check,no_wdelay

Client side mount options:
intr,wsize=65536,rsize=65536

1. Read test
http://www.gelato.unsw.edu.au/~shehjart/docs/nfsmeasurements/ext4fallocate_read.png
Read throughput clearly benefits due to the contiguity of disk blocks.
In the best case, i.e. with pre-allocation of 4 and 5 Mb during the
writing of the test file, throughput, during read of the same
file, more than doubles.

2. Write test
http://www.gelato.unsw.edu.au/~shehjart/docs/nfsmeasurements/ext4fallocate_write.png
Going just by read performance, pre-allocation would be a nice thing
to have *but* note that write throughput also decreases drastically,
by almost 10 Mb/sec with just 1Mb of pre-allocation.

A question at this point is, how well does pre-allocation perform
under other filesystems? I have no idea yet. I'll try to test XFS,
RSN.

Comments/suggestions are welcome.

Regards
Shehjar
diff --git a/fs/open.c b/fs/open.c
index a99ad09..b5b641a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -359,39 +359,34 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
 }
 #endif
 
-asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
+long vfs_fallocate(struct file * file, int mode, loff_t offset, loff_t len)
 {
-	struct file *file;
 	struct inode *inode;
 	long ret = -EINVAL;
 
 	if (offset < 0 || len <= 0)
 		goto out;
-
+	
 	/* Return error if mode is not supported */
 	ret = -EOPNOTSUPP;
 	if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
 		goto out;
 
-	ret = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
 	if (!(file->f_mode & FMODE_WRITE))
-		goto out_fput;
+		goto out;
 	/*
 	 * Revalidate the write permissions, in case security policy has
 	 * changed since the files were opened.
 	 */
 	ret = security_file_permission(file, MAY_WRITE);
 	if (ret)
-		goto out_fput;
+		goto out;
 
 	inode = file->f_path.dentry->d_inode;
 
 	ret = -ESPIPE;
 	if (S_ISFIFO(inode->i_mode))
-		goto out_fput;
+		goto out;
 
 	ret = -ENODEV;
 	/*
@@ -399,19 +394,33 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
 	 * for directories or not.
 	 */
 	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
-		goto out_fput;
+		goto out;
 
 	ret = -EFBIG;
 	/* Check for wrap through zero too */
 	if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
-		goto out_fput;
+		goto out;
 
 	if (inode->i_op && inode->i_op->fallocate)
 		ret = inode->i_op->fallocate(inode, mode, offset, len);
 	else
 		ret = -EOPNOTSUPP;
+out:
+	return ret;
+}
+EXPORT_SYMBOL(vfs_fallocate);
 
-out_fput:
+asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
+{
+	struct file *file;
+	long ret;
+
+	ret = -EBADF;
+	file = fget(fd);
+	if (!file)
+		goto out;
+
+	ret = vfs_fallocate(file, mode, offset, len);
 	fput(file);
 out:
 	return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d8e2762..498a422 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1287,6 +1287,7 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
 		unsigned long, loff_t *);
 extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
 		unsigned long, loff_t *);
+extern long vfs_fallocate(struct file * file, int mode, loff_t offset, loff_t len);
 
 struct super_operations {
    	struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/fs/open.c b/fs/open.c
index b5b641a..77820d3 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -832,6 +832,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 	f->f_path.mnt = mnt;
 	f->f_pos = 0;
 	f->f_op = fops_get(inode->i_fop);
+	f->f_prealloc_limit = 0;
 	file_move(f, &inode->i_sb->s_files);
 
 	error = security_dentry_open(f);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 498a422..5aaf82b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -799,6 +799,7 @@ struct file {
 	struct fown_struct	f_owner;
 	unsigned int		f_uid, f_gid;
 	struct file_ra_state	f_ra;
+	unsigned long 		f_prealloc_limit;
 
 	u64			f_version;
 #ifdef CONFIG_SECURITY
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 46f59d5..3d7c48d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -57,6 +57,7 @@
 #include <linux/jhash.h>
 
 #include <asm/uaccess.h>
+#include <linux/falloc.h>
 
 #define NFSDDBG_FACILITY		NFSDDBG_FILEOP
 
@@ -89,6 +90,14 @@ static struct raparms *		raparml;
 #define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
 static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
 
+/* User-definable preallocation size in bytes */
+unsigned long nfsd_prealloc_len = 5242880;
+module_param(nfsd_prealloc_len, ulong, S_IRUGO|S_IWUSR);
+
+/* 0 if preallocation is disabled, 1 otherwise. */
+int nfsd_prealloc = 0;
+module_param(nfsd_prealloc, int, S_IRUGO|S_IWUSR);
+
 /* 
  * Called from nfsd_lookup and encode_dirent. Check if we have crossed 
  * a mount point.
@@ -954,6 +963,21 @@ static void kill_suid(struct dentry *dentry)
 	mutex_unlock(&dentry->d_inode->i_mutex);
 }
 
+
+static unsigned long
+nfsd_get_prealloc_len(struct file * file, loff_t offset, unsigned long cnt)
+{
+	/* Might want to do something more complex here to decide
+	 * pre-allocation size.
+	 */
+	if(file->f_prealloc_limit > (offset + cnt))
+		return 0;
+
+	file->f_prealloc_limit = offset + nfsd_prealloc_len;
+	return nfsd_prealloc_len;
+}
+
+
 static __be32
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 				loff_t offset, struct kvec *vec, int vlen,
@@ -966,6 +990,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	__be32			err = 0;
 	int			host_err;
 	int			stable = *stablep;
+	unsigned long		prealloc_len = 0;
 
 #ifdef MSNFS
 	err = nfserr_perm;
@@ -998,6 +1023,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	if (stable && !EX_WGATHER(exp))
 		file->f_flags |= O_SYNC;
 
+	if(nfsd_prealloc && inode->i_op && inode->i_op->fallocate) {
+		prealloc_len = nfsd_get_prealloc_len(file, offset, cnt);
+		if(prealloc_len > 0)
+			vfs_fallocate(file, FALLOC_FL_KEEP_SIZE, offset, prealloc_len);
+	}
 	/* Write the data. */
 	oldfs = get_fs(); set_fs(KERNEL_DS);
 	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);

[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux