Re: [RFC][PATCH] Basic pre-allocation support for nfsd write

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Jul 16, 2008 at 11:44:13AM +1000, Shehjar Tikoo wrote:
> Please see the attached patches for adding
> pre-allocation support into nfsd writes. Comments follow.
>
> Patches:
> a. 01_vfs_fallocate.patch
> Adds vfs_fallocate. Basically, encapsulates the call to
> inode->i_op->fallocate, which is currently called directly from
> sys_fallocate, which takes a file descriptor as argument, but nfsd
> needs to operate on struct file's.
>
> b. 02_init_file_prealloc_limit.patch
> Adds a new member to struct file, to keep track of how much has been
> preallocated for this file. For now, adding to struct file seemed an
> easy way to keep per-file state about preallocation but this can be
> changed to use a nfsd-specific hash table that maps (dev, ino) to
> per-file pre-allocation state.
>
> c. 03_nfsd_fallocate.patch
> Wires in the call to vfs_fallocate into nfsd_vfs_write.
> For now, the function nfsd_get_prealloc_len uses a very simple
> method to determine when and how much to pre-allocate. This can change
> if needed.
> This patch also adds two module_params that control pre-allocation:
>
>  1. /sys/module/nfsd/parameters/nfsd_prealloc
>   Determine whether to pre-allocate.
>
>  2. /sys/module/nfsd/parameters/nfsd_prealloc_len
>   How much to pre-allocate. Default is 5Megs.

So, if I understand the algorithm right:

	- Initialize f_prealloc_len to 0.
	- Ignore any write(offset, cnt) contained entirely in the range
	  (0, f_prealloc_len).
	- For any write outside that range, extend f_prealloc_len to
	  offset + 5MB and call vfs_alloc(., ., offset, 5MB)

(where the 5MB is actually the configurable nfsd_prealloc_len parameter
above).

>
> The patches are based against 2.6.25.11.
>
> See the following two plots for read and write performance, with and
> without pre-allocation support. Tests were run using iozone. The
> filesystem was ext4 with extents enabled. The testbed used two Itanium
> machines as client and server, connected through a Gbit network with
> jumbo frames enabled. The filesystem was aged with various iozone and
> kernel compilation workloads that consumed 45G of a 64G disk.
>
> Server side mount options:
> rw,sync,insecure,no_root_squash,no_subtree_check,no_wdelay
>
> Client side mount options:
> intr,wsize=65536,rsize=65536
>
> 1. Read test
> http://www.gelato.unsw.edu.au/~shehjart/docs/nfsmeasurements/ext4fallocate_read.png

Sorry, I don't understand exactly what iozone is doing in this test (and
the below).  Is it just doing sequential 64k reads (or, below, writes)
through a 2G file?

> Read throughput clearly benefits due to the contiguity of disk blocks.
> In the best case, i.e. with pre-allocation of 4 and 5 Mb during the
> writing of the test file, throughput, during read of the same
> file, more than doubles.
>
> 2. Write test
> http://www.gelato.unsw.edu.au/~shehjart/docs/nfsmeasurements/ext4fallocate_write.png
> Going just by read performance, pre-allocation would be a nice thing
> to have *but* note that write throughput also decreases drastically,
> by almost 10 Mb/sec with just 1Mb of pre-allocation.

So I guess it's not surprising--you're doing extra work at write time in
order to make the reads go faster.

A general question: since this preallocation isn't already being done by
the filesystem, there must be some reason you think it's appropriate for
nfsd but not for other users.  What makes nfsd special?

--b.

>
> A question at this point is, how well does pre-allocation perform
> under other filesystems? I have no idea yet. I'll try to test XFS,
> RSN.
>
> Comments/suggestions are welcome.
>
> Regards
> Shehjar

> diff --git a/fs/open.c b/fs/open.c
> index a99ad09..b5b641a 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -359,39 +359,34 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
>  }
>  #endif
>  
> -asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
> +long vfs_fallocate(struct file * file, int mode, loff_t offset, loff_t len)
>  {
> -	struct file *file;
>  	struct inode *inode;
>  	long ret = -EINVAL;
>  
>  	if (offset < 0 || len <= 0)
>  		goto out;
> -
> +	
>  	/* Return error if mode is not supported */
>  	ret = -EOPNOTSUPP;
>  	if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
>  		goto out;
>  
> -	ret = -EBADF;
> -	file = fget(fd);
> -	if (!file)
> -		goto out;
>  	if (!(file->f_mode & FMODE_WRITE))
> -		goto out_fput;
> +		goto out;
>  	/*
>  	 * Revalidate the write permissions, in case security policy has
>  	 * changed since the files were opened.
>  	 */
>  	ret = security_file_permission(file, MAY_WRITE);
>  	if (ret)
> -		goto out_fput;
> +		goto out;
>  
>  	inode = file->f_path.dentry->d_inode;
>  
>  	ret = -ESPIPE;
>  	if (S_ISFIFO(inode->i_mode))
> -		goto out_fput;
> +		goto out;
>  
>  	ret = -ENODEV;
>  	/*
> @@ -399,19 +394,33 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
>  	 * for directories or not.
>  	 */
>  	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
> -		goto out_fput;
> +		goto out;
>  
>  	ret = -EFBIG;
>  	/* Check for wrap through zero too */
>  	if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
> -		goto out_fput;
> +		goto out;
>  
>  	if (inode->i_op && inode->i_op->fallocate)
>  		ret = inode->i_op->fallocate(inode, mode, offset, len);
>  	else
>  		ret = -EOPNOTSUPP;
> +out:
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfs_fallocate);
>  
> -out_fput:
> +asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
> +{
> +	struct file *file;
> +	long ret;
> +
> +	ret = -EBADF;
> +	file = fget(fd);
> +	if (!file)
> +		goto out;
> +
> +	ret = vfs_fallocate(file, mode, offset, len);
>  	fput(file);
>  out:
>  	return ret;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index d8e2762..498a422 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1287,6 +1287,7 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
>  		unsigned long, loff_t *);
>  extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
>  		unsigned long, loff_t *);
> +extern long vfs_fallocate(struct file * file, int mode, loff_t offset, loff_t len);
>  
>  struct super_operations {
>     	struct inode *(*alloc_inode)(struct super_block *sb);

> diff --git a/fs/open.c b/fs/open.c
> index b5b641a..77820d3 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -832,6 +832,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
>  	f->f_path.mnt = mnt;
>  	f->f_pos = 0;
>  	f->f_op = fops_get(inode->i_fop);
> +	f->f_prealloc_limit = 0;
>  	file_move(f, &inode->i_sb->s_files);
>  
>  	error = security_dentry_open(f);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 498a422..5aaf82b 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -799,6 +799,7 @@ struct file {
>  	struct fown_struct	f_owner;
>  	unsigned int		f_uid, f_gid;
>  	struct file_ra_state	f_ra;
> +	unsigned long 		f_prealloc_limit;
>  
>  	u64			f_version;
>  #ifdef CONFIG_SECURITY

> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 46f59d5..3d7c48d 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -57,6 +57,7 @@
>  #include <linux/jhash.h>
>  
>  #include <asm/uaccess.h>
> +#include <linux/falloc.h>
>  
>  #define NFSDDBG_FACILITY		NFSDDBG_FILEOP
>  
> @@ -89,6 +90,14 @@ static struct raparms *		raparml;
>  #define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
>  static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
>  
> +/* User-definable preallocation size in bytes */
> +unsigned long nfsd_prealloc_len = 5242880;
> +module_param(nfsd_prealloc_len, ulong, S_IRUGO|S_IWUSR);
> +
> +/* 0 if preallocation is disabled, 1 otherwise. */
> +int nfsd_prealloc = 0;
> +module_param(nfsd_prealloc, int, S_IRUGO|S_IWUSR);
> +
>  /* 
>   * Called from nfsd_lookup and encode_dirent. Check if we have crossed 
>   * a mount point.
> @@ -954,6 +963,21 @@ static void kill_suid(struct dentry *dentry)
>  	mutex_unlock(&dentry->d_inode->i_mutex);
>  }
>  
> +
> +static unsigned long
> +nfsd_get_prealloc_len(struct file * file, loff_t offset, unsigned long cnt)
> +{
> +	/* Might want to do something more complex here to decide
> +	 * pre-allocation size.
> +	 */
> +	if(file->f_prealloc_limit > (offset + cnt))
> +		return 0;
> +
> +	file->f_prealloc_limit = offset + nfsd_prealloc_len;
> +	return nfsd_prealloc_len;
> +}
> +
> +
>  static __be32
>  nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
>  				loff_t offset, struct kvec *vec, int vlen,
> @@ -966,6 +990,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
>  	__be32			err = 0;
>  	int			host_err;
>  	int			stable = *stablep;
> +	unsigned long		prealloc_len = 0;
>  
>  #ifdef MSNFS
>  	err = nfserr_perm;
> @@ -998,6 +1023,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
>  	if (stable && !EX_WGATHER(exp))
>  		file->f_flags |= O_SYNC;
>  
> +	if(nfsd_prealloc && inode->i_op && inode->i_op->fallocate) {
> +		prealloc_len = nfsd_get_prealloc_len(file, offset, cnt);
> +		if(prealloc_len > 0)
> +			vfs_fallocate(file, FALLOC_FL_KEEP_SIZE, offset, prealloc_len);
> +	}
>  	/* Write the data. */
>  	oldfs = get_fs(); set_fs(KERNEL_DS);
>  	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux