On Wed, Jul 16, 2008 at 11:44:13AM +1000, Shehjar Tikoo wrote:
Please see the attached patches for adding
pre-allocation support into nfsd writes. Comments follow.
Patches:
a. 01_vfs_fallocate.patch
Adds vfs_fallocate, which encapsulates the call to
inode->i_op->fallocate. That call is currently made directly from
sys_fallocate, which takes a file descriptor as its argument, whereas
nfsd needs to operate on a struct file.
b. 02_init_file_prealloc_limit.patch
Adds a new member (f_prealloc_limit) to struct file, to keep track of how
much has been preallocated for this file. For now, adding it to struct file
seemed an easy way to keep per-file state about preallocation, but this can
be changed to use an nfsd-specific hash table that maps (dev, ino) to
per-file pre-allocation state.
c. 03_nfsd_fallocate.patch
Wires in the call to vfs_fallocate into nfsd_vfs_write.
For now, the function nfsd_get_prealloc_len uses a very simple
method to determine when and how much to pre-allocate. This can change
if needed.
This patch also adds two module_params that control pre-allocation:
1. /sys/module/nfsd/parameters/nfsd_prealloc
Determines whether to pre-allocate (0 = disabled, non-zero = enabled). Default is 0.
2. /sys/module/nfsd/parameters/nfsd_prealloc_len
How much to pre-allocate, in bytes. Default is 5242880 (5 MB).
So, if I understand the algorithm right:
- Initialize f_prealloc_limit to 0.
- Ignore any write(offset, cnt) contained entirely in the range
(0, f_prealloc_limit).
- For any write outside that range, extend f_prealloc_limit to
offset + 5MB and call vfs_fallocate(., ., offset, 5MB)
(where the 5MB is actually the configurable nfsd_prealloc_len parameter
above).
--b.
A question at this point is: how well does pre-allocation perform
on other filesystems? I have no idea yet. I'll try to test XFS
real soon now.
Comments/suggestions are welcome.
Regards
Shehjar
diff --git a/fs/open.c b/fs/open.c
index a99ad09..b5b641a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -359,39 +359,34 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
}
#endif
-asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
+long vfs_fallocate(struct file * file, int mode, loff_t offset, loff_t len)
{
- struct file *file;
struct inode *inode;
long ret = -EINVAL;
if (offset < 0 || len <= 0)
goto out;
-
+
/* Return error if mode is not supported */
ret = -EOPNOTSUPP;
if (mode && !(mode & FALLOC_FL_KEEP_SIZE))
goto out;
- ret = -EBADF;
- file = fget(fd);
- if (!file)
- goto out;
if (!(file->f_mode & FMODE_WRITE))
- goto out_fput;
+ goto out;
/*
* Revalidate the write permissions, in case security policy has
* changed since the files were opened.
*/
ret = security_file_permission(file, MAY_WRITE);
if (ret)
- goto out_fput;
+ goto out;
inode = file->f_path.dentry->d_inode;
ret = -ESPIPE;
if (S_ISFIFO(inode->i_mode))
- goto out_fput;
+ goto out;
ret = -ENODEV;
/*
@@ -399,19 +394,33 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
* for directories or not.
*/
if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
- goto out_fput;
+ goto out;
ret = -EFBIG;
/* Check for wrap through zero too */
if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
- goto out_fput;
+ goto out;
if (inode->i_op && inode->i_op->fallocate)
ret = inode->i_op->fallocate(inode, mode, offset, len);
else
ret = -EOPNOTSUPP;
+out:
+ return ret;
+}
+EXPORT_SYMBOL(vfs_fallocate);
-out_fput:
+asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
+{
+ struct file *file;
+ long ret;
+
+ ret = -EBADF;
+ file = fget(fd);
+ if (!file)
+ goto out;
+
+ ret = vfs_fallocate(file, mode, offset, len);
fput(file);
out:
return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d8e2762..498a422 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1287,6 +1287,7 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
unsigned long, loff_t *);
+extern long vfs_fallocate(struct file * file, int mode, loff_t offset, loff_t len);
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/fs/open.c b/fs/open.c
index b5b641a..77820d3 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -832,6 +832,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
f->f_path.mnt = mnt;
f->f_pos = 0;
f->f_op = fops_get(inode->i_fop);
+ f->f_prealloc_limit = 0;
file_move(f, &inode->i_sb->s_files);
error = security_dentry_open(f);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 498a422..5aaf82b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -799,6 +799,7 @@ struct file {
struct fown_struct f_owner;
unsigned int f_uid, f_gid;
struct file_ra_state f_ra;
+ unsigned long f_prealloc_limit;
u64 f_version;
#ifdef CONFIG_SECURITY
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 46f59d5..3d7c48d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -57,6 +57,7 @@
#include <linux/jhash.h>
#include <asm/uaccess.h>
+#include <linux/falloc.h>
#define NFSDDBG_FACILITY NFSDDBG_FILEOP
@@ -89,6 +90,14 @@ static struct raparms * raparml;
#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE];
+/* User-definable preallocation size in bytes */
+unsigned long nfsd_prealloc_len = 5242880;
+module_param(nfsd_prealloc_len, ulong, S_IRUGO|S_IWUSR);
+
+/* 0 if preallocation is disabled, 1 otherwise. */
+int nfsd_prealloc = 0;
+module_param(nfsd_prealloc, int, S_IRUGO|S_IWUSR);
+
/*
* Called from nfsd_lookup and encode_dirent. Check if we have crossed
* a mount point.
@@ -954,6 +963,21 @@ static void kill_suid(struct dentry *dentry)
mutex_unlock(&dentry->d_inode->i_mutex);
}
+
+static unsigned long
+nfsd_get_prealloc_len(struct file * file, loff_t offset, unsigned long cnt)
+{
+ /* Might want to do something more complex here to decide
+ * pre-allocation size.
+ */
+ if(file->f_prealloc_limit > (offset + cnt))
+ return 0;
+
+ file->f_prealloc_limit = offset + nfsd_prealloc_len;
+ return nfsd_prealloc_len;
+}
+
+
static __be32
nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen,
@@ -966,6 +990,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
__be32 err = 0;
int host_err;
int stable = *stablep;
+ unsigned long prealloc_len = 0;
#ifdef MSNFS
err = nfserr_perm;
@@ -998,6 +1023,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
if (stable && !EX_WGATHER(exp))
file->f_flags |= O_SYNC;
+ if(nfsd_prealloc && inode->i_op && inode->i_op->fallocate) {
+ prealloc_len = nfsd_get_prealloc_len(file, offset, cnt);
+ if(prealloc_len > 0)
+ vfs_fallocate(file, FALLOC_FL_KEEP_SIZE, offset, prealloc_len);
+ }
/* Write the data. */
oldfs = get_fs(); set_fs(KERNEL_DS);
host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);