Hi.
Attached is a patch which addresses a continuing problem with
the NFS client generating out of order WRITE requests. While
this is compliant with all of the current protocol
specifications, there are servers in the market which cannot
handle out of order WRITE requests very well. Also, this may
lead to sub-optimal block allocations in the underlying file
system on the server. This may cause read throughput to
be reduced when reading the file from the server.
There has been a lot of work done recently to address out of
order issues on a systemic level. However, the NFS client is
still susceptible to the problem. Out of order WRITE
requests can occur when pdflush is in the middle of writing
out pages while the process dirtying the pages calls
generic_file_buffered_write which calls
generic_perform_write which calls
balance_dirty_pages_rate_limited which ends up calling
writeback_inodes which ends up calling back into the NFS
client to write out dirty pages for the same file that
pdflush happens to be working with.
The attached patch supplies synchronization in the NFS client
code itself. The entry point in the NFS client for both of
the code paths mentioned is nfs_writepages, so serializing
there resolves this issue.
My informal testing showed no degradation in WRITE
throughput.
Thanx...
ps
Signed-off-by: Peter Staubach <staubach@xxxxxxxxxx>
--- linux-2.6.28.i686/fs/nfs/write.c.org
+++ linux-2.6.28.i686/fs/nfs/write.c
@@ -310,6 +310,29 @@ static int nfs_writepages_callback(struc
return ret;
}
+/*
+ * Wait for the WRITE flushing to complete.
+ */
+static int nfs_wait_on_flushing(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int error;
+
+ error = wait_on_bit_lock(&nfsi->flags, NFS_INO_FLUSHING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+
+ return error;
+}
+
+static void nfs_wake_up_after_flushing(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ clear_bit(NFS_INO_FLUSHING, &nfsi->flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&nfsi->flags, NFS_INO_FLUSHING);
+}
+
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
@@ -318,9 +341,14 @@ int nfs_writepages(struct address_space
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
+ nfs_wait_on_flushing(inode);
+
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
nfs_pageio_complete(&pgio);
+
+ nfs_wake_up_after_flushing(inode);
+
if (err < 0)
return err;
if (pgio.pg_error < 0)
--- linux-2.6.28.i686/fs/nfs/pagelist.c.org
+++ linux-2.6.28.i686/fs/nfs/pagelist.c
@@ -176,7 +176,7 @@ void nfs_release_request(struct nfs_page
kref_put(&req->wb_kref, nfs_free_request);
}
-static int nfs_wait_bit_killable(void *word)
+int nfs_wait_bit_killable(void *word)
{
int ret = 0;
--- linux-2.6.28.i686/include/linux/nfs_fs.h.org
+++ linux-2.6.28.i686/include/linux/nfs_fs.h
@@ -207,6 +207,7 @@ struct nfs_inode {
#define NFS_INO_STALE (1) /* possible stale inode */
#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */
#define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */
+#define NFS_INO_FLUSHING (4) /* flushing WRITE requests */
static inline struct nfs_inode *NFS_I(const struct inode *inode)
{
@@ -466,6 +467,11 @@ extern int nfs_writeback_done(struct rpc
extern void nfs_writedata_release(void *);
/*
+ * linux/fs/nfs/pagelist.c
+ */
+extern int nfs_wait_bit_killable(void *);
+
+/*
* Try to write back everything synchronously (but check the
* return value!)
*/