Hi, while tracking down a very interesting interaction between Sun/SAM-FS and Linux NFS clients, we found out that the value of NFS_MAX_READAHEAD is too aggressive/big for the specific use-case. For testing, instead of always recompiling the kernel with different values, I came up with the following patch. It introduces a tunable "/proc/sys/fs/nfs/nfs_ra_factor" with possible values between 0 and 15. Not sure whether it is actually a good thing to have. It would be better to set the read-ahead factor per filesystem via a mount option. The patch is against 2.6.24. It applies with offsets against 2.6.25-rc9. In case my mail client messes up the whitespace, the patch is also attached. Signed-off-by: Martin Knoblauch <knobi@xxxxxxxxxxxx> diff -ru linux-2.6.24-orig/fs/nfs/client.c linux-2.6.24-nfs_ra/fs/nfs/client.c --- linux-2.6.24-orig/fs/nfs/client.c 2008-01-24 23:58:37.000000000 +0100 +++ linux-2.6.24-nfs_ra/fs/nfs/client.c 2008-04-14 16:44:22.000000000 +0200 @@ -601,6 +601,11 @@ } /* + * NFS Client Read-Ahead factor +*/ +unsigned int nfs_ra_factor; + +/* * Load up the server record from information gained in an fsinfo record */ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) @@ -625,7 +630,9 @@ server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; + printk(KERN_INFO "nfs_server_set_fsinfo: rsize, wsize, rpages, nfs_ra_factor, ra_pages: %d %d %d %d %d\n", + server->rsize,server->wsize,server->rpages,nfs_ra_factor,server->rpages * nfs_ra_factor); + server->backing_dev_info.ra_pages = server->rpages * nfs_ra_factor; if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; @@ -1359,6 +1366,8 @@ { struct proc_dir_entry *p; + nfs_ra_factor = NFS_MAX_READAHEAD; + proc_fs_nfs = proc_mkdir("nfsfs", proc_root_fs); if (!proc_fs_nfs) goto error_0; diff -ru linux-2.6.24-orig/fs/nfs/sysctl.c 
linux-2.6.24-nfs_ra/fs/nfs/sysctl.c --- linux-2.6.24-orig/fs/nfs/sysctl.c 2008-01-24 23:58:37.000000000 +0100 +++ linux-2.6.24-nfs_ra/fs/nfs/sysctl.c 2008-04-14 16:15:03.000000000 +0200 @@ -14,9 +14,12 @@ #include <linux/nfs_fs.h> #include "callback.h" +#include "internal.h" static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; +static const unsigned int min_nfs_ra_factor = 0; +static const unsigned int max_nfs_ra_factor = NFS_MAX_READAHEAD; static struct ctl_table_header *nfs_callback_sysctl_table; static ctl_table nfs_cb_sysctls[] = { @@ -58,6 +61,16 @@ .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nfs_ra_factor", + .data = &nfs_ra_factor, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (unsigned int *)&min_nfs_ra_factor, + .extra2 = (unsigned int *)&max_nfs_ra_factor, + }, { .ctl_name = 0 } }; diff -ru linux-2.6.24-orig/include/linux/nfs_fs.h linux-2.6.24-nfs_ra/include/linux/nfs_fs.h --- linux-2.6.24-orig/include/linux/nfs_fs.h 2008-01-24 23:58:37.000000000 +0100 +++ linux-2.6.24-nfs_ra/include/linux/nfs_fs.h 2008-04-14 15:58:57.000000000 +0200 @@ -415,6 +415,11 @@ extern void nfs_writedata_release(void *); /* + * linux/fs/nfs/client.c +*/ +extern unsigned int nfs_ra_factor; + +/* * Try to write back everything synchronously (but check the * return value!) */ Cheers Martin ------------------------------------------------------ Martin Knoblauch email: k n o b i AT knobisoft DOT de www: http://www.knobisoft.de
diff -ru linux-2.6.24-orig/fs/nfs/client.c linux-2.6.24-nfs_ra/fs/nfs/client.c --- linux-2.6.24-orig/fs/nfs/client.c 2008-01-24 23:58:37.000000000 +0100 +++ linux-2.6.24-nfs_ra/fs/nfs/client.c 2008-04-14 16:44:22.000000000 +0200 @@ -601,6 +601,11 @@ } /* + * NFS Client Read-Ahead factor +*/ +unsigned int nfs_ra_factor; + +/* * Load up the server record from information gained in an fsinfo record */ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) @@ -625,7 +630,9 @@ server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; + printk(KERN_INFO "nfs_server_set_fsinfo: rsize, wsize, rpages, nfs_ra_factor, ra_pages: %d %d %d %d %d\n", + server->rsize,server->wsize,server->rpages,nfs_ra_factor,server->rpages * nfs_ra_factor); + server->backing_dev_info.ra_pages = server->rpages * nfs_ra_factor; if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; @@ -1359,6 +1366,8 @@ { struct proc_dir_entry *p; + nfs_ra_factor = NFS_MAX_READAHEAD; + proc_fs_nfs = proc_mkdir("nfsfs", proc_root_fs); if (!proc_fs_nfs) goto error_0; diff -ru linux-2.6.24-orig/fs/nfs/sysctl.c linux-2.6.24-nfs_ra/fs/nfs/sysctl.c --- linux-2.6.24-orig/fs/nfs/sysctl.c 2008-01-24 23:58:37.000000000 +0100 +++ linux-2.6.24-nfs_ra/fs/nfs/sysctl.c 2008-04-14 16:15:03.000000000 +0200 @@ -14,9 +14,12 @@ #include <linux/nfs_fs.h> #include "callback.h" +#include "internal.h" static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; +static const unsigned int min_nfs_ra_factor = 0; +static const unsigned int max_nfs_ra_factor = NFS_MAX_READAHEAD; static struct ctl_table_header *nfs_callback_sysctl_table; static ctl_table nfs_cb_sysctls[] = { @@ -58,6 +61,16 @@ .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nfs_ra_factor", + .data = &nfs_ra_factor, + .maxlen = 
sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (unsigned int *)&min_nfs_ra_factor, + .extra2 = (unsigned int *)&max_nfs_ra_factor, + }, { .ctl_name = 0 } }; diff -ru linux-2.6.24-orig/include/linux/nfs_fs.h linux-2.6.24-nfs_ra/include/linux/nfs_fs.h --- linux-2.6.24-orig/include/linux/nfs_fs.h 2008-01-24 23:58:37.000000000 +0100 +++ linux-2.6.24-nfs_ra/include/linux/nfs_fs.h 2008-04-14 15:58:57.000000000 +0200 @@ -415,6 +415,11 @@ extern void nfs_writedata_release(void *); /* + * linux/fs/nfs/client.c +*/ +extern unsigned int nfs_ra_factor; + +/* * Try to write back everything synchronously (but check the * return value!) */