On Thu, Aug 20, 2015 at 4:17 AM, Jeff Layton <jlayton@xxxxxxxxxxxxxxx> wrote: > Currently, NFSv2/3 reads and writes have to open a file, do the read or > write and then close it again for each RPC. This is highly inefficient, > especially when the underlying filesystem has a relatively slow open > routine. > > This patch adds a new open file cache to knfsd. Rather than doing an > open for each RPC, the read/write handlers can call into this cache to > see if there is one already there for the correct filehandle and > NFS_MAY_READ/WRITE flags. > > If there isn't an entry, then we create a new one and attempt to > perform the open. If there is, then we wait until the entry is fully > instantiated and return it if it is at the end of the wait. If it's > not, then we attempt to take over construction. > > Since the main goal is to speed up NFSv2/3 I/O, we don't want to > close these files on last put of these objects. We need to keep them > around for a little while since we never know when the next READ/WRITE > will come in. > > Signed-off-by: Jeff Layton <jeff.layton@xxxxxxxxxxxxxxx> > --- > fs/nfsd/Makefile | 3 +- > fs/nfsd/filecache.c | 273 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > fs/nfsd/filecache.h | 29 ++++++ > fs/nfsd/nfssvc.c | 10 +- > 4 files changed, 313 insertions(+), 2 deletions(-) > create mode 100644 fs/nfsd/filecache.c > create mode 100644 fs/nfsd/filecache.h > > diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile > index 9a6028e120c6..8908bb467727 100644 > --- a/fs/nfsd/Makefile > +++ b/fs/nfsd/Makefile > @@ -10,7 +10,8 @@ obj-$(CONFIG_NFSD) += nfsd.o > nfsd-y += trace.o > > nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ > - export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o > + export.o auth.o lockd.o nfscache.o nfsxdr.o \ > + stats.o filecache.o > nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o > nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o > nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o > diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c > new file mode 100644 > index 000000000000..5bb56fa9002f > --- /dev/null > +++ b/fs/nfsd/filecache.c > @@ -0,0 +1,273 @@ > +/* > + * Open file cache. > + * > + * (c) 2015 - Jeff Layton <jeff.layton@xxxxxxxxxxxxxxx> > + */ > + > +#include <linux/hash.h> > +#include <linux/slab.h> > +#include <linux/hash.h> > +#include <linux/file.h> > +#include <linux/sched.h> > + > +#include "vfs.h" > +#include "nfsd.h" > +#include "nfsfh.h" > +#include "filecache.h" > + > +#define NFSDDBG_FACILITY NFSDDBG_FH > + > +/* hash table for nfs4_file */ > +#define NFSD_FILE_HASH_BITS 8 > +#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS) > + > +/* We only care about NFSD_MAY_READ/WRITE for this cache */ > +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) > + > +struct nfsd_fcache_bucket { > + struct hlist_head nfb_head; > + spinlock_t nfb_lock; > +}; > + > +static struct nfsd_fcache_bucket *nfsd_file_hashtbl; > + > +static struct nfsd_file * > +nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval) > +{ > + struct nfsd_file *nf; > + > + /* FIXME: create a new slabcache for these? */ > + nf = kzalloc(sizeof(*nf), GFP_KERNEL); > + if (nf) { > + INIT_HLIST_NODE(&nf->nf_node); > + INIT_LIST_HEAD(&nf->nf_dispose); > + nf->nf_inode = inode; > + nf->nf_hashval = hashval; > + atomic_set(&nf->nf_ref, 1); > + nf->nf_may = NFSD_FILE_MAY_MASK & may; > + } > + return nf; > +} > + > +static void > +nfsd_file_put_final(struct nfsd_file *nf) > +{ > + if (nf->nf_file) > + fput(nf->nf_file); > + kfree_rcu(nf, nf_rcu); > +} > + > +static bool > +nfsd_file_unhash(struct nfsd_file *nf) > +{ > + lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); > + > + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { > + clear_bit(NFSD_FILE_HASHED, &nf->nf_flags); nit: why not test_and_clear_bit()? > + hlist_del_rcu(&nf->nf_node); > + return true; > + } > + return false; > +} > + > +static void > +nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose) > +{ > + lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); > + > + if (!nfsd_file_unhash(nf)) > + return; > + if (!atomic_dec_and_test(&nf->nf_ref)) > + return; > + > + list_add(&nf->nf_dispose, dispose); > +} > + > +void > +nfsd_file_put(struct nfsd_file *nf) > +{ > + if (!atomic_dec_and_test(&nf->nf_ref)) > + return; > + > + WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags)); > + nfsd_file_put_final(nf); > +} > + > +struct nfsd_file * > +nfsd_file_get(struct nfsd_file *nf) > +{ > + if (likely(atomic_inc_not_zero(&nf->nf_ref))) > + return nf; > + return NULL; > +} > + > +static void > +nfsd_file_dispose_list(struct list_head *dispose) > +{ > + struct nfsd_file *nf; > + > + while(!list_empty(dispose)) { > + nf = list_first_entry(dispose, struct nfsd_file, nf_dispose); > + list_del(&nf->nf_dispose); > + nfsd_file_put_final(nf); > + } > +} > + > +int > +nfsd_file_cache_init(void) > +{ > + unsigned int i; > + > + if (nfsd_file_hashtbl) > + return 0; > + > + nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE, > + sizeof(*nfsd_file_hashtbl), GFP_KERNEL); > + if (!nfsd_file_hashtbl) > + goto out_nomem; > + > + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { > + INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head); > + spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock); > + } > + > + return 0; > +out_nomem: > + printk(KERN_ERR "nfsd: failed to init nfsd file cache\n"); > + return -ENOMEM; > +} > + > +void > +nfsd_file_cache_shutdown(void) > +{ > + unsigned int i; > + struct nfsd_file *nf; > + LIST_HEAD(dispose); > + > + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { > + spin_lock(&nfsd_file_hashtbl[i].nfb_lock); > + while(!hlist_empty(&nfsd_file_hashtbl[i].nfb_head)) { > + nf = hlist_entry(nfsd_file_hashtbl[i].nfb_head.first, > + struct nfsd_file, nf_node); > + nfsd_file_unhash_and_release_locked(nf, &dispose); > + } > + spin_unlock(&nfsd_file_hashtbl[i].nfb_lock); > + nfsd_file_dispose_list(&dispose); > + } > + kfree(nfsd_file_hashtbl); > + nfsd_file_hashtbl = NULL; > +} > + > +/* > + * Search nfsd_file_hashtbl[] for file. We hash on the filehandle and also on > + * the NFSD_MAY_READ/WRITE flags. If the file is open for r/w, then it's usable > + * for either. > + */ > +static struct nfsd_file * > +nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, > + unsigned int hashval) > +{ > + struct nfsd_file *nf; > + unsigned char need = may_flags & NFSD_FILE_MAY_MASK; > + > + hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, > + nf_node) { > + if ((need & nf->nf_may) != need) > + continue; > + if (nf->nf_inode == inode) > + return nfsd_file_get(nf); > + } > + return NULL; > +} > + > +__be32 > +nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > + unsigned int may_flags, struct nfsd_file **pnf) > +{ > + __be32 status = nfs_ok; > + struct nfsd_file *nf, *new = NULL; > + struct inode *inode; > + unsigned int hashval; > + > + /* FIXME: skip this if fh_dentry is already set? */ > + status = fh_verify(rqstp, fhp, S_IFREG, may_flags); > + if (status != nfs_ok) > + return status; > + > + /* Mask off any extraneous bits */ > + may_flags &= NFSD_FILE_MAY_MASK; > + > + inode = d_inode(fhp->fh_dentry); > + hashval = (unsigned int)hash_ptr(inode, NFSD_FILE_HASH_BITS); > +retry: > + rcu_read_lock(); > + nf = nfsd_file_find_locked(inode, may_flags, hashval); > + rcu_read_unlock(); > + if (nf) > + goto wait_for_construction; > + > + if (!new) { > + new = nfsd_file_alloc(inode, may_flags, hashval); > + if (!new) > + return nfserr_jukebox; > + } > + > + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); > + nf = nfsd_file_find_locked(inode, may_flags, hashval); > + if (likely(nf == NULL)) { > + /* Take reference for the hashtable */ > + atomic_inc(&new->nf_ref); > + __set_bit(NFSD_FILE_HASHED, &new->nf_flags); > + __set_bit(NFSD_FILE_PENDING, &new->nf_flags); > + hlist_add_head_rcu(&new->nf_node, > + &nfsd_file_hashtbl[hashval].nfb_head); > + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); > + nf = new; > + new = NULL; > + goto open_file; > + } > + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); > + > +wait_for_construction: > + wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE); > + > + /* Did construction of this file fail? */ > + if (!nf->nf_file) { > + /* > + * We can only take over construction for this nfsd_file if the > + * MAY flags are equal. Otherwise, we put the reference and try > + * again. > + */ > + if (may_flags != nf->nf_may) { > + nfsd_file_put(nf); > + goto retry; > + } > + > + /* try to take over construction for this file */ > + if (test_and_set_bit(NFSD_FILE_PENDING, &nf->nf_flags)) > + goto wait_for_construction; > + goto open_file; > + } > + > + /* > + * We have a file that was opened in the context of another rqst. We > + * must check permissions. Since we're dealing with open files here, > + * we always want to set the OWNER_OVERRIDE bit. > + */ > + status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, > + may_flags|NFSD_MAY_OWNER_OVERRIDE); > +out: > + if (status == nfs_ok) > + *pnf = nf; > + else > + nfsd_file_put(nf); > + > + if (new) > + nfsd_file_put(new); > + return status; > +open_file: > + status = nfsd_open(rqstp, fhp, S_IFREG, may_flags, &nf->nf_file); > + clear_bit(NFSD_FILE_PENDING, &nf->nf_flags); DO you need clear_bit_unlock() and smp_mb__after_atomic() to make sure waiters can see the flag change when they are waken up? Cheers, Tao > + wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); > + goto out; > +} > diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h > new file mode 100644 > index 000000000000..b0f500353ed4 > --- /dev/null > +++ b/fs/nfsd/filecache.h > @@ -0,0 +1,29 @@ > +#ifndef _FS_NFSD_FILECACHE_H > +#define _FS_NFSD_FILECACHE_H > +/* > + * A representation of a file that has been opened by knfsd. These are hashed > + * in the hashtable by inode pointer value. Note that this object doesn't > + * hold a reference to the inode by itself, so the nf_inode pointer should > + * never be dereferenced, only be used for comparison. > + */ > +struct nfsd_file { > + struct hlist_node nf_node; > + struct list_head nf_dispose; > + struct rcu_head nf_rcu; > + struct file *nf_file; > +#define NFSD_FILE_HASHED (0) > +#define NFSD_FILE_PENDING (1) > + unsigned long nf_flags; > + struct inode *nf_inode; > + unsigned int nf_hashval; > + atomic_t nf_ref; > + unsigned char nf_may; > +}; > + > +int nfsd_file_cache_init(void); > +void nfsd_file_cache_shutdown(void); > +void nfsd_file_put(struct nfsd_file *nf); > +struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); > +__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, > + unsigned int may_flags, struct nfsd_file **nfp); > +#endif /* _FS_NFSD_FILECACHE_H */ > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c > index ad4e2377dd63..d816bb3faa6e 100644 > --- a/fs/nfsd/nfssvc.c > +++ b/fs/nfsd/nfssvc.c > @@ -22,6 +22,7 @@ > #include "cache.h" > #include "vfs.h" > #include "netns.h" > +#include "filecache.h" > > #define NFSDDBG_FACILITY NFSDDBG_SVC > > @@ -224,11 +225,17 @@ static int nfsd_startup_generic(int nrservs) > if (ret) > goto dec_users; > > - ret = nfs4_state_start(); > + ret = nfsd_file_cache_init(); > if (ret) > goto out_racache; > + > + ret = nfs4_state_start(); > + if (ret) > + goto out_file_cache; > return 0; > > +out_file_cache: > + nfsd_file_cache_shutdown(); > out_racache: > nfsd_racache_shutdown(); > dec_users: > @@ -242,6 +249,7 @@ static void nfsd_shutdown_generic(void) > return; > > nfs4_state_shutdown(); > + nfsd_file_cache_shutdown(); > nfsd_racache_shutdown(); > } > > -- > 2.4.3 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html