On Mon, May 3, 2010 at 7:48 AM, Benny Halevy <bhalevy@xxxxxxxxxxx> wrote: > On Apr. 26, 2010, 19:18 +0300, andros@xxxxxxxxxx wrote: >> From: Andy Adamson <andros@xxxxxxxxxx> >> >> A shared RCU device ID cache servicing multiple mounts of a single layout type >> per meta data server (struct nfs_client). >> >> Device IDs of type deviceid4 are required by all layout types, long lived and >> read at each I/O. They are added to the deviceid cache at first reference by >> a layout via GETDEVICEINFO and (currently) are only removed at umount. >> >> Reference count the device ID cache for each mounted file system >> in the initialize_mountpoint layoutdriver_io_operation. >> >> Dereference the device ID cache for each file system in the uninitialize_mountpoint >> layoutdriver_io_operation, called at umount. >> >> Each layout segment assigns a pointer and takes a reference to the >> nfs4_deviceid structure identified by the layout deviceid. >> This is so that there are no deviceid lookups for the normal I/O path. >> >> Even though required by all layout types, the deviceid is not exposed in the >> LAYOUTGET4res but is instead hidden in the opaque layouttype4. >> >> Therefore, each layout type alloc_lseg calls nfs4_set_layout_deviceid, >> and free_lseg calls nfs4_unset_layout_deviceid. >> >> While the file layout driver will not cache very many deviceids, the object >> and block layout drivers could cache 100s for a large installation. >> Use an hlist. 
>> >> Signed-off-by: Andy Adamson <andros@xxxxxxxxxx> >> --- >> fs/nfs/pnfs.c | 167 +++++++++++++++++++++++++++++++++++++++++++++ >> include/linux/nfs4_pnfs.h | 50 +++++++++++++ >> include/linux/nfs_fs_sb.h | 1 + >> 3 files changed, 218 insertions(+), 0 deletions(-) >> >> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c >> index 91572aa..bf906cc 100644 >> --- a/fs/nfs/pnfs.c >> +++ b/fs/nfs/pnfs.c >> @@ -45,6 +45,7 @@ >> #include <linux/nfs4.h> >> #include <linux/pnfs_xdr.h> >> #include <linux/nfs4_pnfs.h> >> +#include <linux/rculist.h> >> >> #include "internal.h" >> #include "nfs4_fs.h" >> @@ -2296,3 +2297,169 @@ struct pnfs_client_operations pnfs_ops = { >> >> EXPORT_SYMBOL(pnfs_unregister_layoutdriver); >> EXPORT_SYMBOL(pnfs_register_layoutdriver); >> + >> + >> +/* Device ID cache. Supports one layout type per struct nfs_client */ >> +int >> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp, >> + void (*free_callback)(struct kref *)) >> +{ >> + struct nfs4_deviceid_cache *c; >> + >> + c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL); >> + if (!c) >> + return -ENOMEM; >> + spin_lock(&clp->cl_lock); >> + if (clp->cl_devid_cache != NULL) { >> + kref_get(&clp->cl_devid_cache->dc_kref); >> + spin_unlock(&clp->cl_lock); >> + dprintk("%s [kref [%d]]\n", __func__, >> + atomic_read(&clp->cl_devid_cache->dc_kref.refcount)); >> + kfree(c); >> + } else { >> + int i; >> + >> + spin_lock_init(&c->dc_lock); >> + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++) >> + INIT_HLIST_HEAD(&c->dc_deviceids[i]); >> + kref_init(&c->dc_kref); >> + c->dc_free_callback = free_callback; >> + clp->cl_devid_cache = c; >> + spin_unlock(&clp->cl_lock); >> + dprintk("%s [new]\n", __func__); >> + } >> + return 0; >> +} >> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache); >> + >> +void >> +nfs4_init_deviceid_node(struct nfs4_deviceid *d) >> +{ >> + INIT_HLIST_NODE(&d->de_node); >> + kref_init(&d->de_kref); >> +} >> +EXPORT_SYMBOL(nfs4_init_deviceid_node); >> + >> +/* Called from 
layoutdriver_io_operations->alloc_lseg */ >> +void >> +nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d) >> +{ >> + dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); >> + l->deviceid = d; >> + kref_get(&d->de_kref); >> +} >> +EXPORT_SYMBOL(nfs4_set_layout_deviceid); >> + >> +/* Called from layoutdriver_io_operations->free_lseg */ >> +void >> +nfs4_unset_layout_deviceid(struct pnfs_layout_segment *l, >> + struct nfs4_deviceid *d, >> + void (*free_callback)(struct kref *)) >> +{ >> + dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); >> + l->deviceid = NULL; >> + kref_put(&d->de_kref, free_callback); >> +} >> +EXPORT_SYMBOL(nfs4_unset_layout_deviceid); >> + >> +struct nfs4_deviceid * >> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) >> +{ >> + struct nfs4_deviceid *d; >> + struct hlist_node *n; >> + long hash = nfs4_deviceid_hash(id); >> + >> + dprintk("--> %s hash %ld\n", __func__, hash); >> + rcu_read_lock(); >> + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { >> + if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) { >> + rcu_read_unlock(); >> + return d; >> + } >> + } >> + rcu_read_unlock(); >> + return NULL; >> +} >> +EXPORT_SYMBOL(nfs4_find_deviceid); >> + >> +/* >> + * Add or kref_get a deviceid. >> + * GETDEVICEINFOs for same deviceid can race. 
If deviceid is found, discard new >> + */ >> +struct nfs4_deviceid * >> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new) >> +{ >> + struct nfs4_deviceid *d; >> + struct hlist_node *n; >> + long hash = nfs4_deviceid_hash(&new->de_id); >> + >> + dprintk("--> %s hash %ld\n", __func__, hash); >> + spin_lock(&c->dc_lock); >> + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { >> + if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) { >> + spin_unlock(&c->dc_lock); >> + dprintk("%s [discard]\n", __func__); >> + c->dc_free_callback(&new->de_kref); >> + return d; >> + } >> + } >> + hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); >> + spin_unlock(&c->dc_lock); >> + dprintk("%s [new]\n", __func__); >> + return new; >> +} >> +EXPORT_SYMBOL(nfs4_add_deviceid); >> + >> +static int >> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash) >> +{ >> + struct nfs4_deviceid *d; >> + struct hlist_node *n; >> + >> + dprintk("--> %s hash %ld\n", __func__, hash); >> + spin_lock(&c->dc_lock); >> + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { >> + hlist_del_rcu(&d->de_node); >> + spin_unlock(&c->dc_lock); >> + synchronize_rcu(); >> + dprintk("%s [%d]\n", __func__, >> + atomic_read(&d->de_kref.refcount)); >> + kref_put(&d->de_kref, c->dc_free_callback); >> + return 1; >> + } >> + spin_unlock(&c->dc_lock); >> + return 0; >> +} >> + >> +static void >> +nfs4_free_deviceid_cache(struct kref *kref) >> +{ >> + struct nfs4_deviceid_cache *cache = >> + container_of(kref, struct nfs4_deviceid_cache, dc_kref); >> + int more; >> + long i; >> + >> + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) { >> + more = 1; >> + while (more) >> + more = nfs4_remove_deviceid(cache, i); > > Andy, this can be simplified to > > while (nfs4_remove_deviceid(cache, i)) > ; > > If ok with you, I'll make this change upon merging. Yes - looks fine, thanks. 
-->Andy > > Benny > >> + } >> + kfree(cache); >> +} >> + >> +void >> +nfs4_put_deviceid_cache(struct nfs_client *clp) >> +{ >> + struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache; >> + int refcount; >> + >> + dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); >> + spin_lock(&clp->cl_lock); >> + refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount); >> + if (refcount == 1) >> + clp->cl_devid_cache = NULL; >> + spin_unlock(&clp->cl_lock); >> + dprintk("%s [%d]\n", __func__, refcount); >> + kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache); >> +} >> +EXPORT_SYMBOL(nfs4_put_deviceid_cache); >> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h >> index 3caac60..3b7aeb7 100644 >> --- a/include/linux/nfs4_pnfs.h >> +++ b/include/linux/nfs4_pnfs.h >> @@ -106,6 +106,7 @@ struct pnfs_layout_segment { >> struct kref kref; >> bool valid; >> struct pnfs_layout_type *layout; >> + struct nfs4_deviceid *deviceid; >> u8 ld_data[]; /* layout driver private data */ >> }; >> >> @@ -275,6 +276,55 @@ struct pnfs_devicelist { >> struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; >> }; >> >> +/* >> + * Device ID RCU cache. A device ID is unique per client ID and layout type. 
>> + */ >> +#define NFS4_DEVICE_ID_HASH_BITS 5 >> +#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) >> +#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) >> + >> +static inline u32 >> +nfs4_deviceid_hash(struct pnfs_deviceid *id) >> +{ >> + unsigned char *cptr = (unsigned char *)id->data; >> + unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE; >> + u32 x = 0; >> + >> + while (nbytes--) { >> + x *= 37; >> + x += *cptr++; >> + } >> + return x & NFS4_DEVICE_ID_HASH_MASK; >> +} >> + >> +struct nfs4_deviceid_cache { >> + spinlock_t dc_lock; >> + struct kref dc_kref; >> + void (*dc_free_callback)(struct kref *); >> + struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; >> +}; >> + >> +/* Device ID cache node */ >> +struct nfs4_deviceid { >> + struct hlist_node de_node; >> + struct pnfs_deviceid de_id; >> + struct kref de_kref; >> +}; >> + >> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *, >> + void (*free_callback)(struct kref *)); >> +extern void nfs4_put_deviceid_cache(struct nfs_client *); >> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *); >> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *, >> + struct pnfs_deviceid *); >> +extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid_cache *, >> + struct nfs4_deviceid *); >> +extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *, >> + struct nfs4_deviceid *); >> +extern void nfs4_unset_layout_deviceid(struct pnfs_layout_segment *, >> + struct nfs4_deviceid *, >> + void (*free_callback)(struct kref *)); >> + >> /* pNFS client callback functions. >> * These operations allow the layout driver to access pNFS client >> * specific information or call pNFS client->server operations. 
>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h >> index 8522461..ef2e18e 100644 >> --- a/include/linux/nfs_fs_sb.h >> +++ b/include/linux/nfs_fs_sb.h >> @@ -87,6 +87,7 @@ struct nfs_client { >> u32 cl_exchange_flags; >> struct nfs4_session *cl_session; /* sharred session */ >> struct list_head cl_lo_inodes; /* Inodes having layouts */ >> + struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ >> #endif /* CONFIG_NFS_V4_1 */ >> >> #ifdef CONFIG_NFS_FSCACHE > > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html