On Apr. 22, 2010, 18:47 +0300, "William A. (Andy) Adamson" <androsadamson@xxxxxxxxx> wrote:
> On Thu, Apr 22, 2010 at 7:20 AM, Benny Halevy <bhalevy@xxxxxxxxxxx> wrote:
>> On Apr. 21, 2010, 18:22 +0300, "William A. (Andy) Adamson" <androsadamson@xxxxxxxxx> wrote:
>>> On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@xxxxxxxxxxx> wrote:
>>>> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsadamson@xxxxxxxxx> wrote:
>>>>> On Fri, Apr 16, 2010 at 11:52 AM,  <andros@xxxxxxxxxx> wrote:
>>>>>> From: Andy Adamson <andros@xxxxxxxxxx>
>>>>>>
>>>>>> A shared RCU device ID cache servicing multiple mounts of a single
>>>>>> layout type per metadata server (struct nfs_client).
>>>>>>
>>>>>> Device IDs of type deviceid4 are required by all layout types; they are
>>>>>> long lived and read at each I/O. They are added to the deviceid cache at
>>>>>> first reference by a layout via GETDEVICEINFO and (currently) are only
>>>>>> removed at umount.
>>>>>>
>>>>>> Reference count the device ID cache for each mounted file system in the
>>>>>> initialize_mountpoint layoutdriver_io_operation.
>>>>>>
>>>>>> Dereference the device ID cache for each file system in the
>>>>>> uninitialize_mountpoint layoutdriver_io_operation, called at umount.
>>>>>>
>>>>>> Signed-off-by: Andy Adamson <andros@xxxxxxxxxx>
>>>>>> ---
>>>>>>  fs/nfs/pnfs.c             |  119 +++++++++++++++++++++++++++++++++++++++
>>>>>>  include/linux/nfs4_pnfs.h |   27 ++++++++++
>>>>>>  include/linux/nfs_fs_sb.h |    1 +
>>>>>>  3 files changed, 147 insertions(+), 0 deletions(-)
>>>>>>
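To make the refcounting lifecycle concrete, a layout driver's mount/umount
hooks would use this cache roughly as follows. This is only a sketch -- the
hook names, signatures, and the free routine are made up for illustration
and are not part of this patch:

/*
 * Hypothetical layout driver hooks, illustrating the intended calling
 * convention for the shared deviceid cache.  Real layoutdriver
 * initialize/uninitialize_mountpoint signatures may differ.
 */
static void
my_free_deviceid_callback(struct rcu_head *rcu)
{
	struct nfs4_deviceid *d =
		container_of(rcu, struct nfs4_deviceid, de_rcu);

	/* a real driver frees the structure embedding nfs4_deviceid */
	kfree(d);
}

static int
my_initialize_mountpoint(struct nfs_client *clp)
{
	/* first mount allocates the cache; later mounts just take a kref */
	return nfs4_alloc_init_deviceid_cache(clp, my_free_deviceid_callback);
}

static void
my_uninitialize_mountpoint(struct nfs_client *clp)
{
	/* last umount drops the final kref, freeing all cached deviceids */
	if (clp->cl_devid_cache)
		nfs4_put_deviceid_cache(clp->cl_devid_cache);
}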
>>>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>>>> index 91572aa..8492aef 100644
>>>>>> --- a/fs/nfs/pnfs.c
>>>>>> +++ b/fs/nfs/pnfs.c
>>>>>> @@ -45,6 +45,7 @@
>>>>>>  #include <linux/nfs4.h>
>>>>>>  #include <linux/pnfs_xdr.h>
>>>>>>  #include <linux/nfs4_pnfs.h>
>>>>>> +#include <linux/rculist.h>
>>>>>>
>>>>>>  #include "internal.h"
>>>>>>  #include "nfs4_fs.h"
>>>>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
>>>>>>
>>>>>>  EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>>>>>  EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>>>>> +
>>>>>> +
>>>>>> +/* Device ID cache. Supports one layout type per struct nfs_client */
>>>>>> +int
>>>>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>>>>> +			       void (*free_callback)(struct rcu_head *))
>>>>>> +{
>>>>>> +	struct nfs4_deviceid_cache *c;
>>>>>> +
>>>>>> +	c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
>>>>>> +	if (!c)
>>>>>> +		return -ENOMEM;
>>>>>> +	spin_lock(&clp->cl_lock);
>>>>>> +	if (clp->cl_devid_cache != NULL) {
>>>>>> +		kref_get(&clp->cl_devid_cache->dc_kref);
>>>>>> +		spin_unlock(&clp->cl_lock);
>>>>>> +		dprintk("%s [kref [%d]]\n", __func__,
>>>>>> +			atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
>>>>>> +		kfree(c);
>>>>>> +	} else {
>>>>>> +		spin_lock_init(&c->dc_lock);
>>>>>> +		INIT_LIST_HEAD(&c->dc_deviceids);
>>>>>> +		kref_init(&c->dc_kref);
>>>>>> +		c->dc_free_callback = free_callback;
>>>>>> +		c->dc_clp = clp;
>>>>>> +		clp->cl_devid_cache = c;
>>>>>> +		spin_unlock(&clp->cl_lock);
>>>>>> +		dprintk("%s [new]\n", __func__);
>>>>>> +	}
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>>>>> +
>>>>>> +void
>>>>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>>>>> +{
>>>>>> +	INIT_LIST_HEAD(&d->de_node);
>>>>>> +	INIT_RCU_HEAD(&d->de_rcu);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>>>>> +
>>>>>> +struct nfs4_deviceid *
>>>>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
>>>>>> +{
>>>>>> +	struct nfs4_deviceid *d;
>>>>>> +
>>>>>> +	rcu_read_lock();
>>>>>> +	list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> +		if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>>> +			rcu_read_unlock();
>>>>>> +			return d;
>>>>>> +		}
>>>>>> +	}
>>>>>> +	rcu_read_unlock();
>>>>
>>>> I hope this is worth the added complexity...
>>>>
>>>> Out of curiosity, do you have a benchmark comparing the cost
>>>> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
>>>
>>> The deviceid cache is read at each I/O. If we use a spin_lock to
>>
>> Yeah, I see where this goes...
>> In the objects layout driver we get a reference on the device structure
>> at alloc_lseg time and keep a pointer to it throughout the lseg's life time.
>> This saves the deviceid lookup on every I/O.
>
> Perhaps that is a better way to go. How many deviceids are 'normal'
> for the object driver?
>

I'd say on the order of 10's to a few 100's is normal. 1000 and up is a big
installation.

Benny

> -->Andy
>
>>
>> Benny
>>
>>> protect the deviceid cache, this would mean that all I/O is serialized
>>> behind the spin_lock even though the deviceid cache is changed
>>> infrequently. The RCU allows readers to "run almost naked" and does
>>> not serialize I/O behind reading the deviceid cache.
>>>
>>> So I think it is worth it. I have not benchmarked the difference.
>>>
>>>>
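To put the trade-off concretely: a spin_lock-based lookup would look roughly
like the hypothetical sketch below. Every reader bounces the lock's cache
line and excludes all other readers, so concurrent I/O serializes on the
lookup; the rcu_read_lock()/rcu_read_unlock() pair in nfs4_find_deviceid()
costs at most a preempt disable, so lookups run in parallel and dc_lock is
taken only on the infrequent update paths (GETDEVICEINFO and umount).

/* Hypothetical spin_lock-only lookup, shown only for comparison with
 * the RCU version in the patch.  Every I/O would serialize here. */
static struct nfs4_deviceid *
find_deviceid_locked(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
{
	struct nfs4_deviceid *d;

	spin_lock(&c->dc_lock);
	list_for_each_entry(d, &c->dc_deviceids, de_node) {
		if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
			spin_unlock(&c->dc_lock);
			return d;
		}
	}
	spin_unlock(&c->dc_lock);
	return NULL;
}

The price of the RCU variant is the extra care on the update side
(list_*_rcu plus synchronize_rcu before freeing), which the patch confines
to nfs4_add_deviceid() and nfs4_remove_deviceid(); and note that either way
a returned pointer is only as stable as the cache's lifetime rules, which
this patch keeps simple by removing entries only at umount.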
>>>>>> +	return NULL;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>>>>> +
>>>>>> +/*
>>>>>> + * Add or kref_get a deviceid.
>>>>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found,
>>>>>> + * discard new.
>>>>>> + */
>>>>>> +void
>>>>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
>>>>>> +{
>>>>>> +	struct nfs4_deviceid *d;
>>>>>> +
>>>>>> +	spin_lock(&c->dc_lock);
>>>>>> +	list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> +		if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>>> +			spin_unlock(&c->dc_lock);
>>>>>> +			dprintk("%s [discard]\n", __func__);
>>>>>> +			c->dc_free_callback(&new->de_rcu);
>>>>>> +			return;
>>>>>> +		}
>>>>>> +	}
>>>>>> +	list_add_rcu(&new->de_node, &c->dc_deviceids);
>>>>>> +	spin_unlock(&c->dc_lock);
>>>>>> +	dprintk("%s [new]\n", __func__);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>>>>> +
>>>>>> +static int
>>>>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>>>>> +{
>>>>>> +	struct nfs4_deviceid *d;
>>>>>> +
>>>>>> +	spin_lock(&c->dc_lock);
>>>>>> +	list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> +		list_del_rcu(&d->de_node);
>>>>>> +		spin_unlock(&c->dc_lock);
>>>>>> +		synchronize_rcu();
>>>>>> +		c->dc_free_callback(&d->de_rcu);
>>>>>> +		return 1;
>>>>>> +	}
>>>>>> +	spin_unlock(&c->dc_lock);
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>>>>> +{
>>>>>> +	struct nfs4_deviceid_cache *cache =
>>>>>> +		container_of(kref, struct nfs4_deviceid_cache, dc_kref);
>>>>>> +	int more = 1;
>>>>>> +
>>>>>> +	while (more)
>>>>>> +		more = nfs4_remove_deviceid(cache);
>>>>>> +	cache->dc_clp->cl_devid_cache = NULL;
>>>>>
>>>>> Need to take the cl_lock around this assignment:
>>>>>
>>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>>> cache->dc_clp->cl_devid_cache = NULL;
>>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>>
>>>> That must happen atomically before kref_put.
>>>> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
>>>> without a reference count backing it up.
>>>> Otherwise, if accessed concurrently to this piece of code,
>>>> someone might call kref_get while the refcount is zero.
>>>>
>>>> Normally, you'd first clear the referencing pointer to prevent any
>>>> new reference to it and only then kref_put it, e.g.:
>>>>
>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>> tmp = cache->dc_clp->cl_devid_cache;
>>>> cache->dc_clp->cl_devid_cache = NULL;
>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
>>>
>>> Good point. Thanks for the review. I'll rethink and resend.
>>>
>>> -->Andy
>>>
>>>>
>>>> Benny
>>>>
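In other words, the teardown ordering would look roughly like the untested
sketch below. It assumes dc_kref becomes a bare atomic_t (say dc_ref) so the
decrement and the cl_lock acquisition can be combined with
atomic_dec_and_lock() -- the kref helpers give us no way to take cl_lock
atomically around the final put:

/* Untested sketch: the last reference detaches the cache from the
 * nfs_client while holding cl_lock, so a concurrent
 * nfs4_alloc_init_deviceid_cache() can never take a reference on a
 * cache whose count already reached zero.  The RCU-deferred frees
 * then run outside the lock, after the cache is unreachable. */
void
nfs4_put_deviceid_cache(struct nfs_client *clp)
{
	struct nfs4_deviceid_cache *c = clp->cl_devid_cache;

	if (!atomic_dec_and_lock(&c->dc_ref, &clp->cl_lock))
		return;			/* other mounts still hold references */
	clp->cl_devid_cache = NULL;	/* no new references possible */
	spin_unlock(&clp->cl_lock);
	while (nfs4_remove_deviceid(c))
		;			/* list_del_rcu + synchronize_rcu + free */
	kfree(c);
}

Note the synchronize_rcu() calls inside nfs4_remove_deviceid() are why the
list teardown must happen after dropping cl_lock: we may not block while
holding a spinlock.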
>>>>>> +	kfree(cache);
>>>>>> +}
>>>>>> +
>>>>>> +void
>>>>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>>>>> +{
>>>>>> +	dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
>>>>>> +	kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>>>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>>>>>> index 1d521f4..2a88a06 100644
>>>>>> --- a/include/linux/nfs4_pnfs.h
>>>>>> +++ b/include/linux/nfs4_pnfs.h
>>>>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>>>>>  	struct pnfs_deviceid	dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
>>>>>>  };
>>>>>>
>>>>>> +/*
>>>>>> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
>>>>>> + */
>>>>>> +struct nfs4_deviceid_cache {
>>>>>> +	spinlock_t		dc_lock;
>>>>>> +	struct list_head	dc_deviceids;
>>>>>> +	struct kref		dc_kref;
>>>>>> +	void			(*dc_free_callback)(struct rcu_head *);
>>>>>> +	struct nfs_client	*dc_clp;
>>>>>> +};
>>>>>> +
>>>>>> +/* Device ID cache node */
>>>>>> +struct nfs4_deviceid {
>>>>>> +	struct list_head	de_node;
>>>>>> +	struct rcu_head		de_rcu;
>>>>>> +	struct pnfs_deviceid	de_id;
>>>>>> +};
>>>>>> +
>>>>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>>>>> +				void (*free_callback)(struct rcu_head *));
>>>>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>>>>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>>>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
>>>>>> +				struct pnfs_deviceid *);
>>>>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>>>>> +				struct nfs4_deviceid *);
>>>>>> +
>>>>>>  /* pNFS client callback functions.
>>>>>>   * These operations allow the layout driver to access pNFS client
>>>>>>   * specific information or call pNFS client->server operations.
>>>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>>>>> index 8522461..ef2e18e 100644
>>>>>> --- a/include/linux/nfs_fs_sb.h
>>>>>> +++ b/include/linux/nfs_fs_sb.h
>>>>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>>>>>  	u32			cl_exchange_flags;
>>>>>>  	struct nfs4_session	*cl_session;	/* sharred session */
>>>>>>  	struct list_head	cl_lo_inodes;	/* Inodes having layouts */
>>>>>> +	struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>>>>>>  #endif /* CONFIG_NFS_V4_1 */
>>>>>>
>>>>>>  #ifdef CONFIG_NFS_FSCACHE
>>>>>> --
>>>>>> 1.6.6
>>>>>>
>>
>>
>> --
>> Benny Halevy
>> Software Architect
>> Panasas, Inc.
>> bhalevy@xxxxxxxxxxx
>> Tel/Fax: +972-3-647-8340
>> Mobile: +972-54-802-8340
>>
>> Panasas: The Leader in Parallel Storage
>> www.panasas.com

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html