On Apr. 22, 2010, 18:47 +0300, "William A. (Andy) Adamson" <androsadamson@xxxxxxxxx> wrote:
> On Thu, Apr 22, 2010 at 7:20 AM, Benny Halevy <bhalevy@xxxxxxxxxxx> wrote:
>> On Apr. 21, 2010, 18:22 +0300, "William A. (Andy) Adamson" <androsadamson@xxxxxxxxx> wrote:
>>> On Wed, Apr 21, 2010 at 1:59 AM, Benny Halevy <bhalevy@xxxxxxxxxxx> wrote:
>>>> On Apr. 16, 2010, 19:04 +0300, "William A. (Andy) Adamson" <androsadamson@xxxxxxxxx> wrote:
>>>>> On Fri, Apr 16, 2010 at 11:52 AM,  <andros@xxxxxxxxxx> wrote:
>>>>>> From: Andy Adamson <andros@xxxxxxxxxx>
>>>>>>
>>>>>> A shared RCU device ID cache servicing multiple mounts of a single
>>>>>> layout type per metadata server (struct nfs_client).
>>>>>>
>>>>>> Device IDs of type deviceid4 are required by all layout types; they are
>>>>>> long lived and read at each I/O. They are added to the deviceid cache at
>>>>>> first reference by a layout via GETDEVICEINFO and (currently) are only
>>>>>> removed at umount.
>>>>>>
>>>>>> Reference count the device ID cache for each mounted file system in the
>>>>>> initialize_mountpoint layoutdriver_io_operation.
>>>>>>
>>>>>> Dereference the device ID cache for each file system in the
>>>>>> uninitialize_mountpoint layoutdriver_io_operation, called at umount.
>>>>>>
>>>>>> Signed-off-by: Andy Adamson <andros@xxxxxxxxxx>
>>>>>> ---
>>>>>>  fs/nfs/pnfs.c             |  119 +++++++++++++++++++++++++++++++++++++++
>>>>>>  include/linux/nfs4_pnfs.h |   27 ++++++++++
>>>>>>  include/linux/nfs_fs_sb.h |    1 +
>>>>>>  3 files changed, 147 insertions(+), 0 deletions(-)
>>>>>>
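To make the refcounting lifecycle concrete, a layout driver's mount/umount
hooks would use this cache roughly as follows. This is only a sketch -- the
hook names, signatures, and the free routine are made up for illustration
and are not part of this patch:

/*
 * Hypothetical layout driver hooks, illustrating the intended calling
 * convention for the shared deviceid cache.  Real layoutdriver
 * initialize/uninitialize_mountpoint signatures may differ.
 */
static void
my_free_deviceid_callback(struct rcu_head *rcu)
{
	struct nfs4_deviceid *d =
		container_of(rcu, struct nfs4_deviceid, de_rcu);

	/* a real driver frees the structure embedding nfs4_deviceid */
	kfree(d);
}

static int
my_initialize_mountpoint(struct nfs_client *clp)
{
	/* first mount allocates the cache; later mounts just take a kref */
	return nfs4_alloc_init_deviceid_cache(clp, my_free_deviceid_callback);
}

static void
my_uninitialize_mountpoint(struct nfs_client *clp)
{
	/* last umount drops the final kref, freeing all cached deviceids */
	if (clp->cl_devid_cache)
		nfs4_put_deviceid_cache(clp->cl_devid_cache);
}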
>>>>>> diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
>>>>>> index 91572aa..8492aef 100644
>>>>>> --- a/fs/nfs/pnfs.c
>>>>>> +++ b/fs/nfs/pnfs.c
>>>>>> @@ -45,6 +45,7 @@
>>>>>>  #include <linux/nfs4.h>
>>>>>>  #include <linux/pnfs_xdr.h>
>>>>>>  #include <linux/nfs4_pnfs.h>
>>>>>> +#include <linux/rculist.h>
>>>>>>
>>>>>>  #include "internal.h"
>>>>>>  #include "nfs4_fs.h"
>>>>>> @@ -2296,3 +2297,121 @@ struct pnfs_client_operations pnfs_ops = {
>>>>>>
>>>>>>  EXPORT_SYMBOL(pnfs_unregister_layoutdriver);
>>>>>>  EXPORT_SYMBOL(pnfs_register_layoutdriver);
>>>>>> +
>>>>>> +
>>>>>> +/* Device ID cache. Supports one layout type per struct nfs_client */
>>>>>> +int
>>>>>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>>>>>> +			       void (*free_callback)(struct rcu_head *))
>>>>>> +{
>>>>>> +	struct nfs4_deviceid_cache *c;
>>>>>> +
>>>>>> +	c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
>>>>>> +	if (!c)
>>>>>> +		return -ENOMEM;
>>>>>> +	spin_lock(&clp->cl_lock);
>>>>>> +	if (clp->cl_devid_cache != NULL) {
>>>>>> +		kref_get(&clp->cl_devid_cache->dc_kref);
>>>>>> +		spin_unlock(&clp->cl_lock);
>>>>>> +		dprintk("%s [kref [%d]]\n", __func__,
>>>>>> +			atomic_read(&clp->cl_devid_cache->dc_kref.refcount));
>>>>>> +		kfree(c);
>>>>>> +	} else {
>>>>>> +		spin_lock_init(&c->dc_lock);
>>>>>> +		INIT_LIST_HEAD(&c->dc_deviceids);
>>>>>> +		kref_init(&c->dc_kref);
>>>>>> +		c->dc_free_callback = free_callback;
>>>>>> +		c->dc_clp = clp;
>>>>>> +		clp->cl_devid_cache = c;
>>>>>> +		spin_unlock(&clp->cl_lock);
>>>>>> +		dprintk("%s [new]\n", __func__);
>>>>>> +	}
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache);
>>>>>> +
>>>>>> +void
>>>>>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>>>>>> +{
>>>>>> +	INIT_LIST_HEAD(&d->de_node);
>>>>>> +	INIT_RCU_HEAD(&d->de_rcu);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_init_deviceid_node);
>>>>>> +
>>>>>> +struct nfs4_deviceid *
>>>>>> +nfs4_find_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
>>>>>> +{
>>>>>> +	struct nfs4_deviceid *d;
>>>>>> +
>>>>>> +	rcu_read_lock();
>>>>>> +	list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> +		if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>>> +			rcu_read_unlock();
>>>>>> +			return d;
>>>>>> +		}
>>>>>> +	}
>>>>>> +	rcu_read_unlock();
>>>>
>>>> I hope this is worth the added complexity...
>>>>
>>>> Out of curiosity, do you have a benchmark comparing the cost
>>>> of rcu_read_lock (and unlock) vs. spin_lock(&c->dc_lock)?
>>>
>>> The deviceid cache is read at each I/O. If we use a spin_lock to
>>
>> Yeah, I see where this goes...
>> In the objects layout driver we get a reference on the device structure
>> at alloc_lseg time and keep a pointer to it throughout the lseg's life time.
>> This saves the deviceid lookup on every I/O.
>
> Perhaps that is a better way to go. How many deviceids are 'normal'
> for the object driver?
>

I'd say on the order of 10's to a few 100's is normal. 1000 and up is a big
installation.

Benny

> -->Andy
>
>>
>> Benny
>>
>>> protect the deviceid cache, this would mean that all I/O is serialized
>>> behind the spin_lock even though the deviceid cache is changed
>>> infrequently. The RCU allows readers to "run almost naked" and does
>>> not serialize I/O behind reading the deviceid cache.
>>>
>>> So I think it is worth it. I have not benchmarked the difference.
>>>
>>>>
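To put the trade-off concretely: a spin_lock-based lookup would look roughly
like the hypothetical sketch below. Every reader bounces the lock's cache
line and excludes all other readers, so concurrent I/O serializes on the
lookup; the rcu_read_lock()/rcu_read_unlock() pair in nfs4_find_deviceid()
costs at most a preempt disable, so lookups run in parallel and dc_lock is
taken only on the infrequent update paths (GETDEVICEINFO and umount).

/* Hypothetical spin_lock-only lookup, shown only for comparison with
 * the RCU version in the patch.  Every I/O would serialize here. */
static struct nfs4_deviceid *
find_deviceid_locked(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
{
	struct nfs4_deviceid *d;

	spin_lock(&c->dc_lock);
	list_for_each_entry(d, &c->dc_deviceids, de_node) {
		if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) {
			spin_unlock(&c->dc_lock);
			return d;
		}
	}
	spin_unlock(&c->dc_lock);
	return NULL;
}

The price of the RCU variant is the extra care on the update side
(list_*_rcu plus synchronize_rcu before freeing), which the patch confines
to nfs4_add_deviceid() and nfs4_remove_deviceid(); and note that either way
a returned pointer is only as stable as the cache's lifetime rules, which
this patch keeps simple by removing entries only at umount.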
>>>>>> +	return NULL;
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_find_deviceid);
>>>>>> +
>>>>>> +/*
>>>>>> + * Add or kref_get a deviceid.
>>>>>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found,
>>>>>> + * discard new.
>>>>>> + */
>>>>>> +void
>>>>>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
>>>>>> +{
>>>>>> +	struct nfs4_deviceid *d;
>>>>>> +
>>>>>> +	spin_lock(&c->dc_lock);
>>>>>> +	list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> +		if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) {
>>>>>> +			spin_unlock(&c->dc_lock);
>>>>>> +			dprintk("%s [discard]\n", __func__);
>>>>>> +			c->dc_free_callback(&new->de_rcu);
>>>>>> +			return;
>>>>>> +		}
>>>>>> +	}
>>>>>> +	list_add_rcu(&new->de_node, &c->dc_deviceids);
>>>>>> +	spin_unlock(&c->dc_lock);
>>>>>> +	dprintk("%s [new]\n", __func__);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_add_deviceid);
>>>>>> +
>>>>>> +static int
>>>>>> +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c)
>>>>>> +{
>>>>>> +	struct nfs4_deviceid *d;
>>>>>> +
>>>>>> +	spin_lock(&c->dc_lock);
>>>>>> +	list_for_each_entry_rcu(d, &c->dc_deviceids, de_node) {
>>>>>> +		list_del_rcu(&d->de_node);
>>>>>> +		spin_unlock(&c->dc_lock);
>>>>>> +		synchronize_rcu();
>>>>>> +		c->dc_free_callback(&d->de_rcu);
>>>>>> +		return 1;
>>>>>> +	}
>>>>>> +	spin_unlock(&c->dc_lock);
>>>>>> +	return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +nfs4_free_deviceid_cache(struct kref *kref)
>>>>>> +{
>>>>>> +	struct nfs4_deviceid_cache *cache =
>>>>>> +		container_of(kref, struct nfs4_deviceid_cache, dc_kref);
>>>>>> +	int more = 1;
>>>>>> +
>>>>>> +	while (more)
>>>>>> +		more = nfs4_remove_deviceid(cache);
>>>>>> +	cache->dc_clp->cl_devid_cache = NULL;
>>>>>
>>>>> Need to take the cl_lock around this assignment:
>>>>>
>>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>>> cache->dc_clp->cl_devid_cache = NULL;
>>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>>
>>>> That must happen atomically before kref_put.
>>>> It's illegal to have cl_devid_cache be referenced by cache->dc_clp
>>>> without a reference count backing it up.
>>>> Otherwise, if accessed concurrently to this piece of code,
>>>> someone might call kref_get while the refcount is zero.
>>>>
>>>> Normally, you'd first clear the referencing pointer to prevent any
>>>> new reference to it and only then kref_put it, e.g.:
>>>>
>>>> spin_lock(&cache->dc_clp->cl_lock);
>>>> tmp = cache->dc_clp->cl_devid_cache;
>>>> cache->dc_clp->cl_devid_cache = NULL;
>>>> spin_unlock(&cache->dc_clp->cl_lock);
>>>> kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache);
>>>
>>> Good point. Thanks for the review. I'll rethink and resend.
>>>
>>> -->Andy
>>>
>>>>
>>>> Benny
>>>>
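In other words, the teardown ordering would look roughly like the untested
sketch below. It assumes dc_kref becomes a bare atomic_t (say dc_ref) so the
decrement and the cl_lock acquisition can be combined with
atomic_dec_and_lock() -- the kref helpers give us no way to take cl_lock
atomically around the final put:

/* Untested sketch: the last reference detaches the cache from the
 * nfs_client while holding cl_lock, so a concurrent
 * nfs4_alloc_init_deviceid_cache() can never take a reference on a
 * cache whose count already reached zero.  The RCU-deferred frees
 * then run outside the lock, after the cache is unreachable. */
void
nfs4_put_deviceid_cache(struct nfs_client *clp)
{
	struct nfs4_deviceid_cache *c = clp->cl_devid_cache;

	if (!atomic_dec_and_lock(&c->dc_ref, &clp->cl_lock))
		return;			/* other mounts still hold references */
	clp->cl_devid_cache = NULL;	/* no new references possible */
	spin_unlock(&clp->cl_lock);
	while (nfs4_remove_deviceid(c))
		;			/* list_del_rcu + synchronize_rcu + free */
	kfree(c);
}

Note the synchronize_rcu() calls inside nfs4_remove_deviceid() are why the
list teardown must happen after dropping cl_lock: we may not block while
holding a spinlock.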
>>>>>> +	kfree(cache);
>>>>>> +}
>>>>>> +
>>>>>> +void
>>>>>> +nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *c)
>>>>>> +{
>>>>>> +	dprintk("%s [%d]\n", __func__, atomic_read(&c->dc_kref.refcount));
>>>>>> +	kref_put(&c->dc_kref, nfs4_free_deviceid_cache);
>>>>>> +}
>>>>>> +EXPORT_SYMBOL(nfs4_put_deviceid_cache);
>>>>>> diff --git a/include/linux/nfs4_pnfs.h b/include/linux/nfs4_pnfs.h
>>>>>> index 1d521f4..2a88a06 100644
>>>>>> --- a/include/linux/nfs4_pnfs.h
>>>>>> +++ b/include/linux/nfs4_pnfs.h
>>>>>> @@ -281,6 +281,33 @@ struct pnfs_devicelist {
>>>>>>  	struct pnfs_deviceid	dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM];
>>>>>>  };
>>>>>>
>>>>>> +/*
>>>>>> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
>>>>>> + */
>>>>>> +struct nfs4_deviceid_cache {
>>>>>> +	spinlock_t		dc_lock;
>>>>>> +	struct list_head	dc_deviceids;
>>>>>> +	struct kref		dc_kref;
>>>>>> +	void			(*dc_free_callback)(struct rcu_head *);
>>>>>> +	struct nfs_client	*dc_clp;
>>>>>> +};
>>>>>> +
>>>>>> +/* Device ID cache node */
>>>>>> +struct nfs4_deviceid {
>>>>>> +	struct list_head	de_node;
>>>>>> +	struct rcu_head		de_rcu;
>>>>>> +	struct pnfs_deviceid	de_id;
>>>>>> +};
>>>>>> +
>>>>>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>>>>>> +				void (*free_callback)(struct rcu_head *));
>>>>>> +extern void nfs4_put_deviceid_cache(struct nfs4_deviceid_cache *);
>>>>>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>>>>>> +extern struct nfs4_deviceid *nfs4_find_deviceid(struct nfs4_deviceid_cache *,
>>>>>> +				struct pnfs_deviceid *);
>>>>>> +extern void nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>>>>>> +				struct nfs4_deviceid *);
>>>>>> +
>>>>>>  /* pNFS client callback functions.
>>>>>>   * These operations allow the layout driver to access pNFS client
>>>>>>   * specific information or call pNFS client->server operations.
>>>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>>>>>> index 8522461..ef2e18e 100644
>>>>>> --- a/include/linux/nfs_fs_sb.h
>>>>>> +++ b/include/linux/nfs_fs_sb.h
>>>>>> @@ -87,6 +87,7 @@ struct nfs_client {
>>>>>>  	u32			cl_exchange_flags;
>>>>>>  	struct nfs4_session	*cl_session;	/* sharred session */
>>>>>>  	struct list_head	cl_lo_inodes;	/* Inodes having layouts */
>>>>>> +	struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>>>>>>  #endif /* CONFIG_NFS_V4_1 */
>>>>>>
>>>>>>  #ifdef CONFIG_NFS_FSCACHE
>>>>>> --
>>>>>> 1.6.6
>>>>>>
>>
>>
>> --
>> Benny Halevy
>> Software Architect
>> Panasas, Inc.
>> bhalevy@xxxxxxxxxxx
>> Tel/Fax: +972-3-647-8340
>> Mobile: +972-54-802-8340
>>
>> Panasas: The Leader in Parallel Storage
>> www.panasas.com

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html