Re: [PATCH 11/12] RFC: pnfs: add LAYOUTGET and GETDEVICEINFO infrastructure

Fred Isaman <iisaman@xxxxxxxxxx> · Mon, 20 Sep 2010 10:56:15 -0400

On Sun, Sep 19, 2010 at 3:07 PM, Boaz Harrosh <bharrosh@xxxxxxxxxxx> wrote:
> On 09/18/2010 05:17 AM, Fred Isaman wrote:
>> From: The pNFS Team <linux-nfs@xxxxxxxxxxxxxxx>
>>
>> Add the ability to actually send LAYOUTGET and GETDEVICEINFO.  This also adds
>> in the machinery to handle layout state and the deviceid cache.  Note that
>> GETDEVICEINFO is not called directly by the generic layer.  Instead it
>> is called by the drivers while parsing the LAYOUTGET opaque data in response
>> to an unknown device id embedded therein.  Annoyingly, RFC 5661 only encodes
>> device ids within the driver-specific opaque data.
>>
>
> Fred, In regard to device cache at least, this is a complete new code then what
> I knew before. I wish you could send a SQUASHME diff first of what has changed.
>

Many of these changes were sent off-list to you in the initial format
of original-code plus squashmes.  In particular,
there was a patch labeled "SQUASHME pnfs_submit_1 fix pnfs_submit:
generic pnfs deviceid cache"

>> Signed-off-by: TBD - melding/reorganization of several patches
>
> <snip>
>
>> +/*
>> + * Device ID cache. Currently supports one layout type per struct nfs_client.
>> + * Add layout type to the lookup key to expand to support multiple types.
>> + */
>> +int
>> +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp,
>> +                      void (*free_callback)(struct nfs4_deviceid *))
>> +{
>> +     struct nfs4_deviceid_cache *c;
>> +
>> +     c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL);
>> +     if (!c)
>> +             return -ENOMEM;
>> +     spin_lock(&clp->cl_lock);
>> +     if (clp->cl_devid_cache != NULL) {
>> +             atomic_inc(&clp->cl_devid_cache->dc_ref);
>> +             dprintk("%s [kref [%d]]\n", __func__,
>> +                     atomic_read(&clp->cl_devid_cache->dc_ref));
>> +             kfree(c);
>> +     } else {
>> +             /* kzalloc initializes hlists */
>> +             spin_lock_init(&c->dc_lock);
>> +             atomic_set(&c->dc_ref, 1);
>> +             c->dc_free_callback = free_callback;
>> +             clp->cl_devid_cache = c;
>> +             dprintk("%s [new]\n", __func__);
>> +     }
>> +     spin_unlock(&clp->cl_lock);
>> +     return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(nfs4_alloc_init_deviceid_cache);
>> +
>> +void
>> +nfs4_init_deviceid_node(struct nfs4_deviceid *d)
>> +{
>> +     INIT_HLIST_NODE(&d->de_node);
>> +     atomic_set(&d->de_ref, 1);
>> +}
>> +EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
>> +
>> +/*
>> + * Called from pnfs_layoutdriver_type->free_lseg
>> + * last layout segment reference frees deviceid
>> + */
>> +void
>> +nfs4_put_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *devid)
>> +{
>> +     struct pnfs_deviceid *id = &devid->de_id;
>> +     struct nfs4_deviceid *d;
>> +     struct hlist_node *n;
>> +     long h = nfs4_deviceid_hash(id);
>> +
>> +     dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
>> +     if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
>> +             return;
>> +
>> +     hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
>> +             if (!memcmp(&d->de_id, id, sizeof(*id))) {
>> +                     hlist_del_rcu(&d->de_node);
>> +                     spin_unlock(&c->dc_lock);
>> +                     synchronize_rcu();
>> +                     c->dc_free_callback(devid);
>
> What?!? free the devid on every put? surly something went hey-wire
>

Did you miss the atomic_dec_and_lock above?

> This used to be kref(ed)
>

The original code required access to the internals of the kref, which
was frowned upon.

>> +                     return;
>> +             }
>> +     spin_unlock(&c->dc_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(nfs4_put_deviceid);
>> +
>> +/* Find and reference a deviceid */
>> +struct nfs4_deviceid *
>> +nfs4_find_get_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id)
>> +{
>> +     struct nfs4_deviceid *d;
>> +     struct hlist_node *n;
>> +     long hash = nfs4_deviceid_hash(id);
>> +
>> +     dprintk("--> %s hash %ld\n", __func__, hash);
>> +     rcu_read_lock();
>> +     hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
>> +             if (!memcmp(&d->de_id, id, sizeof(*id))) {
>> +                     if (!atomic_inc_not_zero(&d->de_ref)) {
>> +                             goto fail;
>> +                     } else {
>> +                             rcu_read_unlock();
>> +                             return d;
>> +                     }
>> +             }
>> +     }
>> +fail:
>> +     rcu_read_unlock();
>> +     return NULL;
>> +}
>> +EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
>> +
>> +/*
>> + * Add a deviceid to the cache.
>> + * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
>> + */
>> +struct nfs4_deviceid *
>> +nfs4_add_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new)
>> +{
>> +     struct nfs4_deviceid *d;
>> +     struct hlist_node *n;
>> +     long hash = nfs4_deviceid_hash(&new->de_id);
>> +
>> +     dprintk("--> %s hash %ld\n", __func__, hash);
>> +     spin_lock(&c->dc_lock);
>> +     hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
>> +             if (!memcmp(&d->de_id, &new->de_id, sizeof(new->de_id))) {
>> +                     spin_unlock(&c->dc_lock);
>> +                     dprintk("%s [discard]\n", __func__);
>> +                     c->dc_free_callback(new);
>> +                     return d;
>
> I am totally confused and should be smacked on the head strong. As said above
> one alloc_lseg adds the first, second comes in, but same deviceid, finds the first
> and returns it (no refcounting). Then a first free_lseg nfs4_put_deviceid is called,
> so the second lseg holds free memory?
>

You are right, there is a missing increment of the refcount here.

Part of the problem here is that the flow is obscurd because it is
being called form the driver layer.
I think it might be better if nfs4_find_get_deviceid did the
GETDEVICEINFO on a cache miss, instead of relying on the driver to do
it.

>> +             }
>> +     }
>> +     hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
>> +     spin_unlock(&c->dc_lock);
>> +     dprintk("%s [new]\n", __func__);
>> +     return new;
>> +}
>> +EXPORT_SYMBOL_GPL(nfs4_add_deviceid);
>> +
>> +void
>> +nfs4_put_deviceid_cache(struct nfs_client *clp)
>> +{
>> +     struct nfs4_deviceid_cache *local = clp->cl_devid_cache;
>> +
>> +     dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
>> +     if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
>
> Don't you want to make sure there are no hanging devices on this
> cache. Please note that an add or find does not take the cache
> reference only server_init does, right?
>

If there are any hanging devices at this point, it is a pretty serious bug.

>> +             clp->cl_devid_cache = NULL;
>> +             spin_unlock(&clp->cl_lock);
>> +             kfree(local);
>> +     }
>> +}
>> +EXPORT_SYMBOL_GPL(nfs4_put_deviceid_cache);
>
> Note to self. Test code alot.
>
> Fred:
>  I don't understand. A cache property I'm looking for and which I do not
>  See in this code, perhaps because I'm clueless.
>  (And to be honest I have not understood it before as well, since I never
>  actually ran with that code I posted.)
>  Is:
>  - An _add_ to queue should magically hold two references to the item, so
>  - A matching _put_ does not deallocate the item but keeps it referenced
>   once.
>  - A find/put pair does not remove the item from cache as well (+=1/-=1)
>  - [optional] At some future out-of-memory condition or when cache is to
>   big, according to some specified max cache. The most unused items are
>   dereferenced and if are not used (ref==0) are then removed.
>  - At driver shut-down all cached devices are dereferenced (and if not used
>   which they should) are dealocated.

I agree.  The original deviceid code had an issue where nothing was
ever deallocated until shutdown.  The code as given here ties the
deviceid lifetime to existing references.  This is not ideal, because
a brief loss of reference to a deviceid will cause an unnecessary
GETDEVICEINFO.  However, for the current submission, it has the
advantage that it is simple and correct.  Adding the machinery to do
as you suggest above is indeed a (fairly high) priority goal, but was
intentionally deferred.

>
> So in effect if [optional] code above is not yet implemented then a
> getdeviceinfo is never preformed twice for the same deviceid
> (Assuming no device recall). This is what I had in osd, and is cardinal
> for it's performance. Is that what we are aiming at?

Yes, that is the goal.  Right now, if a layout is still around that
references a deviceid, a second GETDEVICEINFO will not be sent.
However, if all layouts referencing a specific deviceid have been
returned, then yes, a second GETDEVICEINFO will be called.

>
>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
>> index 1c3eb02..b7de762 100644
>> --- a/fs/nfs/pnfs.h
>> +++ b/fs/nfs/pnfs.h
>> @@ -32,11 +32,13 @@
>>
>>  struct pnfs_layout_segment {
>>       struct list_head fi_list;
>> -     u32 iomode;
>> +     struct pnfs_layout_range range;
>>       struct kref kref;
>>       struct pnfs_layout_hdr *layout;
>>  };
>>
>> +#define NFS4_PNFS_DEVICEID4_SIZE 16
>> +
>>  #ifdef CONFIG_NFS_V4_1
>>
>>  #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
>> @@ -44,6 +46,7 @@ struct pnfs_layout_segment {
>>  enum {
>>       NFS_LAYOUT_RO_FAILED = 0,       /* get ro layout failed stop trying */
>>       NFS_LAYOUT_RW_FAILED,           /* get rw layout failed stop trying */
>> +     NFS_LAYOUT_STATEID_SET,         /* have a valid layout stateid */
>>  };
>>
>>  /* Per-layout driver specific registration structure */
>> @@ -54,26 +57,102 @@ struct pnfs_layoutdriver_type {
>>       struct module *owner;
>>       int (*initialize_mountpoint) (struct nfs_server *);
>>       int (*uninitialize_mountpoint) (struct nfs_server *);
>> +     struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
>> +     void (*free_lseg) (struct pnfs_layout_segment *lseg);
>>  };
>>
>>  struct pnfs_layout_hdr {
>>       unsigned long           refcount;
>>       struct list_head        layouts;   /* other client layouts */
>>       struct list_head        segs;      /* layout segments list */
>> +     seqlock_t               seqlock;   /* Protects the stateid */
>> +     nfs4_stateid            stateid;
>>       unsigned long           state;
>>       struct inode            *inode;
>>  };
>>
>> +struct pnfs_deviceid {
>
> Please for my own sanity, could we change those names.
> I'll post a patch once we settle on final code. I want this
> to be nfs4_deviceid (And perhaps in that other header)
>
>> +     char data[NFS4_PNFS_DEVICEID4_SIZE];
>> +};
>> +
>> +struct pnfs_device {
>> +     struct pnfs_deviceid dev_id;
>> +     unsigned int  layout_type;
>> +     unsigned int  mincount;
>> +     struct page **pages;
>> +     void          *area;
>> +     unsigned int  pgbase;
>> +     unsigned int  pglen;
>> +};
>> +
>> +/*
>> + * Device ID RCU cache. A device ID is unique per client ID and layout type.
>> + */
>> +#define NFS4_DEVICE_ID_HASH_BITS     5
>> +#define NFS4_DEVICE_ID_HASH_SIZE     (1 << NFS4_DEVICE_ID_HASH_BITS)
>> +#define NFS4_DEVICE_ID_HASH_MASK     (NFS4_DEVICE_ID_HASH_SIZE - 1)
>> +
>> +static inline u32
>> +nfs4_deviceid_hash(struct pnfs_deviceid *id)
>> +{
>> +     unsigned char *cptr = (unsigned char *)id->data;
>> +     unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE;
>> +     u32 x = 0;
>> +
>> +     while (nbytes--) {
>> +             x *= 37;
>> +             x += *cptr++;
>> +     }
>> +     return x & NFS4_DEVICE_ID_HASH_MASK;
>> +}
>> +
>> +/* Device ID cache node */
>> +struct nfs4_deviceid {
>
> And this one rename to pnfs_deviceid_node
>
>> +     struct hlist_node       de_node;
>> +     struct pnfs_deviceid    de_id;
>> +     atomic_t                de_ref;
>> +};
>> +
>> +struct nfs4_deviceid_cache {
>> +     spinlock_t              dc_lock;
>> +     atomic_t                dc_ref;
>> +     void                    (*dc_free_callback)(struct nfs4_deviceid *);
>> +     struct hlist_head       dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
>> +     struct hlist_head       dc_to_free;
>> +};
>> +
>
> All these cache APIs below, the naming convention in this header is
> to call them pnfs_xxx. They don't have any use in general nfs4.
> (and are implemented in pnfs.c)

I'll defer to Andy regarding naming.

>
>> +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *,
>> +                             void (*free_callback)(struct nfs4_deviceid *));
>> +extern void nfs4_put_deviceid_cache(struct nfs_client *);
>> +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *);
>
> Why is above exported? Can't we just call it from _add_
>

You are right.  It is a remnant of older code organization.

Fred

> Boaz
>
>> +extern struct nfs4_deviceid *nfs4_find_get_deviceid(
>> +                             struct nfs4_deviceid_cache *,
>> +                             struct pnfs_deviceid *);
>> +extern struct nfs4_deviceid *nfs4_add_deviceid(struct nfs4_deviceid_cache *,
>> +                             struct nfs4_deviceid *);
>> +extern void nfs4_put_deviceid(struct nfs4_deviceid_cache *c,
>> +                           struct nfs4_deviceid *devid);
>> +
>>  extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
>>  extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
>>
>> +/* nfs4proc.c */
>> +extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
>> +                                struct pnfs_device *dev);
>> +extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
>> +
>> +/* pnfs.c */
>>  struct pnfs_layout_segment *
>>  pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
>>                  enum pnfs_iomode access_type);
>>  void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
>>  void unset_pnfs_layoutdriver(struct nfs_server *);
>> +int pnfs_layout_process(struct nfs4_layoutget *lgp);
>>  void pnfs_destroy_layout(struct nfs_inode *);
>>  void pnfs_destroy_all_layouts(struct nfs_client *);
>> +void put_layout_hdr(struct inode *inode);
>> +void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
>> +                          struct nfs4_state *open_state);
>>
>>
>>  static inline int lo_fail_bit(u32 iomode)
>> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
>> index 2dde7c8..dcdd11c 100644
>> --- a/include/linux/nfs4.h
>> +++ b/include/linux/nfs4.h
>> @@ -545,6 +545,8 @@ enum {
>>       NFSPROC4_CLNT_SEQUENCE,
>>       NFSPROC4_CLNT_GET_LEASE_TIME,
>>       NFSPROC4_CLNT_RECLAIM_COMPLETE,
>> +     NFSPROC4_CLNT_LAYOUTGET,
>> +     NFSPROC4_CLNT_GETDEVICEINFO,
>>  };
>>
>>  /* nfs41 types */
>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
>> index e670a9c..7512886 100644
>> --- a/include/linux/nfs_fs_sb.h
>> +++ b/include/linux/nfs_fs_sb.h
>> @@ -83,6 +83,7 @@ struct nfs_client {
>>       u32                     cl_exchange_flags;
>>       struct nfs4_session     *cl_session;    /* sharred session */
>>       struct list_head        cl_layouts;
>> +     struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
>>  #endif /* CONFIG_NFS_V4_1 */
>>
>>  #ifdef CONFIG_NFS_FSCACHE
>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
>> index 7d68a5c..4953b58 100644
>> --- a/include/linux/nfs_xdr.h
>> +++ b/include/linux/nfs_xdr.h
>> @@ -186,6 +186,55 @@ struct nfs4_get_lease_time_res {
>>       struct nfs4_sequence_res        lr_seq_res;
>>  };
>>
>> +#define PNFS_LAYOUT_MAXSIZE 4096
>> +
>> +struct nfs4_layoutdriver_data {
>> +     __u32 len;
>> +     void *buf;
>> +};
>> +
>> +struct pnfs_layout_range {
>> +     u32 iomode;
>> +     u64 offset;
>> +     u64 length;
>> +};
>> +
>> +struct nfs4_layoutget_args {
>> +     __u32 type;
>> +     struct pnfs_layout_range range;
>> +     __u64 minlength;
>> +     __u32 maxcount;
>> +     struct inode *inode;
>> +     struct nfs_open_context *ctx;
>> +     struct nfs4_sequence_args seq_args;
>> +};
>> +
>> +struct nfs4_layoutget_res {
>> +     __u32 return_on_close;
>> +     struct pnfs_layout_range range;
>> +     __u32 type;
>> +     nfs4_stateid stateid;
>> +     struct nfs4_layoutdriver_data layout;
>> +     struct nfs4_sequence_res seq_res;
>> +};
>> +
>> +struct nfs4_layoutget {
>> +     struct nfs4_layoutget_args args;
>> +     struct nfs4_layoutget_res res;
>> +     struct pnfs_layout_segment **lsegpp;
>> +     int status;
>> +};
>> +
>> +struct nfs4_getdeviceinfo_args {
>> +     struct pnfs_device *pdev;
>> +     struct nfs4_sequence_args seq_args;
>> +};
>> +
>> +struct nfs4_getdeviceinfo_res {
>> +     struct pnfs_device *pdev;
>> +     struct nfs4_sequence_res seq_res;
>> +};
>> +
>>  /*
>>   * Arguments to the open call.
>>   */
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html