Re: [PATCH 1/1] ceph: add getvxattr op

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Jan 11, 2022 at 7:18 PM Jeff Layton <jlayton@xxxxxxxxxx> wrote:
>
> On Tue, 2022-01-11 at 18:53 +0530, Milind Changire wrote:
> > responses below ...
> >
> > On Tue, Jan 11, 2022 at 6:19 PM Jeff Layton <jlayton@xxxxxxxxxx>
> > wrote:
> > > On Tue, 2022-01-11 at 17:54 +0530, Milind Changire wrote:
> > > > Problem:
> > > > Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not
> > > > be
> > > > propagated to the client as frequently to keep them updated. This
> > > > creates vxattr availability problems.
> > > >
> > > > Solution:
> > > > Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
> > > > ceph.file.layout* vxattrs.
> > > > If the entire layout for a dir or a file is being set, then it is
> > > > expected that the layout be set in standard JSON format.
> > > > Individual
> > > > field value retrieval is not wrapped in JSON. The JSON format also
> > > > applies while setting the vxattr if the entire layout is being set
> > > > in
> > > > one go.
> > > > As a temporary measure, setting a vxattr can also be done in the
> > > > old
> > > > format. The old format will be deprecated in the future.
> > > >
> > > > URL: https://tracker.ceph.com/issues/51062
> > > > Signed-off-by: Milind Changire <mchangir@xxxxxxxxxx>
> > > > ---
> > > >   fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++
> > > >   fs/ceph/mds_client.c         | 27 ++++++++++++++-
> > > >   fs/ceph/mds_client.h         | 12 ++++++-
> > > >   fs/ceph/strings.c            |  1 +
> > > >   fs/ceph/super.h              |  1 +
> > > >   fs/ceph/xattr.c              | 65
> > > > ++++++++++++++++++++++++++++++++++++
> > > >   include/linux/ceph/ceph_fs.h |  1 +
> > > >   7 files changed, 156 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > index e3322fcb2e8d..f29d59b63c52 100644
> > > > --- a/fs/ceph/inode.c
> > > > +++ b/fs/ceph/inode.c
> > > > @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode,
> > > > struct page *locked_page,
> > > >        return err;
> > > >   }
> > > >
> > > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void
> > > > *value,
> > > > +                   size_t size)
> > > > +{
> > > > +     struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
> > > > +     struct ceph_mds_client *mdsc = fsc->mdsc;
> > > > +     struct ceph_mds_request *req;
> > > > +     int mode = USE_AUTH_MDS;
> > > > +     int err;
> > > > +     char *xattr_value;
> > > > +     size_t xattr_value_len;
> > > > +
> > > > +     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR,
> > > > mode);
> > > > +     if (IS_ERR(req)) {
> > > > +             err = -ENOMEM;
> > > > +             goto out;
> > > > +     }
> > > > +
> > > > +     req->r_path2 = kstrdup(name, GFP_NOFS);
> > > > +     if (!req->r_path2) {
> > > > +             err = -ENOMEM;
> > > > +             goto put;
> > > > +     }
> > > > +
> > > > +     ihold(inode);
> > > > +     req->r_inode = inode;
> > > > +     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > +     if (err < 0)
> > > > +             goto put;
> > > > +
> > > > +     xattr_value = req->r_reply_info.xattr_info.xattr_value;
> > > > +     xattr_value_len = req-
> > > > >r_reply_info.xattr_info.xattr_value_len;
> > > > +
> > > > +     dout("do_getvxattr xattr_value_len:%lu, size:%lu\n",
> > > > xattr_value_len, size);
> > > > +
> > > > +     err = xattr_value_len;
> > > > +     if (size == 0)
> > > > +             goto put;
> > > > +
> > > > +     if (xattr_value_len > size) {
> > > > +             err = -ERANGE;
> > > > +             goto put;
> > > > +     }
> > > > +
> > > > +     memcpy(value, xattr_value, xattr_value_len);
> > > > +put:
> > > > +     ceph_mdsc_put_request(req);
> > > > +out:
> > > > +     dout("do_getvxattr result=%d\n", err);
> > > > +     return err;
> > > > +}
> > > > +
> > > >
> > > >   /*
> > > >    * Check inode permissions.  We verify we have a valid value for
> > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > index c30eefc0ac19..a5eafc71d976 100644
> > > > --- a/fs/ceph/mds_client.c
> > > > +++ b/fs/ceph/mds_client.c
> > > > @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p,
> > > > void *end,
> > > >        return -EIO;
> > > >   }
> > > >
> > > > +static int parse_reply_info_getvxattr(void **p, void *end,
> > > > +                                   struct
> > > > ceph_mds_reply_info_parsed *info,
> > > > +                                   u64 features)
> > > > +{
> > > > +     u8 struct_v, struct_compat;
> > > > +     u32 struct_len;
> > > > +     u32 value_len;
> > > > +
> > > > +     ceph_decode_8_safe(p, end, struct_v, bad);
> > > > +     ceph_decode_8_safe(p, end, struct_compat, bad);
> > > > +     ceph_decode_32_safe(p, end, struct_len, bad);
> > > > +     ceph_decode_32_safe(p, end, value_len, bad);
> > > > +
> > > > +     if (value_len == end - *p) {
> > > > +       info->xattr_info.xattr_value = *p;
> > > > +       info->xattr_info.xattr_value_len = end - *p;
> > > > +       *p = end;
> > > > +       return info->xattr_info.xattr_value_len;
> > > > +     }
> > >
> > > The kernel uses tab indent almost exclusively. Can you fix the
> > > indention
> > > above (and anywhere else I missed)?
> > >
> >
> >
> > I'll fix this. newbie issues :P
> >
> > >
> > > > +bad:
> > > > +     return -EIO;
> > > > +}
> > > > +
> > > >   /*
> > > >    * parse extra results
> > > >    */
> > > > @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p,
> > > > void *end,
> > > >                return parse_reply_info_readdir(p, end, info,
> > > > features);
> > > >        else if (op == CEPH_MDS_OP_CREATE)
> > > >                return parse_reply_info_create(p, end, info,
> > > > features, s);
> > > > +     else if (op == CEPH_MDS_OP_GETVXATTR)
> > > > +             return parse_reply_info_getvxattr(p, end, info,
> > > > features);
> > > >        else
> > > >                return -EIO;
> > > >   }
> > > > @@ -615,7 +640,7 @@ static int parse_reply_info(struct
> > > > ceph_mds_session *s, struct ceph_msg *msg,
> > > >
> > > >        if (p != end)
> > > >                goto bad;
> > > > -     return 0;
> > > > +     return err;
> > > >
> > > >   bad:
> > > >        err = -EIO;
> > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > index 97c7f7bfa55f..f2a8e5af3c2e 100644
> > > > --- a/fs/ceph/mds_client.h
> > > > +++ b/fs/ceph/mds_client.h
> > > > @@ -29,8 +29,10 @@ enum ceph_feature_type {
> > > >        CEPHFS_FEATURE_MULTI_RECONNECT,
> > > >        CEPHFS_FEATURE_DELEG_INO,
> > > >        CEPHFS_FEATURE_METRIC_COLLECT,
> > > > +     CEPHFS_FEATURE_ALTERNATE_NAME,
> > > > +     CEPHFS_FEATURE_GETVXATTR,
> > > >
> > > > -     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> > > > +     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
> > > >   };
> > > >
> > > >   /*
> > > > @@ -45,6 +47,8 @@ enum ceph_feature_type {
> > > >        CEPHFS_FEATURE_MULTI_RECONNECT,         \
> > > >        CEPHFS_FEATURE_DELEG_INO,               \
> > > >        CEPHFS_FEATURE_METRIC_COLLECT,          \
> > > > +     CEPHFS_FEATURE_ALTERNATE_NAME,          \
> > > > +     CEPHFS_FEATURE_GETVXATTR,               \
> > > >                                                \
> > > >        CEPHFS_FEATURE_MAX,                     \
> > > >   }
> > > > @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
> > > >        loff_t                        offset;
> > > >   };
> > > >
> > > > +struct ceph_mds_reply_xattr {
> > > > +     char *xattr_value;
> > > > +     size_t xattr_value_len;
> > > > +};
> > > > +
> > > >   /*
> > > >    * parsed info about an mds reply, including information about
> > > >    * either: 1) the target inode and/or its parent directory and
> > > > dentry,
> > > > @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
> > > >        char                          *dname;
> > > >        u32                           dname_len;
> > > >        struct ceph_mds_reply_lease   *dlease;
> > > > +     struct ceph_mds_reply_xattr   xattr_info;
> > > >
> > > >        /* extra */
> > > >        union {
> > > > diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
> > > > index 573bb9556fb5..e36e8948e728 100644
> > > > --- a/fs/ceph/strings.c
> > > > +++ b/fs/ceph/strings.c
> > > > @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
> > > >        case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
> > > >        case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
> > > >        case CEPH_MDS_OP_GETATTR:  return "getattr";
> > > > +     case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
> > > >        case CEPH_MDS_OP_SETXATTR: return "setxattr";
> > > >        case CEPH_MDS_OP_SETATTR: return "setattr";
> > > >        case CEPH_MDS_OP_RMXATTR: return "rmxattr";
> > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > index ac331aa07cfa..a627fa69668e 100644
> > > > --- a/fs/ceph/super.h
> > > > +++ b/fs/ceph/super.h
> > > > @@ -1043,6 +1043,7 @@ static inline bool
> > > > ceph_inode_is_shutdown(struct inode *inode)
> > > >
> > > >   /* xattr.c */
> > > >   int __ceph_setxattr(struct inode *, const char *, const void *,
> > > > size_t, int);
> > > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void
> > > > *value, size_t size);
> > > >   ssize_t __ceph_getxattr(struct inode *, const char *, void *,
> > > > size_t);
> > > >   extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
> > > >   extern struct ceph_buffer *__ceph_build_xattrs_blob(struct
> > > > ceph_inode_info *ci);
> > > > diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
> > > > index fcf7dfdecf96..fdde8ffa71bb 100644
> > > > --- a/fs/ceph/xattr.c
> > > > +++ b/fs/ceph/xattr.c
> > > > @@ -918,6 +918,64 @@ static inline int __get_request_mask(struct
> > > > inode *in) {
> > > >        return mask;
> > > >   }
> > > >
> > > > +static inline bool ceph_is_passthrough_vxattr(const char *name)
> > > > +{
> > > > +     const char *vxattr_list[] = {
> > > > +             "ceph.dir.pin",
> > > > +             "ceph.dir.pin.random",
> > > > +             "ceph.dir.pin.distributed",
> > > > +             "ceph.dir.layout",
> > > > +             "ceph.dir.layout.object_size",
> > > > +             "ceph.dir.layout.stripe_count",
> > > > +             "ceph.dir.layout.stripe_unit",
> > > > +             "ceph.dir.layout.pool",
> > > > +             "ceph.dir.layout.pool_name",
> > > > +             "ceph.dir.layout.pool_id",
> > > > +             "ceph.dir.layout.pool_namespace",
> > > > +             "ceph.file.layout",
> > > > +             "ceph.file.layout.object_size",
> > > > +             "ceph.file.layout.stripe_count",
> > > > +             "ceph.file.layout.stripe_unit",
> > > > +             "ceph.file.layout.pool",
> > > > +             "ceph.file.layout.pool_name",
> > > > +             "ceph.file.layout.pool_id",
> > > > +             "ceph.file.layout.pool_namespace",
> > > > +     };
> > > > +     int i = 0;
> > > > +     int n = sizeof(vxattr_list)/sizeof(vxattr_list[0]);
> > > > +
> > > > +     while (i < n) {
> > > > +             if (!strcmp(name, vxattr_list[i]))
> > > > +                     return true;
> > > > +             i++;
> > > > +     }
> > > > +     return false;
> > > > +}
> > >
> > > This part, I'm not so crazy about. This list will be hard to keep in
> > > sync with the userland code and you're almost guaranteed to have a
> > > mismatch between what the kernel and userland supports since they're
> > > on
> > > different release schedules.
> > >
> > > I think what we probably ought to do is run the ceph_match_vxattr
> > > call
> > > first, and if it doesn't match the name and the name starts with
> > > "ceph.", then we'll send a GETVXATTR call to the MDS.
> > >
> > > Sound ok?
> > >
> >
> >
> > I was trying to preempt a getvxattr call if the full xattr name was
> > invalid.
>
> Huh, why? That seems like the point of the whole bug. We need a way to
> hand vxattrs that the client doesn't recognize off to the MDS so it can
> satisfy the request.

File and dir layout vxattrs are server-side xattrs. So, it seemed prudent to
delegate them to the server. The idea was also to help identify layouts and
their inheritance for a dir hierarchy or a file at the server side and possibly
avoiding client round trips to the server for learning about the same (and
maintain consistency?)

Here's some discussion about the same:
https://tracker.ceph.com/issues/51062#note-3

>
> > I could just test for the prefixes: ceph.dir.pin, ceph.dir.layout and
> > ceph.file.layout and send them over. Eventually, we'll get an ENODATA
> > if xattr name validation fails on the MDS side.
> >
> > The problem with doing a ceph_match_vxattr() first is that it will
> > succeed
> > for any of these vxattrs, but we'll land in the same place as the one
> > you've
> > described in the tracker.
> > The very idea of implementing getvxattr is to bypass the existing
> > "cached"
> > service mechanism for special vxattrs.
> >
>
> I don't get it. What's the benefit of calling out to the MDS to ask for
> ceph.file.layout.pool instead of just satisfying it from the info we
> have?
>
> I'd prefer we just add a fallback for "ceph.*" vxattrs that for which we
> don't have a local handler.

See my explanation above.

>
>
> > >
> > > > +
> > > > +/* check if the entire cluster supports the given feature */
> > > > +static inline bool ceph_cluster_has_feature(struct inode *inode,
> > > > int feature_bit)
> > > > +{
> > > > +     int64_t i;
> > > > +     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > > > +     struct ceph_mds_session **sessions = fsc->mdsc->sessions;
> > > > +     int64_t num_sessions = atomic_read(&fsc->mdsc-
> > > > >num_sessions);
> > > > +
> > > > +     if (fsc->mdsc->stopping)
> > > > +             return false;
> > > > +
> > > > +     if (!sessions)
> > > > +             return false;
> > > > +
> > > > +     for (i = 0; i < num_sessions; i++) {
> > > > +             struct ceph_mds_session *session = sessions[i];
> > > > +             if (!session)
> > > > +                     return false;
> > > > +             if (!test_bit(feature_bit, &session->s_features))
> > > > +                     return false;
> > > > +     }
> > > > +     return true;
> > > > +}
> > > > +
> > > >   ssize_t __ceph_getxattr(struct inode *inode, const char *name,
> > > > void *value,
> > > >                      size_t size)
> > > >   {
> > > > @@ -927,6 +985,13 @@ ssize_t __ceph_getxattr(struct inode *inode,
> > > > const char *name, void *value,
> > > >        int req_mask;
> > > >        ssize_t err;
> > > >
> > > > +     if (ceph_is_passthrough_vxattr(name) &&
> > > > +         ceph_cluster_has_feature(inode,
> > > > CEPHFS_FEATURE_GETVXATTR)) {
> > > > +             err = ceph_do_getvxattr(inode, name, value, size);
> > > > +             return err;
> > > > +     }
> > > > +     dout("vxattr is not passthrough or kclient doesn't support
> > > > GETVXATTR\n");
> > > > +
> > > >        /* let's see if a virtual xattr was requested */
> > > >        vxattr = ceph_match_vxattr(inode, name);
> > > >        if (vxattr) {
> > > > diff --git a/include/linux/ceph/ceph_fs.h
> > > > b/include/linux/ceph/ceph_fs.h
> > > > index 7ad6c3d0db7d..66db21ac5f0c 100644
> > > > --- a/include/linux/ceph/ceph_fs.h
> > > > +++ b/include/linux/ceph/ceph_fs.h
> > > > @@ -328,6 +328,7 @@ enum {
> > > >        CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
> > > >        CEPH_MDS_OP_LOOKUPINO  = 0x00104,
> > > >        CEPH_MDS_OP_LOOKUPNAME = 0x00105,
> > > > +     CEPH_MDS_OP_GETVXATTR  = 0x00106,
> > > >
> > > >        CEPH_MDS_OP_SETXATTR   = 0x01105,
> > > >        CEPH_MDS_OP_RMXATTR    = 0x01106,
> > >
>
> --
> Jeff Layton <jlayton@xxxxxxxxxx>



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Ceph Dev]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux