On Tue, Mar 22, 2016 at 10:17 AM, Ilya Dryomov <idryomov@xxxxxxxxx> wrote:
> On Tue, Mar 22, 2016 at 8:52 AM, Yan, Zheng <zyan@xxxxxxxxxx> wrote:
>>
>>> On Mar 22, 2016, at 14:11, Ilya Dryomov <idryomov@xxxxxxxxx> wrote:
>>>
>>> On Sat, Feb 6, 2016 at 8:00 AM, Yan, Zheng <zyan@xxxxxxxxxx> wrote:
>>>> Signed-off-by: Yan, Zheng <zyan@xxxxxxxxxx>
>>>> ---
>>>>  drivers/block/rbd.c          |  1 +
>>>>  fs/ceph/inode.c              |  3 +++
>>>>  include/linux/ceph/ceph_fs.h |  2 ++
>>>>  include/linux/ceph/osdmap.h  |  2 ++
>>>>  net/ceph/osd_client.c        | 37 ++++++++++++++++++++++++++-----------
>>>>  net/ceph/osdmap.c            | 33 +++++++++++++++++++++++++++------
>>>>  6 files changed, 61 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>>>> index b0bcb2d..0423493 100644
>>>> --- a/drivers/block/rbd.c
>>>> +++ b/drivers/block/rbd.c
>>>> @@ -4088,6 +4088,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
>>>>         rbd_dev->layout.stripe_count = 1;
>>>>         rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
>>>>         rbd_dev->layout.pool_id = spec->pool_id;
>>>> +       RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
>>>>
>>>>         /*
>>>>          * If this is a mapping rbd_dev (as opposed to a parent one),
>>>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>>>> index b0ad53d..3c220f1 100644
>>>> --- a/fs/ceph/inode.c
>>>> +++ b/fs/ceph/inode.c
>>>> @@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>>>>         ci->i_symlink = NULL;
>>>>
>>>>         memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
>>>> +       RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
>>>>
>>>>         ci->i_fragtree = RB_ROOT;
>>>>         mutex_init(&ci->i_fragtree_mutex);
>>>> @@ -518,6 +519,8 @@ void ceph_destroy_inode(struct inode *inode)
>>>>         if (ci->i_xattrs.prealloc_blob)
>>>>                 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
>>>>
>>>> +       ceph_put_string(ci->i_layout.pool_ns);
>>>> +
>>>>         call_rcu(&inode->i_rcu, ceph_i_callback);
>>>>  }
>>>>
>>>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>>>> index 7d8728e..3858923 100644
>>>> --- a/include/linux/ceph/ceph_fs.h
>>>> +++ b/include/linux/ceph/ceph_fs.h
>>>> @@ -53,6 +53,7 @@ struct ceph_file_layout_legacy {
>>>>         __le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
>>>>  } __attribute__ ((packed));
>>>>
>>>> +struct ceph_string;
>>>>  /*
>>>>   * ceph_file_layout - describe data layout for a file/inode
>>>>   */
>>>> @@ -62,6 +63,7 @@ struct ceph_file_layout {
>>>>         u32 stripe_count;  /* over this many objects */
>>>>         u32 object_size;   /* until objects are this big */
>>>>         s64 pool_id;       /* rados pool id */
>>>> +       struct ceph_string __rcu *pool_ns; /* rados pool namespace */
>>>>  };
>>>>
>>>>  extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
>>>> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
>>>> index e55c08b..3d59d6c 100644
>>>> --- a/include/linux/ceph/osdmap.h
>>>> +++ b/include/linux/ceph/osdmap.h
>>>> @@ -55,6 +55,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
>>>>
>>>>  struct ceph_object_locator {
>>>>         s64 pool;
>>>> +       struct ceph_string *pool_ns;
>>>>  };
>>>>
>>>>  /*
>>>> @@ -63,6 +64,7 @@ struct ceph_object_locator {
>>>>   * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
>>>>   */
>>>>  #define CEPH_MAX_OID_NAME_LEN 100
>>>> +#define CEPH_MAX_NAMESPACE_LEN 100
>>>>
>>>>  struct ceph_object_id {
>>>>         char name[CEPH_MAX_OID_NAME_LEN];
>>>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>>>> index 450955e..68e7f68 100644
>>>> --- a/net/ceph/osd_client.c
>>>> +++ b/net/ceph/osd_client.c
>>>> @@ -339,6 +339,8 @@ static void ceph_osdc_release_request(struct kref *kref)
>>>>         kfree(req->r_ops);
>>>>
>>>>         ceph_put_snap_context(req->r_snapc);
>>>> +       ceph_put_string(req->r_base_oloc.pool_ns);
>>>> +
>>>>         if (req->r_mempool)
>>>>                 mempool_free(req, req->r_osdc->req_mempool);
>>>>         else
>>>> @@ -388,6 +390,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>>>>         req->r_num_ops = 0;
>>>>         req->r_max_ops = num_ops;
>>>>
>>>> +       req->r_base_oloc.pool = -1;
>>>> +       req->r_target_oloc.pool = -1;
>>>> +
>>>>         if (num_ops <= CEPH_OSD_INITIAL_OP) {
>>>>                 req->r_ops = req->r_inline_ops;
>>>>         } else {
>>>> @@ -409,9 +414,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>>>>         INIT_LIST_HEAD(&req->r_req_lru_item);
>>>>         INIT_LIST_HEAD(&req->r_osd_item);
>>>>
>>>> -       req->r_base_oloc.pool = -1;
>>>> -       req->r_target_oloc.pool = -1;
>>>> -
>>>>         /* create reply message */
>>>>         msg_size = OSD_OPREPLY_FRONT_LEN;
>>>>         if (num_ops > CEPH_OSD_INITIAL_OP) {
>>>> @@ -433,7 +435,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>>>>
>>>>         /* create request message; allow space for oid */
>>>>         msg_size = 4 + 4 + 8 + 8 + 4 + 8;
>>>> -       msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
>>>> +       msg_size += 2 + 4 + 8 + 4 + 4 + 4 + CEPH_MAX_NAMESPACE_LEN; /* oloc */
>>>>         msg_size += 1 + 8 + 4 + 4; /* pg_t */
>>>>         msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
>>>>         msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
>>>> @@ -864,6 +866,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
>>>>         }
>>>>
>>>>         req->r_base_oloc.pool = layout->pool_id;
>>>> +       req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
>>>>
>>>>         snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
>>>>                  "%llx.%08llx", vino.ino, objnum);
>>>> @@ -1719,10 +1722,10 @@ static int ceph_oloc_decode(void **p, void *end,
>>>>         }
>>>>
>>>>         if (struct_v >= 5) {
>>>> -               len = ceph_decode_32(p);
>>>> -               if (len > 0) {
>>>> -                       pr_warn("ceph_object_locator::nspace is set\n");
>>>> -                       goto e_inval;
>>>> +               u32 ns_len = ceph_decode_32(p);
>>>> +               if (ns_len > 0) {
>>>> +                       ceph_decode_need(p, end, ns_len, e_inval);
>>>> +                       *p += ns_len;
>>>>                 }
>>>>         }
>>>>
>>>> @@ -1907,7 +1910,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
>>>>
>>>>                 __unregister_request(osdc, req);
>>>>
>>>> -               req->r_target_oloc = redir.oloc; /* struct */
>>>> +               req->r_target_oloc.pool = redir.oloc.pool;
>>>>
>>>>                 /*
>>>>                  * Start redirect requests with nofail=true. If
>>>> @@ -2459,6 +2462,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
>>>>                             struct timespec *mtime)
>>>>  {
>>>>         struct ceph_msg *msg = req->r_request;
>>>> +       struct ceph_string *pool_ns;
>>>>         void *p;
>>>>         size_t msg_size;
>>>>         int flags = req->r_flags;
>>>> @@ -2483,14 +2487,25 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
>>>>         req->r_request_reassert_version = p;
>>>>         p += sizeof(struct ceph_eversion); /* will get filled in */
>>>>
>>>> +       if (req->r_base_oloc.pool_ns)
>>>> +               pool_ns = req->r_base_oloc.pool_ns;
>>>> +       else
>>>> +               pool_ns = NULL;
>>>> +
>>>>         /* oloc */
>>>> +       ceph_encode_8(&p, 5);
>>>>         ceph_encode_8(&p, 4);
>>>> -       ceph_encode_8(&p, 4);
>>>> -       ceph_encode_32(&p, 8 + 4 + 4);
>>>> +       ceph_encode_32(&p, 8 + 4 + 4 + 4 + (pool_ns ? pool_ns->len : 0));
>>>>         req->r_request_pool = p;
>>>>         p += 8;
>>>>         ceph_encode_32(&p, -1); /* preferred */
>>>>         ceph_encode_32(&p, 0); /* key len */
>>>> +       if (pool_ns) {
>>>> +               ceph_encode_32(&p, pool_ns->len);
>>>> +               ceph_encode_copy(&p, pool_ns->str, pool_ns->len);
>>>> +       } else {
>>>> +               ceph_encode_32(&p, 0);
>>>> +       }
>>>>
>>>>         ceph_encode_8(&p, 1);
>>>>         req->r_request_pgid = p;
>>>> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
>>>> index f033ca5..f117848 100644
>>>> --- a/net/ceph/osdmap.c
>>>> +++ b/net/ceph/osdmap.c
>>>> @@ -1470,12 +1470,33 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
>>>>         if (!pi)
>>>>                 return -EIO;
>>>>
>>>> -       pg_out->pool = oloc->pool;
>>>> -       pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
>>>> -                                    oid->name_len);
>>>> -
>>>> -       dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
>>>> -            pg_out->pool, pg_out->seed);
>>>> +       if (!oloc->pool_ns) {
>>>> +               pg_out->pool = oloc->pool;
>>>> +               pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
>>>> +                                            oid->name_len);
>>>> +               dout("%s '%.*s' pgid %llu.%x\n", __func__,
>>>> +                    oid->name_len, oid->name, pg_out->pool, pg_out->seed);
>>>> +       } else {
>>>> +               char stack_buf[256];
>>>> +               char *buf = stack_buf;
>>>> +               int nsl = oloc->pool_ns->len;
>>>> +               size_t total = nsl + 1 + oid->name_len;
>>>> +               if (total > sizeof(stack_buf)) {
>>>> +                       buf = kmalloc(total, GFP_NOFS);
>>>> +                       if (!buf)
>>>> +                               return -ENOMEM;
>>>> +               }
>>>
>>> This ties into my question about how namespaces are going to be used
>>> and how long the namespace name is allowed to be.
>>>
>>> CEPH_MAX_NAMESPACE_LEN is defined as 100 above, but that definition is
>>> removed in patch 5.  That needs fixing, and if the 100-char limit is
>>> real, then buf can just be
>>>
>>> CEPH_MAX_OID_NAME_LEN + CEPH_MAX_NAMESPACE_LEN + 1
>>>
>>> with no need for a kmalloc().
>>
>> CEPH_MAX_NAMESPACE_LEN is an intermediate definition used for splitting
>> the patches (so that each individual patch compiles).  As far as I know,
>> there is no limit on namespace length.
>>
>> (adding ceph-devel back)
>
> To me, that's an indication of a poorly structured series.
>
> I understand that it's just a std::string in userspace and so there
> isn't a limit as such.  Same goes for OIDs, but we do limit those in
> the kernel client.  Can we do the same for namespace names?

Thanks,

                Ilya
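
For illustration, a minimal sketch of the fixed-size-buffer variant Ilya
suggests above, assuming namespace names really are capped at
CEPH_MAX_NAMESPACE_LEN the same way OIDs are capped at
CEPH_MAX_OID_NAME_LEN.  The helper name and the '\037' separator byte are
assumptions made for the example, not taken from the posted patch:

	/*
	 * Hash "<pool_ns><sep><oid name>" into a placement seed without
	 * kmalloc().  Both lengths are bounded, so the concatenation
	 * always fits on the stack.  Assumes oloc->pool_ns is non-NULL.
	 */
	static u32 oloc_oid_seed(struct ceph_pg_pool_info *pi,
				 struct ceph_object_locator *oloc,
				 struct ceph_object_id *oid)
	{
		char buf[CEPH_MAX_NAMESPACE_LEN + 1 + CEPH_MAX_OID_NAME_LEN];
		int nsl = oloc->pool_ns->len;
		size_t total = nsl + 1 + oid->name_len;

		memcpy(buf, oloc->pool_ns->str, nsl);
		buf[nsl] = '\037';	/* assumed separator byte */
		memcpy(buf + nsl + 1, oid->name, oid->name_len);

		return ceph_str_hash(pi->object_hash, buf, total);
	}

With both limits enforced where the layout is set, the object-to-PG
mapping would never need to allocate, which also removes the -ENOMEM
failure path that the quoted kmalloc() introduces on the I/O path.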