Re: [PATCH] rbd: support v2 fancy striping

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Jan 30, 2018 at 12:27 AM, Maged Mokhtar <mmokhtar@xxxxxxxxxxx> wrote:
> Adds v2 fancy striping support to kernel rbd. Adds libceph striper.c based
> on user space osdc/Striper.cc. Clone images are limited to have
> same striping layout as parents in order to simplify callback of copyup
> requests and ensure they are atomic. If they have a different layout we fail
> during image probe.
>
> Signed-off-by: Maged Mokhtar <mmokhtar@xxxxxxxxxxx>
> ---
>  drivers/block/rbd.c          |  131 ++++++++++++++++++---------------
>  include/linux/ceph/striper.h |   34 ++++++++
>  net/ceph/Makefile            |    2
>  net/ceph/striper.c           |   81 ++++++++++++++++++++
>  4 files changed, 191 insertions(+), 57 deletions(-)
>
> diff -urNp a/drivers/block/rbd.c b/drivers/block/rbd.c
> --- a/drivers/block/rbd.c       2018-01-28 23:20:33.000000000 +0200
> +++ b/drivers/block/rbd.c       2018-01-29 22:23:18.755108873 +0200
> @@ -33,6 +33,7 @@
>  #include <linux/ceph/mon_client.h>
>  #include <linux/ceph/cls_lock_client.h>
>  #include <linux/ceph/decode.h>
> +#include <linux/ceph/striper.h>
>  #include <linux/parser.h>
>  #include <linux/bsearch.h>
>
> @@ -1231,27 +1232,6 @@ static void rbd_dev_mapping_clear(struct
>         rbd_dev->mapping.features = 0;
>  }
>
> -static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
> -{
> -       u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
> -
> -       return offset & (segment_size - 1);
> -}
> -
> -static u64 rbd_segment_length(struct rbd_device *rbd_dev,
> -                               u64 offset, u64 length)
> -{
> -       u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
> -
> -       offset &= segment_size - 1;
> -
> -       rbd_assert(length <= U64_MAX - offset);
> -       if (offset + length > segment_size)
> -               length = segment_size - offset;
> -
> -       return length;
> -}
> -
>  /*
>   * bio helpers
>   */
> @@ -2427,9 +2407,15 @@ static int rbd_img_request_fill(struct r
>
>         while (resid) {
>                 struct ceph_osd_request *osd_req;
> -               u64 object_no = img_offset >> rbd_dev->header.obj_order;
> -               u64 offset = rbd_segment_offset(rbd_dev, img_offset);
> -               u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
> +               u64 object_no;
> +               u64 offset;
> +               u64 length;
> +               struct stripe_extent ext;
> +
> +               get_stripe_extent(&rbd_dev->layout,img_offset,resid,&ext);
> +               object_no = ext.objectno;
> +               offset = ext.offset;
> +               length = ext.length;
>
>                 obj_request = rbd_obj_request_create(type);
>                 if (!obj_request)
> @@ -2624,7 +2610,6 @@ out_err:
>   * object request from the image request does not exist.
>   *
>   * A page array big enough to hold the returned data is allocated
> - * and supplied to rbd_img_request_fill() as the "data descriptor."
>   * When the read completes, this page array will be transferred to
>   * the original object request for the copyup operation.
>   *
> @@ -2641,25 +2626,47 @@ static int rbd_img_obj_parent_read_full(
>         u32 page_count;
>         int result;
>
> -       rbd_assert(rbd_dev->parent != NULL);
> +       struct rbd_obj_request *parent_obj_request = NULL;
> +       struct ceph_osd_request *osd_req;
>
> -       /*
> -        * Determine the byte range covered by the object in the
> -        * child image to which the original request was to be sent.
> -        */
> -       img_offset = obj_request->img_offset - obj_request->offset;
> -       length = rbd_obj_bytes(&rbd_dev->header);
> +       rbd_assert(rbd_dev->parent != NULL);
> +       if (rbd_dev->header.stripe_count !=
> +           rbd_dev->parent->header.stripe_count ||
> +           rbd_dev->header.stripe_unit !=
> +           rbd_dev->parent->header.stripe_unit) {
> +               rbd_warn(rbd_dev,"Cannot perform parent full object read due "
> +                        "to stripe mis-match\n");
> +               result = -EINVAL;
> +               goto out_err;
> +       }
>
>         /*
>          * There is no defined parent data beyond the parent
>          * overlap, so limit what we read at that boundary if
>          * necessary.
>          */
> -       if (img_offset + length > rbd_dev->parent_overlap) {
> -               rbd_assert(img_offset < rbd_dev->parent_overlap);
> -               length = rbd_dev->parent_overlap - img_offset;
> -       }
>
> +       img_offset = get_object_start_offset(&rbd_dev->layout,
> +                                            obj_request->object_no);
> +       rbd_assert(img_offset < rbd_dev->parent_overlap);
> +       if (rbd_dev->parent_overlap < get_object_end_offset(&rbd_dev->layout,
> +                                               obj_request->object_no)) {
> +               u64 diff = rbd_dev->parent_overlap - img_offset;
> +               u64 stripe_row_size = rbd_dev->header.stripe_unit *
> +                       rbd_dev->header.stripe_count;
> +               u64 rows = diff / stripe_row_size;
> +               u64 remain = diff - rows * stripe_row_size;
> +               length = rows * rbd_dev->header.stripe_unit;
> +               if (rbd_dev->header.stripe_unit < remain)
> +                       length = length + rbd_dev->header.stripe_unit;
> +               else
> +                       length = length + remain;
> +       }
> +       else {
> +               /* copy entire parent object */
> +               length = rbd_obj_bytes(&rbd_dev->header);
> +       }
> +
>         /*
>          * Allocate a page array big enough to receive the data read
>          * from the parent.
> @@ -2678,9 +2685,27 @@ static int rbd_img_obj_parent_read_full(
>         if (!parent_request)
>                 goto out_err;
>
> -       result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
> -       if (result)
> +       parent_obj_request = rbd_obj_request_create(OBJ_REQUEST_PAGES);
> +       if (!obj_request) {
> +               rbd_img_obj_request_del(parent_request, parent_obj_request);
>                 goto out_err;
> +       }
> +       rbd_img_obj_request_add(parent_request, parent_obj_request);
> +       parent_obj_request->object_no = obj_request->object_no;
> +       parent_obj_request->offset = 0;
> +       parent_obj_request->length = length;
> +       parent_obj_request->pages = pages;
> +       page_count = (u32)calc_pages_for(0, length);
> +       parent_obj_request->page_count = page_count;
> +       osd_req = rbd_osd_req_create(rbd_dev,OBJ_OP_READ,1,parent_obj_request);
> +       if (!osd_req) {
> +               rbd_img_obj_request_del(parent_request, parent_obj_request);
> +               goto out_err;
> +       }
> +       parent_obj_request->osd_req = osd_req;
> +       parent_obj_request->callback = rbd_img_obj_callback;
> +       parent_obj_request->img_offset = img_offset;
> +       rbd_img_obj_request_fill(parent_obj_request, osd_req, OBJ_OP_READ, 0);
>
>         parent_request->copyup_pages = pages;
>         parent_request->copyup_page_count = page_count;
> @@ -5090,28 +5115,10 @@ static int rbd_dev_v2_striping_info(stru
>         if (ret < size)
>                 return -ERANGE;
>
> -       /*
> -        * We don't actually support the "fancy striping" feature
> -        * (STRIPINGV2) yet, but if the striping sizes are the
> -        * defaults the behavior is the same as before.  So find
> -        * out, and only fail if the image has non-default values.
> -        */
> -       ret = -EINVAL;
>         obj_size = rbd_obj_bytes(&rbd_dev->header);
>         p = &striping_info_buf;
>         stripe_unit = ceph_decode_64(&p);
> -       if (stripe_unit != obj_size) {
> -               rbd_warn(rbd_dev, "unsupported stripe unit "
> -                               "(got %llu want %llu)",
> -                               stripe_unit, obj_size);
> -               return -EINVAL;
> -       }
>         stripe_count = ceph_decode_64(&p);
> -       if (stripe_count != 1) {
> -               rbd_warn(rbd_dev, "unsupported stripe count "
> -                               "(got %llu want 1)", stripe_count);
> -               return -EINVAL;
> -       }
>         rbd_dev->header.stripe_unit = stripe_unit;
>         rbd_dev->header.stripe_count = stripe_count;
>
> @@ -6090,6 +6097,18 @@ static int rbd_dev_image_probe(struct rb
>         ret = rbd_dev_probe_parent(rbd_dev, depth);
>         if (ret)
>                 goto err_out_probe;
> +
> +       if (rbd_dev->parent != NULL) {
> +               if (rbd_dev->header.stripe_count !=
> +                   rbd_dev->parent->header.stripe_count ||
> +                   rbd_dev->header.stripe_unit !=
> +                   rbd_dev->parent->header.stripe_unit) {
> +                       rbd_warn(rbd_dev,"Cannot map child image with "
> +                                "different striping than parent");
> +                       ret = -EINVAL;
> +                       goto err_out_probe;
> +               }
> +       }
>
>         dout("discovered format %u image, header name is %s\n",
>                 rbd_dev->image_format, rbd_dev->header_oid.name);
> diff -urNp a/include/linux/ceph/striper.h b/include/linux/ceph/striper.h
> --- a/include/linux/ceph/striper.h      1970-01-01 02:00:00.000000000 +0200
> +++ b/include/linux/ceph/striper.h      2018-01-29 22:23:18.755108873 +0200
> @@ -0,0 +1,34 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _FS_CEPH_STRIPER_H
> +#define _FS_CEPH_STRIPER_H
> +
> +#include <linux/ceph/ceph_fs.h>
> +
> +struct ceph_file_layout;
> +
> +struct stripe_extent {
> +       u64     objectno;
> +       u64     offset;
> +       u64     length;
> +};
> +
> +/* Logical to Object address mapping */
> +void get_stripe_extent(struct ceph_file_layout *layout,u64 offset,
> +                       u64 len,struct stripe_extent *ext);
> +
> +/* Object to Logical address mapping */
> +u64 get_file_offset(struct ceph_file_layout *layout,u64 objectno,u64 off);
> +
> +static inline u64 get_object_start_offset(struct ceph_file_layout *layout,
> +                                          u64 objectno)
> +{
> +       return get_file_offset(layout,objectno,0);
> +}
> +
> +static inline u64 get_object_end_offset(struct ceph_file_layout *layout,
> +                                        u64 objectno)
> +{
> +       return get_file_offset(layout,objectno,layout->object_size);
> +}
> +
> +#endif
> diff -urNp a/net/ceph/Makefile b/net/ceph/Makefile
> --- a/net/ceph/Makefile 2018-01-28 23:20:33.000000000 +0200
> +++ b/net/ceph/Makefile 2018-01-29 22:23:18.755108873 +0200
> @@ -13,5 +13,5 @@ libceph-y := ceph_common.o messenger.o m
>         crypto.o armor.o \
>         auth_x.o \
>         ceph_fs.o ceph_strings.o ceph_hash.o \
> -       pagevec.o snapshot.o string_table.o
> +       pagevec.o snapshot.o string_table.o striper.o
>
> diff -urNp a/net/ceph/striper.c b/net/ceph/striper.c
> --- a/net/ceph/striper.c        1970-01-01 02:00:00.000000000 +0200
> +++ b/net/ceph/striper.c        2018-01-29 22:23:18.755108873 +0200
> @@ -0,0 +1,81 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/ceph/messenger.h>
> +#include <linux/ceph/striper.h>
> +
> +/*
> + * Address mappings for striped objects
> + * Based on user space osdc/Striper.cc
> + */
> +
> +/* Logical to Object address, based on osdc/Striper.cc file_to_extents() */
> +void get_stripe_extent(struct ceph_file_layout *layout,u64 offset,
> +                       u64 len,struct stripe_extent *ext)
> +{
> +       u64 object_size;
> +       u64 su;
> +       u64 stripe_count;
> +       u64 stripes_per_object;
> +       u64 blockno;
> +       u64 stripeno;
> +       u64 stripepos;
> +       u64 objectsetno;
> +       u64 objectno;
> +       u64 block_start;
> +       u64 block_off;
> +       u64 max;
> +
> +       object_size = layout->object_size;
> +       su = layout->stripe_unit;
> +       stripe_count = layout->stripe_count;
> +       stripes_per_object = object_size / su;
> +
> +       blockno = offset / su; /* which block */
> +       stripeno = blockno / stripe_count; /* which horizontal stripe Y */
> +       stripepos = blockno % stripe_count; /* which object in object set X */
> +       objectsetno = stripeno / stripes_per_object; /* which object set */
> +       objectno = objectsetno * stripe_count + stripepos;  /* object id */
> +
> +       // map range into object
> +       block_start = (stripeno % stripes_per_object) * su;
> +       block_off = offset % su;
> +       max = su - block_off;
> +
> +       ext->objectno = objectno;
> +       ext->offset = block_start + block_off;
> +       if (len > max)
> +               ext->length = max;
> +       else
> +               ext->length = len;
> +}
> +EXPORT_SYMBOL(get_stripe_extent);
> +
> +/* Object to Logical address, based on osdc/Striper.cc extent_to_file() */
> +u64 get_file_offset(struct ceph_file_layout *layout,u64 objectno,u64 off)
> +{
> +       u64 object_size;
> +       u64 su;
> +       u64 stripe_count;
> +       u64 stripes_per_object;
> +       u64 stripepos;
> +       u64 objectsetno;
> +       u64 stripeno;
> +       u64 blockno;
> +       u64 off_in_block;
> +       u64 file_offset;
> +
> +       object_size = layout->object_size;
> +       su = layout->stripe_unit;
> +       stripe_count = layout->stripe_count;
> +       stripes_per_object = object_size / su;
> +       off_in_block = off % su;
> +
> +       stripepos = objectno % stripe_count;
> +       objectsetno = objectno / stripe_count;
> +       stripeno = off / su + objectsetno * stripes_per_object;
> +       blockno = stripeno * stripe_count + stripepos;
> +       file_offset = blockno * su + off_in_block;
> +
> +       return file_offset;
> +}
> +EXPORT_SYMBOL(get_file_offset);

Hi Maged,

I'm finishing up a full striping v2 implementation (i.e. adjacent extents
are merged together, no same-layout limitation, etc.) right now.  It will
be posted to ceph-devel in the next week or two.

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux