Re: [PATCH] rbd: support v2 fancy striping

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Ilya,

Glad you have already added this. I will wait for your changes and apply them when done.

Cheers

Maged

On 2018-01-30 15:28, Ilya Dryomov wrote:

On Tue, Jan 30, 2018 at 12:27 AM, Maged Mokhtar <mmokhtar@xxxxxxxxxxx> wrote:

Adds v2 fancy striping support to kernel rbd. Adds libceph striper.c based
on user space osdc/Striper.cc. Clone images are limited to having the
same striping layout as their parents in order to simplify the callback of
copyup requests and ensure they are atomic. If they have a different layout,
we fail during image probe.

Signed-off-by: Maged Mokhtar <mmokhtar@xxxxxxxxxxx>
---
drivers/block/rbd.c          |  131 ++++++++++++++++++---------------
include/linux/ceph/striper.h |   34 ++++++++
net/ceph/Makefile            |    2
net/ceph/striper.c           |   81 ++++++++++++++++++++
4 files changed, 191 insertions(+), 57 deletions(-)

diff -urNp a/drivers/block/rbd.c b/drivers/block/rbd.c
--- a/drivers/block/rbd.c       2018-01-28 23:20:33.000000000 +0200
+++ b/drivers/block/rbd.c       2018-01-29 22:23:18.755108873 +0200
@@ -33,6 +33,7 @@
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/decode.h>
+#include <linux/ceph/striper.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

@@ -1231,27 +1232,6 @@ static void rbd_dev_mapping_clear(struct
rbd_dev->mapping.features = 0;
}

-static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
-{
-       u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
-
-       return offset & (segment_size - 1);
-}
-
-static u64 rbd_segment_length(struct rbd_device *rbd_dev,
-                               u64 offset, u64 length)
-{
-       u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
-
-       offset &= segment_size - 1;
-
-       rbd_assert(length <= U64_MAX - offset);
-       if (offset + length > segment_size)
-               length = segment_size - offset;
-
-       return length;
-}
-
/*
* bio helpers
*/
@@ -2427,9 +2407,15 @@ static int rbd_img_request_fill(struct r

while (resid) {
struct ceph_osd_request *osd_req;
- u64 object_no = img_offset >> rbd_dev->header.obj_order;
-               u64 offset = rbd_segment_offset(rbd_dev, img_offset);
- u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
+               u64 object_no;
+               u64 offset;
+               u64 length;
+               struct stripe_extent ext;
+
+ get_stripe_extent(&rbd_dev->layout,img_offset,resid,&ext);
+               object_no = ext.objectno;
+               offset = ext.offset;
+               length = ext.length;

obj_request = rbd_obj_request_create(type);
if (!obj_request)
@@ -2624,7 +2610,6 @@ out_err:
* object request from the image request does not exist.
*
* A page array big enough to hold the returned data is allocated
- * and supplied to rbd_img_request_fill() as the "data descriptor."
* When the read completes, this page array will be transferred to
* the original object request for the copyup operation.
*
@@ -2641,25 +2626,47 @@ static int rbd_img_obj_parent_read_full(
u32 page_count;
int result;

-       rbd_assert(rbd_dev->parent != NULL);
+       struct rbd_obj_request *parent_obj_request = NULL;
+       struct ceph_osd_request *osd_req;

-       /*
-        * Determine the byte range covered by the object in the
-        * child image to which the original request was to be sent.
-        */
-       img_offset = obj_request->img_offset - obj_request->offset;
-       length = rbd_obj_bytes(&rbd_dev->header);
+       rbd_assert(rbd_dev->parent != NULL);
+       if (rbd_dev->header.stripe_count !=
+           rbd_dev->parent->header.stripe_count ||
+           rbd_dev->header.stripe_unit !=
+           rbd_dev->parent->header.stripe_unit) {
+ rbd_warn(rbd_dev,"Cannot perform parent full object read due "
+                        "to stripe mis-match\n");
+               result = -EINVAL;
+               goto out_err;
+       }

/*
* There is no defined parent data beyond the parent
* overlap, so limit what we read at that boundary if
* necessary.
*/
-       if (img_offset + length > rbd_dev->parent_overlap) {
-               rbd_assert(img_offset < rbd_dev->parent_overlap);
-               length = rbd_dev->parent_overlap - img_offset;
-       }

+       img_offset = get_object_start_offset(&rbd_dev->layout,
+                                            obj_request->object_no);
+       rbd_assert(img_offset < rbd_dev->parent_overlap);
+       if (rbd_dev->parent_overlap < get_object_end_offset(&rbd_dev->layout,
+                                               obj_request->object_no)) {
+               u64 diff = rbd_dev->parent_overlap - img_offset;
+               u64 stripe_row_size = rbd_dev->header.stripe_unit *
+                       rbd_dev->header.stripe_count;
+               u64 rows = diff / stripe_row_size;
+               u64 remain = diff - rows * stripe_row_size;
+               length = rows * rbd_dev->header.stripe_unit;
+               if (rbd_dev->header.stripe_unit < remain)
+                       length = length + rbd_dev->header.stripe_unit;
+               else
+                       length = length + remain;
+       }
+       else {
+               /* copy entire parent object */
+               length = rbd_obj_bytes(&rbd_dev->header);
+       }
+
/*
* Allocate a page array big enough to receive the data read
* from the parent.
@@ -2678,9 +2685,27 @@ static int rbd_img_obj_parent_read_full(
if (!parent_request)
goto out_err;

- result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
-       if (result)
+ parent_obj_request = rbd_obj_request_create(OBJ_REQUEST_PAGES);
+       if (!parent_obj_request) {
+               /* allocation failed: there is no object request to detach */
goto out_err;
+       }
+       rbd_img_obj_request_add(parent_request, parent_obj_request);
+       parent_obj_request->object_no = obj_request->object_no;
+       parent_obj_request->offset = 0;
+       parent_obj_request->length = length;
+       parent_obj_request->pages = pages;
+       page_count = (u32)calc_pages_for(0, length);
+       parent_obj_request->page_count = page_count;
+ osd_req = rbd_osd_req_create(rbd_dev,OBJ_OP_READ,1,parent_obj_request);
+       if (!osd_req) {
+ rbd_img_obj_request_del(parent_request, parent_obj_request);
+               goto out_err;
+       }
+       parent_obj_request->osd_req = osd_req;
+       parent_obj_request->callback = rbd_img_obj_callback;
+       parent_obj_request->img_offset = img_offset;
+ rbd_img_obj_request_fill(parent_obj_request, osd_req, OBJ_OP_READ, 0);

parent_request->copyup_pages = pages;
parent_request->copyup_page_count = page_count;
@@ -5090,28 +5115,10 @@ static int rbd_dev_v2_striping_info(stru
if (ret < size)
return -ERANGE;

-       /*
-        * We don't actually support the "fancy striping" feature
-        * (STRIPINGV2) yet, but if the striping sizes are the
-        * defaults the behavior is the same as before.  So find
-        * out, and only fail if the image has non-default values.
-        */
-       ret = -EINVAL;
obj_size = rbd_obj_bytes(&rbd_dev->header);
p = &striping_info_buf;
stripe_unit = ceph_decode_64(&p);
-       if (stripe_unit != obj_size) {
-               rbd_warn(rbd_dev, "unsupported stripe unit "
-                               "(got %llu want %llu)",
-                               stripe_unit, obj_size);
-               return -EINVAL;
-       }
stripe_count = ceph_decode_64(&p);
-       if (stripe_count != 1) {
-               rbd_warn(rbd_dev, "unsupported stripe count "
-                               "(got %llu want 1)", stripe_count);
-               return -EINVAL;
-       }
rbd_dev->header.stripe_unit = stripe_unit;
rbd_dev->header.stripe_count = stripe_count;

@@ -6090,6 +6097,18 @@ static int rbd_dev_image_probe(struct rb
ret = rbd_dev_probe_parent(rbd_dev, depth);
if (ret)
goto err_out_probe;
+
+       if (rbd_dev->parent != NULL) {
+               if (rbd_dev->header.stripe_count !=
+                   rbd_dev->parent->header.stripe_count ||
+                   rbd_dev->header.stripe_unit !=
+                   rbd_dev->parent->header.stripe_unit) {
+ rbd_warn(rbd_dev,"Cannot map child image with "
+                                "different striping than parent");
+                       ret = -EINVAL;
+                       goto err_out_probe;
+               }
+       }

dout("discovered format %u image, header name is %s\n",
rbd_dev->image_format, rbd_dev->header_oid.name);
diff -urNp a/include/linux/ceph/striper.h b/include/linux/ceph/striper.h
--- a/include/linux/ceph/striper.h      1970-01-01 02:00:00.000000000 +0200
+++ b/include/linux/ceph/striper.h      2018-01-29 22:23:18.755108873 +0200
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_CEPH_STRIPER_H
+#define _FS_CEPH_STRIPER_H
+
+#include <linux/ceph/ceph_fs.h>
+
+struct ceph_file_layout;
+
+struct stripe_extent {
+       u64     objectno;       /* object the extent falls in */
+       u64     offset;         /* byte offset within that object */
+       u64     length;         /* extent length in bytes */
+};
+
+/* Map a logical byte range to the object extent holding its first bytes */
+void get_stripe_extent(struct ceph_file_layout *layout,u64 offset,
+                       u64 len,struct stripe_extent *ext);
+
+/* Map an object number and in-object offset back to a logical offset */
+u64 get_file_offset(struct ceph_file_layout *layout,u64 objectno,u64 off);
+
+static inline u64 get_object_start_offset(struct ceph_file_layout *layout,
+                                          u64 objectno)
+{
+       return get_file_offset(layout,objectno,0);
+}
+
+static inline u64 get_object_end_offset(struct ceph_file_layout *layout,
+                                        u64 objectno)
+{
+       return get_file_offset(layout,objectno,layout->object_size);
+}
+
+#endif
diff -urNp a/net/ceph/Makefile b/net/ceph/Makefile
--- a/net/ceph/Makefile 2018-01-28 23:20:33.000000000 +0200
+++ b/net/ceph/Makefile 2018-01-29 22:23:18.755108873 +0200
@@ -13,5 +13,5 @@ libceph-y := ceph_common.o messenger.o m
crypto.o armor.o \
auth_x.o \
ceph_fs.o ceph_strings.o ceph_hash.o \
-       pagevec.o snapshot.o string_table.o
+       pagevec.o snapshot.o string_table.o striper.o

diff -urNp a/net/ceph/striper.c b/net/ceph/striper.c
--- a/net/ceph/striper.c        1970-01-01 02:00:00.000000000 +0200
+++ b/net/ceph/striper.c        2018-01-29 22:23:18.755108873 +0200
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/striper.h>
+
+/*
+ * Address mappings for striped objects
+ * Based on user space osdc/Striper.cc
+ */
+
+/* Logical to Object address, based on osdc/Striper.cc file_to_extents() */
+void get_stripe_extent(struct ceph_file_layout *layout,u64 offset,
+                       u64 len,struct stripe_extent *ext)
+{
+       u64 object_size;
+       u64 su;
+       u64 stripe_count;
+       u64 stripes_per_object;
+       u64 blockno;
+       u64 stripeno;
+       u64 stripepos;
+       u64 objectsetno;
+       u64 objectno;
+       u64 block_start;
+       u64 block_off;
+       u64 max;
+
+       object_size = layout->object_size;
+       su = layout->stripe_unit;
+       stripe_count = layout->stripe_count;
+       stripes_per_object = object_size / su;
+
+       blockno = offset / su; /* which block */
+       stripeno = blockno / stripe_count; /* which horizontal stripe Y */
+       stripepos = blockno % stripe_count; /* which object in object set X */
+       objectsetno = stripeno / stripes_per_object; /* which object set */
+       objectno = objectsetno * stripe_count + stripepos; /* object id */
+
+       /* map range into object, clipped at the stripe unit boundary */
+       block_start = (stripeno % stripes_per_object) * su;
+       block_off = offset % su;
+       max = su - block_off;
+
+       ext->objectno = objectno;
+       ext->offset = block_start + block_off;
+       if (len > max)
+               ext->length = max;
+       else
+               ext->length = len;
+}
+EXPORT_SYMBOL(get_stripe_extent);
+
+/* Object to Logical address, based on osdc/Striper.cc extent_to_file() */
+u64 get_file_offset(struct ceph_file_layout *layout,u64 objectno,u64 off)
+{
+       u64 object_size;
+       u64 su;
+       u64 stripe_count;
+       u64 stripes_per_object;
+       u64 stripepos;
+       u64 objectsetno;
+       u64 stripeno;
+       u64 blockno;
+       u64 off_in_block;
+       u64 file_offset;
+
+       object_size = layout->object_size;
+       su = layout->stripe_unit;
+       stripe_count = layout->stripe_count;
+       stripes_per_object = object_size / su;
+       off_in_block = off % su;
+
+       stripepos = objectno % stripe_count;
+       objectsetno = objectno / stripe_count;
+       stripeno = off / su + objectsetno * stripes_per_object;
+       blockno = stripeno * stripe_count + stripepos;
+       file_offset = blockno * su + off_in_block;
+
+       return file_offset;
+}
+EXPORT_SYMBOL(get_file_offset);

Hi Maged,

I'm finishing up a full striping v2 implementation (i.e. adjacent extents
are merged together, no same layout limitation, etc) right now.  It will be
posted to ceph-devel in the next week or two.

Thanks,

Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux