Re: [PATCH v3 13/15] rbd: append journal event in image request state machine

Dongsheng Yang <dongsheng.yang@xxxxxxxxxxxx> · Mon, 26 Aug 2019 10:54:07 +0800

On 08/19/2019 06:38 PM, Ilya Dryomov wrote:
On Mon, Jul 29, 2019 at 11:43 AM Dongsheng Yang
<dongsheng.yang@xxxxxxxxxxxx>  wrote:
Introduce RBD_IMG_APPEND_JOURNAL and __RBD_IMG_APPEND_JOURNAL in rbd_img_state.
After an image request passes RBD_IMG_EXCLUSIVE_LOCK, it goes into __RBD_IMG_APPEND_JOURNAL
and then RBD_IMG_APPEND_JOURNAL. After that, it goes into __RBD_IMG_OBJECT_REQUESTS.

That means we append the journal event before sending the data object requests for an image request.

Signed-off-by: Dongsheng Yang<dongsheng.yang@xxxxxxxxxxxx>
---
  drivers/block/rbd.c | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++-
  1 file changed, 249 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 86008f2..89bc7b3 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -121,6 +121,7 @@ static int atomic_dec_return_safe(atomic_t *v)
  #define RBD_FEATURE_OBJECT_MAP         (1ULL<<3)
  #define RBD_FEATURE_FAST_DIFF          (1ULL<<4)
  #define RBD_FEATURE_DEEP_FLATTEN       (1ULL<<5)
+#define RBD_FEATURE_JOURNALING          (1ULL<<6)
  #define RBD_FEATURE_DATA_POOL          (1ULL<<7)
  #define RBD_FEATURE_OPERATIONS         (1ULL<<8)

@@ -327,6 +328,8 @@ enum img_req_flags {
  enum rbd_img_state {
         RBD_IMG_START = 1,
         RBD_IMG_EXCLUSIVE_LOCK,
+       __RBD_IMG_APPEND_JOURNAL,
+       RBD_IMG_APPEND_JOURNAL,
         __RBD_IMG_OBJECT_REQUESTS,
         RBD_IMG_OBJECT_REQUESTS,
  };
@@ -355,6 +358,7 @@ struct rbd_img_request {
         int                     work_result;

         struct completion       completion;
+       uint64_t                journaler_commit_tid;

         struct kref             kref;
  };
@@ -448,6 +452,8 @@ struct rbd_device {
         atomic_t                parent_ref;
         struct rbd_device       *parent;

+       struct rbd_journal      *journal;
+
         /* Block layer tags. */
         struct blk_mq_tag_set   tag_set;

@@ -3650,6 +3656,19 @@ static void rbd_img_object_requests(struct rbd_img_request *img_req)
         }
  }

+static bool rbd_img_need_journal(struct rbd_img_request *img_req) {
+       struct rbd_device *rbd_dev = img_req->rbd_dev;
+
+       if (img_req->op_type == OBJ_OP_READ)
+               return false;
+
+       if (!(rbd_dev->header.features & RBD_FEATURE_JOURNALING))
+               return false;
+
+       return true;
+}
+
+static void rbd_img_journal_append(struct rbd_img_request *img_req);
  static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
  {
         struct rbd_device *rbd_dev = img_req->rbd_dev;
@@ -3676,6 +3695,27 @@ static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
                 rbd_assert(!need_exclusive_lock(img_req) ||
                            __rbd_is_lock_owner(rbd_dev));

+               if (!rbd_img_need_journal(img_req)) {
+                       img_req->state = RBD_IMG_APPEND_JOURNAL;
+                       goto again;
+               }
+
+               rbd_img_journal_append(img_req);
+               if (!img_req->pending.num_pending) {
+                       *result = img_req->pending.result;
+                       img_req->state = RBD_IMG_OBJECT_REQUESTS;
+                       goto again;
+               }
+               img_req->state = __RBD_IMG_APPEND_JOURNAL;
+               return false;
+       case __RBD_IMG_APPEND_JOURNAL:
+               if (!pending_result_dec(&img_req->pending, result))
+                       return false;
+               /* fall through */
+       case RBD_IMG_APPEND_JOURNAL:
+               if (*result)
+                       return true;
+
                 rbd_img_object_requests(img_req);
                 if (!img_req->pending.num_pending) {
                         *result = img_req->pending.result;
@@ -3744,9 +3784,15 @@ static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
         } else {
                 struct request *rq = img_req->rq;

+               if (!result && img_req->journaler_commit_tid) {
+                       ceph_journaler_client_committed(img_req->rbd_dev->journal->journaler,
+                                                       img_req->journaler_commit_tid);
+               }
+
                 complete_all(&img_req->completion);
                 rbd_img_request_put(img_req);
-               blk_mq_end_request(rq, errno_to_blk_status(result));
+               if (rq)
+                       blk_mq_end_request(rq, errno_to_blk_status(result));
         }
  }

@@ -6927,6 +6973,208 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
         return ret;
  }

+enum rbd_journal_event_type {
+  EVENT_TYPE_AIO_DISCARD           = 0,
+  EVENT_TYPE_AIO_WRITE             = 1,
+  EVENT_TYPE_AIO_FLUSH             = 2,
+  EVENT_TYPE_OP_FINISH             = 3,
+  EVENT_TYPE_SNAP_CREATE           = 4,
+  EVENT_TYPE_SNAP_REMOVE           = 5,
+  EVENT_TYPE_SNAP_RENAME           = 6,
+  EVENT_TYPE_SNAP_PROTECT          = 7,
+  EVENT_TYPE_SNAP_UNPROTECT        = 8,
+  EVENT_TYPE_SNAP_ROLLBACK         = 9,
+  EVENT_TYPE_RENAME                = 10,
+  EVENT_TYPE_RESIZE                = 11,
+  EVENT_TYPE_FLATTEN               = 12,
+  EVENT_TYPE_DEMOTE_PROMOTE        = 13,
+  EVENT_TYPE_SNAP_LIMIT            = 14,
+  EVENT_TYPE_UPDATE_FEATURES       = 15,
+  EVENT_TYPE_METADATA_SET          = 16,
+  EVENT_TYPE_METADATA_REMOVE       = 17,
+  EVENT_TYPE_AIO_WRITESAME         = 18,
+  EVENT_TYPE_AIO_COMPARE_AND_WRITE = 19,
+};
+
+
+// RBD_EVENT_FIXED_SIZE(10 = CEPH_ENCODING_START_BLK_LEN(6) + EVENT_TYPE(4))
+static const uint32_t RBD_EVENT_FIXED_SIZE = 10;
+
+static struct bio_vec *setup_write_bvecs(void *buf, u64 offset, u64 length)
+{
+       u32 i;
+       struct bio_vec *bvecs;
+       u32 bvec_count;
+
+       bvec_count = calc_pages_for(offset, length);
+       bvecs = kcalloc(bvec_count, sizeof(*bvecs), GFP_NOIO);
+       if (!bvecs)
+               goto err;
+
+       offset = offset % PAGE_SIZE;
+       for (i = 0; i < bvec_count; i++) {
+               unsigned int len = min(length, (u64)PAGE_SIZE - offset);
+
+               bvecs[i].bv_page = alloc_page(GFP_NOIO);
+               if (!bvecs[i].bv_page)
+                       goto free_bvecs;
+
+               bvecs[i].bv_offset = offset;
+               bvecs[i].bv_len = len;
+               memcpy(page_address(bvecs[i].bv_page) + bvecs[i].bv_offset, buf, bvecs[i].bv_len);
+               length -= len;
+               buf += len;
+               offset = 0;
+       }
+
+       rbd_assert(!length);
+
+       return bvecs;
+
+free_bvecs:
+err:
+       return NULL;
+}
+
+static void rbd_journal_callback(struct ceph_journaler_ctx *journaler_ctx)
+{
+       struct rbd_img_request *img_req = journaler_ctx->priv;
+       int result = journaler_ctx->result;
+
+       ceph_journaler_ctx_put(journaler_ctx);
+       rbd_img_handle_request(img_req, result);
+}
+
+static void img_journal_append_write_event(struct rbd_img_request *img_req)
+{
+       struct rbd_journal *journal = img_req->rbd_dev->journal;
+       struct ceph_journaler_ctx *journaler_ctx;
+       u64 offset = (u64)blk_rq_pos(img_req->rq) << SECTOR_SHIFT;
+       u64 length = blk_rq_bytes(img_req->rq);
+       struct bio *bio = img_req->rq->bio;
+       uint64_t prefix_len = RBD_EVENT_FIXED_SIZE + 20;
+       uint64_t max_append_size = ceph_journaler_get_max_append_size(journal->journaler) - prefix_len;
+       uint64_t append_size = min(max_append_size, length);
+       uint64_t bio_offset = 0;
+       void *p;
+       int ret;
+
+       rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
+
+       while (length > 0) {
+               journaler_ctx = ceph_journaler_ctx_alloc();
+               if (!journaler_ctx) {
+                       img_req->pending.result = -ENOMEM;
+                       return;
+               }
+
+               journaler_ctx->bio_iter.bio = bio;
+               journaler_ctx->bio_iter.iter = bio->bi_iter;
+
+               ceph_bio_iter_advance(&journaler_ctx->bio_iter, bio_offset);
+               append_size = min(max_append_size, length);
+               journaler_ctx->bio_len = append_size;
+               bio_offset += append_size;
+               length -= append_size;
+
+               // RBD_EVENT_FIXED_SIZE + offset(8) + length(8) + string_len(4) = 30
+               journaler_ctx->prefix_len = prefix_len;
+               journaler_ctx->prefix_offset = PAGE_SIZE - journaler_ctx->prefix_len;
+
+               p = page_address(journaler_ctx->prefix_page) + journaler_ctx->prefix_offset;
+
+               ceph_start_encoding(&p, 1, 1, journaler_ctx->prefix_len + journaler_ctx->bio_len - 6);
+
+               ceph_encode_32(&p, EVENT_TYPE_AIO_WRITE);
+
+               ceph_encode_64(&p, offset);
Looks like this is encoding the same offset for all append chunks?
This, coupled with always checksumming the entire bio, makes me think
that this code path hasn't been tested.

Oh, yes. That is because the rbd_mirror.sh test in ceph-qa only issues 4K
writes, which never go through a second iteration of the loop here.

I will add a case in qa/workunits/rbd/rbd_mirror.sh to cover this code path.
+               ceph_encode_64(&p, append_size);
+
+               // first part of ceph_encode_string();
+               ceph_encode_32(&p, journaler_ctx->bio_len);
+
+               journaler_ctx->priv = img_req;
+               journaler_ctx->callback = rbd_journal_callback;
+
+               ret = ceph_journaler_append(journal->journaler, journal->tag_tid,
+                                           journaler_ctx);
+               if (ret) {
+                       ceph_journaler_ctx_put(journaler_ctx);
+                       img_req->pending.result = ret;
+                       return;
+               }
+
+               rbd_assert(!ret);
Bogus assert.

ok
+               img_req->pending.num_pending++;
+               img_req->journaler_commit_tid = journaler_ctx->commit_tid;
+       }
+}
+
+static void img_journal_append_discard_event(struct rbd_img_request *img_req)
+{
+       struct rbd_journal *journal = img_req->rbd_dev->journal;
+       struct ceph_journaler_ctx *journaler_ctx;
+       u64 offset = (u64)blk_rq_pos(img_req->rq) << SECTOR_SHIFT;
+       u64 length = blk_rq_bytes(img_req->rq);
+       struct bio *bio = img_req->rq->bio;
+       void *p;
+       int ret;
+
+       rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
Move this assert to rbd_img_journal_append() to avoid duplicating it
for write event and for discard event.

sounds good.
+
+       journaler_ctx = ceph_journaler_ctx_alloc();
+       if (!journaler_ctx) {
+               img_req->pending.result = -ENOMEM;
+               return;
+       }
+
+       journaler_ctx->bio_iter.bio = bio;
+       journaler_ctx->bio_iter.iter = bio->bi_iter;
+       journaler_ctx->bio_len = 0;
+
+       // RBD_EVENT_FIXED_SIZE + offset(8) + length(8) = 26
+       journaler_ctx->prefix_len = RBD_EVENT_FIXED_SIZE + 16;
+       journaler_ctx->prefix_offset = PAGE_SIZE - journaler_ctx->prefix_len;
+
+       p = page_address(journaler_ctx->prefix_page) + journaler_ctx->prefix_offset;
+
+       ceph_start_encoding(&p, 1, 1, journaler_ctx->prefix_len + journaler_ctx->bio_len - 6);
+
+       ceph_encode_32(&p, EVENT_TYPE_AIO_DISCARD);
+
+       ceph_encode_64(&p, offset);
+       ceph_encode_64(&p, length);
+
+       journaler_ctx->priv = img_req;
+       journaler_ctx->callback = rbd_journal_callback;
+
+       ret = ceph_journaler_append(journal->journaler, journal->tag_tid,
+                                   journaler_ctx);
+       if (ret) {
+               ceph_journaler_ctx_put(journaler_ctx);
+               img_req->pending.result = ret;
+               return;
+       }
+
+       rbd_assert(!ret);
Bogus assert.

ok
+       img_req->pending.num_pending++;
+       img_req->journaler_commit_tid = journaler_ctx->commit_tid;
+}
+
+static void rbd_img_journal_append(struct rbd_img_request *img_req)
+{
+       switch (img_req->op_type) {
+       case OBJ_OP_WRITE:
+               img_journal_append_write_event(img_req);
+               break;
+       case OBJ_OP_DISCARD:
+               img_journal_append_discard_event(img_req);
+               break;
+       default:
+               img_req->pending.result = -ENOTSUPP;
What about zeroouts?

This work started before rbd supported zeroout, so this code does not
cover zeroout.

I will add zeroout support to journaling in the next version of this patchset.

Thanx
Thanks,

                 Ilya