With the journaling feature enabled, we need to append an event to the
journal before submitting the img_request.

For writes and discards, rbd_queue_workfn() now appends the matching
journal event and the img_request is submitted from the append callback.
Reads, and images without the journaling feature, are submitted directly
as before.  Any other operation is failed with -EOPNOTSUPP, since no
journal event is appended for it.

Signed-off-by: Dongsheng Yang <dongsheng.yang@xxxxxxxxxxxx>
---
NOTE(review): no hunk below destroys rbd_journal_ctx_cache on module
unload -- please confirm rbd_slab_exit() is updated accordingly.

 drivers/block/rbd.c | 231 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 223 insertions(+), 8 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index bd90c17..5b641f8 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -298,6 +298,7 @@ struct rbd_img_request {
 	u32			pending_count;
 
 	struct completion	completion;
+	u64			journaler_commit_tid;
 	struct kref		kref;
 };
 
@@ -441,6 +442,7 @@ struct rbd_journal {
 
 static struct kmem_cache	*rbd_img_request_cache;
 static struct kmem_cache	*rbd_obj_request_cache;
+static struct kmem_cache	*rbd_journal_ctx_cache;
 
 static int rbd_major;
 static DEFINE_IDA(rbd_dev_id_ida);
@@ -2616,12 +2618,20 @@ static void rbd_img_end_child_request(struct rbd_img_request *img_req)
 static void rbd_img_end_request(struct rbd_img_request *img_req)
 {
 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
-	rbd_assert((!img_req->result &&
-		    img_req->xferred == blk_rq_bytes(img_req->rq)) ||
-		   (img_req->result < 0 && !img_req->xferred));
 
-	blk_mq_end_request(img_req->rq,
-			   errno_to_blk_status(img_req->result));
+	if (img_req->rq) {
+		rbd_assert((!img_req->result &&
+			    img_req->xferred == blk_rq_bytes(img_req->rq)) ||
+			   (img_req->result < 0 && !img_req->xferred));
+		blk_mq_end_request(img_req->rq,
+				   errno_to_blk_status(img_req->result));
+	}
+
+	if (img_req->journaler_commit_tid) {
+		ceph_journaler_client_committed(img_req->rbd_dev->journal->journaler,
+						img_req->journaler_commit_tid);
+	}
+
 	complete_all(&img_req->completion);
 	rbd_img_request_put(img_req);
 }
@@ -3689,6 +3699,25 @@ static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
 	return ret;
 }
 
+/*
+ * State carried from rbd_queue_workfn() to rbd_journal_callback() while
+ * the journal event for a request is being appended.
+ */
+struct rbd_journal_ctx {
+	struct rbd_device *rbd_dev;
+	struct rbd_img_request *img_request;
+	struct request *rq;
+	struct ceph_snap_context *snapc;
+	int result;
+	bool must_be_locked;
+
+	struct ceph_bio_iter bio_iter;
+};
+
+static int rbd_journal_append(struct rbd_device *rbd_dev, struct bio *bio,
+			u64 offset, u64 length, enum obj_operation_type op_type,
+			struct rbd_journal_ctx *ctx);
+
 static void rbd_queue_workfn(struct work_struct *work)
 {
 	struct request *rq = blk_mq_rq_from_pdu(work);
@@ -3794,7 +3823,37 @@ static void rbd_queue_workfn(struct work_struct *work)
 	if (result)
 		goto err_img_request;
 
-	rbd_img_request_submit(img_request);
+	if (!(rbd_dev->header.features & RBD_FEATURE_JOURNALING) ||
+	    op_type == OBJ_OP_READ) {
+		rbd_img_request_submit(img_request);
+	} else {
+		struct rbd_journal_ctx *ctx;
+
+		ctx = kmem_cache_zalloc(rbd_journal_ctx_cache, GFP_NOIO);
+		if (!ctx) {
+			result = -ENOMEM;
+			goto err_img_request;
+		}
+
+		ctx->img_request = img_request;
+		ctx->rq = rq;
+		ctx->snapc = snapc;
+		ctx->must_be_locked = must_be_locked;
+		ctx->rbd_dev = rbd_dev;
+
+		/*
+		 * On success the image request is submitted from
+		 * rbd_journal_callback(), not here.
+		 */
+		result = rbd_journal_append(rbd_dev, rq->bio, offset, length,
+					    op_type, ctx);
+		if (result) {
+			rbd_warn(rbd_dev, "error in rbd_journal_append");
+			kmem_cache_free(rbd_journal_ctx_cache, ctx);
+			goto err_img_request;
+		}
+	}
+
 	if (must_be_locked)
 		up_read(&rbd_dev->lock_rwsem);
 	return;
@@ -5996,6 +6055,155 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 	rbd_dev->spec->image_id = NULL;
 }
 
+/*
+ * Completion callback for a journal append issued via rbd_journal_append().
+ *
+ * On success the deferred image request is submitted, taking lock_rwsem
+ * for reading around the submission if the request path required it.  On
+ * failure the block request is ended with the journal error and the image
+ * request is dropped.  Either way the rbd_journal_ctx and the journaler
+ * context are released.
+ */
+static void rbd_journal_callback(struct ceph_journaler_ctx *journaler_ctx)
+{
+	struct rbd_journal_ctx *ctx = journaler_ctx->priv;
+	struct rbd_device *rbd_dev = ctx->rbd_dev;
+	bool must_be_locked = ctx->must_be_locked;
+	int result = journaler_ctx->result;
+
+	if (result) {
+		ceph_put_snap_context(ctx->snapc);
+		blk_mq_end_request(ctx->rq, errno_to_blk_status(result));
+		rbd_img_request_put(ctx->img_request);
+		goto out;
+	}
+
+	if (must_be_locked)
+		down_read(&rbd_dev->lock_rwsem);
+
+	rbd_img_request_submit(ctx->img_request);
+
+	if (must_be_locked)
+		up_read(&rbd_dev->lock_rwsem);
+
+out:
+	kmem_cache_free(rbd_journal_ctx_cache, ctx);
+	ceph_journaler_ctx_put(journaler_ctx);
+}
+
+/*
+ * Size of the fixed part of an encoded journal event: the encoding start
+ * block, the event type (4), the offset (8) and the length (8).  A write
+ * event additionally encodes the length of its data string (4); the data
+ * itself is taken from the bio.
+ */
+#define RBD_JOURNAL_EVENT_PREFIX_LEN \
+	(CEPH_ENCODING_START_BLK_LEN + 4 + 8 + 8)
+#define RBD_JOURNAL_WRITE_PREFIX_LEN \
+	(RBD_JOURNAL_EVENT_PREFIX_LEN + 4)
+
+/*
+ * Encode an event prefix of type @event_type covering @offset/@length
+ * and hand it to the journaler.  When @has_data is true the event also
+ * carries @length bytes of payload taken from @bio.
+ *
+ * Returns 0 if the append was issued with rbd_journal_callback()
+ * registered as its callback, or a negative errno.
+ */
+static int rbd_journal_append_event(struct rbd_device *rbd_dev,
+				    struct bio *bio, u32 event_type,
+				    u64 offset, u64 length, bool has_data,
+				    struct rbd_journal_ctx *ctx)
+{
+	struct ceph_journaler_ctx *journaler_ctx;
+	void *p;
+	int ret;
+
+	journaler_ctx = ceph_journaler_ctx_alloc();
+	if (!journaler_ctx)
+		return -ENOMEM;
+
+	ctx->bio_iter.bio = bio;
+	ctx->bio_iter.iter = bio->bi_iter;
+
+	journaler_ctx->bio_iter = &ctx->bio_iter;
+	journaler_ctx->bio_len = has_data ? length : 0;
+	journaler_ctx->prefix_len = has_data ? RBD_JOURNAL_WRITE_PREFIX_LEN :
+					       RBD_JOURNAL_EVENT_PREFIX_LEN;
+	/* the prefix is placed at the very end of prefix_page */
+	journaler_ctx->prefix_offset = PAGE_SIZE - journaler_ctx->prefix_len;
+
+	p = page_address(journaler_ctx->prefix_page) +
+	    journaler_ctx->prefix_offset;
+
+	ceph_start_encoding(&p, 1, 1, journaler_ctx->prefix_len +
+				      journaler_ctx->bio_len -
+				      CEPH_ENCODING_START_BLK_LEN);
+	ceph_encode_32(&p, event_type);
+	ceph_encode_64(&p, offset);
+	ceph_encode_64(&p, length);
+	if (has_data) {
+		/* first part of ceph_encode_string(); the bio is the rest */
+		ceph_encode_32(&p, journaler_ctx->bio_len);
+	}
+
+	journaler_ctx->priv = ctx;
+	journaler_ctx->callback = rbd_journal_callback;
+
+	ret = ceph_journaler_append(rbd_dev->journal->journaler,
+				    rbd_dev->journal->tag_tid,
+				    &ctx->img_request->journaler_commit_tid,
+				    journaler_ctx);
+	if (ret)
+		ceph_journaler_ctx_put(journaler_ctx);
+
+	return ret;
+}
+
+static int rbd_journal_append_write_event(struct rbd_device *rbd_dev,
+					  struct bio *bio, u64 offset,
+					  u64 length,
+					  struct rbd_journal_ctx *ctx)
+{
+	return rbd_journal_append_event(rbd_dev, bio, EVENT_TYPE_AIO_WRITE,
+					offset, length, true, ctx);
+}
+
+static int rbd_journal_append_discard_event(struct rbd_device *rbd_dev,
+					    struct bio *bio, u64 offset,
+					    u64 length,
+					    struct rbd_journal_ctx *ctx)
+{
+	return rbd_journal_append_event(rbd_dev, bio, EVENT_TYPE_AIO_DISCARD,
+					offset, length, false, ctx);
+}
+
+/*
+ * Append the journal event matching @op_type.  On success (0) the image
+ * request in @ctx is submitted later from rbd_journal_callback(); on
+ * failure the caller still owns @ctx and the image request.
+ */
+static int rbd_journal_append(struct rbd_device *rbd_dev, struct bio *bio,
+			u64 offset, u64 length, enum obj_operation_type op_type,
+			struct rbd_journal_ctx *ctx)
+{
+	switch (op_type) {
+	case OBJ_OP_WRITE:
+		return rbd_journal_append_write_event(rbd_dev, bio, offset,
+						      length, ctx);
+	case OBJ_OP_DISCARD:
+		return rbd_journal_append_discard_event(rbd_dev, bio, offset,
+							length, ctx);
+	default:
+		/*
+		 * No event is appended for other ops, so the callback would
+		 * never run and the request would never be submitted.  Fail
+		 * it instead of reporting success.
+		 */
+		return -EOPNOTSUPP;
+	}
+}
+
 /*
  * Probe for the existence of the header object for the given rbd
  * device.  If this image is the one being mapped (i.e., not a
@@ -6369,11 +6577,18 @@ static int __init rbd_slab_init(void)
 	rbd_assert(!rbd_obj_request_cache);
 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
 	if (!rbd_obj_request_cache)
-		goto out_err;
+		goto destroy_img_request_cache;
 
+	rbd_assert(!rbd_journal_ctx_cache);
+	rbd_journal_ctx_cache = KMEM_CACHE(rbd_journal_ctx, 0);
+	if (!rbd_journal_ctx_cache)
+		goto destroy_obj_request_cache;
 	return 0;
 
-out_err:
+destroy_obj_request_cache:
+	kmem_cache_destroy(rbd_obj_request_cache);
+	rbd_obj_request_cache = NULL;
+destroy_img_request_cache:
 	kmem_cache_destroy(rbd_img_request_cache);
 	rbd_img_request_cache = NULL;
 	return -ENOMEM;
-- 
1.8.3.1