Signed-off-by: Dongsheng Yang <dongsheng.yang@xxxxxxxxxxxx>
---
 drivers/block/rbd.c | 744 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 737 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8db4c36..0ba9750 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -28,16 +28,19 @@

  */

+#include <linux/crc32c.h>
 #include <linux/ceph/libceph.h>
 #include <linux/ceph/osd_client.h>
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/cls_lock_client.h>
 #include <linux/ceph/striper.h>
 #include <linux/ceph/decode.h>
+#include <linux/ceph/journaler.h>
 #include <linux/parser.h>
 #include <linux/bsearch.h>

 #include <linux/kernel.h>
+#include <linux/bio.h>
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/blk-mq.h>
@@ -115,12 +118,14 @@ static int atomic_dec_return_safe(atomic_t *v)
 #define RBD_FEATURE_LAYERING		(1ULL<<0)
 #define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
 #define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
+#define RBD_FEATURE_JOURNALING		(1ULL<<6)
 #define RBD_FEATURE_DATA_POOL		(1ULL<<7)
 #define RBD_FEATURE_OPERATIONS		(1ULL<<8)

 #define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
 				 RBD_FEATURE_STRIPINGV2 |	\
 				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
+				 RBD_FEATURE_JOURNALING |	\
 				 RBD_FEATURE_DATA_POOL |	\
 				 RBD_FEATURE_OPERATIONS)

@@ -295,6 +300,7 @@ struct rbd_img_request {
 	u32			pending_count;

 	struct completion	completion;
+	uint64_t		journaler_commit_tid;
 	struct kref		kref;
 };

@@ -384,6 +390,8 @@ struct rbd_device {
 	atomic_t		parent_ref;
 	struct rbd_device	*parent;

+	struct rbd_journal	*journal;
+
 	/* Block layer tags. */
 	struct blk_mq_tag_set	tag_set;

@@ -414,6 +422,19 @@ enum rbd_dev_flags {
 	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
 };

+#define LOCAL_MIRROR_UUID	""
+#define LOCAL_CLIENT_ID		""
+
+/*
+ * Per-device journaling state: the journaler handle and the tid of the
+ * tag under which this client appends its events.
+ */
+struct rbd_journal {
+	struct ceph_journaler		*journaler;
+	struct ceph_journaler_client	*client;
+	uint64_t			tag_tid;
+};
+
 static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

 static LIST_HEAD(rbd_dev_list);    /* devices */
@@ -2601,12 +2622,20 @@ static void rbd_img_end_child_request(struct rbd_img_request *img_req)
 static void rbd_img_end_request(struct rbd_img_request *img_req)
 {
 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
-	rbd_assert((!img_req->result &&
-		    img_req->xferred == blk_rq_bytes(img_req->rq)) ||
-		   (img_req->result < 0 && !img_req->xferred));

-	blk_mq_end_request(img_req->rq,
-			   errno_to_blk_status(img_req->result));
+	if (img_req->rq) {
+		rbd_assert((!img_req->result &&
+			    img_req->xferred == blk_rq_bytes(img_req->rq)) ||
+			   (img_req->result < 0 && !img_req->xferred));
+		blk_mq_end_request(img_req->rq,
+				   errno_to_blk_status(img_req->result));
+	}
+
+	if (img_req->journaler_commit_tid) {
+		ceph_journaler_client_committed(img_req->rbd_dev->journal->journaler,
+						img_req->journaler_commit_tid);
+	}
+
 	complete_all(&img_req->completion);
 	rbd_img_request_put(img_req);
 }
@@ -3674,6 +3703,23 @@ static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
 	return ret;
 }

+/*
+ * Carries a block request across the asynchronous journal append;
+ * consumed and freed by rbd_journal_callback().
+ */
+struct rbd_journal_ctx {
+	struct rbd_device *rbd_dev;
+	struct rbd_img_request *img_request;
+	struct request *rq;
+	struct ceph_snap_context *snapc;
+	int result;
+	bool must_be_locked;
+};
+
+static int rbd_journal_append(struct rbd_device *rbd_dev, struct bio *bio,
+			u64 offset, u64 length, enum obj_operation_type op_type,
+			struct rbd_journal_ctx *ctx);
+
 static void rbd_queue_workfn(struct work_struct *work)
 {
 	struct request *rq = blk_mq_rq_from_pdu(work);
@@ -3779,7 +3825,37 @@ static void rbd_queue_workfn(struct work_struct *work)
 	if (result)
 		goto err_img_request;

-	rbd_img_request_submit(img_request);
+	if (!(rbd_dev->header.features & RBD_FEATURE_JOURNALING) ||
+	    op_type == OBJ_OP_READ) {
+		rbd_img_request_submit(img_request);
+	} else {
+		/*
+		 * Journaled writes are submitted by rbd_journal_callback()
+		 * once the event has been appended.  This is the I/O path,
+		 * so the allocation must not recurse into I/O.
+		 */
+		struct rbd_journal_ctx *ctx = kmalloc(sizeof(*ctx), GFP_NOIO);
+
+		if (!ctx) {
+			result = -ENOMEM;
+			goto err_img_request;
+		}
+
+		ctx->img_request = img_request;
+		ctx->rq = rq;
+		ctx->snapc = snapc;
+		ctx->must_be_locked = must_be_locked;
+		ctx->rbd_dev = rbd_dev;
+		result = rbd_journal_append(rbd_dev, rq->bio, offset, length,
+					    op_type, ctx);
+		if (result) {
+			pr_err("error in rbd_journal_append");
+			/* the callback will not run: ctx is still ours */
+			kfree(ctx);
+			goto err_img_request;
+		}
+	}
+
 	if (must_be_locked)
 		up_read(&rbd_dev->lock_rwsem);
 	return;
@@ -5771,6 +5847,449 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 	return ret;
 }

+/* Event types carried by journal entries, see rbd_journal_replay(). */
+enum rbd_journal_event_type {
+	EVENT_TYPE_AIO_DISCARD           = 0,
+	EVENT_TYPE_AIO_WRITE             = 1,
+	EVENT_TYPE_AIO_FLUSH             = 2,
+	EVENT_TYPE_OP_FINISH             = 3,
+	EVENT_TYPE_SNAP_CREATE           = 4,
+	EVENT_TYPE_SNAP_REMOVE           = 5,
+	EVENT_TYPE_SNAP_RENAME           = 6,
+	EVENT_TYPE_SNAP_PROTECT          = 7,
+	EVENT_TYPE_SNAP_UNPROTECT        = 8,
+	EVENT_TYPE_SNAP_ROLLBACK         = 9,
+	EVENT_TYPE_RENAME                = 10,
+	EVENT_TYPE_RESIZE                = 11,
+	EVENT_TYPE_FLATTEN               = 12,
+	EVENT_TYPE_DEMOTE_PROMOTE        = 13,
+	EVENT_TYPE_SNAP_LIMIT            = 14,
+	EVENT_TYPE_UPDATE_FEATURES       = 15,
+	EVENT_TYPE_METADATA_SET          = 16,
+	EVENT_TYPE_METADATA_REMOVE       = 17,
+	EVENT_TYPE_AIO_WRITESAME         = 18,
+	EVENT_TYPE_AIO_COMPARE_AND_WRITE = 19,
+};
+
+/* Release the first @count pages of @bvecs, then the array itself. */
+static void free_write_bvecs(struct bio_vec *bvecs, u32 count)
+{
+	while (count--)
+		__free_page(bvecs[count].bv_page);
+	kfree(bvecs);
+}
+
+/*
+ * Copy @length bytes from @buf into freshly allocated pages, entering
+ * the first page at @offset % PAGE_SIZE.
+ *
+ * Returns the bio_vec array, or NULL if an allocation failed (nothing
+ * is left allocated in that case).
+ */
+static struct bio_vec *setup_write_bvecs(void *buf, u64 offset, u64 length)
+{
+	u32 i;
+	struct bio_vec *bvecs = NULL;
+	u32 bvec_count = 0;
+
+	bvec_count = calc_pages_for(offset, length);
+	bvecs = kcalloc(bvec_count, sizeof(*bvecs), GFP_NOIO);
+	if (!bvecs)
+		return NULL;
+
+	offset = offset % PAGE_SIZE;
+	for (i = 0; i < bvec_count; i++) {
+		unsigned int len = min(length, (u64)PAGE_SIZE - offset);
+
+		bvecs[i].bv_page = alloc_page(GFP_NOIO);
+		if (!bvecs[i].bv_page) {
+			/* pages [0, i) are the ones allocated so far */
+			free_write_bvecs(bvecs, i);
+			return NULL;
+		}
+
+		bvecs[i].bv_offset = offset;
+		bvecs[i].bv_len = len;
+		memcpy(page_address(bvecs[i].bv_page) + bvecs[i].bv_offset,
+		       buf, bvecs[i].bv_len);
+		length -= len;
+		buf += len;
+		offset = 0;
+	}
+
+	rbd_assert(!length);
+
+	return bvecs;
+}
+
+/*
+ * Submit a replayed image request and wait for it to finish.
+ *
+ * rbd_img_end_request() drops a reference right after signalling
+ * ->completion, so hold one of our own across the wait; otherwise the
+ * request could be freed while we are still sleeping on it.
+ *
+ * Returns 0 with the request's result in *@img_result, or the error
+ * from an interrupted wait, in which case the request is still in
+ * flight.
+ */
+static int rbd_journal_run_img_request(struct rbd_img_request *img_request,
+				       int *img_result)
+{
+	int ret;
+
+	kref_get(&img_request->kref);
+	rbd_img_request_submit(img_request);
+	ret = wait_for_completion_interruptible(&img_request->completion);
+	if (!ret)
+		*img_result = img_request->result;
+	rbd_img_request_put(img_request);
+	return ret;
+}
+
+/*
+ * Replay a journaled discard.  The payload at *@p is the image offset
+ * followed by the length, both 64 bits wide.
+ */
+static int rbd_journal_handle_aio_discard(struct rbd_device *rbd_dev, void **p,
+					  void *end, u8 struct_v,
+					  uint64_t commit_tid)
+{
+	struct ceph_snap_context *snapc;
+	struct rbd_img_request *img_request;
+	uint64_t offset;
+	uint64_t length;
+	int result;
+	int ret;
+
+	ceph_decode_64_safe(p, end, offset, bad);
+	ceph_decode_64_safe(p, end, length, bad);
+
+	snapc = rbd_dev->header.snapc;
+	ceph_get_snap_context(snapc);
+
+	img_request = rbd_img_request_create(rbd_dev, OBJ_OP_DISCARD, snapc);
+	if (!img_request) {
+		ceph_put_snap_context(snapc);
+		return -ENOMEM;
+	}
+	/* img_request consumed the snapc ref */
+	img_request->journaler_commit_tid = commit_tid;
+
+	result = rbd_img_fill_nodata(img_request, offset, length);
+	if (result) {
+		rbd_img_request_put(img_request);
+		return result;
+	}
+
+	ret = rbd_journal_run_img_request(img_request, &result);
+	return ret ? ret : result;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * Replay a journaled write.  The payload at *@p is the image offset and
+ * length (64 bits each) followed by the data as an encoded string.
+ */
+static int rbd_journal_handle_aio_write(struct rbd_device *rbd_dev, void **p,
+					void *end, u8 struct_v,
+					uint64_t commit_tid)
+{
+	struct ceph_snap_context *snapc;
+	struct rbd_img_request *img_request;
+	struct ceph_file_extent ex;
+	struct bio_vec *bvecs;
+	uint64_t offset;
+	uint64_t length;
+	size_t data_len;
+	char *data;
+	int result;
+	int ret;
+
+	ceph_decode_64_safe(p, end, offset, bad);
+	ceph_decode_64_safe(p, end, length, bad);
+
+	/* hands back an ERR_PTR, never NULL, on failure */
+	data = ceph_extract_encoded_string(p, end, &data_len, GFP_NOIO);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	/* setup_write_bvecs() copies @length bytes out of @data */
+	if (data_len != length) {
+		kfree(data);
+		return -EINVAL;
+	}
+
+	bvecs = setup_write_bvecs(data, offset, length);
+	kfree(data);
+	if (!bvecs) {
+		pr_err("failed to alloc bvecs.");
+		return -ENOMEM;
+	}
+
+	snapc = rbd_dev->header.snapc;
+	ceph_get_snap_context(snapc);
+
+	img_request = rbd_img_request_create(rbd_dev, OBJ_OP_WRITE, snapc);
+	if (!img_request) {
+		ceph_put_snap_context(snapc);
+		result = -ENOMEM;
+		goto free_bvecs;
+	}
+	/* img_request consumed the snapc ref */
+	img_request->journaler_commit_tid = commit_tid;
+
+	ex.fe_off = offset;
+	ex.fe_len = length;
+
+	result = rbd_img_fill_from_bvecs(img_request, &ex, 1, bvecs);
+	if (result) {
+		rbd_img_request_put(img_request);
+		goto free_bvecs;
+	}
+
+	ret = rbd_journal_run_img_request(img_request, &result);
+	if (ret)
+		return ret;	/* still in flight: the pages are in use */
+
+free_bvecs:
+	free_write_bvecs(bvecs, calc_pages_for(offset, length));
+	return result;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * Journaler replay hook: decode one journal entry and apply it to the
+ * image again.  Flush events need no work.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int rbd_journal_replay(void *entry_handler,
+			      struct ceph_journaler_entry *entry,
+			      uint64_t commit_tid)
+{
+	struct rbd_device *rbd_dev = entry_handler;
+	/* decode through a cursor of our own so entry->data stays put */
+	void *p = entry->data;
+	void *end = p + entry->data_len;
+	uint32_t event_type;
+	u8 struct_v;
+	u32 struct_len;
+	int ret;
+
+	ret = ceph_start_decoding(&p, end, 1, "rbd_decode_entry",
+				  &struct_v, &struct_len);
+	if (ret)
+		return -EINVAL;
+
+	ceph_decode_32_safe(&p, end, event_type, bad);
+
+	switch (event_type) {
+	case EVENT_TYPE_AIO_WRITE:
+		return rbd_journal_handle_aio_write(rbd_dev, &p, end, struct_v,
+						    commit_tid);
+	case EVENT_TYPE_AIO_DISCARD:
+		return rbd_journal_handle_aio_discard(rbd_dev, &p, end,
+						      struct_v, commit_tid);
+	case EVENT_TYPE_AIO_FLUSH:
+		return 0;
+	default:
+		pr_err("unknown event_type: %u", event_type);
+		return -EINVAL;
+	}
+
+bad:
+	return -EINVAL;
+}
+
+static int rbd_journal_allocate_tag(struct rbd_journal *journal);
+
+/*
+ * Open the image journal, replay what it holds and allocate the tag for
+ * our own appends.  rbd_dev->journal stays set only on success; every
+ * failure path resets it to NULL.
+ */
+static int rbd_dev_open_journal(struct rbd_device *rbd_dev)
+{
+	int ret = 0;
+	struct rbd_journal *journal = NULL;
+	struct ceph_journaler *journaler = NULL;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+
+	journal = kzalloc(sizeof(struct rbd_journal), GFP_KERNEL);
+	if (!journal)
+		return -ENOMEM;
+
+	journaler = ceph_journaler_create(osdc, &rbd_dev->header_oloc,
+					  rbd_dev->spec->image_id,
+					  LOCAL_CLIENT_ID);
+	if (!journaler) {
+		ret = -ENOMEM;
+		goto err_free_journal;
+	}
+
+	journaler->entry_handler = rbd_dev;
+	journaler->handle_entry = rbd_journal_replay;
+
+	ret = ceph_journaler_open(journaler);
+	if (ret)
+		goto err_destroy_journaler;
+
+	journal->journaler = journaler;
+	/*
+	 * Publish the journal before replaying: rbd_img_end_request()
+	 * reaches the journaler through rbd_dev->journal when a replayed
+	 * request completes.
+	 */
+	rbd_dev->journal = journal;
+	ret = ceph_journaler_start_replay(journaler);
+	if (ret)
+		goto err_close_journaler;
+
+	ret = rbd_journal_allocate_tag(journal);
+	if (ret)
+		goto err_close_journaler;
+
+	return 0;
+
+err_close_journaler:
+	ceph_journaler_close(journaler);
+err_destroy_journaler:
+	ceph_journaler_destroy(journaler);
+err_free_journal:
+	rbd_dev->journal = NULL;
+	kfree(journal);
+	return ret;
+}
+
+/* Close and free the journal opened by rbd_dev_open_journal(), if any. */
+static void rbd_dev_close_journal(struct rbd_device *rbd_dev)
+{
+	struct ceph_journaler *journaler = NULL;
+
+	if (!rbd_dev->journal)
+		return;
+
+	journaler = rbd_dev->journal->journaler;
+	ceph_journaler_close(journaler);
+	ceph_journaler_destroy(journaler);
+	kfree(rbd_dev->journal);
+	rbd_dev->journal = NULL;
+}
+
+typedef struct rbd_journal_tag_predecessor {
+	bool commit_valid;
+	uint64_t tag_tid;
+	uint64_t entry_tid;
+	uint32_t uuid_len;
+	char *mirror_uuid;
+} rbd_journal_tag_predecessor;
+
+typedef struct rbd_journal_tag_data {
+	struct rbd_journal_tag_predecessor predecessor;
+	uint32_t uuid_len;
+	char *mirror_uuid;
+} rbd_journal_tag_data;
+
+/*
+ * Encoded size of @tag_data: both mirror uuids as length-prefixed
+ * strings (4 + uuid_len each), commit_valid (1), tag_tid (8) and
+ * entry_tid (8).
+ */
+static uint32_t tag_data_encoding_size(struct rbd_journal_tag_data *tag_data)
+{
+	return (4 + tag_data->uuid_len + 1 + 8 + 8 + 4 +
+		tag_data->predecessor.uuid_len);
+}
+
+static void predecessor_encode(void **p, void *end,
+			       struct rbd_journal_tag_predecessor *predecessor)
+{
+	ceph_encode_string(p, end, predecessor->mirror_uuid,
+			   predecessor->uuid_len);
+	ceph_encode_8(p, predecessor->commit_valid);
+	ceph_encode_64(p, predecessor->tag_tid);
+	ceph_encode_64(p, predecessor->entry_tid);
+}
+
+/* Encode @tag_data at *@p; the layout is sized by tag_data_encoding_size(). */
+static int rbd_journal_encode_tag_data(void **p, void *end,
+				       struct rbd_journal_tag_data *tag_data)
+{
+	struct rbd_journal_tag_predecessor *predecessor = &tag_data->predecessor;
+
+	ceph_encode_string(p, end, tag_data->mirror_uuid, tag_data->uuid_len);
+	predecessor_encode(p, end, predecessor);
+
+	return 0;
+}
+
+/*
+ * Allocate a journal tag whose predecessor is our last committed
+ * position, if there is one, and record its tid in @journal.
+ */
+static int rbd_journal_allocate_tag(struct rbd_journal *journal)
+{
+	struct ceph_journaler_tag tag;
+	struct ceph_journaler *journaler = journal->journaler;
+	struct ceph_journaler_client *client;
+	struct rbd_journal_tag_predecessor *predecessor;
+	struct ceph_journaler_object_pos *position;
+	struct rbd_journal_tag_data tag_data;
+	void *buf, *p, *end;
+	uint32_t buf_len;
+	int ret;
+
+	ret = ceph_journaler_get_cached_client(journaler, LOCAL_CLIENT_ID,
+					       &client);
+	if (ret)
+		return ret;
+
+	/* an empty list means nothing was committed yet */
+	position = list_first_entry_or_null(&client->object_positions,
+					    struct ceph_journaler_object_pos,
+					    node);
+
+	predecessor = &tag_data.predecessor;
+	predecessor->commit_valid = position != NULL;
+	predecessor->tag_tid = position ? position->tag_tid : 0;
+	predecessor->entry_tid = position ? position->entry_tid : 0;
+	predecessor->uuid_len = 0;
+	predecessor->mirror_uuid = LOCAL_MIRROR_UUID;
+
+	tag_data.uuid_len = 0;
+	tag_data.mirror_uuid = LOCAL_MIRROR_UUID;
+
+	buf_len = tag_data_encoding_size(&tag_data);
+
+	buf = kmalloc(buf_len, GFP_KERNEL);
+	if (!buf) {
+		pr_err("failed to allocate tag data");
+		return -ENOMEM;
+	}
+
+	p = buf;
+	end = p + buf_len;
+	ret = rbd_journal_encode_tag_data(&p, end, &tag_data);
+	if (ret) {
+		pr_err("error in tag data");
+		goto out_free;
+	}
+
+	ret = ceph_journaler_allocate_tag(journaler, 0, buf, buf_len, &tag);
+	if (!ret)
+		journal->tag_tid = tag.tid;
+
+out_free:
+	/* NOTE(review): assumes the journaler keeps no pointer to buf */
+	kfree(buf);
+	return ret;
+}
+
 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 {
 	rbd_dev_unprobe(rbd_dev);
@@ -5781,6 +6300,194 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 	rbd_dev->spec->image_id = NULL;
 }

+struct AioWriteEvent {
+	enum rbd_journal_event_type	type;
+	uint64_t			offset;
+	uint64_t			length;
+	char				data[];
+};
+
+/*
+ * Sizes of the encoded event headers placed in front of the bio data:
+ * encoding start block, event type (4), offset (8) and length (8); a
+ * write event also carries the length (4) of its data string.
+ */
+enum {
+	RBD_JOURNAL_DISCARD_PREFIX_LEN	= CEPH_ENCODING_START_BLK_LEN + 4 + 8 + 8,
+	RBD_JOURNAL_WRITE_PREFIX_LEN	= RBD_JOURNAL_DISCARD_PREFIX_LEN + 4,
+};
+
+/*
+ * Completion of a journal append: submit the image request the event
+ * was recorded for, or fail the block request if the append failed.
+ * Frees @journaler_ctx->priv.
+ */
+static void rbd_journal_callback(struct ceph_journaler_ctx *journaler_ctx)
+{
+	struct rbd_journal_ctx *ctx = journaler_ctx->priv;
+	int result = journaler_ctx->result;
+	struct rbd_device *rbd_dev = ctx->rbd_dev;
+	bool must_be_locked = ctx->must_be_locked;
+
+	if (result)
+		goto err_rq;
+
+	if (must_be_locked)
+		down_read(&rbd_dev->lock_rwsem);
+
+	rbd_img_request_submit(ctx->img_request);
+
+	if (must_be_locked)
+		up_read(&rbd_dev->lock_rwsem);
+
+	kfree(ctx);
+	return;
+
+err_rq:
+	ceph_put_snap_context(ctx->snapc);
+	blk_mq_end_request(ctx->rq, errno_to_blk_status(result));
+	rbd_img_request_put(ctx->img_request);
+	kfree(ctx);
+}
+
+/*
+ * Allocate what a journal append needs: a journaler context wired to
+ * rbd_journal_callback() and a bio iterator whose prefix page ends with
+ * @prefix_len bytes reserved for the encoded event header; @bio_len
+ * bytes of @bio follow as payload.
+ *
+ * Returns the journaler context with *@pp pointing at the start of the
+ * reserved prefix, or NULL if an allocation failed.
+ */
+static struct ceph_journaler_ctx *
+rbd_journal_alloc_event(struct rbd_journal_ctx *ctx, struct bio *bio,
+			u64 bio_len, u32 prefix_len, void **pp)
+{
+	struct ceph_journaler_ctx *journaler_ctx;
+	struct ceph_bio_iter *bio_iter;
+
+	/* we are on the I/O path: allocations must not recurse into I/O */
+	journaler_ctx = kmalloc(sizeof(*journaler_ctx), GFP_NOIO);
+	if (!journaler_ctx)
+		return NULL;
+
+	bio_iter = kzalloc(sizeof(*bio_iter), GFP_NOIO);
+	if (!bio_iter)
+		goto err_free_ctx;
+
+	bio_iter->prefix_page = alloc_page(GFP_NOIO | __GFP_ZERO);
+	if (!bio_iter->prefix_page)
+		goto err_free_iter;
+
+	bio_iter->bio = bio;
+	bio_iter->iter = bio->bi_iter;
+	bio_iter->bio_len = bio_len;
+	bio_iter->prefix_len = prefix_len;
+	bio_iter->prefix_offset = PAGE_SIZE - prefix_len;
+
+	journaler_ctx->priv = ctx;
+	journaler_ctx->callback = rbd_journal_callback;
+	journaler_ctx->bio_iter = bio_iter;
+
+	*pp = page_address(bio_iter->prefix_page) + bio_iter->prefix_offset;
+	return journaler_ctx;
+
+err_free_iter:
+	kfree(bio_iter);
+err_free_ctx:
+	kfree(journaler_ctx);
+	return NULL;
+}
+
+/* Append an AIO_WRITE event covering @bio to the journal. */
+static int rbd_journal_append_write_event(struct rbd_device *rbd_dev,
+					  struct bio *bio, u64 offset,
+					  u64 length,
+					  struct rbd_journal_ctx *ctx)
+{
+	struct ceph_journaler_ctx *journaler_ctx;
+	void *p;
+
+	journaler_ctx = rbd_journal_alloc_event(ctx, bio, length,
+						RBD_JOURNAL_WRITE_PREFIX_LEN,
+						&p);
+	if (!journaler_ctx)
+		return -ENOMEM;
+
+	ceph_start_encoding(&p, 1, 1,
+			    RBD_JOURNAL_WRITE_PREFIX_LEN + length -
+			    CEPH_ENCODING_START_BLK_LEN);
+	ceph_encode_32(&p, EVENT_TYPE_AIO_WRITE);
+	ceph_encode_64(&p, offset);
+	ceph_encode_64(&p, length);
+	/* length part of ceph_encode_string(); the bio supplies the bytes */
+	ceph_encode_32(&p, length);
+
+	/*
+	 * NOTE(review): the result of ceph_journaler_append(), if it has
+	 * one, is not checked - confirm it cannot fail synchronously.
+	 */
+	ceph_journaler_append(rbd_dev->journal->journaler,
+			      rbd_dev->journal->tag_tid,
+			      &ctx->img_request->journaler_commit_tid,
+			      journaler_ctx);
+	return 0;
+}
+
+/* Append an AIO_DISCARD event to the journal; it carries no data. */
+static int rbd_journal_append_discard_event(struct rbd_device *rbd_dev,
+					    struct bio *bio, u64 offset,
+					    u64 length,
+					    struct rbd_journal_ctx *ctx)
+{
+	struct ceph_journaler_ctx *journaler_ctx;
+	void *p;
+
+	journaler_ctx = rbd_journal_alloc_event(ctx, bio, 0,
+						RBD_JOURNAL_DISCARD_PREFIX_LEN,
+						&p);
+	if (!journaler_ctx)
+		return -ENOMEM;
+
+	ceph_start_encoding(&p, 1, 1,
+			    RBD_JOURNAL_DISCARD_PREFIX_LEN -
+			    CEPH_ENCODING_START_BLK_LEN);
+	ceph_encode_32(&p, EVENT_TYPE_AIO_DISCARD);
+	ceph_encode_64(&p, offset);
+	ceph_encode_64(&p, length);
+
+	ceph_journaler_append(rbd_dev->journal->journaler,
+			      rbd_dev->journal->tag_tid,
+			      &ctx->img_request->journaler_commit_tid,
+			      journaler_ctx);
+	return 0;
+}
+
+/*
+ * Record a write or discard in the journal before it touches the image.
+ * On success @ctx is handed to rbd_journal_callback(); on failure it
+ * still belongs to the caller.
+ */
+static int rbd_journal_append(struct rbd_device *rbd_dev, struct bio *bio,
+			u64 offset, u64 length, enum obj_operation_type op_type,
+			struct rbd_journal_ctx *ctx)
+{
+	switch (op_type) {
+	case OBJ_OP_WRITE:
+		return rbd_journal_append_write_event(rbd_dev, bio, offset,
+						      length, ctx);
+	case OBJ_OP_DISCARD:
+		return rbd_journal_append_discard_event(rbd_dev, bio, offset,
+							length, ctx);
+	default:
+		/*
+		 * No event exists for this op: returning 0 would leave the
+		 * request neither submitted nor completed.
+		 */
+		return -EOPNOTSUPP;
+	}
+}
+
 /*
  * Probe for the existence of the header object for the given rbd
  * device.  If this image is the one being mapped (i.e., not a
@@ -5947,11 +6654,30 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 		goto err_out_device_setup;
 	}

+	if (rbd_dev->header.features & RBD_FEATURE_JOURNALING) {
+		set_bit(RBD_LOCK_FLAG_RELEASE_WAIT, &rbd_dev->lock_flags);
+		rc = rbd_add_acquire_lock(rbd_dev);
+		if (rc) {
+			clear_bit(RBD_LOCK_FLAG_RELEASE_WAIT, &rbd_dev->lock_flags);
+			complete_all(&rbd_dev->lock_wait);
+			goto err_out_device_setup;
+		}
+
+		rc = rbd_dev_open_journal(rbd_dev);
+		if (rc) {
+			clear_bit(RBD_LOCK_FLAG_RELEASE_WAIT, &rbd_dev->lock_flags);
+			complete_all(&rbd_dev->lock_wait);
+			goto err_out_image_lock;
+		}
+		clear_bit(RBD_LOCK_FLAG_RELEASE_WAIT, &rbd_dev->lock_flags);
+		complete_all(&rbd_dev->lock_wait);
+	}
+
 	/* Everything's ready.  Announce the disk to the world. */

 	rc = device_add(&rbd_dev->dev);
 	if (rc)
-		goto err_out_image_lock;
+		goto err_out_close_journal;

 	add_disk(rbd_dev->disk);
 	/* see rbd_init_disk() */
@@ -5969,6 +6695,8 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 	module_put(THIS_MODULE);
 	return rc;

+err_out_close_journal:
+	rbd_dev_close_journal(rbd_dev);
 err_out_image_lock:
 	rbd_dev_image_unlock(rbd_dev);
 err_out_device_setup:
@@ -6095,6 +6823,8 @@ static ssize_t do_rbd_remove(struct bus_type *bus,
 	device_del(&rbd_dev->dev);

+	/* tear down in the reverse order of do_rbd_add() */
+	rbd_dev_close_journal(rbd_dev);
 	rbd_dev_image_unlock(rbd_dev);
 	rbd_dev_device_release(rbd_dev);
 	rbd_dev_image_release(rbd_dev);
 	rbd_dev_destroy(rbd_dev);
--
1.8.3.1