[PATCH 11/11] rbd: enable journaling

Dongsheng Yang <dongsheng.yang@xxxxxxxxxxxx> · Mon, 3 Dec 2018 07:50:35 -0500

Signed-off-by: Dongsheng Yang <dongsheng.yang@xxxxxxxxxxxx>
---
 drivers/block/rbd.c | 566 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 559 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8db4c36..0ba9750 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -28,16 +28,19 @@
 
  */
 
+#include <linux/crc32c.h>
 #include <linux/ceph/libceph.h>
 #include <linux/ceph/osd_client.h>
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/cls_lock_client.h>
 #include <linux/ceph/striper.h>
 #include <linux/ceph/decode.h>
+#include <linux/ceph/journaler.h>
 #include <linux/parser.h>
 #include <linux/bsearch.h>
 
 #include <linux/kernel.h>
+#include <linux/bio.h>
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/blk-mq.h>
@@ -115,12 +118,14 @@ static int atomic_dec_return_safe(atomic_t *v)
 #define RBD_FEATURE_LAYERING		(1ULL<<0)
 #define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
 #define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
+#define RBD_FEATURE_JOURNALING          (1ULL<<6)
 #define RBD_FEATURE_DATA_POOL		(1ULL<<7)
 #define RBD_FEATURE_OPERATIONS		(1ULL<<8)
 
 #define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
 				 RBD_FEATURE_STRIPINGV2 |	\
 				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
+				 RBD_FEATURE_JOURNALING |	\
 				 RBD_FEATURE_DATA_POOL |	\
 				 RBD_FEATURE_OPERATIONS)
 
@@ -295,6 +300,7 @@ struct rbd_img_request {
 	u32			pending_count;
 
 	struct completion	completion;
+	uint64_t		journaler_commit_tid;
 
 	struct kref		kref;
 };
@@ -384,6 +390,8 @@ struct rbd_device {
 	atomic_t		parent_ref;
 	struct rbd_device	*parent;
 
+	struct rbd_journal	*journal;
+
 	/* Block layer tags. */
 	struct blk_mq_tag_set	tag_set;
 
@@ -414,6 +422,15 @@ enum rbd_dev_flags {
 	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
 };
 
+/*
+ * Both identifiers are the empty string.  LOCAL_CLIENT_ID is the client id
+ * handed to the journaler; LOCAL_MIRROR_UUID is the mirror uuid encoded
+ * into tag data.
+ */
+#define	LOCAL_MIRROR_UUID	""
+#define	LOCAL_CLIENT_ID		""
+
+/* Journaling state of a mapped image, reachable via rbd_device::journal. */
+struct rbd_journal {
+	/* journaler handle created in rbd_dev_open_journal() */
+	struct ceph_journaler *journaler;
+	/* NOTE(review): never assigned in the code visible in this patch */
+	struct ceph_journaler_client *client;
+	/* tid of the tag allocated at open time; passed to every append */
+	uint64_t		tag_tid;
+};
+
 static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
 
 static LIST_HEAD(rbd_dev_list);    /* devices */
@@ -2601,12 +2618,20 @@ static void rbd_img_end_child_request(struct rbd_img_request *img_req)
 static void rbd_img_end_request(struct rbd_img_request *img_req)
 {
 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
-	rbd_assert((!img_req->result &&
-		    img_req->xferred == blk_rq_bytes(img_req->rq)) ||
-		   (img_req->result < 0 && !img_req->xferred));
 
-	blk_mq_end_request(img_req->rq,
-			   errno_to_blk_status(img_req->result));
+	/*
+	 * Only end a block-layer request when there is one; the journal
+	 * replay handlers build image requests without assigning ->rq
+	 * (presumably left NULL by rbd_img_request_create() - confirm).
+	 */
+	if (img_req->rq) {
+		rbd_assert((!img_req->result &&
+			    img_req->xferred == blk_rq_bytes(img_req->rq)) ||
+			   (img_req->result < 0 && !img_req->xferred));
+		blk_mq_end_request(img_req->rq,
+				   errno_to_blk_status(img_req->result));
+	}
+
+	/*
+	 * A non-zero commit tid marks a journaled request: tell the
+	 * journaler that the corresponding entry has been applied.
+	 * NOTE(review): this is done regardless of img_req->result.
+	 */
+	if (img_req->journaler_commit_tid) {
+		ceph_journaler_client_committed(img_req->rbd_dev->journal->journaler,
+						img_req->journaler_commit_tid);
+	}
+
+	/* wake replay handlers waiting on the request, then drop our ref */
 	complete_all(&img_req->completion);
 	rbd_img_request_put(img_req);
 }
@@ -3674,6 +3699,19 @@ static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
 	return ret;
 }
 
+/*
+ * Per-request state carried from rbd_queue_workfn() through the journal
+ * append to rbd_journal_callback(), which frees it.
+ */
+struct rbd_journal_ctx {
+	struct rbd_device *rbd_dev;		/* device the request targets */
+	struct rbd_img_request *img_request;	/* submitted once the append succeeds */
+	struct request *rq;			/* ended with an error if the append fails */
+	struct ceph_snap_context *snapc;	/* put by the callback on failure */
+	int result;				/* NOTE(review): not read or written in visible code */
+	bool must_be_locked;			/* take lock_rwsem for read around the submit */
+};
+
+/*
+ * Forward declaration: rbd_queue_workfn() calls this before its
+ * definition further down in the file.
+ */
+static int rbd_journal_append(struct rbd_device *rbd_dev, struct bio *bio,
+			      u64 offset, u64 length, enum obj_operation_type op_type,
+			      struct rbd_journal_ctx *ctx);
+
 static void rbd_queue_workfn(struct work_struct *work)
 {
 	struct request *rq = blk_mq_rq_from_pdu(work);
@@ -3779,7 +3817,29 @@ static void rbd_queue_workfn(struct work_struct *work)
 	if (result)
 		goto err_img_request;
 
-	rbd_img_request_submit(img_request);
+	if (!(rbd_dev->header.features & RBD_FEATURE_JOURNALING) ||
+		(op_type == OBJ_OP_READ)) {
+		rbd_img_request_submit(img_request);
+	} else {
+		struct rbd_journal_ctx *ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+
+		if (!ctx){
+			result = -ENOMEM;
+			goto err_unlock;
+		}
+
+		ctx->img_request = img_request;
+		ctx->rq = rq;
+		ctx->snapc = snapc;
+		ctx->must_be_locked = must_be_locked;
+		ctx->rbd_dev = rbd_dev;
+		result = rbd_journal_append(rbd_dev, rq->bio, offset, length, op_type, ctx);
+		if (result) {
+			pr_err("error in rbd_journal_append");
+			goto err_unlock;
+		}
+	}
+
 	if (must_be_locked)
 		up_read(&rbd_dev->lock_rwsem);
 	return;
@@ -5730,6 +5790,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
 	if (ret)
 		goto err_out_disk;
 
+
 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
 	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
 
@@ -5771,6 +5832,336 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 	return ret;
 }
 
+/*
+ * Event type tag stored as a 32-bit value at the start of every journal
+ * entry payload.  Only AIO_DISCARD, AIO_WRITE and AIO_FLUSH are handled by
+ * rbd_journal_replay(); every other value is rejected there with -EINVAL.
+ * NOTE(review): the numeric values presumably have to match the event
+ * types used by other journal clients (e.g. librbd) - confirm.
+ */
+enum rbd_journal_event_type {
+  EVENT_TYPE_AIO_DISCARD           = 0,
+  EVENT_TYPE_AIO_WRITE             = 1,
+  EVENT_TYPE_AIO_FLUSH             = 2,
+  EVENT_TYPE_OP_FINISH             = 3,
+  EVENT_TYPE_SNAP_CREATE           = 4,
+  EVENT_TYPE_SNAP_REMOVE           = 5,
+  EVENT_TYPE_SNAP_RENAME           = 6,
+  EVENT_TYPE_SNAP_PROTECT          = 7,
+  EVENT_TYPE_SNAP_UNPROTECT        = 8,
+  EVENT_TYPE_SNAP_ROLLBACK         = 9,
+  EVENT_TYPE_RENAME                = 10,
+  EVENT_TYPE_RESIZE                = 11,
+  EVENT_TYPE_FLATTEN               = 12,
+  EVENT_TYPE_DEMOTE_PROMOTE        = 13,
+  EVENT_TYPE_SNAP_LIMIT            = 14,
+  EVENT_TYPE_UPDATE_FEATURES       = 15,
+  EVENT_TYPE_METADATA_SET          = 16,
+  EVENT_TYPE_METADATA_REMOVE       = 17,
+  EVENT_TYPE_AIO_WRITESAME         = 18,
+  EVENT_TYPE_AIO_COMPARE_AND_WRITE = 19,
+};
+
+/*
+ * setup_write_bvecs - copy a flat buffer into freshly allocated pages
+ * @buf:    source data; @length bytes are read from it
+ * @offset: image offset of the write.  Only its in-page part is used, so
+ *          that the first bvec starts at the same page offset as the
+ *          image extent
+ * @length: number of bytes to copy
+ *
+ * Returns an array of calc_pages_for(@offset, @length) bio_vecs, each
+ * backed by its own page, or NULL if an allocation failed.  On success
+ * the caller owns both the array (kfree()) and every bv_page
+ * (__free_page()).  On failure nothing is left allocated.
+ */
+static struct bio_vec *setup_write_bvecs(void *buf, u64 offset, u64 length)
+{
+	struct bio_vec *bvecs;
+	u32 bvec_count;
+	u32 i;
+
+	bvec_count = calc_pages_for(offset, length);
+	bvecs = kcalloc(bvec_count, sizeof(*bvecs), GFP_NOIO);
+	if (!bvecs)
+		return NULL;
+
+	/* only the first page is entered at a non-zero offset */
+	offset = offset % PAGE_SIZE;
+	for (i = 0; i < bvec_count; i++) {
+		unsigned int len = min(length, (u64)PAGE_SIZE - offset);
+
+		bvecs[i].bv_page = alloc_page(GFP_NOIO);
+		if (!bvecs[i].bv_page)
+			goto free_bvecs;
+
+		bvecs[i].bv_offset = offset;
+		bvecs[i].bv_len = len;
+		memcpy(page_address(bvecs[i].bv_page) + bvecs[i].bv_offset,
+		       buf, bvecs[i].bv_len);
+		length -= len;
+		buf += len;
+		offset = 0;
+	}
+
+	rbd_assert(!length);
+
+	return bvecs;
+
+free_bvecs:
+	/* entries [0, i) own a page; entry i and later are still zeroed */
+	while (i--)
+		__free_page(bvecs[i].bv_page);
+	kfree(bvecs);
+	return NULL;
+}
+
+/*
+ * rbd_journal_handle_aio_discard - replay a journaled discard event
+ * @rbd_dev:    device the journal belongs to
+ * @p:          decode cursor, positioned just past the event type
+ * @end:        end of the entry payload
+ * @struct_v:   encoding version of the entry (currently unused)
+ * @commit_tid: commit tid reported to the journaler once the request ends
+ *
+ * Decodes the image offset and length, issues the discard against the
+ * image and waits for it to finish.
+ *
+ * Returns 0 on success, -EINVAL if the payload is truncated, -ENOMEM on
+ * allocation failure, the rbd_img_fill_nodata() error, or the (negative)
+ * result of an interrupted wait.
+ *
+ * NOTE(review): the wait is interruptible, so a non-zero return from it
+ * means the request may still be in flight.  The result of the I/O itself
+ * is not examined, because the image request drops its own reference when
+ * it ends.
+ */
+static int rbd_journal_handle_aio_discard(struct rbd_device *rbd_dev, void **p, void *end, u8 struct_v, uint64_t commit_tid)
+{
+	struct rbd_img_request *img_request;
+	struct ceph_snap_context *snapc;
+	uint64_t offset;
+	uint64_t length;
+	int result;
+
+	/* the payload comes from the journal: never read past @end */
+	ceph_decode_64_safe(p, end, offset, bad);
+	ceph_decode_64_safe(p, end, length, bad);
+
+	snapc = rbd_dev->header.snapc;
+	ceph_get_snap_context(snapc);
+
+	img_request = rbd_img_request_create(rbd_dev, OBJ_OP_DISCARD, snapc);
+	if (!img_request) {
+		/* nobody took over our snapc reference */
+		ceph_put_snap_context(snapc);
+		return -ENOMEM;
+	}
+	/* from here on img_request owns the snapc reference */
+	img_request->journaler_commit_tid = commit_tid;
+
+	result = rbd_img_fill_nodata(img_request, offset, length);
+	if (result) {
+		rbd_img_request_put(img_request);
+		return result;
+	}
+
+	rbd_img_request_submit(img_request);
+	return wait_for_completion_interruptible(&img_request->completion);
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * rbd_journal_handle_aio_write - replay a journaled write event
+ * @rbd_dev:    device the journal belongs to
+ * @p:          decode cursor, positioned just past the event type
+ * @end:        end of the entry payload
+ * @struct_v:   encoding version of the entry (currently unused)
+ * @commit_tid: commit tid reported to the journaler once the request ends
+ *
+ * Decodes the image offset, length and data, copies the data into
+ * private pages, issues the write against the image and waits for it to
+ * finish.
+ *
+ * Returns 0 on success, -EINVAL if the payload is truncated or its data
+ * length disagrees with the encoded write length, the error from
+ * ceph_extract_encoded_string(), -ENOMEM on allocation failure, the
+ * rbd_img_fill_from_bvecs() error, or the (negative) result of an
+ * interrupted wait.
+ *
+ * NOTE(review): the result of the I/O itself is not examined, because the
+ * image request drops its own reference when it ends.
+ */
+static int rbd_journal_handle_aio_write(struct rbd_device *rbd_dev, void **p, void *end, u8 struct_v, uint64_t commit_tid)
+{
+	struct rbd_img_request *img_request;
+	struct ceph_snap_context *snapc;
+	struct ceph_file_extent ex;
+	struct bio_vec *bvecs;
+	uint64_t offset;
+	uint64_t length;
+	size_t data_len;
+	char *data;
+	u32 nr_pages;
+	u32 i;
+	int result;
+
+	/* the payload comes from the journal: never read past @end */
+	ceph_decode_64_safe(p, end, offset, bad);
+	ceph_decode_64_safe(p, end, length, bad);
+
+	/* reports failure as an ERR_PTR, never as NULL */
+	data = ceph_extract_encoded_string(p, end, &data_len, GFP_NOIO);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	/* setup_write_bvecs() copies @length bytes out of @data */
+	if (length != data_len) {
+		kfree(data);
+		return -EINVAL;
+	}
+
+	nr_pages = calc_pages_for(offset, length);
+	bvecs = setup_write_bvecs(data, offset, length);
+	kfree(data);	/* contents now live in the bvec pages */
+	if (!bvecs) {
+		pr_err("failed to alloc bvecs.");
+		return -ENOMEM;
+	}
+
+	snapc = rbd_dev->header.snapc;
+	ceph_get_snap_context(snapc);
+
+	img_request = rbd_img_request_create(rbd_dev, OBJ_OP_WRITE, snapc);
+	if (!img_request) {
+		/* nobody took over our snapc reference */
+		ceph_put_snap_context(snapc);
+		result = -ENOMEM;
+		goto free_bvecs;
+	}
+	/* from here on img_request owns the snapc reference */
+	img_request->journaler_commit_tid = commit_tid;
+
+	ex.fe_off = offset;
+	ex.fe_len = length;
+
+	result = rbd_img_fill_from_bvecs(img_request, &ex, 1, bvecs);
+	if (result) {
+		rbd_img_request_put(img_request);
+		goto free_bvecs;
+	}
+
+	rbd_img_request_submit(img_request);
+	result = wait_for_completion_interruptible(&img_request->completion);
+	if (result) {
+		/*
+		 * Interrupted: the request may still be in flight and
+		 * referencing the pages, so they cannot be freed here.
+		 */
+		return result;
+	}
+
+free_bvecs:
+	for (i = 0; i < nr_pages; i++)
+		__free_page(bvecs[i].bv_page);
+	kfree(bvecs);
+	return result;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * rbd_journal_replay - journaler callback applying one journal entry
+ * @entry_handler: the rbd_device registered as journaler->entry_handler
+ * @entry:         entry to apply; its data/data_len are left untouched
+ * @commit_tid:    commit tid to report once the entry has been applied
+ *
+ * Decodes the entry header and event type and dispatches to the matching
+ * handler.  Flush events carry nothing to apply.
+ *
+ * Returns 0 on success, -EINVAL for a malformed entry or an unsupported
+ * event type, or the error returned by the event handler.
+ */
+static int rbd_journal_replay(void *entry_handler, struct ceph_journaler_entry *entry, uint64_t commit_tid)
+{
+	struct rbd_device *rbd_dev = entry_handler;
+	/* decode through a private cursor so entry->data is not advanced */
+	void *p = entry->data;
+	void *end = p + entry->data_len;
+	uint32_t event_type;
+	u8 struct_v;
+	u32 struct_len;
+	int ret;
+
+	ret = ceph_start_decoding(&p, end, 1, "rbd_decode_entry",
+				  &struct_v, &struct_len);
+	if (ret)
+		return -EINVAL;
+
+	ceph_decode_32_safe(&p, end, event_type, bad);
+
+	switch (event_type) {
+	case EVENT_TYPE_AIO_WRITE:
+		return rbd_journal_handle_aio_write(rbd_dev, &p, end,
+						    struct_v, commit_tid);
+	case EVENT_TYPE_AIO_DISCARD:
+		return rbd_journal_handle_aio_discard(rbd_dev, &p, end,
+						      struct_v, commit_tid);
+	case EVENT_TYPE_AIO_FLUSH:
+		return 0;
+	default:
+		pr_err("unknown event_type: %u", event_type);
+		return -EINVAL;
+	}
+
+bad:
+	return -EINVAL;
+}
+
+static int rbd_journal_allocate_tag(struct rbd_journal *journal);
+
+/*
+ * rbd_dev_open_journal - create, open and replay the image journal
+ * @rbd_dev: device whose image has the journaling feature enabled
+ *
+ * Creates a journaler for the image, registers rbd_journal_replay() as
+ * the entry handler, opens the journal, replays it and allocates the tag
+ * used for subsequent appends.
+ *
+ * Returns 0 with rbd_dev->journal set on success.  On failure a negative
+ * error code is returned, everything is torn down again and
+ * rbd_dev->journal is NULL.
+ */
+static int rbd_dev_open_journal(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_journaler *journaler;
+	struct rbd_journal *journal;
+	int ret;
+
+	journal = kzalloc(sizeof(*journal), GFP_KERNEL);
+	if (!journal)
+		return -ENOMEM;
+
+	journaler = ceph_journaler_create(osdc, &rbd_dev->header_oloc,
+					  rbd_dev->spec->image_id,
+					  LOCAL_CLIENT_ID);
+	if (!journaler) {
+		ret = -ENOMEM;
+		goto err_free_journal;
+	}
+
+	journaler->entry_handler = rbd_dev;
+	journaler->handle_entry = rbd_journal_replay;
+
+	ret = ceph_journaler_open(journaler);
+	if (ret)
+		goto err_destroy_journaler;
+
+	journal->journaler = journaler;
+	/*
+	 * Publish the journal before replay starts: replayed requests end
+	 * in rbd_img_end_request(), which dereferences rbd_dev->journal to
+	 * report the commit.
+	 */
+	rbd_dev->journal = journal;
+
+	ret = ceph_journaler_start_replay(journaler);
+	if (ret)
+		goto err_close_journaler;
+
+	ret = rbd_journal_allocate_tag(journal);
+	if (ret)
+		goto err_close_journaler;
+
+	return 0;
+
+err_close_journaler:
+	ceph_journaler_close(journaler);
+err_destroy_journaler:
+	ceph_journaler_destroy(journaler);
+err_free_journal:
+	/* free the local pointer: rbd_dev->journal may not have been set yet */
+	rbd_dev->journal = NULL;
+	kfree(journal);
+	return ret;
+}
+
+/*
+ * Close and destroy the journaler of @rbd_dev and free its journal state.
+ * Does nothing when no journal is open.
+ */
+static void rbd_dev_close_journal(struct rbd_device *rbd_dev)
+{
+	struct rbd_journal *journal = rbd_dev->journal;
+
+	if (journal) {
+		ceph_journaler_close(journal->journaler);
+		ceph_journaler_destroy(journal->journaler);
+		kfree(journal);
+		rbd_dev->journal = NULL;
+	}
+}
+
+/*
+ * Predecessor part of the tag data.  Encoded by predecessor_encode() as
+ * mirror_uuid string, commit_valid (1 byte), tag_tid, entry_tid.
+ */
+typedef struct rbd_journal_tag_predecessor {
+	bool commit_valid;	/* set when tag_tid/entry_tid come from a commit position */
+	uint64_t tag_tid;
+	uint64_t entry_tid;
+	uint32_t uuid_len;	/* length of mirror_uuid in bytes */
+	char *mirror_uuid;
+} rbd_journal_tag_predecessor;
+
+/*
+ * Tag data attached to a journal tag.  Encoded by
+ * rbd_journal_encode_tag_data() as mirror_uuid string followed by the
+ * predecessor.
+ */
+typedef struct rbd_journal_tag_data {
+	struct rbd_journal_tag_predecessor predecessor;
+	uint32_t uuid_len;	/* length of mirror_uuid in bytes */
+	char *mirror_uuid;
+} rbd_journal_tag_data;
+
+/*
+ * Number of bytes rbd_journal_encode_tag_data() emits for @tag_data: two
+ * length-prefixed uuid strings plus the fixed-size predecessor fields.
+ */
+static uint32_t tag_data_encoding_size(struct rbd_journal_tag_data *tag_data)
+{
+	uint32_t size = 0;
+
+	size += 4 + tag_data->uuid_len;			/* mirror_uuid string */
+	size += 4 + tag_data->predecessor.uuid_len;	/* predecessor mirror_uuid string */
+	size += 1;					/* predecessor commit_valid */
+	size += 8;					/* predecessor tag_tid */
+	size += 8;					/* predecessor entry_tid */
+
+	return size;
+}
+
+/*
+ * Encode @predecessor at *@p and advance the cursor.  The order of the
+ * fields is the encoded layout: mirror_uuid string, commit_valid as one
+ * byte, tag_tid, entry_tid.  @end is only handed to ceph_encode_string();
+ * the fixed-width fields are written without an end argument, so the
+ * caller must have sized the buffer (see tag_data_encoding_size()).
+ */
+static void predecessor_encode(void **p, void *end, struct rbd_journal_tag_predecessor *predecessor)
+{
+	ceph_encode_string(p, end, predecessor->mirror_uuid, predecessor->uuid_len);
+	ceph_encode_8(p, predecessor->commit_valid);
+	ceph_encode_64(p, predecessor->tag_tid);
+	ceph_encode_64(p, predecessor->entry_tid);
+}
+
+/*
+ * Encode @tag_data at *@p, advancing the cursor: the mirror uuid string
+ * first, then the predecessor.  Always returns 0.
+ */
+static int rbd_journal_encode_tag_data(void **p, void *end, struct rbd_journal_tag_data *tag_data)
+{
+	ceph_encode_string(p, end, tag_data->mirror_uuid, tag_data->uuid_len);
+	predecessor_encode(p, end, &tag_data->predecessor);
+
+	return 0;
+}
+
+/*
+ * rbd_journal_allocate_tag - allocate the tag used for our appends
+ * @journal: journal with an opened journaler
+ *
+ * Builds tag data whose predecessor points at the first commit position
+ * of the local client, encodes it and asks the journaler to allocate a
+ * tag carrying it.  On success the new tag tid is stored in
+ * @journal->tag_tid.
+ *
+ * Returns 0 on success or a negative error code; @journal->tag_tid is
+ * only written on success.
+ */
+static int rbd_journal_allocate_tag(struct rbd_journal *journal)
+{
+	struct ceph_journaler *journaler = journal->journaler;
+	struct rbd_journal_tag_predecessor *predecessor;
+	struct ceph_journaler_object_pos *position;
+	struct ceph_journaler_client *client;
+	struct rbd_journal_tag_data tag_data;
+	struct ceph_journaler_tag tag;
+	void *buf, *p, *end;
+	uint32_t buf_len;
+	int ret;
+
+	ret = ceph_journaler_get_cached_client(journaler, LOCAL_CLIENT_ID, &client);
+	if (ret)
+		return ret;
+
+	predecessor = &tag_data.predecessor;
+	if (list_empty(&client->object_positions)) {
+		/*
+		 * Nothing committed yet: list_first_entry() on an empty
+		 * list would hand back a bogus entry, so record that there
+		 * is no valid commit position instead.
+		 */
+		predecessor->commit_valid = false;
+		predecessor->tag_tid = 0;
+		predecessor->entry_tid = 0;
+	} else {
+		position = list_first_entry(&client->object_positions,
+					    struct ceph_journaler_object_pos,
+					    node);
+		predecessor->commit_valid = true;
+		predecessor->tag_tid = position->tag_tid;
+		predecessor->entry_tid = position->entry_tid;
+	}
+	predecessor->uuid_len = 0;
+	predecessor->mirror_uuid = LOCAL_MIRROR_UUID;
+
+	tag_data.uuid_len = 0;
+	tag_data.mirror_uuid = LOCAL_MIRROR_UUID;
+
+	buf_len = tag_data_encoding_size(&tag_data);
+
+	buf = kmalloc(buf_len, GFP_KERNEL);
+	if (!buf) {
+		pr_err("failed to allocate tag data");
+		return -ENOMEM;
+	}
+
+	p = buf;
+	end = buf + buf_len;
+	ret = rbd_journal_encode_tag_data(&p, end, &tag_data);
+	if (ret) {
+		pr_err("error in tag data");
+		goto out_free;
+	}
+
+	ret = ceph_journaler_allocate_tag(journaler, 0, buf, buf_len, &tag);
+	if (ret)
+		goto out_free;
+
+	/* tag is only filled in when the allocation succeeded */
+	journal->tag_tid = tag.tid;
+
+out_free:
+	/*
+	 * NOTE(review): assumes ceph_journaler_allocate_tag() does not keep
+	 * a reference to buf after it returns - confirm.
+	 */
+	kfree(buf);
+	return ret;
+}
+
 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 {
 	rbd_dev_unprobe(rbd_dev);
@@ -5781,6 +6172,145 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 	rbd_dev->spec->image_id = NULL;
 }
 
+/*
+ * In-memory view of a write event: type, image offset and length followed
+ * by the data bytes.
+ * NOTE(review): not referenced by any code visible in this patch; the
+ * append path encodes the event field by field instead.
+ */
+struct AioWriteEvent {
+	enum rbd_journal_event_type type;
+	uint64_t offset;
+	uint64_t length;
+	char	 data[0];	/* variable-length payload */
+};
+
+
+/*
+ * Completion callback for a journal append.  When the append succeeded
+ * the pending image request is submitted, under lock_rwsem held for read
+ * if the request was queued with must_be_locked set.  When it failed the
+ * block request is ended with the error and the image request is
+ * dropped.  The rbd_journal_ctx is freed in both cases.
+ */
+static void rbd_journal_callback(struct ceph_journaler_ctx *journaler_ctx)
+{
+	struct rbd_journal_ctx *jctx = journaler_ctx->priv;
+	struct rbd_device *rbd_dev = jctx->rbd_dev;
+	int err = journaler_ctx->result;
+
+	if (err) {
+		ceph_put_snap_context(jctx->snapc);
+		blk_mq_end_request(jctx->rq, errno_to_blk_status(err));
+		rbd_img_request_put(jctx->img_request);
+	} else if (jctx->must_be_locked) {
+		down_read(&rbd_dev->lock_rwsem);
+		rbd_img_request_submit(jctx->img_request);
+		up_read(&rbd_dev->lock_rwsem);
+	} else {
+		rbd_img_request_submit(jctx->img_request);
+	}
+
+	kfree(jctx);
+}
+
+/*
+ * rbd_journal_append_write_event - journal a write before it is issued
+ * @rbd_dev: device being written to
+ * @bio:     bio chain holding the data of the write
+ * @offset:  image offset of the write
+ * @length:  length of the write in bytes
+ * @ctx:     request context handed to rbd_journal_callback() on completion
+ *
+ * Encodes the event header into the tail of a private prefix page and
+ * hands the prefix plus the bio data to the journaler.  The data itself
+ * is not copied: only the length half of the encoded string is written
+ * into the prefix, the bytes are taken from @bio.
+ *
+ * Returns 0 once the append has been handed to the journaler, -ENOMEM if
+ * an allocation failed (nothing is left allocated in that case).
+ *
+ * NOTE(review): the return value of ceph_journaler_append(), if any, is
+ * not checked, and who frees bio_iter, the prefix page and journaler_ctx
+ * after a successful append is not visible here - confirm.
+ */
+static int rbd_journal_append_write_event(struct rbd_device *rbd_dev, struct bio *bio,
+					  u64 offset, u64 length, struct rbd_journal_ctx *ctx)
+{
+	struct ceph_journaler_ctx *journaler_ctx;
+	struct ceph_bio_iter *bio_iter;
+	void *p;
+
+	/*
+	 * We are in the block I/O path: GFP_NOIO keeps reclaim from
+	 * recursing into I/O that may depend on this very device.
+	 */
+	bio_iter = kzalloc(sizeof(*bio_iter), GFP_NOIO);
+	if (!bio_iter)
+		return -ENOMEM;
+
+	bio_iter->prefix_page = alloc_page(GFP_NOIO);
+	if (!bio_iter->prefix_page)
+		goto err_free_iter;
+
+	journaler_ctx = kmalloc(sizeof(*journaler_ctx), GFP_NOIO);
+	if (!journaler_ctx)
+		goto err_free_page;
+
+	memset(page_address(bio_iter->prefix_page), 0, PAGE_SIZE);
+
+	bio_iter->bio = bio;
+	bio_iter->iter = bio->bi_iter;
+	bio_iter->bio_len = length;
+
+	/*
+	 * encoding start block (6) + event type (4) + offset (8) +
+	 * length (8) + string length (4) = 30 bytes, placed at the end of
+	 * the prefix page.
+	 */
+	bio_iter->prefix_len = 6 + 4 + 8 + 8 + 4;
+	bio_iter->prefix_offset = PAGE_SIZE - bio_iter->prefix_len;
+
+	p = page_address(bio_iter->prefix_page) + bio_iter->prefix_offset;
+
+	/* struct_len excludes the 6-byte start block itself */
+	ceph_start_encoding(&p, 1, 1, bio_iter->prefix_len + bio_iter->bio_len - 6);
+
+	ceph_encode_32(&p, EVENT_TYPE_AIO_WRITE);
+
+	ceph_encode_64(&p, offset);
+	ceph_encode_64(&p, length);
+
+	/* first half of ceph_encode_string(): the length of the data */
+	ceph_encode_32(&p, bio_iter->bio_len);
+
+	journaler_ctx->priv = ctx;
+	journaler_ctx->callback = rbd_journal_callback;
+	journaler_ctx->bio_iter = bio_iter;
+
+	ceph_journaler_append(rbd_dev->journal->journaler, rbd_dev->journal->tag_tid,
+			      &ctx->img_request->journaler_commit_tid, journaler_ctx);
+	return 0;
+
+err_free_page:
+	__free_page(bio_iter->prefix_page);
+err_free_iter:
+	kfree(bio_iter);
+	return -ENOMEM;
+}
+
+/*
+ * rbd_journal_append_discard_event - journal a discard before it is issued
+ * @rbd_dev: device being discarded on
+ * @bio:     bio of the request; no data is taken from it (bio_len is 0)
+ * @offset:  image offset of the discard
+ * @length:  length of the discard in bytes
+ * @ctx:     request context handed to rbd_journal_callback() on completion
+ *
+ * Encodes the complete event into the tail of a private prefix page and
+ * hands it to the journaler.
+ *
+ * Returns 0 once the append has been handed to the journaler, -ENOMEM if
+ * an allocation failed (nothing is left allocated in that case).
+ *
+ * NOTE(review): the return value of ceph_journaler_append(), if any, is
+ * not checked, and who frees bio_iter, the prefix page and journaler_ctx
+ * after a successful append is not visible here - confirm.
+ */
+static int rbd_journal_append_discard_event(struct rbd_device *rbd_dev, struct bio *bio,
+					    u64 offset, u64 length, struct rbd_journal_ctx *ctx)
+{
+	struct ceph_journaler_ctx *journaler_ctx;
+	struct ceph_bio_iter *bio_iter;
+	void *p;
+
+	/*
+	 * We are in the block I/O path: GFP_NOIO keeps reclaim from
+	 * recursing into I/O that may depend on this very device.
+	 */
+	bio_iter = kzalloc(sizeof(*bio_iter), GFP_NOIO);
+	if (!bio_iter)
+		return -ENOMEM;
+
+	bio_iter->prefix_page = alloc_page(GFP_NOIO);
+	if (!bio_iter->prefix_page)
+		goto err_free_iter;
+
+	journaler_ctx = kmalloc(sizeof(*journaler_ctx), GFP_NOIO);
+	if (!journaler_ctx)
+		goto err_free_page;
+
+	bio_iter->bio = bio;
+	bio_iter->iter = bio->bi_iter;
+	bio_iter->bio_len = 0;
+
+	/*
+	 * encoding start block (6) + event type (4) + offset (8) +
+	 * length (8) = 26 bytes, placed at the end of the prefix page.
+	 * Every one of these bytes is written below.
+	 */
+	bio_iter->prefix_len = 6 + 4 + 8 + 8;
+	bio_iter->prefix_offset = PAGE_SIZE - bio_iter->prefix_len;
+
+	p = page_address(bio_iter->prefix_page) + bio_iter->prefix_offset;
+
+	/* struct_len excludes the 6-byte start block itself */
+	ceph_start_encoding(&p, 1, 1, bio_iter->prefix_len + bio_iter->bio_len - 6);
+
+	ceph_encode_32(&p, EVENT_TYPE_AIO_DISCARD);
+
+	ceph_encode_64(&p, offset);
+	ceph_encode_64(&p, length);
+
+	journaler_ctx->priv = ctx;
+	journaler_ctx->callback = rbd_journal_callback;
+	journaler_ctx->bio_iter = bio_iter;
+
+	ceph_journaler_append(rbd_dev->journal->journaler, rbd_dev->journal->tag_tid,
+			      &ctx->img_request->journaler_commit_tid, journaler_ctx);
+	return 0;
+
+err_free_page:
+	__free_page(bio_iter->prefix_page);
+err_free_iter:
+	kfree(bio_iter);
+	return -ENOMEM;
+}
+
+/*
+ * rbd_journal_append - journal a modifying request before issuing it
+ * @rbd_dev: device the request targets
+ * @bio:     bio chain of the request
+ * @offset:  image offset of the request
+ * @length:  length of the request in bytes
+ * @op_type: operation type; only writes and discards can be journaled
+ * @ctx:     request context handed to rbd_journal_callback() on completion
+ *
+ * Returns 0 once the event has been handed to the journaler, in which
+ * case the image request is submitted from rbd_journal_callback().
+ * Returns a negative error code if nothing was queued: -ENOMEM from the
+ * event encoders, or -EOPNOTSUPP for an operation type that has no
+ * journal event here.  Reporting success for such a type would leave the
+ * request neither submitted nor failed, since no callback would ever run.
+ */
+static int rbd_journal_append(struct rbd_device *rbd_dev, struct bio *bio,
+			      u64 offset, u64 length, enum obj_operation_type op_type,
+			      struct rbd_journal_ctx *ctx)
+{
+	switch (op_type) {
+	case OBJ_OP_WRITE:
+		return rbd_journal_append_write_event(rbd_dev, bio, offset, length, ctx);
+	case OBJ_OP_DISCARD:
+		return rbd_journal_append_discard_event(rbd_dev, bio, offset, length, ctx);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 /*
  * Probe for the existence of the header object for the given rbd
  * device.  If this image is the one being mapped (i.e., not a
@@ -5947,11 +6477,30 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 			goto err_out_device_setup;
 	}
 
+	if (rbd_dev->header.features & RBD_FEATURE_JOURNALING) {
+		set_bit(RBD_LOCK_FLAG_RELEASE_WAIT, &rbd_dev->lock_flags);
+		rc = rbd_add_acquire_lock(rbd_dev);
+		if (rc) {
+			clear_bit(RBD_LOCK_FLAG_RELEASE_WAIT, &rbd_dev->lock_flags);
+			complete_all(&rbd_dev->lock_wait);
+			goto err_out_device_setup;
+		}
+
+		rc = rbd_dev_open_journal(rbd_dev);
+		if (rc) {
+			clear_bit(RBD_LOCK_FLAG_RELEASE_WAIT, &rbd_dev->lock_flags);
+			complete_all(&rbd_dev->lock_wait);
+			goto err_out_image_lock;
+		}
+		clear_bit(RBD_LOCK_FLAG_RELEASE_WAIT, &rbd_dev->lock_flags);
+		complete_all(&rbd_dev->lock_wait);
+	}
+
 	/* Everything's ready.  Announce the disk to the world. */
 
 	rc = device_add(&rbd_dev->dev);
 	if (rc)
-		goto err_out_image_lock;
+		goto err_out_close_journal;
 
 	add_disk(rbd_dev->disk);
 	/* see rbd_init_disk() */
@@ -5969,6 +6518,8 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 	module_put(THIS_MODULE);
 	return rc;
 
+err_out_close_journal:
+	rbd_dev_close_journal(rbd_dev);
 err_out_image_lock:
 	rbd_dev_image_unlock(rbd_dev);
 err_out_device_setup:
@@ -6095,6 +6646,7 @@ static ssize_t do_rbd_remove(struct bus_type *bus,
 	device_del(&rbd_dev->dev);
 
 	rbd_dev_image_unlock(rbd_dev);
+	rbd_dev_close_journal(rbd_dev);
 	rbd_dev_device_release(rbd_dev);
 	rbd_dev_image_release(rbd_dev);
 	rbd_dev_destroy(rbd_dev);
-- 
1.8.3.1