This commit introduces 3 APIs for journal recording: (1) ceph_journaler_allocate_tag() This API allocates a new tag so that the user gets a unified tag_tid. Each event appended by this user will then be tagged with this tag_tid. (2) ceph_journaler_append() This API allows the user to append an event to the journal objects. (3) ceph_journaler_client_committed() This API notifies the journal that an event has been committed, so it can be removed from the journal if no other client refers to it. Signed-off-by: Dongsheng Yang <dongsheng.yang@xxxxxxxxxxxx> --- include/linux/ceph/journaler.h | 12 + net/ceph/journaler.c | 678 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 690 insertions(+) diff --git a/include/linux/ceph/journaler.h b/include/linux/ceph/journaler.h index acc1c6f..a8c1838 100644 --- a/include/linux/ceph/journaler.h +++ b/include/linux/ceph/journaler.h @@ -132,6 +132,7 @@ struct ceph_journaler { struct workqueue_struct *task_wq; struct workqueue_struct *finish_wq; + struct workqueue_struct *notify_wq; struct work_struct notify_update_work; struct work_struct commit_work; struct work_struct finish_work; @@ -161,4 +162,15 @@ int ceph_journaler_get_cached_client(struct ceph_journaler *journaler, char *cli struct ceph_journaler_client **client_result); // replaying int ceph_journaler_start_replay(struct ceph_journaler *journaler); + +// recording +int ceph_journaler_append(struct ceph_journaler *journaler, + uint64_t tag_tid, uint64_t *commit_tid, + struct ceph_journaler_ctx *ctx); +void ceph_journaler_client_committed(struct ceph_journaler *journaler, + uint64_t commit_tid); +int ceph_journaler_allocate_tag(struct ceph_journaler *journaler, + uint64_t tag_class, void *buf, + uint32_t buf_len, + struct ceph_journaler_tag *tag); #endif diff --git a/net/ceph/journaler.c b/net/ceph/journaler.c index 3b73725..e0dce2a 100644 --- a/net/ceph/journaler.c +++ b/net/ceph/journaler.c @@ -28,6 +28,11 @@ static char *object_oid_prefix(int pool_id, const char *journal_id) } static 
void watch_cb_func(struct work_struct *work); +static void journaler_flush(struct work_struct *work); +static void journaler_finish(struct work_struct *work); +static void journaler_client_commit(struct work_struct *work); +static void journaler_notify_update(struct work_struct *work); +static void journaler_overflow(struct work_struct *work); struct ceph_journaler *ceph_journaler_create(struct ceph_osd_client *osdc, struct ceph_object_locator *oloc, @@ -98,6 +103,11 @@ struct ceph_journaler *ceph_journaler_create(struct ceph_osd_client *osdc, spin_lock_init(&journaler->finish_lock); INIT_WORK(&journaler->watch_cb_work, watch_cb_func); + INIT_WORK(&journaler->flush_work, journaler_flush); + INIT_WORK(&journaler->finish_work, journaler_finish); + INIT_WORK(&journaler->commit_work, journaler_client_commit); + INIT_WORK(&journaler->notify_update_work, journaler_notify_update); + INIT_WORK(&journaler->overflow_work, journaler_overflow); return journaler; @@ -1046,3 +1056,671 @@ int ceph_journaler_start_replay(struct ceph_journaler *journaler) return ret; } EXPORT_SYMBOL(ceph_journaler_start_replay); + +// recording +// TODO use kmem_cache for it. 
+/*
+ * Size of the fixed entry header that is written in front of the payload:
+ * PREAMBLE(8) + version(1) + entry_tid(8) + tag_tid(8) + data_len(4).
+ */
+#define JOURNALER_ENTRY_PREFIX_LEN	29
+/* Size of the crc32c that is appended after the payload. */
+#define JOURNALER_ENTRY_CRC_LEN		4
+
+/*
+ * Per-append bookkeeping that travels with one OSD write request.
+ * It is created by ceph_journaler_append() and freed once the append has
+ * completed (successfully or not); while an append is bounced with
+ * -EOVERFLOW it sits on the recorder's overflow_list via @node.
+ */
+struct journaler_write_ctx {
+	struct list_head node;
+
+	struct ceph_journaler *journaler;
+	struct ceph_journaler_future *future;
+	struct ceph_journaler_ctx *journaler_ctx;
+
+	uint64_t splay_offset;
+	uint64_t object_num;
+
+	struct page *req_page;
+	struct ceph_bio_iter *bio_iter;
+};
+
+/*
+ * Hand out the next entry_tid for @tag_tid, creating the per-tag counter
+ * on first use.
+ *
+ * The only caller holds journaler->meta_lock and this function takes
+ * entry_tid_lock itself, so the allocation must not sleep: GFP_ATOMIC.
+ *
+ * Returns 0 and stores the tid in @entry_tid, or -ENOMEM.
+ */
+static int allocate_entry_tid(struct ceph_journaler *journaler,
+			      uint64_t tag_tid, uint64_t *entry_tid)
+{
+	struct entry_tid *pos;
+	int ret = 0;
+
+	spin_lock(&journaler->entry_tid_lock);
+	list_for_each_entry(pos, &journaler->entry_tids, node) {
+		if (pos->tag_tid == tag_tid)
+			goto found;
+	}
+
+	pos = kzalloc(sizeof(*pos), GFP_ATOMIC);
+	if (!pos) {
+		pr_err("failed to allocate new entry.");
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	/* kzalloc() already left pos->entry_tid at 0. */
+	pos->tag_tid = tag_tid;
+	list_add_tail(&pos->node, &journaler->entry_tids);
+found:
+	*entry_tid = pos->entry_tid++;
+unlock:
+	spin_unlock(&journaler->entry_tid_lock);
+	return ret;
+}
+
+/* Map a splay offset to the object number inside the currently active set. */
+static uint64_t get_object(struct ceph_journaler *journaler, uint64_t splay_offset)
+{
+	return splay_offset + (journaler->splay_width * journaler->active_set);
+}
+
+/*
+ * Allocate a zeroed future bound to @ctx.  The tids are filled in by the
+ * caller once they have been assigned under meta_lock, which is why this
+ * helper only allocates: it has to run before the spinlock is taken.
+ * GFP_NOIO matches the request allocation in ceph_journaler_obj_write().
+ *
+ * Returns NULL on allocation failure.
+ */
+static struct ceph_journaler_future *create_future(struct ceph_journaler_ctx *ctx)
+{
+	struct ceph_journaler_future *future;
+
+	future = kzalloc(sizeof(*future), GFP_NOIO);
+	if (future)
+		future->ctx = ctx;
+
+	return future;
+}
+
+/*
+ * Chain @future behind the previously appended one.  A future is
+ * "consistent" once every earlier future has completed; if the predecessor
+ * is not done yet it remembers @future in ->wait and flips it later.
+ *
+ * Must be called with journaler->meta_lock held.
+ */
+static void set_prev_future(struct ceph_journaler *journaler,
+			    struct ceph_journaler_future *future)
+{
+	struct ceph_journaler_future *prev = journaler->prev_future;
+
+	if (!prev || (prev->consistent && prev->safe))
+		future->consistent = true;
+	else
+		prev->wait = future;
+
+	journaler->prev_future = future;
+}
+
+/*
+ * Encode the fixed entry header at *@p and advance *@p past it.
+ * Writes JOURNALER_ENTRY_PREFIX_LEN bytes; @end is currently not checked.
+ */
+static void journaler_entry_encode_prefix(struct ceph_journaler_entry *entry,
+					  void **p, void *end)
+{
+	ceph_encode_64(p, PREAMBLE);
+	ceph_encode_8(p, (uint8_t)1);
+	ceph_encode_64(p, entry->entry_tid);
+	ceph_encode_64(p, entry->tag_tid);
+
+	ceph_encode_32(p, entry->data_len);
+}
+
+/*
+ * Fold every segment of @bio, and of the bios chained behind it through
+ * ->bi_next, into @crc.
+ *
+ * NOTE(review): page_address() only works for pages that have a kernel
+ * mapping; confirm highmem pages cannot reach this path.
+ */
+static uint32_t crc_bio(uint32_t crc, struct bio *bio)
+{
+	struct bio_vec bv;
+	struct bvec_iter iter;
+
+	for (; bio; bio = bio->bi_next) {
+		bio_for_each_segment(bv, bio, iter) {
+			char *buf = page_address(bv.bv_page) + bv.bv_offset;
+
+			crc = crc32c(crc, buf, bv.bv_len);
+		}
+	}
+
+	return crc;
+}
+
+/*
+ * Work item: run the completion callback of every finished append.
+ *
+ * The pending list is detached under finish_lock and the callbacks run
+ * without it, so a callback may sleep or start another append without
+ * deadlocking on the spinlock.
+ */
+static void journaler_finish(struct work_struct *work)
+{
+	struct ceph_journaler *journaler = container_of(work, struct ceph_journaler,
+							 finish_work);
+	struct ceph_journaler_ctx *ctx_pos, *next;
+	LIST_HEAD(done);
+
+	spin_lock(&journaler->finish_lock);
+	list_splice_init(&journaler->ctx_list, &done);
+	spin_unlock(&journaler->finish_lock);
+
+	list_for_each_entry_safe(ctx_pos, next, &done, node) {
+		/* Unlink first: the callback may free the ctx. */
+		list_del(&ctx_pos->node);
+		ctx_pos->callback(ctx_pos);
+	}
+}
+
+/*
+ * Complete @future if it is both safe and consistent, then walk the ->wait
+ * chain: every successor becomes consistent and is completed as well if it
+ * is already safe.  Completed futures are freed here.
+ *
+ * Must be called with journaler->meta_lock held; that lock also serializes
+ * against set_prev_future(), which writes ->wait and ->consistent.
+ */
+static void future_finish(struct ceph_journaler *journaler,
+			  struct ceph_journaler_future *future)
+{
+	while (future && future->safe && future->consistent) {
+		struct ceph_journaler_future *next = future->wait;
+
+		spin_lock(&journaler->finish_lock);
+		list_add_tail(&future->ctx->node, &journaler->ctx_list);
+		spin_unlock(&journaler->finish_lock);
+
+		queue_work(journaler->finish_wq, &journaler->finish_work);
+
+		/*
+		 * Nothing refers to a completed future any more except
+		 * possibly prev_future; a NULL prev_future makes the next
+		 * append consistent right away, which is what a completed
+		 * predecessor means.
+		 */
+		if (journaler->prev_future == future)
+			journaler->prev_future = NULL;
+		kfree(future);
+
+		if (next)
+			next->consistent = true;
+		future = next;
+	}
+}
+
+/* Mark @future as written (or failed) and complete whatever became ready. */
+static void future_safe(struct ceph_journaler *journaler,
+			struct ceph_journaler_future *future)
+{
+	spin_lock(&journaler->meta_lock);
+	future->safe = true;
+	future_finish(journaler, future);
+	spin_unlock(&journaler->meta_lock);
+}
+
+/* Work item: send a notify on the journal header object. */
+static void journaler_notify_update(struct work_struct *work)
+{
+	struct ceph_journaler *journaler = container_of(work,
+							 struct ceph_journaler,
+							 notify_update_work);
+	int ret;
+
+	ret = ceph_osdc_notify(journaler->osdc, &journaler->header_oid,
+			       &journaler->header_oloc, NULL, 0,
+			       5000, NULL, NULL);
+	if (ret)
+		pr_err("notify_update failed: %d", ret);
+}
+
+/*
+ * Try to move the journal to the next object set.
+ *
+ * Returns false when an advance is already running or when any recorder
+ * still has appends in flight; otherwise sets ->advancing, asks the header
+ * object for active_set + 1, queues a notify and returns true.
+ *
+ * NOTE(review): ->advancing is not cleared here, not even when the cls call
+ * fails; presumably the header refresh path does that -- confirm.
+ */
+static bool advance_object_set(struct ceph_journaler *journaler)
+{
+	struct object_recorder *obj_recorder;
+	uint64_t active_set;
+	bool busy = false;
+	int ret;
+	int i;
+
+	spin_lock(&journaler->advancing_lock);
+	if (journaler->advancing) {
+		spin_unlock(&journaler->advancing_lock);
+		return false;
+	}
+
+	/* make sure all inflight appending finish */
+	for (i = 0; i < journaler->splay_width && !busy; i++) {
+		obj_recorder = &journaler->obj_recorders[i];
+		spin_lock(&obj_recorder->lock);
+		if (obj_recorder->inflight_append)
+			busy = true;
+		spin_unlock(&obj_recorder->lock);
+	}
+	if (busy) {
+		spin_unlock(&journaler->advancing_lock);
+		return false;
+	}
+
+	journaler->advancing = true;
+	active_set = journaler->active_set + 1;
+	spin_unlock(&journaler->advancing_lock);
+
+	ret = ceph_cls_journaler_set_active_set(journaler->osdc,
+			&journaler->header_oid, &journaler->header_oloc,
+			active_set);
+	if (ret)
+		pr_err("error in set active_set: %d", ret);
+
+	queue_work(journaler->task_wq, &journaler->notify_update_work);
+
+	return true;
+}
+
+/* Work item: an object overflowed; advance the set and re-flush. */
+static void journaler_overflow(struct work_struct *work)
+{
+	struct ceph_journaler *journaler = container_of(work,
+							 struct ceph_journaler,
+							 overflow_work);
+
+	if (advance_object_set(journaler))
+		queue_work(journaler->task_wq, &journaler->flush_work);
+}
+
+/*
+ * Drop one in-flight append from @obj_recorder.  When the last one is gone
+ * and overflowed appends are waiting, kick the overflow work so the object
+ * set gets advanced.
+ *
+ * Must be called with obj_recorder->lock held.
+ */
+static void recorder_put_inflight(struct ceph_journaler *journaler,
+				  struct object_recorder *obj_recorder)
+{
+	if (--obj_recorder->inflight_append == 0 &&
+	    !list_empty(&obj_recorder->overflow_list))
+		queue_work(journaler->task_wq, &journaler->overflow_work);
+}
+
+/*
+ * Finish one append with @result (0 or a negative errno) and free @ctx.
+ *
+ * A commit entry is only recorded for a successful write, so a failed
+ * write is reported to the caller instead of being masked by the return
+ * value of add_commit_entry().  Everything hanging off the caller's ctx is
+ * released before future_safe(): once the future is safe the completion
+ * callback may run and free the ctx.
+ */
+static void journaler_append_done(struct journaler_write_ctx *ctx, int result)
+{
+	struct ceph_journaler *journaler = ctx->journaler;
+	struct ceph_journaler_ctx *journaler_ctx = ctx->journaler_ctx;
+	struct ceph_journaler_future *future = ctx->future;
+	struct ceph_bio_iter *bio_iter = journaler_ctx->bio_iter;
+
+	if (!result) {
+		result = add_commit_entry(journaler, future->commit_tid,
+					  ctx->object_num, future->tag_tid,
+					  future->entry_tid);
+		if (result)
+			pr_err("failed to add_commit_entry");
+	}
+
+	if (bio_iter->prefix_page)
+		__free_page(bio_iter->prefix_page);
+	if (bio_iter->suffix_page)
+		__free_page(bio_iter->suffix_page);
+
+	journaler_ctx->result = result;
+	kfree(ctx);
+
+	future_safe(journaler, future);
+}
+
+/*
+ * OSD completion callback of an append request.
+ *
+ * -EOVERFLOW means the target object is full: the append is parked on the
+ * recorder's overflow_list and re-sent by journaler_flush() after the
+ * object set has been advanced.  Any other result completes the append.
+ */
+static void journaler_write_callback(struct ceph_osd_request *osd_req)
+{
+	struct journaler_write_ctx *ctx = osd_req->r_priv;
+	struct ceph_journaler *journaler = ctx->journaler;
+	struct object_recorder *obj_recorder =
+			&journaler->obj_recorders[ctx->splay_offset];
+	int ret = osd_req->r_result;
+
+	__free_page(ctx->req_page);
+	ctx->req_page = NULL;
+	ceph_osdc_put_request(osd_req);
+
+	spin_lock(&obj_recorder->lock);
+	if (ret == -EOVERFLOW) {
+		obj_recorder->overflowed = true;
+		list_add_tail(&ctx->node, &obj_recorder->overflow_list);
+		recorder_put_inflight(journaler, obj_recorder);
+		spin_unlock(&obj_recorder->lock);
+		return;
+	}
+	recorder_put_inflight(journaler, obj_recorder);
+	spin_unlock(&obj_recorder->lock);
+
+	journaler_append_done(ctx, ret);
+}
+
+/*
+ * Build and start the OSD request for one append:
+ *   op 0: cls "journal"."guard_append" with the object size limit,
+ *   op 1: CEPH_OSD_OP_APPEND of prefix + bio + suffix.
+ *
+ * On success the request is in flight and journaler_write_callback() owns
+ * @ctx.  On failure nothing was sent, ctx->req_page has been released and
+ * a negative errno is returned.
+ */
+static int ceph_journaler_obj_write(struct ceph_journaler *journaler,
+				    struct ceph_object_id *oid,
+				    struct ceph_object_locator *oloc,
+				    struct journaler_write_ctx *ctx)
+
+{
+	struct ceph_osd_client *osdc = journaler->osdc;
+	struct ceph_bio_iter *bio_iter = ctx->bio_iter;
+	u64 len = bio_iter->prefix_len + bio_iter->bio_len + bio_iter->suffix_len;
+	struct ceph_osd_request *req;
+	void *p;
+	int ret;
+
+	req = ceph_osdc_alloc_request(osdc, NULL, 2, false, GFP_NOIO);
+	if (!req)
+		return -ENOMEM;
+
+	ceph_oid_copy(&req->r_base_oid, oid);
+	ceph_oloc_copy(&req->r_base_oloc, oloc);
+	req->r_flags = CEPH_OSD_FLAG_WRITE;
+	req->r_callback = journaler_write_callback;
+	req->r_priv = ctx;
+
+	/* guard_append */
+	ctx->req_page = alloc_page(GFP_NOIO);
+	if (!ctx->req_page) {
+		ret = -ENOMEM;
+		goto out_req;
+	}
+
+	p = page_address(ctx->req_page);
+	/* 64-bit shift: a plain "1 << order" is an int and overflows. */
+	ceph_encode_64(&p, 1ULL << journaler->order);
+
+	ret = osd_req_op_cls_init(req, 0, "journal", "guard_append");
+	if (ret)
+		goto out_free_page;
+
+	osd_req_op_cls_request_data_pages(req, 0, &ctx->req_page, 8, 0, false, false);
+
+	/* append_data */
+	osd_req_op_extent_init(req, 1, CEPH_OSD_OP_APPEND, 0, len, 0, 0);
+	osd_req_op_extent_osd_data_bio(req, 1, bio_iter, len);
+
+	/* Same allocation context as the request itself. */
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	if (ret)
+		goto out_free_page;
+
+	ceph_osdc_start_request(osdc, req, false);
+	return 0;
+
+out_free_page:
+	__free_page(ctx->req_page);
+	ctx->req_page = NULL;
+out_req:
+	ceph_osdc_put_request(req);
+	return ret;
+}
+
+/*
+ * Send the append described by @ctx to journal data object @object_num.
+ * Returns 0 when the request was started, a negative errno otherwise.
+ */
+static int send_append_request(struct ceph_journaler *journaler,
+			       uint64_t object_num,
+			       struct journaler_write_ctx *ctx)
+{
+	struct ceph_object_id object_oid;
+	int ret;
+
+	ceph_oid_init(&object_oid);
+	ret = ceph_oid_aprintf(&object_oid, GFP_NOIO, "%s%llu",
+			       journaler->object_oid_prefix, object_num);
+	if (ret) {
+		pr_err("failed to initialize object id: %d", ret);
+		goto out;
+	}
+
+	ret = ceph_journaler_obj_write(journaler, &object_oid,
+				       &journaler->data_oloc, ctx);
+out:
+	ceph_oid_destroy(&object_oid);
+	return ret;
+}
+
+/*
+ * Work item: send every queued append (overflowed ones first) of every
+ * recorder, unless an object-set advance is in progress.
+ *
+ * The in-flight counter is raised before anything is sent and under the
+ * same locks advance_object_set() uses, so neither a fast completion nor a
+ * concurrent advance can observe a too-small count.  Each ctx is unlinked
+ * before it is sent because completion may free it or put it on the
+ * overflow list.  An append that cannot be sent is completed with the
+ * error instead of being dropped.
+ */
+static void journaler_flush(struct work_struct *work)
+{
+	struct ceph_journaler *journaler = container_of(work,
+							 struct ceph_journaler,
+							 flush_work);
+	struct journaler_write_ctx *ctx, *next;
+	struct object_recorder *obj_recorder;
+	int ret;
+	int i;
+
+	for (i = 0; i < journaler->splay_width; i++) {
+		LIST_HEAD(tmp);
+
+		obj_recorder = &journaler->obj_recorders[i];
+
+		spin_lock(&journaler->advancing_lock);
+		if (journaler->advancing) {
+			spin_unlock(&journaler->advancing_lock);
+			return;
+		}
+		spin_lock(&obj_recorder->lock);
+		list_splice_tail_init(&obj_recorder->overflow_list, &tmp);
+		list_splice_tail_init(&obj_recorder->append_list, &tmp);
+		list_for_each_entry(ctx, &tmp, node)
+			obj_recorder->inflight_append++;
+		spin_unlock(&obj_recorder->lock);
+		spin_unlock(&journaler->advancing_lock);
+
+		list_for_each_entry_safe(ctx, next, &tmp, node) {
+			list_del_init(&ctx->node);
+			ctx->object_num = get_object(journaler,
+						     obj_recorder->splay_offset);
+			ret = send_append_request(journaler, ctx->object_num, ctx);
+			if (!ret)
+				continue;	/* ctx now belongs to the callback */
+
+			pr_err("failed to send append request: %d", ret);
+
+			spin_lock(&obj_recorder->lock);
+			recorder_put_inflight(journaler, obj_recorder);
+			spin_unlock(&obj_recorder->lock);
+
+			journaler_append_done(ctx, ret);
+		}
+	}
+}
+
+/*
+ * Frame the caller's data as a journal entry and queue it for sending.
+ *
+ * The header is written into the JOURNALER_ENTRY_PREFIX_LEN bytes directly
+ * in front of the caller's prefix data, the crc32c over header + prefix +
+ * bio data goes into @suffix_page.  All resources were allocated by the
+ * caller, so this step cannot fail.
+ */
+static void ceph_journaler_object_append(struct ceph_journaler *journaler,
+					 struct journaler_write_ctx *ctx,
+					 struct ceph_journaler_entry *entry,
+					 struct page *suffix_page)
+{
+	struct ceph_bio_iter *bio_iter = ctx->bio_iter;
+	struct object_recorder *obj_recorder;
+	uint32_t crc = 0;
+	void *buf;
+	void *end;
+
+	bio_iter->prefix_offset -= JOURNALER_ENTRY_PREFIX_LEN;
+	bio_iter->prefix_len += JOURNALER_ENTRY_PREFIX_LEN;
+	buf = page_address(bio_iter->prefix_page) + bio_iter->prefix_offset;
+	end = buf + JOURNALER_ENTRY_PREFIX_LEN;
+	journaler_entry_encode_prefix(entry, &buf, end);
+
+	bio_iter->suffix_page = suffix_page;
+	bio_iter->suffix_offset = 0;
+	bio_iter->suffix_len = JOURNALER_ENTRY_CRC_LEN;
+
+	crc = crc32c(crc, page_address(bio_iter->prefix_page) + bio_iter->prefix_offset,
+		     bio_iter->prefix_len);
+	if (bio_iter->bio_len)
+		crc = crc_bio(crc, bio_iter->bio);
+
+	buf = page_address(suffix_page);
+	ceph_encode_32(&buf, crc);
+
+	obj_recorder = &journaler->obj_recorders[ctx->splay_offset];
+	spin_lock(&obj_recorder->lock);
+	list_add_tail(&ctx->node, &obj_recorder->append_list);
+	spin_unlock(&obj_recorder->lock);
+
+	queue_work(journaler->task_wq, &journaler->flush_work);
+}
+
+/*
+ * ceph_journaler_append - append one event to the journal
+ * @journaler:  journal to append to
+ * @tag_tid:    tag the event is recorded under
+ * @commit_tid: out, commit tid assigned to the event; pass it to
+ *              ceph_journaler_client_committed() later
+ * @ctx:        caller context; ctx->bio_iter describes the payload and
+ *              ctx->callback runs once the append has completed, with
+ *              ctx->result holding 0 or a negative errno
+ *
+ * ctx->bio_iter->prefix_page must leave JOURNALER_ENTRY_PREFIX_LEN bytes
+ * free in front of prefix_offset for the entry header.
+ *
+ * Everything that can fail is allocated before the tids are assigned and
+ * the future is linked into the ordering chain, so a failure never leaves
+ * a future behind that later appends would wait on forever.
+ *
+ * Returns 0 when the append was queued (the callback will run), -EINVAL
+ * for an unusable prefix, -ENOMEM on allocation failure.  On error the
+ * callback is not invoked and @ctx is left untouched.
+ */
+int ceph_journaler_append(struct ceph_journaler *journaler, uint64_t tag_tid,
+			  uint64_t *commit_tid, struct ceph_journaler_ctx *ctx)
+{
+	struct ceph_bio_iter *bio_iter = ctx->bio_iter;
+	struct ceph_journaler_entry entry = {
+		.tag_tid = tag_tid,
+		.bio_iter = bio_iter,
+	};
+	struct ceph_journaler_future *future;
+	struct journaler_write_ctx *write_ctx;
+	struct page *suffix_page;
+	uint64_t entry_tid;
+	int ret;
+
+	if (!bio_iter->prefix_page ||
+	    bio_iter->prefix_offset < JOURNALER_ENTRY_PREFIX_LEN)
+		return -EINVAL;
+
+	/* Sleeping allocations first, outside of any spinlock. */
+	future = create_future(ctx);
+	write_ctx = kzalloc(sizeof(*write_ctx), GFP_NOIO);
+	suffix_page = alloc_page(GFP_NOIO);
+	if (!future || !write_ctx || !suffix_page) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	spin_lock(&journaler->meta_lock);
+	ret = allocate_entry_tid(journaler, tag_tid, &entry_tid);
+	if (ret) {
+		spin_unlock(&journaler->meta_lock);
+		goto err_free;
+	}
+	*commit_tid = allocate_commit_tid(journaler);
+
+	future->tag_tid = tag_tid;
+	future->entry_tid = entry_tid;
+	future->commit_tid = *commit_tid;
+	set_prev_future(journaler, future);
+	spin_unlock(&journaler->meta_lock);
+
+	/* Payload length as seen before the header is prepended. */
+	entry.entry_tid = entry_tid;
+	entry.data_len = bio_iter->bio_len +
+			 bio_iter->prefix_len + bio_iter->suffix_len;
+
+	write_ctx->journaler = journaler;
+	write_ctx->future = future;
+	write_ctx->journaler_ctx = ctx;
+	write_ctx->bio_iter = bio_iter;
+	write_ctx->splay_offset = entry_tid % journaler->splay_width;
+	INIT_LIST_HEAD(&write_ctx->node);
+
+	ceph_journaler_object_append(journaler, write_ctx, &entry, suffix_page);
+	return 0;
+
+err_free:
+	if (suffix_page)
+		__free_page(suffix_page);
+	kfree(write_ctx);
+	kfree(future);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_journaler_append);
+
+/*
+ * Work item: push the pending commit positions of this client to the
+ * journal header and notify watchers.
+ *
+ * The positions are copied under commit_lock and sent without it.  If a
+ * copy fails nothing is sent; the pending list is left as it is.
+ */
+static void journaler_client_commit(struct work_struct *work)
+{
+	struct ceph_journaler *journaler = container_of(work, struct ceph_journaler,
+							 commit_work);
+	struct ceph_journaler_object_pos *pos, *next;
+	LIST_HEAD(object_positions);
+	int ret = 0;
+
+	spin_lock(&journaler->commit_lock);
+	list_for_each_entry(pos, &journaler->object_positions_pending, node) {
+		struct ceph_journaler_object_pos *new_pos = NULL;
+
+		ret = copy_object_pos(pos, &new_pos);
+		if (!ret && !new_pos)
+			ret = -ENOMEM;
+		if (ret)
+			break;
+		list_add_tail(&new_pos->node, &object_positions);
+	}
+	spin_unlock(&journaler->commit_lock);
+
+	if (ret) {
+		pr_err("failed to copy object positions: %d", ret);
+		goto out_free;
+	}
+
+	ret = ceph_cls_journaler_client_committed(journaler->osdc,
+			&journaler->header_oid, &journaler->header_oloc,
+			journaler->client, &object_positions);
+	if (ret)
+		pr_err("error in client committed: %d", ret);
+
+	queue_work(journaler->notify_wq, &journaler->notify_update_work);
+
+out_free:
+	list_for_each_entry_safe(pos, next, &object_positions, node) {
+		list_del(&pos->node);
+		kfree(pos);
+	}
+}
+
+/*
+ * Record @entry as the newest commit position of its splay offset in
+ * @object_positions and move that position to the front of the list.
+ *
+ * Called with journaler->commit_lock held, hence GFP_ATOMIC.
+ * Returns 0 or -ENOMEM.
+ */
+static int add_object_position(struct commit_entry *entry,
+			       struct list_head *object_positions,
+			       uint64_t splay_width)
+{
+	struct ceph_journaler_object_pos *position;
+	uint64_t splay_offset = entry->object_num % splay_width;
+	bool found = false;
+
+	list_for_each_entry(position, object_positions, node) {
+		if (splay_offset == position->object_num % splay_width) {
+			found = true;
+			break;
+		}
+	}
+
+	if (found) {
+		list_move(&position->node, object_positions);
+	} else {
+		position = kzalloc(sizeof(*position), GFP_ATOMIC);
+		if (!position) {
+			pr_err("failed to allocate position");
+			return -ENOMEM;
+		}
+		list_add(&position->node, object_positions);
+	}
+
+	position->object_num = entry->object_num;
+	position->tag_tid = entry->tag_tid;
+	position->entry_tid = entry->entry_tid;
+
+	return 0;
+}
+
+/*
+ * ceph_journaler_client_committed - report that an event has been applied
+ * @journaler:  journal the event belongs to
+ * @commit_tid: value returned by ceph_journaler_append() for the event
+ *
+ * Marks the matching commit entry as committed, then retires the leading
+ * run of committed entries: each becomes the pending commit position of
+ * its splay offset and is removed from the tree.  Entries behind an
+ * uncommitted one stay until that one is committed too.  If a position
+ * cannot be recorded the entry is kept so a later call retries it.
+ *
+ * NOTE(review): this relies on rb_first()/rb_next() walking commit_entries
+ * in ascending commit_tid order, as the commit_tid comparison in the
+ * previous version of this loop did -- confirm against add_commit_entry().
+ */
+void ceph_journaler_client_committed(struct ceph_journaler *journaler, uint64_t commit_tid)
+{
+	struct commit_entry *entry;
+	struct rb_node *n;
+	bool updated = false;
+
+	spin_lock(&journaler->commit_lock);
+	for (n = rb_first(&journaler->commit_entries); n; n = rb_next(n)) {
+		entry = rb_entry(n, struct commit_entry, r_node);
+		if (entry->commit_tid == commit_tid) {
+			entry->committed = true;
+			break;
+		}
+	}
+
+	n = rb_first(&journaler->commit_entries);
+	while (n) {
+		entry = rb_entry(n, struct commit_entry, r_node);
+		if (!entry->committed)
+			break;
+		/* Step first: the entry is erased below. */
+		n = rb_next(n);
+
+		if (add_object_position(entry,
+					&journaler->object_positions_pending,
+					journaler->splay_width))
+			break;
+		erase_commit_entry(&journaler->commit_entries, entry);
+		kfree(entry);
+		updated = true;
+	}
+	spin_unlock(&journaler->commit_lock);
+
+	if (updated)
+		queue_work(journaler->task_wq, &journaler->commit_work);
+}
+EXPORT_SYMBOL(ceph_journaler_client_committed);
+
+/*
+ * ceph_journaler_allocate_tag - create a new tag in the journal
+ * @journaler: journal to create the tag in
+ * @tag_class: tag class passed to the tag_create cls call
+ * @buf:       tag data
+ * @buf_len:   length of @buf
+ * @tag:       out, the tag as read back from the journal header
+ *
+ * Fetches the next free tag tid, creates the tag with it and reads the
+ * result back into @tag.  Returns 0 or the negative errno of the first
+ * step that failed.
+ */
+int ceph_journaler_allocate_tag(struct ceph_journaler *journaler,
+				uint64_t tag_class, void *buf,
+				uint32_t buf_len,
+				struct ceph_journaler_tag *tag)
+{
+	uint64_t tag_tid = 0;
+	int ret;
+
+	ret = ceph_cls_journaler_get_next_tag_tid(journaler->osdc,
+						   &journaler->header_oid,
+						   &journaler->header_oloc,
+						   &tag_tid);
+	if (ret)
+		return ret;
+
+	ret = ceph_cls_journaler_tag_create(journaler->osdc,
+					     &journaler->header_oid,
+					     &journaler->header_oloc,
+					     tag_tid, tag_class,
+					     buf, buf_len);
+	if (ret)
+		return ret;
+
+	return ceph_cls_journaler_get_tag(journaler->osdc,
+					   &journaler->header_oid,
+					   &journaler->header_oloc,
+					   tag_tid, tag);
+}
+EXPORT_SYMBOL(ceph_journaler_allocate_tag);
-- 1.8.3.1