This patch implements two methods to read jset from media for journal replay, - __jnl_rd_bkt() for block device This is the legacy method to read jset via block device interface. - __jnl_rd_nvm_bkt() for NVDIMM This is the method to read jset from NVDIMM memory interface, a.k.a memcopy() from NVDIMM pages to DRAM pages. If BCH_FEATURE_INCOMPAT_NVDIMM_META is set in incompat feature set, during running cache set, journal_read_bucket() will read the journal content from NVDIMM by __jnl_rd_nvm_bkt(). The linear addresses of NVDIMM pages to read jset are stored in sb.d[SB_JOURNAL_BUCKETS], which were initialized and maintained in previous runs of the cache set. A thing should be noticed is, when bch_journal_read() is called, the linear address of NVDIMM pages is not loaded and initialized yet, it is necessary to call __bch_journal_nvdimm_init() before reading the jset from NVDIMM pages. Signed-off-by: Coly Li <colyli@xxxxxxx> Cc: Jianpeng Ma <jianpeng.ma@xxxxxxxxx> Cc: Qiaowei Ren <qiaowei.ren@xxxxxxxxx> --- drivers/md/bcache/journal.c | 81 ++++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 24 deletions(-) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 243253974cab..d1c76facb6b2 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -34,60 +34,84 @@ static void journal_read_endio(struct bio *bio) closure_put(cl); } +static struct jset *__jnl_rd_bkt(struct cache *ca, unsigned int bkt_idx, + unsigned int len, unsigned int offset, + struct closure *cl) +{ + sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bkt_idx]); + struct bio *bio = &ca->journal.bio; + struct jset *data = ca->set->journal.w[0].data; + + bio_reset(bio); + bio->bi_iter.bi_sector = bucket + offset; + bio_set_dev(bio, ca->bdev); + bio->bi_iter.bi_size = len << 9; + bio->bi_end_io = journal_read_endio; + bio->bi_private = cl; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + bch_bio_map(bio, data); + + closure_bio_submit(ca->set, bio, cl); + closure_sync(cl); + + /* Indeed journal.w[0].data */ + return data; +} + +static struct jset *__jnl_rd_nvm_bkt(struct cache *ca, unsigned int bkt_idx, + unsigned int len, unsigned int offset) +{ + void *jset_addr = (void *)ca->sb.d[bkt_idx] + (offset << 9); + struct jset *data = ca->set->journal.w[0].data; + + memcpy(data, jset_addr, len << 9); + + /* Indeed journal.w[0].data */ + return data; +} + static int journal_read_bucket(struct cache *ca, struct list_head *list, - unsigned int bucket_index) + unsigned int bucket_idx) { struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->bio; struct journal_replay *i; - struct jset *j, *data = ca->set->journal.w[0].data; + struct jset *j; struct closure cl; unsigned int len, left, offset = 0; int ret = 0; - sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); closure_init_stack(&cl); - pr_debug("reading %u\n", bucket_index); + pr_debug("reading %u\n", bucket_idx); while (offset < ca->sb.bucket_size) { reread: left = ca->sb.bucket_size - offset; len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); - bio_reset(bio); - bio->bi_iter.bi_sector = bucket + offset; - bio_set_dev(bio, ca->bdev); - bio->bi_iter.bi_size = len << 9; - - bio->bi_end_io = journal_read_endio; - bio->bi_private = &cl; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bch_bio_map(bio, data); - - closure_bio_submit(ca->set, bio, &cl); - closure_sync(&cl); + if (!bch_has_feature_nvdimm_meta(&ca->sb)) + j = __jnl_rd_bkt(ca, bucket_idx, len, offset, &cl); + else + j = __jnl_rd_nvm_bkt(ca, bucket_idx, len, offset); /* This function could be simpler now since we no longer write * journal entries that overlap bucket boundaries; this means * the start of a bucket will always have a valid journal entry * if it has any journal entries at all. */ - - j = data; while (len) { struct list_head *where; size_t blocks, bytes = set_bytes(j); if (j->magic != jset_magic(&ca->sb)) { - pr_debug("%u: bad magic\n", bucket_index); + pr_debug("%u: bad magic\n", bucket_idx); return ret; } if (bytes > left << 9 || bytes > PAGE_SIZE << JSET_BITS) { pr_info("%u: too big, %zu bytes, offset %u\n", - bucket_index, bytes, offset); + bucket_idx, bytes, offset); return ret; } @@ -96,7 +120,7 @@ reread: left = ca->sb.bucket_size - offset; if (j->csum != csum_set(j)) { pr_info("%u: bad csum, %zu bytes, offset %u\n", - bucket_index, bytes, offset); + bucket_idx, bytes, offset); return ret; } @@ -158,8 +182,8 @@ reread: left = ca->sb.bucket_size - offset; list_add(&i->list, where); ret = 1; - if (j->seq > ja->seq[bucket_index]) - ja->seq[bucket_index] = j->seq; + if (j->seq > ja->seq[bucket_idx]) + ja->seq[bucket_idx] = j->seq; next_set: offset += blocks * ca->sb.block_size; len -= blocks * ca->sb.block_size; @@ -170,6 +194,8 @@ reread: left = ca->sb.bucket_size - offset; return ret; } +static int __bch_journal_nvdimm_init(struct cache *ca); + int bch_journal_read(struct cache_set *c, struct list_head *list) { #define read_bucket(b) \ @@ -188,6 +214,13 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) unsigned int i, l, r, m; uint64_t seq; + /* + * Linear addresses of NVDIMM pages for journaling is not + * initialized yet, do it before read jset from NVDIMM pages. + */ + if (bch_has_feature_nvdimm_meta(&ca->sb)) + __bch_journal_nvdimm_init(ca); + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); pr_debug("%u journal buckets\n", ca->sb.njournal_buckets); -- 2.26.2