On Wed, Mar 27, 2019 at 1:41 PM Igor Konopko <igor.j.konopko@xxxxxxxxx> wrote:
>
> This patch changes the approach to handling the partial read path,
> which is needed to support multi-page bvecs; the current
> implementation is broken in that respect.
>
> In the old approach, merging of data from the round buffer and the
> drive was done entirely by the driver. This had some disadvantages:
> the code was complex and relied on bio internals, so it was hard to
> maintain and strongly dependent on bio changes.
>
> In the new approach, most of the handling is done by block layer
> functions such as bio_split(), bio_chain() and generic_make_request(),
> and is generally less complex and easier to maintain. Some more
> details of the new approach follow.
>
> When a read bio arrives, it is cloned for pblk internal purposes. All
> the L2P mapping, which includes copying data from the round buffer to
> the bio and thus the bio_advance() calls, is done on the cloned bio,
> so the original bio is untouched. If we later find that we have a
> partial read case, the original bio is still untouched, so we can
> split it and continue to process only the first 4K of it in the
> current context, while the rest is passed to generic_make_request()
> as a separate bio request for further processing.
>
> Signed-off-by: Igor Konopko <igor.j.konopko@xxxxxxxxx>

I've played around with fio to figure out if there is a read latency
performance hit, and I have not found any diff that can't be written
off as noise. (I did a sequential write plus random 4k reads with
different sizes and checked the latency stats.)

> ---
>  drivers/lightnvm/pblk-core.c |  16 +++
>  drivers/lightnvm/pblk-read.c | 276 +++++++++++--------------------------------
>  drivers/lightnvm/pblk.h      |  22 ++--
>  3 files changed, 97 insertions(+), 217 deletions(-)

Nice cleanup :) I hope to find time early next week to go through the
patch in depth.
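
For reference, the fio jobs were along these lines (the device path,
sizes and runtime below are placeholders rather than the exact values
I used, and I varied the read block size between runs):

[global]
filename=/dev/pblk0
direct=1
ioengine=libaio

[seq-write-prep]
rw=write
bs=128k
size=1G

[rand-read]
stonewall
rw=randread
bs=4k
iodepth=1
runtime=60
time_based=1

The completion latency percentiles that fio prints per job are the
stats I compared between runs.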
Thanks,
Hans

>
> diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
> index 39280c1..f2edec6 100644
> --- a/drivers/lightnvm/pblk-core.c
> +++ b/drivers/lightnvm/pblk-core.c
> @@ -1529,6 +1529,22 @@ void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd)
>                 pblk_ppa_to_line_put(pblk, ppa_list[i]);
>  }
>
> +void pblk_rq_to_line_partial_put(struct pblk *pblk, struct nvm_rq *rqd,
> +                                int offset)
> +{
> +       struct ppa_addr ppa;
> +       struct pblk_line *line;
> +       int i = offset;
> +
> +       for (; i < rqd->nr_ppas; i++) {
> +               ppa = rqd->ppa_list[i];
> +               if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) {
> +                       line = pblk_ppa_to_line(pblk, ppa);
> +                       kref_put(&line->ref, pblk_line_put);
> +               }
> +       }
> +}
> +
>  static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line)
>  {
>         lockdep_assert_held(&pblk->l_mg.free_lock);
> diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
> index 597fe6d..264d1d7 100644
> --- a/drivers/lightnvm/pblk-read.c
> +++ b/drivers/lightnvm/pblk-read.c
> @@ -41,28 +41,30 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
>
>  static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
>                                  struct bio *bio, sector_t blba,
> -                                unsigned long *read_bitmap)
> +                                struct pblk_read_info *info)
>  {
>         void *meta_list = rqd->meta_list;
> -       struct ppa_addr ppas[NVM_MAX_VLBA];
>         int nr_secs = rqd->nr_ppas;
>         bool advanced_bio = false;
> -       int i, j = 0;
> +       int i;
>
> -       pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs);
> +       pblk_lookup_l2p_seq(pblk, rqd->ppa_list, blba, nr_secs);
> +       info->first_sec_from_cache = nr_secs;
> +       info->first_sec_from_drive = nr_secs;
>
>         for (i = 0; i < nr_secs; i++) {
> -               struct ppa_addr p = ppas[i];
>                 struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
>                 sector_t lba = blba + i;
>
> retry:
> -               if (pblk_ppa_empty(p)) {
> +               if (pblk_ppa_empty(rqd->ppa_list[i])) {
>                         __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
>
> -                       WARN_ON(test_and_set_bit(i, read_bitmap));
> -                       meta->lba = addr_empty;
> +                       info->secs_from_cache = true;
> +                       info->first_sec_from_cache = min_t(int, i,
> +                                               info->first_sec_from_cache);
>
> +                       meta->lba = addr_empty;
>                         if (unlikely(!advanced_bio)) {
>                                 bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE);
>                                 advanced_bio = true;
> @@ -74,13 +76,18 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
>                 /* Try to read from write buffer. The address is later checked
>                  * on the write buffer to prevent retrieving overwritten data.
>                  */
> -               if (pblk_addr_in_cache(p)) {
> -                       if (!pblk_read_from_cache(pblk, bio, lba, p, i,
> -                                               advanced_bio)) {
> -                               pblk_lookup_l2p_seq(pblk, &p, lba, 1);
> +               if (pblk_addr_in_cache(rqd->ppa_list[i])) {
> +                       if (!pblk_read_from_cache(pblk, bio, lba,
> +                                       rqd->ppa_list[i], i, advanced_bio)) {
> +                               pblk_lookup_l2p_seq(pblk, &rqd->ppa_list[i],
> +                                                       lba, 1);
>                                 goto retry;
>                         }
> -                       WARN_ON(test_and_set_bit(i, read_bitmap));
> +
> +                       info->secs_from_cache = true;
> +                       info->first_sec_from_cache = min_t(int, i,
> +                                               info->first_sec_from_cache);
> +
>                         meta->lba = cpu_to_le64(lba);
>                         advanced_bio = true;
>  #ifdef CONFIG_NVM_PBLK_DEBUG
> @@ -88,7 +95,9 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
>  #endif
>                 } else {
>                         /* Read from media non-cached sectors */
> -                       rqd->ppa_list[j++] = p;
> +                       info->secs_from_drive = true;
> +                       info->first_sec_from_drive = min_t(int, i,
> +                                               info->first_sec_from_drive);
>                 }
>
>  next:
> @@ -223,172 +232,8 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
>         __pblk_end_io_read(pblk, rqd, true);
>  }
>
> -static void pblk_end_partial_read(struct nvm_rq *rqd)
> -{
> -       struct pblk *pblk = rqd->private;
> -       struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
> -       struct pblk_pr_ctx *pr_ctx = r_ctx->private;
> -       struct pblk_sec_meta *meta;
> -       struct bio *new_bio = rqd->bio;
> -       struct bio *bio = pr_ctx->orig_bio;
> -       struct bio_vec src_bv, dst_bv;
> -       void *meta_list = rqd->meta_list;
> -       int bio_init_idx = pr_ctx->bio_init_idx;
> -       unsigned long *read_bitmap = pr_ctx->bitmap;
> -       int nr_secs = pr_ctx->orig_nr_secs;
> -       int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
> -       void *src_p, *dst_p;
> -       int hole, i;
> -
> -       if (unlikely(nr_holes == 1)) {
> -               struct ppa_addr ppa;
> -
> -               ppa = rqd->ppa_addr;
> -               rqd->ppa_list = pr_ctx->ppa_ptr;
> -               rqd->dma_ppa_list = pr_ctx->dma_ppa_list;
> -               rqd->ppa_list[0] = ppa;
> -       }
> -
> -       for (i = 0; i < nr_secs; i++) {
> -               meta = pblk_get_meta(pblk, meta_list, i);
> -               pr_ctx->lba_list_media[i] = le64_to_cpu(meta->lba);
> -               meta->lba = cpu_to_le64(pr_ctx->lba_list_mem[i]);
> -       }
> -
> -       /* Fill the holes in the original bio */
> -       i = 0;
> -       hole = find_first_zero_bit(read_bitmap, nr_secs);
> -       do {
> -               struct pblk_line *line;
> -
> -               line = pblk_ppa_to_line(pblk, rqd->ppa_list[i]);
> -               kref_put(&line->ref, pblk_line_put);
> -
> -               meta = pblk_get_meta(pblk, meta_list, hole);
> -               meta->lba = cpu_to_le64(pr_ctx->lba_list_media[i]);
> -
> -               src_bv = new_bio->bi_io_vec[i++];
> -               dst_bv = bio->bi_io_vec[bio_init_idx + hole];
> -
> -               src_p = kmap_atomic(src_bv.bv_page);
> -               dst_p = kmap_atomic(dst_bv.bv_page);
> -
> -               memcpy(dst_p + dst_bv.bv_offset,
> -                       src_p + src_bv.bv_offset,
> -                       PBLK_EXPOSED_PAGE_SIZE);
> -
> -               kunmap_atomic(src_p);
> -               kunmap_atomic(dst_p);
> -
> -               mempool_free(src_bv.bv_page, &pblk->page_bio_pool);
> -
> -               hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
> -       } while (hole < nr_secs);
> -
> -       bio_put(new_bio);
> -       kfree(pr_ctx);
> -
> -       /* restore original request */
> -       rqd->bio = NULL;
> -       rqd->nr_ppas = nr_secs;
> -
> -       pblk_end_user_read(bio, rqd->error);
> -       __pblk_end_io_read(pblk, rqd, false);
> -}
> -
> -static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
> -                                  unsigned int bio_init_idx,
> -                                  unsigned long *read_bitmap,
> -                                  int nr_holes)
> -{
> -       void *meta_list = rqd->meta_list;
> -       struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
> -       struct pblk_pr_ctx *pr_ctx;
> -       struct bio *new_bio, *bio = r_ctx->private;
> -       int nr_secs = rqd->nr_ppas;
> -       int i;
> -
> -       new_bio = bio_alloc(GFP_KERNEL, nr_holes);
> -
> -       if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
> -               goto fail_bio_put;
> -
> -       if (nr_holes != new_bio->bi_vcnt) {
> -               WARN_ONCE(1, "pblk: malformed bio\n");
> -               goto fail_free_pages;
> -       }
> -
> -       pr_ctx = kzalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
> -       if (!pr_ctx)
> -               goto fail_free_pages;
> -
> -       for (i = 0; i < nr_secs; i++) {
> -               struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
> -
> -               pr_ctx->lba_list_mem[i] = le64_to_cpu(meta->lba);
> -       }
> -
> -       new_bio->bi_iter.bi_sector = 0; /* internal bio */
> -       bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
> -
> -       rqd->bio = new_bio;
> -       rqd->nr_ppas = nr_holes;
> -
> -       pr_ctx->orig_bio = bio;
> -       bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA);
> -       pr_ctx->bio_init_idx = bio_init_idx;
> -       pr_ctx->orig_nr_secs = nr_secs;
> -       r_ctx->private = pr_ctx;
> -
> -       if (unlikely(nr_holes == 1)) {
> -               pr_ctx->ppa_ptr = rqd->ppa_list;
> -               pr_ctx->dma_ppa_list = rqd->dma_ppa_list;
> -               rqd->ppa_addr = rqd->ppa_list[0];
> -       }
> -       return 0;
> -
> -fail_free_pages:
> -       pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
> -fail_bio_put:
> -       bio_put(new_bio);
> -
> -       return -ENOMEM;
> -}
> -
> -static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
> -                                unsigned int bio_init_idx,
> -                                unsigned long *read_bitmap, int nr_secs)
> -{
> -       int nr_holes;
> -       int ret;
> -
> -       nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
> -
> -       if (pblk_setup_partial_read(pblk, rqd, bio_init_idx, read_bitmap,
> -                                   nr_holes))
> -               return NVM_IO_ERR;
> -
> -       rqd->end_io = pblk_end_partial_read;
> -
> -       ret = pblk_submit_io(pblk, rqd);
> -       if (ret) {
> -               bio_put(rqd->bio);
> -               pblk_err(pblk, "partial read IO submission failed\n");
> -               goto err;
> -       }
> -
> -       return NVM_IO_OK;
> -
> -err:
> -       pblk_err(pblk, "failed to perform partial read\n");
> -
> -       /* Free allocated pages in new bio */
> -       pblk_bio_free_pages(pblk, rqd->bio, 0, rqd->bio->bi_vcnt);
> -       return NVM_IO_ERR;
> -}
> -
>  static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
> -                        sector_t lba, unsigned long *read_bitmap)
> +                        sector_t lba, struct pblk_read_info *info)
>  {
>         struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0);
>         struct ppa_addr ppa;
> @@ -403,8 +248,8 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
>         if (pblk_ppa_empty(ppa)) {
>                 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
>
> -               WARN_ON(test_and_set_bit(0, read_bitmap));
>                 meta->lba = addr_empty;
> +               info->secs_from_cache = true;
>                 return;
>         }
>
> @@ -417,14 +262,14 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
>                         goto retry;
>                 }
>
> -               WARN_ON(test_and_set_bit(0, read_bitmap));
>                 meta->lba = cpu_to_le64(lba);
> -
> +               info->secs_from_cache = true;
>  #ifdef CONFIG_NVM_PBLK_DEBUG
>                 atomic_long_inc(&pblk->cache_reads);
>  #endif
>         } else {
>                 rqd->ppa_addr = ppa;
> +               info->secs_from_drive = true;
>         }
>  }
>
> @@ -436,15 +281,12 @@ void pblk_submit_read(struct pblk *pblk, struct bio *bio)
>         unsigned int nr_secs = pblk_get_secs(bio);
>         struct pblk_g_ctx *r_ctx;
>         struct nvm_rq *rqd;
> -       struct bio *int_bio;
> -       unsigned int bio_init_idx;
> -       DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA);
> +       struct bio *int_bio, *split_bio;
> +       struct pblk_read_info info = {0};
>
>         generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio),
>                               &pblk->disk->part0);
>
> -       bitmap_zero(read_bitmap, nr_secs);
> -
>         rqd = pblk_alloc_rqd(pblk, PBLK_READ);
>
>         rqd->opcode = NVM_OP_PREAD;
> @@ -456,11 +298,6 @@ void pblk_submit_read(struct pblk *pblk, struct bio *bio)
>         r_ctx->start_time = jiffies;
>         r_ctx->lba = blba;
>
> -       /* Save the index for this bio's start. This is needed in case
> -        * we need to fill a partial read.
> -        */
> -       bio_init_idx = pblk_get_bi_idx(bio);
> -
>         if (pblk_alloc_rqd_meta(pblk, rqd)) {
>                 bio_io_error(bio);
>                 pblk_free_rqd(pblk, rqd, PBLK_READ);
> @@ -474,34 +311,63 @@ void pblk_submit_read(struct pblk *pblk, struct bio *bio)
>         int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
>
>         if (nr_secs > 1)
> -               pblk_read_ppalist_rq(pblk, rqd, bio, blba, read_bitmap);
> +               pblk_read_ppalist_rq(pblk, rqd, int_bio, blba, &info);
>         else
> -               pblk_read_rq(pblk, rqd, bio, blba, read_bitmap);
> +               pblk_read_rq(pblk, rqd, int_bio, blba, &info);
>
> +split_retry:
>         r_ctx->private = bio; /* original bio */
>         rqd->bio = int_bio; /* internal bio */
>
> -       if (bitmap_full(read_bitmap, nr_secs)) {
> +       if (info.secs_from_cache && !info.secs_from_drive) {
>                 pblk_end_user_read(bio, 0);
>                 atomic_inc(&pblk->inflight_io);
>                 __pblk_end_io_read(pblk, rqd, false);
>                 return;
>         }
>
> -       if (!bitmap_empty(read_bitmap, rqd->nr_ppas)) {
> +       if (info.secs_from_cache && info.secs_from_drive) {
>                 /* The read bio request could be partially filled by the write
>                  * buffer, but there are some holes that need to be read from
> -                * the drive.
> +                * the drive. In order to handle this, we will use the block
> +                * layer mechanism to split this request into smaller ones
> +                * and make a chain of them.
> +                */
> +               int split = max_t(int, info.first_sec_from_cache,
> +                                 info.first_sec_from_drive);
> +
> +               split_bio = bio_split(bio, split * NR_PHY_IN_LOG, GFP_KERNEL,
> +                                     &pblk_bio_set);
> +               bio_chain(split_bio, bio);
> +               generic_make_request(bio);
> +
> +               /* New bio contains first N sectors of the previous one, so
> +                * we can continue to use existing rqd, but we need to shrink
> +                * the number of PPAs in it. We also need to release line
> +                * references on the rest of the PPAs. Finally we also need
> +                * to update the pblk_read_info struct properly in order to
> +                * avoid another iteration of the L2P lookup.
> +                */
> +               bio = split_bio;
> +               pblk_rq_to_line_partial_put(pblk, rqd, split);
> +#ifdef CONFIG_NVM_PBLK_DEBUG
> +               atomic_long_sub((rqd->nr_ppas - split), &pblk->inflight_reads);
> +#endif
> +               rqd->nr_ppas = split;
> +               if (rqd->nr_ppas == 1)
> +                       rqd->ppa_addr = rqd->ppa_list[0];
> +
> +               if (info.first_sec_from_cache > info.first_sec_from_drive)
> +                       info.secs_from_cache = false;
> +               else
> +                       info.secs_from_drive = false;
> +
> +               /* Recreate int_bio - the existing one might already have
> +                * some needed internal fields modified.
>                  */
>                 bio_put(int_bio);
> -               rqd->bio = NULL;
> -               if (pblk_partial_read_bio(pblk, rqd, bio_init_idx, read_bitmap,
> -                                         nr_secs)) {
> -                       pblk_err(pblk, "read IO submission failed\n");
> -                       bio_io_error(bio);
> -                       __pblk_end_io_read(pblk, rqd, false);
> -               }
> -               return;
> +               int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
> +               goto split_retry;
>         }
>
>         /* All sectors are to be read from the device */
> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
> index a3c925d..e284a93 100644
> --- a/drivers/lightnvm/pblk.h
> +++ b/drivers/lightnvm/pblk.h
> @@ -123,18 +123,6 @@ struct pblk_g_ctx {
>         u64 lba;
>  };
>
> -/* partial read context */
> -struct pblk_pr_ctx {
> -       struct bio *orig_bio;
> -       DECLARE_BITMAP(bitmap, NVM_MAX_VLBA);
> -       unsigned int orig_nr_secs;
> -       unsigned int bio_init_idx;
> -       void *ppa_ptr;
> -       dma_addr_t dma_ppa_list;
> -       u64 lba_list_mem[NVM_MAX_VLBA];
> -       u64 lba_list_media[NVM_MAX_VLBA];
> -};
> -
>  /* Pad context */
>  struct pblk_pad_rq {
>         struct pblk *pblk;
> @@ -726,6 +714,14 @@ struct pblk_line_ws {
>         struct work_struct ws;
>  };
>
> +/* L2P lookup on read path info */
> +struct pblk_read_info {
> +       int first_sec_from_cache;
> +       int first_sec_from_drive;
> +       bool secs_from_cache;
> +       bool secs_from_drive;
> +};
> +
>  #define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx))
>  #define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx))
>
> @@ -809,6 +805,8 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
>  struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
>  void pblk_ppa_to_line_put(struct pblk *pblk, struct ppa_addr ppa);
>  void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd);
> +void pblk_rq_to_line_partial_put(struct pblk *pblk, struct nvm_rq *rqd,
> +                                int offset);
>  int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line);
>  void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line);
>  struct pblk_line *pblk_line_get_data(struct pblk *pblk);
> --
> 2.9.5
>
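
To double-check my reading of the new flow: the core of it is the
generic split + chain pattern below, reduced to a bare-bones sketch
(this is not pblk code; handle_first_part() is a made-up placeholder
for whatever the driver does with the prefix):

#include <linux/bio.h>

/* Placeholder for driver-specific handling of the split-off prefix. */
static void handle_first_part(struct bio *split);

static void split_and_requeue(struct bio *bio, unsigned int sectors,
                              struct bio_set *bs)
{
        struct bio *split;

        /* Carve off the first 'sectors' sectors into a new bio. */
        split = bio_split(bio, sectors, GFP_KERNEL, bs);

        /* Parent 'bio' (the remainder) will now complete only after
         * 'split' has completed.
         */
        bio_chain(split, bio);

        /* Re-drive the remainder through the block layer; for pblk it
         * re-enters pblk_submit_read() as a fresh request.
         */
        generic_make_request(bio);

        /* Keep handling the prefix in the current context. */
        handle_first_part(split);
}

That matches what the split_retry path does, with the twist that the
prefix keeps the already-populated rqd (nr_ppas shrunk to 'split',
line references for the rest dropped via
pblk_rq_to_line_partial_put()), so no second L2P lookup is needed for
it.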