Re: [PATCH v3 02/10] lightnvm: pblk: simplify partial read path

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Mar 27, 2019 at 1:41 PM Igor Konopko <igor.j.konopko@xxxxxxxxx> wrote:
>
> This patch changes the approach to handling partial read path, what is
> needed for supporting multi-page bvec, which is currently broken.
>
> In old approach merging of data from round buffer and drive was fully
> made by drive. This had some disadvantages - code was complex and
> relies on bio internals, so it was hard to maintain and was strongly
> dependent on bio changes.
>
> In new approach most of the handling is done mostly by block layer
> functions such as bio_split(), bio_chain() and generic_make request()
> and generally is less complex and easier to maintain. Below some more
> details of the new approach.
>
> When read bio arrives, it is cloned for pblk internal purposes. All
> the L2P mapping, which includes copying data from round buffer to bio
> and thus bio_advance() calls is done on the cloned bio, so the original
> bio is untouched. Later if we found that we have partial read case, we
> still have original bio untouched, so we can split it and continue to
> process only first 4K of it in current context, when the rest will be
> called as separate bio request which is passed to generic_make_request()
> for further processing.
>
> Signed-off-by: Igor Konopko <igor.j.konopko@xxxxxxxxx>

I've played around with fio to figure out if there is a read latency
performance hit, and I have not found any diff that can't be written
off as noise.
(i did seq write + random 4k reads with different sizes and checked
the latency stats)

> ---
>  drivers/lightnvm/pblk-core.c |  16 +++
>  drivers/lightnvm/pblk-read.c | 276 +++++++++++--------------------------------
>  drivers/lightnvm/pblk.h      |  22 ++--
>  3 files changed, 97 insertions(+), 217 deletions(-)

Nice cleanup :)
I hope to find time early next week to go through the patch in depth.

Thanks,
Hans

>
> diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
> index 39280c1..f2edec6 100644
> --- a/drivers/lightnvm/pblk-core.c
> +++ b/drivers/lightnvm/pblk-core.c
> @@ -1529,6 +1529,22 @@ void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd)
>                 pblk_ppa_to_line_put(pblk, ppa_list[i]);
>  }
>
> +void pblk_rq_to_line_partial_put(struct pblk *pblk, struct nvm_rq *rqd,
> +                                       int offset)
> +{
> +       struct ppa_addr ppa;
> +       struct pblk_line *line;
> +       int i = offset;
> +
> +       for (; i < rqd->nr_ppas; i++) {
> +               ppa = rqd->ppa_list[i];
> +               if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) {
> +                       line = pblk_ppa_to_line(pblk, ppa);
> +                       kref_put(&line->ref, pblk_line_put);
> +               }
> +       }
> +}
> +
>  static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line)
>  {
>         lockdep_assert_held(&pblk->l_mg.free_lock);
> diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
> index 597fe6d..264d1d7 100644
> --- a/drivers/lightnvm/pblk-read.c
> +++ b/drivers/lightnvm/pblk-read.c
> @@ -41,28 +41,30 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
>
>  static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
>                                  struct bio *bio, sector_t blba,
> -                                unsigned long *read_bitmap)
> +                                struct pblk_read_info *info)
>  {
>         void *meta_list = rqd->meta_list;
> -       struct ppa_addr ppas[NVM_MAX_VLBA];
>         int nr_secs = rqd->nr_ppas;
>         bool advanced_bio = false;
> -       int i, j = 0;
> +       int i;
>
> -       pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs);
> +       pblk_lookup_l2p_seq(pblk, rqd->ppa_list, blba, nr_secs);
> +       info->first_sec_from_cache = nr_secs;
> +       info->first_sec_from_drive = nr_secs;
>
>         for (i = 0; i < nr_secs; i++) {
> -               struct ppa_addr p = ppas[i];
>                 struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
>                 sector_t lba = blba + i;
>
>  retry:
> -               if (pblk_ppa_empty(p)) {
> +               if (pblk_ppa_empty(rqd->ppa_list[i])) {
>                         __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
>
> -                       WARN_ON(test_and_set_bit(i, read_bitmap));
> -                       meta->lba = addr_empty;
> +                       info->secs_from_cache = true;
> +                       info->first_sec_from_cache = min_t(int, i,
> +                                               info->first_sec_from_cache);
>
> +                       meta->lba = addr_empty;
>                         if (unlikely(!advanced_bio)) {
>                                 bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE);
>                                 advanced_bio = true;
> @@ -74,13 +76,18 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
>                 /* Try to read from write buffer. The address is later checked
>                  * on the write buffer to prevent retrieving overwritten data.
>                  */
> -               if (pblk_addr_in_cache(p)) {
> -                       if (!pblk_read_from_cache(pblk, bio, lba, p, i,
> -                                                               advanced_bio)) {
> -                               pblk_lookup_l2p_seq(pblk, &p, lba, 1);
> +               if (pblk_addr_in_cache(rqd->ppa_list[i])) {
> +                       if (!pblk_read_from_cache(pblk, bio, lba,
> +                                       rqd->ppa_list[i], i, advanced_bio)) {
> +                               pblk_lookup_l2p_seq(pblk, &rqd->ppa_list[i],
> +                                                       lba, 1);
>                                 goto retry;
>                         }
> -                       WARN_ON(test_and_set_bit(i, read_bitmap));
> +
> +                       info->secs_from_cache = true;
> +                       info->first_sec_from_cache = min_t(int, i,
> +                                               info->first_sec_from_cache);
> +
>                         meta->lba = cpu_to_le64(lba);
>                         advanced_bio = true;
>  #ifdef CONFIG_NVM_PBLK_DEBUG
> @@ -88,7 +95,9 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
>  #endif
>                 } else {
>                         /* Read from media non-cached sectors */
> -                       rqd->ppa_list[j++] = p;
> +                       info->secs_from_drive = true;
> +                       info->first_sec_from_drive = min_t(int, i,
> +                                               info->first_sec_from_drive);
>                 }
>
>  next:
> @@ -223,172 +232,8 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
>         __pblk_end_io_read(pblk, rqd, true);
>  }
>
> -static void pblk_end_partial_read(struct nvm_rq *rqd)
> -{
> -       struct pblk *pblk = rqd->private;
> -       struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
> -       struct pblk_pr_ctx *pr_ctx = r_ctx->private;
> -       struct pblk_sec_meta *meta;
> -       struct bio *new_bio = rqd->bio;
> -       struct bio *bio = pr_ctx->orig_bio;
> -       struct bio_vec src_bv, dst_bv;
> -       void *meta_list = rqd->meta_list;
> -       int bio_init_idx = pr_ctx->bio_init_idx;
> -       unsigned long *read_bitmap = pr_ctx->bitmap;
> -       int nr_secs = pr_ctx->orig_nr_secs;
> -       int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
> -       void *src_p, *dst_p;
> -       int hole, i;
> -
> -       if (unlikely(nr_holes == 1)) {
> -               struct ppa_addr ppa;
> -
> -               ppa = rqd->ppa_addr;
> -               rqd->ppa_list = pr_ctx->ppa_ptr;
> -               rqd->dma_ppa_list = pr_ctx->dma_ppa_list;
> -               rqd->ppa_list[0] = ppa;
> -       }
> -
> -       for (i = 0; i < nr_secs; i++) {
> -               meta = pblk_get_meta(pblk, meta_list, i);
> -               pr_ctx->lba_list_media[i] = le64_to_cpu(meta->lba);
> -               meta->lba = cpu_to_le64(pr_ctx->lba_list_mem[i]);
> -       }
> -
> -       /* Fill the holes in the original bio */
> -       i = 0;
> -       hole = find_first_zero_bit(read_bitmap, nr_secs);
> -       do {
> -               struct pblk_line *line;
> -
> -               line = pblk_ppa_to_line(pblk, rqd->ppa_list[i]);
> -               kref_put(&line->ref, pblk_line_put);
> -
> -               meta = pblk_get_meta(pblk, meta_list, hole);
> -               meta->lba = cpu_to_le64(pr_ctx->lba_list_media[i]);
> -
> -               src_bv = new_bio->bi_io_vec[i++];
> -               dst_bv = bio->bi_io_vec[bio_init_idx + hole];
> -
> -               src_p = kmap_atomic(src_bv.bv_page);
> -               dst_p = kmap_atomic(dst_bv.bv_page);
> -
> -               memcpy(dst_p + dst_bv.bv_offset,
> -                       src_p + src_bv.bv_offset,
> -                       PBLK_EXPOSED_PAGE_SIZE);
> -
> -               kunmap_atomic(src_p);
> -               kunmap_atomic(dst_p);
> -
> -               mempool_free(src_bv.bv_page, &pblk->page_bio_pool);
> -
> -               hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
> -       } while (hole < nr_secs);
> -
> -       bio_put(new_bio);
> -       kfree(pr_ctx);
> -
> -       /* restore original request */
> -       rqd->bio = NULL;
> -       rqd->nr_ppas = nr_secs;
> -
> -       pblk_end_user_read(bio, rqd->error);
> -       __pblk_end_io_read(pblk, rqd, false);
> -}
> -
> -static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
> -                           unsigned int bio_init_idx,
> -                           unsigned long *read_bitmap,
> -                           int nr_holes)
> -{
> -       void *meta_list = rqd->meta_list;
> -       struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
> -       struct pblk_pr_ctx *pr_ctx;
> -       struct bio *new_bio, *bio = r_ctx->private;
> -       int nr_secs = rqd->nr_ppas;
> -       int i;
> -
> -       new_bio = bio_alloc(GFP_KERNEL, nr_holes);
> -
> -       if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
> -               goto fail_bio_put;
> -
> -       if (nr_holes != new_bio->bi_vcnt) {
> -               WARN_ONCE(1, "pblk: malformed bio\n");
> -               goto fail_free_pages;
> -       }
> -
> -       pr_ctx = kzalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
> -       if (!pr_ctx)
> -               goto fail_free_pages;
> -
> -       for (i = 0; i < nr_secs; i++) {
> -               struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
> -
> -               pr_ctx->lba_list_mem[i] = le64_to_cpu(meta->lba);
> -       }
> -
> -       new_bio->bi_iter.bi_sector = 0; /* internal bio */
> -       bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
> -
> -       rqd->bio = new_bio;
> -       rqd->nr_ppas = nr_holes;
> -
> -       pr_ctx->orig_bio = bio;
> -       bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA);
> -       pr_ctx->bio_init_idx = bio_init_idx;
> -       pr_ctx->orig_nr_secs = nr_secs;
> -       r_ctx->private = pr_ctx;
> -
> -       if (unlikely(nr_holes == 1)) {
> -               pr_ctx->ppa_ptr = rqd->ppa_list;
> -               pr_ctx->dma_ppa_list = rqd->dma_ppa_list;
> -               rqd->ppa_addr = rqd->ppa_list[0];
> -       }
> -       return 0;
> -
> -fail_free_pages:
> -       pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
> -fail_bio_put:
> -       bio_put(new_bio);
> -
> -       return -ENOMEM;
> -}
> -
> -static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
> -                                unsigned int bio_init_idx,
> -                                unsigned long *read_bitmap, int nr_secs)
> -{
> -       int nr_holes;
> -       int ret;
> -
> -       nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
> -
> -       if (pblk_setup_partial_read(pblk, rqd, bio_init_idx, read_bitmap,
> -                                   nr_holes))
> -               return NVM_IO_ERR;
> -
> -       rqd->end_io = pblk_end_partial_read;
> -
> -       ret = pblk_submit_io(pblk, rqd);
> -       if (ret) {
> -               bio_put(rqd->bio);
> -               pblk_err(pblk, "partial read IO submission failed\n");
> -               goto err;
> -       }
> -
> -       return NVM_IO_OK;
> -
> -err:
> -       pblk_err(pblk, "failed to perform partial read\n");
> -
> -       /* Free allocated pages in new bio */
> -       pblk_bio_free_pages(pblk, rqd->bio, 0, rqd->bio->bi_vcnt);
> -       return NVM_IO_ERR;
> -}
> -
>  static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
> -                        sector_t lba, unsigned long *read_bitmap)
> +                        sector_t lba, struct pblk_read_info *info)
>  {
>         struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0);
>         struct ppa_addr ppa;
> @@ -403,8 +248,8 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
>         if (pblk_ppa_empty(ppa)) {
>                 __le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
>
> -               WARN_ON(test_and_set_bit(0, read_bitmap));
>                 meta->lba = addr_empty;
> +               info->secs_from_cache = true;
>                 return;
>         }
>
> @@ -417,14 +262,14 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
>                         goto retry;
>                 }
>
> -               WARN_ON(test_and_set_bit(0, read_bitmap));
>                 meta->lba = cpu_to_le64(lba);
> -
> +               info->secs_from_cache = true;
>  #ifdef CONFIG_NVM_PBLK_DEBUG
>                 atomic_long_inc(&pblk->cache_reads);
>  #endif
>         } else {
>                 rqd->ppa_addr = ppa;
> +               info->secs_from_drive = true;
>         }
>  }
>
> @@ -436,15 +281,12 @@ void pblk_submit_read(struct pblk *pblk, struct bio *bio)
>         unsigned int nr_secs = pblk_get_secs(bio);
>         struct pblk_g_ctx *r_ctx;
>         struct nvm_rq *rqd;
> -       struct bio *int_bio;
> -       unsigned int bio_init_idx;
> -       DECLARE_BITMAP(read_bitmap, NVM_MAX_VLBA);
> +       struct bio *int_bio, *split_bio;
> +       struct pblk_read_info info = {0};
>
>         generic_start_io_acct(q, REQ_OP_READ, bio_sectors(bio),
>                               &pblk->disk->part0);
>
> -       bitmap_zero(read_bitmap, nr_secs);
> -
>         rqd = pblk_alloc_rqd(pblk, PBLK_READ);
>
>         rqd->opcode = NVM_OP_PREAD;
> @@ -456,11 +298,6 @@ void pblk_submit_read(struct pblk *pblk, struct bio *bio)
>         r_ctx->start_time = jiffies;
>         r_ctx->lba = blba;
>
> -       /* Save the index for this bio's start. This is needed in case
> -        * we need to fill a partial read.
> -        */
> -       bio_init_idx = pblk_get_bi_idx(bio);
> -
>         if (pblk_alloc_rqd_meta(pblk, rqd)) {
>                 bio_io_error(bio);
>                 pblk_free_rqd(pblk, rqd, PBLK_READ);
> @@ -474,34 +311,63 @@ void pblk_submit_read(struct pblk *pblk, struct bio *bio)
>         int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
>
>         if (nr_secs > 1)
> -               pblk_read_ppalist_rq(pblk, rqd, bio, blba, read_bitmap);
> +               pblk_read_ppalist_rq(pblk, rqd, int_bio, blba, &info);
>         else
> -               pblk_read_rq(pblk, rqd, bio, blba, read_bitmap);
> +               pblk_read_rq(pblk, rqd, int_bio, blba, &info);
>
> +split_retry:
>         r_ctx->private = bio; /* original bio */
>         rqd->bio = int_bio; /* internal bio */
>
> -       if (bitmap_full(read_bitmap, nr_secs)) {
> +       if (info.secs_from_cache && !info.secs_from_drive) {
>                 pblk_end_user_read(bio, 0);
>                 atomic_inc(&pblk->inflight_io);
>                 __pblk_end_io_read(pblk, rqd, false);
>                 return;
>         }
>
> -       if (!bitmap_empty(read_bitmap, rqd->nr_ppas)) {
> +       if (info.secs_from_cache && info.secs_from_drive) {
>                 /* The read bio request could be partially filled by the write
>                  * buffer, but there are some holes that need to be read from
> -                * the drive.
> +                * the drive. In order to handle this, we will use block layer
> +                * mechanism to split this request in to smaller ones and make
> +                * a chain of it.
> +                */
> +               int split = max_t(int, info.first_sec_from_cache,
> +                                       info.first_sec_from_drive);
> +
> +               split_bio = bio_split(bio, split * NR_PHY_IN_LOG, GFP_KERNEL,
> +                                       &pblk_bio_set);
> +               bio_chain(split_bio, bio);
> +               generic_make_request(bio);
> +
> +               /* New bio contains first N sectors of the previous one, so
> +                * we can continue to use existing rqd, but we need to shrink
> +                * the number of PPAs in it. We also need to release line
> +                * references on the rest of the PPAs. Finally we also need
> +                * to update pblk_read_info struct properly in order to avoid
> +                * another interation of l2p lookup.
> +                */
> +               bio = split_bio;
> +               pblk_rq_to_line_partial_put(pblk, rqd, split);
> +#ifdef CONFIG_NVM_PBLK_DEBUG
> +               atomic_long_sub((rqd->nr_ppas - split), &pblk->inflight_reads);
> +#endif
> +               rqd->nr_ppas = split;
> +               if (rqd->nr_ppas == 1)
> +                       rqd->ppa_addr = rqd->ppa_list[0];
> +
> +               if (info.first_sec_from_cache > info.first_sec_from_drive)
> +                       info.secs_from_cache = false;
> +               else
> +                       info.secs_from_drive = false;
> +
> +               /* Recreate int_bio - existing might have some needed internal
> +                * fields modified already.
>                  */
>                 bio_put(int_bio);
> -               rqd->bio = NULL;
> -               if (pblk_partial_read_bio(pblk, rqd, bio_init_idx, read_bitmap,
> -                                           nr_secs)) {
> -                       pblk_err(pblk, "read IO submission failed\n");
> -                       bio_io_error(bio);
> -                       __pblk_end_io_read(pblk, rqd, false);
> -               }
> -               return;
> +               int_bio = bio_clone_fast(bio, GFP_KERNEL, &pblk_bio_set);
> +               goto split_retry;
>         }
>
>         /* All sectors are to be read from the device */
> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
> index a3c925d..e284a93 100644
> --- a/drivers/lightnvm/pblk.h
> +++ b/drivers/lightnvm/pblk.h
> @@ -123,18 +123,6 @@ struct pblk_g_ctx {
>         u64 lba;
>  };
>
> -/* partial read context */
> -struct pblk_pr_ctx {
> -       struct bio *orig_bio;
> -       DECLARE_BITMAP(bitmap, NVM_MAX_VLBA);
> -       unsigned int orig_nr_secs;
> -       unsigned int bio_init_idx;
> -       void *ppa_ptr;
> -       dma_addr_t dma_ppa_list;
> -       u64 lba_list_mem[NVM_MAX_VLBA];
> -       u64 lba_list_media[NVM_MAX_VLBA];
> -};
> -
>  /* Pad context */
>  struct pblk_pad_rq {
>         struct pblk *pblk;
> @@ -726,6 +714,14 @@ struct pblk_line_ws {
>         struct work_struct ws;
>  };
>
> +/* L2P lookup on read path info */
> +struct pblk_read_info {
> +       int first_sec_from_cache;
> +       int first_sec_from_drive;
> +       bool secs_from_cache;
> +       bool secs_from_drive;
> +};
> +
>  #define pblk_g_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_g_ctx))
>  #define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx))
>
> @@ -809,6 +805,8 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
>  struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
>  void pblk_ppa_to_line_put(struct pblk *pblk, struct ppa_addr ppa);
>  void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd);
> +void pblk_rq_to_line_partial_put(struct pblk *pblk, struct nvm_rq *rqd,
> +                                       int offset);
>  int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line);
>  void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line);
>  struct pblk_line *pblk_line_get_data(struct pblk *pblk);
> --
> 2.9.5
>



[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux