Re: [RFC PATCH 6/6] lightnvm: pblk: Integrate RAIL

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Sep 17, 2018 at 7:30 AM Heiner Litz <hlitz@xxxxxxxx> wrote:
>
> Integrate Redundant Array of Independent Luns (RAIL) into lightnvm. RAIL
> enforces low tail read latency by guaranteeing that reads are never
> serialized behind writes and erases to the same LUN. Whenever LUNs serve a
> high-latency operation, reads are performed by recomputing the original
> data from redundant parity information.
> RAIL trades capacity (redundancy) for read latency; the redundancy can,
> however, also be leveraged for fault tolerance.
>
> On FIO, with the kyber scheduler set to a target read latency of 500us,
> RAIL reduces tail latency percentiles (us) as follows:
>
>        Avg    90%    99%     99.9%  99.95%  99.99%
>        pblk   90     1000    2200   3000    6000
>        RAIL   85     100     250    400     500
>
> Signed-off-by: Heiner Litz <hlitz@xxxxxxxx>
> ---
>  drivers/lightnvm/Kconfig      | 10 ++++++++++
>  drivers/lightnvm/Makefile     |  1 +
>  drivers/lightnvm/pblk-core.c  | 36 ++++++++++++++++++++++++++++++++++-
>  drivers/lightnvm/pblk-init.c  | 17 +++++++++++++++++
>  drivers/lightnvm/pblk-rail.c  |  1 +
>  drivers/lightnvm/pblk-rb.c    |  6 ++++++
>  drivers/lightnvm/pblk-read.c  |  9 +++++++++
>  drivers/lightnvm/pblk-write.c |  9 +++++++++
>  drivers/lightnvm/pblk.h       |  5 +++++
>  9 files changed, 93 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
> index a872cd720967..165d5a29acc3 100644
> --- a/drivers/lightnvm/Kconfig
> +++ b/drivers/lightnvm/Kconfig
> @@ -35,6 +35,16 @@ config NVM_PBLK_DEBUG
>           vocal error messages, and extra tracking fields in the pblk sysfs
>           entries.
>
> +config NVM_PBLK_RAIL
> +       bool "Pblk RAIL Support"
> +       default n
> +       help
> +         Enables RAIL for pblk. RAIL enforces tail read latency guarantees by
> +        eliminating reads being serialized behind writes to the same LUN.
> +        RAIL partitions LUNs into strides and enforces that only one LUN per
> +        stride is written at a time. Reads can bypass busy LUNs by recomputing
> +        requested data using parity redundancy.
> +
>  endif # NVM_PBLK_DEBUG

Having a compile-time option forces the user (or, even worse, the
distribution provider) to pick either the RAIL or the non-RAIL version of pblk.
It's also a pain having to re-compile and re-provision the kernel when testing.

I see no reason why this should not be dynamically handled within pblk
(rail on/off and stride width could be supplied via the create ioctl).
One would want to configure stride-width to fit a given workload in any case.

nvm_ioctl_create_extended has 16 reserved bits, so we have room for
adding RAIL parameters.

>
>  endif # NVM
> diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
> index 97d9d7c71550..92f4376428cc 100644
> --- a/drivers/lightnvm/Makefile
> +++ b/drivers/lightnvm/Makefile
> @@ -5,6 +5,7 @@
>
>  obj-$(CONFIG_NVM)              := core.o
>  obj-$(CONFIG_NVM_PBLK)         += pblk.o
> +obj-$(CONFIG_NVM_PBLK_RAIL)    += pblk-rail.o
>  pblk-y                         := pblk-init.o pblk-core.o pblk-rb.o \
>                                    pblk-write.o pblk-cache.o pblk-read.o \
>                                    pblk-gc.o pblk-recovery.o pblk-map.o \
> diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
> index a31bf359f905..ca74d7763fa9 100644
> --- a/drivers/lightnvm/pblk-core.c
> +++ b/drivers/lightnvm/pblk-core.c
> @@ -113,6 +113,12 @@ static void pblk_end_io_erase(struct nvm_rq *rqd)
>  {
>         struct pblk *pblk = rqd->private;
>
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
> +
> +       pblk_up_chunk(pblk, ppa_list[0]);
> +#endif
> +
>         __pblk_end_io_erase(pblk, rqd);
>         mempool_free(rqd, &pblk->e_rq_pool);
>  }
> @@ -940,7 +946,11 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
>         /* The write thread schedules erases so that it minimizes disturbances
>          * with writes. Thus, there is no need to take the LUN semaphore.
>          */
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       ret = pblk_submit_io_sync_sem(pblk, &rqd);
> +#else
>         ret = pblk_submit_io_sync(pblk, &rqd);
> +#endif
>         rqd.private = pblk;
>         __pblk_end_io_erase(pblk, &rqd);
>
> @@ -1754,7 +1764,11 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
>         /* The write thread schedules erases so that it minimizes disturbances
>          * with writes. Thus, there is no need to take the LUN semaphore.
>          */
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       err = pblk_submit_io_sem(pblk, rqd);
> +#else
>         err = pblk_submit_io(pblk, rqd);
> +#endif
>         if (err) {
>                 struct nvm_tgt_dev *dev = pblk->dev;
>                 struct nvm_geo *geo = &dev->geo;
> @@ -1909,6 +1923,10 @@ void pblk_line_close_ws(struct work_struct *work)
>         if (w_err_gc->has_write_err)
>                 pblk_save_lba_list(pblk, line);
>
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       pblk_rail_line_close(pblk, line);
> +#endif
> +
>         pblk_line_close(pblk, line);
>         mempool_free(line_ws, &pblk->gen_ws_pool);
>  }
> @@ -1938,8 +1956,12 @@ static void __pblk_down_chunk(struct pblk *pblk, int pos)
>          * Only send one inflight I/O per LUN. Since we map at a page
>          * granurality, all ppas in the I/O will map to the same LUN
>          */
> -
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       (void)rlun;
> +       ret = pblk_rail_down_stride(pblk, pos, msecs_to_jiffies(30000));
> +#else
>         ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
> +#endif
>         if (ret == -ETIME || ret == -EINTR)
>                 pblk_err(pblk, "taking lun semaphore timed out: err %d\n",
>                                 -ret);
> @@ -1978,7 +2000,13 @@ void pblk_up_chunk(struct pblk *pblk, struct ppa_addr ppa)
>         int pos = pblk_ppa_to_pos(geo, ppa);
>
>         rlun = &pblk->luns[pos];
> +
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       pblk_rail_up_stride(pblk, pos);
> +#else
>         up(&rlun->wr_sem);
> +#endif
> +
>  }
>
>  void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap)
> @@ -1991,7 +2019,13 @@ void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap)
>
>         while ((bit = find_next_bit(lun_bitmap, num_lun, bit + 1)) < num_lun) {
>                 rlun = &pblk->luns[bit];
> +
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +               pblk_rail_up_stride(pblk, bit);
> +#else
>                 up(&rlun->wr_sem);
> +#endif
> +
>         }
>  }
>
> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
> index 2b9c6ebd9fac..3e8255c8873f 100644
> --- a/drivers/lightnvm/pblk-init.c
> +++ b/drivers/lightnvm/pblk-init.c
> @@ -1050,6 +1050,7 @@ static int pblk_lines_init(struct pblk *pblk)
>         kfree(pblk->lines);
>  fail_free_chunk_meta:
>         kfree(chunk_meta);
> +
>  fail_free_luns:
>         kfree(pblk->luns);
>  fail_free_meta:
> @@ -1108,6 +1109,11 @@ static void pblk_tear_down(struct pblk *pblk, bool graceful)
>                 __pblk_pipeline_flush(pblk);
>         __pblk_pipeline_stop(pblk);
>         pblk_writer_stop(pblk);
> +
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       pblk_rail_free(pblk);
> +#endif
> +
>         pblk_rb_sync_l2p(&pblk->rwb);
>         pblk_rl_free(&pblk->rl);
>
> @@ -1226,6 +1232,12 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
>                 goto fail_stop_writer;
>         }
>
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       ret = pblk_rail_init(pblk);
> +       if (ret)
> +               goto fail_free_gc;
> +#endif
> +
>         /* inherit the size from the underlying device */
>         blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
>         blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));
> @@ -1249,6 +1261,11 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
>
>         return pblk;
>
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +fail_free_gc:
> +       pblk_gc_exit(pblk, false);
> +#endif
> +
>  fail_stop_writer:
>         pblk_writer_stop(pblk);
>  fail_free_l2p:
> diff --git a/drivers/lightnvm/pblk-rail.c b/drivers/lightnvm/pblk-rail.c
> index a48ed31a0ba9..619ff9689d29 100644
> --- a/drivers/lightnvm/pblk-rail.c
> +++ b/drivers/lightnvm/pblk-rail.c
> @@ -1,3 +1,4 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
>  /*
>   * Copyright (C) 2018 Heiner Litz
>   * Initial release: Heiner Litz <hlitz@xxxxxxxx>
> diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
> index a7648e12f54f..b04462479fe3 100644
> --- a/drivers/lightnvm/pblk-rb.c
> +++ b/drivers/lightnvm/pblk-rb.c
> @@ -389,8 +389,14 @@ static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
>         sync = READ_ONCE(rb->sync);
>         mem = READ_ONCE(rb->mem);
>
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) <
> +           nr_entries + pblk_rail_rb_delay(rb))
> +               return 0;
> +#else
>         if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < nr_entries)
>                 return 0;
> +#endif
>
>         if (pblk_rb_update_l2p(rb, nr_entries, mem, sync))
>                 return 0;
> diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
> index 67d44caefff4..a3f33503f60c 100644
> --- a/drivers/lightnvm/pblk-read.c
> +++ b/drivers/lightnvm/pblk-read.c
> @@ -472,6 +472,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
>                 return NVM_IO_DONE;
>         }
>
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       ret = pblk_rail_read_bio(pblk, rqd, blba, read_bitmap, bio_init_idx,
> +                                &bio);
> +       if (ret == NVM_IO_OK)
> +               return ret;
> +       if (ret == NVM_IO_ERR)
> +               goto fail_end_io;
> +#endif
> +
>         /* All sectors are to be read from the device */
>         if (bitmap_empty(read_bitmap, rqd->nr_ppas)) {
>                 struct bio *int_bio = NULL;
> diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
> index 6eba38b83acd..db42184cfba3 100644
> --- a/drivers/lightnvm/pblk-write.c
> +++ b/drivers/lightnvm/pblk-write.c
> @@ -469,6 +469,11 @@ static inline bool pblk_valid_meta_ppa(struct pblk *pblk,
>                                 test_bit(pos_opt, data_line->blk_bitmap))
>                 return true;
>
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       if (unlikely(pblk_rail_meta_distance(data_line)))
> +               data_line->meta_distance--;
> +#endif
> +
>         if (unlikely(pblk_ppa_comp(ppa_opt, ppa)))
>                 data_line->meta_distance--;
>
> @@ -571,6 +576,10 @@ static int pblk_submit_write(struct pblk *pblk)
>         unsigned long pos;
>         unsigned int resubmit;
>
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       pblk_rail_submit_write(pblk);
> +#endif
> +
>         spin_lock(&pblk->resubmit_lock);
>         resubmit = !list_empty(&pblk->resubmit_list);
>         spin_unlock(&pblk->resubmit_lock);
> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
> index 01fe4362b27e..9742524f74ea 100644
> --- a/drivers/lightnvm/pblk.h
> +++ b/drivers/lightnvm/pblk.h
> @@ -758,6 +758,11 @@ struct pblk {
>         struct pblk_gc gc;
>
>         pblk_map_page_fn *map_page;
> +
> +#ifdef CONFIG_NVM_PBLK_RAIL
> +       struct pblk_rail rail;
> +#endif
> +
>  };
>
>  struct pblk_line_ws {
> --
> 2.17.1
>



[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux