Integrate Redundant Array of Independent LUNs (RAIL) into lightnvm. RAIL enforces low tail read latency by guaranteeing that reads are never serialized behind writes and erases to the same LUN. Whenever LUNs serve a high-latency operation, reads are performed by recomputing the original data from redundant parity information. RAIL trades capacity (redundancy) for read latency; the redundancy, however, can also be leveraged for fault tolerance. On FIO, with the kyber scheduler set to a target read latency of 500us, RAIL reduces tail latency percentiles (us) as follows: Avg 90% 99% 99.9% 99.95% 99.99% pblk 90 1000 2200 3000 6000 RAIL 85 100 250 400 500 Signed-off-by: Heiner Litz <hlitz@xxxxxxxx> --- drivers/lightnvm/Kconfig | 10 ++++++++++ drivers/lightnvm/Makefile | 1 + drivers/lightnvm/pblk-core.c | 36 ++++++++++++++++++++++++++++++++++- drivers/lightnvm/pblk-init.c | 17 +++++++++++++++++ drivers/lightnvm/pblk-rail.c | 1 + drivers/lightnvm/pblk-rb.c | 6 ++++++ drivers/lightnvm/pblk-read.c | 9 +++++++++ drivers/lightnvm/pblk-write.c | 9 +++++++++ drivers/lightnvm/pblk.h | 5 +++++ 9 files changed, 93 insertions(+), 1 deletion(-) diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index a872cd720967..165d5a29acc3 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -35,6 +35,16 @@ config NVM_PBLK_DEBUG vocal error messages, and extra tracking fields in the pblk sysfs entries. +config NVM_PBLK_RAIL + bool "Pblk RAIL Support" + default n + help + Enables RAIL for pblk. RAIL enforces tail read latency guarantees by + eliminating reads being serialized behind writes to the same LUN. + RAIL partitions LUNs into strides and enforces that only one LUN per + stride is written at a time. Reads can bypass busy LUNs by recomputing + requested data using parity redundancy. 
+ endif # NVM_PBLK_DEBUG endif # NVM diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index 97d9d7c71550..92f4376428cc 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_NVM) := core.o obj-$(CONFIG_NVM_PBLK) += pblk.o +obj-$(CONFIG_NVM_PBLK_RAIL) += pblk-rail.o pblk-y := pblk-init.o pblk-core.o pblk-rb.o \ pblk-write.o pblk-cache.o pblk-read.o \ pblk-gc.o pblk-recovery.o pblk-map.o \ diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index a31bf359f905..ca74d7763fa9 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -113,6 +113,12 @@ static void pblk_end_io_erase(struct nvm_rq *rqd) { struct pblk *pblk = rqd->private; +#ifdef CONFIG_NVM_PBLK_RAIL + struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd); + + pblk_up_chunk(pblk, ppa_list[0]); +#endif + __pblk_end_io_erase(pblk, rqd); mempool_free(rqd, &pblk->e_rq_pool); } @@ -940,7 +946,11 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) /* The write thread schedules erases so that it minimizes disturbances * with writes. Thus, there is no need to take the LUN semaphore. */ +#ifdef CONFIG_NVM_PBLK_RAIL + ret = pblk_submit_io_sync_sem(pblk, &rqd); +#else ret = pblk_submit_io_sync(pblk, &rqd); +#endif rqd.private = pblk; __pblk_end_io_erase(pblk, &rqd); @@ -1754,7 +1764,11 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) /* The write thread schedules erases so that it minimizes disturbances * with writes. Thus, there is no need to take the LUN semaphore. 
*/ +#ifdef CONFIG_NVM_PBLK_RAIL + err = pblk_submit_io_sem(pblk, rqd); +#else err = pblk_submit_io(pblk, rqd); +#endif if (err) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; @@ -1909,6 +1923,10 @@ void pblk_line_close_ws(struct work_struct *work) if (w_err_gc->has_write_err) pblk_save_lba_list(pblk, line); +#ifdef CONFIG_NVM_PBLK_RAIL + pblk_rail_line_close(pblk, line); +#endif + pblk_line_close(pblk, line); mempool_free(line_ws, &pblk->gen_ws_pool); } @@ -1938,8 +1956,12 @@ static void __pblk_down_chunk(struct pblk *pblk, int pos) * Only send one inflight I/O per LUN. Since we map at a page * granurality, all ppas in the I/O will map to the same LUN */ - +#ifdef CONFIG_NVM_PBLK_RAIL + (void)rlun; + ret = pblk_rail_down_stride(pblk, pos, msecs_to_jiffies(30000)); +#else ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); +#endif if (ret == -ETIME || ret == -EINTR) pblk_err(pblk, "taking lun semaphore timed out: err %d\n", -ret); @@ -1978,7 +2000,13 @@ void pblk_up_chunk(struct pblk *pblk, struct ppa_addr ppa) int pos = pblk_ppa_to_pos(geo, ppa); rlun = &pblk->luns[pos]; + +#ifdef CONFIG_NVM_PBLK_RAIL + pblk_rail_up_stride(pblk, pos); +#else up(&rlun->wr_sem); +#endif + } void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap) @@ -1991,7 +2019,13 @@ void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap) while ((bit = find_next_bit(lun_bitmap, num_lun, bit + 1)) < num_lun) { rlun = &pblk->luns[bit]; + +#ifdef CONFIG_NVM_PBLK_RAIL + pblk_rail_up_stride(pblk, bit); +#else up(&rlun->wr_sem); +#endif + } } diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 2b9c6ebd9fac..3e8255c8873f 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -1050,6 +1050,7 @@ static int pblk_lines_init(struct pblk *pblk) kfree(pblk->lines); fail_free_chunk_meta: kfree(chunk_meta); + fail_free_luns: kfree(pblk->luns); fail_free_meta: @@ -1108,6 +1109,11 @@ static void pblk_tear_down(struct pblk 
*pblk, bool graceful) __pblk_pipeline_flush(pblk); __pblk_pipeline_stop(pblk); pblk_writer_stop(pblk); + +#ifdef CONFIG_NVM_PBLK_RAIL + pblk_rail_free(pblk); +#endif + pblk_rb_sync_l2p(&pblk->rwb); pblk_rl_free(&pblk->rl); @@ -1226,6 +1232,12 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, goto fail_stop_writer; } +#ifdef CONFIG_NVM_PBLK_RAIL + ret = pblk_rail_init(pblk); + if (ret) + goto fail_free_gc; +#endif + /* inherit the size from the underlying device */ blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue)); blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue)); @@ -1249,6 +1261,11 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, return pblk; +#ifdef CONFIG_NVM_PBLK_RAIL +fail_free_gc: + pblk_gc_exit(pblk, false); +#endif + fail_stop_writer: pblk_writer_stop(pblk); fail_free_l2p: diff --git a/drivers/lightnvm/pblk-rail.c b/drivers/lightnvm/pblk-rail.c index a48ed31a0ba9..619ff9689d29 100644 --- a/drivers/lightnvm/pblk-rail.c +++ b/drivers/lightnvm/pblk-rail.c @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2018 Heiner Litz * Initial release: Heiner Litz <hlitz@xxxxxxxx> diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index a7648e12f54f..b04462479fe3 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -389,8 +389,14 @@ static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries, sync = READ_ONCE(rb->sync); mem = READ_ONCE(rb->mem); +#ifdef CONFIG_NVM_PBLK_RAIL + if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < + nr_entries + pblk_rail_rb_delay(rb)) + return 0; +#else if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < nr_entries) return 0; +#endif if (pblk_rb_update_l2p(rb, nr_entries, mem, sync)) return 0; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 67d44caefff4..a3f33503f60c 100644 --- a/drivers/lightnvm/pblk-read.c +++ 
b/drivers/lightnvm/pblk-read.c @@ -472,6 +472,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) return NVM_IO_DONE; } +#ifdef CONFIG_NVM_PBLK_RAIL + ret = pblk_rail_read_bio(pblk, rqd, blba, read_bitmap, bio_init_idx, + &bio); + if (ret == NVM_IO_OK) + return ret; + if (ret == NVM_IO_ERR) + goto fail_end_io; +#endif + /* All sectors are to be read from the device */ if (bitmap_empty(read_bitmap, rqd->nr_ppas)) { struct bio *int_bio = NULL; diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 6eba38b83acd..db42184cfba3 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -469,6 +469,11 @@ static inline bool pblk_valid_meta_ppa(struct pblk *pblk, test_bit(pos_opt, data_line->blk_bitmap)) return true; +#ifdef CONFIG_NVM_PBLK_RAIL + if (unlikely(pblk_rail_meta_distance(data_line))) + data_line->meta_distance--; +#endif + if (unlikely(pblk_ppa_comp(ppa_opt, ppa))) data_line->meta_distance--; @@ -571,6 +576,10 @@ static int pblk_submit_write(struct pblk *pblk) unsigned long pos; unsigned int resubmit; +#ifdef CONFIG_NVM_PBLK_RAIL + pblk_rail_submit_write(pblk); +#endif + spin_lock(&pblk->resubmit_lock); resubmit = !list_empty(&pblk->resubmit_list); spin_unlock(&pblk->resubmit_lock); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 01fe4362b27e..9742524f74ea 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -758,6 +758,11 @@ struct pblk { struct pblk_gc gc; pblk_map_page_fn *map_page; + +#ifdef CONFIG_NVM_PBLK_RAIL + struct pblk_rail rail; +#endif + }; struct pblk_line_ws { -- 2.17.1