[RFC PATCH 6/6] lightnvm: pblk: Integrate RAIL

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Integrate Redundant Array of Independent Luns (RAIL) into lightnvm. RAIL
enforces low tail read latency by guaranteeing that reads are never
serialized behind writes and erases to the same LUN. Whenever LUNs serve a
high-latency operation, reads are performed by recomputing the original data
using redundant parity information.
RAIL trades capacity (spent on redundancy) for lower read latency; that
redundancy, however, can also be leveraged for fault tolerance.

On FIO, with the kyber scheduler set to a target read latency of 500us,
RAIL reduces tail latency percentiles (us) as follows:

       Avg    90%    99%     99.9%  99.95%  99.99%
       pblk   90     1000    2200   3000    6000
       RAIL   85     100     250    400     500

Signed-off-by: Heiner Litz <hlitz@xxxxxxxx>
---
 drivers/lightnvm/Kconfig      | 10 ++++++++++
 drivers/lightnvm/Makefile     |  1 +
 drivers/lightnvm/pblk-core.c  | 36 ++++++++++++++++++++++++++++++++++-
 drivers/lightnvm/pblk-init.c  | 17 +++++++++++++++++
 drivers/lightnvm/pblk-rail.c  |  1 +
 drivers/lightnvm/pblk-rb.c    |  6 ++++++
 drivers/lightnvm/pblk-read.c  |  9 +++++++++
 drivers/lightnvm/pblk-write.c |  9 +++++++++
 drivers/lightnvm/pblk.h       |  5 +++++
 9 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index a872cd720967..165d5a29acc3 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -35,6 +35,16 @@ config NVM_PBLK_DEBUG
 	  vocal error messages, and extra tracking fields in the pblk sysfs
 	  entries.
 
+config NVM_PBLK_RAIL
+       bool "Pblk RAIL Support"
+       default n
+       help
+         Enables RAIL for pblk. RAIL enforces tail read latency guarantees by
+	 eliminating reads being serialized behind writes to the same LUN.
+	 RAIL partitions LUNs into strides and enforces that only one LUN per
+	 stride is written at a time. Reads can bypass busy LUNs by recomputing
+	 requested data using parity redundancy.
+
 endif # NVM_PBLK_DEBUG
 
 endif # NVM
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index 97d9d7c71550..92f4376428cc 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -5,6 +5,7 @@
 
 obj-$(CONFIG_NVM)		:= core.o
 obj-$(CONFIG_NVM_PBLK)		+= pblk.o
+obj-$(CONFIG_NVM_PBLK_RAIL)	+= pblk-rail.o
 pblk-y				:= pblk-init.o pblk-core.o pblk-rb.o \
 				   pblk-write.o pblk-cache.o pblk-read.o \
 				   pblk-gc.o pblk-recovery.o pblk-map.o \
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index a31bf359f905..ca74d7763fa9 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -113,6 +113,12 @@ static void pblk_end_io_erase(struct nvm_rq *rqd)
 {
 	struct pblk *pblk = rqd->private;
 
+#ifdef CONFIG_NVM_PBLK_RAIL
+	struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
+
+	pblk_up_chunk(pblk, ppa_list[0]);
+#endif
+
 	__pblk_end_io_erase(pblk, rqd);
 	mempool_free(rqd, &pblk->e_rq_pool);
 }
@@ -940,7 +946,11 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
 	/* The write thread schedules erases so that it minimizes disturbances
 	 * with writes. Thus, there is no need to take the LUN semaphore.
 	 */
+#ifdef CONFIG_NVM_PBLK_RAIL
+	ret = pblk_submit_io_sync_sem(pblk, &rqd);
+#else
 	ret = pblk_submit_io_sync(pblk, &rqd);
+#endif
 	rqd.private = pblk;
 	__pblk_end_io_erase(pblk, &rqd);
 
@@ -1754,7 +1764,11 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
 	/* The write thread schedules erases so that it minimizes disturbances
 	 * with writes. Thus, there is no need to take the LUN semaphore.
 	 */
+#ifdef CONFIG_NVM_PBLK_RAIL
+	err = pblk_submit_io_sem(pblk, rqd);
+#else
 	err = pblk_submit_io(pblk, rqd);
+#endif
 	if (err) {
 		struct nvm_tgt_dev *dev = pblk->dev;
 		struct nvm_geo *geo = &dev->geo;
@@ -1909,6 +1923,10 @@ void pblk_line_close_ws(struct work_struct *work)
 	if (w_err_gc->has_write_err)
 		pblk_save_lba_list(pblk, line);
 
+#ifdef CONFIG_NVM_PBLK_RAIL
+	pblk_rail_line_close(pblk, line);
+#endif
+
 	pblk_line_close(pblk, line);
 	mempool_free(line_ws, &pblk->gen_ws_pool);
 }
@@ -1938,8 +1956,12 @@ static void __pblk_down_chunk(struct pblk *pblk, int pos)
 	 * Only send one inflight I/O per LUN. Since we map at a page
 	 * granurality, all ppas in the I/O will map to the same LUN
 	 */
-
+#ifdef CONFIG_NVM_PBLK_RAIL
+	(void)rlun;
+	ret = pblk_rail_down_stride(pblk, pos, msecs_to_jiffies(30000));
+#else
 	ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000));
+#endif
 	if (ret == -ETIME || ret == -EINTR)
 		pblk_err(pblk, "taking lun semaphore timed out: err %d\n",
 				-ret);
@@ -1978,7 +2000,13 @@ void pblk_up_chunk(struct pblk *pblk, struct ppa_addr ppa)
 	int pos = pblk_ppa_to_pos(geo, ppa);
 
 	rlun = &pblk->luns[pos];
+
+#ifdef CONFIG_NVM_PBLK_RAIL
+	pblk_rail_up_stride(pblk, pos);
+#else
 	up(&rlun->wr_sem);
+#endif
+
 }
 
 void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap)
@@ -1991,7 +2019,13 @@ void pblk_up_rq(struct pblk *pblk, unsigned long *lun_bitmap)
 
 	while ((bit = find_next_bit(lun_bitmap, num_lun, bit + 1)) < num_lun) {
 		rlun = &pblk->luns[bit];
+
+#ifdef CONFIG_NVM_PBLK_RAIL
+		pblk_rail_up_stride(pblk, bit);
+#else
 		up(&rlun->wr_sem);
+#endif
+
 	}
 }
 
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 2b9c6ebd9fac..3e8255c8873f 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -1050,6 +1050,7 @@ static int pblk_lines_init(struct pblk *pblk)
 	kfree(pblk->lines);
 fail_free_chunk_meta:
 	kfree(chunk_meta);
+
 fail_free_luns:
 	kfree(pblk->luns);
 fail_free_meta:
@@ -1108,6 +1109,11 @@ static void pblk_tear_down(struct pblk *pblk, bool graceful)
 		__pblk_pipeline_flush(pblk);
 	__pblk_pipeline_stop(pblk);
 	pblk_writer_stop(pblk);
+
+#ifdef CONFIG_NVM_PBLK_RAIL
+	pblk_rail_free(pblk);
+#endif
+
 	pblk_rb_sync_l2p(&pblk->rwb);
 	pblk_rl_free(&pblk->rl);
 
@@ -1226,6 +1232,12 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 		goto fail_stop_writer;
 	}
 
+#ifdef CONFIG_NVM_PBLK_RAIL
+	ret = pblk_rail_init(pblk);
+	if (ret)
+		goto fail_free_gc;
+#endif
+
 	/* inherit the size from the underlying device */
 	blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
 	blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));
@@ -1249,6 +1261,11 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 
 	return pblk;
 
+#ifdef CONFIG_NVM_PBLK_RAIL
+fail_free_gc:
+	pblk_gc_exit(pblk, false);
+#endif
+
 fail_stop_writer:
 	pblk_writer_stop(pblk);
 fail_free_l2p:
diff --git a/drivers/lightnvm/pblk-rail.c b/drivers/lightnvm/pblk-rail.c
index a48ed31a0ba9..619ff9689d29 100644
--- a/drivers/lightnvm/pblk-rail.c
+++ b/drivers/lightnvm/pblk-rail.c
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (C) 2018 Heiner Litz
  * Initial release: Heiner Litz <hlitz@xxxxxxxx>
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index a7648e12f54f..b04462479fe3 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -389,8 +389,14 @@ static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
 	sync = READ_ONCE(rb->sync);
 	mem = READ_ONCE(rb->mem);
 
+#ifdef CONFIG_NVM_PBLK_RAIL
+	if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) <
+	    nr_entries + pblk_rail_rb_delay(rb))
+		return 0;
+#else
 	if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < nr_entries)
 		return 0;
+#endif
 
 	if (pblk_rb_update_l2p(rb, nr_entries, mem, sync))
 		return 0;
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 67d44caefff4..a3f33503f60c 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -472,6 +472,15 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
 		return NVM_IO_DONE;
 	}
 
+#ifdef CONFIG_NVM_PBLK_RAIL
+	ret = pblk_rail_read_bio(pblk, rqd, blba, read_bitmap, bio_init_idx,
+				 &bio);
+	if (ret == NVM_IO_OK)
+		return ret;
+	if (ret == NVM_IO_ERR)
+		goto fail_end_io;
+#endif
+
 	/* All sectors are to be read from the device */
 	if (bitmap_empty(read_bitmap, rqd->nr_ppas)) {
 		struct bio *int_bio = NULL;
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 6eba38b83acd..db42184cfba3 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -469,6 +469,11 @@ static inline bool pblk_valid_meta_ppa(struct pblk *pblk,
 				test_bit(pos_opt, data_line->blk_bitmap))
 		return true;
 
+#ifdef CONFIG_NVM_PBLK_RAIL
+	if (unlikely(pblk_rail_meta_distance(data_line)))
+		data_line->meta_distance--;
+#endif
+
 	if (unlikely(pblk_ppa_comp(ppa_opt, ppa)))
 		data_line->meta_distance--;
 
@@ -571,6 +576,10 @@ static int pblk_submit_write(struct pblk *pblk)
 	unsigned long pos;
 	unsigned int resubmit;
 
+#ifdef CONFIG_NVM_PBLK_RAIL
+	pblk_rail_submit_write(pblk);
+#endif
+
 	spin_lock(&pblk->resubmit_lock);
 	resubmit = !list_empty(&pblk->resubmit_list);
 	spin_unlock(&pblk->resubmit_lock);
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 01fe4362b27e..9742524f74ea 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -758,6 +758,11 @@ struct pblk {
 	struct pblk_gc gc;
 
 	pblk_map_page_fn *map_page;
+
+#ifdef CONFIG_NVM_PBLK_RAIL
+	struct pblk_rail rail;
+#endif
+
 };
 
 struct pblk_line_ws {
-- 
2.17.1




[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux