LightNVM implements the internal logic of an SSD within the host system. This
includes logic such as translation tables for logical to physical address
translation, garbage collection and wear-leveling. It is designed to be used
either standalone or with a LightNVM compatible firmware.

If used standalone, NVM memory can be simulated by passing timings to the dm
target table. If used with a LightNVM compatible device, the device will be
queried upon initialization for the relevant values. The latter part is still
in progress and a fully working prototype will be presented in upcoming
patches.

Contributions to make this possible were made by the following people:

Aviad Zuck <aviadzuc@xxxxxxxxx>
Jesper Madsen <jmad@xxxxxx>

Signed-off-by: Matias Bjorling <m@xxxxxxxxxxx>
---
 drivers/md/Kconfig             |   1 +
 drivers/md/Makefile            |   1 +
 drivers/md/lightnvm/Kconfig    |  14 +
 drivers/md/lightnvm/Makefile   |   1 +
 drivers/md/lightnvm/core.c     | 705 +++++++++++++++++++++++++++++++++++++++++
 drivers/md/lightnvm/gc.c       | 208 ++++++++++++
 drivers/md/lightnvm/lightnvm.c | 589 ++++++++++++++++++++++++++++++++++
 drivers/md/lightnvm/lightnvm.h | 592 ++++++++++++++++++++++++++++++++++
 drivers/md/lightnvm/reg.c      |  41 +++
 9 files changed, 2152 insertions(+)
 create mode 100644 drivers/md/lightnvm/Kconfig
 create mode 100644 drivers/md/lightnvm/Makefile
 create mode 100644 drivers/md/lightnvm/core.c
 create mode 100644 drivers/md/lightnvm/gc.c
 create mode 100644 drivers/md/lightnvm/lightnvm.c
 create mode 100644 drivers/md/lightnvm/lightnvm.h
 create mode 100644 drivers/md/lightnvm/reg.c

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index f2ccbc3..ffce728 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -175,6 +175,7 @@ config MD_FAULTY
	  In unsure, say N.

 source "drivers/md/bcache/Kconfig"
+source "drivers/md/lightnvm/Kconfig"

 config BLK_DEV_DM
	tristate "Device mapper support"
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 2acc43f..ee1d9d7 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
 obj-$(CONFIG_BCACHE)		+= bcache/
+obj-$(CONFIG_LIGHTNVM)		+= lightnvm/
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
 obj-$(CONFIG_DM_BUFIO)		+= dm-bufio.o
diff --git a/drivers/md/lightnvm/Kconfig b/drivers/md/lightnvm/Kconfig
new file mode 100644
index 0000000..1f10554
--- /dev/null
+++ b/drivers/md/lightnvm/Kconfig
@@ -0,0 +1,14 @@
+config LIGHTNVM
+	tristate "LightNVM translation layer support (EXPERIMENTAL)"
+	depends on BLK_DEV_DM
+	---help---
+	  A target that implements the internals of SSDs within the host.
+	  The target can be used with a LightNVM compatible device or as an
+	  in-memory store. The device mapper is used together with a
+	  "bare" firmware. It exposes direct access to the underlying NVM.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-lightnvm.
+
+	  If unsure, say N.
+ diff --git a/drivers/md/lightnvm/Makefile b/drivers/md/lightnvm/Makefile new file mode 100644 index 0000000..4fb03ba --- /dev/null +++ b/drivers/md/lightnvm/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_LIGHTNVM) += lightnvm.o reg.o core.o gc.o diff --git a/drivers/md/lightnvm/core.c b/drivers/md/lightnvm/core.c new file mode 100644 index 0000000..113fde9 --- /dev/null +++ b/drivers/md/lightnvm/core.c @@ -0,0 +1,705 @@ +#include "lightnvm.h" + +/* alloc pbd, but also decorate it with bio */ +static struct per_bio_data *alloc_init_pbd(struct nvmd *nvmd, struct bio *bio) +{ + struct per_bio_data *pb = mempool_alloc(nvmd->per_bio_pool, GFP_NOIO); + + if (!pb) { + DMERR("Couldn't allocate per_bio_data"); + return NULL; + } + + pb->bi_end_io = bio->bi_end_io; + pb->bi_private = bio->bi_private; + + bio->bi_private = pb; + + return pb; +} + +static void free_pbd(struct nvmd *nvmd, struct per_bio_data *pb) +{ + mempool_free(pb, nvmd->per_bio_pool); +} + +/* bio to be stripped from the pbd structure */ +static void exit_pbd(struct per_bio_data *pb, struct bio *bio) +{ + bio->bi_private = pb->bi_private; + bio->bi_end_io = pb->bi_end_io; +} + +/* deferred bios are used when no available nvm pages. Allowing GC to execute + * and resubmit bios */ +void nvm_defer_bio(struct nvmd *nvmd, struct bio *bio, void *private) +{ + spin_lock(&nvmd->deferred_lock); + bio_list_add(&nvmd->deferred_bios, bio); + spin_unlock(&nvmd->deferred_lock); +} + +void nvm_deferred_bio_submit(struct work_struct *work) +{ + struct nvmd *nvmd = container_of(work, struct nvmd, deferred_ws); + struct bio *bio; + + spin_lock(&nvmd->deferred_lock); + bio = bio_list_get(&nvmd->deferred_bios); + spin_unlock(&nvmd->deferred_lock); + + while (bio) { + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + if (bio_data_dir(bio) == WRITE) + nvmd->type->write_bio(nvmd, bio); + else + nvmd->type->read_bio(nvmd, bio); + bio = next; + } +} + +/* delayed bios are used for making pool accesses sequential */ +void nvm_delayed_bio_submit(struct work_struct *work) +{ + struct nvm_pool *pool = container_of(work, struct nvm_pool, waiting_ws); + struct bio *bio; + struct per_bio_data *pb; + + spin_lock(&pool->waiting_lock); + bio = bio_list_pop(&pool->waiting_bios); + + pool->cur_bio = bio; + if (!bio) { + atomic_dec(&pool->is_active); + spin_unlock(&pool->waiting_lock); + return; + } + + spin_unlock(&pool->waiting_lock); + + /* setup timings to track end timings accordently */ + pb = bio->bi_private; + getnstimeofday(&pb->start_tv); + + submit_bio(bio->bi_rw, bio); +} + +/* requires lock on the translation map used */ +void invalidate_block_page(struct nvmd *nvmd, struct nvm_addr *p) +{ + unsigned int page_offset; + struct nvm_block *block = p->block; + + page_offset = p->addr % nvmd->nr_host_pages_in_blk; + spin_lock(&block->lock); + WARN_ON(test_and_set_bit(page_offset, block->invalid_pages)); + block->nr_invalid_pages++; + spin_unlock(&block->lock); +} + +void nvm_update_map(struct nvmd *nvmd, sector_t l_addr, struct nvm_addr *p, + int is_gc, struct nvm_addr *trans_map) +{ + struct nvm_addr *gp; + struct nvm_rev_addr *rev; + + BUG_ON(l_addr >= nvmd->nr_pages); + BUG_ON(p->addr >= nvmd->nr_pages); + + gp = &trans_map[l_addr]; + spin_lock(&nvmd->rev_lock); + if (gp->block) { + invalidate_block_page(nvmd, gp); + nvmd->rev_trans_map[gp->addr].addr = LTOP_POISON; + } + + gp->addr = p->addr; + gp->block = p->block; + + rev = &nvmd->rev_trans_map[p->addr]; + rev->addr = l_addr; + rev->trans_map = trans_map; + spin_unlock(&nvmd->rev_lock); +} + +/* requires 
pool->lock taken */ +inline void nvm_reset_block(struct nvm_block *block) +{ + struct nvmd *nvmd = block->pool->nvmd; + + BUG_ON(!block); + + spin_lock(&block->lock); + bitmap_zero(block->invalid_pages, nvmd->nr_host_pages_in_blk); + block->ap = NULL; + block->next_page = 0; + block->next_offset = 0; + block->nr_invalid_pages = 0; + atomic_set(&block->gc_running, 0); + atomic_set(&block->data_size, 0); + atomic_set(&block->data_cmnt_size, 0); + spin_unlock(&block->lock); +} + +/* use pool_[get/put]_block to administer the blocks in use for each pool. + * Whenever a block is in used by an append point, we store it within the + * used_list. We then move it back when its free to be used by another append + * point. + * + * The newly acclaimed block is always added to the back of user_list. As we + * assume that the start of used list is the oldest block, and therefore higher + * probability of invalidated pages. + */ +struct nvm_block *nvm_pool_get_block(struct nvm_pool *pool, int is_gc) +{ + struct nvmd *nvmd = pool->nvmd; + struct nvm_block *block = NULL; + + BUG_ON(!pool); + + spin_lock(&pool->lock); + + if (list_empty(&pool->free_list)) { + DMERR_LIMIT("Pool have no free pages available"); + spin_unlock(&pool->lock); + show_pool(pool); + return NULL; + } + + while (!is_gc && pool->nr_free_blocks < nvmd->nr_aps) { + spin_unlock(&pool->lock); + return NULL; + } + + block = list_first_entry(&pool->free_list, struct nvm_block, list); + list_move_tail(&block->list, &pool->used_list); + + pool->nr_free_blocks--; + + spin_unlock(&pool->lock); + + nvm_reset_block(block); + + block->data = mempool_alloc(nvmd->block_page_pool, GFP_ATOMIC); + BUG_ON(!block->data); + + return block; +} + +/* We assume that all valid pages have already been moved when added back to the + * free list. We add it last to allow round-robin use of all pages. Thereby + * provide simple (naive) wear-leveling. 
+ */ +void nvm_pool_put_block(struct nvm_block *block) +{ + struct nvm_pool *pool = block->pool; + + spin_lock(&pool->lock); + + list_move_tail(&block->list, &pool->free_list); + pool->nr_free_blocks++; + + spin_unlock(&pool->lock); +} + +static sector_t __nvm_alloc_phys_addr(struct nvm_block *block, + nvm_page_special_fn ps) +{ + struct nvmd *nvmd; + sector_t addr = LTOP_EMPTY; + + BUG_ON(!block); + + nvmd = block->pool->nvmd; + + spin_lock(&block->lock); + + if (block_is_full(block)) + goto out; + + /* If there is multiple host pages within a flash page, we add the + * the offset to the address, instead of requesting a new page + * from the physical block */ + if (block->next_offset == NR_HOST_PAGES_IN_FLASH_PAGE) { + if (ps && !ps(nvmd, block->next_page + 1)) + goto out; + + block->next_offset = 0; + block->next_page++; + } + + addr = block_to_addr(block) + + (block->next_page * NR_HOST_PAGES_IN_FLASH_PAGE) + + block->next_offset; + block->next_offset++; + + if (nvmd->type->alloc_phys_addr) + nvmd->type->alloc_phys_addr(nvmd, block); + +out: + spin_unlock(&block->lock); + return addr; +} + +sector_t nvm_alloc_phys_addr_special(struct nvm_block *block, + nvm_page_special_fn ps) +{ + return __nvm_alloc_phys_addr(block, ps); +} + +sector_t nvm_alloc_phys_addr(struct nvm_block *block) +{ + return __nvm_alloc_phys_addr(block, NULL); +} + +/* requires ap->lock taken */ +void nvm_set_ap_cur(struct nvm_ap *ap, struct nvm_block *block) +{ + BUG_ON(!ap); + BUG_ON(!block); + + if (ap->cur) { + spin_lock(&ap->cur->lock); + WARN_ON(!block_is_full(ap->cur)); + spin_unlock(&ap->cur->lock); + ap->cur->ap = NULL; + } + ap->cur = block; + ap->cur->ap = ap; +} + +/* requires ap->lock held */ +struct nvm_addr *nvm_alloc_addr_from_ap(struct nvm_ap *ap, int is_gc) +{ + struct nvmd *nvmd = ap->parent; + struct nvm_block *p_block; + struct nvm_pool *pool; + struct nvm_addr *p; + sector_t p_addr; + + p = mempool_alloc(nvmd->addr_pool, GFP_ATOMIC); + if (!p) + return NULL; + + p_block = ap->cur; + pool = p_block->pool; + p_addr = nvm_alloc_phys_addr(p_block); + + if (p_addr == LTOP_EMPTY) { + p_block = nvm_pool_get_block(pool, 0); + + if (!p_block) { + if (is_gc) { + p_addr = nvm_alloc_phys_addr(ap->gc_cur); + if (p_addr == LTOP_EMPTY) { + p_block = nvm_pool_get_block(pool, 1); + ap->gc_cur = p_block; + ap->gc_cur->ap = ap; + if (!p_block) { + show_all_pools(ap->parent); + DMERR("No more blocks"); + goto finished; + } else { + p_addr = + nvm_alloc_phys_addr(ap->gc_cur); + } + } + p_block = ap->gc_cur; + } + goto finished; + } + + nvm_set_ap_cur(ap, p_block); + p_addr = nvm_alloc_phys_addr(p_block); + } + +finished: + if (p_addr == LTOP_EMPTY) { + mempool_free(p, nvmd->addr_pool); + return NULL; + } + + p->addr = p_addr; + p->block = p_block; + p->private = NULL; + + if (!p_block) + WARN_ON(is_gc); + + return p; +} + +void nvm_erase_block(struct nvm_block *block) +{ + /* Send erase command to device. */ +} + +static void nvm_fill_bio_and_end(struct bio *bio) +{ + zero_fill_bio(bio); + bio_endio(bio, 0); +} + +struct nvm_addr *nvm_lookup_ltop_map(struct nvmd *nvmd, sector_t l_addr, + struct nvm_addr *map, void *private) +{ + struct nvm_addr *gp, *p; + + BUG_ON(!(l_addr >= 0 && l_addr < nvmd->nr_pages)); + + p = mempool_alloc(nvmd->addr_pool, GFP_ATOMIC); + if (!p) + return NULL; + + gp = &map[l_addr]; + + p->addr = gp->addr; + p->block = gp->block; + + /* if it has not been written, p is inited to 0. */ + if (p->block) { + /* during gc, the mapping will be updated accordently. 
We + * therefore stop submitting new reads to the address, until it + * is copied to the new place. */ + if (atomic_read(&p->block->gc_running)) + goto err; + } + + p->private = private; + + return p; +err: + mempool_free(p, nvmd->addr_pool); + return NULL; + +} + +/* lookup the primary translation table. If there isn't an associated block to + * the addr. We assume that there is no data and doesn't take a ref */ +struct nvm_addr *nvm_lookup_ltop(struct nvmd *nvmd, sector_t l_addr) +{ + return nvm_lookup_ltop_map(nvmd, l_addr, nvmd->trans_map, NULL); +} + +/* Simple round-robin Logical to physical address translation. + * + * Retrieve the mapping using the active append point. Then update the ap for + * the next write to the disk. + * + * Returns nvm_addr with the physical address and block. Remember to return to + * nvmd->addr_cache when bio is finished. + */ +struct nvm_addr *nvm_map_ltop_rr(struct nvmd *nvmd, sector_t l_addr, int is_gc, + struct nvm_addr *trans_map, void *private) +{ + struct nvm_ap *ap; + struct nvm_addr *p; + int i = 0; + + + if (!is_gc) { + ap = get_next_ap(nvmd); + } else { + /* during GC, we don't care about RR, instead we want to make + * sure that we maintain evenness between the block pools. */ + unsigned int i; + struct nvm_pool *pool, *max_free; + + max_free = &nvmd->pools[0]; + /* prevent GC-ing pool from devouring pages of a pool with + * little free blocks. We don't take the lock as we only need an + * estimate. */ + nvm_for_each_pool(nvmd, pool, i) { + if (pool->nr_free_blocks > max_free->nr_free_blocks) + max_free = pool; + } + + ap = &nvmd->aps[max_free->id]; + } + + spin_lock(&ap->lock); + p = nvm_alloc_addr_from_ap(ap, is_gc); + spin_unlock(&ap->lock); + + if (p) + nvm_update_map(nvmd, l_addr, p, is_gc, trans_map); + + return p; +} + +static void nvm_endio(struct bio *bio, int err) +{ + struct per_bio_data *pb; + struct nvmd *nvmd; + struct nvm_ap *ap; + struct nvm_pool *pool; + struct nvm_addr *p; + struct nvm_block *block; + struct timespec end_tv, diff_tv; + unsigned long diff, dev_wait, total_wait = 0; + unsigned int data_cnt; + + pb = get_per_bio_data(bio); + p = pb->addr; + block = p->block; + ap = pb->ap; + nvmd = ap->parent; + pool = ap->pool; + + nvm_unlock_addr(nvmd, pb->l_addr); + + if (bio_data_dir(bio) == WRITE) { + /* maintain data in buffer until block is full */ + data_cnt = atomic_inc_return(&block->data_cmnt_size); + if (data_cnt == nvmd->nr_host_pages_in_blk) { + mempool_free(block->data, nvmd->block_page_pool); + block->data = NULL; + + spin_lock(&pool->lock); + list_add_tail(&block->prio, &pool->prio_list); + spin_unlock(&pool->lock); + } + + /* physical waits if hardware doesn't have a real backend */ + dev_wait = ap->t_write; + } else { + dev_wait = ap->t_read; + } + + + if (nvmd->type->endio) + nvmd->type->endio(nvmd, bio, pb, &dev_wait); + + if (!(nvmd->config.flags & NVM_OPT_NO_WAITS) && dev_wait) { +wait_longer: + getnstimeofday(&end_tv); + diff_tv = timespec_sub(end_tv, pb->start_tv); + diff = timespec_to_ns(&diff_tv) / 1000; + if (dev_wait > diff) { + total_wait = dev_wait - diff; + WARN_ON(total_wait > 1500); + if (total_wait > 10) + udelay(5); + goto wait_longer; + } + } + + if (nvmd->config.flags & NVM_OPT_POOL_SERIALIZE) { + /* we need this. 
updating pool current only by waiting_bios + * worker leaves a windows where current is bio thats was + * already ended */ + spin_lock(&pool->waiting_lock); + pool->cur_bio = NULL; + spin_unlock(&pool->waiting_lock); + + queue_work(nvmd->kbiod_wq, &pool->waiting_ws); + } + + /* Finish up */ + exit_pbd(pb, bio); + + if (bio->bi_end_io) + bio->bi_end_io(bio, err); + + if (pb->orig_bio) + bio_endio(pb->orig_bio, err); + + if (pb->event) { + complete(pb->event); + /* all submitted bios allocate their own addr, + * except GC reads */ + if (bio_data_dir(bio) == READ) + goto free_pb; + } + + mempool_free(pb->addr, nvmd->addr_pool); +free_pb: + free_pbd(nvmd, pb); +} + +static void nvm_end_read_bio(struct bio *bio, int err) +{ + /* FIXME: Implement error handling of reads + * Remember that bio->bi_end_io is overwritten during bio_split() + */ + nvm_endio(bio, err); +} + +static void nvm_end_write_bio(struct bio *bio, int err) +{ + /* FIXME: Implement error handling of writes */ + nvm_endio(bio, err); + + /* separate bio is allocated on write. Remember to free it */ + bio_put(bio); +} + +int nvm_read_bio(struct nvmd *nvmd, struct bio *bio) +{ + struct nvm_addr *p; + sector_t l_addr; + + l_addr = bio->bi_sector / NR_PHY_IN_LOG; + + nvm_lock_addr(nvmd, l_addr); + + p = nvmd->type->lookup_ltop(nvmd, l_addr); + + if (!p) { + nvm_unlock_addr(nvmd, l_addr); + nvm_defer_bio(nvmd, bio, NULL); + nvm_gc_kick(nvmd); + goto finished; + } + + bio->bi_sector = p->addr * NR_PHY_IN_LOG + + (bio->bi_sector % NR_PHY_IN_LOG); + + if (!p->block) { + bio->bi_sector = 0; + nvm_fill_bio_and_end(bio); + mempool_free(p, nvmd->addr_pool); + nvm_unlock_addr(nvmd, l_addr); + goto finished; + } + + nvm_submit_bio(nvmd, p, l_addr, READ, bio, NULL, NULL, nvmd->trans_map); +finished: + return DM_MAPIO_SUBMITTED; +} + +int nvm_bv_copy(struct nvm_addr *p, struct bio_vec *bv) +{ + struct nvmd *nvmd = p->block->pool->nvmd; + struct nvm_block *block = p->block; + unsigned int idx; + void *src_p, *dst_p; + + idx = p->addr % nvmd->nr_host_pages_in_blk; + src_p = kmap_atomic(bv->bv_page); + dst_p = kmap_atomic(&block->data[idx]); + memcpy(dst_p, src_p, bv->bv_len); + + kunmap_atomic(dst_p); + kunmap_atomic(src_p); + + return atomic_inc_return(&block->data_size); +} + +struct bio *nvm_write_init_bio(struct nvmd *nvmd, struct bio *bio, + struct nvm_addr *p) +{ + struct bio *issue_bio; + int i, size; + + /* FIXME: check for failure */ + issue_bio = bio_alloc(GFP_NOIO, NR_HOST_PAGES_IN_FLASH_PAGE); + issue_bio->bi_bdev = nvmd->dev->bdev; + issue_bio->bi_sector = p->addr * NR_PHY_IN_LOG; + + size = nvm_bv_copy(p, bio_iovec(bio)); + for (i = 0; i < NR_HOST_PAGES_IN_FLASH_PAGE; i++) { + unsigned int idx = size - NR_HOST_PAGES_IN_FLASH_PAGE + i; + bio_add_page(issue_bio, &p->block->data[idx], PAGE_SIZE, 0); + } + return issue_bio; +} + +/* Assumes that l_addr is locked with nvm_lock_addr() */ +int nvm_write_bio(struct nvmd *nvmd, + struct bio *bio, int is_gc, + void *private, struct completion *sync, + struct nvm_addr *trans_map, unsigned int complete_bio) +{ + struct nvm_addr *p; + struct bio *issue_bio; + sector_t l_addr = bio->bi_sector / NR_PHY_IN_LOG; + + p = nvmd->type->map_ltop(nvmd, l_addr, is_gc, trans_map, private); + if (!p) { + BUG_ON(is_gc); + nvm_unlock_addr(nvmd, l_addr); + nvmd->type->defer_bio(nvmd, bio, trans_map); + nvm_gc_kick(nvmd); + + return NVM_WRITE_DEFERRED; + } + + issue_bio = nvm_write_init_bio(nvmd, bio, p); + if (complete_bio) + nvm_submit_bio(nvmd, p, l_addr, WRITE, issue_bio, bio, sync, + trans_map); + else + 
nvm_submit_bio(nvmd, p, l_addr, WRITE, issue_bio, NULL, sync, + trans_map); + + return NVM_WRITE_SUCCESS; +} + +void nvm_bio_wait_add(struct bio_list *bl, struct bio *bio, void *p_private) +{ + bio_list_add(bl, bio); +} + +/* remember to lock l_addr before calling nvm_submit_bio */ +void nvm_submit_bio(struct nvmd *nvmd, struct nvm_addr *p, sector_t l_addr, + int rw, struct bio *bio, + struct bio *orig_bio, + struct completion *sync, + struct nvm_addr *trans_map) +{ + struct nvm_block *block = p->block; + struct nvm_ap *ap = block_to_ap(nvmd, block); + struct nvm_pool *pool = ap->pool; + struct per_bio_data *pb; + + pb = alloc_init_pbd(nvmd, bio); + pb->ap = ap; + pb->addr = p; + pb->l_addr = l_addr; + pb->event = sync; + pb->orig_bio = orig_bio; + pb->trans_map = trans_map; + + /* is set prematurely because we need it if bio is defered */ + bio->bi_rw |= rw; + if (sync) + bio->bi_rw |= REQ_SYNC; + + if (rw == WRITE) + bio->bi_end_io = nvm_end_write_bio; + else + bio->bi_end_io = nvm_end_read_bio; + + /* We allow counting to be semi-accurate as theres + * no lock for accounting. */ + ap->io_accesses[bio_data_dir(bio)]++; + + if (nvmd->config.flags & NVM_OPT_POOL_SERIALIZE) { + spin_lock(&pool->waiting_lock); + nvmd->type->bio_wait_add(&pool->waiting_bios, bio, p->private); + + if (atomic_inc_return(&pool->is_active) != 1) { + atomic_dec(&pool->is_active); + spin_unlock(&pool->waiting_lock); + return; + } + + bio = bio_list_peek(&pool->waiting_bios); + + /* we're not the only bio waiting */ + if (!bio) { + atomic_dec(&pool->is_active); + spin_unlock(&pool->waiting_lock); + return; + } + + /* we're the only bio waiting. queue relevant worker*/ + queue_work(nvmd->kbiod_wq, &pool->waiting_ws); + spin_unlock(&pool->waiting_lock); + return; + } + + submit_bio(bio->bi_rw, bio); +} diff --git a/drivers/md/lightnvm/gc.c b/drivers/md/lightnvm/gc.c new file mode 100644 index 0000000..04294be --- /dev/null +++ b/drivers/md/lightnvm/gc.c @@ -0,0 +1,208 @@ +#include "lightnvm.h" + +/* Run only GC if less than 1/X blocks are free */ +#define GC_LIMIT_INVERSE 10 + +static void queue_pool_gc(struct nvm_pool *pool) +{ + struct nvmd *nvmd = pool->nvmd; + queue_work(nvmd->kbiod_wq, &pool->gc_ws); +} + +void nvm_gc_cb(unsigned long data) +{ + struct nvmd *nvmd = (struct nvmd *)data; + struct nvm_pool *pool; + int i; + + nvm_for_each_pool(nvmd, pool, i) + queue_pool_gc(pool); + + mod_timer(&nvmd->gc_timer, + jiffies + msecs_to_jiffies(nvmd->config.gc_time)); +} + +static void __erase_block(struct nvm_block *block) +{ + /* TODO: Perform device flash erase */ +} + +/* the block with highest number of invalid pages, will be in the beginning + * of the list */ +static struct nvm_block *block_max_invalid(struct nvm_block *a, + struct nvm_block *b) +{ + BUG_ON(!a || !b); + + if (a->nr_invalid_pages == b->nr_invalid_pages) + return a; + + return (a->nr_invalid_pages < b->nr_invalid_pages) ? b : a; +} + +/* linearly find the block with highest number of invalid pages + * requires pool->lock */ +static struct nvm_block *block_prio_find_max(struct nvm_pool *pool) +{ + struct list_head *list = &pool->prio_list; + struct nvm_block *block, *max; + + BUG_ON(list_empty(list)); + + max = list_first_entry(list, struct nvm_block, prio); + list_for_each_entry(block, list, prio) + max = block_max_invalid(max, block); + + return max; +} + +/* Move data away from flash block to be erased. Additionally update the + * l to p and p to l mappings. 
*/ +static void nvm_move_valid_pages(struct nvmd *nvmd, struct nvm_block *block) +{ + struct nvm_addr src; + struct nvm_rev_addr *rev; + struct bio *src_bio; + struct page *page; + int slot; + DECLARE_COMPLETION(sync); + + if (bitmap_full(block->invalid_pages, nvmd->nr_host_pages_in_blk)) + return; + + while ((slot = find_first_zero_bit(block->invalid_pages, + nvmd->nr_host_pages_in_blk)) < + nvmd->nr_host_pages_in_blk) { + /* Perform read */ + src.addr = block_to_addr(block) + slot; + src.block = block; + + BUG_ON(src.addr >= nvmd->nr_pages); + + /* TODO: check for memory failure */ + src_bio = bio_alloc(GFP_NOIO, 1); + src_bio->bi_bdev = nvmd->dev->bdev; + src_bio->bi_sector = src.addr * NR_PHY_IN_LOG; + + page = mempool_alloc(nvmd->page_pool, GFP_NOIO); + + /* TODO: may fail with EXP_PG_SIZE > PAGE_SIZE */ + bio_add_page(src_bio, page, EXPOSED_PAGE_SIZE, 0); + + /* We take the reverse lock here, and make sure that we only + * release it when we have locked its logical address. If + * another write on the same logical address is + * occuring, we just let it stall the pipeline. + * + * We do this for both the read and write. Fixing it after each + * IO. + */ + spin_lock(&nvmd->rev_lock); + /* We use the physical address to go to the logical page addr, + * and then update its mapping to its new place. */ + rev = &nvmd->rev_trans_map[src.addr]; + + /* already updated by previous regular write */ + if (rev->addr == LTOP_POISON) { + spin_unlock(&nvmd->rev_lock); + goto overwritten; + } + + /* unlocked by nvm_submit_bio nvm_endio */ + __nvm_lock_addr(nvmd, rev->addr, 1); + spin_unlock(&nvmd->rev_lock); + + init_completion(&sync); + nvm_submit_bio(nvmd, &src, rev->addr, READ, src_bio, NULL, + &sync, rev->trans_map); + wait_for_completion(&sync); + + /* ok, now fix the write and make sure that it haven't been + * moved in the meantime. 
*/ + spin_lock(&nvmd->rev_lock); + + /* already updated by previous regular write */ + if (rev->addr == LTOP_POISON) { + spin_unlock(&nvmd->rev_lock); + goto overwritten; + } + + src_bio->bi_sector = rev->addr * NR_PHY_IN_LOG; + + /* again, unlocked by nvm_endio */ + __nvm_lock_addr(nvmd, rev->addr, 1); + spin_unlock(&nvmd->rev_lock); + + init_completion(&sync); + nvm_write_bio(nvmd, src_bio, 1, NULL, &sync, + rev->trans_map, 1); + wait_for_completion(&sync); + +overwritten: + bio_put(src_bio); + mempool_free(page, nvmd->page_pool); + } + WARN_ON(!bitmap_full(block->invalid_pages, nvmd->nr_host_pages_in_blk)); +} + +void nvm_gc_collect(struct work_struct *work) +{ + struct nvm_pool *pool = container_of(work, struct nvm_pool, gc_ws); + struct nvmd *nvmd = pool->nvmd; + struct nvm_block *block; + unsigned int nr_blocks_need; + + nr_blocks_need = pool->nr_blocks / 10; + + if (nr_blocks_need < nvmd->nr_aps) + nr_blocks_need = nvmd->nr_aps; + + spin_lock(&pool->lock); + while (nr_blocks_need > pool->nr_free_blocks && + !list_empty(&pool->prio_list)) { + block = block_prio_find_max(pool); + + if (!block->nr_invalid_pages) { + spin_unlock(&pool->lock); + show_pool(pool); + spin_lock(&pool->lock); + DMERR("No invalid pages\n"); + break; + } + + list_del_init(&block->prio); + + BUG_ON(!block_is_full(block)); + BUG_ON(atomic_inc_return(&block->gc_running) != 1); + + queue_work(nvmd->kgc_wq, &block->ws_gc); + + nr_blocks_need--; + } + spin_unlock(&pool->lock); + nvmd->next_collect_pool++; + + queue_work(nvmd->kbiod_wq, &nvmd->deferred_ws); +} + +void nvm_gc_block(struct work_struct *work) +{ + struct nvm_block *block = container_of(work, struct nvm_block, ws_gc); + struct nvmd *nvmd = block->pool->nvmd; + + /* TODO: move outside lock to allow multiple pages + * in parallel to be erased. */ + nvm_move_valid_pages(nvmd, block); + __erase_block(block); + nvm_pool_put_block(block); +} + +void nvm_gc_kick(struct nvmd *nvmd) +{ + struct nvm_pool *pool; + unsigned int i; + BUG_ON(!nvmd); + + nvm_for_each_pool(nvmd, pool, i) + queue_pool_gc(pool); +} diff --git a/drivers/md/lightnvm/lightnvm.c b/drivers/md/lightnvm/lightnvm.c new file mode 100644 index 0000000..a6d919b --- /dev/null +++ b/drivers/md/lightnvm/lightnvm.c @@ -0,0 +1,589 @@ +/* + * Copyright (C) 2014 Matias Bjørling. + * + * Todo + * + * - Implement fetching of bad pages from flash + * - configurable sector size + * - handle case of in-page bv_offset (currently hidden assumption of offset=0, + * and bv_len spans entire page) + * + * Optimization possibilities + * - Move ap_next_write into a conconcurrency friendly data structure. Could be + * handled by more intelligent map_ltop function. + * - Implement per-cpu nvm_block data structure ownership. Removes need + * for taking lock on block next_write_id function. I.e. page allocation + * becomes nearly lockless, with occasionally movement of blocks on + * nvm_block lists. + */ + +#include "lightnvm.h" + +/* Defaults + * Number of append points per pool. We assume that accesses within a pool is + * serial (NAND flash/PCM/etc.) + */ +#define APS_PER_POOL 1 + +/* If enabled, we delay bios on each ap to run serialized. 
*/ +#define SERIALIZE_POOL_ACCESS 0 + +/* Sleep timings before simulating device specific storage (in us) */ +#define TIMING_READ 25 +#define TIMING_WRITE 500 +#define TIMING_ERASE 1500 + +/* Run GC every X seconds */ +#define GC_TIME 10 + +/* Minimum pages needed within a pool */ +#define MIN_POOL_PAGES 16 + +static struct kmem_cache *_per_bio_cache; +static struct kmem_cache *_addr_cache; + +static int nvm_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg) +{ + struct nvmd *nvmd = ti->private; + + switch (cmd) { + case LIGHTNVM_IOCTL_ID: + return 0xCECECECE; /* TODO: Fetch ID from disk */ + break; + } + + if (nvmd->type->ioctl) + return nvmd->type->ioctl(nvmd, cmd, arg); + + return 0; +} + +static int nvm_map(struct dm_target *ti, struct bio *bio) +{ + struct nvmd *nvmd = ti->private; + int ret = DM_MAPIO_SUBMITTED; + + if (bio->bi_sector / NR_PHY_IN_LOG >= nvmd->nr_pages) { + DMERR("Illegal nvm address: %lu %ld", bio_data_dir(bio), + bio->bi_sector / NR_PHY_IN_LOG); + bio_io_error(bio); + return ret; + }; + + bio->bi_bdev = nvmd->dev->bdev; + + /* limited currently to 4k write IOs */ + if (bio_data_dir(bio) == WRITE) { + if (bio_sectors(bio) != NR_PHY_IN_LOG) { + DMERR("Write sectors size not supported (%u)", + bio_sectors(bio)); + bio_io_error(bio); + return ret; + } + ret = nvmd->type->write_bio(nvmd, bio); + } else { + ret = nvmd->type->read_bio(nvmd, bio); + } + + return ret; +} + +static void nvm_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + struct nvmd *nvmd = ti->private; + struct nvm_ap *ap; + int i, sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("Use table information"); + break; + case STATUSTYPE_TABLE: + nvm_for_each_ap(nvmd, ap, i) { + DMEMIT("Reads: %lu Writes: %lu Delayed: %lu", + ap->io_accesses[0], + ap->io_accesses[1], + ap->io_delayed); + } + break; + } +} + +static int nvm_pool_init(struct nvmd *nvmd, struct dm_target *ti) +{ + struct nvm_pool *pool; + struct nvm_block *block; + struct nvm_ap *ap; + int i, j; + + spin_lock_init(&nvmd->deferred_lock); + spin_lock_init(&nvmd->rev_lock); + INIT_WORK(&nvmd->deferred_ws, nvm_deferred_bio_submit); + bio_list_init(&nvmd->deferred_bios); + + nvmd->pools = kzalloc(sizeof(struct nvm_pool) * nvmd->nr_pools, + GFP_KERNEL); + if (!nvmd->pools) + goto err_pool; + + nvm_for_each_pool(nvmd, pool, i) { + spin_lock_init(&pool->lock); + spin_lock_init(&pool->waiting_lock); + + init_completion(&pool->gc_finished); + + INIT_WORK(&pool->gc_ws, nvm_gc_collect); + INIT_WORK(&pool->waiting_ws, nvm_delayed_bio_submit); + + INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->used_list); + INIT_LIST_HEAD(&pool->prio_list); + + pool->id = i; + pool->nvmd = nvmd; + pool->phy_addr_start = i * nvmd->nr_blks_per_pool; + pool->phy_addr_end = (i + 1) * nvmd->nr_blks_per_pool - 1; + pool->nr_free_blocks = pool->nr_blocks = + pool->phy_addr_end - pool->phy_addr_start + 1; + bio_list_init(&pool->waiting_bios); + atomic_set(&pool->is_active, 0); + + pool->blocks = kzalloc(sizeof(struct nvm_block) * + pool->nr_blocks, GFP_KERNEL); + if (!pool->blocks) + goto err_blocks; + + spin_lock(&pool->lock); + pool_for_each_block(pool, block, j) { + spin_lock_init(&block->lock); + atomic_set(&block->gc_running, 0); + INIT_LIST_HEAD(&block->list); + INIT_LIST_HEAD(&block->prio); + + block->pool = pool; + block->id = (i * nvmd->nr_blks_per_pool) + j; + + list_add_tail(&block->list, &pool->free_list); + INIT_WORK(&block->ws_gc, nvm_gc_block); + } + spin_unlock(&pool->lock); 
+} + + nvmd->nr_aps = nvmd->nr_aps_per_pool * nvmd->nr_pools; + nvmd->aps = kzalloc(sizeof(struct nvm_ap) * nvmd->nr_aps, GFP_KERNEL); + if (!nvmd->aps) + goto err_blocks; + + nvm_for_each_ap(nvmd, ap, i) { + spin_lock_init(&ap->lock); + ap->parent = nvmd; + ap->pool = &nvmd->pools[i / nvmd->nr_aps_per_pool]; + + block = nvm_pool_get_block(ap->pool, 0); + nvm_set_ap_cur(ap, block); + /* Emergency gc block */ + block = nvm_pool_get_block(ap->pool, 1); + ap->gc_cur = block; + + ap->t_read = nvmd->config.t_read; + ap->t_write = nvmd->config.t_write; + ap->t_erase = nvmd->config.t_erase; + } + + /* we make room for each pool context. */ + nvmd->kbiod_wq = alloc_workqueue("knvm-work", WQ_MEM_RECLAIM|WQ_UNBOUND, + nvmd->nr_pools); + if (!nvmd->kbiod_wq) { + DMERR("Couldn't start knvm-work"); + goto err_blocks; + } + + nvmd->kgc_wq = alloc_workqueue("knvm-gc", WQ_MEM_RECLAIM, 1); + if (!nvmd->kgc_wq) { + DMERR("Couldn't start knvm-gc"); + goto err_wq; + } + + return 0; +err_wq: + destroy_workqueue(nvmd->kbiod_wq); +err_blocks: + nvm_for_each_pool(nvmd, pool, i) { + if (!pool->blocks) + break; + kfree(pool->blocks); + } + kfree(nvmd->pools); +err_pool: + ti->error = "Cannot allocate lightnvm data structures"; + return -ENOMEM; +} + +static int nvm_init(struct dm_target *ti, struct nvmd *nvmd) +{ + int i; + unsigned int order; + + nvmd->trans_map = vmalloc(sizeof(struct nvm_addr) * nvmd->nr_pages); + if (!nvmd->trans_map) + return -ENOMEM; + memset(nvmd->trans_map, 0, sizeof(struct nvm_addr) * nvmd->nr_pages); + + nvmd->rev_trans_map = vmalloc(sizeof(struct nvm_rev_addr) + * nvmd->nr_pages); + if (!nvmd->rev_trans_map) + goto err_rev_trans_map; + + for (i = 0; i < nvmd->nr_pages; i++) { + struct nvm_addr *p = &nvmd->trans_map[i]; + struct nvm_rev_addr *r = &nvmd->rev_trans_map[i]; + + p->addr = LTOP_EMPTY; + + r->addr = 0xDEADBEEF; + r->trans_map = NULL; + } + + nvmd->per_bio_pool = mempool_create_slab_pool(16, _per_bio_cache); + if (!nvmd->per_bio_pool) + goto err_dev_lookup; + + nvmd->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); + if (!nvmd->page_pool) + goto err_per_bio_pool; + + nvmd->addr_pool = mempool_create_slab_pool(64, _addr_cache); + if (!nvmd->addr_pool) + goto err_page_pool; + + order = ffs(nvmd->nr_host_pages_in_blk) - 1; + nvmd->block_page_pool = mempool_create_page_pool(nvmd->nr_aps, order); + if (!nvmd->block_page_pool) + goto err_addr_pool; + + if (bdev_physical_block_size(nvmd->dev->bdev) > EXPOSED_PAGE_SIZE) { + ti->error = "bad sector size."; + goto err_block_page_pool; + } + nvmd->sector_size = EXPOSED_PAGE_SIZE; + + /* inflight maintainence */ + percpu_ida_init(&nvmd->free_inflight, NVM_INFLIGHT_TAGS); + + for (i = 0; i < NVM_INFLIGHT_PARTITIONS; i++) { + spin_lock_init(&nvmd->inflight_map[i].lock); + INIT_LIST_HEAD(&nvmd->inflight_map[i].addrs); + } + + /* simple round-robin strategy */ + atomic_set(&nvmd->next_write_ap, -1); + + nvmd->ti = ti; + ti->private = nvmd; + + /* Initialize pools. */ + nvm_pool_init(nvmd, ti); + + if (nvmd->type->init && nvmd->type->init(nvmd)) + goto err_block_page_pool; + + /* FIXME: Clean up pool init on failure. 
*/ + setup_timer(&nvmd->gc_timer, nvm_gc_cb, (unsigned long)nvmd); + mod_timer(&nvmd->gc_timer, jiffies + msecs_to_jiffies(1000)); + + return 0; +err_block_page_pool: + mempool_destroy(nvmd->block_page_pool); +err_addr_pool: + mempool_destroy(nvmd->addr_pool); +err_page_pool: + mempool_destroy(nvmd->page_pool); +err_per_bio_pool: + mempool_destroy(nvmd->per_bio_pool); +err_dev_lookup: + vfree(nvmd->rev_trans_map); +err_rev_trans_map: + vfree(nvmd->trans_map); + return -ENOMEM; +} + +/* + * Accepts an LightNVM-backed block-device. The LightNVM device should run the + * corresponding physical firmware that exports the flash as physical without + * any mapping and garbage collection as it will be taken care of. + */ +static int nvm_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + struct nvmd *nvmd; + unsigned int tmp; + char dummy; + + if (argc < 5) { + ti->error = "Insufficient arguments"; + return -EINVAL; + } + + nvmd = kzalloc(sizeof(*nvmd), GFP_KERNEL); + if (!nvmd) { + ti->error = "No enough memory for data structures"; + return -ENOMEM; + } + + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), + &nvmd->dev)) + goto err_map; + + dm_set_target_max_io_len(ti, NR_PHY_IN_LOG); + + nvmd->type = find_nvm_target_type(argv[1]); + if (!nvmd->type) { + ti->error = "NVM target type doesn't exist"; + goto err_map; + } + + if (sscanf(argv[2], "%u%c", &tmp, &dummy) != 1) { + ti->error = "Cannot read number of pools"; + goto err_map; + } + nvmd->nr_pools = tmp; + + if (sscanf(argv[3], "%u%c", &tmp, &dummy) != 1) { + ti->error = "Cannot read number of blocks within a pool"; + goto err_map; + } + nvmd->nr_blks_per_pool = tmp; + + if (sscanf(argv[4], "%u%c", &tmp, &dummy) != 1) { + ti->error = "Cannot read number of pages within a block"; + goto err_map; + } + nvmd->nr_pages_per_blk = tmp; + + /* Optional */ + nvmd->nr_aps_per_pool = APS_PER_POOL; + if (argc > 5) { + if (sscanf(argv[5], "%u%c", &tmp, &dummy) == 1) { + if (!tmp) { + DMERR("Number of aps set to 1."); + tmp = APS_PER_POOL; + } + nvmd->nr_aps_per_pool = tmp; + } else { + ti->error = "Cannot read number of append points"; + goto err_map; + } + } + + if (argc > 6) { + if (sscanf(argv[6], "%u%c", &tmp, &dummy) == 1) { + nvmd->config.flags |= (tmp << NVM_OPT_MISC_OFFSET); + } else { + ti->error = "Cannot read flags"; + goto err_map; + } + } + + nvmd->config.gc_time = GC_TIME; + if (argc > 7) { + if (sscanf(argv[7], "%u%c", &tmp, &dummy) == 1) { + nvmd->config.gc_time = tmp; + if (nvmd->config.gc_time <= 0) + nvmd->config.gc_time = 1000; + } else { + ti->error = "Cannot read gc timing"; + goto err_map; + } + } + + nvmd->config.t_read = TIMING_READ; + if (argc > 8) { + if (sscanf(argv[8], "%u%c", &tmp, &dummy) == 1) { + nvmd->config.t_read = tmp; + } else { + ti->error = "Cannot read read access timing"; + goto err_map; + } + } + + nvmd->config.t_write = TIMING_WRITE; + if (argc > 9) { + if (sscanf(argv[9], "%u%c", &tmp, &dummy) == 1) { + nvmd->config.t_write = tmp; + } else { + ti->error = "Cannot read write access timing"; + goto err_map; + } + } + + nvmd->config.t_erase = TIMING_ERASE; + if (argc > 10) { + if (sscanf(argv[10], "%u%c", &tmp, &dummy) == 1) { + nvmd->config.t_erase = tmp; + } else { + ti->error = "Cannot read erase access timing"; + goto err_map; + } + } + + nvmd->nr_host_pages_in_blk = NR_HOST_PAGES_IN_FLASH_PAGE + * nvmd->nr_pages_per_blk; + nvmd->nr_pages = nvmd->nr_pools * nvmd->nr_blks_per_pool + * nvmd->nr_host_pages_in_blk; + + /* Invalid pages in block bitmap is preallocated. 
*/ + if (nvmd->nr_host_pages_in_blk > + MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) { + ti->error = "Num pages per block is too high"; + return -EINVAL; + } + + + if (nvm_init(ti, nvmd) < 0) { + ti->error = "Cannot initialize lightnvm structure"; + goto err_map; + } + + DMINFO("Configured with"); + DMINFO("Pools: %u Blocks: %u Pages: %u APs: %u Pool per AP: %u", + nvmd->nr_pools, + nvmd->nr_blks_per_pool, + nvmd->nr_pages_per_blk, + nvmd->nr_aps, + nvmd->nr_aps_per_pool); + DMINFO("Timings: %u/%u/%u", + nvmd->config.t_read, + nvmd->config.t_write, + nvmd->config.t_erase); + DMINFO("Target sector size=%d", nvmd->sector_size); + DMINFO("Disk logical sector size=%d", + bdev_logical_block_size(nvmd->dev->bdev)); + DMINFO("Disk physical sector size=%d", + bdev_physical_block_size(nvmd->dev->bdev)); + DMINFO("Disk flash page size=%d", FLASH_PAGE_SIZE); + DMINFO("Allocated %lu physical pages (%lu KB)", + nvmd->nr_pages, nvmd->nr_pages * nvmd->sector_size / 1024); + + return 0; +err_map: + kfree(nvmd); + return -ENOMEM; +} + +static void nvm_dtr(struct dm_target *ti) +{ + struct nvmd *nvmd = ti->private; + struct nvm_pool *pool; + int i; + + if (nvmd->type->exit) + nvmd->type->exit(nvmd); + + del_timer(&nvmd->gc_timer); + + nvm_for_each_pool(nvmd, pool, i) { + while (bio_list_peek(&pool->waiting_bios)) + flush_scheduled_work(); + } + + /* TODO: remember outstanding block refs, waiting to be erased... */ + nvm_for_each_pool(nvmd, pool, i) + kfree(pool->blocks); + + kfree(nvmd->pools); + kfree(nvmd->aps); + + vfree(nvmd->trans_map); + vfree(nvmd->rev_trans_map); + + destroy_workqueue(nvmd->kbiod_wq); + destroy_workqueue(nvmd->kgc_wq); + + mempool_destroy(nvmd->per_bio_pool); + mempool_destroy(nvmd->page_pool); + mempool_destroy(nvmd->addr_pool); + + percpu_ida_destroy(&nvmd->free_inflight); + + dm_put_device(ti, nvmd->dev); + + kfree(nvmd); + + DMINFO("successfully unloaded"); +} + +static int nvm_none_write_bio(struct nvmd *nvmd, struct bio *bio) +{ + sector_t l_addr = bio->bi_sector / NR_PHY_IN_LOG; + nvm_lock_addr(nvmd, l_addr); + + nvm_write_bio(nvmd, bio, 0, NULL, NULL, nvmd->trans_map, 1); + return DM_MAPIO_SUBMITTED; +} + +/* none target type, round robin, page-based FTL, and cost-based GC */ +static struct nvm_target_type nvm_target_none = { + .name = "none", + .version = {1, 0, 0}, + .lookup_ltop = nvm_lookup_ltop, + .map_ltop = nvm_map_ltop_rr, + .write_bio = nvm_none_write_bio, + .read_bio = nvm_read_bio, + .defer_bio = nvm_defer_bio, + .bio_wait_add = nvm_bio_wait_add, +}; + +static struct target_type lightnvm_target = { + .name = "lightnvm", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = nvm_ctr, + .dtr = nvm_dtr, + .map = nvm_map, + .ioctl = nvm_ioctl, + .status = nvm_status, +}; + +static int __init dm_lightnvm_init(void) +{ + int ret = -ENOMEM; + + _per_bio_cache = kmem_cache_create("lightnvm_per_bio_cache", + sizeof(struct per_bio_data), 0, 0, NULL); + if (!_per_bio_cache) + return ret; + + _addr_cache = kmem_cache_create("lightnvm_addr_cache", + sizeof(struct nvm_addr), 0, 0, NULL); + if (!_addr_cache) + goto err_pbc; + + nvm_register_target(&nvm_target_none); + + ret = dm_register_target(&lightnvm_target); + if (ret < 0) { + DMERR("register failed %d", ret); + goto err_adp; + } + + return ret; +err_adp: + kmem_cache_destroy(_addr_cache); +err_pbc: + kmem_cache_destroy(_per_bio_cache); + return ret; +} + +static void __exit dm_lightnvm_exit(void) +{ + dm_unregister_target(&lightnvm_target); + kmem_cache_destroy(_per_bio_cache); + kmem_cache_destroy(_addr_cache); +} + 
+module_init(dm_lightnvm_init); +module_exit(dm_lightnvm_exit); + +MODULE_DESCRIPTION(DM_NAME " target"); +MODULE_AUTHOR("Matias Bjorling <m@xxxxxxxxxxx>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/lightnvm/lightnvm.h b/drivers/md/lightnvm/lightnvm.h new file mode 100644 index 0000000..1f6d775 --- /dev/null +++ b/drivers/md/lightnvm/lightnvm.h @@ -0,0 +1,592 @@ +/* + * Copyright (C) 2014 Matias Bj�g. + * + * This file is released under the GPL. + */ + +#ifndef DM_LIGHTNVM_H_ +#define DM_LIGHTNVM_H_ + +#include <linux/device-mapper.h> +#include <linux/dm-io.h> +#include <linux/dm-kcopyd.h> +#include <linux/blkdev.h> +#include <linux/list.h> +#include <linux/list_sort.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/atomic.h> +#include <linux/delay.h> +#include <linux/time.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/mempool.h> +#include <linux/kref.h> +#include <linux/completion.h> +#include <linux/hashtable.h> +#include <linux/percpu_ida.h> + +#define DM_MSG_PREFIX "lightnvm" +#define LTOP_EMPTY -1 +#define LTOP_POISON 0xD3ADB33F + +#define LIGHTNVM_IOC_MAGIC 'O' +#define LIGHTNVM_IOCTL_ID _IO(LIGHTNVM_IOC_MAGIC, 0x40) + +/* + * For now we hardcode some of the configuration for the LightNVM device that we + * have. In the future this should be made configurable. + * + * Configuration: + * EXPOSED_PAGE_SIZE - the page size of which we tell the layers above the + * driver to issue. This usually is 512 bytes for 4K for simplivity. + * FLASH_PAGE_SIZE - the flash size of the individual flash pages. These should + * match the hardware flash chips. Currently only the same page size as + * EXPOSED_PAGE_SIZE is supported. + * + */ + +#define EXPOSED_PAGE_SIZE 4096 +#define FLASH_PAGE_SIZE EXPOSED_PAGE_SIZE + +/* Useful shorthands */ +#define NR_HOST_PAGES_IN_FLASH_PAGE (FLASH_PAGE_SIZE / EXPOSED_PAGE_SIZE) +/* We currently assume that we the lightnvm device is accepting data in 512 + * bytes chunks. This should be set to the smallest command size available for a + * given device. + */ +#define NR_PHY_IN_LOG (EXPOSED_PAGE_SIZE / 512) + +/* We partition the namespace of translation map into these pieces for tracking + * in-flight addresses. */ +#define NVM_INFLIGHT_PARTITIONS 8 +#define NVM_INFLIGHT_TAGS 256 + +#define NVM_WRITE_SUCCESS 0 +#define NVM_WRITE_DEFERRED 1 +#define NVM_WRITE_GC_ABORT 2 + +#define NVM_OPT_MISC_OFFSET 15 + +enum ltop_flags { + /* Update primary mapping (and init secondary mapping as a result) */ + MAP_PRIMARY = 1 << 0, + /* Update only shaddow mapping */ + MAP_SHADOW = 1 << 1, + /* Update only the relevant mapping (primary/shaddow) */ + MAP_SINGLE = 1 << 2, +}; + +enum target_flags { + /* No hints applied */ + NVM_OPT_ENGINE_NONE = 0 << 0, + /* Swap aware hints. Detected from block request type */ + NVM_OPT_ENGINE_SWAP = 1 << 0, + /* IOCTL aware hints. Applications may submit direct hints */ + NVM_OPT_ENGINE_IOCTL = 1 << 1, + /* Latency aware hints. Detected from file type or directly from app */ + NVM_OPT_ENGINE_LATENCY = 1 << 2, + /* Pack aware hints. Detected from file type or directly from app */ + NVM_OPT_ENGINE_PACK = 1 << 3, + + /* Control accesses to append points in the host. 
Enable this for + * devices that doesn't have an internal queue that only lets one + * command run at a time within an append point */ + NVM_OPT_POOL_SERIALIZE = 1 << NVM_OPT_MISC_OFFSET, + /* Use fast/slow page access pattern */ + NVM_OPT_FAST_SLOW_PAGES = 1 << (NVM_OPT_MISC_OFFSET+1), + /* Disable dev waits */ + NVM_OPT_NO_WAITS = 1 << (NVM_OPT_MISC_OFFSET+2), +}; + +/* Pool descriptions */ +struct nvm_block { + struct { + spinlock_t lock; + /* points to the next writable flash page within a block */ + unsigned int next_page; + /* if a flash page can have multiple host pages, + fill up the flash page before going to the next + writable flash page */ + unsigned char next_offset; + /* number of pages that are invalid, wrt host page size */ + unsigned int nr_invalid_pages; +#define MAX_INVALID_PAGES_STORAGE 8 + /* Bitmap for invalid page intries */ + unsigned long invalid_pages[MAX_INVALID_PAGES_STORAGE]; + } ____cacheline_aligned_in_smp; + + unsigned int id; + struct nvm_pool *pool; + struct nvm_ap *ap; + + /* Management and GC structures */ + struct list_head list; + struct list_head prio; + + /* Persistent data structures */ + struct page *data; + atomic_t data_size; /* data pages inserted into data variable */ + atomic_t data_cmnt_size; /* data pages committed to stable storage */ + + /* Block state handling */ + atomic_t gc_running; + struct work_struct ws_gc; +}; + +/* Logical to physical mapping */ +struct nvm_addr { + sector_t addr; + struct nvm_block *block; + void *private; +}; + +/* Physical to logical mapping */ +struct nvm_rev_addr { + sector_t addr; + struct nvm_addr *trans_map; +}; + +struct nvm_pool { + /* Pool block lists */ + struct { + spinlock_t lock; + } ____cacheline_aligned_in_smp; + + struct list_head used_list; /* In-use blocks */ + struct list_head free_list; /* Not used blocks i.e. released + * and ready for use */ + struct list_head prio_list; /* Blocks that may be GC'ed. */ + + unsigned int id; + /* References the physical start block */ + unsigned long phy_addr_start; + /* References the physical end block */ + unsigned int phy_addr_end; + + unsigned int nr_blocks; /* end_block - start_block. */ + unsigned int nr_free_blocks; /* Number of unused blocks */ + + struct nvm_block *blocks; + struct nvmd *nvmd; + + /* Postpone issuing I/O if append point is active */ + atomic_t is_active; + + spinlock_t waiting_lock; + struct work_struct waiting_ws; + struct bio_list waiting_bios; + + struct bio *cur_bio; + + unsigned int gc_running; + struct completion gc_finished; + struct work_struct gc_ws; + + void *private; +}; + +/* + * nvm_ap. ap is an append point. A pool can have 1..X append points attached. + * An append point has a current block, that it writes to, and when its full, + * it requests a new block, of which it continues its writes. + * + * one ap per pool may be reserved for pack-hints related writes. + * In those that are not not, private is NULL. 
+ */ +struct nvm_ap { + spinlock_t lock; + struct nvmd *parent; + struct nvm_pool *pool; + struct nvm_block *cur; + struct nvm_block *gc_cur; + + /* Timings used for end_io waiting */ + unsigned long t_read; + unsigned long t_write; + unsigned long t_erase; + + unsigned long io_delayed; + unsigned long io_accesses[2]; + + /* Private field for submodules */ + void *private; +}; + +struct nvm_config { + unsigned long flags; + + unsigned int gc_time; /* GC every X microseconds */ + + unsigned int t_read; + unsigned int t_write; + unsigned int t_erase; +}; + +struct nvm_inflight_addr { + struct list_head list; + sector_t l_addr; + int tag; +}; + +struct nvm_inflight { + spinlock_t lock; + struct list_head addrs; +}; + +struct nvmd; +struct per_bio_data; + +/* overridable functionality */ +typedef struct nvm_addr *(*nvm_map_ltop_fn)(struct nvmd *, sector_t, int, + struct nvm_addr *, void *); +typedef struct nvm_addr *(*nvm_lookup_ltop_fn)(struct nvmd *, sector_t); +typedef int (*nvm_write_bio_fn)(struct nvmd *, struct bio *); +typedef int (*nvm_read_bio_fn)(struct nvmd *, struct bio *); +typedef void (*nvm_alloc_phys_addr_fn)(struct nvmd *, struct nvm_block *); +typedef void (*nvm_defer_bio_fn)(struct nvmd *, struct bio *, void *); +typedef void (*nvm_bio_wait_add_fn)(struct bio_list *, struct bio *, void *); +typedef int (*nvm_ioctl_fn)(struct nvmd *, + unsigned int cmd, unsigned long arg); +typedef int (*nvm_init_fn)(struct nvmd *); +typedef void (*nvm_exit_fn)(struct nvmd *); +typedef void (*nvm_endio_fn)(struct nvmd *, struct bio *, + struct per_bio_data *, unsigned long *delay); + +typedef int (*nvm_page_special_fn)(struct nvmd *, unsigned int); + +struct nvm_target_type { + const char *name; + unsigned version[3]; + nvm_map_ltop_fn map_ltop; + + /* lookup functions */ + nvm_lookup_ltop_fn lookup_ltop; + + /* handling of bios */ + nvm_write_bio_fn write_bio; + nvm_read_bio_fn read_bio; + nvm_ioctl_fn ioctl; + nvm_endio_fn endio; + + /* engine specific overrides */ + nvm_alloc_phys_addr_fn alloc_phys_addr; + nvm_defer_bio_fn defer_bio; + nvm_bio_wait_add_fn bio_wait_add; + + /* module specific init/teardown */ + nvm_init_fn init; + nvm_exit_fn exit; + + /* For lightnvm internal use */ + struct list_head list; +}; + +/* Main structure */ +struct nvmd { + struct dm_dev *dev; + struct dm_target *ti; + uint32_t sector_size; + + struct nvm_target_type *type; + + /* Simple translation map of logical addresses to physical addresses. + * The logical addresses is known by the host system, while the physical + * addresses are used when writing to the disk block device. */ + struct nvm_addr *trans_map; + /* also store a reverse map for garbage collection */ + struct nvm_rev_addr *rev_trans_map; + spinlock_t rev_lock; + /* Usually instantiated to the number of available parallel channels + * within the hardware device. i.e. a controller with 4 flash channels, + * would have 4 pools. + * + * We assume that the device exposes its channels as a linear address + * space. A pool therefore have a phy_addr_start and phy_addr_end that + * denotes the start and end. This abstraction is used to let the + * lightnvm (or any other device) expose its read/write/erase interface + * and be administrated by the host system. 
+ */ + struct nvm_pool *pools; + + /* Append points */ + struct nvm_ap *aps; + + mempool_t *per_bio_pool; + mempool_t *addr_pool; + mempool_t *page_pool; + mempool_t *block_page_pool; + + /* Frequently used config variables */ + int nr_pools; + int nr_blks_per_pool; + int nr_pages_per_blk; + int nr_aps; + int nr_aps_per_pool; + + /* Calculated values */ + unsigned int nr_host_pages_in_blk; + unsigned long nr_pages; + + unsigned int next_collect_pool; + + /* Write strategy variables. Move these into each for structure for each + * strategy */ + atomic_t next_write_ap; /* Whenever a page is written, this is updated + * to point to the next write append point */ + struct workqueue_struct *kbiod_wq; + struct workqueue_struct *kgc_wq; + + spinlock_t deferred_lock; + struct work_struct deferred_ws; + struct bio_list deferred_bios; + + struct timer_list gc_timer; + + /* in-flight data lookup, lookup by logical address. Remember the + * overhead of cachelines being used. Keep it low for better cache + * utilization. */ + struct percpu_ida free_inflight; + struct nvm_inflight inflight_map[NVM_INFLIGHT_PARTITIONS]; + struct nvm_inflight_addr inflight_addrs[NVM_INFLIGHT_TAGS]; + + /* nvm module specific data */ + void *private; + + /* User configuration */ + struct nvm_config config; +}; + +struct per_bio_data { + struct nvm_ap *ap; + struct nvm_addr *addr; + struct timespec start_tv; + sector_t l_addr; + + /* Hook up for our overwritten bio fields */ + bio_end_io_t *bi_end_io; + void *bi_private; + struct completion *event; + struct bio *orig_bio; + unsigned int sync; + unsigned int ref_put; + struct nvm_addr *trans_map; +}; + +/* reg.c */ +int nvm_register_target(struct nvm_target_type *t); +void nvm_unregister_target(struct nvm_target_type *t); +struct nvm_target_type *find_nvm_target_type(const char *name); + +/* core.c */ +/* Helpers */ +struct nvm_block *nvm_pool_get_block(struct nvm_pool *, int is_gc); +void invalidate_block_page(struct nvmd *, struct nvm_addr *); +void nvm_set_ap_cur(struct nvm_ap *, struct nvm_block *); +void nvm_defer_bio(struct nvmd *nvmd, struct bio *bio, void *private); +void nvm_bio_wait_add(struct bio_list *bl, struct bio *bio, void *p_private); +sector_t nvm_alloc_phys_addr(struct nvm_block *); +sector_t nvm_alloc_phys_addr_special(struct nvm_block *, nvm_page_special_fn); + +/* Naive implementations */ +void nvm_delayed_bio_submit(struct work_struct *); +void nvm_deferred_bio_submit(struct work_struct *); +void nvm_gc_block(struct work_struct *); + +/* Allocation of physical addresses from block + * when increasing responsibility. */ +struct nvm_addr *nvm_alloc_addr_from_ap(struct nvm_ap *, int is_gc); +struct nvm_addr *nvm_map_ltop_rr(struct nvmd *, sector_t l_addr, int is_gc, + struct nvm_addr *trans_map, void *private); + +/* Gets an address from nvm->trans_map and take a ref count on the blocks usage. 
+ * Remember to put later */ +struct nvm_addr *nvm_lookup_ltop_map(struct nvmd *, sector_t l_addr, + struct nvm_addr *l2p_map, void *private); +struct nvm_addr *nvm_lookup_ltop(struct nvmd *, sector_t l_addr); + +/* I/O bio related */ +struct nvm_addr *nvm_get_trans_map(struct nvmd *nvmd, void *private); +struct bio *nvm_write_init_bio(struct nvmd *, struct bio *, struct nvm_addr *); +int nvm_bv_copy(struct nvm_addr *p, struct bio_vec *bv); +/* FIXME: Shorten */ +int nvm_write_bio(struct nvmd *, struct bio *bio, int is_gc, void *private, + struct completion *sync, struct nvm_addr *trans_map, + unsigned int complete_bio); +int nvm_read_bio(struct nvmd *, struct bio *bio); +/* FIXME: Shorten */ +void nvm_update_map(struct nvmd *nvmd, sector_t l_addr, struct nvm_addr *p, + int is_gc, struct nvm_addr *trans_map); +/* FIXME: Shorten */ +void nvm_submit_bio(struct nvmd *, struct nvm_addr *, sector_t, int rw, + struct bio *, struct bio *orig_bio, struct completion *sync, + struct nvm_addr *trans_map); +void nvm_defer_write_bio(struct nvmd *nvmd, struct bio *bio, void *private); + +/* NVM device related */ +void nvm_block_release(struct kref *); + +/* Block maintanence */ +void nvm_pool_put_block(struct nvm_block *); +void nvm_reset_block(struct nvm_block *); + +/* gc.c */ +void nvm_block_erase(struct kref *); +void nvm_gc_cb(unsigned long data); +void nvm_gc_collect(struct work_struct *work); +void nvm_gc_kick(struct nvmd *nvmd); + +#define nvm_for_each_pool(n, pool, i) \ + for ((i) = 0, pool = &(n)->pools[0]; \ + (i) < (n)->nr_pools; (i)++, pool = &(n)->pools[(i)]) + +#define nvm_for_each_ap(n, ap, i) \ + for ((i) = 0, ap = &(n)->aps[0]; \ + (i) < (n)->nr_aps; (i)++, ap = &(n)->aps[(i)]) + +#define pool_for_each_block(p, b, i) \ + for ((i) = 0, b = &(p)->blocks[0]; \ + (i) < (p)->nr_blocks; (i)++, b = &(p)->blocks[(i)]) + +static inline struct nvm_ap *get_next_ap(struct nvmd *n) +{ + return &n->aps[atomic_inc_return(&n->next_write_ap) % n->nr_aps]; +} + +static inline int block_is_full(struct nvm_block *block) +{ + struct nvmd *nvmd = block->pool->nvmd; + return (block->next_page * NR_HOST_PAGES_IN_FLASH_PAGE) + + block->next_offset == nvmd->nr_host_pages_in_blk; +} + +static inline sector_t block_to_addr(struct nvm_block *block) +{ + struct nvmd *nvmd; + BUG_ON(!block); + nvmd = block->pool->nvmd; + return block->id * nvmd->nr_host_pages_in_blk; +} + +static inline struct nvm_pool *paddr_to_pool(struct nvmd *n, sector_t p_addr) +{ + return &n->pools[p_addr / (n->nr_pages / n->nr_pools)]; +} + +static inline struct nvm_ap *block_to_ap(struct nvmd *n, struct nvm_block *b) +{ + unsigned int ap_idx, div, mod; + + div = b->id / n->nr_blks_per_pool; + mod = b->id % n->nr_blks_per_pool; + ap_idx = div + (mod / (n->nr_blks_per_pool / n->nr_aps_per_pool)); + + return &n->aps[ap_idx]; +} + +static inline int physical_to_slot(struct nvmd *n, sector_t phys) +{ + return (phys % (n->nr_pages_per_blk * NR_HOST_PAGES_IN_FLASH_PAGE)) / + NR_HOST_PAGES_IN_FLASH_PAGE; +} + +static inline struct per_bio_data *get_per_bio_data(struct bio *bio) +{ + return bio->bi_private; +} + +static inline struct nvm_inflight *nvm_hash_addr_to_inflight(struct nvmd *nvmd, + sector_t l_addr) +{ + return &nvmd->inflight_map[l_addr % NVM_INFLIGHT_PARTITIONS]; +} + +static inline void __nvm_lock_addr(struct nvmd *nvmd, sector_t l_addr, int spin) +{ + struct nvm_inflight *inflight = nvm_hash_addr_to_inflight(nvmd, l_addr); + struct nvm_inflight_addr *a; + int tag = percpu_ida_alloc(&nvmd->free_inflight, __GFP_WAIT); + + BUG_ON(l_addr 
>= nvmd->nr_pages); + +retry: + spin_lock(&inflight->lock); + + list_for_each_entry(a, &inflight->addrs, list) { + if (a->l_addr == l_addr) { + spin_unlock(&inflight->lock); + /* TODO: give up control and come back. I haven't found + * a good way to complete the work, when the data the + * complete structure is being reused */ + if (!spin) + schedule(); + goto retry; + } + } + + a = &nvmd->inflight_addrs[tag]; + + a->l_addr = l_addr; + a->tag = tag; + + list_add_tail(&a->list, &inflight->addrs); + spin_unlock(&inflight->lock); +} + +static inline void nvm_lock_addr(struct nvmd *nvmd, sector_t l_addr) +{ + __nvm_lock_addr(nvmd, l_addr, 0); +} + +static inline void nvm_unlock_addr(struct nvmd *nvmd, sector_t l_addr) +{ + struct nvm_inflight *inflight = + nvm_hash_addr_to_inflight(nvmd, l_addr); + struct nvm_inflight_addr *a = NULL; + + spin_lock(&inflight->lock); + + BUG_ON(list_empty(&inflight->addrs)); + + list_for_each_entry(a, &inflight->addrs, list) + if (a->l_addr == l_addr) + break; + + BUG_ON(!a && a->l_addr != l_addr); + + a->l_addr = LTOP_POISON; + + list_del_init(&a->list); + spin_unlock(&inflight->lock); + percpu_ida_free(&nvmd->free_inflight, a->tag); +} + +static inline void show_pool(struct nvm_pool *pool) +{ + struct list_head *head, *cur; + unsigned int free_cnt = 0, used_cnt = 0, prio_cnt = 0; + + spin_lock(&pool->lock); + list_for_each_safe(head, cur, &pool->free_list) + free_cnt++; + list_for_each_safe(head, cur, &pool->used_list) + used_cnt++; + list_for_each_safe(head, cur, &pool->prio_list) + prio_cnt++; + spin_unlock(&pool->lock); + + DMERR("P-%d F:%u U:%u P:%u", pool->id, free_cnt, used_cnt, prio_cnt); +} + +static inline void show_all_pools(struct nvmd *nvmd) +{ + struct nvm_pool *pool; + unsigned int i; + + nvm_for_each_pool(nvmd, pool, i) + show_pool(pool); +} + +#endif /* DM_LIGHTNVM_H_ */ + diff --git a/drivers/md/lightnvm/reg.c b/drivers/md/lightnvm/reg.c new file mode 100644 index 0000000..ce39da0 --- /dev/null +++ b/drivers/md/lightnvm/reg.c @@ -0,0 +1,41 @@ +#include <linux/list.h> +#include <linux/sem.h> +#include "lightnvm.h" + +static LIST_HEAD(_targets); +static DECLARE_RWSEM(_lock); + +inline struct nvm_target_type *find_nvm_target_type(const char *name) +{ + struct nvm_target_type *t; + + list_for_each_entry(t, &_targets, list) + if (!strcmp(name, t->name)) + return t; + + return NULL; +} + +int nvm_register_target(struct nvm_target_type *t) +{ + int ret = 0; + + down_write(&_lock); + if (find_nvm_target_type(t->name)) + ret = -EEXIST; + else + list_add(&t->list, &_targets); + up_write(&_lock); + return ret; +} + +void nvm_unregister_target(struct nvm_target_type *t) +{ + if (!t) + return; + + down_write(&_lock); + list_del(&t->list); + up_write(&_lock); +} + -- 1.8.3.2 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html
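
Usage sketch (not part of the patch): for anyone who wants to try the target,
a minimal dmsetup invocation, assuming a hypothetical backing device /dev/sdb
of at least 512 MB and the "none" target type registered by this patch. The
argument order follows what nvm_ctr() parses: device, target type, number of
pools, blocks per pool and pages per block, with optional append points per
pool, misc flags, GC interval and simulated read/write/erase timings after
that.

  # 4 pools x 256 blocks/pool x 128 pages/block x 4 KB pages = 131072 pages,
  # exposed as 1048576 512-byte sectors (512 MB). Device name and geometry
  # are made up for illustration only.
  echo "0 1048576 lightnvm /dev/sdb none 4 256 128" | dmsetup create lnvm0

Leaving out the optional arguments falls back to the in-code defaults
(APS_PER_POOL, GC_TIME and the TIMING_READ/WRITE/ERASE values in lightnvm.c).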