On Fri, Apr 20, 2018 at 9:38 PM, Javier Gonzalez <javier@xxxxxxxxxxxx> wrote: >> On 19 Apr 2018, at 09.39, Hans Holmberg <hans.ml.holmberg@xxxxxxxxxxxxx> wrote: >> >> From: Hans Holmberg <hans.holmberg@xxxxxxxxxxxx> >> >> The write error recovery path is incomplete, so rework >> the write error recovery handling to do resubmits directly >> from the write buffer. >> >> When a write error occurs, the remaining sectors in the chunk are >> mapped out and invalidated and the request inserted in a resubmit list. >> >> The writer thread checks if there are any requests to resubmit, >> scans and invalidates any lbas that have been overwritten by later >> writes and resubmits the failed entries. >> >> Signed-off-by: Hans Holmberg <hans.holmberg@xxxxxxxxxxxx> >> --- >> drivers/lightnvm/pblk-init.c | 2 + >> drivers/lightnvm/pblk-recovery.c | 91 --------------- >> drivers/lightnvm/pblk-write.c | 241 ++++++++++++++++++++++++++++----------- >> drivers/lightnvm/pblk.h | 8 +- >> 4 files changed, 180 insertions(+), 162 deletions(-) >> >> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c >> index bfc488d..6f06727 100644 >> --- a/drivers/lightnvm/pblk-init.c >> +++ b/drivers/lightnvm/pblk-init.c >> @@ -426,6 +426,7 @@ static int pblk_core_init(struct pblk *pblk) >> goto free_r_end_wq; >> >> INIT_LIST_HEAD(&pblk->compl_list); >> + INIT_LIST_HEAD(&pblk->resubmit_list); >> >> return 0; >> >> @@ -1185,6 +1186,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, >> pblk->state = PBLK_STATE_RUNNING; >> pblk->gc.gc_enabled = 0; >> >> + spin_lock_init(&pblk->resubmit_lock); >> spin_lock_init(&pblk->trans_lock); >> spin_lock_init(&pblk->lock); >> >> diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c >> index 9cb6d5d..5983428 100644 >> --- a/drivers/lightnvm/pblk-recovery.c >> +++ b/drivers/lightnvm/pblk-recovery.c >> @@ -16,97 +16,6 @@ >> >> #include "pblk.h" >> >> -void pblk_submit_rec(struct work_struct *work) >> -{ >> - struct pblk_rec_ctx *recovery = >> - container_of(work, struct pblk_rec_ctx, ws_rec); >> - struct pblk *pblk = recovery->pblk; >> - struct nvm_rq *rqd = recovery->rqd; >> - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); >> - struct bio *bio; >> - unsigned int nr_rec_secs; >> - unsigned int pgs_read; >> - int ret; >> - >> - nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status, >> - NVM_MAX_VLBA); >> - >> - bio = bio_alloc(GFP_KERNEL, nr_rec_secs); >> - >> - bio->bi_iter.bi_sector = 0; >> - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); >> - rqd->bio = bio; >> - rqd->nr_ppas = nr_rec_secs; >> - >> - pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed, >> - nr_rec_secs); > > Please, remove functions that are not longer used. Doing a pass on the > rest of the removed functions would be a good idea. Yes, thanks. > >> - if (pgs_read != nr_rec_secs) { >> - pr_err("pblk: could not read recovery entries\n"); >> - goto err; >> - } >> - >> - if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) { > > Same here I'll clean it up. >> - >> -#ifdef CONFIG_NVM_DEBUG >> - atomic_long_add(nr_rec_secs, &pblk->recov_writes); >> -#endif > > Can you add this debug counter to the new path? I see you added other > counters, if it is a rename, can you put it on a separate patch? Thanks for catching the lost recov counter update, what other counters are you referring to? > >> - >> - ret = pblk_submit_io(pblk, rqd); >> - if (ret) { >> - pr_err("pblk: I/O submission failed: %d\n", ret); >> - goto err; >> - } >> - >> - mempool_free(recovery, pblk->rec_pool); >> - return; >> - >> -err: >> - bio_put(bio); >> - pblk_free_rqd(pblk, rqd, PBLK_WRITE); >> -} >> - >> -int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, >> - struct pblk_rec_ctx *recovery, u64 *comp_bits, >> - unsigned int comp) >> -{ >> - struct nvm_rq *rec_rqd; >> - struct pblk_c_ctx *rec_ctx; >> - int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded; >> - >> - rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); >> - rec_ctx = nvm_rq_to_pdu(rec_rqd); >> - >> - /* Copy completion bitmap, but exclude the first X completed entries */ >> - bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status, >> - (unsigned long int *)comp_bits, >> - comp, NVM_MAX_VLBA); >> - >> - /* Save the context for the entries that need to be re-written and >> - * update current context with the completed entries. >> - */ >> - rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp); >> - if (comp >= c_ctx->nr_valid) { >> - rec_ctx->nr_valid = 0; >> - rec_ctx->nr_padded = nr_entries - comp; >> - >> - c_ctx->nr_padded = comp - c_ctx->nr_valid; >> - } else { >> - rec_ctx->nr_valid = c_ctx->nr_valid - comp; >> - rec_ctx->nr_padded = c_ctx->nr_padded; >> - >> - c_ctx->nr_valid = comp; >> - c_ctx->nr_padded = 0; >> - } >> - >> - recovery->rqd = rec_rqd; >> - recovery->pblk = pblk; >> - >> - return 0; >> -} >> - >> int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf) >> { >> u32 crc; >> diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c >> index 3e6f1eb..ab45157 100644 >> --- a/drivers/lightnvm/pblk-write.c >> +++ b/drivers/lightnvm/pblk-write.c >> @@ -103,68 +103,147 @@ static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd, >> pblk_rb_sync_end(&pblk->rwb, &flags); >> } >> >> -/* When a write fails, we are not sure whether the block has grown bad or a page >> - * range is more susceptible to write errors. If a high number of pages fail, we >> - * assume that the block is bad and we mark it accordingly. In all cases, we >> - * remap and resubmit the failed entries as fast as possible; if a flush is >> - * waiting on a completion, the whole stack would stall otherwise. >> - */ >> -static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) >> +/* Map remaining sectors in chunk, starting from ppa */ >> +static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa) >> { >> - void *comp_bits = &rqd->ppa_status; >> - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); >> - struct pblk_rec_ctx *recovery; >> - struct ppa_addr *ppa_list = rqd->ppa_list; >> - int nr_ppas = rqd->nr_ppas; >> - unsigned int c_entries; >> - int bit, ret; >> + struct nvm_tgt_dev *dev = pblk->dev; >> + struct nvm_geo *geo = &dev->geo; >> + struct pblk_line *line; >> + struct ppa_addr map_ppa = *ppa; >> + u64 paddr; >> + int done = 0; >> >> - if (unlikely(nr_ppas == 1)) >> - ppa_list = &rqd->ppa_addr; >> + line = &pblk->lines[pblk_ppa_to_line(*ppa)]; >> + spin_lock(&line->lock); >> >> - recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC); >> + while (!done) { >> + paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa); >> >> - INIT_LIST_HEAD(&recovery->failed); >> + if (!test_and_set_bit(paddr, line->map_bitmap)) >> + line->left_msecs--; >> >> - bit = -1; >> - while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) { >> - struct pblk_rb_entry *entry; >> - struct ppa_addr ppa; >> + if (!test_and_set_bit(paddr, line->invalid_bitmap)) >> + le32_add_cpu(line->vsc, -1); >> >> - /* Logic error */ >> - if (bit > c_ctx->nr_valid) { >> - WARN_ONCE(1, "pblk: corrupted write request\n"); >> - mempool_free(recovery, pblk->rec_pool); >> - goto out; >> + if (geo->version == NVM_OCSSD_SPEC_12) { >> + map_ppa.ppa++; >> + if (map_ppa.g.pg == geo->num_pg) >> + done = 1; >> + } else { >> + map_ppa.m.sec++; >> + if (map_ppa.m.sec == geo->clba) >> + done = 1; >> } >> + } >> >> - ppa = ppa_list[bit]; >> - entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa); >> - if (!entry) { >> - pr_err("pblk: could not scan entry on write failure\n"); >> - mempool_free(recovery, pblk->rec_pool); >> - goto out; >> - } >> + spin_unlock(&line->lock); >> +} >> + >> +static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry, >> + unsigned int nr_entries) > > Can you align this? Done. > >> +{ >> + struct pblk_rb *rb = &pblk->rwb; >> + struct pblk_rb_entry *entry; >> + struct pblk_line *line; >> + struct pblk_w_ctx *w_ctx; >> + struct ppa_addr ppa_l2p; >> + int flags; >> + unsigned int pos, i; >> + >> + spin_lock(&pblk->trans_lock); >> + pos = sentry; >> + for (i = 0; i < nr_entries; i++) { >> + entry = &rb->entries[pos]; >> + w_ctx = &entry->w_ctx; >> + >> + /* Check if the lba has been overwritten */ >> + ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba); >> + if (!pblk_ppa_comp(ppa_l2p, entry->cacheline)) >> + w_ctx->lba = ADDR_EMPTY; >> + >> + /* Mark up the entry as submittable again */ >> + flags = READ_ONCE(w_ctx->flags); >> + flags |= PBLK_WRITTEN_DATA; >> + /* Release flags on write context. Protect from writes */ >> + smp_store_release(&w_ctx->flags, flags); >> >> - /* The list is filled first and emptied afterwards. No need for >> - * protecting it with a lock >> + /* Decrese the reference count to the line as we will >> + * re-map these entries >> */ >> - list_add_tail(&entry->index, &recovery->failed); >> + line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)]; >> + kref_put(&line->ref, pblk_line_put); >> + >> + pos = (pos + 1) & (rb->nr_entries - 1); >> } >> + spin_unlock(&pblk->trans_lock); >> +} >> >> - c_entries = find_first_bit(comp_bits, nr_ppas); >> - ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries); >> - if (ret) { >> - pr_err("pblk: could not recover from write failure\n"); >> - mempool_free(recovery, pblk->rec_pool); >> - goto out; >> +static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx) >> +{ >> + struct pblk_c_ctx *r_ctx; >> + >> + r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL); >> + if (!r_ctx) { >> + pr_err("pblk: failed to allocate resubmit context"); > > No need to print allocation failures - I know there are a few of these > in the code, but they should be removed. Yep, checkpatch warns on these. Done. > >> + return; >> } >> >> + r_ctx->lun_bitmap = NULL; >> + r_ctx->sentry = c_ctx->sentry; >> + r_ctx->nr_valid = c_ctx->nr_valid; >> + r_ctx->nr_padded = c_ctx->nr_padded; >> + >> + spin_lock(&pblk->resubmit_lock); >> + list_add_tail(&r_ctx->list, &pblk->resubmit_list); >> + spin_unlock(&pblk->resubmit_lock); >> +} >> + >> +static void pblk_submit_rec(struct work_struct *work) >> +{ >> + struct pblk_rec_ctx *recovery = >> + container_of(work, struct pblk_rec_ctx, ws_rec); >> + struct pblk *pblk = recovery->pblk; >> + struct nvm_rq *rqd = recovery->rqd; >> + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); >> + struct ppa_addr *ppa_list; >> + >> + pblk_log_write_err(pblk, rqd); >> + >> + if (rqd->nr_ppas == 1) >> + ppa_list = &rqd->ppa_addr; >> + else >> + ppa_list = rqd->ppa_list; >> + >> + pblk_map_remaining(pblk, ppa_list); >> + pblk_queue_resubmit(pblk, c_ctx); >> + >> + pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap); >> + if (c_ctx->nr_padded) >> + pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, >> + c_ctx->nr_padded); >> + bio_put(rqd->bio); >> + pblk_free_rqd(pblk, rqd, PBLK_WRITE); >> + mempool_free(recovery, pblk->rec_pool); >> + >> + atomic_dec(&pblk->inflight_io); >> +} >> + >> + >> +static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) >> +{ >> + struct pblk_rec_ctx *recovery; >> + >> + recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC); >> + if (!recovery) { >> + pr_err("pblk: could not allocate recovery work\n"); >> + return; >> + } >> + >> + recovery->pblk = pblk; >> + recovery->rqd = rqd; >> + >> INIT_WORK(&recovery->ws_rec, pblk_submit_rec); >> queue_work(pblk->close_wq, &recovery->ws_rec); >> - >> -out: >> - pblk_complete_write(pblk, rqd, c_ctx); >> } >> >> static void pblk_end_io_write(struct nvm_rq *rqd) >> @@ -173,8 +252,8 @@ static void pblk_end_io_write(struct nvm_rq *rqd) >> struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); >> >> if (rqd->error) { >> - pblk_log_write_err(pblk, rqd); >> - return pblk_end_w_fail(pblk, rqd); >> + pblk_end_w_fail(pblk, rqd); >> + return; >> } >> #ifdef CONFIG_NVM_DEBUG >> else >> @@ -339,6 +418,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) >> bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, >> l_mg->emeta_alloc_type, GFP_KERNEL); >> if (IS_ERR(bio)) { >> + pr_err("pblk: failed to map emeta io"); >> ret = PTR_ERR(bio); >> goto fail_free_rqd; >> } >> @@ -515,26 +595,54 @@ static int pblk_submit_write(struct pblk *pblk) >> unsigned int secs_avail, secs_to_sync, secs_to_com; >> unsigned int secs_to_flush; >> unsigned long pos; >> + unsigned int resubmit; >> >> - /* If there are no sectors in the cache, flushes (bios without data) >> - * will be cleared on the cache threads >> - */ >> - secs_avail = pblk_rb_read_count(&pblk->rwb); >> - if (!secs_avail) >> - return 1; >> - >> - secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); >> - if (!secs_to_flush && secs_avail < pblk->min_write_pgs) >> - return 1; >> - >> - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); >> - if (secs_to_sync > pblk->max_write_pgs) { >> - pr_err("pblk: bad buffer sync calculation\n"); >> - return 1; >> - } >> + spin_lock(&pblk->resubmit_lock); >> + resubmit = !list_empty(&pblk->resubmit_list); >> + spin_unlock(&pblk->resubmit_lock); >> + >> + /* Resubmit failed writes first */ >> + if (resubmit) { >> + struct pblk_c_ctx *r_ctx; >> + >> + spin_lock(&pblk->resubmit_lock); >> + r_ctx = list_first_entry(&pblk->resubmit_list, >> + struct pblk_c_ctx, list); >> + list_del(&r_ctx->list); >> + spin_unlock(&pblk->resubmit_lock); >> >> - secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync; >> - pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); >> + secs_avail = r_ctx->nr_valid; >> + pos = r_ctx->sentry; >> + >> + pblk_prepare_resubmit(pblk, pos, secs_avail); >> + secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, >> + secs_avail); >> + >> + kfree(r_ctx); >> + } else { >> + /* If there are no sectors in the cache, >> + * flushes (bios without data) will be cleared on >> + * the cache threads >> + */ >> + secs_avail = pblk_rb_read_count(&pblk->rwb); >> + if (!secs_avail) >> + return 1; >> + >> + secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb); >> + if (!secs_to_flush && secs_avail < pblk->min_write_pgs) >> + return 1; >> + >> + secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, >> + secs_to_flush); >> + if (secs_to_sync > pblk->max_write_pgs) { >> + pr_err("pblk: bad buffer sync calculation\n"); >> + return 1; >> + } >> + >> + secs_to_com = (secs_to_sync > secs_avail) ? >> + secs_avail : secs_to_sync; >> + pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); >> + } >> >> bio = bio_alloc(GFP_KERNEL, secs_to_sync); >> >> @@ -553,6 +661,7 @@ static int pblk_submit_write(struct pblk *pblk) >> if (pblk_submit_io_set(pblk, rqd)) >> goto fail_free_bio; >> >> + > > No need for the extra line Removed. > >> #ifdef CONFIG_NVM_DEBUG >> atomic_long_add(secs_to_sync, &pblk->sub_writes); >> #endif >> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h >> index 9838d03..cff6aea 100644 >> --- a/drivers/lightnvm/pblk.h >> +++ b/drivers/lightnvm/pblk.h >> @@ -128,7 +128,6 @@ struct pblk_pad_rq { >> struct pblk_rec_ctx { >> struct pblk *pblk; >> struct nvm_rq *rqd; >> - struct list_head failed; >> struct work_struct ws_rec; >> }; >> >> @@ -664,6 +663,9 @@ struct pblk { >> >> struct list_head compl_list; >> >> + spinlock_t resubmit_lock; /* Resubmit list lock */ >> + struct list_head resubmit_list; /* Resubmit list for failed writes*/ >> + >> mempool_t *page_bio_pool; >> mempool_t *gen_ws_pool; >> mempool_t *rec_pool; >> @@ -849,13 +851,9 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); >> /* >> * pblk recovery >> */ >> -void pblk_submit_rec(struct work_struct *work); >> struct pblk_line *pblk_recov_l2p(struct pblk *pblk); >> int pblk_recov_pad(struct pblk *pblk); >> int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta); >> -int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, >> - struct pblk_rec_ctx *recovery, u64 *comp_bits, >> - unsigned int comp); >> >> /* >> * pblk gc >> -- >> 2.7.4 > > Otherwise, it looks good to me. Great, thanks for the review!