Re: [PATCH 1/3] lightnvm: pblk: rework write error recovery path

Javier Gonzalez <javier@xxxxxxxxxxxx> · Fri, 20 Apr 2018 18:38:58 +0000

> On 19 Apr 2018, at 09.39, Hans Holmberg <hans.ml.holmberg@xxxxxxxxxxxxx> wrote:
> 
> From: Hans Holmberg <hans.holmberg@xxxxxxxxxxxx>
> 
> The write error recovery path is incomplete, so rework
> the write error recovery handling to do resubmits directly
> from the write buffer.
> 
> When a write error occurs, the remaining sectors in the chunk are
> mapped out and invalidated and the request inserted in a resubmit list.
> 
> The writer thread checks if there are any requests to resubmit,
> scans and invalidates any lbas that have been overwritten by later
> writes and resubmits the failed entries.
> 
> Signed-off-by: Hans Holmberg <hans.holmberg@xxxxxxxxxxxx>
> ---
> drivers/lightnvm/pblk-init.c     |   2 +
> drivers/lightnvm/pblk-recovery.c |  91 ---------------
> drivers/lightnvm/pblk-write.c    | 241 ++++++++++++++++++++++++++++-----------
> drivers/lightnvm/pblk.h          |   8 +-
> 4 files changed, 180 insertions(+), 162 deletions(-)
> 
> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
> index bfc488d..6f06727 100644
> --- a/drivers/lightnvm/pblk-init.c
> +++ b/drivers/lightnvm/pblk-init.c
> @@ -426,6 +426,7 @@ static int pblk_core_init(struct pblk *pblk)
> 		goto free_r_end_wq;
> 
> 	INIT_LIST_HEAD(&pblk->compl_list);
> +	INIT_LIST_HEAD(&pblk->resubmit_list);
> 
> 	return 0;
> 
> @@ -1185,6 +1186,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
> 	pblk->state = PBLK_STATE_RUNNING;
> 	pblk->gc.gc_enabled = 0;
> 
> +	spin_lock_init(&pblk->resubmit_lock);
> 	spin_lock_init(&pblk->trans_lock);
> 	spin_lock_init(&pblk->lock);
> 
> diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
> index 9cb6d5d..5983428 100644
> --- a/drivers/lightnvm/pblk-recovery.c
> +++ b/drivers/lightnvm/pblk-recovery.c
> @@ -16,97 +16,6 @@
> 
> #include "pblk.h"
> 
> -void pblk_submit_rec(struct work_struct *work)
> -{
> -	struct pblk_rec_ctx *recovery =
> -			container_of(work, struct pblk_rec_ctx, ws_rec);
> -	struct pblk *pblk = recovery->pblk;
> -	struct nvm_rq *rqd = recovery->rqd;
> -	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
> -	struct bio *bio;
> -	unsigned int nr_rec_secs;
> -	unsigned int pgs_read;
> -	int ret;
> -
> -	nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
> -								NVM_MAX_VLBA);
> -
> -	bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
> -
> -	bio->bi_iter.bi_sector = 0;
> -	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
> -	rqd->bio = bio;
> -	rqd->nr_ppas = nr_rec_secs;
> -
> -	pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed,
> -								nr_rec_secs);

Please, remove functions that are not longer used. Doing a pass on the
rest of the removed functions would be a good idea.

> -	if (pgs_read != nr_rec_secs) {
> -		pr_err("pblk: could not read recovery entries\n");
> -		goto err;
> -	}
> -
> -	if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {

Same here

> -		pr_err("pblk: could not setup recovery request\n");
> -		goto err;
> -	}
> -
> -#ifdef CONFIG_NVM_DEBUG
> -	atomic_long_add(nr_rec_secs, &pblk->recov_writes);
> -#endif

Can you add this debug counter to the new path? I see you added other
counters, if it is a rename, can you put it on a separate patch?

> -
> -	ret = pblk_submit_io(pblk, rqd);
> -	if (ret) {
> -		pr_err("pblk: I/O submission failed: %d\n", ret);
> -		goto err;
> -	}
> -
> -	mempool_free(recovery, pblk->rec_pool);
> -	return;
> -
> -err:
> -	bio_put(bio);
> -	pblk_free_rqd(pblk, rqd, PBLK_WRITE);
> -}
> -
> -int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
> -			struct pblk_rec_ctx *recovery, u64 *comp_bits,
> -			unsigned int comp)
> -{
> -	struct nvm_rq *rec_rqd;
> -	struct pblk_c_ctx *rec_ctx;
> -	int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
> -
> -	rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
> -	rec_ctx = nvm_rq_to_pdu(rec_rqd);
> -
> -	/* Copy completion bitmap, but exclude the first X completed entries */
> -	bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
> -				(unsigned long int *)comp_bits,
> -				comp, NVM_MAX_VLBA);
> -
> -	/* Save the context for the entries that need to be re-written and
> -	 * update current context with the completed entries.
> -	 */
> -	rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp);
> -	if (comp >= c_ctx->nr_valid) {
> -		rec_ctx->nr_valid = 0;
> -		rec_ctx->nr_padded = nr_entries - comp;
> -
> -		c_ctx->nr_padded = comp - c_ctx->nr_valid;
> -	} else {
> -		rec_ctx->nr_valid = c_ctx->nr_valid - comp;
> -		rec_ctx->nr_padded = c_ctx->nr_padded;
> -
> -		c_ctx->nr_valid = comp;
> -		c_ctx->nr_padded = 0;
> -	}
> -
> -	recovery->rqd = rec_rqd;
> -	recovery->pblk = pblk;
> -
> -	return 0;
> -}
> -
> int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
> {
> 	u32 crc;
> diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
> index 3e6f1eb..ab45157 100644
> --- a/drivers/lightnvm/pblk-write.c
> +++ b/drivers/lightnvm/pblk-write.c
> @@ -103,68 +103,147 @@ static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
> 	pblk_rb_sync_end(&pblk->rwb, &flags);
> }
> 
> -/* When a write fails, we are not sure whether the block has grown bad or a page
> - * range is more susceptible to write errors. If a high number of pages fail, we
> - * assume that the block is bad and we mark it accordingly. In all cases, we
> - * remap and resubmit the failed entries as fast as possible; if a flush is
> - * waiting on a completion, the whole stack would stall otherwise.
> - */
> -static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
> +/* Map remaining sectors in chunk, starting from ppa */
> +static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa)
> {
> -	void *comp_bits = &rqd->ppa_status;
> -	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
> -	struct pblk_rec_ctx *recovery;
> -	struct ppa_addr *ppa_list = rqd->ppa_list;
> -	int nr_ppas = rqd->nr_ppas;
> -	unsigned int c_entries;
> -	int bit, ret;
> +	struct nvm_tgt_dev *dev = pblk->dev;
> +	struct nvm_geo *geo = &dev->geo;
> +	struct pblk_line *line;
> +	struct ppa_addr map_ppa = *ppa;
> +	u64 paddr;
> +	int done = 0;
> 
> -	if (unlikely(nr_ppas == 1))
> -		ppa_list = &rqd->ppa_addr;
> +	line = &pblk->lines[pblk_ppa_to_line(*ppa)];
> +	spin_lock(&line->lock);
> 
> -	recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
> +	while (!done)  {
> +		paddr = pblk_dev_ppa_to_line_addr(pblk, map_ppa);
> 
> -	INIT_LIST_HEAD(&recovery->failed);
> +		if (!test_and_set_bit(paddr, line->map_bitmap))
> +			line->left_msecs--;
> 
> -	bit = -1;
> -	while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
> -		struct pblk_rb_entry *entry;
> -		struct ppa_addr ppa;
> +		if (!test_and_set_bit(paddr, line->invalid_bitmap))
> +			le32_add_cpu(line->vsc, -1);
> 
> -		/* Logic error */
> -		if (bit > c_ctx->nr_valid) {
> -			WARN_ONCE(1, "pblk: corrupted write request\n");
> -			mempool_free(recovery, pblk->rec_pool);
> -			goto out;
> +		if (geo->version == NVM_OCSSD_SPEC_12) {
> +			map_ppa.ppa++;
> +			if (map_ppa.g.pg == geo->num_pg)
> +				done = 1;
> +		} else {
> +			map_ppa.m.sec++;
> +			if (map_ppa.m.sec == geo->clba)
> +				done = 1;
> 		}
> +	}
> 
> -		ppa = ppa_list[bit];
> -		entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
> -		if (!entry) {
> -			pr_err("pblk: could not scan entry on write failure\n");
> -			mempool_free(recovery, pblk->rec_pool);
> -			goto out;
> -		}
> +	spin_unlock(&line->lock);
> +}
> +
> +static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
> +		unsigned int nr_entries)

Can you align this?

> +{
> +	struct pblk_rb *rb = &pblk->rwb;
> +	struct pblk_rb_entry *entry;
> +	struct pblk_line *line;
> +	struct pblk_w_ctx *w_ctx;
> +	struct ppa_addr ppa_l2p;
> +	int flags;
> +	unsigned int pos, i;
> +
> +	spin_lock(&pblk->trans_lock);
> +	pos = sentry;
> +	for (i = 0; i < nr_entries; i++) {
> +		entry = &rb->entries[pos];
> +		w_ctx = &entry->w_ctx;
> +
> +		/* Check if the lba has been overwritten */
> +		ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
> +		if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
> +			w_ctx->lba = ADDR_EMPTY;
> +
> +		/* Mark up the entry as submittable again */
> +		flags = READ_ONCE(w_ctx->flags);
> +		flags |= PBLK_WRITTEN_DATA;
> +		/* Release flags on write context. Protect from writes */
> +		smp_store_release(&w_ctx->flags, flags);
> 
> -		/* The list is filled first and emptied afterwards. No need for
> -		 * protecting it with a lock
> +		/* Decrese the reference count to the line as we will
> +		 * re-map these entries
> 		 */
> -		list_add_tail(&entry->index, &recovery->failed);
> +		line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
> +		kref_put(&line->ref, pblk_line_put);
> +
> +		pos = (pos + 1) & (rb->nr_entries - 1);
> 	}
> +	spin_unlock(&pblk->trans_lock);
> +}
> 
> -	c_entries = find_first_bit(comp_bits, nr_ppas);
> -	ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
> -	if (ret) {
> -		pr_err("pblk: could not recover from write failure\n");
> -		mempool_free(recovery, pblk->rec_pool);
> -		goto out;
> +static void pblk_queue_resubmit(struct pblk *pblk, struct pblk_c_ctx *c_ctx)
> +{
> +	struct pblk_c_ctx *r_ctx;
> +
> +	r_ctx = kzalloc(sizeof(struct pblk_c_ctx), GFP_KERNEL);
> +	if (!r_ctx) {
> +		pr_err("pblk: failed to allocate resubmit context");

No need to print allocation failures - I know there are a few of these
in the code, but they should be removed.

> +		return;
> 	}
> 
> +	r_ctx->lun_bitmap = NULL;
> +	r_ctx->sentry = c_ctx->sentry;
> +	r_ctx->nr_valid = c_ctx->nr_valid;
> +	r_ctx->nr_padded = c_ctx->nr_padded;
> +
> +	spin_lock(&pblk->resubmit_lock);
> +	list_add_tail(&r_ctx->list, &pblk->resubmit_list);
> +	spin_unlock(&pblk->resubmit_lock);
> +}
> +
> +static void pblk_submit_rec(struct work_struct *work)
> +{
> +	struct pblk_rec_ctx *recovery =
> +			container_of(work, struct pblk_rec_ctx, ws_rec);
> +	struct pblk *pblk = recovery->pblk;
> +	struct nvm_rq *rqd = recovery->rqd;
> +	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
> +	struct ppa_addr *ppa_list;
> +
> +	pblk_log_write_err(pblk, rqd);
> +
> +	if (rqd->nr_ppas == 1)
> +		ppa_list = &rqd->ppa_addr;
> +	else
> +		ppa_list = rqd->ppa_list;
> +
> +	pblk_map_remaining(pblk, ppa_list);
> +	pblk_queue_resubmit(pblk, c_ctx);
> +
> +	pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
> +	if (c_ctx->nr_padded)
> +		pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid,
> +							c_ctx->nr_padded);
> +	bio_put(rqd->bio);
> +	pblk_free_rqd(pblk, rqd, PBLK_WRITE);
> +	mempool_free(recovery, pblk->rec_pool);
> +
> +	atomic_dec(&pblk->inflight_io);
> +}
> +
> +
> +static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
> +{
> +	struct pblk_rec_ctx *recovery;
> +
> +	recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
> +	if (!recovery) {
> +		pr_err("pblk: could not allocate recovery work\n");
> +		return;
> +	}
> +
> +	recovery->pblk = pblk;
> +	recovery->rqd = rqd;
> +
> 	INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
> 	queue_work(pblk->close_wq, &recovery->ws_rec);
> -
> -out:
> -	pblk_complete_write(pblk, rqd, c_ctx);
> }
> 
> static void pblk_end_io_write(struct nvm_rq *rqd)
> @@ -173,8 +252,8 @@ static void pblk_end_io_write(struct nvm_rq *rqd)
> 	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
> 
> 	if (rqd->error) {
> -		pblk_log_write_err(pblk, rqd);
> -		return pblk_end_w_fail(pblk, rqd);
> +		pblk_end_w_fail(pblk, rqd);
> +		return;
> 	}
> #ifdef CONFIG_NVM_DEBUG
> 	else
> @@ -339,6 +418,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
> 	bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len,
> 					l_mg->emeta_alloc_type, GFP_KERNEL);
> 	if (IS_ERR(bio)) {
> +		pr_err("pblk: failed to map emeta io");
> 		ret = PTR_ERR(bio);
> 		goto fail_free_rqd;
> 	}
> @@ -515,26 +595,54 @@ static int pblk_submit_write(struct pblk *pblk)
> 	unsigned int secs_avail, secs_to_sync, secs_to_com;
> 	unsigned int secs_to_flush;
> 	unsigned long pos;
> +	unsigned int resubmit;
> 
> -	/* If there are no sectors in the cache, flushes (bios without data)
> -	 * will be cleared on the cache threads
> -	 */
> -	secs_avail = pblk_rb_read_count(&pblk->rwb);
> -	if (!secs_avail)
> -		return 1;
> -
> -	secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
> -	if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
> -		return 1;
> -
> -	secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
> -	if (secs_to_sync > pblk->max_write_pgs) {
> -		pr_err("pblk: bad buffer sync calculation\n");
> -		return 1;
> -	}
> +	spin_lock(&pblk->resubmit_lock);
> +	resubmit = !list_empty(&pblk->resubmit_list);
> +	spin_unlock(&pblk->resubmit_lock);
> +
> +	/* Resubmit failed writes first */
> +	if (resubmit) {
> +		struct pblk_c_ctx *r_ctx;
> +
> +		spin_lock(&pblk->resubmit_lock);
> +		r_ctx = list_first_entry(&pblk->resubmit_list,
> +					struct pblk_c_ctx, list);
> +		list_del(&r_ctx->list);
> +		spin_unlock(&pblk->resubmit_lock);
> 
> -	secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
> -	pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
> +		secs_avail = r_ctx->nr_valid;
> +		pos = r_ctx->sentry;
> +
> +		pblk_prepare_resubmit(pblk, pos, secs_avail);
> +		secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
> +				secs_avail);
> +
> +		kfree(r_ctx);
> +	} else {
> +		/* If there are no sectors in the cache,
> +		 * flushes (bios without data) will be cleared on
> +		 * the cache threads
> +		 */
> +		secs_avail = pblk_rb_read_count(&pblk->rwb);
> +		if (!secs_avail)
> +			return 1;
> +
> +		secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
> +		if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
> +			return 1;
> +
> +		secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
> +					secs_to_flush);
> +		if (secs_to_sync > pblk->max_write_pgs) {
> +			pr_err("pblk: bad buffer sync calculation\n");
> +			return 1;
> +		}
> +
> +		secs_to_com = (secs_to_sync > secs_avail) ?
> +			secs_avail : secs_to_sync;
> +		pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
> +	}
> 
> 	bio = bio_alloc(GFP_KERNEL, secs_to_sync);
> 
> @@ -553,6 +661,7 @@ static int pblk_submit_write(struct pblk *pblk)
> 	if (pblk_submit_io_set(pblk, rqd))
> 		goto fail_free_bio;
> 
> +

No need for the extra line

> #ifdef CONFIG_NVM_DEBUG
> 	atomic_long_add(secs_to_sync, &pblk->sub_writes);
> #endif
> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
> index 9838d03..cff6aea 100644
> --- a/drivers/lightnvm/pblk.h
> +++ b/drivers/lightnvm/pblk.h
> @@ -128,7 +128,6 @@ struct pblk_pad_rq {
> struct pblk_rec_ctx {
> 	struct pblk *pblk;
> 	struct nvm_rq *rqd;
> -	struct list_head failed;
> 	struct work_struct ws_rec;
> };
> 
> @@ -664,6 +663,9 @@ struct pblk {
> 
> 	struct list_head compl_list;
> 
> +	spinlock_t resubmit_lock;	 /* Resubmit list lock */
> +	struct list_head resubmit_list; /* Resubmit list for failed writes*/
> +
> 	mempool_t *page_bio_pool;
> 	mempool_t *gen_ws_pool;
> 	mempool_t *rec_pool;
> @@ -849,13 +851,9 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
> /*
>  * pblk recovery
>  */
> -void pblk_submit_rec(struct work_struct *work);
> struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
> int pblk_recov_pad(struct pblk *pblk);
> int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
> -int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
> -			struct pblk_rec_ctx *recovery, u64 *comp_bits,
> -			unsigned int comp);
> 
> /*
>  * pblk gc
> --
> 2.7.4

Otherwise, it looks good to me.

Javier
Attachment:
signature.asc

Description: Message signed with OpenPGP