> On 21 Aug 2018, at 13.54, Matias Bjørling <mb@xxxxxxxxxxx> wrote:
>
>> On 08/17/2018 12:19 PM, Javier González wrote:
>> pblk maintains two different metadata paths for smeta and emeta, which
>> store metadata at the start of the line and at the end of the line,
>> respectively. Until now, these paths have been common for writing and
>> retrieving metadata; however, as these paths diverge, the common code
>> becomes less clear and unnecessarily complicated.
>>
>> In preparation for further changes to the metadata write path, this
>> patch separates the write and read paths for smeta and emeta and
>> removes the synchronous emeta path as it is not used anymore (emeta is
>> scheduled asynchronously to prevent jittering due to internal I/Os).
>>
>> Signed-off-by: Javier González <javier@xxxxxxxxxxxx>
>> ---
>>  drivers/lightnvm/pblk-core.c     | 338 ++++++++++++++++++---------------------
>>  drivers/lightnvm/pblk-gc.c       |   2 +-
>>  drivers/lightnvm/pblk-recovery.c |   4 +-
>>  drivers/lightnvm/pblk.h          |   5 +-
>>  4 files changed, 164 insertions(+), 185 deletions(-)
>>
>> diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
>> index 72de7456845b..52306573cc0e 100644
>> --- a/drivers/lightnvm/pblk-core.c
>> +++ b/drivers/lightnvm/pblk-core.c
>> @@ -621,12 +621,137 @@ u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line)
>>  	return paddr;
>>  }
>> -/*
>> - * Submit emeta to one LUN in the raid line at the time to avoid a deadlock when
>> - * taking the per LUN semaphore.
>> - */
>> -static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
>> -				void *emeta_buf, u64 paddr, int dir)
>> +u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line)
>> +{
>> +	struct nvm_tgt_dev *dev = pblk->dev;
>> +	struct nvm_geo *geo = &dev->geo;
>> +	struct pblk_line_meta *lm = &pblk->lm;
>> +	int bit;
>> +
>> +	/* This usually only happens on bad lines */
>> +	bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
>> +	if (bit >= lm->blk_per_line)
>> +		return -1;
>> +
>> +	return bit * geo->ws_opt;
>> +}
>> +
>> +int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line)
>> +{
>> +	struct nvm_tgt_dev *dev = pblk->dev;
>> +	struct pblk_line_meta *lm = &pblk->lm;
>> +	struct bio *bio;
>> +	struct nvm_rq rqd;
>> +	u64 paddr = pblk_line_smeta_start(pblk, line);
>> +	int i, ret;
>> +
>> +	memset(&rqd, 0, sizeof(struct nvm_rq));
>> +
>> +	rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
>> +						&rqd.dma_meta_list);
>> +	if (!rqd.meta_list)
>> +		return -ENOMEM;
>> +
>> +	rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size;
>> +	rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size;
>> +
>> +	bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL);
>> +	if (IS_ERR(bio)) {
>> +		ret = PTR_ERR(bio);
>> +		goto free_ppa_list;
>> +	}
>> +
>> +	bio->bi_iter.bi_sector = 0; /* internal bio */
>> +	bio_set_op_attrs(bio, REQ_OP_READ, 0);
>> +
>> +	rqd.bio = bio;
>> +	rqd.opcode = NVM_OP_PREAD;
>> +	rqd.nr_ppas = lm->smeta_sec;
>> +	rqd.is_seq = 1;
>> +
>> +	for (i = 0; i < lm->smeta_sec; i++, paddr++)
>> +		rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
>> +
>> +	ret = pblk_submit_io_sync(pblk, &rqd);
>> +	if (ret) {
>> +		pblk_err(pblk, "smeta I/O submission failed: %d\n", ret);
>> +		bio_put(bio);
>> +		goto free_ppa_list;
>> +	}
>> +
>> +	atomic_dec(&pblk->inflight_io);
>> +
>> +	if (rqd.error)
>> +		pblk_log_read_err(pblk, &rqd);
>> +
>> +free_ppa_list:
>> +	nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list);
>> +	return ret;
>> +}
>> +
>> +static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line,
>> +				u64 paddr)
>> +{
>> +	struct nvm_tgt_dev *dev = pblk->dev;
>> +	struct pblk_line_meta *lm = &pblk->lm;
>> +	struct bio *bio;
>> +	struct nvm_rq rqd;
>> +	__le64 *lba_list = emeta_to_lbas(pblk, line->emeta->buf);
>> +	__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
>> +	int i, ret;
>> +
>> +	memset(&rqd, 0, sizeof(struct nvm_rq));
>> +
>> +	rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
>> +						&rqd.dma_meta_list);
>> +	if (!rqd.meta_list)
>> +		return -ENOMEM;
>> +
>> +	rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size;
>> +	rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size;
>> +
>> +	bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL);
>> +	if (IS_ERR(bio)) {
>> +		ret = PTR_ERR(bio);
>> +		goto free_ppa_list;
>> +	}
>> +
>> +	bio->bi_iter.bi_sector = 0; /* internal bio */
>> +	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
>> +
>> +	rqd.bio = bio;
>> +	rqd.opcode = NVM_OP_PWRITE;
>> +	rqd.nr_ppas = lm->smeta_sec;
>> +	rqd.is_seq = 1;
>> +
>> +	for (i = 0; i < lm->smeta_sec; i++, paddr++) {
>> +		struct pblk_sec_meta *meta_list = rqd.meta_list;
>> +
>> +		rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
>> +		meta_list[i].lba = lba_list[paddr] = addr_empty;
>> +	}
>> +
>> +	ret = pblk_submit_io_sync(pblk, &rqd);
>> +	if (ret) {
>> +		pblk_err(pblk, "smeta I/O submission failed: %d\n", ret);
>> +		bio_put(bio);
>> +		goto free_ppa_list;
>> +	}
>> +
>> +	atomic_dec(&pblk->inflight_io);
>> +
>> +	if (rqd.error) {
>> +		pblk_log_write_err(pblk, &rqd);
>> +		ret = -EIO;
>> +	}
>> +
>> +free_ppa_list:
>> +	nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list);
>> +	return ret;
>> +}
>> +
>> +int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line,
>> +				void *emeta_buf)
>>  {
>>  	struct nvm_tgt_dev *dev = pblk->dev;
>>  	struct nvm_geo *geo = &dev->geo;
>> @@ -635,24 +760,15 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
>>  	void *ppa_list, *meta_list;
>>  	struct bio *bio;
>>  	struct nvm_rq rqd;
>> +	u64 paddr = line->emeta_ssec;
>>  	dma_addr_t dma_ppa_list, dma_meta_list;
>>  	int min = pblk->min_write_pgs;
>>  	int left_ppas = lm->emeta_sec[0];
>> -	int id = line->id;
>> +	int line_id = line->id;
>>  	int rq_ppas, rq_len;
>> -	int cmd_op, bio_op;
>>  	int i, j;
>>  	int ret;
>> -	if (dir == PBLK_WRITE) {
>> -		bio_op = REQ_OP_WRITE;
>> -		cmd_op = NVM_OP_PWRITE;
>> -	} else if (dir == PBLK_READ) {
>> -		bio_op = REQ_OP_READ;
>> -		cmd_op = NVM_OP_PREAD;
>> -	} else
>> -		return -EINVAL;
>> -
>>  	meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
>>  					&dma_meta_list);
>>  	if (!meta_list)
>> @@ -675,64 +791,43 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
>>  	}
>>  	bio->bi_iter.bi_sector = 0; /* internal bio */
>> -	bio_set_op_attrs(bio, bio_op, 0);
>> +	bio_set_op_attrs(bio, REQ_OP_READ, 0);
>>  	rqd.bio = bio;
>>  	rqd.meta_list = meta_list;
>>  	rqd.ppa_list = ppa_list;
>>  	rqd.dma_meta_list = dma_meta_list;
>>  	rqd.dma_ppa_list = dma_ppa_list;
>> -	rqd.opcode = cmd_op;
>> +	rqd.opcode = NVM_OP_PREAD;
>>  	rqd.nr_ppas = rq_ppas;
>> -	if (dir == PBLK_WRITE) {
>> -		struct pblk_sec_meta *meta_list = rqd.meta_list;
>> +	for (i = 0; i < rqd.nr_ppas; ) {
>> +		struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, line_id);
>> +		int pos = pblk_ppa_to_pos(geo, ppa);
>> -		rqd.is_seq = 1;
>> -		for (i = 0; i < rqd.nr_ppas; ) {
>> -			spin_lock(&line->lock);
>> -			paddr = __pblk_alloc_page(pblk, line, min);
>> -			spin_unlock(&line->lock);
>> -			for (j = 0; j < min; j++, i++, paddr++) {
>> -				meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
>> -				rqd.ppa_list[i] =
>> -					addr_to_gen_ppa(pblk, paddr, id);
>> -			}
>> -		}
>> -	} else {
>> -		for (i = 0; i < rqd.nr_ppas; ) {
>> -			struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id);
>> -			int pos = pblk_ppa_to_pos(geo, ppa);
>> +		if (pblk_io_aligned(pblk, rq_ppas))
>> +			rqd.is_seq = 1;
>> -			if (pblk_io_aligned(pblk, rq_ppas))
>> -				rqd.is_seq = 1;
>> -
>> -			while (test_bit(pos, line->blk_bitmap)) {
>> -				paddr += min;
>> -				if (pblk_boundary_paddr_checks(pblk, paddr)) {
>> -					pblk_err(pblk, "corrupt emeta line:%d\n",
>> -							line->id);
>> -					bio_put(bio);
>> -					ret = -EINTR;
>> -					goto free_rqd_dma;
>> -				}
>> -
>> -				ppa = addr_to_gen_ppa(pblk, paddr, id);
>> -				pos = pblk_ppa_to_pos(geo, ppa);
>> -			}
>> -
>> -			if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
>> -				pblk_err(pblk, "corrupt emeta line:%d\n",
>> -						line->id);
>> +		while (test_bit(pos, line->blk_bitmap)) {
>> +			paddr += min;
>> +			if (pblk_boundary_paddr_checks(pblk, paddr)) {
>>  				bio_put(bio);
>>  				ret = -EINTR;
>>  				goto free_rqd_dma;
>>  			}
>> -			for (j = 0; j < min; j++, i++, paddr++)
>> -				rqd.ppa_list[i] =
>> -					addr_to_gen_ppa(pblk, paddr, line->id);
>> +			ppa = addr_to_gen_ppa(pblk, paddr, line_id);
>> +			pos = pblk_ppa_to_pos(geo, ppa);
>>  		}
>> +
>> +		if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
>> +			bio_put(bio);
>> +			ret = -EINTR;
>> +			goto free_rqd_dma;
>> +		}
>> +
>> +		for (j = 0; j < min; j++, i++, paddr++)
>> +			rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line_id);
>>  	}
>>  	ret = pblk_submit_io_sync(pblk, &rqd);
>> @@ -744,136 +839,19 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
>>  	atomic_dec(&pblk->inflight_io);
>> -	if (rqd.error) {
>> -		if (dir == PBLK_WRITE)
>> -			pblk_log_write_err(pblk, &rqd);
>> -		else
>> -			pblk_log_read_err(pblk, &rqd);
>> -	}
>> +	if (rqd.error)
>> +		pblk_log_read_err(pblk, &rqd);
>>  	emeta_buf += rq_len;
>>  	left_ppas -= rq_ppas;
>>  	if (left_ppas)
>>  		goto next_rq;
>> +
>>  free_rqd_dma:
>>  	nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list);
>>  	return ret;
>>  }
>> -u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line)
>> -{
>> -	struct nvm_tgt_dev *dev = pblk->dev;
>> -	struct nvm_geo *geo = &dev->geo;
>> -	struct pblk_line_meta *lm = &pblk->lm;
>> -	int bit;
>> -
>> -	/* This usually only happens on bad lines */
>> -	bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
>> -	if (bit >= lm->blk_per_line)
>> -		return -1;
>> -
>> -	return bit * geo->ws_opt;
>> -}
>> -
>> -static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
>> -				u64 paddr, int dir)
>> -{
>> -	struct nvm_tgt_dev *dev = pblk->dev;
>> -	struct pblk_line_meta *lm = &pblk->lm;
>> -	struct bio *bio;
>> -	struct nvm_rq rqd;
>> -	__le64 *lba_list = NULL;
>> -	int i, ret;
>> -	int cmd_op, bio_op;
>> -
>> -	if (dir == PBLK_WRITE) {
>> -		bio_op = REQ_OP_WRITE;
>> -		cmd_op = NVM_OP_PWRITE;
>> -		lba_list = emeta_to_lbas(pblk, line->emeta->buf);
>> -	} else if (dir == PBLK_READ_RECOV || dir == PBLK_READ) {
>> -		bio_op = REQ_OP_READ;
>> -		cmd_op = NVM_OP_PREAD;
>> -	} else
>> -		return -EINVAL;
>> -
>> -	memset(&rqd, 0, sizeof(struct nvm_rq));
>> -
>> -	rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
>> -						&rqd.dma_meta_list);
>> -	if (!rqd.meta_list)
>> -		return -ENOMEM;
>> -
>> -	rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size;
>> -	rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size;
>> -
>> -	bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL);
>> -	if (IS_ERR(bio)) {
>> -		ret = PTR_ERR(bio);
>> -		goto free_ppa_list;
>> -	}
>> -
>> -	bio->bi_iter.bi_sector = 0; /* internal bio */
>> -	bio_set_op_attrs(bio, bio_op, 0);
>> -
>> -	rqd.bio = bio;
>> -	rqd.opcode = cmd_op;
>> -	rqd.is_seq = 1;
>> -	rqd.nr_ppas = lm->smeta_sec;
>> -
>> -	for (i = 0; i < lm->smeta_sec; i++, paddr++) {
>> -		struct pblk_sec_meta *meta_list = rqd.meta_list;
>> -
>> -		rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
>> -
>> -		if (dir == PBLK_WRITE) {
>> -			__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
>> -
>> -			meta_list[i].lba = lba_list[paddr] = addr_empty;
>> -		}
>> -	}
>> -
>> -	/*
>> -	 * This I/O is sent by the write thread when a line is replace. Since
>> -	 * the write thread is the only one sending write and erase commands,
>> -	 * there is no need to take the LUN semaphore.
>> -	 */
>> -	ret = pblk_submit_io_sync(pblk, &rqd);
>> -	if (ret) {
>> -		pblk_err(pblk, "smeta I/O submission failed: %d\n", ret);
>> -		bio_put(bio);
>> -		goto free_ppa_list;
>> -	}
>> -
>> -	atomic_dec(&pblk->inflight_io);
>> -
>> -	if (rqd.error) {
>> -		if (dir == PBLK_WRITE) {
>> -			pblk_log_write_err(pblk, &rqd);
>> -			ret = 1;
>> -		} else if (dir == PBLK_READ)
>> -			pblk_log_read_err(pblk, &rqd);
>> -	}
>> -
>> -free_ppa_list:
>> -	nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list);
>> -
>> -	return ret;
>> -}
>> -
>> -int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
>> -{
>> -	u64 bpaddr = pblk_line_smeta_start(pblk, line);
>> -
>> -	return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ_RECOV);
>> -}
>> -
>> -int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
>> -				void *emeta_buf)
>> -{
>> -	return pblk_line_submit_emeta_io(pblk, line, emeta_buf,
>> -					line->emeta_ssec, PBLK_READ);
>> -}
>> -
>>  static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd,
>>  				struct ppa_addr ppa)
>>  {
>> @@ -1102,7 +1080,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
>>  	line->smeta_ssec = off;
>>  	line->cur_sec = off + lm->smeta_sec;
>> -	if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) {
>> +	if (init && pblk_line_smeta_write(pblk, line, off)) {
>>  		pblk_debug(pblk, "line smeta I/O failed. Retry\n");
>>  		return 0;
>>  	}
>> diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
>> index 157c2567c9e8..189d92b58447 100644
>> --- a/drivers/lightnvm/pblk-gc.c
>> +++ b/drivers/lightnvm/pblk-gc.c
>> @@ -144,7 +144,7 @@ static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
>>  	if (!emeta_buf)
>>  		return NULL;
>> -	ret = pblk_line_read_emeta(pblk, line, emeta_buf);
>> +	ret = pblk_line_emeta_read(pblk, line, emeta_buf);
>>  	if (ret) {
>>  		pblk_err(pblk, "line %d read emeta failed (%d)\n",
>>  				line->id, ret);
>> diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
>> index cf629ab016ba..7b27e958dc8e 100644
>> --- a/drivers/lightnvm/pblk-recovery.c
>> +++ b/drivers/lightnvm/pblk-recovery.c
>> @@ -835,7 +835,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
>>  			continue;
>>  		/* Lines that cannot be read are assumed as not written here */
>> -		if (pblk_line_read_smeta(pblk, line))
>> +		if (pblk_line_smeta_read(pblk, line))
>>  			continue;
>>  		crc = pblk_calc_smeta_crc(pblk, smeta_buf);
>> @@ -905,7 +905,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
>>  		line->emeta = emeta;
>>  		memset(line->emeta->buf, 0, lm->emeta_len[0]);
>> -		if (pblk_line_read_emeta(pblk, line, line->emeta->buf)) {
>> +		if (pblk_line_emeta_read(pblk, line, line->emeta->buf)) {
>>  			pblk_recov_l2p_from_oob(pblk, line);
>>  			goto next;
>>  		}
>> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
>> index 48b3035df3c4..a71f9847957b 100644
>> --- a/drivers/lightnvm/pblk.h
>> +++ b/drivers/lightnvm/pblk.h
>> @@ -782,6 +782,7 @@ void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
>>  void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
>>  int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
>>  int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd);
>> +int pblk_submit_io_sync_sem(struct pblk *pblk, struct nvm_rq *rqd);
>
> I'll remove this from the patch and add to the next.
>
>>  int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line);
>>  struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
>>  			      unsigned int nr_secs, unsigned int len,
>> @@ -806,8 +807,8 @@ void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
>>  		     void (*work)(struct work_struct *), gfp_t gfp_mask,
>>  		     struct workqueue_struct *wq);
>>  u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line);
>> -int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line);
>> -int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
>> +int pblk_line_smeta_read(struct pblk *pblk, struct pblk_line *line);
>> +int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line,
>>  			 void *emeta_buf);
>>  int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa);
>>  void pblk_line_put(struct kref *ref);
>
> Thanks. Applied for 4.20. I think the split could be optimized quite a
> bit with respect to complexity. Now a lot of very similar code is spread
> across two functions. I may look into optimizing the request alloc path,
> such that it is not open coded in most places.

Let me take another pass at it. I'll submit a V2 later this week.

Javier.
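
P.S. To make the direction concrete, something along these lines is what I
have in mind for the shared alloc path (rough, untested sketch; the helper
names are just placeholders until the V2 is ready):

static int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd)
{
	struct nvm_tgt_dev *dev = pblk->dev;

	/* Single DMA buffer holding both the OOB metadata and the PPA list */
	rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
						&rqd->dma_meta_list);
	if (!rqd->meta_list)
		return -ENOMEM;

	rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
	rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;

	return 0;
}

static void pblk_free_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd)
{
	struct nvm_tgt_dev *dev = pblk->dev;

	if (rqd->meta_list)
		nvm_dev_dma_free(dev->parent, rqd->meta_list,
					rqd->dma_meta_list);
}

With that, pblk_line_smeta_read(), pblk_line_smeta_write() and
pblk_line_emeta_read() would all start with pblk_alloc_rqd_meta() and unwind
through pblk_free_rqd_meta() instead of open coding the DMA allocation and
the ppa_list/meta_list pointer setup in each path.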