From: Leon Romanovsky <leonro@xxxxxxxxxx>

This demonstrates how the new DMA API can fit into the NVMe driver and
replace the old DMA APIs. As this is an RFC, I expect more robust error
handling, optimizations, and in-depth testing in the final version once
we agree on the DMA API architecture.

Below is the performance comparison between the existing DMA API path
(sg_table) and the dma_map path. Once we have agreement on the new DMA
API design, I intend to collect similar profiling numbers for the new
DMA API.

sgl (sg_table + old DMA API) vs no_sgl (iod_dma_map + new DMA API):

block size                              IOPS (k)        Average of 3

4K
--------------------------------------------------------------
sg-list-fio-perf.bs-4k-1.fio:           68.6
sg-list-fio-perf.bs-4k-2.fio:           68              68.36
sg-list-fio-perf.bs-4k-3.fio:           68.5

no-sg-list-fio-perf.bs-4k-1.fio:        68.7
no-sg-list-fio-perf.bs-4k-2.fio:        68.5            68.43
no-sg-list-fio-perf.bs-4k-3.fio:        68.1

% Change default vs new DMA API = +0.0975%

8K
--------------------------------------------------------------
sg-list-fio-perf.bs-8k-1.fio:           67
sg-list-fio-perf.bs-8k-2.fio:           67.1            67.03
sg-list-fio-perf.bs-8k-3.fio:           67

no-sg-list-fio-perf.bs-8k-1.fio:        66.7
no-sg-list-fio-perf.bs-8k-2.fio:        66.7            66.7
no-sg-list-fio-perf.bs-8k-3.fio:        66.7

% Change default vs new DMA API = +0.4993%

16K
--------------------------------------------------------------
sg-list-fio-perf.bs-16k-1.fio:          63.8
sg-list-fio-perf.bs-16k-2.fio:          63.4            63.5
sg-list-fio-perf.bs-16k-3.fio:          63.3

no-sg-list-fio-perf.bs-16k-1.fio:       63.5
no-sg-list-fio-perf.bs-16k-2.fio:       63.4            63.33
no-sg-list-fio-perf.bs-16k-3.fio:       63.1

% Change default vs new DMA API = -0.2632%

32K
--------------------------------------------------------------
sg-list-fio-perf.bs-32k-1.fio:          59.3
sg-list-fio-perf.bs-32k-2.fio:          59.3            59.36
sg-list-fio-perf.bs-32k-3.fio:          59.5

no-sg-list-fio-perf.bs-32k-1.fio:       59.5
no-sg-list-fio-perf.bs-32k-2.fio:       59.6            59.43
no-sg-list-fio-perf.bs-32k-3.fio:       59.2

% Change default vs new DMA API = +0.1122%

64K
--------------------------------------------------------------
sg-list-fio-perf.bs-64k-1.fio:          53.7
sg-list-fio-perf.bs-64k-2.fio:          53.4            53.56
sg-list-fio-perf.bs-64k-3.fio:          53.6

no-sg-list-fio-perf.bs-64k-1.fio:       53.5
no-sg-list-fio-perf.bs-64k-2.fio:       53.8            53.63
no-sg-list-fio-perf.bs-64k-3.fio:       53.6

% Change default vs new DMA API = +0.1246%

128K
--------------------------------------------------------------
sg-list-fio-perf/bs-128k-1.fio:         48
sg-list-fio-perf/bs-128k-2.fio:         46.4            47.13
sg-list-fio-perf/bs-128k-3.fio:         47

no-sg-list-fio-perf/bs-128k-1.fio:      46.6
no-sg-list-fio-perf/bs-128k-2.fio:      47              46.9
no-sg-list-fio-perf/bs-128k-3.fio:      47.1

% Change default vs new DMA API = -0.495%

256K
--------------------------------------------------------------
sg-list-fio-perf/bs-256k-1.fio:         37
sg-list-fio-perf/bs-256k-2.fio:         41              39.93
sg-list-fio-perf/bs-256k-3.fio:         41.8

no-sg-list-fio-perf/bs-256k-1.fio:      37.5
no-sg-list-fio-perf/bs-256k-2.fio:      41.4            40.5
no-sg-list-fio-perf/bs-256k-3.fio:      42.6

% Change default vs new DMA API = +1.42%

512K
--------------------------------------------------------------
sg-list-fio-perf/bs-512k-1.fio:         28.5
sg-list-fio-perf/bs-512k-2.fio:         28.2            28.4
sg-list-fio-perf/bs-512k-3.fio:         28.5

no-sg-list-fio-perf/bs-512k-1.fio:      28.7
no-sg-list-fio-perf/bs-512k-2.fio:      28.6            28.7
no-sg-list-fio-perf/bs-512k-3.fio:      28.8

% Change default vs new DMA API = +1.06%

Signed-off-by: Chaitanya Kulkarni <kch@xxxxxxxxxx>
Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxx>
---
 drivers/nvme/host/pci.c | 354 ++++++++++++++++++++++++----------------
 1 file changed, 215 insertions(+), 139 deletions(-)
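(Note on the metric: the "% Change" figures above appear to be computed
as (no_sgl average - sgl average) / sgl average * 100; for the 4K case,
(68.43 - 68.36) / 68.36 works out to the quoted +0.0975% when the
unrounded three-run averages are used.)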
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2b236b1d209e..881cbf2c0cac 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -221,6 +221,12 @@ union nvme_descriptor {
         __le64 *prp_list;
 };
 
+/* TODO: move to common header */
+struct dma_entry {
+        dma_addr_t addr;
+        unsigned int len;
+};
+
 /*
  * The nvme_iod describes the data in an I/O.
  *
@@ -234,9 +240,11 @@ struct nvme_iod {
         u8 nr_dmas;
         s8 nr_allocations;      /* PRP list pool allocations. 0 means small
                                    pool in use */
+        struct dma_iova_state state;
+        struct dma_entry dma;
+        struct dma_entry *map;
         dma_addr_t first_dma;
         dma_addr_t meta_dma;
-        struct sg_table sgt;
         union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
 };
 
@@ -540,10 +548,9 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
 static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 {
         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-
-        WARN_ON_ONCE(!iod->sgt.nents);
-
-        dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
+        struct req_iterator iter;
+        struct bio_vec bv;
+        int cnt = 0;
 
         if (iod->nr_allocations == 0)
                 dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
@@ -553,20 +560,17 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
                               iod->first_dma);
         else
                 nvme_free_prps(dev, req);
-        mempool_free(iod->sgt.sgl, dev->iod_mempool);
-}
 
-static void nvme_print_sgl(struct scatterlist *sgl, int nents)
-{
-        int i;
-        struct scatterlist *sg;
-
-        for_each_sg(sgl, sg, nents, i) {
-                dma_addr_t phys = sg_phys(sg);
-                pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
-                        "dma_address:%pad dma_length:%d\n",
-                        i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
-                        sg_dma_len(sg));
+        if (iod->map) {
+                rq_for_each_bvec(bv, req, iter) {
+                        dma_unmap_page(dev->dev, iod->map[cnt].addr,
+                                       iod->map[cnt].len, rq_dma_dir(req));
+                        cnt++;
+                }
+                kfree(iod->map);
+        } else {
+                dma_unlink_range(&iod->state);
+                dma_free_iova(&iod->state);
         }
 }
 
@@ -574,97 +578,63 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
                 struct request *req, struct nvme_rw_command *cmnd)
 {
         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-        struct dma_pool *pool;
-        int length = blk_rq_payload_bytes(req);
-        struct scatterlist *sg = iod->sgt.sgl;
-        int dma_len = sg_dma_len(sg);
-        u64 dma_addr = sg_dma_address(sg);
-        int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
-        __le64 *prp_list;
-        dma_addr_t prp_dma;
-        int nprps, i;
-
-        length -= (NVME_CTRL_PAGE_SIZE - offset);
-        if (length <= 0) {
-                iod->first_dma = 0;
-                goto done;
-        }
-
-        dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
-        if (dma_len) {
-                dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
-        } else {
-                sg = sg_next(sg);
-                dma_addr = sg_dma_address(sg);
-                dma_len = sg_dma_len(sg);
-        }
+        __le64 *prp_list = iod->list[0].prp_list;
+        int i = 0, idx = 0;
+        struct bio_vec bv;
+        struct req_iterator iter;
+        dma_addr_t offset = 0;
 
-        if (length <= NVME_CTRL_PAGE_SIZE) {
-                iod->first_dma = dma_addr;
-                goto done;
+        if (iod->nr_dmas <= 2) {
+                i = iod->nr_dmas;
+                /* We can use the inline PRP/SG list */
+                goto set_addr;
         }
 
-        nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
-        if (nprps <= (256 / 8)) {
-                pool = dev->prp_small_pool;
-                iod->nr_allocations = 0;
-        } else {
-                pool = dev->prp_page_pool;
-                iod->nr_allocations = 1;
-        }
+        rq_for_each_bvec(bv, req, iter) {
+                dma_addr_t addr;
 
-        prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
-        if (!prp_list) {
-                iod->nr_allocations = -1;
-                return BLK_STS_RESOURCE;
-        }
-        iod->list[0].prp_list = prp_list;
-        iod->first_dma = prp_dma;
-        i = 0;
-        for (;;) {
-                if (i == NVME_CTRL_PAGE_SIZE >> 3) {
-                        __le64 *old_prp_list = prp_list;
-                        prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
-                        if (!prp_list)
-                                goto free_prps;
-                        iod->list[iod->nr_allocations++].prp_list = prp_list;
-                        prp_list[0] = old_prp_list[i - 1];
-                        old_prp_list[i - 1] = cpu_to_le64(prp_dma);
-                        i = 1;
+                if (iod->map)
+                        offset = 0;
+
+                while (offset < bv.bv_len) {
+                        if (iod->map)
+                                addr = iod->map[i].addr;
+                        else
+                                addr = iod->dma.addr;
+
+                        prp_list[idx] = cpu_to_le64(addr + offset);
+                        offset += NVME_CTRL_PAGE_SIZE;
+                        idx++;
                 }
-                prp_list[i++] = cpu_to_le64(dma_addr);
-                dma_len -= NVME_CTRL_PAGE_SIZE;
-                dma_addr += NVME_CTRL_PAGE_SIZE;
-                length -= NVME_CTRL_PAGE_SIZE;
-                if (length <= 0)
-                        break;
-                if (dma_len > 0)
-                        continue;
-                if (unlikely(dma_len < 0))
-                        goto bad_sgl;
-                sg = sg_next(sg);
-                dma_addr = sg_dma_address(sg);
-                dma_len = sg_dma_len(sg);
-        }
-done:
-        cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl));
-        cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
+                i++;
+        }
+
+set_addr:
+        if (iod->map)
+                cmnd->dptr.prp1 = cpu_to_le64(iod->map[0].addr);
+        else
+                cmnd->dptr.prp1 = cpu_to_le64(iod->dma.addr);
+        if (idx == 1 && i == 1)
+                cmnd->dptr.prp2 = 0;
+        else if (idx == 2 && i == 2)
+                if (iod->map)
+                        cmnd->dptr.prp2 =
+                                cpu_to_le64((iod->map[0].addr + NVME_CTRL_PAGE_SIZE) &
+                                            ~(NVME_CTRL_PAGE_SIZE - 1));
+                else
+                        cmnd->dptr.prp2 =
                                cpu_to_le64((iod->dma.addr + NVME_CTRL_PAGE_SIZE) &
                                            ~(NVME_CTRL_PAGE_SIZE - 1));
+        else
+                cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
         return BLK_STS_OK;
-free_prps:
-        nvme_free_prps(dev, req);
-        return BLK_STS_RESOURCE;
-bad_sgl:
-        WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
-                        "Invalid SGL for payload:%d nents:%d\n",
-                        blk_rq_payload_bytes(req), iod->sgt.nents);
-        return BLK_STS_IOERR;
 }
 
-static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
-                struct scatterlist *sg)
+static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, dma_addr_t addr,
+                int len)
 {
-        sge->addr = cpu_to_le64(sg_dma_address(sg));
-        sge->length = cpu_to_le32(sg_dma_len(sg));
+        sge->addr = cpu_to_le64(addr);
+        sge->length = cpu_to_le32(len);
         sge->type = NVME_SGL_FMT_DATA_DESC << 4;
 }
 
@@ -680,17 +650,77 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
                 struct request *req, struct nvme_rw_command *cmd)
 {
         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-        struct dma_pool *pool;
-        struct nvme_sgl_desc *sg_list;
-        struct scatterlist *sg = iod->sgt.sgl;
-        unsigned int entries = iod->sgt.nents;
-        dma_addr_t sgl_dma;
-        int i = 0;
+        struct nvme_sgl_desc *sg_list = iod->list[0].sg_list;
+        struct bio_vec bv = req_bvec(req);
+        struct req_iterator iter;
+        int i = 0, idx = 0;
+        dma_addr_t offset = 0;
 
         /* setting the transfer type as SGL */
         cmd->flags = NVME_CMD_SGL_METABUF;
 
-        if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
+        if (iod->nr_dmas <= 1)
+                /* We can use the inline PRP/SG list */
+                goto set_addr;
+
+        rq_for_each_bvec(bv, req, iter) {
+                dma_addr_t addr;
+
+                if (iod->map)
+                        offset = 0;
+
+                while (offset < bv.bv_len) {
+                        if (iod->map)
+                                addr = iod->map[i].addr;
+                        else
+                                addr = iod->dma.addr;
+
+                        nvme_pci_sgl_set_data(&sg_list[idx], addr + offset,
+                                              bv.bv_len);
+                        offset += NVME_CTRL_PAGE_SIZE;
+                        idx++;
+                }
+                i++;
+        }
+
+set_addr:
+        nvme_pci_sgl_set_seg(&cmd->dptr.sgl, iod->first_dma,
+                             blk_rq_nr_phys_segments(req));
+        return BLK_STS_OK;
+}
+
+static void nvme_pci_free_pool(struct nvme_dev *dev, struct request *req)
+{
+        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+        if (iod->nr_allocations == 0)
+                dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
+                              iod->first_dma);
+        else if (iod->nr_allocations == 1)
+                dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
+                              iod->first_dma);
+        else
+                nvme_free_prps(dev, req);
+}
+
+static blk_status_t nvme_pci_setup_pool(struct nvme_dev *dev,
+                                        struct request *req, bool is_sgl)
+{
+        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+        struct dma_pool *pool;
+        size_t entry_sz;
+        dma_addr_t addr;
+        u8 entries;
+        void *list;
+
+        if (iod->nr_dmas <= 2)
+                /* Do nothing, we can use the inline PRP/SG list */
+                return BLK_STS_OK;
+
+        /* First DMA address goes to prp1 anyway */
+        entries = iod->nr_dmas - 1;
+        entry_sz = (is_sgl) ? sizeof(struct nvme_sgl_desc) : sizeof(__le64);
+        if (entries <= (256 / entry_sz)) {
                 pool = dev->prp_small_pool;
                 iod->nr_allocations = 0;
         } else {
@@ -698,21 +728,20 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
                 iod->nr_allocations = 1;
         }
 
-        sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
-        if (!sg_list) {
+        /* TBD: allocate multiple pools and chain them */
+        WARN_ON(entries > 512);
+
+        list = dma_pool_alloc(pool, GFP_ATOMIC, &addr);
+        if (!list) {
                 iod->nr_allocations = -1;
                 return BLK_STS_RESOURCE;
         }
-        iod->list[0].sg_list = sg_list;
-        iod->first_dma = sgl_dma;
-
-        nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
-        do {
-                nvme_pci_sgl_set_data(&sg_list[i++], sg);
-                sg = sg_next(sg);
-        } while (--entries > 0);
-
+        if (is_sgl)
+                iod->list[0].sg_list = list;
+        else
+                iod->list[0].prp_list = list;
+        iod->first_dma = addr;
         return BLK_STS_OK;
 }
 
@@ -721,36 +750,84 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 {
         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
         blk_status_t ret = BLK_STS_RESOURCE;
-        int rc;
-
-        iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
-        if (!iod->sgt.sgl)
+        unsigned short n_segments = blk_rq_nr_phys_segments(req);
+        struct bio_vec bv = req_bvec(req);
+        struct req_iterator iter;
+        dma_addr_t dma_addr;
+        int rc, cnt = 0;
+        bool is_sgl;
+
+        dma_init_iova_state(&iod->state, dev->dev, rq_dma_dir(req));
+        dma_set_iova_state(&iod->state, bv.bv_page, bv.bv_len);
+
+        rc = dma_start_range(&iod->state);
+        if (rc)
                 return BLK_STS_RESOURCE;
-        sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req));
-        iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl);
-        if (!iod->sgt.orig_nents)
-                goto out_free_sg;
-        rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
-                             DMA_ATTR_NO_WARN);
-        if (rc) {
-                if (rc == -EREMOTEIO)
-                        ret = BLK_STS_TARGET;
-                goto out_free_sg;
+        iod->dma.len = 0;
+        iod->dma.addr = 0;
+
+        if (dma_can_use_iova(&iod->state)) {
+                iod->map = NULL;
+                rc = dma_alloc_iova_unaligned(&iod->state, bvec_phys(&bv),
+                                              blk_rq_payload_bytes(req));
+                if (rc)
+                        return BLK_STS_RESOURCE;
+
+                rq_for_each_bvec(bv, req, iter) {
+                        dma_addr = dma_link_range(&iod->state, bvec_phys(&bv),
+                                                  bv.bv_len);
+                        if (dma_mapping_error(dev->dev, dma_addr))
+                                goto out_free;
+
+                        if (!iod->dma.addr)
+                                iod->dma.addr = dma_addr;
+                }
+                WARN_ON(blk_rq_payload_bytes(req) != iod->state.range_size);
+        } else {
+                iod->map = kmalloc_array(n_segments, sizeof(*iod->map),
+                                         GFP_ATOMIC);
+                if (!iod->map)
+                        return BLK_STS_RESOURCE;
+
+                rq_for_each_bvec(bv, req, iter) {
+                        dma_addr = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
+                        if (dma_mapping_error(dev->dev, dma_addr))
+                                goto out_free;
+
+                        iod->map[cnt].addr = dma_addr;
+                        iod->map[cnt].len = bv.bv_len;
+                        cnt++;
+                }
         }
+        dma_end_range(&iod->state);
 
-        if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
+        is_sgl = nvme_pci_use_sgls(dev, req, n_segments);
+        ret = nvme_pci_setup_pool(dev, req, is_sgl);
+        if (ret != BLK_STS_OK)
+                goto out_free;
+
+        if (is_sgl)
                 ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
         else
                 ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
         if (ret != BLK_STS_OK)
-                goto out_unmap_sg;
+                goto out_free_pool;
+
         return BLK_STS_OK;
-out_unmap_sg:
-        dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
-out_free_sg:
-        mempool_free(iod->sgt.sgl, dev->iod_mempool);
+out_free_pool:
+        nvme_pci_free_pool(dev, req);
+out_free:
+        if (iod->map) {
+                while (cnt--)
+                        dma_unmap_page(dev->dev, iod->map[cnt].addr,
                                       iod->map[cnt].len, rq_dma_dir(req));
+                kfree(iod->map);
+        } else {
+                dma_unlink_range(&iod->state);
+                dma_free_iova(&iod->state);
+        }
         return ret;
 }
 
@@ -791,7 +868,6 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 
         iod->aborted = false;
         iod->nr_allocations = -1;
-        iod->sgt.nents = 0;
 
         ret = nvme_setup_cmd(req->q->queuedata, req);
         if (ret)
-- 
2.46.0
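For reviewers who want the shape of the new mapping flow without walking
the full nvme_map_data() hunk, a condensed sketch follows. It is
illustrative only and not part of the patch: the helper name is made up,
error unwinding and the PRP/SGL descriptor setup are omitted, and the
DMA API names and call patterns are taken from the hunks above.

/* Illustrative only: condensed from nvme_map_data() in the patch above. */
static blk_status_t sketch_nvme_map(struct nvme_dev *dev, struct request *req)
{
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct bio_vec bv = req_bvec(req);
        struct req_iterator iter;
        int cnt = 0;

        /* One state object per request drives the new two-step API. */
        dma_init_iova_state(&iod->state, dev->dev, rq_dma_dir(req));
        dma_set_iova_state(&iod->state, bv.bv_page, bv.bv_len);
        if (dma_start_range(&iod->state))
                return BLK_STS_RESOURCE;

        iod->dma.addr = 0;
        iod->dma.len = 0;

        if (dma_can_use_iova(&iod->state)) {
                /* Contiguous IOVA path: allocate once, link every bvec. */
                iod->map = NULL;
                if (dma_alloc_iova_unaligned(&iod->state, bvec_phys(&bv),
                                             blk_rq_payload_bytes(req)))
                        return BLK_STS_RESOURCE;
                rq_for_each_bvec(bv, req, iter) {
                        dma_addr_t addr = dma_link_range(&iod->state,
                                                         bvec_phys(&bv),
                                                         bv.bv_len);
                        if (!iod->dma.addr)
                                iod->dma.addr = addr;   /* start of payload */
                }
        } else {
                /* Fallback path: map each bvec and remember it for unmap. */
                iod->map = kmalloc_array(blk_rq_nr_phys_segments(req),
                                         sizeof(*iod->map), GFP_ATOMIC);
                if (!iod->map)
                        return BLK_STS_RESOURCE;
                rq_for_each_bvec(bv, req, iter) {
                        iod->map[cnt].addr = dma_map_bvec(dev->dev, &bv,
                                                          rq_dma_dir(req), 0);
                        iod->map[cnt].len = bv.bv_len;
                        cnt++;
                }
        }
        dma_end_range(&iod->state);

        /* PRP/SGL setup then walks iod->map[] or iod->dma.addr as above. */
        return BLK_STS_OK;
}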