From: Keith Busch <kbusch@xxxxxxxxxx> Implement callbacks to convert a registered bio_vec to a prp list, and use this for each IO that uses the returned tag. This saves repeated IO conversions and dma mapping/unmapping. In many cases, the driver can skip per-IO pool allocations entirely, saving potentially significant CPU cycles. Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx> --- drivers/nvme/host/pci.c | 291 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 283 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 644664098ae7..571d955eaef0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -110,6 +110,14 @@ struct nvme_queue; static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode); +struct nvme_dma_mapping { + int nr_pages; + u16 offset; + u8 rsvd[2]; + dma_addr_t prp_dma_addr; + __le64 *prps; +}; + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. 
*/ @@ -544,6 +552,35 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) return true; } +static void nvme_sync_dma(struct nvme_dev *dev, struct request *req) +{ + int index, offset, i, length, nprps; + struct nvme_dma_mapping *mapping; + bool needs_sync; + + mapping = blk_rq_dma_tag(req); + offset = blk_rq_dma_offset(req) + mapping->offset; + index = offset >> NVME_CTRL_PAGE_SHIFT; + needs_sync = rq_data_dir(req) == READ && + dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[index])); + + if (!needs_sync) + return; + + offset = offset & (NVME_CTRL_PAGE_SIZE - 1); + length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset); + nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); + + dma_sync_single_for_cpu(dev->dev, + le64_to_cpu(mapping->prps[index++]), + NVME_CTRL_PAGE_SIZE - offset, DMA_FROM_DEVICE); + for (i = 1; i < nprps; i++) { + dma_sync_single_for_cpu(dev->dev, + le64_to_cpu(mapping->prps[index++]), + NVME_CTRL_PAGE_SIZE, DMA_FROM_DEVICE); + } +} + static void nvme_free_prps(struct nvme_dev *dev, struct request *req) { const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; @@ -576,6 +613,21 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req) } } +static void nvme_free_prp_chain(struct nvme_dev *dev, struct request *req, + struct nvme_iod *iod) +{ + if (iod->npages < 0) + return; + + if (iod->npages == 0) + dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], + iod->first_dma); + else if (iod->use_sgl) + nvme_free_sgls(dev, req); + else + nvme_free_prps(dev, req); +} + static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -595,18 +647,15 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len, rq_dma_dir(req)); return; + } else if (blk_rq_dma_tag(req)) { + nvme_sync_dma(dev, req); + nvme_free_prp_chain(dev, req, iod); + return; } 
WARN_ON_ONCE(!iod->nents); - nvme_unmap_sg(dev, req); - if (iod->npages == 0) - dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], - iod->first_dma); - else if (iod->use_sgl) - nvme_free_sgls(dev, req); - else - nvme_free_prps(dev, req); + nvme_free_prp_chain(dev, req, iod); mempool_free(iod->sg, dev->iod_mempool); } @@ -835,6 +884,122 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev, return BLK_STS_OK; } +static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req, + struct nvme_rw_command *cmnd, + struct nvme_iod *iod) +{ + static const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; + dma_addr_t prp_list_start, prp_list_end, prp_dma; + int index, offset, i, length, nprps, nprps_left; + void **list = nvme_pci_iod_list(req); + struct nvme_dma_mapping *mapping; + struct dma_pool *pool; + __le64 *prp_list; + bool needs_sync; + + mapping = blk_rq_dma_tag(req); + offset = blk_rq_dma_offset(req) + mapping->offset; + index = offset >> NVME_CTRL_PAGE_SHIFT; + offset = offset & (NVME_CTRL_PAGE_SIZE - 1); + needs_sync = rq_data_dir(req) == WRITE && + dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[index])); + + /* + * XXX: For PAGE_SIZE > NVME_CTRL_PAGE_SIZE, is it faster to save the + * PRP list implementation and sync multiple partial pages, more + * efficient to sync PAGE_SIZE and build the PRP list per-IO from a + * host PAGE_SIZE representation, or cleverly sync physically + * contiguous regions? 
+ */ + if (needs_sync) { + dma_sync_single_for_device(dev->dev, + le64_to_cpu(mapping->prps[index]), + NVME_CTRL_PAGE_SIZE - offset, DMA_TO_DEVICE); + } + + length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset); + cmnd->dptr.prp1 = cpu_to_le64(le64_to_cpu(mapping->prps[index++]) + offset); + + if (length <= 0) + return BLK_STS_OK; + + if (length <= NVME_CTRL_PAGE_SIZE) { + if (needs_sync) + dma_sync_single_for_device(dev->dev, + le64_to_cpu(mapping->prps[index]), + NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE); + cmnd->dptr.prp2 = mapping->prps[index]; + return BLK_STS_OK; + } + + nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); + prp_list_start = mapping->prp_dma_addr + 8 * index; + prp_list_end = prp_list_start + 8 * nprps; + + /* Optimization when remaining list fits in one nvme page */ + if ((prp_list_start >> NVME_CTRL_PAGE_SHIFT) == + (prp_list_end >> NVME_CTRL_PAGE_SHIFT)) { + cmnd->dptr.prp2 = cpu_to_le64(prp_list_start); + goto sync; + } + + if (nprps <= (256 / 8)) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); + if (!prp_list) { + iod->npages = -1; + return BLK_STS_RESOURCE; + } + + list[0] = prp_list; + iod->first_dma = prp_dma; + i = 0; + for (;;) { + dma_addr_t next_prp_dma; + __le64 *next_prp_list; + + if (nprps_left <= last_prp + 1) { + memcpy(prp_list, &mapping->prps[index], nprps_left * 8); + break; + } + + memcpy(prp_list, &mapping->prps[index], + NVME_CTRL_PAGE_SIZE - 8); + nprps_left -= last_prp; + index += last_prp; + + next_prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &next_prp_dma); + if (!next_prp_list) + goto free_prps; + + prp_list[last_prp] = cpu_to_le64(next_prp_dma); + prp_list = next_prp_list; + prp_dma = next_prp_dma; + list[iod->npages++] = prp_list; + } + cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); + +sync: + if (!needs_sync) + return BLK_STS_OK; + + for (i = 0; i < nprps; i++) + 
dma_sync_single_for_device(dev->dev, + le64_to_cpu(mapping->prps[index++]), + NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE); + return BLK_STS_OK; + +free_prps: + nvme_free_prps(dev, req); + return BLK_STS_RESOURCE; +} + static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { @@ -842,6 +1007,12 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, blk_status_t ret = BLK_STS_RESOURCE; int nr_mapped; + if (blk_rq_dma_tag(req)) { + iod->dma_len = 0; + iod->use_sgl = false; + return nvme_premapped(dev, req, &cmnd->rw, iod); + } + if (blk_rq_nr_phys_segments(req) == 1) { struct bio_vec bv = req_bvec(req); @@ -1732,6 +1903,106 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) return result; } +#ifdef CONFIG_HAS_DMA +/* + * Important: bvec must be describing a virtually contiguous buffer. + */ +static void *nvme_pci_dma_map(struct request_queue *q, + struct bio_vec *bvec, int nr_vecs) +{ + const int nvme_pages = 1 << (PAGE_SIZE - NVME_CTRL_PAGE_SIZE); + struct nvme_ns *ns = q->queuedata; + struct nvme_dev *dev = to_nvme_dev(ns->ctrl); + struct nvme_dma_mapping *mapping; + int i, j, k, size, ret = -ENOMEM; + + if (!nr_vecs) + return ERR_PTR(-EINVAL); + + mapping = kzalloc(sizeof(*mapping), GFP_KERNEL); + if (!mapping) + return ERR_PTR(-ENOMEM); + + mapping->nr_pages = nr_vecs * nvme_pages; + size = sizeof(*mapping->prps) * mapping->nr_pages; + mapping->prps = dma_alloc_coherent(dev->dev, size, + &mapping->prp_dma_addr, GFP_KERNEL); + if (!mapping->prps) + goto free_mapping; + + for (i = 0, k = 0; i < nr_vecs; i++) { + struct bio_vec *bv = bvec + i; + int pages_per = nvme_pages; + dma_addr_t dma_addr; + + if (i == 0) { + mapping->offset = bv->bv_offset; + pages_per -= mapping->offset >> NVME_CTRL_PAGE_SHIFT; + } else if (bv->bv_offset) { + ret = -EINVAL; + goto err; + } + + if (bv->bv_offset + bv->bv_len != PAGE_SIZE && + i < nr_vecs - 1) { + ret = -EINVAL; + goto err; + } + + 
dma_addr = dma_map_bvec(dev->dev, bv, 0, 0); + if (dma_mapping_error(dev->dev, dma_addr)) { + ret = -EIO; + goto err; + } + + if (i == 0) + dma_addr -= mapping->offset; + + for (j = 0; j < nvme_pages; j++) + mapping->prps[k++] = cpu_to_le64(dma_addr + + j * NVME_CTRL_PAGE_SIZE); + } + + get_device(dev->dev); + return mapping; + +err: + for (i = 0; i < k; i += nvme_pages) { + __u64 dma_addr = le64_to_cpu(mapping->prps[i]); + + dma_unmap_page(dev->dev, dma_addr, + PAGE_SIZE - offset_in_page(dma_addr), 0); + } + + dma_free_coherent(dev->dev, size, (void *)mapping->prps, + mapping->prp_dma_addr); +free_mapping: + kfree(mapping); + return ERR_PTR(ret); +} + +static void nvme_pci_dma_unmap(struct request_queue *q, void *dma_tag) +{ + const int nvme_pages = 1 << (PAGE_SIZE - NVME_CTRL_PAGE_SIZE); + struct nvme_ns *ns = q->queuedata; + struct nvme_dev *dev = to_nvme_dev(ns->ctrl); + struct nvme_dma_mapping *mapping = dma_tag; + int i; + + for (i = 0; i < mapping->nr_pages; i += nvme_pages) { + __u64 dma_addr = le64_to_cpu(mapping->prps[i]); + + dma_unmap_page(dev->dev, dma_addr, + PAGE_SIZE - offset_in_page(dma_addr), 0); + } + + dma_free_coherent(dev->dev, mapping->nr_pages * sizeof(*mapping->prps), + (void *)mapping->prps, mapping->prp_dma_addr); + kfree(mapping); + put_device(dev->dev); +} +#endif + static const struct blk_mq_ops nvme_mq_admin_ops = { .queue_rq = nvme_queue_rq, .complete = nvme_pci_complete_rq, @@ -1750,6 +2021,10 @@ static const struct blk_mq_ops nvme_mq_ops = { .map_queues = nvme_pci_map_queues, .timeout = nvme_timeout, .poll = nvme_poll, +#ifdef CONFIG_HAS_DMA + .dma_map = nvme_pci_dma_map, + .dma_unmap = nvme_pci_dma_unmap, +#endif }; static void nvme_dev_remove_admin(struct nvme_dev *dev) -- 2.30.2