From: Leon Romanovsky <leonro@xxxxxxxxxx>

This demonstrates how the new DMA API can fit into the NVMe driver and
replace the old DMA APIs. As this is an RFC, I expect more robust error
handling, optimizations, and in-depth testing in the final version once
we agree on the DMA API architecture.

Below is the performance comparison between the existing DMA API path
(sg_table) and the dma_map path. Once we have agreement on the new DMA
API design, I intend to collect similar profiling numbers for the new
DMA API.

sgl (sg_table + old DMA API) vs no_sgl (iod_dma_map + new DMA API):

block size                              IOPS (k)        Average of 3

4K
--------------------------------------------------------------
sg-list-fio-perf.bs-4k-1.fio:           68.6
sg-list-fio-perf.bs-4k-2.fio:           68              68.36
sg-list-fio-perf.bs-4k-3.fio:           68.5

no-sg-list-fio-perf.bs-4k-1.fio:        68.7
no-sg-list-fio-perf.bs-4k-2.fio:        68.5            68.43
no-sg-list-fio-perf.bs-4k-3.fio:        68.1

% Change default vs new DMA API = +0.0975%

8K
--------------------------------------------------------------
sg-list-fio-perf.bs-8k-1.fio:           67
sg-list-fio-perf.bs-8k-2.fio:           67.1            67.03
sg-list-fio-perf.bs-8k-3.fio:           67

no-sg-list-fio-perf.bs-8k-1.fio:        66.7
no-sg-list-fio-perf.bs-8k-2.fio:        66.7            66.7
no-sg-list-fio-perf.bs-8k-3.fio:        66.7

% Change default vs new DMA API = +0.4993%

16K
--------------------------------------------------------------
sg-list-fio-perf.bs-16k-1.fio:          63.8
sg-list-fio-perf.bs-16k-2.fio:          63.4            63.5
sg-list-fio-perf.bs-16k-3.fio:          63.3

no-sg-list-fio-perf.bs-16k-1.fio:       63.5
no-sg-list-fio-perf.bs-16k-2.fio:       63.4            63.33
no-sg-list-fio-perf.bs-16k-3.fio:       63.1

% Change default vs new DMA API = -0.2632%

32K
--------------------------------------------------------------
sg-list-fio-perf.bs-32k-1.fio:          59.3
sg-list-fio-perf.bs-32k-2.fio:          59.3            59.36
sg-list-fio-perf.bs-32k-3.fio:          59.5

no-sg-list-fio-perf.bs-32k-1.fio:       59.5
no-sg-list-fio-perf.bs-32k-2.fio:       59.6            59.43
no-sg-list-fio-perf.bs-32k-3.fio:       59.2

% Change default vs new DMA API = +0.1122%

64K
--------------------------------------------------------------
sg-list-fio-perf.bs-64k-1.fio:          53.7
sg-list-fio-perf.bs-64k-2.fio:          53.4            53.56
sg-list-fio-perf.bs-64k-3.fio:          53.6

no-sg-list-fio-perf.bs-64k-1.fio:       53.5
no-sg-list-fio-perf.bs-64k-2.fio:       53.8            53.63
no-sg-list-fio-perf.bs-64k-3.fio:       53.6

% Change default vs new DMA API = +0.1246%

128K
--------------------------------------------------------------
sg-list-fio-perf/bs-128k-1.fio:         48
sg-list-fio-perf/bs-128k-2.fio:         46.4            47.13
sg-list-fio-perf/bs-128k-3.fio:         47

no-sg-list-fio-perf/bs-128k-1.fio:      46.6
no-sg-list-fio-perf/bs-128k-2.fio:      47              46.9
no-sg-list-fio-perf/bs-128k-3.fio:      47.1

% Change default vs new DMA API = -0.495%

256K
--------------------------------------------------------------
sg-list-fio-perf/bs-256k-1.fio:         37
sg-list-fio-perf/bs-256k-2.fio:         41              39.93
sg-list-fio-perf/bs-256k-3.fio:         41.8

no-sg-list-fio-perf/bs-256k-1.fio:      37.5
no-sg-list-fio-perf/bs-256k-2.fio:      41.4            40.5
no-sg-list-fio-perf/bs-256k-3.fio:      42.6

% Change default vs new DMA API = +1.42%

512K
--------------------------------------------------------------
sg-list-fio-perf/bs-512k-1.fio:         28.5
sg-list-fio-perf/bs-512k-2.fio:         28.2            28.4
sg-list-fio-perf/bs-512k-3.fio:         28.5

no-sg-list-fio-perf/bs-512k-1.fio:      28.7
no-sg-list-fio-perf/bs-512k-2.fio:      28.6            28.7
no-sg-list-fio-perf/bs-512k-3.fio:      28.8

% Change default vs new DMA API = +1.06%

Signed-off-by: Chaitanya Kulkarni <kch@xxxxxxxxxx>
Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxx>
---
 drivers/nvme/host/pci.c | 354 ++++++++++++++++++++++++----------------
 1 file changed, 215 insertions(+), 139 deletions(-)
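(Note on the metric: the "% Change" figures above appear to be computed
as (no_sgl average - sgl average) / sgl average * 100; for the 4K case,
(68.43 - 68.36) / 68.36 works out to the quoted +0.0975% when the
unrounded three-run averages are used.)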
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2b236b1d209e..881cbf2c0cac 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -221,6 +221,12 @@ union nvme_descriptor {
         __le64 *prp_list;
 };
 
+/* TODO: move to common header */
+struct dma_entry {
+        dma_addr_t addr;
+        unsigned int len;
+};
+
 /*
  * The nvme_iod describes the data in an I/O.
  *
@@ -234,9 +240,11 @@ struct nvme_iod {
         u8 nr_dmas;
         s8 nr_allocations;      /* PRP list pool allocations. 0 means small
                                    pool in use */
+        struct dma_iova_state state;
+        struct dma_entry dma;
+        struct dma_entry *map;
         dma_addr_t first_dma;
         dma_addr_t meta_dma;
-        struct sg_table sgt;
         union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
 };
 
@@ -540,10 +548,9 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
 static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 {
         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-
-        WARN_ON_ONCE(!iod->sgt.nents);
-
-        dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
+        struct req_iterator iter;
+        struct bio_vec bv;
+        int cnt = 0;
 
         if (iod->nr_allocations == 0)
                 dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
@@ -553,20 +560,17 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
                               iod->first_dma);
         else
                 nvme_free_prps(dev, req);
-        mempool_free(iod->sgt.sgl, dev->iod_mempool);
-}
 
-static void nvme_print_sgl(struct scatterlist *sgl, int nents)
-{
-        int i;
-        struct scatterlist *sg;
-
-        for_each_sg(sgl, sg, nents, i) {
-                dma_addr_t phys = sg_phys(sg);
-                pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
-                        "dma_address:%pad dma_length:%d\n",
-                        i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
-                        sg_dma_len(sg));
+        if (iod->map) {
+                rq_for_each_bvec(bv, req, iter) {
+                        dma_unmap_page(dev->dev, iod->map[cnt].addr,
+                                       iod->map[cnt].len, rq_dma_dir(req));
+                        cnt++;
+                }
+                kfree(iod->map);
+        } else {
+                dma_unlink_range(&iod->state);
+                dma_free_iova(&iod->state);
         }
 }
 
@@ -574,97 +578,63 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
                 struct request *req, struct nvme_rw_command *cmnd)
 {
         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-        struct dma_pool *pool;
-        int length = blk_rq_payload_bytes(req);
-        struct scatterlist *sg = iod->sgt.sgl;
-        int dma_len = sg_dma_len(sg);
-        u64 dma_addr = sg_dma_address(sg);
-        int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
-        __le64 *prp_list;
-        dma_addr_t prp_dma;
-        int nprps, i;
-
-        length -= (NVME_CTRL_PAGE_SIZE - offset);
-        if (length <= 0) {
-                iod->first_dma = 0;
-                goto done;
-        }
-
-        dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
-        if (dma_len) {
-                dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
-        } else {
-                sg = sg_next(sg);
-                dma_addr = sg_dma_address(sg);
-                dma_len = sg_dma_len(sg);
-        }
+        __le64 *prp_list = iod->list[0].prp_list;
+        int i = 0, idx = 0;
+        struct bio_vec bv;
+        struct req_iterator iter;
+        dma_addr_t offset = 0;
 
-        if (length <= NVME_CTRL_PAGE_SIZE) {
-                iod->first_dma = dma_addr;
-                goto done;
+        if (iod->nr_dmas <= 2) {
+                i = iod->nr_dmas;
+                /* We can use the inline PRP/SG list */
+                goto set_addr;
         }
 
-        nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
-        if (nprps <= (256 / 8)) {
-                pool = dev->prp_small_pool;
-                iod->nr_allocations = 0;
-        } else {
-                pool = dev->prp_page_pool;
-                iod->nr_allocations = 1;
-        }
+        rq_for_each_bvec(bv, req, iter) {
+                dma_addr_t addr;
 
-        prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
-        if (!prp_list) {
-                iod->nr_allocations = -1;
-                return BLK_STS_RESOURCE;
-        }
-        iod->list[0].prp_list = prp_list;
-        iod->first_dma = prp_dma;
-        i = 0;
-        for (;;) {
-                if (i == NVME_CTRL_PAGE_SIZE >> 3) {
-                        __le64 *old_prp_list = prp_list;
-                        prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
-                        if (!prp_list)
-                                goto free_prps;
-                        iod->list[iod->nr_allocations++].prp_list = prp_list;
-                        prp_list[0] = old_prp_list[i - 1];
-                        old_prp_list[i - 1] = cpu_to_le64(prp_dma);
-                        i = 1;
+                if (iod->map)
+                        offset = 0;
+
+                while (offset < bv.bv_len) {
+                        if (iod->map)
+                                addr = iod->map[i].addr;
+                        else
+                                addr = iod->dma.addr;
+
+                        prp_list[idx] = cpu_to_le64(addr + offset);
+                        offset += NVME_CTRL_PAGE_SIZE;
+                        idx++;
                 }
-                prp_list[i++] = cpu_to_le64(dma_addr);
-                dma_len -= NVME_CTRL_PAGE_SIZE;
-                dma_addr += NVME_CTRL_PAGE_SIZE;
-                length -= NVME_CTRL_PAGE_SIZE;
-                if (length <= 0)
-                        break;
-                if (dma_len > 0)
-                        continue;
-                if (unlikely(dma_len < 0))
-                        goto bad_sgl;
-                sg = sg_next(sg);
-                dma_addr = sg_dma_address(sg);
-                dma_len = sg_dma_len(sg);
-        }
-done:
-        cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl));
-        cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
+                i++;
+        }
+
+set_addr:
+        if (iod->map)
+                cmnd->dptr.prp1 = cpu_to_le64(iod->map[0].addr);
+        else
+                cmnd->dptr.prp1 = cpu_to_le64(iod->dma.addr);
+        if (idx == 1 && i == 1)
+                cmnd->dptr.prp2 = 0;
+        else if (idx == 2 && i == 2)
+                if (iod->map)
+                        cmnd->dptr.prp2 =
+                                cpu_to_le64((iod->map[0].addr + NVME_CTRL_PAGE_SIZE) &
+                                            ~(NVME_CTRL_PAGE_SIZE - 1));
+                else
+                        cmnd->dptr.prp2 =
                                cpu_to_le64((iod->dma.addr + NVME_CTRL_PAGE_SIZE) &
                                            ~(NVME_CTRL_PAGE_SIZE - 1));
+        else
+                cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
         return BLK_STS_OK;
-free_prps:
-        nvme_free_prps(dev, req);
-        return BLK_STS_RESOURCE;
-bad_sgl:
-        WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
-                        "Invalid SGL for payload:%d nents:%d\n",
-                        blk_rq_payload_bytes(req), iod->sgt.nents);
-        return BLK_STS_IOERR;
 }
 
-static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
-                struct scatterlist *sg)
+static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, dma_addr_t addr,
+                int len)
 {
-        sge->addr = cpu_to_le64(sg_dma_address(sg));
-        sge->length = cpu_to_le32(sg_dma_len(sg));
+        sge->addr = cpu_to_le64(addr);
+        sge->length = cpu_to_le32(len);
         sge->type = NVME_SGL_FMT_DATA_DESC << 4;
 }
 
@@ -680,17 +650,77 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
                 struct request *req, struct nvme_rw_command *cmd)
 {
         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-        struct dma_pool *pool;
-        struct nvme_sgl_desc *sg_list;
-        struct scatterlist *sg = iod->sgt.sgl;
-        unsigned int entries = iod->sgt.nents;
-        dma_addr_t sgl_dma;
-        int i = 0;
+        struct nvme_sgl_desc *sg_list = iod->list[0].sg_list;
+        struct bio_vec bv = req_bvec(req);
+        struct req_iterator iter;
+        int i = 0, idx = 0;
+        dma_addr_t offset = 0;
 
         /* setting the transfer type as SGL */
         cmd->flags = NVME_CMD_SGL_METABUF;
 
-        if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
+        if (iod->nr_dmas <= 1)
+                /* We can use the inline PRP/SG list */
+                goto set_addr;
+
+        rq_for_each_bvec(bv, req, iter) {
+                dma_addr_t addr;
+
+                if (iod->map)
+                        offset = 0;
+
+                while (offset < bv.bv_len) {
+                        if (iod->map)
+                                addr = iod->map[i].addr;
+                        else
+                                addr = iod->dma.addr;
+
+                        nvme_pci_sgl_set_data(&sg_list[idx], addr + offset,
+                                              bv.bv_len);
+                        offset += NVME_CTRL_PAGE_SIZE;
+                        idx++;
+                }
+                i++;
+        }
+
+set_addr:
+        nvme_pci_sgl_set_seg(&cmd->dptr.sgl, iod->first_dma,
+                             blk_rq_nr_phys_segments(req));
+        return BLK_STS_OK;
+}
+
+static void nvme_pci_free_pool(struct nvme_dev *dev, struct request *req)
+{
+        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+        if (iod->nr_allocations == 0)
+                dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
+                              iod->first_dma);
+        else if (iod->nr_allocations == 1)
+                dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
+                              iod->first_dma);
+        else
+                nvme_free_prps(dev, req);
+}
+
+static blk_status_t nvme_pci_setup_pool(struct nvme_dev *dev,
+                                        struct request *req, bool is_sgl)
+{
+        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+        struct dma_pool *pool;
+        size_t entry_sz;
+        dma_addr_t addr;
+        u8 entries;
+        void *list;
+
+        if (iod->nr_dmas <= 2)
+                /* Do nothing, we can use the inline PRP/SG list */
+                return BLK_STS_OK;
+
+        /* First DMA address goes to prp1 anyway */
+        entries = iod->nr_dmas - 1;
+        entry_sz = (is_sgl) ? sizeof(struct nvme_sgl_desc) : sizeof(__le64);
+        if (entries <= (256 / entry_sz)) {
                 pool = dev->prp_small_pool;
                 iod->nr_allocations = 0;
         } else {
@@ -698,21 +728,20 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
                 iod->nr_allocations = 1;
         }
 
-        sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
-        if (!sg_list) {
+        /* TBD: allocate multiple pools and chain them */
+        WARN_ON(entries > 512);
+
+        list = dma_pool_alloc(pool, GFP_ATOMIC, &addr);
+        if (!list) {
                 iod->nr_allocations = -1;
                 return BLK_STS_RESOURCE;
         }
-        iod->list[0].sg_list = sg_list;
-        iod->first_dma = sgl_dma;
-
-        nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
-        do {
-                nvme_pci_sgl_set_data(&sg_list[i++], sg);
-                sg = sg_next(sg);
-        } while (--entries > 0);
-
+        if (is_sgl)
+                iod->list[0].sg_list = list;
+        else
+                iod->list[0].prp_list = list;
+        iod->first_dma = addr;
         return BLK_STS_OK;
 }
 
@@ -721,36 +750,84 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 {
         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
         blk_status_t ret = BLK_STS_RESOURCE;
-        int rc;
-
-        iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
-        if (!iod->sgt.sgl)
+        unsigned short n_segments = blk_rq_nr_phys_segments(req);
+        struct bio_vec bv = req_bvec(req);
+        struct req_iterator iter;
+        dma_addr_t dma_addr;
+        int rc, cnt = 0;
+        bool is_sgl;
+
+        dma_init_iova_state(&iod->state, dev->dev, rq_dma_dir(req));
+        dma_set_iova_state(&iod->state, bv.bv_page, bv.bv_len);
+
+        rc = dma_start_range(&iod->state);
+        if (rc)
                 return BLK_STS_RESOURCE;
-        sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req));
-        iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl);
-        if (!iod->sgt.orig_nents)
-                goto out_free_sg;
-        rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
-                             DMA_ATTR_NO_WARN);
-        if (rc) {
-                if (rc == -EREMOTEIO)
-                        ret = BLK_STS_TARGET;
-                goto out_free_sg;
+        iod->dma.len = 0;
+        iod->dma.addr = 0;
+
+        if (dma_can_use_iova(&iod->state)) {
+                iod->map = NULL;
+                rc = dma_alloc_iova_unaligned(&iod->state, bvec_phys(&bv),
+                                              blk_rq_payload_bytes(req));
+                if (rc)
+                        return BLK_STS_RESOURCE;
+
+                rq_for_each_bvec(bv, req, iter) {
+                        dma_addr = dma_link_range(&iod->state, bvec_phys(&bv),
+                                                  bv.bv_len);
+                        if (dma_mapping_error(dev->dev, dma_addr))
+                                goto out_free;
+
+                        if (!iod->dma.addr)
+                                iod->dma.addr = dma_addr;
+                }
+                WARN_ON(blk_rq_payload_bytes(req) != iod->state.range_size);
+        } else {
+                iod->map = kmalloc_array(n_segments, sizeof(*iod->map),
+                                         GFP_ATOMIC);
+                if (!iod->map)
+                        return BLK_STS_RESOURCE;
+
+                rq_for_each_bvec(bv, req, iter) {
+                        dma_addr = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
+                        if (dma_mapping_error(dev->dev, dma_addr))
+                                goto out_free;
+
+                        iod->map[cnt].addr = dma_addr;
+                        iod->map[cnt].len = bv.bv_len;
+                        cnt++;
+                }
         }
+        dma_end_range(&iod->state);
 
-        if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
+        is_sgl = nvme_pci_use_sgls(dev, req, n_segments);
+        ret = nvme_pci_setup_pool(dev, req, is_sgl);
+        if (ret != BLK_STS_OK)
+                goto out_free;
+
+        if (is_sgl)
                 ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
         else
                 ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
         if (ret != BLK_STS_OK)
-                goto out_unmap_sg;
+                goto out_free_pool;
+
         return BLK_STS_OK;
-out_unmap_sg:
-        dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
-out_free_sg:
-        mempool_free(iod->sgt.sgl, dev->iod_mempool);
+out_free_pool:
+        nvme_pci_free_pool(dev, req);
+out_free:
+        if (iod->map) {
+                while (cnt--)
+                        dma_unmap_page(dev->dev, iod->map[cnt].addr,
                                       iod->map[cnt].len, rq_dma_dir(req));
+                kfree(iod->map);
+        } else {
+                dma_unlink_range(&iod->state);
+                dma_free_iova(&iod->state);
+        }
         return ret;
 }
 
@@ -791,7 +868,6 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
 
         iod->aborted = false;
         iod->nr_allocations = -1;
-        iod->sgt.nents = 0;
 
         ret = nvme_setup_cmd(req->q->queuedata, req);
         if (ret)
-- 
2.46.0
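For reviewers who want the shape of the new mapping flow without walking
the full nvme_map_data() hunk, a condensed sketch follows. It is
illustrative only and not part of the patch: the helper name is made up,
error unwinding and the PRP/SGL descriptor setup are omitted, and the
DMA API names and call patterns are taken from the hunks above.

/* Illustrative only: condensed from nvme_map_data() in the patch above. */
static blk_status_t sketch_nvme_map(struct nvme_dev *dev, struct request *req)
{
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        struct bio_vec bv = req_bvec(req);
        struct req_iterator iter;
        int cnt = 0;

        /* One state object per request drives the new two-step API. */
        dma_init_iova_state(&iod->state, dev->dev, rq_dma_dir(req));
        dma_set_iova_state(&iod->state, bv.bv_page, bv.bv_len);
        if (dma_start_range(&iod->state))
                return BLK_STS_RESOURCE;

        iod->dma.addr = 0;
        iod->dma.len = 0;

        if (dma_can_use_iova(&iod->state)) {
                /* Contiguous IOVA path: allocate once, link every bvec. */
                iod->map = NULL;
                if (dma_alloc_iova_unaligned(&iod->state, bvec_phys(&bv),
                                             blk_rq_payload_bytes(req)))
                        return BLK_STS_RESOURCE;
                rq_for_each_bvec(bv, req, iter) {
                        dma_addr_t addr = dma_link_range(&iod->state,
                                                         bvec_phys(&bv),
                                                         bv.bv_len);
                        if (!iod->dma.addr)
                                iod->dma.addr = addr;   /* start of payload */
                }
        } else {
                /* Fallback path: map each bvec and remember it for unmap. */
                iod->map = kmalloc_array(blk_rq_nr_phys_segments(req),
                                         sizeof(*iod->map), GFP_ATOMIC);
                if (!iod->map)
                        return BLK_STS_RESOURCE;
                rq_for_each_bvec(bv, req, iter) {
                        iod->map[cnt].addr = dma_map_bvec(dev->dev, &bv,
                                                          rq_dma_dir(req), 0);
                        iod->map[cnt].len = bv.bv_len;
                        cnt++;
                }
        }
        dma_end_range(&iod->state);

        /* PRP/SGL setup then walks iod->map[] or iod->dma.addr as above. */
        return BLK_STS_OK;
}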