[PATCH V3] net/mlx4: Get rid of page operation after dma_alloc_coherent

Stephen Warren <swarren@xxxxxxxxxxxxx> · Wed, 19 Dec 2018 11:20:31 -0700

From: Stephen Warren <swarren@xxxxxxxxxx>

This patch solves a crash at the time of mlx4 driver unload or system
shutdown. The crash occurs because dma_alloc_coherent() returns one
value in mlx4_alloc_icm_coherent(), but a different value is passed to
dma_free_coherent() in mlx4_free_icm_coherent(). In turn this is because
when allocated, that pointer is passed to sg_set_buf() to record it,
then when freed it is re-calculated by calling
lowmem_page_address(sg_page()) which returns a different value. Solve
this by recording the value that dma_alloc_coherent() returns, and
passing this to dma_free_coherent().

This patch is roughly equivalent to commit 378efe798ecf ("RDMA/hns: Get
rid of page operation after dma_alloc_coherent"). That patch was
described as:

> In general, dma_alloc_coherent() returns a CPU virtual address and
> a DMA address, and we have no guarantee that the underlying memory
> even has an associated struct page at all.
>
> This patch gets rid of the page operation after dma_alloc_coherent,
> and records the VA returned form dma_alloc_coherent in the struct
> of hem in hns RoCE driver.

However, this patch reworks the code to store all information about
coherent chunks separately from the sg list, since using sg lists for
them doesn't make sense. Hence, the structure of this patch is quite
different compared to the hns patch.

Based-on-code-from: Christoph Hellwig <hch@xxxxxx>
Signed-off-by: Stephen Warren <swarren@xxxxxxxxxx>
---
v3:
- Rework chunk data structure to store all data for coherent allocations
  separately from the sg list. Code from Christoph Hellwig with fixes by
  me. Notes:
  - chunk->coherent is an int not a bool since checkpatch complains about
    using bool in structs; see https://lkml.org/lkml/2017/11/21/384.
  - chunk->coherent is used rather than chunk->table->coherent since the
    table pointer isn't available when creating chunks. This duplicates
    data, but simplifies the patch.
v2:
- Rework mlx4_table_find() to explicitly calculate the returned address
  differently depending on wheter the table was allocated using
  dma_alloc_coherent() or alloc_pages(), which in turn allows the
  changes to mlx4_alloc_icm_pages() to be dropped.
- Drop changes to mlx4_alloc/free_icm_pages. This path uses
  pci_map_sg() which can re-write the sg list which in turn would cause
  chunk->mem[] (the sg list) and chunk->buf[] to become inconsistent.
- Enhance commit description.

Note: I've tested this patch in a downstream 4.14 based kernel (using
ibping, ib_read_bw, and ib_write_bw), but can't test it in mainline
since my system isn't supported there yet. I have compile-tested it in
mainline at least, for ARM64.
---
 drivers/net/ethernet/mellanox/mlx4/icm.c | 92 ++++++++++++++----------
 drivers/net/ethernet/mellanox/mlx4/icm.h | 22 +++++-
 2 files changed, 75 insertions(+), 39 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
index 4b4351141b94..76b84d08a058 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -57,12 +57,12 @@ static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chu
 	int i;
 
 	if (chunk->nsg > 0)
-		pci_unmap_sg(dev->persist->pdev, chunk->mem, chunk->npages,
+		pci_unmap_sg(dev->persist->pdev, chunk->sg, chunk->npages,
 			     PCI_DMA_BIDIRECTIONAL);
 
 	for (i = 0; i < chunk->npages; ++i)
-		__free_pages(sg_page(&chunk->mem[i]),
-			     get_order(chunk->mem[i].length));
+		__free_pages(sg_page(&chunk->sg[i]),
+			     get_order(chunk->sg[i].length));
 }
 
 static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
@@ -71,9 +71,9 @@ static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk *
 
 	for (i = 0; i < chunk->npages; ++i)
 		dma_free_coherent(&dev->persist->pdev->dev,
-				  chunk->mem[i].length,
-				  lowmem_page_address(sg_page(&chunk->mem[i])),
-				  sg_dma_address(&chunk->mem[i]));
+				  chunk->buf[i].size,
+				  chunk->buf[i].addr,
+				  chunk->buf[i].dma_addr);
 }
 
 void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent)
@@ -111,22 +111,21 @@ static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order,
 	return 0;
 }
 
-static int mlx4_alloc_icm_coherent(struct device *dev, struct scatterlist *mem,
-				    int order, gfp_t gfp_mask)
+static int mlx4_alloc_icm_coherent(struct device *dev, struct mlx4_icm_buf *buf,
+				   int order, gfp_t gfp_mask)
 {
-	void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order,
-				       &sg_dma_address(mem), gfp_mask);
-	if (!buf)
+	buf->addr = dma_alloc_coherent(dev, PAGE_SIZE << order,
+				       &buf->dma_addr, gfp_mask);
+	if (!buf->addr)
 		return -ENOMEM;
 
-	if (offset_in_page(buf)) {
-		dma_free_coherent(dev, PAGE_SIZE << order,
-				  buf, sg_dma_address(mem));
+	if (offset_in_page(buf->addr)) {
+		dma_free_coherent(dev, PAGE_SIZE << order, buf->addr,
+				  buf->dma_addr);
 		return -ENOMEM;
 	}
 
-	sg_set_buf(mem, buf, PAGE_SIZE << order);
-	sg_dma_len(mem) = PAGE_SIZE << order;
+	buf->size = PAGE_SIZE << order;
 	return 0;
 }
 
@@ -159,21 +158,21 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
 
 	while (npages > 0) {
 		if (!chunk) {
-			chunk = kmalloc_node(sizeof(*chunk),
+			chunk = kzalloc_node(sizeof(*chunk),
 					     gfp_mask & ~(__GFP_HIGHMEM |
 							  __GFP_NOWARN),
 					     dev->numa_node);
 			if (!chunk) {
-				chunk = kmalloc(sizeof(*chunk),
+				chunk = kzalloc(sizeof(*chunk),
 						gfp_mask & ~(__GFP_HIGHMEM |
 							     __GFP_NOWARN));
 				if (!chunk)
 					goto fail;
 			}
+			chunk->coherent = coherent;
 
-			sg_init_table(chunk->mem, MLX4_ICM_CHUNK_LEN);
-			chunk->npages = 0;
-			chunk->nsg    = 0;
+			if (!coherent)
+				sg_init_table(chunk->sg, MLX4_ICM_CHUNK_LEN);
 			list_add_tail(&chunk->list, &icm->chunk_list);
 		}
 
@@ -186,10 +185,10 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
 
 		if (coherent)
 			ret = mlx4_alloc_icm_coherent(&dev->persist->pdev->dev,
-						      &chunk->mem[chunk->npages],
-						      cur_order, mask);
+						&chunk->buf[chunk->npages],
+						cur_order, mask);
 		else
-			ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages],
+			ret = mlx4_alloc_icm_pages(&chunk->sg[chunk->npages],
 						   cur_order, mask,
 						   dev->numa_node);
 
@@ -205,7 +204,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
 		if (coherent)
 			++chunk->nsg;
 		else if (chunk->npages == MLX4_ICM_CHUNK_LEN) {
-			chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->mem,
+			chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->sg,
 						chunk->npages,
 						PCI_DMA_BIDIRECTIONAL);
 
@@ -220,7 +219,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
 	}
 
 	if (!coherent && chunk) {
-		chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->mem,
+		chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->sg,
 					chunk->npages,
 					PCI_DMA_BIDIRECTIONAL);
 
@@ -320,7 +319,7 @@ void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj,
 	u64 idx;
 	struct mlx4_icm_chunk *chunk;
 	struct mlx4_icm *icm;
-	struct page *page = NULL;
+	void *addr = NULL;
 
 	if (!table->lowmem)
 		return NULL;
@@ -336,28 +335,49 @@ void *mlx4_table_find(struct mlx4_icm_table *table, u32 obj,
 
 	list_for_each_entry(chunk, &icm->chunk_list, list) {
 		for (i = 0; i < chunk->npages; ++i) {
+			dma_addr_t dma_addr;
+			size_t len;
+
+			if (table->coherent) {
+				len = chunk->buf[i].size;
+				dma_addr = chunk->buf[i].dma_addr;
+				addr = chunk->buf[i].addr;
+			} else {
+				struct page *page;
+
+				len = sg_dma_len(&chunk->sg[i]);
+				dma_addr = sg_dma_address(&chunk->sg[i]);
+
+				/* XXX: we should never do this for highmem
+				 * allocation.  This function either needs
+				 * to be split, or the kernel virtual address
+				 * return needs to be made optional.
+				 */
+				page = sg_page(&chunk->sg[i]);
+				addr = lowmem_page_address(page);
+			}
+
 			if (dma_handle && dma_offset >= 0) {
-				if (sg_dma_len(&chunk->mem[i]) > dma_offset)
-					*dma_handle = sg_dma_address(&chunk->mem[i]) +
-						dma_offset;
-				dma_offset -= sg_dma_len(&chunk->mem[i]);
+				if (len > dma_offset)
+					*dma_handle = dma_addr + dma_offset;
+				dma_offset -= len;
 			}
+
 			/*
 			 * DMA mapping can merge pages but not split them,
 			 * so if we found the page, dma_handle has already
 			 * been assigned to.
 			 */
-			if (chunk->mem[i].length > offset) {
-				page = sg_page(&chunk->mem[i]);
+			if (len > offset)
 				goto out;
-			}
-			offset -= chunk->mem[i].length;
+			offset -= len;
 		}
 	}
 
+	addr = NULL;
 out:
 	mutex_unlock(&table->mutex);
-	return page ? lowmem_page_address(page) + offset : NULL;
+	return addr ? addr + offset : NULL;
 }
 
 int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table,
diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.h b/drivers/net/ethernet/mellanox/mlx4/icm.h
index c9169a490557..574f4bcdd090 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.h
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.h
@@ -47,11 +47,21 @@ enum {
 	MLX4_ICM_PAGE_SIZE	= 1 << MLX4_ICM_PAGE_SHIFT,
 };
 
+struct mlx4_icm_buf {
+	void			*addr;
+	size_t			size;
+	dma_addr_t		dma_addr;
+};
+
 struct mlx4_icm_chunk {
 	struct list_head	list;
 	int			npages;
 	int			nsg;
-	struct scatterlist	mem[MLX4_ICM_CHUNK_LEN];
+	int			coherent;
+	union {
+		struct scatterlist	sg[MLX4_ICM_CHUNK_LEN];
+		struct mlx4_icm_buf	buf[MLX4_ICM_CHUNK_LEN];
+	};
 };
 
 struct mlx4_icm {
@@ -114,12 +124,18 @@ static inline void mlx4_icm_next(struct mlx4_icm_iter *iter)
 
 static inline dma_addr_t mlx4_icm_addr(struct mlx4_icm_iter *iter)
 {
-	return sg_dma_address(&iter->chunk->mem[iter->page_idx]);
+	if (iter->chunk->coherent)
+		return iter->chunk->buf[iter->page_idx].dma_addr;
+	else
+		return sg_dma_address(&iter->chunk->sg[iter->page_idx]);
 }
 
 static inline unsigned long mlx4_icm_size(struct mlx4_icm_iter *iter)
 {
-	return sg_dma_len(&iter->chunk->mem[iter->page_idx]);
+	if (iter->chunk->coherent)
+		return iter->chunk->buf[iter->page_idx].size;
+	else
+		return sg_dma_len(&iter->chunk->sg[iter->page_idx]);
 }
 
 int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm);
-- 
2.19.2