Re: (Octeon) MMC performance degradation due to too many requests

Ladislav Michl <oss-lists@xxxxxxxxx> · Fri, 11 Aug 2023 13:39:42 +0200

Hi Linus,

On Sun, Aug 06, 2023 at 10:44:21PM +0200, Linus Walleij wrote:
[snip]
> So you are talking about something along the lines of:
> 
> commit bd9b902798ab14d19ca116b10bde581ddff8f905
> Author: Linus Walleij <linus.walleij@xxxxxxxxxx>
> Date:   Mon Jan 29 00:44:53 2018 +0100
> 
>     mmc: sdhci: Implement an SDHCI-specific bounce buffer
> 
> ?
> 
> Yeah I guess that if this is needed by more than one driver it
> should be made into a library, or say a piece of code turned on by
> a config option that the dependent drivers select.
> 
> Interested in the job? :D

Just for the reference, implementation based on your above mentioned patch
which restores 4.9 vendor kernel performance can be found below.

It still needs some love, generalization and so on, but as I have already
started work on fixing Octeon's USB and ethernet, this is far beyond my
time limit.

I hope to return to this in the near future and convert it into proper
patch series.

Btw, we really need to invent some other name than 'bounce buffer'...

---
 drivers/mmc/host/cavium.c | 124 +++++++++++++++++++++++++++++++++++++-
 drivers/mmc/host/cavium.h |   4 ++
 2 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/drivers/mmc/host/cavium.c b/drivers/mmc/host/cavium.c
index cd8f2233ea34..75e2fab58268 100644
--- a/drivers/mmc/host/cavium.c
+++ b/drivers/mmc/host/cavium.c
@@ -370,6 +370,32 @@ static int get_dma_dir(struct mmc_data *data)
 	return (data->flags & MMC_DATA_WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 }
 
+static int finish_dma_bounce(struct cvm_mmc_host *host, struct mmc_data *data)
+{
+	data->bytes_xfered = data->blocks * data->blksz;
+	data->error = 0;
+
+	dma_sync_single_for_cpu(host->dev, host->bounce_addr,
+				host->bounce_buffer_size,
+				mmc_get_dma_dir(data));
+
+	/* On reads, copy the bounced data into the sglist */
+	if (mmc_get_dma_dir(data) == DMA_FROM_DEVICE) {
+		unsigned int length = data->bytes_xfered;
+
+		if (length > host->bounce_buffer_size) {
+			pr_err("bounce buffer is %u bytes but DMA claims to have transferred %u bytes\n",
+			       host->bounce_buffer_size,
+			       data->bytes_xfered);
+			/* Cap it down and continue */
+			length = host->bounce_buffer_size;
+		}
+		sg_copy_from_buffer(data->sg, data->sg_len, host->bounce_buffer,
+				    length);
+	}
+	return 1;
+}
+
 static int finish_dma_single(struct cvm_mmc_host *host, struct mmc_data *data)
 {
 	data->bytes_xfered = data->blocks * data->blksz;
@@ -400,7 +426,9 @@ static int finish_dma_sg(struct cvm_mmc_host *host, struct mmc_data *data)
 
 static int finish_dma(struct cvm_mmc_host *host, struct mmc_data *data)
 {
-	if (host->use_sg && data->sg_len > 1)
+	if (host->bounce_buffer)
+		return finish_dma_bounce(host, data);
+	else if (host->use_sg && data->sg_len > 1)
 		return finish_dma_sg(host, data);
 	else
 		return finish_dma_single(host, data);
@@ -509,6 +537,45 @@ irqreturn_t cvm_mmc_interrupt(int irq, void *dev_id)
 	return IRQ_RETVAL(emm_int != 0);
 }
 
+static u64 prepare_dma_bounce(struct cvm_mmc_host *host, struct mmc_data *data)
+{
+	u64 dma_cfg, addr;
+	int rw = 0;
+	unsigned int length = data->blksz * data->blocks;
+
+	if (length > host->bounce_buffer_size) {
+		pr_err("asked for transfer of %u bytes exceeds bounce buffer %u bytes\n",
+		       length, host->bounce_buffer_size);
+		return 0;
+	}
+	if (mmc_get_dma_dir(data) == DMA_TO_DEVICE) {
+		/* Copy the data to the bounce buffer */
+		sg_copy_to_buffer(data->sg, data->sg_len,
+				  host->bounce_buffer, length);
+		rw = 1;
+	}
+	/* Switch ownership to the DMA */
+	dma_sync_single_for_device(host->dev, host->bounce_addr,
+				   host->bounce_buffer_size,
+				   mmc_get_dma_dir(data));
+
+	dma_cfg = FIELD_PREP(MIO_EMM_DMA_CFG_EN, 1) |
+		  FIELD_PREP(MIO_EMM_DMA_CFG_RW, rw);
+#ifdef __LITTLE_ENDIAN
+	dma_cfg |= FIELD_PREP(MIO_EMM_DMA_CFG_ENDIAN, 1);
+#endif
+	dma_cfg |= FIELD_PREP(MIO_EMM_DMA_CFG_SIZE, length / 8 - 1);
+
+	addr = host->bounce_addr;
+	if (!host->big_dma_addr)
+		dma_cfg |= FIELD_PREP(MIO_EMM_DMA_CFG_ADR, addr);
+	writeq(dma_cfg, host->dma_base + MIO_EMM_DMA_CFG(host));
+
+	if (host->big_dma_addr)
+		writeq(addr, host->dma_base + MIO_EMM_DMA_ADR(host));
+	return addr;
+}
+
 /*
  * Program DMA_CFG and if needed DMA_ADR.
  * Returns 0 on error, DMA address otherwise.
@@ -616,6 +683,8 @@ static u64 prepare_dma_sg(struct cvm_mmc_host *host, struct mmc_data *data)
 
 static u64 prepare_dma(struct cvm_mmc_host *host, struct mmc_data *data)
 {
+	if (host->bounce_buffer)
+		return prepare_dma_bounce(host, data);
 	if (host->use_sg && data->sg_len > 1)
 		return prepare_dma_sg(host, data);
 	else
@@ -1006,6 +1075,53 @@ static int cvm_mmc_of_parse(struct device *dev, struct cvm_mmc_slot *slot)
 	return id;
 }
 
+static void cvm_allocate_bounce_buffer(struct mmc_host *mmc, struct cvm_mmc_host *host)
+{
+	unsigned int max_blocks;
+	unsigned int size;
+
+	/*
+	 * Cap the bounce buffer at 64KB. Using a bigger bounce buffer
+	 * has diminishing returns, this is probably because SD/MMC
+	 * cards are usually optimized to handle this size of requests.
+	 */
+	size = SZ_64K;
+	/*
+	 * Adjust downwards to maximum request size if this is less
+	 * than our segment size, else hammer down the maximum
+	 * request size to the maximum buffer size.
+	 */
+	if (mmc->max_req_size < size)
+		size = mmc->max_req_size;
+	max_blocks = size / 512;
+
+	/*
+	 * When we just support one segment, we can get significant
+	 * speedups by the help of a bounce buffer to group scattered
+	 * reads/writes together.
+	 */
+	host->bounce_buffer = devm_kmalloc(mmc->parent, size, GFP_KERNEL);
+	if (!host->bounce_buffer) {
+		pr_err("failed to allocate %u bytes for bounce buffer, falling back to single segments\n",
+		       size);
+		return;
+	}
+
+	host->bounce_addr = dma_map_single(mmc->parent, host->bounce_buffer,
+					   size, DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(mmc->parent, host->bounce_addr))
+		return;
+	host->bounce_buffer_size = size;
+
+	/* Lie about this since we're bouncing */
+	mmc->max_segs = max_blocks;
+	mmc->max_seg_size = size;
+	mmc->max_req_size = size;
+
+	pr_info("bounce up to %u segments into one, max segment size %u bytes\n",
+		max_blocks, size);
+}
+
 int cvm_mmc_of_slot_probe(struct device *dev, struct cvm_mmc_host *host)
 {
 	struct cvm_mmc_slot *slot;
@@ -1050,6 +1166,10 @@ int cvm_mmc_of_slot_probe(struct device *dev, struct cvm_mmc_host *host)
 	/* DMA block count field is 15 bits */
 	mmc->max_blk_count = 32767;
 
+	/* This may alter mmc->*_blk_* parameters */
+	if (mmc->max_segs == 1)
+		cvm_allocate_bounce_buffer(mmc, host);
+
 	slot->clock = mmc->f_min;
 	slot->bus_id = id;
 	slot->cached_rca = 1;
@@ -1064,6 +1184,7 @@ int cvm_mmc_of_slot_probe(struct device *dev, struct cvm_mmc_host *host)
 	if (ret) {
 		dev_err(dev, "mmc_add_host() returned %d\n", ret);
 		slot->host->slot[id] = NULL;
+		kfree(host->bounce_buffer);
 		goto error;
 	}
 	return 0;
@@ -1078,5 +1199,6 @@ int cvm_mmc_of_slot_remove(struct cvm_mmc_slot *slot)
 	mmc_remove_host(slot->mmc);
 	slot->host->slot[slot->bus_id] = NULL;
 	mmc_free_host(slot->mmc);
+	kfree(slot->host->bounce_buffer);
 	return 0;
 }
diff --git a/drivers/mmc/host/cavium.h b/drivers/mmc/host/cavium.h
index f3eea5eaa678..4fcddb76768d 100644
--- a/drivers/mmc/host/cavium.h
+++ b/drivers/mmc/host/cavium.h
@@ -64,6 +64,10 @@ struct cvm_mmc_host {
 	struct clk *clk;
 	int sys_freq;
 
+	char *bounce_buffer;	/* For packing DMA reads/writes */
+	dma_addr_t bounce_addr;
+	unsigned int bounce_buffer_size;
+
 	struct mmc_request *current_req;
 	struct sg_mapping_iter smi;
 	bool dma_active;
-- 
2.39.2