This is a note to let you know that I've just added the patch titled mmc: sdhci: Implement an SDHCI-specific bounce buffer to the 4.15-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary The filename of the patch is: mmc-sdhci-implement-an-sdhci-specific-bounce-buffer.patch and it can be found in the queue-4.15 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable@xxxxxxxxxxxxxxx> know about it. >From bd9b902798ab14d19ca116b10bde581ddff8f905 Mon Sep 17 00:00:00 2001 From: Linus Walleij <linus.walleij@xxxxxxxxxx> Date: Mon, 29 Jan 2018 00:44:53 +0100 Subject: mmc: sdhci: Implement an SDHCI-specific bounce buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From: Linus Walleij <linus.walleij@xxxxxxxxxx> commit bd9b902798ab14d19ca116b10bde581ddff8f905 upstream. The bounce buffer is gone from the MMC core, and now we found out that there are some (crippled) i.MX boards out there that have broken ADMA (cannot do scatter-gather), and also broken PIO so they must use SDMA. Closer examination shows a less significant slowdown also on SDMA-only capable Laptop hosts. SDMA sets down the number of segments to one, so that each segment gets turned into a singular request that ping-pongs to the block layer before the next request/segment is issued. Apparently it happens a lot that the block layer send requests that include a lot of physically discontiguous segments. My guess is that this phenomenon is coming from the file system. These devices that cannot handle scatterlists in hardware can see major benefits from a DMA-contiguous bounce buffer. This patch accumulates those fragmented scatterlists in a physically contiguous bounce buffer so that we can issue bigger DMA data chunks to/from the card. When tested with a PCI-integrated host (1217:8221) that only supports SDMA: 0b:00.0 SD Host controller: O2 Micro, Inc. OZ600FJ0/OZ900FJ0/OZ600FJS SD/MMC Card Reader Controller (rev 05) This patch gave ~1Mbyte/s improved throughput on large reads and writes when testing using iozone than without the patch. dmesg: sdhci-pci 0000:0b:00.0: SDHCI controller found [1217:8221] (rev 5) mmc0 bounce up to 128 segments into one, max segment size 65536 bytes mmc0: SDHCI controller on PCI [0000:0b:00.0] using DMA On the i.MX SDHCI controllers on the crippled i.MX 25 and i.MX 35 the patch restores the performance to what it was before we removed the bounce buffers. Cc: Pierre Ossman <pierre@xxxxxxxxx> Cc: Benoît Thébaudeau <benoit@xxxxxxxxxxx> Cc: Fabio Estevam <fabio.estevam@xxxxxxx> Cc: Benjamin Beckmeyer <beckmeyer.b@xxxxxxxxx> Cc: stable@xxxxxxxxxxxxxxx # v4.14+ Fixes: de3ee99b097d ("mmc: Delete bounce buffer handling") Tested-by: Benjamin Beckmeyer <beckmeyer.b@xxxxxxxxx> Acked-by: Adrian Hunter <adrian.hunter@xxxxxxxxx> Signed-off-by: Linus Walleij <linus.walleij@xxxxxxxxxx> Signed-off-by: Ulf Hansson <ulf.hansson@xxxxxxxxxx> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> --- drivers/mmc/host/sdhci.c | 164 ++++++++++++++++++++++++++++++++++++++++++++--- drivers/mmc/host/sdhci.h | 3 2 files changed, 159 insertions(+), 8 deletions(-) --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -21,6 +21,7 @@ #include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/scatterlist.h> +#include <linux/sizes.h> #include <linux/swiotlb.h> #include <linux/regulator/consumer.h> #include <linux/pm_runtime.h> @@ -502,8 +503,35 @@ static int sdhci_pre_dma_transfer(struct if (data->host_cookie == COOKIE_PRE_MAPPED) return data->sg_count; - sg_count = dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len, - mmc_get_dma_dir(data)); + /* Bounce write requests to the bounce buffer */ + if (host->bounce_buffer) { + unsigned int length = data->blksz * data->blocks; + + if (length > host->bounce_buffer_size) { + pr_err("%s: asked for transfer of %u bytes exceeds bounce buffer %u bytes\n", + mmc_hostname(host->mmc), length, + host->bounce_buffer_size); + return -EIO; + } + if (mmc_get_dma_dir(data) == DMA_TO_DEVICE) { + /* Copy the data to the bounce buffer */ + sg_copy_to_buffer(data->sg, data->sg_len, + host->bounce_buffer, + length); + } + /* Switch ownership to the DMA */ + dma_sync_single_for_device(host->mmc->parent, + host->bounce_addr, + host->bounce_buffer_size, + mmc_get_dma_dir(data)); + /* Just a dummy value */ + sg_count = 1; + } else { + /* Just access the data directly from memory */ + sg_count = dma_map_sg(mmc_dev(host->mmc), + data->sg, data->sg_len, + mmc_get_dma_dir(data)); + } if (sg_count == 0) return -ENOSPC; @@ -673,6 +701,14 @@ static void sdhci_adma_table_post(struct } } +static u32 sdhci_sdma_address(struct sdhci_host *host) +{ + if (host->bounce_buffer) + return host->bounce_addr; + else + return sg_dma_address(host->data->sg); +} + static u8 sdhci_calc_timeout(struct sdhci_host *host, struct mmc_command *cmd) { u8 count; @@ -858,8 +894,8 @@ static void sdhci_prepare_data(struct sd SDHCI_ADMA_ADDRESS_HI); } else { WARN_ON(sg_cnt != 1); - sdhci_writel(host, sg_dma_address(data->sg), - SDHCI_DMA_ADDRESS); + sdhci_writel(host, sdhci_sdma_address(host), + SDHCI_DMA_ADDRESS); } } @@ -2248,7 +2284,12 @@ static void sdhci_pre_req(struct mmc_hos mrq->data->host_cookie = COOKIE_UNMAPPED; - if (host->flags & SDHCI_REQ_USE_DMA) + /* + * No pre-mapping in the pre hook if we're using the bounce buffer, + * for that we would need two bounce buffers since one buffer is + * in flight when this is getting called. + */ + if (host->flags & SDHCI_REQ_USE_DMA && !host->bounce_buffer) sdhci_pre_dma_transfer(host, mrq->data, COOKIE_PRE_MAPPED); } @@ -2352,8 +2393,45 @@ static bool sdhci_request_done(struct sd struct mmc_data *data = mrq->data; if (data && data->host_cookie == COOKIE_MAPPED) { - dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len, - mmc_get_dma_dir(data)); + if (host->bounce_buffer) { + /* + * On reads, copy the bounced data into the + * sglist + */ + if (mmc_get_dma_dir(data) == DMA_FROM_DEVICE) { + unsigned int length = data->bytes_xfered; + + if (length > host->bounce_buffer_size) { + pr_err("%s: bounce buffer is %u bytes but DMA claims to have transferred %u bytes\n", + mmc_hostname(host->mmc), + host->bounce_buffer_size, + data->bytes_xfered); + /* Cap it down and continue */ + length = host->bounce_buffer_size; + } + dma_sync_single_for_cpu( + host->mmc->parent, + host->bounce_addr, + host->bounce_buffer_size, + DMA_FROM_DEVICE); + sg_copy_from_buffer(data->sg, + data->sg_len, + host->bounce_buffer, + length); + } else { + /* No copying, just switch ownership */ + dma_sync_single_for_cpu( + host->mmc->parent, + host->bounce_addr, + host->bounce_buffer_size, + mmc_get_dma_dir(data)); + } + } else { + /* Unmap the raw data */ + dma_unmap_sg(mmc_dev(host->mmc), data->sg, + data->sg_len, + mmc_get_dma_dir(data)); + } data->host_cookie = COOKIE_UNMAPPED; } } @@ -2636,7 +2714,8 @@ static void sdhci_data_irq(struct sdhci_ */ if (intmask & SDHCI_INT_DMA_END) { u32 dmastart, dmanow; - dmastart = sg_dma_address(host->data->sg); + + dmastart = sdhci_sdma_address(host); dmanow = dmastart + host->data->bytes_xfered; /* * Force update to the next DMA block boundary. @@ -3217,6 +3296,68 @@ void __sdhci_read_caps(struct sdhci_host } EXPORT_SYMBOL_GPL(__sdhci_read_caps); +static int sdhci_allocate_bounce_buffer(struct sdhci_host *host) +{ + struct mmc_host *mmc = host->mmc; + unsigned int max_blocks; + unsigned int bounce_size; + int ret; + + /* + * Cap the bounce buffer at 64KB. Using a bigger bounce buffer + * has diminishing returns, this is probably because SD/MMC + * cards are usually optimized to handle this size of requests. + */ + bounce_size = SZ_64K; + /* + * Adjust downwards to maximum request size if this is less + * than our segment size, else hammer down the maximum + * request size to the maximum buffer size. + */ + if (mmc->max_req_size < bounce_size) + bounce_size = mmc->max_req_size; + max_blocks = bounce_size / 512; + + /* + * When we just support one segment, we can get significant + * speedups by the help of a bounce buffer to group scattered + * reads/writes together. + */ + host->bounce_buffer = devm_kmalloc(mmc->parent, + bounce_size, + GFP_KERNEL); + if (!host->bounce_buffer) { + pr_err("%s: failed to allocate %u bytes for bounce buffer, falling back to single segments\n", + mmc_hostname(mmc), + bounce_size); + /* + * Exiting with zero here makes sure we proceed with + * mmc->max_segs == 1. + */ + return 0; + } + + host->bounce_addr = dma_map_single(mmc->parent, + host->bounce_buffer, + bounce_size, + DMA_BIDIRECTIONAL); + ret = dma_mapping_error(mmc->parent, host->bounce_addr); + if (ret) + /* Again fall back to max_segs == 1 */ + return 0; + host->bounce_buffer_size = bounce_size; + + /* Lie about this since we're bouncing */ + mmc->max_segs = max_blocks; + mmc->max_seg_size = bounce_size; + mmc->max_req_size = bounce_size; + + pr_info("%s bounce up to %u segments into one, max segment size %u bytes\n", + mmc_hostname(mmc), max_blocks, bounce_size); + + return 0; +} + int sdhci_setup_host(struct sdhci_host *host) { struct mmc_host *mmc; @@ -3713,6 +3854,13 @@ int sdhci_setup_host(struct sdhci_host * */ mmc->max_blk_count = (host->quirks & SDHCI_QUIRK_NO_MULTIBLOCK) ? 1 : 65535; + if (mmc->max_segs == 1) { + /* This may alter mmc->*_blk_* parameters */ + ret = sdhci_allocate_bounce_buffer(host); + if (ret) + return ret; + } + return 0; unreg: --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -440,6 +440,9 @@ struct sdhci_host { int irq; /* Device IRQ */ void __iomem *ioaddr; /* Mapped address */ + char *bounce_buffer; /* For packing SDMA reads/writes */ + dma_addr_t bounce_addr; + unsigned int bounce_buffer_size; const struct sdhci_ops *ops; /* Low level hw interface */ Patches currently in stable-queue which might be from linus.walleij@xxxxxxxxxx are queue-4.15/mmc-sdhci-implement-an-sdhci-specific-bounce-buffer.patch queue-4.15/arm-dts-nomadik-add-interrupt-parent-for-clcd.patch