On Wed, Feb 9, 2011 at 2:37 AM, Linus Walleij <linus.walleij@xxxxxxxxxx> wrote:
> [Quoting in verbatim so the original mail hits linux-mmc, this is very
> interesting!]
>
> 2011/2/8 Andrei Warkentin <andreiw@xxxxxxxxxxxx>:
>> Hi,
>>
>> I'm not sure if this is the best place to bring this up, but Russell's
>> name is on a fair share of drivers/mmc code, and there does seem to be
>> quite a bit of MMC-related discussion. Excuse me in advance if this
>> isn't the right forum :-).
>>
>> Certain MMC vendors (maybe even quite a few of them) use a pretty
>> rigid buffering scheme when it comes to handling writes. There is
>> usually a buffer A for random accesses and a buffer B for sequential
>> accesses. For certain Toshiba parts, it looks like buffer A is 8KB
>> wide and buffer B is 4MB wide, with all accesses larger than 8KB
>> effectively turning into 4MB accesses. Worse, consecutive small (8KB)
>> writes are treated as one large sequential access and once again end
>> up in buffer B, so working around this requires writing out of order.
>>
>> What this means is a decreased life span for the parts, and it also
>> means a performance impact on small writes, but the first item is much
>> more crucial, especially for smaller parts.
>>
>> As I've mentioned, more vendors are probably affected. How about a
>> generic MMC_BLOCK quirk that splits the requests (and optionally
>> reorders them)? The thresholds would then be adjustable as
>> module/kernel parameters based on manfid. I'm asking because I have a
>> patch now, but it's ugly and hardcoded against a specific manufacturer.
>
> There is a quirk API so that specific quirks can be flagged for certain
> vendors and cards, e.g. some Toshibas in this case; grep the kernel
> source for MMC_QUIRK_BLKSZ_FOR_BYTE_MODE.
>
> But as Russell says, this probably needs to be signalled up to the
> block layer to be handled properly.
>
> Why don't you post the code you have today as an RFC patch?
> I think many will be interested.
>
> Yours,
> Linus Walleij
>

I think it's worthwhile to make the upper block layers aware of MMC (and
apparently other flash memory) limitations, but as a first step it makes
sense (for me) to reformat the patch I am attaching into something that
looks better. Don't take the attached patch too seriously :-).

Thanks,
A
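Purely as an illustration of the generic quirk idea above, and not part of the
attached patch: the names MMC_QUIRK_SPLIT_WRITES, split_threshold,
mmc_blk_apply_write_quirks() and mmc_blk_should_split() below are made up for
this sketch, while card->quirks, card->cid.manfid, mmc_card_mmc(),
rq_data_dir() and blk_rq_sectors() are existing kernel interfaces. Something
along these lines in drivers/mmc/card/block.c would key the behaviour off the
manfid and keep the threshold tunable as a module parameter:

/* Hypothetical sketch only; assumes the usual block.c includes. */
#define MMC_QUIRK_SPLIT_WRITES	(1 << 30)	/* made-up bit in card->quirks */

static unsigned int split_threshold = 24;	/* sectors */
module_param(split_threshold, uint, 0444);
MODULE_PARM_DESC(split_threshold,
		 "Only split/reorder writes of at most this many sectors");

/* Flag the card once, e.g. from mmc_blk_alloc(). */
static void mmc_blk_apply_write_quirks(struct mmc_card *card)
{
	if (mmc_card_mmc(card) && card->cid.manfid == 0x11)	/* Toshiba */
		card->quirks |= MMC_QUIRK_SPLIT_WRITES;
}

/* Cheap per-request check for the write path. */
static bool mmc_blk_should_split(struct mmc_card *card, struct request *req)
{
	return (card->quirks & MMC_QUIRK_SPLIT_WRITES) &&
	       rq_data_dir(req) == WRITE &&
	       blk_rq_sectors(req) <= split_threshold;
}

The actual splitting and reordering would stay where the patch below puts it;
the quirk bit and the parameter only decide whether it fires for a given card.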
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 7054fd5..3b32329 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -60,6 +60,7 @@ struct mmc_blk_data {
 	spinlock_t	lock;
 	struct gendisk	*disk;
 	struct mmc_queue queue;
+	char		*bounce;
 
 	unsigned int	usage;
 	unsigned int	read_only;
@@ -93,6 +94,9 @@ static void mmc_blk_put(struct mmc_blk_data *md)
 
 		__clear_bit(devidx, dev_use);
 
+		if (md->bounce)
+			kfree(md->bounce);
+
 		put_disk(md->disk);
 		kfree(md);
 	}
@@ -312,6 +316,157 @@ out:
 	return err ? 0 : 1;
 }
 
+/*
+ * Workaround for Toshiba eMMC performance. If the request is less than two
+ * flash pages in size, then we want to split the write into one or two
+ * page-aligned writes to take advantage of faster buffering. Here we can
+ * adjust the size of the MMC request and let the block layer request handler
+ * deal with generating another MMC request.
+ */
+#define TOSHIBA_MANFID		0x11
+#define TOSHIBA_PAGE_SIZE	16	/* sectors */
+#define TOSHIBA_ADJUST_THRESHOLD 24	/* sectors */
+static bool mmc_adjust_toshiba_write(struct mmc_card *card,
+				     struct mmc_request *mrq)
+{
+	if (mmc_card_mmc(card) && card->cid.manfid == TOSHIBA_MANFID &&
+	    mrq->data->blocks <= TOSHIBA_ADJUST_THRESHOLD) {
+		int sectors_in_page = TOSHIBA_PAGE_SIZE -
+				      (mrq->cmd->arg % TOSHIBA_PAGE_SIZE);
+		if (mrq->data->blocks > sectors_in_page) {
+			mrq->data->blocks = sectors_in_page;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * This is another strange workaround to try to close the gap on Toshiba eMMC
+ * performance when compared to other vendors. In order to take advantage
+ * of certain optimizations and assumptions in those cards, we will look for
+ * multiblock write transfers below a certain size and we do the following:
+ *
+ * - Break them up into separate page-aligned (8k flash pages) transfers.
+ * - Execute the transfers in reverse order.
+ * - Use "reliable write" transfer mode.
+ *
+ * Neither the block I/O layer nor the scatterlist design seem to lend them-
+ * selves well to executing a block request out of order. So instead we let
+ * mmc_blk_issue_rq() set up the MMC request for the entire transfer and then
+ * break it up and reorder it here. This also requires that we put the data
+ * into a bounce buffer and send it as individual sg's.
+ */
+#define TOSHIBA_LOW_THRESHOLD	48	/* sectors */
+#define TOSHIBA_HIGH_THRESHOLD	64	/* sectors */
+static bool mmc_handle_toshiba_write(struct mmc_queue *mq,
+				     struct mmc_card *card,
+				     struct mmc_request *mrq)
+{
+	struct mmc_blk_data *md = mq->data;
+	unsigned int first_page, last_page, page;
+	unsigned long flags;
+
+	if (!md->bounce ||
+	    mrq->data->blocks > TOSHIBA_HIGH_THRESHOLD ||
+	    mrq->data->blocks < TOSHIBA_LOW_THRESHOLD)
+		return false;
+
+	first_page = mrq->cmd->arg / TOSHIBA_PAGE_SIZE;
+	last_page = (mrq->cmd->arg + mrq->data->blocks - 1) / TOSHIBA_PAGE_SIZE;
+
+	/* Single page write: just do it the normal way */
+	if (first_page == last_page)
+		return false;
+
+	local_irq_save(flags);
+	sg_copy_to_buffer(mrq->data->sg, mrq->data->sg_len,
+			  md->bounce, mrq->data->blocks * 512);
+	local_irq_restore(flags);
+
+	for (page = last_page; page >= first_page; page--) {
+		unsigned long offset, length;
+		struct mmc_blk_request brq;
+		struct mmc_command cmd;
+		struct scatterlist sg;
+
+		memset(&brq, 0, sizeof(struct mmc_blk_request));
+		brq.mrq.cmd = &brq.cmd;
+		brq.mrq.data = &brq.data;
+
+		brq.cmd.arg = page * TOSHIBA_PAGE_SIZE;
+		brq.data.blksz = 512;
+		if (page == first_page) {
+			brq.cmd.arg = mrq->cmd->arg;
+			brq.data.blocks = TOSHIBA_PAGE_SIZE -
+					  (mrq->cmd->arg % TOSHIBA_PAGE_SIZE);
+		} else if (page == last_page)
+			brq.data.blocks = (mrq->cmd->arg + mrq->data->blocks) %
+					  TOSHIBA_PAGE_SIZE;
+		if (brq.data.blocks == 0)
+			brq.data.blocks = TOSHIBA_PAGE_SIZE;
+
+		if (!mmc_card_blockaddr(card))
+			brq.cmd.arg <<= 9;
+		brq.cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC;
+		brq.stop.opcode = MMC_STOP_TRANSMISSION;
+		brq.stop.arg = 0;
+		brq.stop.flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
+
+		brq.data.flags |= MMC_DATA_WRITE;
+		if (brq.data.blocks > 1) {
+			if (!mmc_host_is_spi(card->host))
+				brq.mrq.stop = &brq.stop;
+			brq.cmd.opcode = MMC_WRITE_MULTIPLE_BLOCK;
+		} else {
+			brq.mrq.stop = NULL;
+			brq.cmd.opcode = MMC_WRITE_BLOCK;
+		}
+
+		if (brq.cmd.opcode == MMC_WRITE_MULTIPLE_BLOCK &&
+		    brq.data.blocks <= card->ext_csd.rel_wr_sec_c) {
+			int err;
+
+			cmd.opcode = MMC_SET_BLOCK_COUNT;
+			cmd.arg = brq.data.blocks | (1 << 31);
+			cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
+			err = mmc_wait_for_cmd(card->host, &cmd, 0);
+			if (!err)
+				brq.mrq.stop = NULL;
+		}
+
+		mmc_set_data_timeout(&brq.data, card);
+
+		offset = (brq.cmd.arg - mrq->cmd->arg) * 512;
+		length = brq.data.blocks * 512;
+		sg_init_one(&sg, md->bounce + offset, length);
+		brq.data.sg = &sg;
+		brq.data.sg_len = 1;
+
+		mmc_wait_for_req(card->host, &brq.mrq);
+
+		mrq->data->bytes_xfered += brq.data.bytes_xfered;
+
+		if (brq.cmd.error || brq.data.error || brq.stop.error) {
+			mrq->cmd->error = brq.cmd.error;
+			mrq->data->error = brq.data.error;
+			mrq->stop->error = brq.stop.error;
+
+			/*
+			 * We're executing the request backwards, so don't let
+			 * the block layer think some part of it has succeeded.
+			 * It will get it wrong. Since the failure will cause
+			 * us to fall back on single block writes, we're better
+			 * off reporting that none of the data was written.
+			 */
+			mrq->data->bytes_xfered = 0;
+			break;
+		}
+	}
+
+	return true;
+}
 static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 {
 	struct mmc_blk_data *md = mq->data;
@@ -378,6 +533,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 			brq.data.flags |= MMC_DATA_WRITE;
 		}
 
+		if (rq_data_dir(req) == WRITE)
+			mmc_adjust_toshiba_write(card, &brq.mrq);
+
 		mmc_set_data_timeout(&brq.data, card);
 
 		brq.data.sg = mq->sg;
@@ -402,9 +560,14 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 			brq.data.sg_len = i;
 		}
 
-		mmc_queue_bounce_pre(mq);
-
-		mmc_wait_for_req(card->host, &brq.mrq);
+		mmc_queue_bounce_pre(mq);
+
+		/*
+		 * Try the workaround first for writes, then fall back.
+		 */
+		if (rq_data_dir(req) != WRITE || disable_multi ||
+		    !mmc_handle_toshiba_write(mq, card, &brq.mrq))
+			mmc_wait_for_req(card->host, &brq.mrq);
 
 		mmc_queue_bounce_post(mq);
 
@@ -589,6 +752,15 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card)
 		goto out;
 	}
 
+	if (card->cid.manfid == TOSHIBA_MANFID && mmc_card_mmc(card)) {
+		pr_info("%s: enable Toshiba workaround\n",
+			mmc_hostname(card->host));
+		md->bounce = kmalloc(TOSHIBA_HIGH_THRESHOLD * 512, GFP_KERNEL);
+		if (!md->bounce) {
+			ret = -ENOMEM;
+			goto err_kfree;
+		}
+	}
 
 	/*
 	 * Set the read-only status based on the supported commands
@@ -655,6 +827,8 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card)
 err_putdisk:
 	put_disk(md->disk);
 err_kfree:
+	if (md->bounce)
+		kfree(md->bounce);
 	kfree(md);
 out:
 	return ERR_PTR(ret);
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 45055c4..17eef89 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -307,6 +307,9 @@ static int mmc_read_ext_csd(struct mmc_card *card)
 	else
 		card->erased_byte = 0x0;
 
+	if (card->ext_csd.rev >= 5)
+		card->ext_csd.rel_wr_sec_c = ext_csd[EXT_CSD_REL_WR_SEC_C];
+
 out:
 	kfree(ext_csd);
 
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 6b75250..fea7ecb 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -43,6 +43,7 @@ struct mmc_csd {
 
 struct mmc_ext_csd {
 	u8			rev;
+	u8			rel_wr_sec_c;
 	u8			erase_group_def;
 	u8			sec_feature_support;
 	unsigned int		sa_timeout;		/* Units: 100ns */
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index a5d765c..1e87020 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -260,6 +260,7 @@ struct _mmc_csd {
 #define EXT_CSD_CARD_TYPE	196	/* RO */
 #define EXT_CSD_SEC_CNT		212	/* RO, 4 bytes */
 #define EXT_CSD_S_A_TIMEOUT	217	/* RO */
+#define EXT_CSD_REL_WR_SEC_C	222
 #define EXT_CSD_ERASE_TIMEOUT_MULT	223	/* RO */
 #define EXT_CSD_HC_ERASE_GRP_SIZE	224	/* RO */
 #define EXT_CSD_BOOT_SIZE_MULTI		226
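As a side note, here is a rough userspace way to observe the small-write
behaviour described in the mail. This is only a sketch: it assumes
/dev/mmcblk0 (or argv[1]) is a scratch eMMC device whose contents at the
1 MiB offset may be clobbered, and it simply times repeated O_DIRECT plus
O_SYNC writes of a few sizes, so if the buffering behaves as described, a
step around the 8KB boundary should show up in the per-size totals:

/*
 * Sketch only: crude write-latency probe. Overwrites data at a 1 MiB
 * offset of the given device, so point it at a scratch device.
 * Build with: cc -O2 -o wrprobe wrprobe.c -lrt
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

static double time_writes(int fd, size_t size, int count)
{
	struct timespec t0, t1;
	void *buf;
	int i;

	if (posix_memalign(&buf, 4096, size))
		return -1.0;
	memset(buf, 0xa5, size);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < count; i++) {
		/* Rewriting the same sectors is a worst case for the buffer. */
		if (pwrite(fd, buf, size, 1 << 20) != (ssize_t)size)
			perror("pwrite");
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	free(buf);
	return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

int main(int argc, char **argv)
{
	const char *dev = argc > 1 ? argv[1] : "/dev/mmcblk0";
	size_t sizes[] = { 4096, 8192, 16384, 65536 };
	unsigned int i;
	int fd = open(dev, O_WRONLY | O_DIRECT | O_SYNC);

	if (fd < 0) {
		perror(dev);
		return 1;
	}
	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%6zu bytes x 100: %.3f s\n",
		       sizes[i], time_writes(fd, sizes[i], 100));
	close(fd);
	return 0;
}

Run it as root against a throwaway device or partition; comparing the 4KB and
8KB totals against the 16KB and 64KB ones, before and after a workaround like
the one in the patch, gives a quick sanity check.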