On Wed, Feb 9, 2011 at 2:37 AM, Linus Walleij <linus.walleij@xxxxxxxxxx> wrote:
> [Quoting in verbatim so the original mail hits linux-mmc, this is very
> interesting!]
>
> 2011/2/8 Andrei Warkentin <andreiw@xxxxxxxxxxxx>:
>> Hi,
>>
>> I'm not sure if this is the best place to bring this up, but Russell's
>> name is on a fair share of drivers/mmc code, and there does seem to be
>> quite a bit of MMC-related discussion. Excuse me in advance if this
>> isn't the right forum :-).
>>
>> Certain MMC vendors (maybe even quite a few of them) use a pretty
>> rigid buffering scheme when it comes to handling writes. There is
>> usually a buffer A for random accesses and a buffer B for sequential
>> accesses. For certain Toshiba parts, it looks like buffer A is 8KB
>> wide and buffer B is 4MB wide, with all accesses larger than 8KB
>> effectively turning into 4MB accesses. Worse, consecutive small (8KB)
>> writes are treated as one large sequential access and once again end
>> up in buffer B, so working around this requires writing out of order.
>>
>> What this means is a decreased life span for the parts, and it also
>> means a performance impact on small writes, but the first item is much
>> more crucial, especially for smaller parts.
>>
>> As I've mentioned, more vendors are probably affected. How about a
>> generic MMC_BLOCK quirk that splits the requests (and optionally
>> reorders them)? The thresholds would then be adjustable as
>> module/kernel parameters based on manfid. I'm asking because I have a
>> patch now, but it's ugly and hardcoded against a specific manufacturer.
>
> There is a quirk API so that specific quirks can be flagged for certain
> vendors and cards, e.g. some Toshibas in this case; grep the kernel
> source for MMC_QUIRK_BLKSZ_FOR_BYTE_MODE.
>
> But as Russell says, this probably needs to be signalled up to the
> block layer to be handled properly.
>
> Why don't you post the code you have today as an RFC patch?
> I think many will be interested.
>
> Yours,
> Linus Walleij
>

I think it's worthwhile to make the upper block layers aware of MMC (and
apparently other flash memory) limitations, but as a first step it makes
sense (for me) to reformat the patch I am attaching into something that
looks better. Don't take the attached patch too seriously :-).

Thanks,
A
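Purely as an illustration of the generic quirk idea above, and not part of the
attached patch: the names MMC_QUIRK_SPLIT_WRITES, split_threshold,
mmc_blk_apply_write_quirks() and mmc_blk_should_split() below are made up for
this sketch, while card->quirks, card->cid.manfid, mmc_card_mmc(),
rq_data_dir() and blk_rq_sectors() are existing kernel interfaces. Something
along these lines in drivers/mmc/card/block.c would key the behaviour off the
manfid and keep the threshold tunable as a module parameter:

/* Hypothetical sketch only; assumes the usual block.c includes. */
#define MMC_QUIRK_SPLIT_WRITES	(1 << 30)	/* made-up bit in card->quirks */

static unsigned int split_threshold = 24;	/* sectors */
module_param(split_threshold, uint, 0444);
MODULE_PARM_DESC(split_threshold,
		 "Only split/reorder writes of at most this many sectors");

/* Flag the card once, e.g. from mmc_blk_alloc(). */
static void mmc_blk_apply_write_quirks(struct mmc_card *card)
{
	if (mmc_card_mmc(card) && card->cid.manfid == 0x11)	/* Toshiba */
		card->quirks |= MMC_QUIRK_SPLIT_WRITES;
}

/* Cheap per-request check for the write path. */
static bool mmc_blk_should_split(struct mmc_card *card, struct request *req)
{
	return (card->quirks & MMC_QUIRK_SPLIT_WRITES) &&
	       rq_data_dir(req) == WRITE &&
	       blk_rq_sectors(req) <= split_threshold;
}

The actual splitting and reordering would stay where the patch below puts it;
the quirk bit and the parameter only decide whether it fires for a given card.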
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index 7054fd5..3b32329 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -60,6 +60,7 @@ struct mmc_blk_data {
 	spinlock_t	lock;
 	struct gendisk	*disk;
 	struct mmc_queue queue;
+	char		*bounce;
 
 	unsigned int	usage;
 	unsigned int	read_only;
@@ -93,6 +94,9 @@ static void mmc_blk_put(struct mmc_blk_data *md)
 
 		__clear_bit(devidx, dev_use);
 
+		if (md->bounce)
+			kfree(md->bounce);
+
 		put_disk(md->disk);
 		kfree(md);
 	}
@@ -312,6 +316,157 @@ out:
 	return err ? 0 : 1;
 }
 
+/*
+ * Workaround for Toshiba eMMC performance. If the request is less than two
+ * flash pages in size, then we want to split the write into one or two
+ * page-aligned writes to take advantage of faster buffering. Here we can
+ * adjust the size of the MMC request and let the block layer request handler
+ * deal with generating another MMC request.
+ */
+#define TOSHIBA_MANFID		0x11
+#define TOSHIBA_PAGE_SIZE	16	/* sectors */
+#define TOSHIBA_ADJUST_THRESHOLD 24	/* sectors */
+static bool mmc_adjust_toshiba_write(struct mmc_card *card,
+				     struct mmc_request *mrq)
+{
+	if (mmc_card_mmc(card) && card->cid.manfid == TOSHIBA_MANFID &&
+	    mrq->data->blocks <= TOSHIBA_ADJUST_THRESHOLD) {
+		int sectors_in_page = TOSHIBA_PAGE_SIZE -
+				      (mrq->cmd->arg % TOSHIBA_PAGE_SIZE);
+		if (mrq->data->blocks > sectors_in_page) {
+			mrq->data->blocks = sectors_in_page;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * This is another strange workaround to try to close the gap on Toshiba eMMC
+ * performance when compared to other vendors. In order to take advantage
+ * of certain optimizations and assumptions in those cards, we will look for
+ * multiblock write transfers below a certain size and we do the following:
+ *
+ * - Break them up into separate page-aligned (8k flash pages) transfers.
+ * - Execute the transfers in reverse order.
+ * - Use "reliable write" transfer mode.
+ *
+ * Neither the block I/O layer nor the scatterlist design seem to lend them-
+ * selves well to executing a block request out of order. So instead we let
+ * mmc_blk_issue_rq() set up the MMC request for the entire transfer and then
+ * break it up and reorder it here. This also requires that we put the data
+ * into a bounce buffer and send it as individual sg's.
+ */
+#define TOSHIBA_LOW_THRESHOLD	48	/* sectors */
+#define TOSHIBA_HIGH_THRESHOLD	64	/* sectors */
+static bool mmc_handle_toshiba_write(struct mmc_queue *mq,
+				     struct mmc_card *card,
+				     struct mmc_request *mrq)
+{
+	struct mmc_blk_data *md = mq->data;
+	unsigned int first_page, last_page, page;
+	unsigned long flags;
+
+	if (!md->bounce ||
+	    mrq->data->blocks > TOSHIBA_HIGH_THRESHOLD ||
+	    mrq->data->blocks < TOSHIBA_LOW_THRESHOLD)
+		return false;
+
+	first_page = mrq->cmd->arg / TOSHIBA_PAGE_SIZE;
+	last_page = (mrq->cmd->arg + mrq->data->blocks - 1) / TOSHIBA_PAGE_SIZE;
+
+	/* Single page write: just do it the normal way */
+	if (first_page == last_page)
+		return false;
+
+	local_irq_save(flags);
+	sg_copy_to_buffer(mrq->data->sg, mrq->data->sg_len,
+			  md->bounce, mrq->data->blocks * 512);
+	local_irq_restore(flags);
+
+	for (page = last_page; page >= first_page; page--) {
+		unsigned long offset, length;
+		struct mmc_blk_request brq;
+		struct mmc_command cmd;
+		struct scatterlist sg;
+
+		memset(&brq, 0, sizeof(struct mmc_blk_request));
+		brq.mrq.cmd = &brq.cmd;
+		brq.mrq.data = &brq.data;
+
+		brq.cmd.arg = page * TOSHIBA_PAGE_SIZE;
+		brq.data.blksz = 512;
+		if (page == first_page) {
+			brq.cmd.arg = mrq->cmd->arg;
+			brq.data.blocks = TOSHIBA_PAGE_SIZE -
+					  (mrq->cmd->arg % TOSHIBA_PAGE_SIZE);
+		} else if (page == last_page)
+			brq.data.blocks = (mrq->cmd->arg + mrq->data->blocks) %
+					  TOSHIBA_PAGE_SIZE;
+		if (brq.data.blocks == 0)
+			brq.data.blocks = TOSHIBA_PAGE_SIZE;
+
+		if (!mmc_card_blockaddr(card))
+			brq.cmd.arg <<= 9;
+		brq.cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC;
+		brq.stop.opcode = MMC_STOP_TRANSMISSION;
+		brq.stop.arg = 0;
+		brq.stop.flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
+
+		brq.data.flags |= MMC_DATA_WRITE;
+		if (brq.data.blocks > 1) {
+			if (!mmc_host_is_spi(card->host))
+				brq.mrq.stop = &brq.stop;
+			brq.cmd.opcode = MMC_WRITE_MULTIPLE_BLOCK;
+		} else {
+			brq.mrq.stop = NULL;
+			brq.cmd.opcode = MMC_WRITE_BLOCK;
+		}
+
+		if (brq.cmd.opcode == MMC_WRITE_MULTIPLE_BLOCK &&
+		    brq.data.blocks <= card->ext_csd.rel_wr_sec_c) {
+			int err;
+
+			cmd.opcode = MMC_SET_BLOCK_COUNT;
+			cmd.arg = brq.data.blocks | (1 << 31);
+			cmd.flags = MMC_RSP_R1 | MMC_CMD_AC;
+			err = mmc_wait_for_cmd(card->host, &cmd, 0);
+			if (!err)
+				brq.mrq.stop = NULL;
+		}
+
+		mmc_set_data_timeout(&brq.data, card);
+
+		offset = (brq.cmd.arg - mrq->cmd->arg) * 512;
+		length = brq.data.blocks * 512;
+		sg_init_one(&sg, md->bounce + offset, length);
+		brq.data.sg = &sg;
+		brq.data.sg_len = 1;
+
+		mmc_wait_for_req(card->host, &brq.mrq);
+
+		mrq->data->bytes_xfered += brq.data.bytes_xfered;
+
+		if (brq.cmd.error || brq.data.error || brq.stop.error) {
+			mrq->cmd->error = brq.cmd.error;
+			mrq->data->error = brq.data.error;
+			mrq->stop->error = brq.stop.error;
+
+			/*
+			 * We're executing the request backwards, so don't let
+			 * the block layer think some part of it has succeeded.
+			 * It will get it wrong. Since the failure will cause
+			 * us to fall back on single block writes, we're better
+			 * off reporting that none of the data was written.
+			 */
+			mrq->data->bytes_xfered = 0;
+			break;
+		}
+	}
+
+	return true;
+}
 static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 {
 	struct mmc_blk_data *md = mq->data;
@@ -378,6 +533,9 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 			brq.data.flags |= MMC_DATA_WRITE;
 		}
 
+		if (rq_data_dir(req) == WRITE)
+			mmc_adjust_toshiba_write(card, &brq.mrq);
+
 		mmc_set_data_timeout(&brq.data, card);
 
 		brq.data.sg = mq->sg;
@@ -402,9 +560,14 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *req)
 			brq.data.sg_len = i;
 		}
 
-		mmc_queue_bounce_pre(mq);
-
-		mmc_wait_for_req(card->host, &brq.mrq);
+		mmc_queue_bounce_pre(mq);
+
+		/*
+		 * Try the workaround first for writes, then fall back.
+		 */
+		if (rq_data_dir(req) != WRITE || disable_multi ||
+		    !mmc_handle_toshiba_write(mq, card, &brq.mrq))
+			mmc_wait_for_req(card->host, &brq.mrq);
 
 		mmc_queue_bounce_post(mq);
 
@@ -589,6 +752,15 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card)
 		goto out;
 	}
 
+	if (card->cid.manfid == TOSHIBA_MANFID && mmc_card_mmc(card)) {
+		pr_info("%s: enable Toshiba workaround\n",
+			mmc_hostname(card->host));
+		md->bounce = kmalloc(TOSHIBA_HIGH_THRESHOLD * 512, GFP_KERNEL);
+		if (!md->bounce) {
+			ret = -ENOMEM;
+			goto err_kfree;
+		}
+	}
 
 	/*
 	 * Set the read-only status based on the supported commands
@@ -655,6 +827,8 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card)
 err_putdisk:
 	put_disk(md->disk);
 err_kfree:
+	if (md->bounce)
+		kfree(md->bounce);
 	kfree(md);
 out:
 	return ERR_PTR(ret);
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 45055c4..17eef89 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -307,6 +307,9 @@ static int mmc_read_ext_csd(struct mmc_card *card)
 	else
 		card->erased_byte = 0x0;
 
+	if (card->ext_csd.rev >= 5)
+		card->ext_csd.rel_wr_sec_c = ext_csd[EXT_CSD_REL_WR_SEC_C];
+
 out:
 	kfree(ext_csd);
 
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h
index 6b75250..fea7ecb 100644
--- a/include/linux/mmc/card.h
+++ b/include/linux/mmc/card.h
@@ -43,6 +43,7 @@ struct mmc_csd {
 
 struct mmc_ext_csd {
 	u8			rev;
+	u8			rel_wr_sec_c;
 	u8			erase_group_def;
 	u8			sec_feature_support;
 	unsigned int		sa_timeout;		/* Units: 100ns */
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index a5d765c..1e87020 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -260,6 +260,7 @@ struct _mmc_csd {
 #define EXT_CSD_CARD_TYPE	196	/* RO */
 #define EXT_CSD_SEC_CNT		212	/* RO, 4 bytes */
 #define EXT_CSD_S_A_TIMEOUT	217	/* RO */
+#define EXT_CSD_REL_WR_SEC_C	222
 #define EXT_CSD_ERASE_TIMEOUT_MULT	223	/* RO */
 #define EXT_CSD_HC_ERASE_GRP_SIZE	224	/* RO */
 #define EXT_CSD_BOOT_SIZE_MULTI		226
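As a side note, here is a rough userspace way to observe the small-write
behaviour described in the mail. This is only a sketch: it assumes
/dev/mmcblk0 (or argv[1]) is a scratch eMMC device whose contents at the
1 MiB offset may be clobbered, and it simply times repeated O_DIRECT plus
O_SYNC writes of a few sizes, so if the buffering behaves as described, a
step around the 8KB boundary should show up in the per-size totals:

/*
 * Sketch only: crude write-latency probe. Overwrites data at a 1 MiB
 * offset of the given device, so point it at a scratch device.
 * Build with: cc -O2 -o wrprobe wrprobe.c -lrt
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

static double time_writes(int fd, size_t size, int count)
{
	struct timespec t0, t1;
	void *buf;
	int i;

	if (posix_memalign(&buf, 4096, size))
		return -1.0;
	memset(buf, 0xa5, size);

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < count; i++) {
		/* Rewriting the same sectors is a worst case for the buffer. */
		if (pwrite(fd, buf, size, 1 << 20) != (ssize_t)size)
			perror("pwrite");
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	free(buf);
	return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

int main(int argc, char **argv)
{
	const char *dev = argc > 1 ? argv[1] : "/dev/mmcblk0";
	size_t sizes[] = { 4096, 8192, 16384, 65536 };
	unsigned int i;
	int fd = open(dev, O_WRONLY | O_DIRECT | O_SYNC);

	if (fd < 0) {
		perror(dev);
		return 1;
	}
	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%6zu bytes x 100: %.3f s\n",
		       sizes[i], time_writes(fd, sizes[i], 100));
	close(fd);
	return 0;
}

Run it as root against a throwaway device or partition; comparing the 4KB and
8KB totals against the 16KB and 64KB ones, before and after a workaround like
the one in the patch, gives a quick sanity check.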