On Wed, Jul 25, 2018 at 02:29:30PM -0400, Peter Geis wrote: > On 07/25/2018 02:17 PM, Jens Axboe wrote: > > On 7/25/18 10:28 AM, Peter Geis wrote: > > > Good Afternoon, > > > > > > I have encountered an issue on both Tegra 2 and Tegra 3 devices > > > accessing emmc following the 25 July 2018 remote tracking merge. > > > > > > The offending commit is: > > > 6ce3dd6eec114930cf2035a8bcb1e80477ed79a8 > > > blk-mq: issue directly if hw queue isn't busy in case of 'none'. > > > > Can you try my current for-next? This should fix it: > > > > commit 8824f62246bef288173a6624a363352f0d4d3b09 > > Author: Ming Lei <ming.lei@xxxxxxxxxx> > > Date: Sun Jul 22 14:10:15 2018 +0800 > > > > blk-mq: fail the request in case issue failure > > > > That commit made the current merge window, it must be reverted before > reverting the offending commit. > > With that patch, the bug triggers then the kernel waits for the mmc to > recover. It seems however that the bug leaves the mmc in a zombie state, > where it is processing the previous command but the kernel has no control > over it. > > [ 4.233073] mmc0: Got command interrupt 0x00000001 even though no command > operation was in progress. > [ 4.242189] mmc0: sdhci: ============ SDHCI REGISTER DUMP =========== > [ 4.248616] mmc0: sdhci: Sys addr: 0x00000000 | Version: 0x00000001 > [ 4.255041] mmc0: sdhci: Blk size: 0x00007200 | Blk cnt: 0x00000000 > [ 4.261465] mmc0: sdhci: Argument: 0x002e3b10 | Trn mode: 0x00000033 > [ 4.267890] mmc0: sdhci: Present: 0x1ff70000 | Host ctl: 0x00000031 > [ 4.274314] mmc0: sdhci: Power: 0x0000000f | Blk gap: 0x00000000 > [ 4.280737] mmc0: sdhci: Wake-up: 0x00000000 | Clock: 0x00000007 > [ 4.287162] mmc0: sdhci: Timeout: 0x0000000e | Int stat: 0x00000002 > [ 4.293586] mmc0: sdhci: Int enab: 0x02ff000b | Sig enab: 0x02fc000b > [ 4.300010] mmc0: sdhci: AC12 err: 0x00000000 | Slot int: 0x00000000 > [ 4.306433] mmc0: sdhci: Caps: 0xe7ffd080 | Caps_1: 0x00000074 > [ 4.312857] mmc0: sdhci: Cmd: 0x0000123a | Max curr: 0x00969696 > [ 4.319281] mmc0: sdhci: Resp[0]: 0x00000900 | Resp[1]: 0x04800e92 > [ 4.325705] mmc0: sdhci: Resp[2]: 0x074b8000 | Resp[3]: 0x00000240 > [ 4.332128] mmc0: sdhci: Host ctl2: 0x00000000 > [ 4.336560] mmc0: sdhci: ADMA Err: 0x00000000 | ADMA Ptr: 0xae2f9220 > [ 4.342981] mmc0: sdhci: ============================================ Not found issue in the two merged patches, and it might be one mmc internal issue, could you apply the following debug patch and post the console log? diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index cf9c66c6d35a..6573684b07e5 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -415,6 +415,8 @@ void blk_mq_sched_insert_requests(struct request_queue *q, blk_mq_try_issue_list_directly(hctx, list); if (list_empty(list)) return; + if (blk_queue_debug(q)) + trace_printk("issue direct: partial done\n"); } blk_mq_insert_requests(hctx, ctx, list); } diff --git a/block/blk-mq.c b/block/blk-mq.c index e13bdc2707ce..efb898323e2d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -488,6 +488,9 @@ void blk_mq_free_request(struct request *rq) struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + if (blk_queue_debug(q)) + trace_printk("complete: rq %d\n", rq->internal_tag); + if (rq->rq_flags & RQF_ELVPRIV) { if (e && e->type->ops.mq.finish_request) e->type->ops.mq.finish_request(rq); @@ -1793,6 +1796,9 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq); + if (blk_queue_debug(rq->q)) + trace_printk("issue direct: rq %d, ret %d\n", + rq->internal_tag, (int)ret); if (ret != BLK_STS_OK) { if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { @@ -1841,6 +1847,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } + if (blk_queue_debug(q)) + trace_printk("make rq %d\n", rq->internal_tag); rq_qos_track(q, rq, bio); cookie = request_to_qc_t(data.hctx, rq); diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 648eb6743ed5..90745829cf9a 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -257,6 +257,9 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, if (mmc_card_removed(mq->card)) { req->rq_flags |= RQF_QUIET; + + if (blk_queue_debug(q)) + trace_printk("queue rq %d, %s\n", req->internal_tag, "io err"); return BLK_STS_IOERR; } @@ -266,6 +269,9 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, if (mq->recovery_needed) { spin_unlock_irq(q->queue_lock); + if (blk_queue_debug(q)) + trace_printk("queue rq %d, %s\n", req->internal_tag, + "resource"); return BLK_STS_RESOURCE; } @@ -274,6 +280,9 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, if (mmc_cqe_dcmd_busy(mq)) { mq->cqe_busy |= MMC_CQE_DCMD_BUSY; spin_unlock_irq(q->queue_lock); + if (blk_queue_debug(q)) + trace_printk("queue rq %d, %s\n", req->internal_tag, + "resource"); return BLK_STS_RESOURCE; } break; @@ -326,6 +335,8 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx, break; } + if (blk_queue_debug(q)) + trace_printk("queue rq %d, %d\n", req->internal_tag, (int)ret); if (issued != MMC_REQ_STARTED) { bool put_card = false; @@ -404,6 +415,8 @@ static int mmc_mq_init_queue(struct mmc_queue *mq, int q_depth, mq->queue->queue_lock = lock; mq->queue->queuedata = mq; + blk_queue_flag_set(31, mq->queue); + return 0; free_tag_set: diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 1c828e0e9905..b2b471c2f5f6 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2640,6 +2640,7 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask) return; pr_err("%s: Got command interrupt 0x%08x even though no command operation was in progress.\n", mmc_hostname(host->mmc), (unsigned)intmask); + ftrace_dump(DUMP_ALL); sdhci_dumpregs(host); return; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 050d599f5ea9..29be3a6d5903 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -741,6 +741,7 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q); #define blk_queue_preempt_only(q) \ test_bit(QUEUE_FLAG_PREEMPT_ONLY, &(q)->queue_flags) #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags) +#define blk_queue_debug(q) test_bit(31, &(q)->queue_flags) extern int blk_set_preempt_only(struct request_queue *q); extern void blk_clear_preempt_only(struct request_queue *q); Thanks, Ming