I think we've always known it's possible to lose a request during
timeout handling, but just accepted that possibility. It seems to be
causing problems, though, leading to unnecessary error escalation and
IO failures.

The possibility arises when the block layer marks the request complete
prior to running the timeout handler. If that request happens to
complete while the handler is running, the request will be lost,
inevitably triggering a second timeout.

This patch attempts to shorten the window for this race condition by
clearing the started flag when the driver completes a request. The
block layer's timeout handler will then complete the command if it
observes the started flag is no longer set.

Note it's possible to lose the command even with this patch. It's just
less likely to happen.

Signed-off-by: Keith Busch <keith.busch@xxxxxxxxx>
---
 block/blk-mq.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 98a1860..37144ef 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -566,6 +566,7 @@ void blk_mq_complete_request(struct request *rq)
 
 	if (unlikely(blk_should_fake_timeout(q)))
 		return;
+	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	if (!blk_mark_rq_complete(rq))
 		__blk_mq_complete_request(rq);
 }
@@ -605,19 +606,19 @@ void blk_mq_start_request(struct request *rq)
 	 * complete. So be sure to clear complete again when we start
 	 * the request, otherwise we'll ignore the completion event.
 	 */
-	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 		set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+		if (q->dma_drain_size && blk_rq_bytes(rq)) {
+			/*
+			 * Make sure space for the drain appears. We know we can do
+			 * this because max_hw_segments has been adjusted to be one
+			 * fewer than the device can handle.
+			 */
+			rq->nr_phys_segments++;
+		}
+	}
 	if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
 		clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
-
-	if (q->dma_drain_size && blk_rq_bytes(rq)) {
-		/*
-		 * Make sure space for the drain appears. We know we can do
-		 * this because max_hw_segments has been adjusted to be one
-		 * fewer than the device can handle.
-		 */
-		rq->nr_phys_segments++;
-	}
 }
 EXPORT_SYMBOL(blk_mq_start_request);
 
@@ -637,11 +638,6 @@ static void __blk_mq_requeue_request(struct request *rq)
 	trace_block_rq_requeue(q, rq);
 	wbt_requeue(q->rq_wb, &rq->issue_stat);
 	blk_mq_sched_requeue_request(rq);
-
-	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
-		if (q->dma_drain_size && blk_rq_bytes(rq))
-			rq->nr_phys_segments--;
-	}
 }
 
 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
@@ -763,10 +759,15 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
 		__blk_mq_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
-		blk_add_timer(req);
-		blk_clear_rq_complete(req);
-		break;
+		if (test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) {
+			blk_add_timer(req);
+			blk_clear_rq_complete(req);
+			break;
+		}
+		/* Fall through */
 	case BLK_EH_NOT_HANDLED:
+		if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
+			__blk_mq_complete_request(req);
 		break;
 	default:
 		printk(KERN_ERR "block: bad eh return: %d\n", ret);
-- 
2.5.5
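
Purely as an illustration of the race the changelog describes (not kernel
code), here is a small user-space sketch that models REQ_ATOM_STARTED and
REQ_ATOM_COMPLETE with C11 atomics. The struct and function names are made
up for the example; the point is only to show why clearing the started flag
in the completion path lets the timeout handler finish a request whose
driver completion it would otherwise swallow.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct request {
	atomic_bool started;	/* stands in for REQ_ATOM_STARTED  */
	atomic_bool complete;	/* stands in for REQ_ATOM_COMPLETE */
};

/*
 * Driver completion path, mirroring the change to blk_mq_complete_request()
 * above: clear "started" first, then try to claim "complete".
 */
static void driver_complete(struct request *rq)
{
	atomic_store(&rq->started, false);
	if (!atomic_exchange(&rq->complete, true))
		printf("driver finished the request\n");
	else
		printf("timeout path owns it; driver completion dropped\n");
}

/*
 * Timeout path, mirroring the BLK_EH_RESET_TIMER/BLK_EH_NOT_HANDLED
 * handling: the handler runs with "complete" already set.
 */
static void timeout_handler(struct request *rq)
{
	if (atomic_load(&rq->started)) {
		/* Still in flight: re-arm and hand it back to the driver. */
		atomic_store(&rq->complete, false);
		printf("timer reset\n");
	} else {
		/* Driver raced in while we ran: finish it here, not lost. */
		printf("handler completes the request\n");
	}
}

int main(void)
{
	/* Model the window: timeout code has already marked it complete. */
	struct request rq = { true, true };

	driver_complete(&rq);	/* driver completion arrives mid-handler */
	timeout_handler(&rq);	/* handler sees started was cleared */
	return 0;
}

In the scenario in main(), the driver's completion loses the race for the
complete bit and is dropped, but because started was cleared first, the
timeout handler notices and completes the request itself instead of
re-arming the timer, which is the narrowed window the patch is after.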