The behind idea is simple: 1) for none scheduler, driver tag has to be borrowed for flush rq, otherwise we may run out of tag, and IO hang is caused. And get/put driver tag is actually noop, so reorder tags isn't necessary at all. 2) for real I/O scheduler, we needn't to allocate driver tag beforehand for flush rq, and it works just fine to follow the way for normal requests: allocate driver tag for each rq just before calling .queue_rq(). One visible change to driver is that the driver tag isn't shared in the flush request sequence, that won't be a problem, since we always do that in legacy path. Then flush rq needn't to be treated specially wrt. get/put driver tag, code gets cleanup much, such as, reorder_tags_to_front() is removed, and we needn't to worry about request order in dispatch list for avoiding I/O deadlock. Also we have to put the driver tag before requeueing. Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx> --- block/blk-flush.c | 35 ++++++++++++++++++++++++++--------- block/blk-mq-sched.c | 42 +++++------------------------------------- block/blk-mq.c | 41 ++++++----------------------------------- 3 files changed, 37 insertions(+), 81 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index a9773d2075ac..f17170675917 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; + if (!q->elevator) { + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); + flush_rq->tag = -1; + } else { + blk_mq_put_driver_tag_hctx(hctx, flush_rq); + flush_rq->internal_tag = -1; + } } running = &fq->flush_queue[fq->flush_running_idx]; @@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) blk_rq_init(q, flush_rq); /* - * Borrow tag from the first request since they can't - * be in flight at the same time. And acquire the tag's - * ownership for flush req. + * In case of none scheduler, borrow tag from the first request + * since they can't be in flight at the same time. And acquire + * the tag's ownership for flush req. + * + * In case of IO scheduler, flush rq need to borrow scheduler tag + * just for cheating put/get driver tag. */ if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; flush_rq->mq_ctx = first_rq->mq_ctx; - flush_rq->tag = first_rq->tag; - fq->orig_rq = first_rq; - hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); + if (!q->elevator) { + fq->orig_rq = first_rq; + flush_rq->tag = first_rq->tag; + hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); + blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); + } else { + flush_rq->internal_tag = first_rq->internal_tag; + } } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; @@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) hctx = blk_mq_map_queue(q, ctx->cpu); + if (q->elevator) { + WARN_ON(rq->tag < 0); + blk_mq_put_driver_tag_hctx(hctx, rq); + } + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 32e4ea2d0a77..eb082f82b470 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -366,29 +366,12 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } - if (has_sched) { + if (has_sched) rq->rq_flags |= RQF_SORTED; - WARN_ON(rq->tag != -1); - } return false; } -/* - * Add flush/fua to the queue. If we fail getting a driver tag, then - * punt to the requeue list. Requeue will re-invoke us from a context - * that's safe to block from. - */ -static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool can_block) -{ - if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { - blk_insert_flush(rq); - blk_mq_run_hw_queue(hctx, true); - } else - blk_mq_add_to_requeue_list(rq, false, true); -} - void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block) { @@ -399,10 +382,12 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, /* flush rq in flush machinery need to be dispatched directly */ if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { - blk_mq_sched_insert_flush(hctx, rq, can_block); - return; + blk_insert_flush(rq); + goto run; } + WARN_ON(e && (rq->tag != -1)); + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) goto run; @@ -429,23 +414,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); struct elevator_queue *e = hctx->queue->elevator; - if (e) { - struct request *rq, *next; - - /* - * We bypass requests that already have a driver tag assigned, - * which should only be flushes. Flushes are only ever inserted - * as single requests, so we shouldn't ever hit the - * WARN_ON_ONCE() below (but let's handle it just in case). - */ - list_for_each_entry_safe(rq, next, list, queuelist) { - if (WARN_ON_ONCE(rq->tag != -1)) { - list_del_init(&rq->queuelist); - blk_mq_sched_bypass_insert(hctx, true, rq); - } - } - } - if (e && e->type->ops.mq.insert_requests) e->type->ops.mq.insert_requests(hctx, list, false); else diff --git a/block/blk-mq.c b/block/blk-mq.c index bf62065a15ec..445c74e0f5e5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -650,6 +650,8 @@ static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_put_driver_tag(rq); + trace_block_rq_requeue(q, rq); wbt_requeue(q->rq_wb, &rq->issue_stat); blk_mq_sched_requeue_request(rq); @@ -993,30 +995,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, return rq->tag != -1; } -/* - * If we fail getting a driver tag because all the driver tags are already - * assigned and on the dispatch list, BUT the first entry does not have a - * tag, then we could deadlock. For that case, move entries with assigned - * driver tags to the front, leaving the set of tagged requests in the - * same order, and the untagged set in the same order. - */ -static bool reorder_tags_to_front(struct list_head *list) -{ - struct request *rq, *tmp, *first = NULL; - - list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { - if (rq == first) - break; - if (rq->tag != -1) { - list_move(&rq->queuelist, list); - if (!first) - first = rq; - } - } - - return first != NULL; -} - static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { @@ -1077,9 +1055,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); if (!blk_mq_get_driver_tag(rq, &hctx, false)) { - if (!queued && reorder_tags_to_front(list)) - continue; - /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. @@ -1135,7 +1110,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, nxt = list_first_entry(list, struct request, queuelist); blk_mq_put_driver_tag(nxt); } - blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; @@ -1704,13 +1678,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (unlikely(is_flush_fua)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - if (q->elevator) { - blk_mq_sched_insert_request(rq, false, true, true, - true); - } else { - blk_insert_flush(rq); - blk_mq_run_hw_queue(data.hctx, true); - } + + /* bypass scheduler for flush rq */ + blk_insert_flush(rq); + blk_mq_run_hw_queue(data.hctx, true); } else if (plug && q->nr_hw_queues == 1) { struct request *last = NULL; -- 2.9.5