On 01/25/2017 10:03 AM, Jens Axboe wrote: > On 01/25/2017 09:57 AM, Hannes Reinecke wrote: >> On 01/25/2017 04:52 PM, Jens Axboe wrote: >>> On 01/25/2017 04:10 AM, Hannes Reinecke wrote: >> [ .. ] >>>> Bah. >>>> >>>> Not quite. I'm still seeing some queues with state 'restart'. >>>> >>>> I've found that I need another patch on top of that: >>>> >>>> diff --git a/block/blk-mq.c b/block/blk-mq.c >>>> index e872555..edcbb44 100644 >>>> --- a/block/blk-mq.c >>>> +++ b/block/blk-mq.c >>>> @@ -753,8 +754,10 @@ static void blk_mq_timeout_work(struct work_struct >>>> *work) >>>> >>>> queue_for_each_hw_ctx(q, hctx, i) { >>>> /* the hctx may be unmapped, so check it here */ >>>> - if (blk_mq_hw_queue_mapped(hctx)) >>>> + if (blk_mq_hw_queue_mapped(hctx)) { >>>> blk_mq_tag_idle(hctx); >>>> + blk_mq_sched_restart(hctx); >>>> + } >>>> } >>>> } >>>> blk_queue_exit(q); >>>> >>>> >>>> Reasoning is that in blk_mq_get_tag() we might end up scheduling the >>>> request on another hctx, but the original hctx might still have the >>>> SCHED_RESTART bit set. >>>> Which will never cleared as we complete the request on a different hctx, >>>> so anything we do on the end_request side won't do us any good. >>> >>> I think you are right, it'll potentially trigger with shared tags and >>> multiple hardware queues. I'll debug this today and come up with a >>> decent fix. >>> >>> I committed the previous patch, fwiw. >>> >> THX. >> >> The above patch _does_ help in the sense that my testcase now completes >> without stalls. And I even get a decent performance with the mq-sched >> fixes: 82k IOPs sequential read with mq-deadline as compared to 44k IOPs >> when running without I/O scheduling. >> Still some way off from the 132k IOPs I'm getting with CFQ, but we're >> getting there. >> >> However, I do get a noticeable stall during the stonewall sequence >> before the timeout handler kicks in, so the must be a better way for >> handling this. >> >> But nevertheless, thanks for all your work here. >> Very much appreciated. > > Yeah, the fix isn't really a fix, unless you are willing to tolerate > potentially tens of seconds of extra latency until we idle it out :-) > > So we can't use the un-idling for this, but we can track it on the > shared state, which is the tags. The problem isn't that we are > switching to a new hardware queue, it's if we mark the hardware queue > as restart AND it has nothing pending. In that case, we'll never > get it restarted, since IO completion is what restarts it. > > I need to handle that case separately. Currently testing a patch, I > should have something for you to test later today. Can you try this one? diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index d05061f..6a1656d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -300,6 +300,34 @@ bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq) } EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert); +static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +{ + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (blk_mq_hctx_has_pending(hctx)) + blk_mq_run_hw_queue(hctx, true); + } +} + +void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) +{ + unsigned int i; + + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) + blk_mq_sched_restart_hctx(hctx); + else { + struct request_queue *q = hctx->queue; + + if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) + return; + + clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags); + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_sched_restart_hctx(hctx); + } +} + static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 6b465bc..becbc78 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -19,6 +19,7 @@ bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, @@ -123,11 +124,6 @@ blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq) BUG_ON(rq->internal_tag == -1); blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag); - - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - blk_mq_run_hw_queue(hctx, true); - } } static inline void blk_mq_sched_started_request(struct request *rq) @@ -160,8 +156,15 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx) { - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + struct request_queue *q = hctx->queue; + + if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) + set_bit(QUEUE_FLAG_RESTART, &q->queue_flags); + } + } } static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c3e667..3951b72 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -40,7 +40,7 @@ static LIST_HEAD(all_q_list); /* * Check if any of the ctx's have pending work in this hardware queue */ -static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { return sbitmap_any_bit_set(&hctx->ctx_map) || !list_empty_careful(&hctx->dispatch) || @@ -345,6 +345,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) blk_mq_sched_completed_request(hctx, rq); + blk_mq_sched_restart_queues(hctx); blk_queue_exit(q); } @@ -928,8 +929,16 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) if (!blk_mq_get_driver_tag(rq, &hctx, false)) { if (!queued && reorder_tags_to_front(list)) continue; + + /* + * We failed getting a driver tag. Mark the queue(s) + * as needing a restart. Retry getting a tag again, + * in case the needed IO completed right before we + * marked the queue as needing a restart. + */ blk_mq_sched_mark_restart(hctx); - break; + if (!blk_mq_get_driver_tag(rq, &hctx, false)) + break; } list_del_init(&rq->queuelist); diff --git a/block/blk-mq.h b/block/blk-mq.h index 6c24b90..077a400 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -33,6 +33,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); +bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); /* * Internal helpers for allocating/freeing the request map diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ee1fb59..40ce491 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -607,6 +607,7 @@ struct request_queue { #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ +#define QUEUE_FLAG_RESTART 28 #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- Jens Axboe -- To unsubscribe from this list: send the line "unsubscribe linux-block" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html