We observed that it is possible for a flush to bypass the I/O scheduler
and get added directly to hctx->dispatch in blk_mq_sched_bypass_insert.
This can happen while a kworker is running the blk_mq_do_dispatch_sched
loop called from blk_mq_sched_dispatch_requests.  However,
blk_mq_do_dispatch_sched is not guaranteed to finish in bounded time:
the I/O scheduler can keep feeding an arbitrary number of requests to
the hardware, so the flush can sit in hctx->dispatch indefinitely.

The solution is to periodically poll hctx->dispatch in
blk_mq_do_dispatch_sched, which puts a bound on the latency of the
requests sitting there.  The polling interval is controlled by the new
max_sched_batch queue attribute, which defaults to 100 requests.

Signed-off-by: Salman Qazi <sqazi@xxxxxxxxxx>
---
 block/blk-mq-sched.c   |  6 ++++++
 block/blk-mq.c         |  4 ++++
 block/blk-sysfs.c      | 33 +++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |  2 ++
 4 files changed, 45 insertions(+)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ca22afd47b3d..75cdec64b9c7 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -90,6 +90,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
+	int count = 0;
 
 	do {
 		struct request *rq;
@@ -97,6 +98,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 			break;
 
+		if (count > 0 && count % q->max_sched_batch == 0 &&
+		    !list_empty_careful(&hctx->dispatch))
+			break;
+
 		if (!blk_mq_get_dispatch_budget(hctx))
 			break;
 
@@ -112,6 +117,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		 * in blk_mq_dispatch_rq_list().
 		 */
 		list_add(&rq->queuelist, &rq_list);
+		count++;
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a12b1763508d..7cb13aa72a94 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -40,6 +40,8 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
+#define BLK_MQ_DEFAULT_MAX_SCHED_BATCH 100
+
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
@@ -2934,6 +2936,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	 */
 	q->poll_nsec = BLK_MQ_POLL_CLASSIC;
 
+	q->max_sched_batch = BLK_MQ_DEFAULT_MAX_SCHED_BATCH;
+
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 	blk_mq_add_queue_tag_set(set, q);
 	blk_mq_map_swqueue(q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index fca9b158f4a0..dd7b58a1bd35 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -390,6 +390,32 @@ static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
 	return count;
 }
 
+static ssize_t queue_max_sched_batch_show(struct request_queue *q, char *page)
+{
+	return sprintf(page, "%d\n", q->max_sched_batch);
+}
+
+static ssize_t queue_max_sched_batch_store(struct request_queue *q,
+					   const char *page,
+					   size_t count)
+{
+	int err, val;
+
+	if (!q->mq_ops)
+		return -EINVAL;
+
+	err = kstrtoint(page, 10, &val);
+	if (err < 0)
+		return err;
+
+	if (val <= 0)
+		return -EINVAL;
+
+	q->max_sched_batch = val;
+
+	return count;
+}
+
 static ssize_t queue_poll_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
@@ -691,6 +717,12 @@ static struct queue_sysfs_entry queue_poll_delay_entry = {
 	.store = queue_poll_delay_store,
 };
 
+static struct queue_sysfs_entry queue_max_sched_batch_entry = {
+	.attr = {.name = "max_sched_batch", .mode = 0644 },
+	.show = queue_max_sched_batch_show,
+	.store = queue_max_sched_batch_store,
+};
+
 static struct queue_sysfs_entry queue_wc_entry = {
 	.attr = {.name = "write_cache", .mode = 0644 },
 	.show = queue_wc_show,
@@ -763,6 +795,7 @@ static struct attribute *queue_attrs[] = {
 	&queue_wb_lat_entry.attr,
 	&queue_poll_delay_entry.attr,
 	&queue_io_timeout_entry.attr,
+	&queue_max_sched_batch_entry.attr,
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	&throtl_sample_time_entry.attr,
 #endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 053ea4b51988..68e7d29d4dd4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -477,6 +477,8 @@ struct request_queue {
 	unsigned int		rq_timeout;
 	int			poll_nsec;
 
+	int			max_sched_batch;
+
 	struct blk_stat_callback	*poll_cb;
 	struct blk_rq_stat	poll_stat[BLK_MQ_POLL_STATS_BKTS];
 
-- 
2.25.0.341.g760bfbb309-goog
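
For completeness, below is a minimal userspace sketch (not part of the patch)
of how the new knob could be read and tuned once the patch is applied.  The
device name "sda" and the value 50 are placeholder examples; the attribute
path follows from the queue sysfs entry added above, and writing to sysfs
typically requires root.

/*
 * Illustrative only: read and adjust the max_sched_batch queue attribute
 * introduced by this patch.  Assumes the patch is applied and that "sda"
 * is a block device on the system (substitute your own device).
 */
#include <stdio.h>

#define ATTR "/sys/block/sda/queue/max_sched_batch"

int main(void)
{
	FILE *f;
	int val;

	/* Read the current batch limit (this patch defaults it to 100). */
	f = fopen(ATTR, "r");
	if (!f || fscanf(f, "%d", &val) != 1) {
		perror(ATTR);
		return 1;
	}
	fclose(f);
	printf("current max_sched_batch: %d\n", val);

	/* Lower the limit so hctx->dispatch is checked more often (placeholder value). */
	f = fopen(ATTR, "w");
	if (!f || fprintf(f, "50\n") < 0) {
		perror(ATTR);
		return 1;
	}
	fclose(f);
	return 0;
}

Writing a smaller value makes blk_mq_do_dispatch_sched look at hctx->dispatch
more frequently, trading some dispatch batching for a tighter worst-case
latency on bypassed requests such as flushes.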