We observed that it is possible for a flush to bypass the I/O scheduler
and get added directly to hctx->dispatch in blk_mq_sched_bypass_insert.
This can happen while a kworker is running the blk_mq_do_dispatch_sched
loop called from blk_mq_sched_dispatch_requests.  However,
blk_mq_do_dispatch_sched is not guaranteed to finish in bounded time:
the I/O scheduler can keep feeding an arbitrary number of requests to
the hardware, so the flush can sit in hctx->dispatch indefinitely.

The solution is to periodically poll hctx->dispatch in
blk_mq_do_dispatch_sched, which puts a bound on the latency of the
requests sitting there.  The polling interval is controlled by the new
max_sched_batch queue attribute, which defaults to 100 requests.

Signed-off-by: Salman Qazi <sqazi@xxxxxxxxxx>
---
 block/blk-mq-sched.c   |  6 ++++++
 block/blk-mq.c         |  4 ++++
 block/blk-sysfs.c      | 33 +++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |  2 ++
 4 files changed, 45 insertions(+)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ca22afd47b3d..75cdec64b9c7 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -90,6 +90,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
 	LIST_HEAD(rq_list);
+	int count = 0;
 
 	do {
 		struct request *rq;
@@ -97,6 +98,10 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 			break;
 
+		if (count > 0 && count % q->max_sched_batch == 0 &&
+		    !list_empty_careful(&hctx->dispatch))
+			break;
+
 		if (!blk_mq_get_dispatch_budget(hctx))
 			break;
 
@@ -112,6 +117,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 		 * in blk_mq_dispatch_rq_list().
 		 */
 		list_add(&rq->queuelist, &rq_list);
+		count++;
 	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a12b1763508d..7cb13aa72a94 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -40,6 +40,8 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
+#define BLK_MQ_DEFAULT_MAX_SCHED_BATCH 100
+
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
@@ -2934,6 +2936,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	 */
 	q->poll_nsec = BLK_MQ_POLL_CLASSIC;
 
+	q->max_sched_batch = BLK_MQ_DEFAULT_MAX_SCHED_BATCH;
+
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 	blk_mq_add_queue_tag_set(set, q);
 	blk_mq_map_swqueue(q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index fca9b158f4a0..dd7b58a1bd35 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -390,6 +390,32 @@ static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
 	return count;
 }
 
+static ssize_t queue_max_sched_batch_show(struct request_queue *q, char *page)
+{
+	return sprintf(page, "%d\n", q->max_sched_batch);
+}
+
+static ssize_t queue_max_sched_batch_store(struct request_queue *q,
+					   const char *page,
+					   size_t count)
+{
+	int err, val;
+
+	if (!q->mq_ops)
+		return -EINVAL;
+
+	err = kstrtoint(page, 10, &val);
+	if (err < 0)
+		return err;
+
+	if (val <= 0)
+		return -EINVAL;
+
+	q->max_sched_batch = val;
+
+	return count;
+}
+
 static ssize_t queue_poll_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page);
@@ -691,6 +717,12 @@ static struct queue_sysfs_entry queue_poll_delay_entry = {
 	.store = queue_poll_delay_store,
 };
 
+static struct queue_sysfs_entry queue_max_sched_batch_entry = {
+	.attr = {.name = "max_sched_batch", .mode = 0644 },
+	.show = queue_max_sched_batch_show,
+	.store = queue_max_sched_batch_store,
+};
+
 static struct queue_sysfs_entry queue_wc_entry = {
 	.attr = {.name = "write_cache", .mode = 0644 },
 	.show = queue_wc_show,
@@ -763,6 +795,7 @@ static struct attribute *queue_attrs[] = {
 	&queue_wb_lat_entry.attr,
 	&queue_poll_delay_entry.attr,
 	&queue_io_timeout_entry.attr,
+	&queue_max_sched_batch_entry.attr,
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	&throtl_sample_time_entry.attr,
 #endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 053ea4b51988..68e7d29d4dd4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -477,6 +477,8 @@ struct request_queue {
 	unsigned int		rq_timeout;
 	int			poll_nsec;
 
+	int			max_sched_batch;
+
 	struct blk_stat_callback	*poll_cb;
 	struct blk_rq_stat	poll_stat[BLK_MQ_POLL_STATS_BKTS];
 
-- 
2.25.0.341.g760bfbb309-goog
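
For completeness, below is a minimal userspace sketch (not part of the patch)
of how the new knob could be read and tuned once the patch is applied.  The
device name "sda" and the value 50 are placeholder examples; the attribute
path follows from the queue sysfs entry added above, and writing to sysfs
typically requires root.

/*
 * Illustrative only: read and adjust the max_sched_batch queue attribute
 * introduced by this patch.  Assumes the patch is applied and that "sda"
 * is a block device on the system (substitute your own device).
 */
#include <stdio.h>

#define ATTR "/sys/block/sda/queue/max_sched_batch"

int main(void)
{
	FILE *f;
	int val;

	/* Read the current batch limit (this patch defaults it to 100). */
	f = fopen(ATTR, "r");
	if (!f || fscanf(f, "%d", &val) != 1) {
		perror(ATTR);
		return 1;
	}
	fclose(f);
	printf("current max_sched_batch: %d\n", val);

	/* Lower the limit so hctx->dispatch is checked more often (placeholder value). */
	f = fopen(ATTR, "w");
	if (!f || fprintf(f, "50\n") < 0) {
		perror(ATTR);
		return 1;
	}
	fclose(f);
	return 0;
}

Writing a smaller value makes blk_mq_do_dispatch_sched look at hctx->dispatch
more frequently, trading some dispatch batching for a tighter worst-case
latency on bypassed requests such as flushes.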