[PATCH 6/6] blk-mq: stop allocating new requests and drain in-flight requests before hctx becomes inactive

Before a CPU goes offline, check whether it is the last online CPU of its
hctx. If so, mark the hctx inactive and wait until every IO request
allocated from this hctx has completed. On the allocation side, check
whether the hctx has become inactive while a request is being allocated;
if so, give up that allocation and retry it from a new online CPU.

This guarantees that there is no in-flight IO before the managed IRQ line
is shut down once all CPUs mapped to that line are offline.
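
For reference, the two halves of this mechanism, implemented in the
blk-mq.c hunks below, can be read as the following simplified sketch.
The *_sketch function names are made up for illustration only, and the
q_usage_counter refcounting as well as error propagation are left out;
see the actual hunks for the real code:

/*
 * Offline side: if @cpu is the last online CPU mapped to @hctx, mark the
 * hctx inactive and poll until no request allocated from it remains
 * (blk_mq_tags_has_request() is the tag-iterator helper added below).
 */
static int hctx_offline_sketch(unsigned int cpu, struct blk_mq_hw_ctx *hctx)
{
	if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu ||
	    cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
		return 0;

	/* No new request can be allocated on this hctx from now on */
	set_bit(BLK_MQ_S_INACTIVE, &hctx->state);

	/* Drain: wait until every request tied to this hctx has completed */
	while (blk_mq_tags_has_request(hctx))
		msleep(5);
	return 0;
}

/*
 * Allocation side: if the plain allocation gave up because the hctx went
 * inactive, re-run it via IPI on some online CPU so that an active hctx
 * is picked instead.
 */
static struct request *alloc_retry_sketch(struct request_queue *q,
					  struct bio *bio,
					  struct blk_mq_alloc_data *data)
{
	struct blk_mq_smp_call_info info = { .q = q, .bio = bio, .data = data };
	struct request *rq = __blk_mq_get_request(q, bio, data);

	if (rq || (data->flags & BLK_MQ_REQ_NOWAIT))
		return rq;

	if (smp_call_function_any(cpu_online_mask, __blk_mq_alloc_request,
				  &info, 1))
		return NULL;
	return info.rq;
}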

Cc: John Garry <john.garry@xxxxxxxxxx>
Cc: Bart Van Assche <bvanassche@xxxxxxx>
Cc: Hannes Reinecke <hare@xxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
---
 block/blk-mq-debugfs.c |   1 +
 block/blk-mq-tag.c     |  10 ++++
 block/blk-mq.c         | 122 +++++++++++++++++++++++++++++++++++++----
 include/linux/blk-mq.h |   3 +
 4 files changed, 125 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 6d40eff04524..15df3a36e9fa 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(STOPPED),
 	HCTX_STATE_NAME(TAG_ACTIVE),
 	HCTX_STATE_NAME(SCHED_RESTART),
+	HCTX_STATE_NAME(INACTIVE),
 };
 #undef HCTX_STATE_NAME
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 0ae79ca6e2da..c8e34c1f547f 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -184,6 +184,16 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	sbitmap_finish_wait(bt, ws, &wait);
 
 found_tag:
+	/*
+	 * We are in the middle of a request allocation; check whether the
+	 * current hctx has become inactive.  If so, give up this allocation
+	 * and retry on another online CPU.
+	 */
+	if (data->preempt_disabled && unlikely(test_bit(BLK_MQ_S_INACTIVE,
+					&data->hctx->state))) {
+		blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
+		return BLK_MQ_TAG_FAIL;
+	}
 	return tag + tag_offset;
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 18999198a37f..0551aa1740ec 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -40,9 +40,9 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static struct request *blk_mq_get_request(struct request_queue *q,
-					  struct bio *bio,
-					  struct blk_mq_alloc_data *data);
+static struct request *__blk_mq_get_request(struct request_queue *q,
+					    struct bio *bio,
+					    struct blk_mq_alloc_data *data);
 
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
@@ -338,18 +338,19 @@ struct blk_mq_smp_call_info {
 	struct request_queue *q;
 	struct blk_mq_alloc_data *data;
 	struct request *rq;
+	struct bio *bio;
 };
 
 static void __blk_mq_alloc_request(void *alloc_info)
 {
 	struct blk_mq_smp_call_info *info = alloc_info;
 
-	info->rq = blk_mq_get_request(info->q, NULL, info->data);
+	info->rq = __blk_mq_get_request(info->q, info->bio, info->data);
 }
 
-static struct request *blk_mq_get_request(struct request_queue *q,
-					  struct bio *bio,
-					  struct blk_mq_alloc_data *data)
+static struct request *__blk_mq_get_request(struct request_queue *q,
+					    struct bio *bio,
+					    struct blk_mq_alloc_data *data)
 {
 	struct elevator_queue *e = q->elevator;
 	struct request *rq;
@@ -413,6 +414,34 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 	return rq;
 }
 
+static struct request *blk_mq_get_request(struct request_queue *q,
+					  struct bio *bio,
+					  struct blk_mq_alloc_data *data)
+{
+	int ret;
+	struct blk_mq_smp_call_info info;
+	struct request *rq = __blk_mq_get_request(q, bio, data);
+
+	if (rq)
+		return rq;
+
+	if (data->flags & BLK_MQ_REQ_NOWAIT)
+		return NULL;
+
+	/*
+	 * The allocation failed because the hctx is inactive, so retry it
+	 * on an online CPU.
+	 */
+	info.q = q;
+	info.bio = bio;
+	info.data = data;
+
+	ret = smp_call_function_any(cpu_online_mask, __blk_mq_alloc_request, &info, 1);
+	if (ret)
+		return ERR_PTR(ret);
+	return info.rq;
+}
+
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 		blk_mq_req_flags_t flags)
 {
@@ -2316,13 +2345,81 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	return -ENOMEM;
 }
 
-static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+struct rq_iter_data {
+	bool has_rq;
+	struct blk_mq_hw_ctx *hctx;
+};
+
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
 {
-	return 0;
+	struct rq_iter_data *iter_data = data;
+
+	if (rq->mq_hctx == iter_data->hctx) {
+		iter_data->has_rq = true;
+		return false;
+	}
+
+	return true;
+}
+
+static bool blk_mq_tags_has_request(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->sched_tags ?: hctx->tags;
+	struct rq_iter_data data = {
+		.hctx	= hctx,
+	};
+
+	blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+	return data.has_rq;
+}
+
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
+		struct blk_mq_hw_ctx *hctx)
+{
+	if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+		return false;
+	if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+		return false;
+	return true;
 }
 
 static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
 {
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+	struct request_queue *q = hctx->queue;
+
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
+	if (!blk_mq_last_cpu_in_hctx(cpu, hctx))
+		return 0;
+
+	/* Prevent new requests from being allocated on the current hctx/CPU */
+	set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+
+	/*
+	 * Grab one refcount to avoid an I/O scheduler switch, and return
+	 * immediately if the queue has been frozen.
+	 */
+	if (!percpu_ref_tryget(&q->q_usage_counter))
+		return 0;
+
+	/* wait until all requests in this hctx are gone */
+	while (blk_mq_tags_has_request(hctx))
+		msleep(5);
+
+	percpu_ref_put(&q->q_usage_counter);
+	return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+
+	if (cpumask_test_cpu(cpu, hctx->cpumask))
+		clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
 	return 0;
 }
 
@@ -2333,12 +2430,15 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
  */
 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 {
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_dead);
 	struct blk_mq_ctx *ctx;
 	LIST_HEAD(tmp);
 	enum hctx_type type;
 
-	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
 	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
 	type = hctx->type;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 682ce79e8973..e3f6608346b5 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -407,6 +407,9 @@ enum {
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
 
+	/* hw queue is inactive after all its CPUs become offline */
+	BLK_MQ_S_INACTIVE	= 3,
+
 	BLK_MQ_MAX_DEPTH	= 10240,
 
 	BLK_MQ_CPU_WORK_BATCH	= 8,
-- 
2.25.2



