[PATCH 1/1] blk-mq: fix hang caused by freeze/unfreeze sequence

A long time ago a similar fix was proposed by Akinobu Mita [1], but
it seems that back then the preference was to fix this subtle race
inside percpu-refcount itself, and Tejun Heo [2] made an attempt
(as far as I can see, that patchset was never applied).

What follows is a description of a queue hang: the fix is the same,
but the bug is seen from another angle.

The hang happens on queue freeze when blk_mq_freeze_queue() and
blk_mq_unfreeze_queue() are called simultaneously from different
threads: because the freeze depth is only an atomic counter,
percpu_ref_kill() and percpu_ref_reinit() can end up swapped in
order, as the interleaving below (and the pre-patch snippet after
it) shows:

 CPU#0             CPU#1
 ----------------  -----------------
 percpu_ref_kill()
 
                   percpu_ref_kill() << atomic reference does not
 percpu_ref_reinit()                 << guarantee the order

                   blk_mq_freeze_queue_wait() << HANG HERE

                   percpu_ref_reinit()
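
For context, this is the pre-patch code (the '-' lines in the diff
below) with the racy window annotated; only the depth counter
transitions are atomic, not the percpu_ref calls that follow them:

  void blk_mq_freeze_queue_start(struct request_queue *q)
  {
          int freeze_depth;

          freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
          if (freeze_depth == 1) {
                  percpu_ref_kill(&q->q_usage_counter);
                  blk_mq_run_hw_queues(q, false);
          }
  }

  void blk_mq_unfreeze_queue(struct request_queue *q)
  {
          int freeze_depth;

          freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
          WARN_ON_ONCE(freeze_depth < 0);
          if (!freeze_depth) {
                  /*
                   * Between the 1 -> 0 transition above and the reinit
                   * below another CPU can do a 0 -> 1 transition and
                   * call percpu_ref_kill() on the already-dead ref; the
                   * delayed reinit then puts the ref back into PERCPU
                   * mode under the new freezer's feet.
                   */
                  percpu_ref_reinit(&q->q_usage_counter);
                  wake_up_all(&q->mq_freeze_wq);
          }
  }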

First, this wrong sequence raises two kernel warnings:

  1st. WARNING at lib/percpu-refcount.c:309
       percpu_ref_kill_and_confirm called more than once

  2nd. WARNING at lib/percpu-refcount.c:331

But the most unpleasant effect is a hang in blk_mq_freeze_queue_wait(),
which waits for q_usage_counter to reach zero; that never happens,
because the percpu-ref has already been reinited by the other CPU and
stays in PERCPU state forever.
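
For reference (not part of this patch), the wait side in this kernel
looks roughly like the snippet below; percpu_ref_is_zero() can only
return true for a ref in ATOMIC mode, so a ref left sitting in PERCPU
mode blocks the waiter indefinitely:

  void blk_mq_freeze_queue_wait(struct request_queue *q)
  {
          wait_event(q->mq_freeze_wq,
                     percpu_ref_is_zero(&q->q_usage_counter));
  }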

The simplified sequence above can be reproduced with shared tags, when
one queue is being torn down while another one is initializing (a rough
sketch of the tag-set side follows the diagram):

 CPU#0                           CPU#1
 ------------------------------- ------------------------------------
 q1 = blk_mq_init_queue(shared_tags)

                                q2 = blk_mq_init_queue(shared_tags):
                                  blk_mq_add_queue_tag_set(shared_tags):
                                    blk_mq_update_tag_set_depth(shared_tags):
                                      blk_mq_freeze_queue(q1)
 blk_cleanup_queue(q1)                 ...
   blk_mq_freeze_queue(q1)   <<<->>>   blk_mq_unfreeze_queue(q1)
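
The tag-set side that produces the CPU#1 call chain above is, roughly,
a loop over every queue already attached to the shared tag set (sketch
below; the exact signature and the hctx update in the loop body are
simplified). The freeze/unfreeze pair it issues on q1 is what races
with the freeze done by blk_cleanup_queue() on CPU#0:

  static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
  {
          struct request_queue *q;

          list_for_each_entry(q, &set->tag_list, tag_set_list) {
                  /*
                   * q1 is already on set->tag_list, so initializing q2
                   * freezes and unfreezes q1 as well.
                   */
                  blk_mq_freeze_queue(q);
                  /* ... switch the hctxs to/from shared tags here ... */
                  blk_mq_unfreeze_queue(q);
          }
  }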

[1] Message id: 1443287365-4244-7-git-send-email-akinobu.mita@xxxxxxxxx
[2] Message id: 1443563240-29306-6-git-send-email-tj@xxxxxxxxxx

Signed-off-by: Roman Pen <roman.penyaev@xxxxxxxxxxxxxxxx>
Cc: Akinobu Mita <akinobu.mita@xxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Cc: Jens Axboe <axboe@xxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: linux-block@xxxxxxxxxxxxxxx
Cc: linux-kernel@xxxxxxxxxxxxxxx
---
 block/blk-core.c       |  1 +
 block/blk-mq.c         | 22 +++++++++++-----------
 include/linux/blkdev.h |  7 ++++++-
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index ef78848..01dcb02 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -740,6 +740,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	__set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
 
 	init_waitqueue_head(&q->mq_freeze_wq);
+	mutex_init(&q->mq_freeze_lock);
 
 	/*
 	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6d6f8fe..1f3e81b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -80,13 +80,13 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 
 void blk_mq_freeze_queue_start(struct request_queue *q)
 {
-	int freeze_depth;
-
-	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
-	if (freeze_depth == 1) {
+	mutex_lock(&q->mq_freeze_lock);
+	if (++q->mq_freeze_depth == 1) {
 		percpu_ref_kill(&q->q_usage_counter);
+		mutex_unlock(&q->mq_freeze_lock);
 		blk_mq_run_hw_queues(q, false);
-	}
+	} else
+		mutex_unlock(&q->mq_freeze_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
 
@@ -124,14 +124,14 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	int freeze_depth;
-
-	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
-	WARN_ON_ONCE(freeze_depth < 0);
-	if (!freeze_depth) {
+	mutex_lock(&q->mq_freeze_lock);
+	q->mq_freeze_depth--;
+	WARN_ON_ONCE(q->mq_freeze_depth < 0);
+	if (!q->mq_freeze_depth) {
 		percpu_ref_reinit(&q->q_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
+	mutex_unlock(&q->mq_freeze_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 
@@ -2105,7 +2105,7 @@ void blk_mq_free_queue(struct request_queue *q)
 static void blk_mq_queue_reinit(struct request_queue *q,
 				const struct cpumask *online_mask)
 {
-	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
+	WARN_ON_ONCE(!q->mq_freeze_depth);
 
 	blk_mq_sysfs_unregister(q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f6ff9d1..d692c16 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -445,7 +445,7 @@ struct request_queue {
 	struct mutex		sysfs_lock;
 
 	int			bypass_depth;
-	atomic_t		mq_freeze_depth;
+	int			mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
 	bsg_job_fn		*bsg_job_fn;
@@ -459,6 +459,11 @@ struct request_queue {
 #endif
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
+	/*
+	 * Protect concurrent access to q_usage_counter by
+	 * percpu_ref_kill() and percpu_ref_reinit().
+	 */
+	struct mutex		mq_freeze_lock;
 	struct percpu_ref	q_usage_counter;
 	struct list_head	all_q_node;
 
-- 
2.9.0
