[PATCH] block: mq flush: fix race between IPI handler and mq flush worker

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



In 'struct request', both 'csd' and 'requeue_work' are
defined inside one union, unfortunately, the below race
can be triggered:

	- one PREFLUSH request may be queued inside the previous
	POSTFLUSH request's IPI handler, and both the two requests
	share one request instance(q->flush_rq)

	- IPI interrupt for the previous POSTFLUSH request comes, and
	generic_smp_call_function_single_interrupt() is called to run
	the remote complete handler by csd->func();

	- the POSTFLUSH flush request's end_io callback(flush_end_io)
	is called

	- finally inside the path, blk_flush_queue_rq() calls
	INIT_WORK(&rq->requeue_work,...) and schedule the work for
	queuing a new PREFLUSH request

	- csd->func(csd->info) returns, and csd_unlock(csd) is called,
	see generic_smp_call_function_single_interrupt()

	- then the initialized rq->requeue_work is changed by
	csd_unlock(): the lowerest bit of requeue_work.func is
	observed that it was cleared, and function address can
	be unaligned on x86.

	- the work func of mq_flush_run() will never be run

	- fs sync hangs forever

This patch introduces one exclusive work_struct inside
request_queue for queuing flush request only to fix the
problem.

Cc: <stable@xxxxxxxxxxxxxxx>  #3.14
Signed-off-by: Ming Lei <tom.leiming@xxxxxxxxx>
---
Another simple fix is to disable ipi for flush request, but looks
this one should be better.

 block/blk-flush.c      |   43 +++++++++++++++++++++++++++++++++++++++----
 include/linux/blkdev.h |    1 +
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index ec7a224..42e8a34 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,21 +130,56 @@ static void blk_flush_restore_request(struct request *rq)
 	blk_clear_rq_complete(rq);
 }
 
+static void __mq_flush_run(struct request *rq)
+{
+	memset(&rq->csd, 0, sizeof(rq->csd));
+	blk_mq_insert_request(rq, false, true, false);
+}
+
 static void mq_flush_run(struct work_struct *work)
 {
+	struct request_queue *q;
+	struct request *rq;
+
+	q = container_of(work, struct request_queue, flush_work);
+	rq = q->flush_rq;
+
+	__mq_flush_run(rq);
+}
+
+static void mq_data_flush_run(struct work_struct *work)
+{
 	struct request *rq;
 
 	rq = container_of(work, struct request, requeue_work);
 
-	memset(&rq->csd, 0, sizeof(rq->csd));
-	blk_mq_insert_request(rq, false, true, false);
+	__mq_flush_run(rq);
 }
 
 static bool blk_flush_queue_rq(struct request *rq, bool add_front)
 {
 	if (rq->q->mq_ops) {
-		INIT_WORK(&rq->requeue_work, mq_flush_run);
-		kblockd_schedule_work(&rq->requeue_work);
+		struct work_struct *wk;
+		work_func_t func;
+
+		/*
+		 * Flush request need use its own work_struct
+		 * instead of rq->requeue_work because it
+		 * will be changed by csd_unlock() in future
+		 * if the request(PREFLUSH) is being queued
+		 * from previous POSTFLUSH request's IPI
+		 * handler context.
+		 */
+		if (rq == rq->q->flush_rq) {
+			wk = &rq->q->flush_work;
+			func = mq_flush_run;
+		} else {
+			wk = &rq->requeue_work;
+			func = mq_data_flush_run;
+		}
+
+		INIT_WORK(wk, func);
+		kblockd_schedule_work(wk);
 		return false;
 	} else {
 		if (add_front)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1f1ecc7..4c8f24c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -461,6 +461,7 @@ struct request_queue {
 	struct list_head	flush_queue[2];
 	struct list_head	flush_data_in_flight;
 	struct request		*flush_rq;
+	struct work_struct	flush_work;
 	spinlock_t		mq_flush_lock;
 
 	struct mutex		sysfs_lock;
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]