[PATCH 3/3] block: fix the lock inversion caused by kobject_del under sysfs_lock

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Currently, the kobject_del for sysfs mq and hctx entry is invoked
under q->sysfs_lock. lock inversion will came up when someone try
to access the sysfs entries concurrently. The scenario is as below:
  hold q->sysfs_lock    access mq,hctx sysfs entries
  kobject_del
  __kernfs_remove          kernfs_get_active
    kernfs_drain
     wait kn->active       require q->sysfs_lock
To fix this issue, we do as following:
 - still reserve q->sysfs_lock as the sync method between queue, mq
   hctx sysfs entries' show and store method.
 - use QUEUE_FLAG_REGISTERED to sync the blk_register/unregister_queue
   with sysfs entries.
 - use QUEUE_FLAG_REGISTERED to sync the blk_mq_register/unregister_dev
   with blk_mq_sysfs_register/unregister.
 - change q->mq_sysfs_init_done to q->mq_sysfs_ready to sync
   blk_mq_register/unregister_dev and blk_mq_sysfs_register/unregister
   with mq,hctx entries.
Then we don't need sysfs_lock on kobject_del anymore.

Signed-off-by: Jianchao Wang <jianchao.w.wang@xxxxxxxxxx>
---
 block/blk-mq-sysfs.c   | 66 +++++++++++++++++++++++++++++++++++++-------------
 block/blk-sysfs.c      | 39 +++++++++++++++--------------
 block/blk-wbt.c        |  4 ---
 block/elevator.c       |  3 ++-
 include/linux/blkdev.h |  2 +-
 5 files changed, 73 insertions(+), 41 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index ec26745..0923c2c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -55,7 +55,9 @@ static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
 
 	res = -ENOENT;
 	mutex_lock(&q->sysfs_lock);
-	if (!blk_queue_dying(q))
+	if (!blk_queue_dying(q) &&
+	    blk_queue_registered(q) &&
+	    q->mq_sysfs_ready)
 		res = entry->show(ctx, page);
 	mutex_unlock(&q->sysfs_lock);
 	return res;
@@ -78,7 +80,9 @@ static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
 
 	res = -ENOENT;
 	mutex_lock(&q->sysfs_lock);
-	if (!blk_queue_dying(q))
+	if (!blk_queue_dying(q) &&
+	    blk_queue_registered(q) &&
+	    q->mq_sysfs_ready)
 		res = entry->store(ctx, page, length);
 	mutex_unlock(&q->sysfs_lock);
 	return res;
@@ -101,7 +105,9 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
 
 	res = -ENOENT;
 	mutex_lock(&q->sysfs_lock);
-	if (!blk_queue_dying(q))
+	if (!blk_queue_dying(q) &&
+	    blk_queue_registered(q) &&
+	    q->mq_sysfs_ready)
 		res = entry->show(hctx, page);
 	mutex_unlock(&q->sysfs_lock);
 	return res;
@@ -125,7 +131,9 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
 
 	res = -ENOENT;
 	mutex_lock(&q->sysfs_lock);
-	if (!blk_queue_dying(q))
+	if (!blk_queue_dying(q) &&
+	    blk_queue_registered(q) &&
+	    q->mq_sysfs_ready)
 		res = entry->store(hctx, page, length);
 	mutex_unlock(&q->sysfs_lock);
 	return res;
@@ -253,7 +261,9 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
-	lockdep_assert_held(&q->sysfs_lock);
+	mutex_lock(&q->sysfs_lock);
+	q->mq_sysfs_ready = false;
+	mutex_unlock(&q->sysfs_lock);
 
 	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_unregister_hctx(hctx);
@@ -262,7 +272,6 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 	kobject_del(&q->mq_kobj);
 	kobject_put(&dev->kobj);
 
-	q->mq_sysfs_init_done = false;
 }
 
 void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
@@ -295,13 +304,22 @@ void blk_mq_sysfs_init(struct request_queue *q)
 	}
 }
 
+/*
+ * blk_mq_register_dev/blk_mq_unregister_dev are only invoked by
+ * blk_register_queue/blk_unregister_queue. So we could use
+ * QUEUE_FLAG_REGISTERED to sync with blk_mq_sysfs_register/unregister.
+ * If QUEUE_FLAG_REGISTERED is set, q->mq_kobj is ok, otherwise, it is
+ * not there.
+ * blk_mq_register/unregister_dev blk_mq_sysfs_register/unregister all
+ * use q->mq_sysfs_ready to sync with mq and hctx sysfs entries' store
+ * and show method.
+ */
 int blk_mq_register_dev(struct device *dev, struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int ret, i;
 
 	WARN_ON_ONCE(!q->kobj.parent);
-	lockdep_assert_held(&q->sysfs_lock);
 
 	ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
 	if (ret < 0)
@@ -315,7 +333,9 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
 			goto unreg;
 	}
 
-	q->mq_sysfs_init_done = true;
+	mutex_lock(&q->sysfs_lock);
+	q->mq_sysfs_ready = true;
+	mutex_unlock(&q->sysfs_lock);
 
 out:
 	return ret;
@@ -335,15 +355,20 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
+	/*
+	 * If QUEUE_FLAG_REGISTERED is not set, q->mq_kobj
+	 * is not ready.
+	 */
 	mutex_lock(&q->sysfs_lock);
-	if (!q->mq_sysfs_init_done)
-		goto unlock;
+	if (!blk_queue_registered(q)) {
+		mutex_unlock(&q->sysfs_lock);
+		return;
+	}
+	q->mq_sysfs_ready = false;
+	mutex_unlock(&q->sysfs_lock);
 
 	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_unregister_hctx(hctx);
-
-unlock:
-	mutex_unlock(&q->sysfs_lock);
 }
 
 int blk_mq_sysfs_register(struct request_queue *q)
@@ -351,9 +376,16 @@ int blk_mq_sysfs_register(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	int i, ret = 0;
 
+	/*
+	 * If QUEUE_FLAG_REGISTERED is not set, q->mq_kobj
+	 * is not ready.
+	 */
 	mutex_lock(&q->sysfs_lock);
-	if (!q->mq_sysfs_init_done)
-		goto unlock;
+	if (!blk_queue_registered(q)) {
+		mutex_unlock(&q->sysfs_lock);
+		return ret;
+	}
+	mutex_unlock(&q->sysfs_lock);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		ret = blk_mq_register_hctx(hctx);
@@ -361,8 +393,8 @@ int blk_mq_sysfs_register(struct request_queue *q)
 			break;
 	}
 
-unlock:
+	mutex_lock(&q->sysfs_lock);
+	q->mq_sysfs_ready = true;
 	mutex_unlock(&q->sysfs_lock);
-
 	return ret;
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 96dcbb9..a3ef681 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -741,7 +741,8 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 	if (!entry->show)
 		return -EIO;
 	mutex_lock(&q->sysfs_lock);
-	if (blk_queue_dying(q)) {
+	if (blk_queue_dying(q) ||
+	    !blk_queue_registered(q)) {
 		mutex_unlock(&q->sysfs_lock);
 		return -ENOENT;
 	}
@@ -763,7 +764,8 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 
 	q = container_of(kobj, struct request_queue, kobj);
 	mutex_lock(&q->sysfs_lock);
-	if (blk_queue_dying(q)) {
+	if (blk_queue_dying(q) ||
+	    !blk_queue_registered(q)) {
 		mutex_unlock(&q->sysfs_lock);
 		return -ENOENT;
 	}
@@ -866,7 +868,6 @@ int blk_register_queue(struct gendisk *disk)
 	WARN_ONCE(blk_queue_registered(q),
 		  "%s is registering an already registered queue\n",
 		  kobject_name(&dev->kobj));
-	queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);
 
 	/*
 	 * SCSI probing may synchronously create and destroy a lot of
@@ -887,17 +888,16 @@ int blk_register_queue(struct gendisk *disk)
 	if (ret)
 		return ret;
 
-	/* Prevent changes through sysfs until registration is completed. */
-	mutex_lock(&q->sysfs_lock);
-
 	ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
 	if (ret < 0) {
 		blk_trace_remove_sysfs(dev);
-		goto unlock;
+		return ret;
 	}
 
 	if (q->mq_ops) {
-		blk_mq_register_dev(dev, q);
+		ret = blk_mq_register_dev(dev, q);
+		if (ret)
+			goto err_kobj;
 		blk_mq_debugfs_register(q);
 	}
 
@@ -907,20 +907,25 @@ int blk_register_queue(struct gendisk *disk)
 
 	blk_throtl_register_queue(q);
 
+	/* Prevent changes through sysfs until registration is completed. */
+	mutex_lock(&q->sysfs_lock);
 	if (q->request_fn || (q->mq_ops && q->elevator)) {
 		ret = elv_register_queue(q);
 		if (ret) {
 			mutex_unlock(&q->sysfs_lock);
-			kobject_uevent(&q->kobj, KOBJ_REMOVE);
-			kobject_del(&q->kobj);
-			blk_trace_remove_sysfs(dev);
-			kobject_put(&dev->kobj);
-			return ret;
+			goto err_kobj;
 		}
 	}
-	ret = 0;
-unlock:
+
+	blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
 	mutex_unlock(&q->sysfs_lock);
+	return 0;
+
+err_kobj:
+	kobject_uevent(&q->kobj, KOBJ_REMOVE);
+	kobject_del(&q->kobj);
+	blk_trace_remove_sysfs(dev);
+	kobject_put(&dev->kobj);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blk_register_queue);
@@ -949,16 +954,14 @@ void blk_unregister_queue(struct gendisk *disk)
 	 * concurrent elv_iosched_store() calls.
 	 */
 	mutex_lock(&q->sysfs_lock);
-
 	blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
-
+	mutex_unlock(&q->sysfs_lock);
 	/*
 	 * Remove the sysfs attributes before unregistering the queue data
 	 * structures that can be modified through sysfs.
 	 */
 	if (q->mq_ops)
 		blk_mq_unregister_dev(disk_to_dev(disk), q);
-	mutex_unlock(&q->sysfs_lock);
 
 	kobject_uevent(&q->kobj, KOBJ_REMOVE);
 	kobject_del(&q->kobj);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 43ae265..c26deca 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -706,10 +706,6 @@ void wbt_enable_default(struct request_queue *q)
 	if (q->rq_wb)
 		return;
 
-	/* Queue not registered? Maybe shutting down... */
-	if (!blk_queue_registered(q))
-		return;
-
 	if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
 	    (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
 		wbt_init(q);
diff --git a/block/elevator.c b/block/elevator.c
index a574841..a68e3e6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -869,7 +869,8 @@ void elv_unregister_queue(struct request_queue *q)
 		kobject_del(&e->kobj);
 		e->registered = 0;
 		/* Re-enable throttling in case elevator disabled it */
-		wbt_enable_default(q);
+		if (blk_queue_registered(q))
+			wbt_enable_default(q);
 	}
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d6174ed..df53e8f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -666,7 +666,7 @@ struct request_queue {
 	struct dentry		*sched_debugfs_dir;
 #endif
 
-	bool			mq_sysfs_init_done;
+	bool			mq_sysfs_ready;
 
 	size_t			cmd_size;
 	void			*rq_alloc_data;
-- 
2.7.4




[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux