On 9/29/21 7:15 AM, Ming Lei wrote:
The current blk_mq_quiesce_queue() and blk_mq_unquiesce_queue() stop and
start the queue unconditionally. Concurrent quiesce/unquiesce can come
from different, unrelated code paths, so an unquiesce may arrive
unexpectedly and start the queue too early.
Prepare for supporting nested/concurrent quiesce/unquiesce, so that we can
address the above issue.
NVMe has a very complicated quiesce/unquiesce usage pattern, so add a mutex
and queue stopped state to nvme_ctrl to make sure that quiesce/unquiesce
are called in pairs.
Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
---
drivers/nvme/host/core.c | 51 ++++++++++++++++++++++++++++++++++++----
drivers/nvme/host/nvme.h | 4 ++++
2 files changed, 50 insertions(+), 5 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 23fb746a8970..5d0b2eb38e43 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4375,6 +4375,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
spin_lock_init(&ctrl->lock);
mutex_init(&ctrl->scan_lock);
+ mutex_init(&ctrl->queues_stop_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
xa_init(&ctrl->cels);
init_rwsem(&ctrl->namespaces_rwsem);
@@ -4450,14 +4451,44 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
+static void __nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
+{
+ lockdep_assert_held(&ctrl->queues_stop_lock);
+
+ if (!ctrl->admin_queue_stopped) {
+ blk_mq_quiesce_queue(ctrl->admin_q);
+ ctrl->admin_queue_stopped = true;
+ }
+}
+
+static void __nvme_start_admin_queue(struct nvme_ctrl *ctrl)
+{
+ lockdep_assert_held(&ctrl->queues_stop_lock);
+
+ if (ctrl->admin_queue_stopped) {
+ blk_mq_unquiesce_queue(ctrl->admin_q);
+ ctrl->admin_queue_stopped = false;
+ }
+}
I'd make this a bit we can flip atomically.
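Roughly like the sketch below, assuming a new NVME_CTRL_ADMIN_Q_STOPPED
bit in ctrl->flags (the name is just an example), which would also let us
drop the mutex and the lockdep assert for the admin queue:

	/*
	 * Sketch only: NVME_CTRL_ADMIN_Q_STOPPED is a made-up flag bit,
	 * sitting next to NVME_CTRL_FAILFAST_EXPIRED in ctrl->flags.
	 */
	static void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
	{
		/* only the task that flips 0 -> 1 actually quiesces */
		if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
			blk_mq_quiesce_queue(ctrl->admin_q);
	}

	static void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
	{
		/* only the task that flips 1 -> 0 actually unquiesces */
		if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
			blk_mq_unquiesce_queue(ctrl->admin_q);
	}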
+
static void nvme_start_ns_queue(struct nvme_ns *ns)
{
- blk_mq_unquiesce_queue(ns->queue);
+ lockdep_assert_held(&ns->ctrl->queues_stop_lock);
+
+ if (test_bit(NVME_NS_STOPPED, &ns->flags)) {
+ blk_mq_unquiesce_queue(ns->queue);
+ clear_bit(NVME_NS_STOPPED, &ns->flags);
+ }
}
static void nvme_stop_ns_queue(struct nvme_ns *ns)
{
- blk_mq_quiesce_queue(ns->queue);
+ lockdep_assert_held(&ns->ctrl->queues_stop_lock);
+
+ if (!test_bit(NVME_NS_STOPPED, &ns->flags)) {
+ blk_mq_quiesce_queue(ns->queue);
+ set_bit(NVME_NS_STOPPED, &ns->flags);
+ }
}
Why not use test_and_set_bit/test_and_clear_bit for serialization?
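Something like this, if the atomic flag update alone is enough to keep the
calls paired:

	static void nvme_stop_ns_queue(struct nvme_ns *ns)
	{
		/* first stopper wins; later callers see the bit already set */
		if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
			blk_mq_quiesce_queue(ns->queue);
	}

	static void nvme_start_ns_queue(struct nvme_ns *ns)
	{
		/* only unquiesce if the queue is still marked stopped */
		if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
			blk_mq_unquiesce_queue(ns->queue);
	}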
/*
@@ -4490,16 +4521,18 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
+ mutex_lock(&ctrl->queues_stop_lock);
down_read(&ctrl->namespaces_rwsem);
/* Forcibly unquiesce queues to avoid blocking dispatch */
if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
- nvme_start_admin_queue(ctrl);
+ __nvme_start_admin_queue(ctrl);
list_for_each_entry(ns, &ctrl->namespaces, list)
nvme_set_queue_dying(ns);
up_read(&ctrl->namespaces_rwsem);
+ mutex_unlock(&ctrl->queues_stop_lock);
This extra lock wrapping the namespaces_rwsem is scary. The
ordering rules are not clear to me.
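For what it's worth, the nesting the patch seems to establish here is:

	mutex_lock(&ctrl->queues_stop_lock);
	down_read(&ctrl->namespaces_rwsem);
	/* stop/start queues under both locks */
	up_read(&ctrl->namespaces_rwsem);
	mutex_unlock(&ctrl->queues_stop_lock);

i.e. queues_stop_lock always outside namespaces_rwsem. If the lock stays,
that rule should at least be documented next to its definition.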