On 6/6/23 03:51, Ming Lei wrote:
On Tue, Jun 06, 2023 at 12:48:32AM +0300, Sagi Grimberg wrote:
When the driver confirms that the controller is dead, the controller
should be deleted and marked as DEAD. Otherwise, the upper layer may wait
forever in __bio_queue_enter(), since the disk won't be marked as dead.
In particular, del_gendisk() does not mark the disk as dead until the
bdev sync & invalidate calls return, so if any writeback IO is waiting in
__bio_queue_enter(), an IO deadlock results.
Add nvme_delete_dead_ctrl() to avoid this kind of IO deadlock.
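For context, the wait in question is the freeze gate in the block layer's
submission path. A simplified sketch of __bio_queue_enter() (trimmed from
upstream block/blk-core.c; the real function also handles REQ_NOWAIT and
memalloc flags):

/*
 * Simplified sketch: submitters sleep here until the queue is unfrozen
 * or the disk is marked dead.  If the queue stays frozen and the disk
 * is never marked dead, they wait forever, which is the deadlock
 * described above.
 */
static void __bio_queue_enter(struct request_queue *q, struct bio *bio)
{
	while (!blk_try_enter_queue(q, false)) {
		struct gendisk *disk = bio->bi_bdev->bd_disk;

		wait_event(q->mq_freeze_wq,
			   (!q->mq_freeze_depth &&
			    blk_queue_pm_only(q) == 0) ||
			   test_bit(GD_DEAD, &disk->state));
		if (test_bit(GD_DEAD, &disk->state)) {
			bio_io_error(bio);
			return;
		}
	}
}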
Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
---
drivers/nvme/host/core.c | 24 +++++++++++++++++++++++-
drivers/nvme/host/nvme.h | 1 +
2 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ccb6eb1282f8..413213cfa417 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -227,16 +227,38 @@ static void nvme_delete_ctrl_work(struct work_struct *work)
nvme_do_delete_ctrl(ctrl);
}
-int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
+static int __nvme_delete_ctrl(struct nvme_ctrl *ctrl,
+ enum nvme_ctrl_state state)
{
+ if (state != NVME_CTRL_DELETING && state != NVME_CTRL_DEAD)
+ return -EINVAL;
+
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
return -EBUSY;
+ if (state == NVME_CTRL_DEAD) {
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD))
+ return -EBUSY;
+ }
if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
return -EBUSY;
return 0;
}
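The hunk only shows the new helper; presumably the entry points become thin
wrappers around it, something like this (a sketch; the quoted diff is cut
before them, so the exact form is inferred from the commit message):

/* Assumed wrappers around __nvme_delete_ctrl(); not shown in the
 * quoted hunk. */
int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{
	return __nvme_delete_ctrl(ctrl, NVME_CTRL_DELETING);
}

/* Delete a controller the transport has confirmed unreachable, so the
 * disks get marked dead and __bio_queue_enter() waiters are released. */
int nvme_delete_dead_ctrl(struct nvme_ctrl *ctrl)
{
	return __nvme_delete_ctrl(ctrl, NVME_CTRL_DEAD);
}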
the user can trigger a delete under exactly the same conditions as
the transport (say, a nanosecond before the transport exhausts
max_reconnects).
Yeah, the problem can be triggered when removing any NS whose request
queue is frozen.
It is too fragile to call freeze_queue and unfreeze_queue from different
contexts, since the two must always be done as a strict pair. Meanwhile,
I don't think freezing the queues in nvme_rdma_teardown_io_queues() is
needed at all, because quiescing should be enough for the driver to
recover the controller.
And I guess the following approach (rdma only) should address the issue cleanly:
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0eb79696fb73..59352671aeb7 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -919,6 +919,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
if (!new) {
nvme_unquiesce_io_queues(&ctrl->ctrl);
+ nvme_start_freeze(&ctrl->ctrl);
if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
/*
* If we timed out waiting for freeze we are likely to
@@ -926,6 +927,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
* to be safe.
*/
ret = -ENODEV;
+ nvme_unfreeze(&ctrl->ctrl);
goto out_wait_freeze_timed_out;
}
blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
@@ -975,7 +977,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
if (ctrl->ctrl.queue_count > 1) {
- nvme_start_freeze(&ctrl->ctrl);
nvme_quiesce_io_queues(&ctrl->ctrl);
nvme_sync_io_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
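With this change the freeze/unfreeze pair lives entirely inside
nvme_rdma_configure_io_queues(). Assuming the success-path nvme_unfreeze()
that already follows blk_mq_update_nr_hw_queues() upstream, the resulting
flow would be:

/* Sketch of the resulting flow in nvme_rdma_configure_io_queues()
 * after the diff above. */
if (!new) {
	nvme_unquiesce_io_queues(&ctrl->ctrl);
	nvme_start_freeze(&ctrl->ctrl);		/* freeze...            */
	if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
		ret = -ENODEV;
		nvme_unfreeze(&ctrl->ctrl);	/* ...undone on error   */
		goto out_wait_freeze_timed_out;
	}
	blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
				   ctrl->ctrl.queue_count - 1);
	nvme_unfreeze(&ctrl->ctrl);		/* ...undone on success */
}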
I agree this should work.
I think that we'd want something like
--
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 841f155fe0b3..6c718ad46e06 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -231,6 +231,11 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
return -EBUSY;
+
+ if (ctrl->ops->transport_disconnected &&
+ ctrl->ops->transport_disconnected(ctrl))
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD);
+
if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
return -EBUSY;
return 0;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 054bf2f8b1a1..657d3f79953d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2828,6 +2828,13 @@ static bool nvme_pci_supports_pci_p2pdma(struct nvme_ctrl *ctrl)
return dma_pci_p2pdma_supported(dev->dev);
}
+static bool nvme_pci_disconnected(struct nvme_ctrl *nctrl)
+{
+ struct nvme_dev *dev = to_nvme_dev(nctrl);
+
+ return !pci_device_is_present(to_pci_dev(dev->dev));
+}
+
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name = "pcie",
.module = THIS_MODULE,
@@ -2841,6 +2848,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.get_address = nvme_pci_get_address,
.print_device_info = nvme_pci_print_device_info,
.supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma,
+ .transport_disconnected = nvme_pci_disconnected,
};
static int nvme_dev_map(struct nvme_dev *dev)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0eb79696fb73..2a03df693b0e 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2235,6 +2235,18 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
nvme_rdma_reconnect_or_remove(ctrl);
}
+static bool nvme_rdma_disconnected(struct nvme_ctrl *nctrl)
+{
+ struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+ int i;
+
+ for (i = 0; i < nctrl->queue_count; i++) {
+ if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags))
+ return false;
+ }
+ return true;
+}
+
static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.name = "rdma",
.module = THIS_MODULE,
@@ -2247,6 +2259,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address,
.stop_ctrl = nvme_rdma_stop_ctrl,
+ .transport_disconnected = nvme_rdma_disconnected,
};
/*
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index fe01ba75570d..043ce9878560 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2536,6 +2536,18 @@ static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
return len;
}
+static bool nvme_tcp_disconnected(struct nvme_ctrl *nctrl)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ int i;
+
+ for (i = 0; i < nctrl->queue_count; i++) {
+ if (test_bit(NVME_TCP_Q_LIVE, &ctrl->queues[i].flags))
+ return false;
+ }
+ return true;
+}
+
static const struct blk_mq_ops nvme_tcp_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
.commit_rqs = nvme_tcp_commit_rqs,
@@ -2569,6 +2581,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
.delete_ctrl = nvme_tcp_delete_ctrl,
.get_address = nvme_tcp_get_address,
.stop_ctrl = nvme_tcp_stop_ctrl,
+ .transport_disconnected = nvme_tcp_disconnected,
};
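The quoted diff stops before the header change, but the callback would
also need a member in struct nvme_ctrl_ops; an assumed nvme.h hunk:

/* Assumed addition to struct nvme_ctrl_ops in drivers/nvme/host/nvme.h;
 * not shown in the quoted diff above. */
struct nvme_ctrl_ops {
	/* ... existing members ... */
	/* return true if the transport is certain the controller is
	 * unreachable, so the delete path may mark it DEAD up front */
	bool (*transport_disconnected)(struct nvme_ctrl *ctrl);
};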
This way is too aggressive. The current queue/device state does not mean
the controller/queue is really dead, because recovery may still be in
progress; we should only call blk_mark_disk_dead() once the controller
is confirmed DEAD.
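For reference, the core already has a helper that does exactly that once
a controller is known dead (simplified from drivers/nvme/host/core.c):

/* Simplified from drivers/nvme/host/core.c: marks every namespace's
 * disk dead, which releases waiters in __bio_queue_enter(). */
void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list)
		blk_mark_disk_dead(ns->disk);
	up_read(&ctrl->namespaces_rwsem);
}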
Well, the controller is going away and the queues are not LIVE, so I
think the logic makes sense. However, I do agree that your other
suggestion is cleaner.