When a request times out, nvme_timeout() currently handles it this way:

	nvme_dev_disable(dev, false);
	nvme_reset_ctrl(&dev->ctrl);
	return BLK_EH_HANDLED;

which may introduce the following issues:

1) a subsequent timeout on another request may call nvme_dev_disable()
again and quiesce the queues while the reset is still in progress, so
eventually nothing can make forward progress.

2) when a timeout is triggered from inside the reset work function,
nvme_wait_freeze() may wait forever because the controller can no
longer be recovered at all.

This patch fixes these issues by:

1) handling timeout events in a single EH thread, which is woken up
whenever controller recovery is needed;

2) draining timeout work inside the EH handler via
nvme_unquiesce_timeout() before canceling in-flight requests, so all
requests can be canceled or completed by either the EH thread or the
block timeout handler;

3) moving the draining of IO and the unfreezing of queues into a
separate post_eh work context, so the controller can always be
recovered.

This patch fixes the failures reported by the horrible test block/011.

Cc: Jianchao Wang <jianchao.w.wang@xxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Sagi Grimberg <sagi@xxxxxxxxxxx>
Cc: linux-nvme@xxxxxxxxxxxxxxxxxxx
Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
---
 drivers/nvme/host/core.c |  22 ++++++++
 drivers/nvme/host/nvme.h |   3 +
 drivers/nvme/host/pci.c  | 140 +++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 155 insertions(+), 10 deletions(-)
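
Note for reviewers (not part of the commit): below is a minimal
userspace sketch of the single-EH-thread pattern this patch relies on,
modeled with pthreads. The names (eh_schedule(), eh_thread(),
eh_in_recovery) are illustrative only; they mirror nvme_eh_schedule()
and nvme_error_handler() but are not the driver code itself. The point
it demonstrates: concurrent timeouts all call eh_schedule(), but only
the first one wakes the handler, so recovery runs once instead of every
timeout disabling the controller again. Build with: cc -pthread -o
eh_model eh_model.c

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t eh_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t eh_wait = PTHREAD_COND_INITIALIZER;
static bool eh_in_recovery;
static bool eh_stop;

/* called from each timed-out request; only the first caller wakes EH */
static void eh_schedule(void)
{
	pthread_mutex_lock(&eh_lock);
	if (!eh_in_recovery) {
		eh_in_recovery = true;
		pthread_cond_signal(&eh_wait);
	}
	pthread_mutex_unlock(&eh_lock);
}

/* the single error-handler thread: one recovery pass per wakeup */
static void *eh_thread(void *arg)
{
	(void)arg;

	pthread_mutex_lock(&eh_lock);
	while (!eh_stop) {
		if (!eh_in_recovery) {
			pthread_cond_wait(&eh_wait, &eh_lock);
			continue;
		}
		pthread_mutex_unlock(&eh_lock);

		/* stands in for disable + sync reset + post_eh work */
		printf("recovering controller\n");

		pthread_mutex_lock(&eh_lock);
		eh_in_recovery = false;		/* i.e. nvme_eh_done() */
	}
	pthread_mutex_unlock(&eh_lock);
	return NULL;
}

int main(void)
{
	pthread_t eh;

	pthread_create(&eh, NULL, eh_thread, NULL);

	/* three "timeouts" back to back collapse into a single recovery */
	eh_schedule();
	eh_schedule();
	eh_schedule();
	sleep(1);

	pthread_mutex_lock(&eh_lock);
	eh_stop = true;
	pthread_cond_signal(&eh_wait);
	pthread_mutex_unlock(&eh_lock);
	pthread_join(eh, NULL);
	return 0;
}
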
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f9028873298e..664a268accd1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3582,6 +3582,28 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_unquiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unquiesce_timeout);
+
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_quiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_quiesce_timeout);
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a19b7f04ac24..956ee19ff403 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -19,6 +19,7 @@
 #include <linux/pci.h>
 #include <linux/kref.h>
 #include <linux/blk-mq.h>
+#include <linux/blkdev.h>
 #include <linux/lightnvm.h>
 #include <linux/sed-opal.h>
 #include <linux/fault-inject.h>
@@ -405,6 +406,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		union nvme_result *res);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl);
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8172ee584130..86c14d9051ff 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -29,6 +29,7 @@
 #include <linux/types.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/sed-opal.h>
+#include <linux/kthread.h>
 
 #include "nvme.h"
 
@@ -112,6 +113,11 @@ struct nvme_dev {
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
+
+	spinlock_t eh_lock;
+	bool eh_in_recovery;
+	struct task_struct *ehandler;
+	struct work_struct post_eh_work;
 };
 
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -1176,6 +1182,100 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 			 csts, result);
 }
 
+static void nvme_eh_schedule(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	if (!dev->eh_in_recovery) {
+		dev->eh_in_recovery = true;
+		wake_up_process(dev->ehandler);
+	}
+	spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_done(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	if (dev->eh_in_recovery)
+		dev->eh_in_recovery = false;
+	spin_unlock(&dev->eh_lock);
+}
+
+static int nvme_error_handler(void *data)
+{
+	struct nvme_dev *dev = data;
+
+	while (true) {
+		/*
+		 * The sequence in kthread_stop() sets the stop flag first
+		 * then wakes the process. To avoid missed wakeups, the task
+		 * should always be in a non running state before the stop
+		 * flag is checked
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
+
+		spin_lock(&dev->eh_lock);
+		if (!dev->eh_in_recovery) {
+			spin_unlock(&dev->eh_lock);
+			schedule();
+			continue;
+		}
+		spin_unlock(&dev->eh_lock);
+
+		__set_current_state(TASK_RUNNING);
+
+		dev_info(dev->ctrl.device, "start eh recovery\n");
+
+		/* freeze queues before recovery */
+		nvme_start_freeze(&dev->ctrl);
+
+		nvme_dev_disable(dev, false);
+
+		/*
+		 * reset won't wait for IO completion any more, so it
+		 * is safe to reset controller in sync way
+		 */
+		nvme_reset_ctrl_sync(&dev->ctrl);
+
+		dev_info(dev->ctrl.device, "eh recovery done\n");
+
+		/*
+		 * drain IO & unfreeze queues in another context because
+		 * these IOs may trigger timeout again
+		 */
+		queue_work(nvme_reset_wq, &dev->post_eh_work);
+	}
+	__set_current_state(TASK_RUNNING);
+	dev->ehandler = NULL;
+
+	return 0;
+}
+
+static void nvme_post_eh_work(struct work_struct *work)
+{
+	struct nvme_dev *dev =
+		container_of(work, struct nvme_dev, post_eh_work);
+
+	nvme_wait_freeze(&dev->ctrl);
+	nvme_unfreeze(&dev->ctrl);
+}
+
+static int nvme_eh_init(struct nvme_dev *dev)
+{
+	spin_lock_init(&dev->eh_lock);
+	INIT_WORK(&dev->post_eh_work, nvme_post_eh_work);
+
+	dev->ehandler = kthread_run(nvme_error_handler, dev,
+			"nvme_eh_%d", dev->ctrl.instance);
+	if (IS_ERR(dev->ehandler)) {
+		dev_err(dev->ctrl.device, "error handler thread failed to spawn, error = %ld\n",
+			PTR_ERR(dev->ehandler));
+		return PTR_ERR(dev->ehandler);
+	}
+	return 0;
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1197,8 +1297,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 */
 	if (nvme_should_reset(dev, csts)) {
 		nvme_warn_reset(dev, csts);
-		nvme_dev_disable(dev, false);
-		nvme_reset_ctrl(&dev->ctrl);
+		nvme_eh_schedule(dev);
 		return BLK_EH_HANDLED;
 	}
 
@@ -1224,8 +1323,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+		nvme_eh_schedule(dev);
 		return BLK_EH_HANDLED;
 	default:
 		break;
 	}
@@ -1240,14 +1339,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, reset controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false);
-		nvme_reset_ctrl(&dev->ctrl);
-
 		/*
 		 * Mark the request as handled, since the inline shutdown
 		 * forces all outstanding requests to complete.
 		 */
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+		nvme_eh_schedule(dev);
 		return BLK_EH_HANDLED;
 	}
 
@@ -2246,8 +2343,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	if (pci_is_enabled(pdev)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-		if (dev->ctrl.state == NVME_CTRL_LIVE ||
-		    dev->ctrl.state == NVME_CTRL_RESETTING) {
+		if (shutdown && (dev->ctrl.state == NVME_CTRL_LIVE ||
+		    dev->ctrl.state == NVME_CTRL_RESETTING)) {
 			nvme_start_freeze(&dev->ctrl);
 			frozen = true;
 		}
@@ -2281,11 +2378,23 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
 		nvme_suspend_queue(&dev->queues[i]);
 
+	/* safe to sync timeout after queues are quiesced */
+	nvme_quiesce_timeout(&dev->ctrl);
+	blk_quiesce_timeout(dev->ctrl.admin_q);
+
 	nvme_pci_disable(dev);
 
+	/*
+	 * Both timeout and interrupt handler have been drained, and all
+	 * in-flight requests will be canceled now.
+	 */
 	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
 	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
 
+	/* all requests are canceled now, so enable timeout now */
+	nvme_unquiesce_timeout(&dev->ctrl);
+	blk_unquiesce_timeout(dev->ctrl.admin_q);
+
 	/*
 	 * The driver will not be starting up queues again if shutting down so
 	 * must flush all entered requests to their failed completion to avoid
@@ -2416,9 +2525,14 @@ static void nvme_reset_work(struct work_struct *work)
 	if (result)
 		goto out;
 
+	nvme_eh_done(dev);
+
 	/*
 	 * Keep the controller around but remove all namespaces if we don't have
 	 * any working I/O queue.
+	 *
+	 * Now we won't wait for IO completion here any more, and sync reset
+	 * can be used safely anywhere.
 	 */
 	if (dev->online_queues < 2) {
 		dev_warn(dev->ctrl.device, "IO queues not created\n");
@@ -2427,11 +2541,9 @@ static void nvme_reset_work(struct work_struct *work)
 		new_state = NVME_CTRL_ADMIN_ONLY;
 	} else {
 		nvme_start_queues(&dev->ctrl);
-		nvme_wait_freeze(&dev->ctrl);
 		/* hit this only when allocate tagset fails */
 		if (nvme_dev_add(dev))
 			new_state = NVME_CTRL_ADMIN_ONLY;
-		nvme_unfreeze(&dev->ctrl);
 	}
 
 	/*
@@ -2449,6 +2561,7 @@ static void nvme_reset_work(struct work_struct *work)
 
  out:
 	nvme_remove_dead_ctrl(dev, result);
+	nvme_eh_done(dev);
 }
 
 static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -2590,10 +2703,15 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
+	if (nvme_eh_init(dev))
+		goto uninit_ctrl;
+
 	nvme_reset_ctrl(&dev->ctrl);
 
 	return 0;
 
+ uninit_ctrl:
+	nvme_uninit_ctrl(&dev->ctrl);
  release_pools:
 	nvme_release_prp_pools(dev);
  unmap:
@@ -2647,6 +2765,8 @@ static void nvme_remove(struct pci_dev *pdev)
 	nvme_stop_ctrl(&dev->ctrl);
 	nvme_remove_namespaces(&dev->ctrl);
 	nvme_dev_disable(dev, true);
+	if (dev->ehandler)
+		kthread_stop(dev->ehandler);
 	nvme_free_host_mem(dev);
 	nvme_dev_remove_admin(dev);
 	nvme_free_queues(dev, 0);
-- 
2.9.5