On Wed, Jun 30, 2021 at 09:46:35PM +0200, Hannes Reinecke wrote: > Actually, Daniel and me came to a slightly different idea: use cpu hotplug > notifier. > Thing is, blk-mq already has cpu hotplug notifier, which should ensure that > no I/O is pending during cpu hotplug. > If we now add a nvme cpu hotplug notifier which essentially kicks off a > reset once all cpu in a hctx are offline the reset logic will rearrange the > queues to match the current cpu layout. > And when the cpus are getting onlined we'll do another reset. > > Daniel is currently preparing a patch; let's see how it goes. FTR, I can't say it was a huge success. I observed a real problem with this idea and a 'maybe' problem. First, we can't use nvme_ctrl_reset_sync() in the cpuhp path as this will block in various places due to percpu_rwsem_wait operations [1]. Not really a surprise I'd say. The maybe problem: I tried the test this time though with nvme_ctrl_reset() and this worked for a while (a couple of hours of FC port toggling with stress cpu hotplugging). Though eventually the system stopped responding and died. Unfortunately, I don't have any core dump. So it's not totally clear what's happening. BTW, I used the same test setup with Ming's patches on top of the two of mine. The system was still working fine after 4 hours of the stress testing. 
[1] https://paste.opensuse.org/view/raw/62287389 For the fun, here is the nvme_ctrl_reset() patch: diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 66973bb56305..444756ba8dbe 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -20,6 +20,7 @@ #include <linux/ptrace.h> #include <linux/nvme_ioctl.h> #include <linux/pm_qos.h> +#include <linux/cpuhotplug.h> #include <asm/unaligned.h> #include "nvme.h" @@ -4245,6 +4246,8 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { + cpuhp_state_remove_instance_nocalls(CPUHP_AP_NVME_QUEUE_ONLINE, + &ctrl->cpuhp_node); nvme_hwmon_exit(ctrl); nvme_fault_inject_fini(&ctrl->fault_inject); dev_pm_qos_hide_latency_tolerance(ctrl->device); @@ -4357,6 +4360,13 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, if (ret) goto out_free_name; + ret = cpuhp_state_add_instance_nocalls(CPUHP_AP_NVME_QUEUE_ONLINE, + &ctrl->cpuhp_node); + if (ret) { + pr_debug("failed to cpuhp notifiers %d\n", ret); + goto out_del_device; + } + /* * Initialize latency tolerance controls. The sysfs files won't * be visible to userspace unless the device actually supports APST. 
@@ -4369,6 +4379,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, nvme_mpath_init_ctrl(ctrl); return 0; +out_del_device: + cdev_device_del(&ctrl->cdev, ctrl->device); out_free_name: nvme_put_ctrl(ctrl); kfree_const(ctrl->device->kobj.name); @@ -4529,6 +4541,80 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64); } +static int nvme_notify_online(unsigned int cpu, struct hlist_node *node) +{ + struct nvme_ctrl *ctrl = hlist_entry_safe(node, + struct nvme_ctrl, cpuhp_node); + struct nvme_ns *ns; + struct blk_mq_hw_ctx *hctx; + unsigned int i; + bool present = false; + + pr_warn("NQN %s CPU %u online\n", ctrl->opts->subsysnqn, cpu); + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + queue_for_each_hw_ctx(ns->disk->queue, hctx, i) { + if (cpumask_test_cpu(cpu, hctx->cpumask)) { + present = true; + break; + } + } + if (present) + break; + } + up_read(&ctrl->namespaces_rwsem); + if (present) { + pr_warn("trigger reset... 
\n"); + nvme_reset_ctrl_sync(ctrl); + pr_warn("executed\n"); + } + + return 0; +} + +static inline bool nvme_last_cpu_in_hctx(unsigned int cpu, + struct blk_mq_hw_ctx *hctx) +{ + if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) + return false; + if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) + return false; + return true; +} + +static int nvme_notify_offline(unsigned int cpu, struct hlist_node *node) +{ + struct nvme_ctrl *ctrl = hlist_entry_safe(node, + struct nvme_ctrl, cpuhp_node); + struct nvme_ns *ns; + struct blk_mq_hw_ctx *hctx; + unsigned int i; + bool offline = false; + + pr_warn("NQN %s CPU %u offline\n", ctrl->opts->subsysnqn, cpu); + + down_read(&ctrl->namespaces_rwsem); + list_for_each_entry(ns, &ctrl->namespaces, list) { + queue_for_each_hw_ctx(ns->disk->queue, hctx, i) { + pr_warn("htcx %p cpumask %*pbl state x%lx\n", + hctx, cpumask_pr_args(hctx->cpumask), hctx->state); + if (nvme_last_cpu_in_hctx(cpu, hctx)) { + offline = true; + break; + } + } + if (offline) + break; + } + up_read(&ctrl->namespaces_rwsem); + if (offline) { + pr_warn("trigger reset... 
\n"); + nvme_reset_ctrl_sync(ctrl); + pr_warn("executed\n"); + } + return 0; +} static int __init nvme_core_init(void) { @@ -4580,8 +4666,17 @@ static int __init nvme_core_init(void) goto unregister_generic_ns; } + result = cpuhp_setup_state_multi(CPUHP_AP_NVME_QUEUE_ONLINE, + "nvme:online", + nvme_notify_online, + nvme_notify_offline); + if (result < 0) + goto destroy_chr_class; + return 0; +destroy_chr_class: + class_destroy(nvme_ns_chr_class); unregister_generic_ns: unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); destroy_subsys_class: diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 0015860ec12b..34676ff79d40 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -352,6 +352,8 @@ struct nvme_ctrl { unsigned long discard_page_busy; struct nvme_fault_inject fault_inject; + + struct hlist_node cpuhp_node; }; enum nvme_iopolicy { diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4a62b3980642..5f6cf60bee1f 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -159,6 +159,7 @@ enum cpuhp_state { CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, CPUHP_AP_BLK_MQ_ONLINE, + CPUHP_AP_NVME_QUEUE_ONLINE, CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS, CPUHP_AP_X86_INTEL_EPB_ONLINE, CPUHP_AP_PERF_ONLINE,