On Wed, 2022-11-02 at 23:19 +0000, Sean Christopherson wrote: > From: Chao Gao <chao.gao@xxxxxxxxx> > > The CPU STARTING section doesn't allow callbacks to fail. Move KVM's > hotplug callback to ONLINE section so that it can abort onlining a > CPU in > certain cases to avoid potentially breaking VMs running on existing > CPUs. > For example, when KVM fails to enable hardware virtualization on the > hotplugged CPU. > > Place KVM's hotplug state before CPUHP_AP_SCHED_WAIT_EMPTY as it > ensures > when offlining a CPU, all user tasks and non-pinned kernel tasks have > left > the CPU, i.e. there cannot be a vCPU task around. So, it is safe for > KVM's > CPU offline callback to disable hardware virtualization at that > point. > Likewise, KVM's online callback can enable hardware virtualization > before > any vCPU task gets a chance to run on hotplugged CPUs. > > Rename KVM's CPU hotplug callbacks accordingly. > > Suggested-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx> > Signed-off-by: Chao Gao <chao.gao@xxxxxxxxx> > Reviewed-by: Sean Christopherson <seanjc@xxxxxxxxxx> > Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx> > Reviewed-by: Yuan Yao <yuan.yao@xxxxxxxxx> > Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx> > --- > include/linux/cpuhotplug.h | 2 +- > virt/kvm/kvm_main.c | 30 ++++++++++++++++++++++-------- > 2 files changed, 23 insertions(+), 9 deletions(-) > > diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h > index 7337414e4947..de45be38dd27 100644 > --- a/include/linux/cpuhotplug.h > +++ b/include/linux/cpuhotplug.h > @@ -185,7 +185,6 @@ enum cpuhp_state { > CPUHP_AP_CSKY_TIMER_STARTING, > CPUHP_AP_TI_GP_TIMER_STARTING, > CPUHP_AP_HYPERV_TIMER_STARTING, > - CPUHP_AP_KVM_STARTING, > /* Must be the last timer callback */ > CPUHP_AP_DUMMY_TIMER_STARTING, > CPUHP_AP_ARM_XEN_STARTING, > @@ -200,6 +199,7 @@ enum cpuhp_state { > > /* Online section invoked on the hotplugged CPU from the > hotplug thread */ > CPUHP_AP_ONLINE_IDLE, > + 
CPUHP_AP_KVM_ONLINE, > CPUHP_AP_SCHED_WAIT_EMPTY, > CPUHP_AP_SMPBOOT_THREADS, > CPUHP_AP_X86_VDSO_VMA_ONLINE, > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index dd13af9f06d5..fd9e39c85549 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -5026,13 +5026,27 @@ static void hardware_enable_nolock(void > *junk) > } > } > > -static int kvm_starting_cpu(unsigned int cpu) > +static int kvm_online_cpu(unsigned int cpu) > { > + int ret = 0; > + > raw_spin_lock(&kvm_count_lock); > - if (kvm_usage_count) > + /* > + * Abort the CPU online process if hardware virtualization > cannot > + * be enabled. Otherwise running VMs would encounter > unrecoverable > + * errors when scheduled to this CPU. > + */ > + if (kvm_usage_count) { > + WARN_ON_ONCE(atomic_read(&hardware_enable_failed)); > + > hardware_enable_nolock(NULL); > + if (atomic_read(&hardware_enable_failed)) { > + atomic_set(&hardware_enable_failed, 0); I see other places updating hardware_enable_failed with atomic_inc(); should this use atomic_dec() here instead of directly setting it to 0? Although this code is protected by the spin lock, can hardware_enable_nolock() be invoked from other places in parallel? Fortunately, by the end of this patch set, the global hardware_enable_failed is removed. 
> + ret = -EIO; > + } > + } > raw_spin_unlock(&kvm_count_lock); > - return 0; > + return ret; > } > > static void hardware_disable_nolock(void *junk) > @@ -5045,7 +5059,7 @@ static void hardware_disable_nolock(void *junk) > kvm_arch_hardware_disable(); > } > > -static int kvm_dying_cpu(unsigned int cpu) > +static int kvm_offline_cpu(unsigned int cpu) > { > raw_spin_lock(&kvm_count_lock); > if (kvm_usage_count) > @@ -5822,8 +5836,8 @@ int kvm_init(unsigned vcpu_size, unsigned > vcpu_align, struct module *module) > if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) > return -ENOMEM; > > - r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, > "kvm/cpu:starting", > - kvm_starting_cpu, kvm_dying_cpu); > + r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, > "kvm/cpu:online", > + kvm_online_cpu, kvm_offline_cpu); > if (r) > goto out_free_2; > register_reboot_notifier(&kvm_reboot_notifier); > @@ -5897,7 +5911,7 @@ int kvm_init(unsigned vcpu_size, unsigned > vcpu_align, struct module *module) > kmem_cache_destroy(kvm_vcpu_cache); > out_free_3: > unregister_reboot_notifier(&kvm_reboot_notifier); > - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); > + cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); > out_free_2: > free_cpumask_var(cpus_hardware_enabled); > return r; > @@ -5923,7 +5937,7 @@ void kvm_exit(void) > kvm_async_pf_deinit(); > unregister_syscore_ops(&kvm_syscore_ops); > unregister_reboot_notifier(&kvm_reboot_notifier); > - cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); > + cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); > on_each_cpu(hardware_disable_nolock, NULL, 1); > kvm_irqfd_exit(); > free_cpumask_var(cpus_hardware_enabled);