On Fri, Jan 24, 2025, Paolo Bonzini wrote:
> On Fri, Jan 24, 2025 at 21:11, Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
> > Heh, except it's all kinds of broken.
>
> Yes, I didn't even try.
>
> > IMO, biting the bullet and converting to
> > an SRCU-protected list is going to be far less work in the long run.
>
> I did try a long SRCU critical section and it was unreviewable. It
> ends up a lot less manageable than just making the lock a leaf,
> especially as the lock hierarchy spans multiple subsystems (static
> key, KVM, cpufreq---thanks CPU hotplug lock...).

I'm not following. If __kvmclock_cpufreq_notifier() and set_nx_huge_pages()
switch to SRCU, then the deadlock goes away (it might even go away if just
one of those two switches). SRCU readers would only interact with
kvm_destroy_vm() from a locking perspective, and if that's problematic then
we would already have a plethora of issues.

> I also didn't like adding a synchronization primitive that's... kinda
> single-use, but that would not be a blocker of course.

It would be single-use in that it only protects pure readers of vm_list,
but there are plenty of those users.

> So the second attempt was regular RCU, which looked a lot like this
> patch. I started writing all the dances to find a struct kvm that
> passes kvm_get_kvm_safe() before you do rcu_read_unlock() and drop the
> previous one (because you cannot do kvm_put_kvm() within the RCU read
> side) and set aside the idea, incorrectly thinking that they were not
> needed with kvm_lock. Plus I didn't like having to keep alive a bunch
> of data for a whole grace period if call_rcu() is used.
>
> So for the third attempt I could have chosen between dropping the SRCU
> or just using kvm_lock. I didn't even think of SRCU to be honest,
> because everything so far looked so bad, but it does seem a little
> better than RCU. At least, if kvm_destroy_vm() uses call_srcu(), you
> can call kvm_put_kvm() within srcu_read_lock()...srcu_read_unlock().
> It would look something like
>
>	list_for_each_entry_srcu(kvm, &vm_list, vm_list, 1) {
>		if (!kvm_get_kvm_safe(kvm))

Unless I'm missing something, we shouldn't need to take a reference so long
as SRCU is synchronized before destroying any part of the VM. If we don't
take a reference, then we don't need to deal with the complexity of
kvm_put_kvm() creating a recursive lock snafu.

This is what I'm thinking, lightly tested...

---
From: Sean Christopherson <seanjc@xxxxxxxxxx>
Date: Fri, 24 Jan 2025 15:15:05 -0800
Subject: [PATCH] KVM: Use an SRCU lock to protect readers of vm_list

Introduce a global SRCU lock to protect KVM's global list of VMs, and use
it in all locations that currently take kvm_lock purely to prevent a VM
from being destroyed. Keep using kvm_lock for flows that need to prevent
VMs from being created, as SRCU synchronization only guards against
use-after-free; it doesn't ensure a stable vm_list for readers.

This fixes a largely theoretical deadlock where:

 - __kvm_set_memory_region() waits for kvm->srcu with kvm->slots_lock taken
 - set_nx_huge_pages() waits for kvm->slots_lock with kvm_lock taken
 - __kvmclock_cpufreq_notifier() waits for kvm_lock with cpu_hotplug_lock taken
 - KVM_RUN waits for cpu_hotplug_lock with kvm->srcu taken

and therefore __kvm_set_memory_region() never completes
synchronize_srcu(&kvm->srcu).
  __kvm_set_memory_region()
    lock(&kvm->slots_lock)

  set_nx_huge_pages()
    lock(kvm_lock)
    lock(&kvm->slots_lock)

  __kvmclock_cpufreq_notifier()
    lock(cpu_hotplug_lock)
    lock(kvm_lock)

  lock(&kvm->srcu)
    kvm_lapic_set_base()
      static_branch_inc()
        lock(cpu_hotplug_lock)

  sync(&kvm->srcu)

Opportunistically add macros to walk the list of VMs, and the array of
vCPUs in each VM, to cut down on the amount of boilerplate.

Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
---
 arch/x86/kvm/mmu/mmu.c   | 19 +++++++++++------
 arch/x86/kvm/x86.c       | 36 +++++++++++++++++--------------
 include/linux/kvm_host.h |  9 ++++++++
 virt/kvm/kvm_main.c      | 46 +++++++++++++++++++++++++---------------
 4 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 74fa38ebddbf..f5b7ceb7ca0e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7127,6 +7127,10 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
 	} else if (sysfs_streq(val, "never")) {
 		new_val = 0;
 
+		/*
+		 * Take kvm_lock to ensure no VMs are *created* before the flag
+		 * is set. vm_list_srcu only protects against VMs being deleted.
+		 */
 		mutex_lock(&kvm_lock);
 		if (!list_empty(&vm_list)) {
 			mutex_unlock(&kvm_lock);
@@ -7142,17 +7146,19 @@
 
 	if (new_val != old_val) {
 		struct kvm *kvm;
+		int idx;
 
-		mutex_lock(&kvm_lock);
+		idx = srcu_read_lock(&vm_list_srcu);
 
-		list_for_each_entry(kvm, &vm_list, vm_list) {
+		kvm_for_each_vm_srcu(kvm) {
 			mutex_lock(&kvm->slots_lock);
 			kvm_mmu_zap_all_fast(kvm);
 			mutex_unlock(&kvm->slots_lock);
 
 			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
 		}
-		mutex_unlock(&kvm_lock);
+
+		srcu_read_unlock(&vm_list_srcu, idx);
 	}
 
 	return 0;
@@ -7275,13 +7281,14 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
 	if (is_recovery_enabled &&
 	    (!was_recovery_enabled || old_period > new_period)) {
 		struct kvm *kvm;
+		int idx;
 
-		mutex_lock(&kvm_lock);
+		idx = srcu_read_lock(&vm_list_srcu);
 
-		list_for_each_entry(kvm, &vm_list, vm_list)
+		kvm_for_each_vm_srcu(kvm)
 			vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
 
-		mutex_unlock(&kvm_lock);
+		srcu_read_unlock(&vm_list_srcu, idx);
 	}
 
 	return err;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b2d9a16fd4d3..8fb49237d179 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9428,6 +9428,11 @@ static void kvm_hyperv_tsc_notifier(void)
 	struct kvm *kvm;
 	int cpu;
 
+	/*
+	 * Take kvm_lock, not just vm_list_srcu, to prevent new VMs from coming
+	 * along in the middle of the update and not getting the in-progress
+	 * request.
+	 */
 	mutex_lock(&kvm_lock);
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_make_mclock_inprogress_request(kvm);
@@ -9456,7 +9461,7 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
 {
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
-	int send_ipi = 0;
+	int send_ipi = 0, idx;
 	unsigned long i;
 
 	/*
@@ -9500,17 +9505,16 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
 
 	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
 
-	mutex_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list) {
-		kvm_for_each_vcpu(i, vcpu, kvm) {
-			if (vcpu->cpu != cpu)
-				continue;
-			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-			if (vcpu->cpu != raw_smp_processor_id())
-				send_ipi = 1;
-		}
+	idx = srcu_read_lock(&vm_list_srcu);
+	kvm_for_each_vcpu_in_each_vm(kvm, vcpu, i) {
+		if (vcpu->cpu != cpu)
+			continue;
+
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		if (vcpu->cpu != raw_smp_processor_id())
+			send_ipi = 1;
 	}
-	mutex_unlock(&kvm_lock);
+	srcu_read_unlock(&vm_list_srcu, idx);
 
 	if (freq->old < freq->new && send_ipi) {
 		/*
@@ -9588,13 +9592,13 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
 	unsigned long i;
+	int idx;
 
-	mutex_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list)
-		kvm_for_each_vcpu(i, vcpu, kvm)
-			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+	idx = srcu_read_lock(&vm_list_srcu);
+	kvm_for_each_vcpu_in_each_vm(kvm, vcpu, i)
+		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+	srcu_read_unlock(&vm_list_srcu, idx);
 	atomic_set(&kvm_guest_has_master_clock, 0);
-	mutex_unlock(&kvm_lock);
 }
 
 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9df590e8f3da..0d0edb697160 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -193,6 +193,11 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
 
 extern struct mutex kvm_lock;
 extern struct list_head vm_list;
+extern struct srcu_struct vm_list_srcu;
+
+#define kvm_for_each_vm_srcu(__kvm)					\
+	list_for_each_entry_srcu(__kvm, &vm_list, vm_list,		\
+				 srcu_read_lock_held(&vm_list_srcu))
 
 struct kvm_io_range {
 	gpa_t addr;
@@ -1001,6 +1006,10 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
 	return NULL;
 }
 
+#define kvm_for_each_vcpu_in_each_vm(__kvm, __vcpu, __i)		\
+	kvm_for_each_vm_srcu(__kvm)					\
+		kvm_for_each_vcpu(__i, __vcpu, __kvm)
+
 void kvm_destroy_vcpus(struct kvm *kvm);
 
 void vcpu_load(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e0b9d6dd6a85..7fcc4433bf35 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -109,6 +109,7 @@ module_param(allow_unsafe_mappings, bool, 0444);
 
 DEFINE_MUTEX(kvm_lock);
 LIST_HEAD(vm_list);
+DEFINE_SRCU(vm_list_srcu);
 
 static struct kmem_cache *kvm_vcpu_cache;
 
@@ -1261,13 +1262,21 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	int i;
 	struct mm_struct *mm = kvm->mm;
 
+	mutex_lock(&kvm_lock);
+	list_del(&kvm->vm_list);
+	mutex_unlock(&kvm_lock);
+
+	/*
+	 * Ensure all readers of the global list go away before destroying any
+	 * aspect of the VM. After this, the VM object is reachable only via
+	 * this task and notifiers that are registered to the VM itself.
+	 */
+	synchronize_srcu(&vm_list_srcu);
+
 	kvm_destroy_pm_notifier(kvm);
 	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
 	kvm_destroy_vm_debugfs(kvm);
 	kvm_arch_sync_events(kvm);
-	mutex_lock(&kvm_lock);
-	list_del(&kvm->vm_list);
-	mutex_unlock(&kvm_lock);
 	kvm_arch_pre_destroy_vm(kvm);
 
 	kvm_free_irq_routing(kvm);
@@ -6096,14 +6105,16 @@ static int vm_stat_get(void *_offset, u64 *val)
 	unsigned offset = (long)_offset;
 	struct kvm *kvm;
 	u64 tmp_val;
+	int idx;
 
 	*val = 0;
-	mutex_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list) {
+	idx = srcu_read_lock(&vm_list_srcu);
+	kvm_for_each_vm_srcu(kvm) {
 		kvm_get_stat_per_vm(kvm, offset, &tmp_val);
 		*val += tmp_val;
 	}
-	mutex_unlock(&kvm_lock);
+	srcu_read_unlock(&vm_list_srcu, idx);
+
 	return 0;
 }
 
@@ -6111,15 +6122,15 @@ static int vm_stat_clear(void *_offset, u64 val)
 {
 	unsigned offset = (long)_offset;
 	struct kvm *kvm;
+	int idx;
 
 	if (val)
 		return -EINVAL;
 
-	mutex_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list) {
+	idx = srcu_read_lock(&vm_list_srcu);
+	kvm_for_each_vm_srcu(kvm)
 		kvm_clear_stat_per_vm(kvm, offset);
-	}
-	mutex_unlock(&kvm_lock);
+	srcu_read_unlock(&vm_list_srcu, idx);
 
 	return 0;
 }
@@ -6132,14 +6143,15 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 	unsigned offset = (long)_offset;
 	struct kvm *kvm;
 	u64 tmp_val;
+	int idx;
 
 	*val = 0;
-	mutex_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list) {
+	idx = srcu_read_lock(&vm_list_srcu);
+	kvm_for_each_vm_srcu(kvm) {
 		kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
 		*val += tmp_val;
 	}
-	mutex_unlock(&kvm_lock);
+	srcu_read_unlock(&vm_list_srcu, idx);
 	return 0;
 }
 
@@ -6147,15 +6159,15 @@ static int vcpu_stat_clear(void *_offset, u64 val)
 {
 	unsigned offset = (long)_offset;
 	struct kvm *kvm;
+	int idx;
 
 	if (val)
 		return -EINVAL;
 
-	mutex_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list) {
+	idx = srcu_read_lock(&vm_list_srcu);
+	kvm_for_each_vm_srcu(kvm)
 		kvm_clear_stat_per_vcpu(kvm, offset);
-	}
-	mutex_unlock(&kvm_lock);
+	srcu_read_unlock(&vm_list_srcu, idx);
 
 	return 0;
 }

base-commit: eb723766b1030a23c38adf2348b7c3d1409d11f0
--
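
For reference, a brand-new read-only walker of vm_list on top of the above
would boil down to something like the below. This is an illustrative sketch
only, not part of the patch; count_vms() is a made-up example function.

#include <linux/kvm_host.h>

/*
 * Illustrative sketch only: iterate vm_list under vm_list_srcu without
 * taking a reference to any VM.  kvm_destroy_vm()'s
 * synchronize_srcu(&vm_list_srcu) guarantees no VM is freed while this
 * read-side critical section is running.
 */
static int count_vms(void)
{
	struct kvm *kvm;
	int idx, nr_vms = 0;

	idx = srcu_read_lock(&vm_list_srcu);
	kvm_for_each_vm_srcu(kvm)
		nr_vms++;
	srcu_read_unlock(&vm_list_srcu, idx);

	return nr_vms;
}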