This is a prototype with many TODO comments to give a better idea of what would be needed. The main missing piece a rework of every kvm_for_each_vcpu() into a less inefficient loop, but RCU readers cannot block, so the rewrite cannot be scripted. Is there a more suitable protection scheme? I didn't test it much ... I am still leaning towards the internally simpler version, (1), even if it requires userspace changes. --- arch/mips/kvm/mips.c | 8 +++--- arch/powerpc/kvm/powerpc.c | 6 +++-- arch/s390/kvm/kvm-s390.c | 27 ++++++++++++++------ arch/x86/kvm/hyperv.c | 3 +-- arch/x86/kvm/vmx.c | 3 ++- arch/x86/kvm/x86.c | 5 ++-- include/linux/kvm_host.h | 61 ++++++++++++++++++++++++++++++++++------------ virt/kvm/arm/arm.c | 10 +++----- virt/kvm/kvm_main.c | 58 +++++++++++++++++++++++++++++++++++-------- 9 files changed, 132 insertions(+), 49 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index bce2a6431430..4c9d383babe7 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -164,6 +164,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm) { unsigned int i; struct kvm_vcpu *vcpu; + struct kvm_vcpus *vcpus; kvm_for_each_vcpu(i, vcpu, kvm) { kvm_arch_vcpu_free(vcpu); @@ -171,8 +172,9 @@ void kvm_mips_free_vcpus(struct kvm *kvm) mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; + // TODO: resetting online vcpus shouldn't be needed + vcpus = rcu_dereference_protected(kvm->vcpus, lockdep_is_held(&kvm->lock)); + vcpus->online = 0; atomic_set(&kvm->online_vcpus, 0); @@ -499,7 +501,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, if (irq->cpu == -1) dvcpu = vcpu; else - dvcpu = vcpu->kvm->vcpus[irq->cpu]; + dvcpu = kvm_get_vcpu(vcpu->kvm, irq->cpu); if (intr == 2 || intr == 3 || intr == 4) { kvm_mips_callbacks->queue_io_int(dvcpu, irq); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3480faaf1ef8..9d1a16fd629f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -460,6 +460,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) { unsigned int i; struct kvm_vcpu *vcpu; + struct kvm_vcpus *vcpus; #ifdef CONFIG_KVM_XICS /* @@ -475,8 +476,9 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_arch_vcpu_free(vcpu); mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; + + vcpus = rcu_dereference_protected(kvm->vcpus, lockdep_is_held(&kvm->lock)); + vcpus->online = 0; atomic_set(&kvm->online_vcpus, 0); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9f23a9e81a91..dd8592c67ef4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1945,15 +1945,16 @@ static void kvm_free_vcpus(struct kvm *kvm) { unsigned int i; struct kvm_vcpu *vcpu; + struct kvm_vcpus *vcpus; kvm_for_each_vcpu(i, vcpu, kvm) kvm_arch_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - atomic_set(&kvm->online_vcpus, 0); + vcpus = rcu_dereference_protected(kvm->vcpus, lockdep_is_held(&kvm->lock)); + vcpus->online = 0; + mutex_unlock(&kvm->lock); } @@ -3415,6 +3416,7 @@ static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) { int i, online_vcpus, started_vcpus = 0; + struct kvm_vcpus *vcpus; if (!is_vcpu_stopped(vcpu)) return; @@ -3422,12 +3424,16 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1); /* Only one cpu at a time may enter/leave the STOPPED state. */ spin_lock(&vcpu->kvm->arch.start_stop_lock); - online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); - for (i = 0; i < online_vcpus; i++) { - if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) + rcu_read_lock(); + vcpus = rcu_dereference(vcpu->kvm->vcpus); + // TODO: this pattern is kvm_for_each_vcpu + for (i = 0; i < vcpus->online; i++) { + if (!is_vcpu_stopped(vcpus->array[i])) started_vcpus++; + // TODO: if (started_vcpus > 1) break; } + rcu_read_unlock(); if (started_vcpus == 0) { /* we're the only active VCPU -> speed it up */ @@ -3455,6 +3461,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) { int i, online_vcpus, started_vcpus = 0; struct kvm_vcpu *started_vcpu = NULL; + struct kvm_vcpus *vcpus; if (is_vcpu_stopped(vcpu)) return; @@ -3470,12 +3477,16 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) atomic_or(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); __disable_ibs_on_vcpu(vcpu); + rcu_read_lock(); + vcpus = rcu_dereference(vcpu->kvm->vcpus); + // TODO: use kvm_for_each_vcpu for (i = 0; i < online_vcpus; i++) { - if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) { + if (!is_vcpu_stopped(vcpus->array[i])) { started_vcpus++; - started_vcpu = vcpu->kvm->vcpus[i]; + started_vcpu = vcpus->array[i]; } } + rcu_read_unlock(); if (started_vcpus == 1) { /* diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index dc97f2544b6f..bf4037344729 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -111,8 +111,7 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) struct kvm_vcpu *vcpu = NULL; int i; - if (vpidx < KVM_MAX_VCPUS) - vcpu = kvm_get_vcpu(kvm, vpidx); + vcpu = kvm_get_vcpu(kvm, vpidx); if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index df8d2f127508..1f492a1b64e9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11742,7 +11742,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) + // TODO: make apicv state accessible directly from struct kvm + !kvm_vcpu_apicv_active(kvm_get_vcpu(kvm, 0))) return 0; idx = srcu_read_lock(&kvm->irq_srcu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e10eda86bc7b..5d8af3e4eab1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8081,6 +8081,7 @@ static void kvm_free_vcpus(struct kvm *kvm) { unsigned int i; struct kvm_vcpu *vcpu; + struct kvm_vcpus *vcpus; /* * Unpin any mmu pages first. @@ -8093,8 +8094,8 @@ static void kvm_free_vcpus(struct kvm *kvm) kvm_arch_vcpu_free(vcpu); mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; + vcpus = rcu_dereference_protected(kvm->vcpus, lockdep_is_held(&kvm->lock)); + vcpus->online = 0; atomic_set(&kvm->online_vcpus, 0); mutex_unlock(&kvm->lock); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c8df733eed41..eb9fb5b493ac 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -386,12 +386,17 @@ struct kvm_memslots { int used_slots; }; +struct kvm_vcpus { + u32 online; + struct kvm_vcpu *array[]; +}; + struct kvm { spinlock_t mmu_lock; struct mutex slots_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; - struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + struct kvm_vcpus *vcpus; /* * created_vcpus is protected by kvm->lock, and is incremented @@ -483,45 +488,71 @@ static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { - /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu, in case - * the caller has read kvm->online_vcpus before (as is the case - * for kvm_for_each_vcpu, for example). - */ - smp_rmb(); - return kvm->vcpus[i]; + struct kvm_vcpu *r = NULL; + struct kvm_vcpus *vcpus; + + rcu_read_lock(); + vcpus = rcu_dereference(kvm->vcpus); + if (i < vcpus->online) + r = vcpus->array[i]; + // TODO: check for bounds & return NULL in that case? + rcu_read_unlock(); + + return r; } +// This is unacceptably inefficient implementation, but rcu critical section +// imposes limitations on preemption and we'll have to check all users before +// converting them. #define kvm_for_each_vcpu(idx, vcpup, kvm) \ for (idx = 0; \ - idx < atomic_read(&kvm->online_vcpus) && \ - (vcpup = kvm_get_vcpu(kvm, idx)) != NULL; \ + ({struct kvm_vcpus *__vcpus; \ + rcu_read_lock(); \ + __vcpus = rcu_dereference(kvm->vcpus); \ + vcpup = idx < __vcpus->online ? __vcpus->array[idx] : NULL; \ + rcu_read_unlock(); \ + vcpup;}); \ idx++) +#define kvm_for_each_vcpu_rcu(idx, vcpup, vcpus, kvm) \ + for (vcpus = rcu_dereference(kvm->vcpus), idx = 0; \ + idx < vcpus->online && (vcpup = vcpus->array[idx]); \ + idx++) \ + static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) { struct kvm_vcpu *vcpu = NULL; + struct kvm_vcpus *vcpus; int i; if (id < 0) return NULL; - if (id < KVM_MAX_VCPUS) - vcpu = kvm_get_vcpu(kvm, id); + vcpu = kvm_get_vcpu(kvm, id); if (vcpu && vcpu->vcpu_id == id) return vcpu; - kvm_for_each_vcpu(i, vcpu, kvm) - if (vcpu->vcpu_id == id) + rcu_read_lock(); + kvm_for_each_vcpu_rcu(i, vcpu, vcpus, kvm) + if (vcpu->vcpu_id == id) { + rcu_read_unlock(); return vcpu; + } + rcu_read_unlock(); return NULL; } static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) { struct kvm_vcpu *tmp; + struct kvm_vcpus *vcpus; int idx; - kvm_for_each_vcpu(idx, tmp, vcpu->kvm) - if (tmp == vcpu) + rcu_read_lock(); + kvm_for_each_vcpu_rcu(idx, tmp, vcpus, vcpu->kvm) + if (tmp == vcpu) { + rcu_read_unlock(); return idx; + } + rcu_read_unlock(); BUG(); } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index b9f68e4add71..f0774a08083d 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -174,16 +174,14 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) void kvm_arch_destroy_vm(struct kvm *kvm) { int i; + struct kvm_vcpu *vcpu; free_percpu(kvm->arch.last_vcpu_ran); kvm->arch.last_vcpu_ran = NULL; - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - if (kvm->vcpus[i]) { - kvm_arch_vcpu_free(kvm->vcpus[i]); - kvm->vcpus[i] = NULL; - } - } + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_arch_vcpu_free(vcpu); + // other arches zeroed online here, for no apparent reason :) kvm_vgic_destroy(kvm); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2eac2c62795f..578285ff74a4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -640,13 +640,31 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) return 0; } +// TODO: preallocate some VCPUs static inline struct kvm *kvm_alloc_vm(void) { - return kzalloc(sizeof(struct kvm), GFP_KERNEL); + struct kvm *kvm; + struct kvm_vcpus *vcpus; + + kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + if (!kvm) + return NULL; + + vcpus = kvzalloc(sizeof(*vcpus), GFP_KERNEL); + if (!vcpus) { + kfree(kvm); + return NULL; + } + vcpus->online = 0; + rcu_assign_pointer(kvm->vcpus, vcpus); + + return kvm; } static inline void kvm_free_vm(struct kvm *kvm) { + if (kvm) + kvfree(rcu_dereference_protected(kvm->vcpus, 1)); kfree(kvm); } @@ -2473,6 +2491,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) { int r; struct kvm_vcpu *vcpu; + struct kvm_vcpus *old, *new = NULL; if (id >= KVM_MAX_VCPU_ID) return -EINVAL; @@ -2508,7 +2527,22 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto unlock_vcpu_destroy; } - BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); + old = rcu_dereference_protected(kvm->vcpus, lockdep_is_held(&kvm->lock)); + + // TODO: make it a function for it that returns new/NULL + { + // OPTIMIZE: do not allocate every time. Requires atomic online + // counter. + u32 size = old->online + 1; + + new = kvzalloc(sizeof(*new) + size * sizeof(*new->array), GFP_KERNEL); + if (!new) { + r = -ENOMEM; + goto unlock_vcpu_destroy; + } + new->online = size; + memcpy(new->array, old->array, old->online * sizeof(*old->array)); + } /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); @@ -2518,20 +2552,24 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto unlock_vcpu_destroy; } - kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; - - /* - * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus - * before kvm->online_vcpu's incremented value. - */ - smp_wmb(); - atomic_inc(&kvm->online_vcpus); + new->array[old->online] = vcpu; + rcu_assign_pointer(kvm->vcpus, new); mutex_unlock(&kvm->lock); + + // we could schedule a callback instead + synchronize_rcu(); + kfree(old); + + // TODO: No longer synchronizes anything in the common code. + // Remove if the arch-specific uses were mostly hacks. + atomic_inc(&kvm->online_vcpus); + kvm_arch_vcpu_postcreate(vcpu); return r; unlock_vcpu_destroy: + kvfree(new); mutex_unlock(&kvm->lock); debugfs_remove_recursive(vcpu->debugfs_dentry); vcpu_destroy: -- 2.13.3