On Tue, Jul 31, 2012 at 4:20 PM, Marc Zyngier <marc.zyngier@xxxxxxx> wrote:
> On 31/07/12 15:03, Christoffer Dall wrote:
>> On Tue, Jul 31, 2012 at 2:27 PM, Christoffer Dall
>> <c.dall@xxxxxxxxxxxxxxxxxxxxxx> wrote:
>>> On Thu, Jul 5, 2012 at 5:03 PM, Marc Zyngier <marc.zyngier@xxxxxxx> wrote:
>>>> Now that we run the exit handler with preemption disabled, we start
>>>> seeing some very ugly things going on:
>>>>
>>>> BUG: scheduling while atomic: qemu-system-arm/1144/0x00000002
>>>> Modules linked in:
>>>> [<c0015f3c>] (unwind_backtrace+0x0/0xf8) from [<c03f54e0>] (__schedule_bug+0x44/0x58)
>>>> [<c03f54e0>] (__schedule_bug+0x44/0x58) from [<c0402690>] (__schedule+0x620/0x644)
>>>> [<c0402690>] (__schedule+0x620/0x644) from [<c0402ad8>] (schedule_preempt_disabled+0x24/0x34)
>>>> [<c0402ad8>] (schedule_preempt_disabled+0x24/0x34) from [<c04016ac>] (__mutex_lock_slowpath+0x120/0x1a8)
>>>> [<c04016ac>] (__mutex_lock_slowpath+0x120/0x1a8) from [<c040176c>] (mutex_lock+0x38/0x3c)
>>>> [<c040176c>] (mutex_lock+0x38/0x3c) from [<c0026ab4>] (user_mem_abort.isra.9+0xbc/0x1fc)
>>>> [<c0026ab4>] (user_mem_abort.isra.9+0xbc/0x1fc) from [<c00273bc>] (kvm_handle_guest_abort+0xdc/0x164)
>>>> [<c00273bc>] (kvm_handle_guest_abort+0xdc/0x164) from [<c0024fcc>] (handle_exit+0x74/0x11c)
>>>> [<c0024fcc>] (handle_exit+0x74/0x11c) from [<c00255c0>] (kvm_arch_vcpu_ioctl_run+0x138/0x21c)
>>>> [<c00255c0>] (kvm_arch_vcpu_ioctl_run+0x138/0x21c) from [<c002214c>] (kvm_vcpu_ioctl+0x3d8/0x6bc)
>>>> [<c002214c>] (kvm_vcpu_ioctl+0x3d8/0x6bc) from [<c00e0990>] (do_vfs_ioctl+0x7c/0x2d0)
>>>> [<c00e0990>] (do_vfs_ioctl+0x7c/0x2d0) from [<c00e0c1c>] (sys_ioctl+0x38/0x5c)
>>>> [<c00e0c1c>] (sys_ioctl+0x38/0x5c) from [<c000eec0>] (ret_fast_syscall+0x0/0x30)
>>>>
>>>> The culprit is the stage-2 lock, which is defined as a mutex, and
>>>> really should be a spinlock.
>>>>
>>>> Signed-off-by: Marc Zyngier <marc.zyngier@xxxxxxx>
>>>> ---
>>>>  arch/arm/include/asm/kvm_host.h |  2 +-
>>>>  arch/arm/kvm/arm.c              |  2 +-
>>>>  arch/arm/kvm/mmu.c              | 12 ++++++------
>>>>  3 files changed, 8 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>>>> index 0c7e782..9d0cc83 100644
>>>> --- a/arch/arm/include/asm/kvm_host.h
>>>> +++ b/arch/arm/include/asm/kvm_host.h
>>>> @@ -40,7 +40,7 @@ struct kvm_arch {
>>>>         u32    vmid;
>>>>
>>>>         /* 1-level 2nd stage table and lock */
>>>> -       struct mutex pgd_mutex;
>>>> +       spinlock_t pgd_lock;
>>>>         pgd_t *pgd;
>>>>
>>>>         /* VTTBR value associated with above pgd and vmid */
>>>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>>>> index f3b206a..44691e1 100644
>>>> --- a/arch/arm/kvm/arm.c
>>>> +++ b/arch/arm/kvm/arm.c
>>>> @@ -97,7 +97,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>>>>         ret = kvm_alloc_stage2_pgd(kvm);
>>>>         if (ret)
>>>>                 goto out_fail_alloc;
>>>> -       mutex_init(&kvm->arch.pgd_mutex);
>>>> +       spin_lock_init(&kvm->arch.pgd_lock);
>>>>
>>>>         ret = create_hyp_mappings(kvm, kvm + 1);
>>>>         if (ret)
>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>> index 8c30376..4b174e6 100644
>>>> --- a/arch/arm/kvm/mmu.c
>>>> +++ b/arch/arm/kvm/mmu.c
>>>> @@ -354,14 +354,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>                 return -EFAULT;
>>>>         }
>>>>
>>>> -       mutex_lock(&vcpu->kvm->arch.pgd_mutex);
>>>> +       spin_lock(&vcpu->kvm->arch.pgd_lock);
>>>>         new_pte = pfn_pte(pfn, PAGE_KVM_GUEST);
>>>>         if (writable)
>>>>                 new_pte |= L_PTE2_WRITE;
>>>>         ret = stage2_set_pte(vcpu->kvm, fault_ipa, &new_pte);
>>>>         if (ret)
>>>>                 put_page(pfn_to_page(pfn));
>>>> -       mutex_unlock(&vcpu->kvm->arch.pgd_mutex);
>>>> +       spin_unlock(&vcpu->kvm->arch.pgd_lock);
>>>>
>>>>         return ret;
>>>>  }
>>>> @@ -611,13 +611,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
>>>>         if (!kvm->arch.pgd)
>>>>                 return 0;
>>>>
>>>> -       mutex_lock(&kvm->arch.pgd_mutex);
>>>> +       spin_lock(&kvm->arch.pgd_lock);
>>>
>>> eh, hva_to_gpa takes its own mutex and may sleep while holding a spinlock...
>>>
>>> also stage2_set_pte below allocates memory and may sleep...
>>>
>>> Perhaps we should instead call the cpu-local-critical cache functions
>>> on the required CPU and run the whole thing with preemption enabled
>>> instead...?
>>>
>>
>> In fact, it looks simple enough to me (comments please):
>>
>>
>> commit 95d11541f77783f00b8799a94ed99f1b1b321b1a
>> Author: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>
>> Date:   Tue Jul 31 10:00:29 2012 -0400
>>
>>     ARM: KVM: Run exit handling under preemption again
>>
>>     This makes for a much cleaner approach with fewer worries during exit
>>     handling and allows for correct cache maintenance for set/way operations
>>     on SMP, at the cost of taking a slight performance hit when the guest
>>     does set/way based cache operations _and_ when we are preempted between
>>     guest exit and emulation code.
>>
>>     Signed-off-by: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>
>>
>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>> index e50d9b1..0006264 100644
>> --- a/arch/arm/include/asm/kvm_host.h
>> +++ b/arch/arm/include/asm/kvm_host.h
>> @@ -113,6 +113,7 @@ struct kvm_vcpu_arch {
>>                                         instructions */
>>
>>         /* dcache set/way operation pending */
>> +       int last_pcpu;
>>         cpumask_t require_dcache_flush;
>>
>>         /* IO related fields */
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index 52e449e..38e9448 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -285,8 +285,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>         /*
>>          * Check whether this vcpu requires the cache to be flushed on
>>          * this physical CPU. This is a consequence of doing dcache
>> -        * operations by set/way on this vcpu. We do it here in order
>> -        * to be in a non-preemptible section.
>> +        * operations by set/way on this vcpu. We do it here to be in
>> +        * a non-preemptible section.
>>          */
>>         if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
>>                 flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
>> @@ -535,15 +535,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>                 cond_resched();
>>                 update_vttbr(vcpu->kvm);
>>
>> -               /*
>> -                * Make sure preemption is disabled while calling handle_exit
>> -                * as exit handling touches CPU-specific resources, such as
>> -                * caches, and we must stay on the same CPU.
>> -                *
>> -                * Code that might sleep must enable preemption and access
>> -                * CPU-specific resources first.
>> -                */
>> -               preempt_disable();
>>                 local_irq_disable();
>>
>>                 /*
>> @@ -556,7 +547,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>
>>                 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
>>                         local_irq_enable();
>> -                       preempt_enable();
>>                         continue;
>>                 }
>>
>> @@ -572,6 +562,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>                 ret = __kvm_vcpu_run(vcpu);
>>
>>                 vcpu->mode = OUTSIDE_GUEST_MODE;
>> +               vcpu->arch.last_pcpu = smp_processor_id();
>>                 vcpu->stat.exits++;
>>                 kvm_guest_exit();
>>                 trace_kvm_exit(vcpu->arch.regs.pc);
>> @@ -582,7 +573,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>                  *************************************************************/
>>
>>                 ret = handle_exit(vcpu, run, ret);
>> -               preempt_enable();
>>         }
>>
>>         if (vcpu->sigset_active)
>> diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
>> index 039fc3a..4a89710 100644
>> --- a/arch/arm/kvm/emulate.c
>> +++ b/arch/arm/kvm/emulate.c
>> @@ -20,6 +20,7 @@
>>  #include <asm/kvm_arm.h>
>>  #include <asm/kvm_host.h>
>>  #include <asm/kvm_emulate.h>
>> +#include <asm/cacheflush.h>
>>  #include <trace/events/kvm.h>
>>
>>  #include "trace.h"
>> @@ -299,6 +300,18 @@ static bool write_dcsw(struct kvm_vcpu *vcpu,
>>                        unsigned long cp15_reg)
>>  {
>>         u32 val;
>> +       int cpu;
>> +
>> +       cpu = get_cpu();
>> +
>> +       cpumask_setall(&vcpu->arch.require_dcache_flush);
>> +       cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
>> +
>> +       /* If we were already preempted, take the long way around */
>> +       if (cpu != vcpu->arch.last_pcpu) {
>> +               flush_cache_all();
>> +               goto done;
>> +       }
>
> So in this case, we nuke the caches on all CPUs. Not a big deal in my
> opinion.
>
> Getting rid of the whole preempt_disable() trumps the above cost any
> day, so I'm quite happy with this patch.
>
>>         val = *vcpu_reg(vcpu, p->Rt1);
>>
>> @@ -313,8 +326,8 @@ static bool write_dcsw(struct kvm_vcpu *vcpu,
>>                 break;
>>         }
>>
>> -       cpumask_setall(&vcpu->arch.require_dcache_flush);
>> -       cpumask_clear_cpu(vcpu->cpu, &vcpu->arch.require_dcache_flush);
>> +done:
>> +       put_cpu();
>>
>>         return true;
>>  }
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index 24769af..b4927ce 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -341,10 +341,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>                 return -EFAULT;
>>         }
>>
>> -       /* preemption disabled for handle_exit, gfn_to_pfn may sleep */
>> -       preempt_enable();
>>         pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
>> -       preempt_disable();
>>
>>         if (is_error_pfn(pfn)) {
>>                 put_page(pfn_to_page(pfn));
>>
>
> Acked-by: Marc Zyngier <marc.zyngier@xxxxxxx>
>

Thanks, pushed out.

-Christoffer
_______________________________________________
kvmarm mailing list
kvmarm@xxxxxxxxxxxxxxxxxxxxx
https://lists.cs.columbia.edu/cucslists/listinfo/kvmarm
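
For readers following the thread, the two write_dcsw() hunks quoted above are split
across the messages; pieced together, the function ends up looking roughly like the
sketch below. This is only an illustration of the get_cpu()/put_cpu() pattern being
discussed, not the exact upstream code: the middle parameter (the "p" used by
vcpu_reg()) and the actual set/way decode are truncated in the quoted diff, so the
parameter type is an assumption and the decode is summarised in a comment.

static bool write_dcsw(struct kvm_vcpu *vcpu,
                       const struct coproc_params *p,  /* assumed type; not visible in the quote */
                       unsigned long cp15_reg)
{
        u32 val;
        int cpu;

        cpu = get_cpu();        /* stay on this physical CPU until put_cpu() */

        /* Every other physical CPU must flush before it next runs this vcpu */
        cpumask_setall(&vcpu->arch.require_dcache_flush);
        cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);

        /* If we were already preempted, take the long way around */
        if (cpu != vcpu->arch.last_pcpu) {
                flush_cache_all();
                goto done;
        }

        val = *vcpu_reg(vcpu, p->Rt1);

        /* ... emulate the set/way operation using val (not shown in the quoted diff) ... */

done:
        put_cpu();

        return true;
}

The point of the pattern is that preemption only needs to be disabled for the short
window in which the code checks whether it is still on the CPU the guest last exited
on; anything that might sleep, such as gfn_to_pfn_prot() or the stage-2 lock, can now
run fully preemptible again.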