On Tue, Jul 31, 2012 at 4:20 PM, Marc Zyngier <marc.zyngier@xxxxxxx> wrote:
> On 31/07/12 15:03, Christoffer Dall wrote:
>> On Tue, Jul 31, 2012 at 2:27 PM, Christoffer Dall
>> <c.dall@xxxxxxxxxxxxxxxxxxxxxx> wrote:
>>> On Thu, Jul 5, 2012 at 5:03 PM, Marc Zyngier <marc.zyngier@xxxxxxx> wrote:
>>>> Now that we run the exit handler with preemption disabled, we start
>>>> seeing some very ugly things going on:
>>>>
>>>> BUG: scheduling while atomic: qemu-system-arm/1144/0x00000002
>>>> Modules linked in:
>>>> [<c0015f3c>] (unwind_backtrace+0x0/0xf8) from [<c03f54e0>] (__schedule_bug+0x44/0x58)
>>>> [<c03f54e0>] (__schedule_bug+0x44/0x58) from [<c0402690>] (__schedule+0x620/0x644)
>>>> [<c0402690>] (__schedule+0x620/0x644) from [<c0402ad8>] (schedule_preempt_disabled+0x24/0x34)
>>>> [<c0402ad8>] (schedule_preempt_disabled+0x24/0x34) from [<c04016ac>] (__mutex_lock_slowpath+0x120/0x1a8)
>>>> [<c04016ac>] (__mutex_lock_slowpath+0x120/0x1a8) from [<c040176c>] (mutex_lock+0x38/0x3c)
>>>> [<c040176c>] (mutex_lock+0x38/0x3c) from [<c0026ab4>] (user_mem_abort.isra.9+0xbc/0x1fc)
>>>> [<c0026ab4>] (user_mem_abort.isra.9+0xbc/0x1fc) from [<c00273bc>] (kvm_handle_guest_abort+0xdc/0x164)
>>>> [<c00273bc>] (kvm_handle_guest_abort+0xdc/0x164) from [<c0024fcc>] (handle_exit+0x74/0x11c)
>>>> [<c0024fcc>] (handle_exit+0x74/0x11c) from [<c00255c0>] (kvm_arch_vcpu_ioctl_run+0x138/0x21c)
>>>> [<c00255c0>] (kvm_arch_vcpu_ioctl_run+0x138/0x21c) from [<c002214c>] (kvm_vcpu_ioctl+0x3d8/0x6bc)
>>>> [<c002214c>] (kvm_vcpu_ioctl+0x3d8/0x6bc) from [<c00e0990>] (do_vfs_ioctl+0x7c/0x2d0)
>>>> [<c00e0990>] (do_vfs_ioctl+0x7c/0x2d0) from [<c00e0c1c>] (sys_ioctl+0x38/0x5c)
>>>> [<c00e0c1c>] (sys_ioctl+0x38/0x5c) from [<c000eec0>] (ret_fast_syscall+0x0/0x30)
>>>>
>>>> The culprit is the stage-2 lock, which is defined as a mutex, and
>>>> really should be a spinlock.
>>>>
>>>> Signed-off-by: Marc Zyngier <marc.zyngier@xxxxxxx>
>>>> ---
>>>>  arch/arm/include/asm/kvm_host.h |  2 +-
>>>>  arch/arm/kvm/arm.c              |  2 +-
>>>>  arch/arm/kvm/mmu.c              | 12 ++++++------
>>>>  3 files changed, 8 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>>>> index 0c7e782..9d0cc83 100644
>>>> --- a/arch/arm/include/asm/kvm_host.h
>>>> +++ b/arch/arm/include/asm/kvm_host.h
>>>> @@ -40,7 +40,7 @@ struct kvm_arch {
>>>>         u32    vmid;
>>>>
>>>>         /* 1-level 2nd stage table and lock */
>>>> -       struct mutex pgd_mutex;
>>>> +       spinlock_t pgd_lock;
>>>>         pgd_t *pgd;
>>>>
>>>>         /* VTTBR value associated with above pgd and vmid */
>>>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>>>> index f3b206a..44691e1 100644
>>>> --- a/arch/arm/kvm/arm.c
>>>> +++ b/arch/arm/kvm/arm.c
>>>> @@ -97,7 +97,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>>>>         ret = kvm_alloc_stage2_pgd(kvm);
>>>>         if (ret)
>>>>                 goto out_fail_alloc;
>>>> -       mutex_init(&kvm->arch.pgd_mutex);
>>>> +       spin_lock_init(&kvm->arch.pgd_lock);
>>>>
>>>>         ret = create_hyp_mappings(kvm, kvm + 1);
>>>>         if (ret)
>>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>>> index 8c30376..4b174e6 100644
>>>> --- a/arch/arm/kvm/mmu.c
>>>> +++ b/arch/arm/kvm/mmu.c
>>>> @@ -354,14 +354,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>                 return -EFAULT;
>>>>         }
>>>>
>>>> -       mutex_lock(&vcpu->kvm->arch.pgd_mutex);
>>>> +       spin_lock(&vcpu->kvm->arch.pgd_lock);
>>>>         new_pte = pfn_pte(pfn, PAGE_KVM_GUEST);
>>>>         if (writable)
>>>>                 new_pte |= L_PTE2_WRITE;
>>>>         ret = stage2_set_pte(vcpu->kvm, fault_ipa, &new_pte);
>>>>         if (ret)
>>>>                 put_page(pfn_to_page(pfn));
>>>> -       mutex_unlock(&vcpu->kvm->arch.pgd_mutex);
>>>> +       spin_unlock(&vcpu->kvm->arch.pgd_lock);
>>>>
>>>>         return ret;
>>>>  }
>>>> @@ -611,13 +611,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
>>>>         if (!kvm->arch.pgd)
>>>>                 return 0;
>>>>
>>>> -       mutex_lock(&kvm->arch.pgd_mutex);
>>>> +       spin_lock(&kvm->arch.pgd_lock);
>>>
>>> eh, hva_to_gpa takes its own mutex and may sleep while holding a spinlock...
>>>
>>> also stage2_set_pte below allocates memory and may sleep...
>>>
>>> Perhaps we should instead call the cpu-local-critical cache functions
>>> on the required CPU and run the whole thing with preemption enabled
>>> instead...?
>>>
>>
>> In fact, it looks simple enough to me (comments please):
>>
>>
>> commit 95d11541f77783f00b8799a94ed99f1b1b321b1a
>> Author: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>
>> Date:   Tue Jul 31 10:00:29 2012 -0400
>>
>>     ARM: KVM: Run exit handling under preemption again
>>
>>     This makes for a much cleaner approach with fewer worries during exit
>>     handling and allows for correct cache maintenance for set/way operations
>>     on SMP, at the cost of taking a slight performance hit when the guest
>>     does set/way based cache operations _and_ when we are preempted between
>>     guest exit and emulation code.
>>
>>     Signed-off-by: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>
>>
>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>> index e50d9b1..0006264 100644
>> --- a/arch/arm/include/asm/kvm_host.h
>> +++ b/arch/arm/include/asm/kvm_host.h
>> @@ -113,6 +113,7 @@ struct kvm_vcpu_arch {
>>                                         instructions */
>>
>>         /* dcache set/way operation pending */
>> +       int last_pcpu;
>>         cpumask_t require_dcache_flush;
>>
>>         /* IO related fields */
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index 52e449e..38e9448 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -285,8 +285,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>         /*
>>          * Check whether this vcpu requires the cache to be flushed on
>>          * this physical CPU. This is a consequence of doing dcache
>> -        * operations by set/way on this vcpu. We do it here in order
>> -        * to be in a non-preemptible section.
>> +        * operations by set/way on this vcpu. We do it here to be in
>> +        * a non-preemptible section.
>>          */
>>         if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
>>                 flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
>> @@ -535,15 +535,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>                 cond_resched();
>>                 update_vttbr(vcpu->kvm);
>>
>> -               /*
>> -                * Make sure preemption is disabled while calling handle_exit
>> -                * as exit handling touches CPU-specific resources, such as
>> -                * caches, and we must stay on the same CPU.
>> -                *
>> -                * Code that might sleep must enable preemption and access
>> -                * CPU-specific resources first.
>> -                */
>> -               preempt_disable();
>>                 local_irq_disable();
>>
>>                 /*
>> @@ -556,7 +547,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>
>>                 if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
>>                         local_irq_enable();
>> -                       preempt_enable();
>>                         continue;
>>                 }
>>
>> @@ -572,6 +562,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>                 ret = __kvm_vcpu_run(vcpu);
>>
>>                 vcpu->mode = OUTSIDE_GUEST_MODE;
>> +               vcpu->arch.last_pcpu = smp_processor_id();
>>                 vcpu->stat.exits++;
>>                 kvm_guest_exit();
>>                 trace_kvm_exit(vcpu->arch.regs.pc);
>> @@ -582,7 +573,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>                  *************************************************************/
>>
>>                 ret = handle_exit(vcpu, run, ret);
>> -               preempt_enable();
>>         }
>>
>>         if (vcpu->sigset_active)
>> diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
>> index 039fc3a..4a89710 100644
>> --- a/arch/arm/kvm/emulate.c
>> +++ b/arch/arm/kvm/emulate.c
>> @@ -20,6 +20,7 @@
>>  #include <asm/kvm_arm.h>
>>  #include <asm/kvm_host.h>
>>  #include <asm/kvm_emulate.h>
>> +#include <asm/cacheflush.h>
>>  #include <trace/events/kvm.h>
>>
>>  #include "trace.h"
>> @@ -299,6 +300,18 @@ static bool write_dcsw(struct kvm_vcpu *vcpu,
>>                        unsigned long cp15_reg)
>>  {
>>         u32 val;
>> +       int cpu;
>> +
>> +       cpu = get_cpu();
>> +
>> +       cpumask_setall(&vcpu->arch.require_dcache_flush);
>> +       cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
>> +
>> +       /* If we were already preempted, take the long way around */
>> +       if (cpu != vcpu->arch.last_pcpu) {
>> +               flush_cache_all();
>> +               goto done;
>> +       }
>
> So in this case, we nuke the caches on all CPUs. Not a big deal in my
> opinion.
>
> Getting rid of the whole preempt_disable() trumps the above cost any
> day, so I'm quite happy with this patch.
>
>>         val = *vcpu_reg(vcpu, p->Rt1);
>>
>> @@ -313,8 +326,8 @@ static bool write_dcsw(struct kvm_vcpu *vcpu,
>>                 break;
>>         }
>>
>> -       cpumask_setall(&vcpu->arch.require_dcache_flush);
>> -       cpumask_clear_cpu(vcpu->cpu, &vcpu->arch.require_dcache_flush);
>> +done:
>> +       put_cpu();
>>
>>         return true;
>>  }
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index 24769af..b4927ce 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -341,10 +341,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>                 return -EFAULT;
>>         }
>>
>> -       /* preemption disabled for handle_exit, gfn_to_pfn may sleep */
>> -       preempt_enable();
>>         pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
>> -       preempt_disable();
>>
>>         if (is_error_pfn(pfn)) {
>>                 put_page(pfn_to_page(pfn));
>>
>
> Acked-by: Marc Zyngier <marc.zyngier@xxxxxxx>
>

Thanks, pushed out.

-Christoffer
_______________________________________________
kvmarm mailing list
kvmarm@xxxxxxxxxxxxxxxxxxxxx
https://lists.cs.columbia.edu/cucslists/listinfo/kvmarm
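
For readers following the thread, the two write_dcsw() hunks quoted above are split
across the messages; pieced together, the function ends up looking roughly like the
sketch below. This is only an illustration of the get_cpu()/put_cpu() pattern being
discussed, not the exact upstream code: the middle parameter (the "p" used by
vcpu_reg()) and the actual set/way decode are truncated in the quoted diff, so the
parameter type is an assumption and the decode is summarised in a comment.

static bool write_dcsw(struct kvm_vcpu *vcpu,
                       const struct coproc_params *p,  /* assumed type; not visible in the quote */
                       unsigned long cp15_reg)
{
        u32 val;
        int cpu;

        cpu = get_cpu();        /* stay on this physical CPU until put_cpu() */

        /* Every other physical CPU must flush before it next runs this vcpu */
        cpumask_setall(&vcpu->arch.require_dcache_flush);
        cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);

        /* If we were already preempted, take the long way around */
        if (cpu != vcpu->arch.last_pcpu) {
                flush_cache_all();
                goto done;
        }

        val = *vcpu_reg(vcpu, p->Rt1);

        /* ... emulate the set/way operation using val (not shown in the quoted diff) ... */

done:
        put_cpu();

        return true;
}

The point of the pattern is that preemption only needs to be disabled for the short
window in which the code checks whether it is still on the CPU the guest last exited
on; anything that might sleep, such as gfn_to_pfn_prot() or the stage-2 lock, can now
run fully preemptible again.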