On 31/07/12 15:03, Christoffer Dall wrote:
> On Tue, Jul 31, 2012 at 2:27 PM, Christoffer Dall
> <c.dall@xxxxxxxxxxxxxxxxxxxxxx> wrote:
>> On Thu, Jul 5, 2012 at 5:03 PM, Marc Zyngier <marc.zyngier@xxxxxxx> wrote:
>>> Now that we run the exit handler with preemption disabled, we start
>>> seeing some very ugly things going on:
>>>
>>> BUG: scheduling while atomic: qemu-system-arm/1144/0x00000002
>>> Modules linked in:
>>> [<c0015f3c>] (unwind_backtrace+0x0/0xf8) from [<c03f54e0>] (__schedule_bug+0x44/0x58)
>>> [<c03f54e0>] (__schedule_bug+0x44/0x58) from [<c0402690>] (__schedule+0x620/0x644)
>>> [<c0402690>] (__schedule+0x620/0x644) from [<c0402ad8>] (schedule_preempt_disabled+0x24/0x34)
>>> [<c0402ad8>] (schedule_preempt_disabled+0x24/0x34) from [<c04016ac>] (__mutex_lock_slowpath+0x120/0x1a8)
>>> [<c04016ac>] (__mutex_lock_slowpath+0x120/0x1a8) from [<c040176c>] (mutex_lock+0x38/0x3c)
>>> [<c040176c>] (mutex_lock+0x38/0x3c) from [<c0026ab4>] (user_mem_abort.isra.9+0xbc/0x1fc)
>>> [<c0026ab4>] (user_mem_abort.isra.9+0xbc/0x1fc) from [<c00273bc>] (kvm_handle_guest_abort+0xdc/0x164)
>>> [<c00273bc>] (kvm_handle_guest_abort+0xdc/0x164) from [<c0024fcc>] (handle_exit+0x74/0x11c)
>>> [<c0024fcc>] (handle_exit+0x74/0x11c) from [<c00255c0>] (kvm_arch_vcpu_ioctl_run+0x138/0x21c)
>>> [<c00255c0>] (kvm_arch_vcpu_ioctl_run+0x138/0x21c) from [<c002214c>] (kvm_vcpu_ioctl+0x3d8/0x6bc)
>>> [<c002214c>] (kvm_vcpu_ioctl+0x3d8/0x6bc) from [<c00e0990>] (do_vfs_ioctl+0x7c/0x2d0)
>>> [<c00e0990>] (do_vfs_ioctl+0x7c/0x2d0) from [<c00e0c1c>] (sys_ioctl+0x38/0x5c)
>>> [<c00e0c1c>] (sys_ioctl+0x38/0x5c) from [<c000eec0>] (ret_fast_syscall+0x0/0x30)
>>>
>>> The culprit is the stage-2 lock, which is defined as a mutex and
>>> really should be a spinlock.
>>>
>>> Signed-off-by: Marc Zyngier <marc.zyngier@xxxxxxx>
>>> ---
>>>  arch/arm/include/asm/kvm_host.h |  2 +-
>>>  arch/arm/kvm/arm.c              |  2 +-
>>>  arch/arm/kvm/mmu.c              | 12 ++++++------
>>>  3 files changed, 8 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>>> index 0c7e782..9d0cc83 100644
>>> --- a/arch/arm/include/asm/kvm_host.h
>>> +++ b/arch/arm/include/asm/kvm_host.h
>>> @@ -40,7 +40,7 @@ struct kvm_arch {
>>>  	u32 vmid;
>>>
>>>  	/* 1-level 2nd stage table and lock */
>>> -	struct mutex pgd_mutex;
>>> +	spinlock_t pgd_lock;
>>>  	pgd_t *pgd;
>>>
>>>  	/* VTTBR value associated with above pgd and vmid */
>>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>>> index f3b206a..44691e1 100644
>>> --- a/arch/arm/kvm/arm.c
>>> +++ b/arch/arm/kvm/arm.c
>>> @@ -97,7 +97,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>>>  	ret = kvm_alloc_stage2_pgd(kvm);
>>>  	if (ret)
>>>  		goto out_fail_alloc;
>>> -	mutex_init(&kvm->arch.pgd_mutex);
>>> +	spin_lock_init(&kvm->arch.pgd_lock);
>>>
>>>  	ret = create_hyp_mappings(kvm, kvm + 1);
>>>  	if (ret)
>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>> index 8c30376..4b174e6 100644
>>> --- a/arch/arm/kvm/mmu.c
>>> +++ b/arch/arm/kvm/mmu.c
>>> @@ -354,14 +354,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>  		return -EFAULT;
>>>  	}
>>>
>>> -	mutex_lock(&vcpu->kvm->arch.pgd_mutex);
>>> +	spin_lock(&vcpu->kvm->arch.pgd_lock);
>>>  	new_pte = pfn_pte(pfn, PAGE_KVM_GUEST);
>>>  	if (writable)
>>>  		new_pte |= L_PTE2_WRITE;
>>>  	ret = stage2_set_pte(vcpu->kvm, fault_ipa, &new_pte);
>>>  	if (ret)
>>>  		put_page(pfn_to_page(pfn));
>>> -	mutex_unlock(&vcpu->kvm->arch.pgd_mutex);
>>> +	spin_unlock(&vcpu->kvm->arch.pgd_lock);
>>>
>>>  	return ret;
>>>  }
>>> @@ -611,13 +611,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
>>>  	if (!kvm->arch.pgd)
>>>  		return 0;
>>>
>>> -	mutex_lock(&kvm->arch.pgd_mutex);
>>> +	spin_lock(&kvm->arch.pgd_lock);
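To make the splat concrete: the failure pattern boils down to the
snippet below (a minimal sketch, assuming a generic kernel context;
the lock name matches the patch, everything else is illustrative):

#include <linux/mutex.h>
#include <linux/preempt.h>

static DEFINE_MUTEX(pgd_mutex);

static void exit_handling_path(void)
{
	preempt_disable();	/* exit handling must stay on this CPU */
	mutex_lock(&pgd_mutex);	/* sleeps on contention, so the scheduler
				 * runs inside an atomic section:
				 * "BUG: scheduling while atomic" */
	/* ... stage-2 page table update ... */
	mutex_unlock(&pgd_mutex);
	preempt_enable();
}

A spinlock busy-waits instead of sleeping, so it is legal to take with
preemption disabled; that is all the conversion above changes.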
>>
>> eh, hva_to_gpa takes its own mutex and may sleep while holding a spinlock...
>>
>> also stage2_set_pte below allocates memory and may sleep...
>>
>> Perhaps we should instead call the cpu-local-critical cache functions
>> on the required CPU and run the whole thing with preemption enabled
>> instead...?
>>
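The objection in miniature (again a simplified sketch, not the real
call chain; the function below is hypothetical, while stage2_set_pte()
and hva_to_gpa() are the actual offenders):

#include <linux/slab.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(pgd_lock);

static int map_one_page(void)
{
	void *table;

	spin_lock(&pgd_lock);	/* enters an atomic section */
	/*
	 * A GFP_KERNEL allocation may block on reclaim, i.e. sleep,
	 * which is forbidden here -- the same bug as before, one call
	 * level deeper. Taking a mutex (as hva_to_gpa does) under the
	 * spinlock is equally broken.
	 */
	table = kmalloc(4096, GFP_KERNEL);
	spin_unlock(&pgd_lock);

	kfree(table);
	return table ? 0 : -ENOMEM;
}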
>
> In fact, it looks simple enough to me (comments please):
>
>
> commit 95d11541f77783f00b8799a94ed99f1b1b321b1a
> Author: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>
> Date:   Tue Jul 31 10:00:29 2012 -0400
>
>     ARM: KVM: Run exit handling under preemption again
>
>     This makes for a much cleaner approach with fewer worries during exit
>     handling and allows for correct cache maintenance for set/way operations
>     on SMP, at the cost of taking a slight performance hit when the guest
>     does set/way based cache operations _and_ when we are preempted between
>     guest exit and emulation code.
>
>     Signed-off-by: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>
>
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index e50d9b1..0006264 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -113,6 +113,7 @@ struct kvm_vcpu_arch {
>  					 instructions */
>
>  	/* dcache set/way operation pending */
> +	int last_pcpu;
>  	cpumask_t require_dcache_flush;
>
>  	/* IO related fields */
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 52e449e..38e9448 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -285,8 +285,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  	/*
>  	 * Check whether this vcpu requires the cache to be flushed on
>  	 * this physical CPU. This is a consequence of doing dcache
> -	 * operations by set/way on this vcpu. We do it here in order
> -	 * to be in a non-preemptible section.
> +	 * operations by set/way on this vcpu. We do it here to be in
> +	 * a non-preemptible section.
>  	 */
>  	if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
>  		flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
> @@ -535,15 +535,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>  		cond_resched();
>  		update_vttbr(vcpu->kvm);
>
> -		/*
> -		 * Make sure preemption is disabled while calling handle_exit
> -		 * as exit handling touches CPU-specific resources, such as
> -		 * caches, and we must stay on the same CPU.
> -		 *
> -		 * Code that might sleep must enable preemption and access
> -		 * CPU-specific resources first.
> -		 */
> -		preempt_disable();
>  		local_irq_disable();
>
>  		/*
> @@ -556,7 +547,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>
>  		if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
>  			local_irq_enable();
> -			preempt_enable();
>  			continue;
>  		}
>
> @@ -572,6 +562,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>  		ret = __kvm_vcpu_run(vcpu);
>
>  		vcpu->mode = OUTSIDE_GUEST_MODE;
> +		vcpu->arch.last_pcpu = smp_processor_id();
>  		vcpu->stat.exits++;
>  		kvm_guest_exit();
>  		trace_kvm_exit(vcpu->arch.regs.pc);
> @@ -582,7 +573,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>  		 *************************************************************/
>
>  		ret = handle_exit(vcpu, run, ret);
> -		preempt_enable();
>  	}
>
>  	if (vcpu->sigset_active)
> diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
> index 039fc3a..4a89710 100644
> --- a/arch/arm/kvm/emulate.c
> +++ b/arch/arm/kvm/emulate.c
> @@ -20,6 +20,7 @@
>  #include <asm/kvm_arm.h>
>  #include <asm/kvm_host.h>
>  #include <asm/kvm_emulate.h>
> +#include <asm/cacheflush.h>
>  #include <trace/events/kvm.h>
>
>  #include "trace.h"
> @@ -299,6 +300,18 @@ static bool write_dcsw(struct kvm_vcpu *vcpu,
>  			unsigned long cp15_reg)
>  {
>  	u32 val;
> +	int cpu;
> +
> +	cpu = get_cpu();
> +
> +	cpumask_setall(&vcpu->arch.require_dcache_flush);
> +	cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
> +
> +	/* If we were already preempted, take the long way around */
> +	if (cpu != vcpu->arch.last_pcpu) {
> +		flush_cache_all();
> +		goto done;
> +	}

So in this case, we nuke the caches on all CPUs. Not a big deal in my
opinion. Getting rid of the whole preempt_disable() trumps the above
cost any day, so I'm quite happy with this patch.

>  	val = *vcpu_reg(vcpu, p->Rt1);
>
> @@ -313,8 +326,8 @@ static bool write_dcsw(struct kvm_vcpu *vcpu,
>  		break;
>  	}
>
> -	cpumask_setall(&vcpu->arch.require_dcache_flush);
> -	cpumask_clear_cpu(vcpu->cpu, &vcpu->arch.require_dcache_flush);
> +done:
> +	put_cpu();
>
>  	return true;
>  }
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index 24769af..b4927ce 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -341,10 +341,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>  		return -EFAULT;
>  	}
>
> -	/* preemption disabled for handle_exit, gfn_to_pfn may sleep */
> -	preempt_enable();
>  	pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
> -	preempt_disable();
>
>  	if (is_error_pfn(pfn)) {
>  		put_page(pfn_to_page(pfn));
>

Acked-by: Marc Zyngier <marc.zyngier@xxxxxxx>

	M.
-- 
Jazz is not dead. It just smells funny...
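For reference, the flush-tracking scheme the final patch implements,
condensed into a standalone sketch (struct and function names are
illustrative, not the real kvm_vcpu layout):

#include <linux/cpumask.h>
#include <linux/smp.h>
#include <asm/cacheflush.h>

struct vcpu_cache_state {
	int last_pcpu;			/* CPU the guest last ran on */
	cpumask_t require_dcache_flush;	/* CPUs that must flush before
					 * running this vcpu again */
};

static void emulate_set_way(struct vcpu_cache_state *vc)
{
	int cpu = get_cpu();	/* disable preemption around the flush */

	/* All other physical CPUs flush lazily, at vcpu_load() time */
	cpumask_setall(&vc->require_dcache_flush);
	cpumask_clear_cpu(cpu, &vc->require_dcache_flush);

	if (cpu != vc->last_pcpu) {
		/*
		 * We were preempted between guest exit and emulation, so
		 * the dirty lines may sit in another CPU's cache: take
		 * the long way around and flush everything.
		 */
		flush_cache_all();
	} else {
		/* Still on the guest's CPU: do the set/way op locally. */
		/* ... emulate the requested cache operation here ... */
	}

	put_cpu();
}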