On Tue, Jul 31, 2012 at 2:27 PM, Christoffer Dall
<c.dall@xxxxxxxxxxxxxxxxxxxxxx> wrote:
> On Thu, Jul 5, 2012 at 5:03 PM, Marc Zyngier <marc.zyngier@xxxxxxx> wrote:
>> Now that we run the exit handler with preemption disabled, we start
>> seeing some very ugly things going on:
>>
>> BUG: scheduling while atomic: qemu-system-arm/1144/0x00000002
>> Modules linked in:
>> [<c0015f3c>] (unwind_backtrace+0x0/0xf8) from [<c03f54e0>] (__schedule_bug+0x44/0x58)
>> [<c03f54e0>] (__schedule_bug+0x44/0x58) from [<c0402690>] (__schedule+0x620/0x644)
>> [<c0402690>] (__schedule+0x620/0x644) from [<c0402ad8>] (schedule_preempt_disabled+0x24/0x34)
>> [<c0402ad8>] (schedule_preempt_disabled+0x24/0x34) from [<c04016ac>] (__mutex_lock_slowpath+0x120/0x1a8)
>> [<c04016ac>] (__mutex_lock_slowpath+0x120/0x1a8) from [<c040176c>] (mutex_lock+0x38/0x3c)
>> [<c040176c>] (mutex_lock+0x38/0x3c) from [<c0026ab4>] (user_mem_abort.isra.9+0xbc/0x1fc)
>> [<c0026ab4>] (user_mem_abort.isra.9+0xbc/0x1fc) from [<c00273bc>] (kvm_handle_guest_abort+0xdc/0x164)
>> [<c00273bc>] (kvm_handle_guest_abort+0xdc/0x164) from [<c0024fcc>] (handle_exit+0x74/0x11c)
>> [<c0024fcc>] (handle_exit+0x74/0x11c) from [<c00255c0>] (kvm_arch_vcpu_ioctl_run+0x138/0x21c)
>> [<c00255c0>] (kvm_arch_vcpu_ioctl_run+0x138/0x21c) from [<c002214c>] (kvm_vcpu_ioctl+0x3d8/0x6bc)
>> [<c002214c>] (kvm_vcpu_ioctl+0x3d8/0x6bc) from [<c00e0990>] (do_vfs_ioctl+0x7c/0x2d0)
>> [<c00e0990>] (do_vfs_ioctl+0x7c/0x2d0) from [<c00e0c1c>] (sys_ioctl+0x38/0x5c)
>> [<c00e0c1c>] (sys_ioctl+0x38/0x5c) from [<c000eec0>] (ret_fast_syscall+0x0/0x30)
>>
>> The culprit is the stage-2 lock, which is defined as a mutex, and
>> really should be a spinlock.
>>
>> Signed-off-by: Marc Zyngier <marc.zyngier@xxxxxxx>
>> ---
>>  arch/arm/include/asm/kvm_host.h |  2 +-
>>  arch/arm/kvm/arm.c              |  2 +-
>>  arch/arm/kvm/mmu.c              | 12 ++++++------
>>  3 files changed, 8 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
>> index 0c7e782..9d0cc83 100644
>> --- a/arch/arm/include/asm/kvm_host.h
>> +++ b/arch/arm/include/asm/kvm_host.h
>> @@ -40,7 +40,7 @@ struct kvm_arch {
>>         u32    vmid;
>>
>>         /* 1-level 2nd stage table and lock */
>> -       struct mutex pgd_mutex;
>> +       spinlock_t pgd_lock;
>>         pgd_t *pgd;
>>
>>         /* VTTBR value associated with above pgd and vmid */
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index f3b206a..44691e1 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -97,7 +97,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>>         ret = kvm_alloc_stage2_pgd(kvm);
>>         if (ret)
>>                 goto out_fail_alloc;
>> -       mutex_init(&kvm->arch.pgd_mutex);
>> +       spin_lock_init(&kvm->arch.pgd_lock);
>>
>>         ret = create_hyp_mappings(kvm, kvm + 1);
>>         if (ret)
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index 8c30376..4b174e6 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -354,14 +354,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>                 return -EFAULT;
>>         }
>>
>> -       mutex_lock(&vcpu->kvm->arch.pgd_mutex);
>> +       spin_lock(&vcpu->kvm->arch.pgd_lock);
>>         new_pte = pfn_pte(pfn, PAGE_KVM_GUEST);
>>         if (writable)
>>                 new_pte |= L_PTE2_WRITE;
>>         ret = stage2_set_pte(vcpu->kvm, fault_ipa, &new_pte);
>>         if (ret)
>>                 put_page(pfn_to_page(pfn));
>> -       mutex_unlock(&vcpu->kvm->arch.pgd_mutex);
>> +       spin_unlock(&vcpu->kvm->arch.pgd_lock);
>>
>>         return ret;
>>  }
>> @@ -611,13 +611,13 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
>>         if (!kvm->arch.pgd)
>>                 return 0;
>>
>> -       mutex_lock(&kvm->arch.pgd_mutex);
>> +       spin_lock(&kvm->arch.pgd_lock);
>
> eh, hva_to_gpa takes its own mutex and may sleep while holding a spinlock...
>
> also stage2_set_pte below allocates memory and may sleep...
>
> Perhaps we should instead call the cpu-local-critical cache functions
> on the required CPU and run the whole thing with preemption enabled
> instead...?
>

In fact, it looks simple enough to me (comments please):

commit 95d11541f77783f00b8799a94ed99f1b1b321b1a
Author: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>
Date:   Tue Jul 31 10:00:29 2012 -0400

    ARM: KVM: Run exit handling under preemption again

    This makes for a much cleaner approach with fewer worries during exit
    handling and allows for correct cache maintenance for set/way
    operations on SMP, at the cost of taking a slight performance hit when
    the guest does set/way based cache operations _and_ when we are
    preempted between guest exit and emulation code.

    Signed-off-by: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index e50d9b1..0006264 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -113,6 +113,7 @@ struct kvm_vcpu_arch {
                                           instructions */

        /* dcache set/way operation pending */
+       int last_pcpu;
        cpumask_t require_dcache_flush;

        /* IO related fields */
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 52e449e..38e9448 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -285,8 +285,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        /*
         * Check whether this vcpu requires the cache to be flushed on
         * this physical CPU. This is a consequence of doing dcache
-        * operations by set/way on this vcpu. We do it here in order
-        * to be in a non-preemptible section.
+        * operations by set/way on this vcpu. We do it here to be in
+        * a non-preemptible section.
         */
        if (cpumask_test_and_clear_cpu(cpu, &vcpu->arch.require_dcache_flush))
                flush_cache_all(); /* We'd really want v7_flush_dcache_all() */
@@ -535,15 +535,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                cond_resched();
                update_vttbr(vcpu->kvm);

-               /*
-                * Make sure preemption is disabled while calling handle_exit
-                * as exit handling touches CPU-specific resources, such as
-                * caches, and we must stay on the same CPU.
-                *
-                * Code that might sleep must enable preemption and access
-                * CPU-specific resources first.
-                */
-               preempt_disable();
                local_irq_disable();

                /*
@@ -556,7 +547,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)

                if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
                        local_irq_enable();
-                       preempt_enable();
                        continue;
                }

@@ -572,6 +562,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                ret = __kvm_vcpu_run(vcpu);

                vcpu->mode = OUTSIDE_GUEST_MODE;
+               vcpu->arch.last_pcpu = smp_processor_id();
                vcpu->stat.exits++;
                kvm_guest_exit();
                trace_kvm_exit(vcpu->arch.regs.pc);
@@ -582,7 +573,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                 *************************************************************/

                ret = handle_exit(vcpu, run, ret);
-               preempt_enable();
        }

        if (vcpu->sigset_active)
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
index 039fc3a..4a89710 100644
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -20,6 +20,7 @@
 #include <asm/kvm_arm.h>
 #include <asm/kvm_host.h>
 #include <asm/kvm_emulate.h>
+#include <asm/cacheflush.h>
 #include <trace/events/kvm.h>

 #include "trace.h"
@@ -299,6 +300,18 @@ static bool write_dcsw(struct kvm_vcpu *vcpu,
                       unsigned long cp15_reg)
 {
        u32 val;
+       int cpu;
+
+       cpu = get_cpu();
+
+       cpumask_setall(&vcpu->arch.require_dcache_flush);
+       cpumask_clear_cpu(cpu, &vcpu->arch.require_dcache_flush);
+
+       /* If we were already preempted, take the long way around */
+       if (cpu != vcpu->arch.last_pcpu) {
+               flush_cache_all();
+               goto done;
+       }

        val = *vcpu_reg(vcpu, p->Rt1);

@@ -313,8 +326,8 @@ static bool write_dcsw(struct kvm_vcpu *vcpu,
                break;
        }

-       cpumask_setall(&vcpu->arch.require_dcache_flush);
-       cpumask_clear_cpu(vcpu->cpu, &vcpu->arch.require_dcache_flush);
+done:
+       put_cpu();

        return true;
 }
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 24769af..b4927ce 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -341,10 +341,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                return -EFAULT;
        }

-       /* preemption disabled for handle_exit, gfn_to_pfn may sleep */
-       preempt_enable();
        pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
-       preempt_disable();

        if (is_error_pfn(pfn)) {
                put_page(pfn_to_page(pfn));
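
To make the control flow of the patch easier to follow, here is a small
stand-alone C sketch of the bookkeeping it introduces: a set/way operation is
emulated on the current physical CPU, every other CPU is marked as needing a
full dcache flush, and the flush happens either immediately (when we were
preempted between guest exit and emulation, i.e. cpu != last_pcpu) or lazily
the next time the vcpu is loaded on a marked CPU. This is only an illustrative
user-space model, not kernel code: NR_CPUS, the plain bitmask and the printf()
calls stand in for the kernel's cpumask_t, get_cpu()/smp_processor_id() and
flush_cache_all(), and the *_model names are invented for this sketch.

/*
 * Stand-alone model of the deferred dcache-flush bookkeeping above.
 * Assumes NR_CPUS < 32 so a plain unsigned int can act as the cpumask.
 */
#include <stdio.h>

#define NR_CPUS 4

struct vcpu_model {
	int last_pcpu;                      /* pcpu the guest last ran on */
	unsigned int require_dcache_flush;  /* one bit per physical CPU   */
};

/* Guest executed a dc set/way op; we are emulating it on 'cpu'. */
static void write_dcsw_model(struct vcpu_model *vcpu, int cpu)
{
	/* Every other pcpu must flush before this vcpu runs there again. */
	vcpu->require_dcache_flush = (1u << NR_CPUS) - 1;
	vcpu->require_dcache_flush &= ~(1u << cpu);

	if (cpu != vcpu->last_pcpu) {
		/*
		 * Preempted between guest exit and emulation: the cache
		 * state we care about is on another CPU, so take the
		 * expensive path and flush everything here.
		 */
		printf("cpu%d: flush_cache_all()\n", cpu);
		return;
	}

	printf("cpu%d: emulate set/way locally\n", cpu);
}

/* The vcpu is being scheduled onto 'cpu'; honour any pending flush. */
static void vcpu_load_model(struct vcpu_model *vcpu, int cpu)
{
	if (vcpu->require_dcache_flush & (1u << cpu)) {
		vcpu->require_dcache_flush &= ~(1u << cpu);
		printf("cpu%d: flush_cache_all() in vcpu_load\n", cpu);
	}
}

int main(void)
{
	struct vcpu_model vcpu = { .last_pcpu = 0, .require_dcache_flush = 0 };

	write_dcsw_model(&vcpu, 0);  /* same pcpu: cheap local emulation    */
	vcpu_load_model(&vcpu, 2);   /* later migrated to cpu2: must flush  */
	return 0;
}

Running it shows the cheap path (local emulation on the pcpu the guest last
ran on) followed by the deferred flush when the vcpu is next loaded on another
CPU, mirroring the write_dcsw()/kvm_arch_vcpu_load() pair in the patch above.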