[PATCH] KVM: Defer remote tlb flushes on invlpg (v4)

On Sun, Mar 29, 2009 at 01:36:01PM +0300, Avi Kivity wrote:
> Marcelo, Andrea?

Had to read the code a bit more to understand the reason for the
unsync_mmu flush on cr3 overwrite.

> Avi Kivity wrote:
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 2a36f7f..f0ea56c 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -1184,8 +1184,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
>>  		for_each_sp(pages, sp, parents, i)
>>  			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
>>  -		if (protected)
>> -			kvm_flush_remote_tlbs(vcpu->kvm);
>> +		kvm_flush_remote_tlbs_cond(vcpu->kvm, protected);
>>   		for_each_sp(pages, sp, parents, i) {
>>  			kvm_sync_page(vcpu, sp);

Ok, so because we didn't flush the tlb on the other vcpus when invlpg
ran, if a cr3 overwrite needs to re-sync sptes by write-protecting them,
we have to flush the tlb on all vcpus, to be sure that any possibly
writable tlb entry reflecting the old writable spte instantiated before
invlpg ran is removed from the physical cpus. We wouldn't find it in
for_each_sp because it was rmap_removed, but we'll find something in
mmu_unsync_walk (right? we definitely have to find something in
mmu_unsync_walk for this to work: the parent sp has to leave
child->unsync set even after rmap_remove ran in invlpg without flushing
the other vcpus' tlbs).

>>  @@ -465,7 +464,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
>>  				rmap_remove(vcpu->kvm, sptep);
>>  				if (is_large_pte(*sptep))
>>  					--vcpu->kvm->stat.lpages;
>> -				need_flush = 1;
>> +				vcpu->kvm->remote_tlbs_dirty = true;
>>  			}
>>  			set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
>>  			break;
>> @@ -475,8 +474,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
>>  			break;
>>  	}
>>  -	if (need_flush)
>> -		kvm_flush_remote_tlbs(vcpu->kvm);
>>  	spin_unlock(&vcpu->kvm->mmu_lock);

AFAIK, to be correct on lowlevel archs (without ASNs it doesn't matter,
I think, as vmx always flushes on exit), we have to flush the local tlb
here, with set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests). I don't see why
it's missing. Or am I wrong?
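
(For completeness, the request is then serviced on the next guest
entry, roughly like this in vcpu_enter_guest; quoting from memory, this
isn't part of the patch:)

	if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
		kvm_x86_ops->tlb_flush(vcpu);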

>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 68b217e..12afa50 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -758,10 +758,18 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
>>   void kvm_flush_remote_tlbs(struct kvm *kvm)
>>  {
>> +	kvm->remote_tlbs_dirty = false;
>> +	smp_wmb();

Still no lock prefix on the asm insn, and here it runs outside the
mmu_lock, but ok, I tend to agree smp_wmb should be enough to be sure
the write is fully finished by the time smp_wmb returns. There's
another problem though.

CPU0				CPU1
-----------			-------------
remote_tlbs_dirty = false
				remote_tlbs_dirty = true
smp_tlb_flush
				set_shadow_pte(sptep, shadow_trap_nonpresent_pte);


The flush for the sptep will be lost.
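
As I read the fix below, the point is to mark remote_tlbs_dirty only
after the spte has actually been made nonpresent, so the two sides pair
up like this (a sketch of the intended ordering, not the literal code):

	/* invlpg side (under mmu_lock) */
	set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
	smp_wmb();		/* spte update visible first */
	vcpu->kvm->remote_tlbs_dirty = true;

	/* flush side */
	kvm->remote_tlbs_dirty = false;
	smp_wmb();		/* dirty clear visible before the IPIs */
	make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH);

If the flush side clears remote_tlbs_dirty, any spte update that
already set it is visible before the flush IPIs go out; an invlpg that
hasn't reached the dirty assignment yet will set it again afterwards,
and the next conditional flush catches it.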

>> @@ -907,8 +913,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
>>  	young = kvm_age_hva(kvm, address);
>>  	spin_unlock(&kvm->mmu_lock);
>>  -	if (young)
>> -		kvm_flush_remote_tlbs(kvm);
>> +	kvm_flush_remote_tlbs_cond(kvm, young);
>>   	return young;
>>  }

No need to flush for the clear_flush_young method: pages can't be freed
there, so skipping the flush at worst makes the young bit detection a
little less accurate.

I reworked the patch a bit, plus fixed the above smp race; let me know
what you think.

The best workload to exercise this is running a VM with lots of VCPUs
and 8G of ram with a 32bit guest kernel, and then just malloc and touch
a byte for each 4096-byte page allocated by malloc. That will run a
flood of invlpg. Then push the system to swap. Running "while :; do cp
/dev/hda /dev/null; done" also works: without O_DIRECT the host cache
makes it fast on the second run (though not so much faster when the
host is swapping).
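
Something like this untested sketch is enough to generate the invlpg
flood from inside the guest (touch one byte per 4096-byte page, as
described above):

#include <stdlib.h>

int main(void)
{
	for (;;) {
		size_t size = 64UL << 20;	/* 64M per iteration */
		size_t i;
		char *p = malloc(size);

		if (!p)
			return 0;
		/* fault in one byte per 4096-byte page */
		for (i = 0; i < size; i += 4096)
			p[i] = 1;
		free(p);
	}
}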

So far I only tested it with 12 VMs on swap with 64bit kernels under
heavy I/O, so it's not a good test, as I doubt any invlpg runs; not
even munmap(addr, 4k) uses invlpg.

Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx>
---

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d5bdf3a..900bc31 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1185,7 +1185,11 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 		for_each_sp(pages, sp, parents, i)
 			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
 
-		if (protected)
+		/*
+		 * Avoid leaving stale tlb entries in this vcpu representing
+		 * sptes rmap_removed by invlpg without vcpu smp tlb flush.
+		 */
+		if (protected || vcpu->kvm->remote_tlbs_dirty)
 			kvm_flush_remote_tlbs(vcpu->kvm);
 
 		for_each_sp(pages, sp, parents, i) {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 09782a9..060b4a3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -483,7 +483,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 	}
 
 	if (need_flush)
-		kvm_flush_remote_tlbs(vcpu->kvm);
+		kvm_flush_local_tlb(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	if (pte_gpa == -1)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 095ebb6..731b846 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -125,6 +125,7 @@ struct kvm_kernel_irq_routing_entry {
 struct kvm {
 	struct mutex lock; /* protects the vcpus array and APIC accesses */
 	spinlock_t mmu_lock;
+	bool remote_tlbs_dirty; /* set true under mmu_lock, cleared outside it */
 	struct rw_semaphore slots_lock;
 	struct mm_struct *mm; /* userspace tied to this vm */
 	int nmemslots;
@@ -235,6 +236,7 @@ void kvm_resched(struct kvm_vcpu *vcpu);
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_flush_remote_tlbs(struct kvm *kvm);
+void kvm_flush_local_tlb(struct kvm_vcpu *vcpu);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 
 long kvm_arch_dev_ioctl(struct file *filp,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 363af32..d955690 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -756,8 +756,45 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 	return called;
 }
 
+void kvm_flush_local_tlb(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * This guarantees that our current sptep is flushed from
+	 * the tlb too, even if there's a kvm_flush_remote_tlbs
+	 * running from under us (i.e. not running under mmu_lock,
+	 * like in the mmu notifier invocation).
+	 */
+	smp_wmb();
+	/*
+	 * If the below assignment gets lost because of the lack of atomic
+	 * ops and one or more concurrent writers in kvm_flush_remote_tlbs,
+	 * we know that any set_shadow_pte preceding kvm_flush_local_tlb
+	 * was visible to the other CPU, before KVM_REQ_TLB_FLUSH was set
+	 * in each vcpu->requests by kvm_flush_remote_tlbs.
+	 */
+	vcpu->kvm->remote_tlbs_dirty = true;
+
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+	/* get new asid before returning to guest mode */
+	if (!test_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+		set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
+#else
+	/*
+	 * Without CONFIG_MMU_NOTIFIER enabled this isn't enough, but
+	 * at least it reduces the risk window.
+	 */
+	kvm_flush_remote_tlbs(vcpu->kvm);
+#endif
+}
+
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
+	kvm->remote_tlbs_dirty = false;
+	/*
+	 * Guarantee that remote_tlbs_dirty is committed to memory
+	 * before setting KVM_REQ_TLB_FLUSH.
+	 */
+	smp_wmb();
 	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 		++kvm->stat.remote_tlb_flush;
 }
@@ -810,6 +847,23 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
 	return container_of(mn, struct kvm, mmu_notifier);
 }
 
+static void kvm_mmu_notifier_tlb_flush(int need_tlb_flush, struct kvm *kvm)
+{
+	/*
+	 * We have to flush the tlb before the pages can be freed.
+	 *
+	 * The important "remote_tlbs_dirty = true" that we can't miss
+	 * always runs under mmu_lock, before we took it in the above
+	 * spin_lock. Besides this, we have to be sure not to set
+	 * remote_tlbs_dirty to false inside kvm_flush_remote_tlbs
+	 * unless we're going to flush the guest smp tlb for any
+	 * relevant spte that was invalidated just before setting
+	 * "remote_tlbs_dirty = true".
+	 */
+	if (need_tlb_flush || kvm->remote_tlbs_dirty)
+		kvm_flush_remote_tlbs(kvm);
+}
+
 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 					     struct mm_struct *mm,
 					     unsigned long address)
@@ -840,10 +894,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
 	need_tlb_flush = kvm_unmap_hva(kvm, address);
 	spin_unlock(&kvm->mmu_lock);
 
-	/* we've to flush the tlb before the pages can be freed */
-	if (need_tlb_flush)
-		kvm_flush_remote_tlbs(kvm);
-
+	kvm_mmu_notifier_tlb_flush(need_tlb_flush, kvm);
 }
 
 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
@@ -865,9 +916,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		need_tlb_flush |= kvm_unmap_hva(kvm, start);
 	spin_unlock(&kvm->mmu_lock);
 
-	/* we've to flush the tlb before the pages can be freed */
-	if (need_tlb_flush)
-		kvm_flush_remote_tlbs(kvm);
+	kvm_mmu_notifier_tlb_flush(need_tlb_flush, kvm);
 }
 
 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -888,7 +937,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	 * The above sequence increase must be visible before the
 	 * below count decrease but both values are read by the kvm
 	 * page fault under mmu_lock spinlock so we don't need to add
-	 * a smb_wmb() here in between the two.
+	 * a smp_wmb() here in between the two.
 	 */
 	kvm->mmu_notifier_count--;
 	spin_unlock(&kvm->mmu_lock);