Re: [PATCH 1/2] KVM: fix cache stale memslot info with correct mmio generation number

Xiao Guangrong <xiaoguangrong.eric@xxxxxxxxx> · Tue, 19 Aug 2014 03:56:11 +0800

On Aug 19, 2014, at 2:47 AM, Paolo Bonzini <pbonzini@xxxxxxxxxx> wrote:

> 
>> I think this patch is auditable, page-fault is always called by holding
>> srcu-lock so that a page fault can’t go across synchronize_srcu_expedited.
>> Only these cases can happen:
>> 
>> 1)  page fault occurs before synchronize_srcu_expedited.
>>    In this case, vcpu will generate mmio-exit for the memslot being registered
>>    by the ioctl. That’s ok since the ioctl have not finished.
>> 
>> 2) page fault occurs after synchronize_srcu_expedited and during
>>   increasing generation-number.
>>   In this case, userspace may get wrong mmio-exit (that happen if handing
>>   page-fault is slower that the ioctl), that’s ok too since userspace need do
>>  the check anyway like i said above.
>> 
>> 3) page fault occurs after generation-number update
>>   that’s definitely correct. :)
>> 
>>> Another alternative could be to use the low bit to mark an in-progress
>>> change, and skip the caching if the low bit is set.  Similar to a
>>> seqcount (except if read_seqcount_retry fails, we just punt and not
>>> retry anything), you could use it even though the memory barriers
>>> provided by write_seqcount_begin/end are not too useful in this case.
>> 
>> I do not know how the bit works, page fault will cache the memslot before
>> the bit set and cache the generation-number after the bit set.
>> 
>> Maybe i missed your idea, could you please detail it?
> 
> Something like this:
> 
> -	update_memslots(slots, new, kvm->memslots->generation);
> +	/* ensure generation number is always increased. */
> +	slots->generation = old_memslots->generation + 1;
> +	update_memslots(slots, new);
> 	rcu_assign_pointer(kvm->memslots, slots);
> 	synchronize_srcu_expedited(&kvm->srcu);
> +	slots->generation++;
> 
> Then case 1 and 2 will just have a cache miss.

So, case 2 is what you concerned? :) I still think it is ok but i do not have
strong opinion on that. How about simplify it like this:

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9314678..9fabf6a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -234,16 +234,22 @@ static unsigned int get_mmio_spte_generation(u64 spte)
 	return gen;
 }
 
-static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+static unsigned int __kvm_current_mmio_generation(struct kvm_memslots *slots)
 {
+
 	/*
 	 * Init kvm generation close to MMIO_MAX_GEN to easily test the
 	 * code of handling generation number wrap-around.
 	 */
-	return (kvm_memslots(kvm)->generation +
+	return (slots->generation +
 		      MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
 }
 
+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+	return __kvm_current_mmio_generation(kvm_memslots(kvm));
+}
+
 static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
 			   unsigned access)
 {
@@ -287,9 +293,15 @@ static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
 
 static bool check_mmio_spte(struct kvm *kvm, u64 spte)
 {
+	struct kvm_memslots *slots = kvm_memslots(kvm);
 	unsigned int kvm_gen, spte_gen;
 
-	kvm_gen = kvm_current_mmio_generation(kvm);
+	if (slots->updated)
+		return false;
+
+	smp_rmb();
+	
+	kvm_gen = __kvm_current_mmio_generation(slots);
 	spte_gen = get_mmio_spte_generation(spte);
 
 	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4b6c01b..1d4e78f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -96,7 +96,7 @@ static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 static void update_memslots(struct kvm_memslots *slots,
-			    struct kvm_memory_slot *new, u64 last_generation);
+			    struct kvm_memory_slot *new);
 
 static void kvm_release_pfn_dirty(pfn_t pfn);
 static void mark_page_dirty_in_slot(struct kvm *kvm,
@@ -685,8 +685,7 @@ static void sort_memslots(struct kvm_memslots *slots)
 }
 
 static void update_memslots(struct kvm_memslots *slots,
-			    struct kvm_memory_slot *new,
-			    u64 last_generation)
+			    struct kvm_memory_slot *new)
 {
 	if (new) {
 		int id = new->id;
@@ -697,8 +696,6 @@ static void update_memslots(struct kvm_memslots *slots,
 		if (new->npages != npages)
 			sort_memslots(slots);
 	}
-
-	slots->generation = last_generation + 1;
 }
 
 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
@@ -720,10 +717,17 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 {
 	struct kvm_memslots *old_memslots = kvm->memslots;
 
-	update_memslots(slots, new, kvm->memslots->generation);
+	/* ensure generation number is always increased. */
+	slots->updated = true;
+	slots->generation = old_memslots->generation;
+	update_memslots(slots, new);
 	rcu_assign_pointer(kvm->memslots, slots);
 	synchronize_srcu_expedited(&kvm->srcu);
 
+	slots->generation++;
+	smp_wmb();
+	slots->updated = false;
+
 	kvm_arch_memslots_updated(kvm);
 
 	return old_memslots;

--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html