Allow vcpus to pin spte translations by:

1) Creating a per-vcpu list of pinned ranges.
2) On mmu reload request:
   - Fault in the registered ranges.
   - Mark the sptes with a pinned bit.
   - Mark the shadow pages as pinned.
3) Then modify the following actions:
   - Page age => do not drop pinned sptes; report the page as young instead.
   - MMU notifiers => force an mmu reload request
     (which kicks the cpu out of guest mode).
   - GET_DIRTY_LOG => force an mmu reload request.
   - SLAB shrinker => skip deletion of pinned shadow pages.

TDP-only.

Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>

---
 arch/x86/include/asm/kvm_host.h |   13 +
 arch/x86/kvm/mmu.c              |  294 +++++++++++++++++++++++++++++++++++++---
 arch/x86/kvm/mmu.h              |    7
 arch/x86/kvm/mmutrace.h         |   23 +++
 arch/x86/kvm/paging_tmpl.h      |    4
 arch/x86/kvm/x86.c              |    8 -
 include/linux/kvm_host.h        |    3
 include/uapi/linux/kvm.h        |    2
 virt/kvm/kvm_main.c             |   18 +-
 9 files changed, 340 insertions(+), 32 deletions(-)

Index: kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/include/asm/kvm_host.h	2014-07-09 12:05:34.836161266 -0300
+++ kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h	2014-07-09 12:08:45.341762782 -0300
@@ -97,6 +97,8 @@
 #define KVM_NR_FIXED_MTRR_REGION 88
 #define KVM_NR_VAR_MTRR 8
 
+#define KVM_MAX_PER_VCPU_PINNED_RANGE 10
+
 #define ASYNC_PF_PER_VCPU 64
 
 struct kvm_vcpu;
@@ -221,6 +223,8 @@
 	/* hold the gfn of each spte inside spt */
 	gfn_t *gfns;
 	bool unsync;
+	bool pinned;
+
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
@@ -337,6 +341,12 @@
 	KVM_DEBUGREG_WONT_EXIT = 2,
 };
 
+struct kvm_pinned_page_range {
+	gfn_t base_gfn;
+	unsigned long npages;
+	struct list_head link;
+};
+
 struct kvm_vcpu_arch {
 	/*
 	 * rip and regs accesses must go through
@@ -392,6 +402,9 @@
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
+	struct list_head pinned_mmu_pages;
+	atomic_t nr_pinned_ranges;
+
 	struct fpu guest_fpu;
 	u64 xcr0;
 	u64 guest_supported_xcr0;
Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 12:05:34.837161264 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 12:09:21.856684314 -0300
@@ -148,6 +148,9 @@
 #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+#define SPTE_PINNED		(1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT))
+
+#define SPTE_PINNED_BIT PT64_SECOND_AVAIL_BITS_SHIFT
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
@@ -327,6 +330,11 @@
 	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
 }
 
+static int is_pinned_spte(u64 spte)
+{
+	return spte & SPTE_PINNED && is_shadow_present_pte(spte);
+}
+
 static int is_large_pte(u64 pte)
 {
 	return pte & PT_PAGE_SIZE_MASK;
@@ -1176,6 +1184,16 @@
 		kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
+static bool vcpu_has_pinned(struct kvm_vcpu *vcpu)
+{
+	return atomic_read(&vcpu->arch.nr_pinned_ranges);
+}
+
+static void mmu_reload_pinned_vcpus(struct kvm *kvm)
+{
+	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD, &vcpu_has_pinned);
+}
+
 /*
  * Write-protect on the specified @sptep, @pt_protect indicates whether
  * spte write-protection is caused by protecting shadow page table.
@@ -1268,7 +1286,8 @@
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			   struct kvm_memory_slot *slot, unsigned long data)
+			   struct kvm_memory_slot *slot, unsigned long data,
+			   bool age)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1278,6 +1297,14 @@
 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
 
+		if (is_pinned_spte(*sptep)) {
+			/* don't nuke pinned sptes if page aging: return
+			 * young=yes instead.
+			 */
+			if (age)
+				return 1;
+			mmu_reload_pinned_vcpus(kvm);
+		}
 		drop_spte(kvm, sptep);
 		need_tlb_flush = 1;
 	}
@@ -1286,7 +1313,8 @@
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			     struct kvm_memory_slot *slot, unsigned long data)
+			     struct kvm_memory_slot *slot, unsigned long data,
+			     bool age)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1304,6 +1332,9 @@
 
 		need_flush = 1;
 
+		if (is_pinned_spte(*sptep))
+			mmu_reload_pinned_vcpus(kvm);
+
 		if (pte_write(*ptep)) {
 			drop_spte(kvm, sptep);
 			sptep = rmap_get_first(*rmapp, &iter);
@@ -1334,7 +1365,8 @@
 				int (*handler)(struct kvm *kvm,
 					       unsigned long *rmapp,
 					       struct kvm_memory_slot *slot,
-					       unsigned long data))
+					       unsigned long data,
+					       bool age))
 {
 	int j;
 	int ret = 0;
@@ -1374,7 +1406,7 @@
 			rmapp = __gfn_to_rmap(gfn_start, j, memslot);
 
 			for (; idx <= idx_end; ++idx)
-				ret |= handler(kvm, rmapp++, memslot, data);
+				ret |= handler(kvm, rmapp++, memslot, data, false);
 		}
 	}
 
@@ -1385,7 +1417,8 @@
 			  unsigned long data,
 			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
 					 struct kvm_memory_slot *slot,
-					 unsigned long data))
+					 unsigned long data,
+					 bool age))
 {
 	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
 }
@@ -1406,7 +1439,8 @@
 }
 
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			 struct kvm_memory_slot *slot, unsigned long data)
+			 struct kvm_memory_slot *slot, unsigned long data,
+			 bool age)
 {
 	u64 *sptep;
 	struct rmap_iterator uninitialized_var(iter);
@@ -1421,7 +1455,7 @@
 	 * out actively used pages or breaking up actively used hugepages.
 	 */
 	if (!shadow_accessed_mask) {
-		young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
+		young = kvm_unmap_rmapp(kvm, rmapp, slot, data, true);
 		goto out;
 	}
 
@@ -1442,7 +1476,8 @@
 }
 
 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			      struct kvm_memory_slot *slot, unsigned long data)
+			      struct kvm_memory_slot *slot, unsigned long data,
+			      bool age)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1480,7 +1515,7 @@
 
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
-	kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
+	kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0, false);
 	kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
@@ -2753,7 +2788,8 @@
 }
 
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
-				pfn_t pfn, unsigned access, int *ret_val)
+				pfn_t pfn, unsigned access, int *ret_val,
+				bool pin)
 {
 	bool ret = true;
 
@@ -2763,8 +2799,14 @@
 		goto exit;
 	}
 
-	if (unlikely(is_noslot_pfn(pfn)))
+	if (unlikely(is_noslot_pfn(pfn))) {
+		/* pinned sptes must point to RAM */
+		if (unlikely(pin)) {
+			*ret_val = -EFAULT;
+			goto exit;
+		}
 		vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+	}
 
 	ret = false;
 exit:
@@ -2818,7 +2860,7 @@
  * - false: let the real page fault path to fix it.
 */
static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
-			    u32 error_code)
+			    u32 error_code, bool pin)
{
	struct kvm_shadow_walk_iterator iterator;
	struct kvm_mmu_page *sp;
@@ -2828,6 +2870,9 @@
	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
		return false;

+	if (pin)
+		return false;
+
	if (!page_fault_can_be_fast(error_code))
		return false;

@@ -2895,9 +2940,71 @@
}

static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
+			 gva_t gva, pfn_t *pfn, bool write, bool *writable,
+			 bool pin);
 static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
+
+static int get_sptep_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes[4])
+
+{
+	struct kvm_shadow_walk_iterator iterator;
+	int nr_sptes = 0;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return nr_sptes;
+
+	for_each_shadow_entry(vcpu, addr, iterator) {
+		sptes[iterator.level-1] = iterator.sptep;
+		nr_sptes++;
+		if (!is_shadow_present_pte(*iterator.sptep))
+			break;
+	}
+
+	return nr_sptes;
+}
+
+static bool __direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, bool pin)
+{
+	u64 *sptes[4];
+	int r, i, level;
+
+	r = get_sptep_hierarchy(vcpu, gfn << PAGE_SHIFT, sptes);
+	if (!r)
+		return false;
+
+	level = 5 - r;
+	if (!is_last_spte(*sptes[level-1], level))
+		return false;
+	if (!is_shadow_present_pte(*sptes[level-1]))
+		return false;
+
+	for (i = 0; i < r; i++) {
+		u64 *sptep = sptes[3-i];
+		struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+		if (pin) {
+			sp->pinned = true;
+			set_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
+		} else {
+			sp->pinned = false;
+			clear_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
+		}
+	}
+
+	return true;
+}
+
+static bool direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	return __direct_pin_sptes(vcpu, gfn, true);
+}
+
+static bool direct_unpin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	return __direct_pin_sptes(vcpu, gfn, false);
+}
+
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
			 gfn_t gfn, bool prefault, bool pin, bool *pinned)
 {
@@ -2923,16 +3030,17 @@
	} else
		level = PT_PAGE_TABLE_LEVEL;

-	if (fast_page_fault(vcpu, v, level, error_code))
+	if (fast_page_fault(vcpu, v, level, error_code, pin))
		return 0;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();

-	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
+	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable,
+			 pin))
		return 0;

-	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
+	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r, pin))
		return r;

	spin_lock(&vcpu->kvm->mmu_lock);
@@ -2943,6 +3051,8 @@
	transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
			 prefault);
+	if (pin)
+		*pinned = direct_pin_sptes(vcpu, gfn);

	spin_unlock(&vcpu->kvm->mmu_lock);

@@ -3131,7 +3241,7 @@
			lm_root = (void*)get_zeroed_page(GFP_KERNEL);
			if (lm_root == NULL)
-				return 1;
+				return -ENOMEM;

			lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;

@@ -3349,7 +3459,8 @@
}

static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-			 gva_t gva, pfn_t *pfn, bool write, bool *writable)
+			 gva_t gva, pfn_t *pfn, bool write, bool *writable,
+			 bool pin)
{
	bool async;

@@ -3358,7 +3469,7 @@
	if (!async)
		return false; /* *pfn has correct page already */

-	if (!prefault && can_do_async_pf(vcpu)) {
+	if (!prefault && !pin && can_do_async_pf(vcpu)) {
		trace_kvm_try_async_get_page(gva, gfn);
		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
			trace_kvm_async_pf_doublefault(gva, gfn);
@@ -3406,16 +3517,17 @@
	} else
		level = PT_PAGE_TABLE_LEVEL;

-	if (fast_page_fault(vcpu, gpa, level, error_code))
+	if (fast_page_fault(vcpu, gpa, level, error_code, pin))
		return 0;

	mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();

-	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable,
+			 pin))
		return 0;

-	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
+	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r, pin))
		return r;

	spin_lock(&vcpu->kvm->mmu_lock);
@@ -3426,6 +3538,8 @@
	transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
	r = __direct_map(vcpu, gpa, write, map_writable,
			 level, gfn, pfn, prefault);
+	if (pin)
+		*pinned = direct_pin_sptes(vcpu, gfn);
	spin_unlock(&vcpu->kvm->mmu_lock);

	return r;
@@ -3903,6 +4017,141 @@
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);

+int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
+				  gfn_t base_gfn, unsigned long npages)
+{
+	struct kvm_pinned_page_range *p;
+
+	if (!tdp_enabled) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
+		if (p->base_gfn == base_gfn && p->npages == npages) {
+			return -EEXIST;
+		}
+	}
+
+	if (atomic_read(&vcpu->arch.nr_pinned_ranges) >=
+	    KVM_MAX_PER_VCPU_PINNED_RANGE)
+		return -ENOSPC;
+
+	p = kzalloc(sizeof(struct kvm_pinned_page_range), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	atomic_inc(&vcpu->arch.nr_pinned_ranges);
+
+	trace_kvm_mmu_register_pinned_range(vcpu->vcpu_id, base_gfn, npages);
+
+	INIT_LIST_HEAD(&p->link);
+	p->base_gfn = base_gfn;
+	p->npages = npages;
+	list_add(&p->link, &vcpu->arch.pinned_mmu_pages);
+	kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+
+	return 0;
+}
+
+
+void unregister_pinned_sptes(struct kvm_vcpu *vcpu, unsigned long base_gfn,
+			     unsigned long npages)
+{
+	gfn_t gfn;
+
+	for (gfn = base_gfn; gfn < base_gfn+npages; gfn++)
+		direct_unpin_sptes(vcpu, gfn);
+
+}
+
+int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
+				    gfn_t base_gfn, unsigned long npages)
+{
+	struct kvm_pinned_page_range *p;
+
+	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
+		if (p->base_gfn == base_gfn && p->npages == npages) {
+			list_del(&p->link);
+			atomic_dec(&vcpu->arch.nr_pinned_ranges);
+			spin_lock(&vcpu->kvm->mmu_lock);
+			mmu_reload_pinned_vcpus(vcpu->kvm);
+			unregister_pinned_sptes(vcpu, base_gfn, npages);
+			spin_unlock(&vcpu->kvm->mmu_lock);
+			kfree(p);
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pinned_page_range *p, *p2;
+
+	list_for_each_entry_safe(p, p2, &vcpu->arch.pinned_mmu_pages, link) {
+		list_del(&p->link);
+		kfree(p);
+	}
+}
+
+/*
+ * Pin KVM MMU page translations. This guarantees that, for valid
+ * addresses registered by kvm_mmu_register_pinned_range (a valid address
+ * being one which possesses sufficient information for the fault to
+ * be resolved), valid translations exist while in guest mode and
+ * therefore no VM-exits due to faults will occur.
+ *
+ * Failure to instantiate pages will abort guest entry.
+ *
+ * Pinning is not guaranteed while executing as L2 guest.
+ *
+ */
+
+static int kvm_mmu_pin_pages(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pinned_page_range *p;
+	int r = 1;
+
+	if (is_guest_mode(vcpu))
+		return r;
+
+	if (!vcpu->arch.mmu.direct_map)
+		return r;
+
+	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
+		gfn_t gfn_offset;
+
+		for (gfn_offset = 0; gfn_offset < p->npages; gfn_offset++) {
+			gfn_t gfn = p->base_gfn + gfn_offset;
+			int r;
+			bool pinned = false;
+
+			r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
+						      PFERR_WRITE_MASK, false,
+						      true, &pinned);
+			/* MMU notifier sequence window: retry */
+			if (!r && !pinned)
+				kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+			if (r) {
+				vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+				vcpu->run->internal.suberror =
+					KVM_INTERNAL_ERROR_PIN_FAILURE;
+				vcpu->run->internal.ndata = 1;
+				vcpu->run->internal.data[0] = gfn;
+				r = 0;
+				goto out;
+			}
+
+		}
+	}
+out:
+	return r;
+}
+
 int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
	int r;
@@ -3916,6 +4165,7 @@
		goto out;
	/* set_cr3() should ensure TLB has been flushed */
	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+	r = kvm_mmu_pin_pages(vcpu);
 out:
	return r;
 }
Index: kvm.pinned-sptes/arch/x86/kvm/mmu.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.h	2014-07-09 12:05:30.018171068 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmu.h	2014-07-09 12:08:45.343762778 -0300
@@ -94,7 +94,7 @@
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 {
	if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE))
-		return 0;
+		return 1;

	return kvm_mmu_load(vcpu);
 }
@@ -178,4 +178,9 @@
 }

 void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
+int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
+				  gfn_t base_gfn, unsigned long npages);
+int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
+				    gfn_t base_gfn, unsigned long npages);
+void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu);
 #endif
Index: kvm.pinned-sptes/arch/x86/kvm/x86.c
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/x86.c	2014-07-09 12:05:34.838161262 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/x86.c	2014-07-09 12:08:45.346762771 -0300
@@ -6017,7 +6017,7 @@
	}

	r = kvm_mmu_reload(vcpu);
-	if (unlikely(r)) {
+	if (unlikely(r <= 0)) {
		goto cancel_injection;
	}

@@ -7049,6 +7049,8 @@
	kvm_async_pf_hash_reset(vcpu);
	kvm_pmu_init(vcpu);

+	INIT_LIST_HEAD(&vcpu->arch.pinned_mmu_pages);
+	atomic_set(&vcpu->arch.nr_pinned_ranges, 0);
	return 0;
 fail_free_wbinvd_dirty_mask:
@@ -7069,6 +7071,7 @@
 {
	int idx;

+	kvm_mmu_free_pinned_ranges(vcpu);
	kvm_pmu_destroy(vcpu);
	kfree(vcpu->arch.mce_banks);
	kvm_free_lapic(vcpu);
@@ -7113,6 +7116,7 @@
	int r;
	r = vcpu_load(vcpu);
	BUG_ON(r);
+	kvm_mmu_free_pinned_ranges(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
 }
@@ -7408,7 +7412,7 @@
		return;

	r = kvm_mmu_reload(vcpu);
-	if (unlikely(r))
+	if (unlikely(r <= 0))
		return;

	if (!vcpu->arch.mmu.direct_map &&
Index: kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/paging_tmpl.h	2014-07-09 12:05:34.837161264 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h	2014-07-09 12:08:45.346762771 -0300
@@ -747,11 +747,11 @@
	smp_rmb();

	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
-			 &map_writable))
+			 &map_writable, false))
		return 0;

	if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
-				walker.gfn, pfn, walker.pte_access, &r))
+				walker.gfn, pfn, walker.pte_access, &r, false))
		return r;

	/*
Index: kvm.pinned-sptes/arch/x86/kvm/mmutrace.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmutrace.h	2014-07-09 12:05:30.018171068 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmutrace.h	2014-07-09 12:08:45.347762769 -0300
@@ -322,6 +322,29 @@
		__entry->kvm_gen == __entry->spte_gen
	)
);
+
+TRACE_EVENT(
+	kvm_mmu_register_pinned_range,
+	TP_PROTO(unsigned int vcpu_id, gfn_t gfn, unsigned long npages),
+	TP_ARGS(vcpu_id, gfn, npages),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	vcpu_id	)
+		__field(	gfn_t,		gfn	)
+		__field(	unsigned long,	npages	)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id	= vcpu_id;
+		__entry->gfn		= gfn;
+		__entry->npages		= npages;
+	),
+
+	TP_printk("vcpu_id %u gfn %llx npages %lx",
+		  __entry->vcpu_id,
+		  __entry->gfn,
+		  __entry->npages)
+);
 #endif /* _TRACE_KVMMMU_H */

 #undef TRACE_INCLUDE_PATH
Index: kvm.pinned-sptes/include/uapi/linux/kvm.h
===================================================================
--- kvm.pinned-sptes.orig/include/uapi/linux/kvm.h	2014-07-09 12:05:30.019171066 -0300
+++ kvm.pinned-sptes/include/uapi/linux/kvm.h	2014-07-09 12:08:45.347762769 -0300
@@ -180,6 +180,8 @@
 #define KVM_INTERNAL_ERROR_SIMUL_EX	2
 /* Encounter unexpected vm-exit due to delivery event. */
 #define KVM_INTERNAL_ERROR_DELIVERY_EV	3
+/* Failure to pin address translation. */
+#define KVM_INTERNAL_ERROR_PIN_FAILURE	4

 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
Index: kvm.pinned-sptes/include/linux/kvm_host.h
===================================================================
--- kvm.pinned-sptes.orig/include/linux/kvm_host.h	2014-07-09 12:05:30.019171066 -0300
+++ kvm.pinned-sptes/include/linux/kvm_host.h	2014-07-09 12:08:45.348762767 -0300
@@ -591,6 +591,9 @@
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);

+bool make_all_cpus_request(struct kvm *kvm, unsigned int req,
+			   bool (*vcpukick)(struct kvm_vcpu *));
+
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 void kvm_make_mclock_inprogress_request(struct kvm *kvm);
Index: kvm.pinned-sptes/virt/kvm/kvm_main.c
===================================================================
--- kvm.pinned-sptes.orig/virt/kvm/kvm_main.c	2014-07-09 12:05:30.019171066 -0300
+++ kvm.pinned-sptes/virt/kvm/kvm_main.c	2014-07-09 12:08:45.349762765 -0300
@@ -152,7 +152,8 @@
 {
 }

-static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
+bool make_all_cpus_request(struct kvm *kvm, unsigned int req,
+			   bool (*vcpukick)(struct kvm_vcpu *))
 {
	int i, cpu, me;
	cpumask_var_t cpus;
@@ -163,6 +164,8 @@
	me = get_cpu();
	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (vcpukick && !vcpukick(vcpu))
+			continue;
		kvm_make_request(req, vcpu);

		cpu = vcpu->cpu;
@@ -189,7 +192,7 @@
	long dirty_count = kvm->tlbs_dirty;

	smp_mb();
-	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH, NULL))
		++kvm->stat.remote_tlb_flush;
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
@@ -197,17 +200,22 @@

 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
-	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD, NULL);
+}
+
+void kvm_reload_pinned_remote_mmus(struct kvm *kvm)
+{
+	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD, NULL);
 }

 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
 {
-	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS, NULL);
 }

 void kvm_make_scan_ioapic_request(struct kvm *kvm)
 {
-	make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
+	make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC, NULL);
 }

 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
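
For reference, below is a minimal sketch of how a caller might drive the
interface added above. It is illustrative only and not part of this patch:
the struct kvm_pin_range layout and the kvm_vcpu_ioctl_pin_range() handler
name are assumptions invented here, while kvm_mmu_register_pinned_range()
and kvm_mmu_unregister_pinned_range() are the functions this patch defines.
Note that registration only queues KVM_REQ_MMU_RELOAD; the faulting and
spte pinning happen in kvm_mmu_pin_pages() on the next guest entry.

/*
 * Illustrative sketch only -- hypothetical per-vcpu ioctl plumbing.
 * The real userspace interface would be added by a separate patch.
 */
struct kvm_pin_range {
	__u64 gpa;	/* guest physical base address, page aligned */
	__u64 size;	/* size in bytes, multiple of PAGE_SIZE */
};

static int kvm_vcpu_ioctl_pin_range(struct kvm_vcpu *vcpu,
				    struct kvm_pin_range *range, bool pin)
{
	gfn_t base_gfn = range->gpa >> PAGE_SHIFT;
	unsigned long npages = range->size >> PAGE_SHIFT;

	if (!npages || (range->gpa & ~PAGE_MASK) || (range->size & ~PAGE_MASK))
		return -EINVAL;

	if (pin)
		/* Queues KVM_REQ_MMU_RELOAD; the range is faulted in and
		 * its sptes marked pinned before the next guest entry. */
		return kvm_mmu_register_pinned_range(vcpu, base_gfn, npages);

	/* Kicks pinned vcpus out of guest mode and clears the pinned
	 * bits under mmu_lock. */
	return kvm_mmu_unregister_pinned_range(vcpu, base_gfn, npages);
}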