Re: [patch 2/4] KVM: MMU: allow pinning spte translations (TDP-only)

Hi Marcelo,

On Jul 10, 2014, at 3:12 AM, mtosatti@xxxxxxxxxx wrote:

> struct kvm_vcpu_arch {
> 	/*
> 	 * rip and regs accesses must go through
> @@ -392,6 +402,9 @@
> 	struct kvm_mmu_memory_cache mmu_page_cache;
> 	struct kvm_mmu_memory_cache mmu_page_header_cache;
> 
> +	struct list_head pinned_mmu_pages;
> +	atomic_t nr_pinned_ranges;
> +

I'm not sure a per-vcpu pinned list is a good idea: since all vcpus currently use the same
page table, per-vcpu lists cannot reduce lock contention. Why not make the list global to
the VM instead?
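
Roughly what I have in mind, as a sketch only (the placement in struct kvm_arch, the
mmu_lock protection and the helper names are my assumptions, not something from the
patch):

	/* Sketch: make the pinned-range bookkeeping VM-global instead of
	 * per-vcpu; field and helper names below are hypothetical.
	 */
	struct kvm_arch {
		/* ... existing fields ... */
		struct list_head pinned_mmu_pages;	/* protected by mmu_lock */
		atomic_t nr_pinned_ranges;
	};

	static void mmu_reload_pinned_vcpus(struct kvm *kvm)
	{
		/* with a VM-global counter the per-vcpu predicate passed to
		 * make_all_cpus_request() is no longer needed */
		if (atomic_read(&kvm->arch.nr_pinned_ranges))
			kvm_reload_remote_mmus(kvm);
	}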

> 	struct fpu guest_fpu;
> 	u64 xcr0;
> 	u64 guest_supported_xcr0;
> Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 12:05:34.837161264 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 12:09:21.856684314 -0300
> @@ -148,6 +148,9 @@
> 
> #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> +#define SPTE_PINNED		(1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT))
> +
> +#define SPTE_PINNED_BIT PT64_SECOND_AVAIL_BITS_SHIFT
> 
> #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
> 
> @@ -327,6 +330,11 @@
> 	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
> }
> 
> +static int is_pinned_spte(u64 spte)
> +{
> +	return spte & SPTE_PINNED && is_shadow_present_pte(spte);
> +}
> +
> static int is_large_pte(u64 pte)
> {
> 	return pte & PT_PAGE_SIZE_MASK;
> @@ -1176,6 +1184,16 @@
> 		kvm_flush_remote_tlbs(vcpu->kvm);
> }
> 
> +static bool vcpu_has_pinned(struct kvm_vcpu *vcpu)
> +{
> +	return atomic_read(&vcpu->arch.nr_pinned_ranges);
> +}
> +
> +static void mmu_reload_pinned_vcpus(struct kvm *kvm)
> +{
> +	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD, &vcpu_has_pinned);
> +}
> +
> /*
> * Write-protect on the specified @sptep, @pt_protect indicates whether
> * spte write-protection is caused by protecting shadow page table.
> @@ -1268,7 +1286,8 @@
> }
> 
> static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
> -			   struct kvm_memory_slot *slot, unsigned long data)
> +			   struct kvm_memory_slot *slot, unsigned long data,
> +			   bool age)
> {
> 	u64 *sptep;
> 	struct rmap_iterator iter;
> @@ -1278,6 +1297,14 @@
> 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
> 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
> 
> +		if (is_pinned_spte(*sptep)) {
> +			/* don't nuke pinned sptes if page aging: return
> + 			 * young=yes instead.
> + 			 */
> +			if (age)
> +				return 1;
> +			mmu_reload_pinned_vcpus(kvm);
> +		}
> 		drop_spte(kvm, sptep);

This leaves a window between zapping the spte and re-pinning it, so a guest access to
the pinned range can fail during that time.
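
To spell out the window I mean (this is just the ordering as I read kvm_unmap_rmapp(),
not code from the patch):

	/*
	 *  mmu notifier (kvm_unmap_rmapp)         guest vcpu
	 *  ------------------------------         ----------
	 *  is_pinned_spte(*sptep) == true
	 *  mmu_reload_pinned_vcpus(kvm)
	 *    -> KVM_REQ_MMU_RELOAD queued
	 *  drop_spte(kvm, sptep)
	 *                                         access to the pinned gfn finds
	 *                                         a non-present spte and fails;
	 *                                         re-pinning only happens on the
	 *                                         next vcpu entry
	 */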

> 		need_tlb_flush = 1;
> 	}
> @@ -1286,7 +1313,8 @@
> }
> 
> static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
> -			     struct kvm_memory_slot *slot, unsigned long data)
> +			     struct kvm_memory_slot *slot, unsigned long data,
> +			     bool age)
> {
> 	u64 *sptep;
> 	struct rmap_iterator iter;
> @@ -1304,6 +1332,9 @@
> 
> 		need_flush = 1;
> 
> +		if (is_pinned_spte(*sptep))
> +			mmu_reload_pinned_vcpus(kvm);
> +
> 		if (pte_write(*ptep)) {
> 			drop_spte(kvm, sptep);
> 			sptep = rmap_get_first(*rmapp, &iter);
> @@ -1334,7 +1365,8 @@
> 				int (*handler)(struct kvm *kvm,
> 					       unsigned long *rmapp,
> 					       struct kvm_memory_slot *slot,
> -					       unsigned long data))
> +					       unsigned long data,
> +					       bool age))
> {
> 	int j;
> 	int ret = 0;
> @@ -1374,7 +1406,7 @@
> 			rmapp = __gfn_to_rmap(gfn_start, j, memslot);
> 
> 			for (; idx <= idx_end; ++idx)
> -				ret |= handler(kvm, rmapp++, memslot, data);
> +				ret |= handler(kvm, rmapp++, memslot, data, false);
> 		}
> 	}
> 
> @@ -1385,7 +1417,8 @@
> 			  unsigned long data,
> 			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
> 					 struct kvm_memory_slot *slot,
> -					 unsigned long data))
> +					 unsigned long data,
> +					 bool age))
> {
> 	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
> }
> @@ -1406,7 +1439,8 @@
> }
> 
> static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
> -			 struct kvm_memory_slot *slot, unsigned long data)
> +			 struct kvm_memory_slot *slot, unsigned long data,
> +			 bool age)
> {
> 	u64 *sptep;
> 	struct rmap_iterator uninitialized_var(iter);
> @@ -1421,7 +1455,7 @@
> 	 * out actively used pages or breaking up actively used hugepages.
> 	 */
> 	if (!shadow_accessed_mask) {
> -		young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
> +		young = kvm_unmap_rmapp(kvm, rmapp, slot, data, true);
> 		goto out;
> 	}
> 
> @@ -1442,7 +1476,8 @@
> }
> 
> static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
> -			      struct kvm_memory_slot *slot, unsigned long data)
> +			      struct kvm_memory_slot *slot, unsigned long data,
> +			      bool age)
> {
> 	u64 *sptep;
> 	struct rmap_iterator iter;
> @@ -1480,7 +1515,7 @@
> 
> 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
> 
> -	kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
> +	kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0, false);
> 	kvm_flush_remote_tlbs(vcpu->kvm);
> }
> 
> @@ -2753,7 +2788,8 @@
> }
> 
> static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
> -				pfn_t pfn, unsigned access, int *ret_val)
> +				pfn_t pfn, unsigned access, int *ret_val,
> +				bool pin)
> {
> 	bool ret = true;
> 
> @@ -2763,8 +2799,14 @@
> 		goto exit;
> 	}
> 
> -	if (unlikely(is_noslot_pfn(pfn)))
> +	if (unlikely(is_noslot_pfn(pfn))) {
> +		/* pinned sptes must point to RAM */
> +		if (unlikely(pin)) {
> +			*ret_val = -EFAULT;
> +			goto exit;
> +		}
> 		vcpu_cache_mmio_info(vcpu, gva, gfn, access);
> +	}
> 
> 	ret = false;
> exit:
> @@ -2818,7 +2860,7 @@
> * - false: let the real page fault path to fix it.
> */
> static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> -			    u32 error_code)
> +			    u32 error_code, bool pin)
> {
> 	struct kvm_shadow_walk_iterator iterator;
> 	struct kvm_mmu_page *sp;
> @@ -2828,6 +2870,9 @@
> 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> 		return false;
> 
> +	if (pin)
> +		return false;
> +
> 	if (!page_fault_can_be_fast(error_code))
> 		return false;
> 
> @@ -2895,9 +2940,71 @@
> }
> 
> static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> -			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
> +			 gva_t gva, pfn_t *pfn, bool write, bool *writable,
> +			 bool pin);
> static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
> 
> +
> +static int get_sptep_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes[4])
> +
> +{
> +	struct kvm_shadow_walk_iterator iterator;
> +	int nr_sptes = 0;
> +
> +	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> +		return nr_sptes;
> +
> +	for_each_shadow_entry(vcpu, addr, iterator) {
> +		sptes[iterator.level-1] = iterator.sptep;
> +		nr_sptes++;
> +		if (!is_shadow_present_pte(*iterator.sptep))
> +			break;
> +	}
> +
> +	return nr_sptes;
> +}
> +
> +static bool __direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, bool pin)
> +{
> +	u64 *sptes[4];
> +	int r, i, level;
> +
> +	r = get_sptep_hierarchy(vcpu, gfn << PAGE_SHIFT, sptes);
> +	if (!r)
> +		return false;
> +
> +	level = 5 - r;
> +	if (!is_last_spte(*sptes[level-1], level))
> +		return false;
> +	if (!is_shadow_present_pte(*sptes[level-1]))
> +		return false;
> +
> +	for (i = 0; i < r; i++) {
> +		u64 *sptep = sptes[3-i];
> +		struct kvm_mmu_page *sp = page_header(__pa(sptep));
> +
> +		if (pin) {
> +			sp->pinned = true;
> +			set_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
> +		} else {
> +			sp->pinned = false;
> +			clear_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
> +		}
> +	}
> +
> +	return true;
> +}
> +
> +static bool direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
> +{
> +	return __direct_pin_sptes(vcpu, gfn, true);
> +}
> +
> +static bool direct_unpin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
> +{
> +	return __direct_pin_sptes(vcpu, gfn, false);
> +}
> +

I guess we can reuse the infrastructure that marks/clears unsync sptes to pin/unpin
pages; that would save a lot of code in this patch.
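
Roughly along these lines, as a sketch (mark_page_pinned()/clear_page_pinned() and the
pinned_link member are hypothetical, modelled on the existing __kvm_unsync_page() /
kvm_unlink_unsync_page() pattern; the VM-global list is the one suggested above):

	static void mark_page_pinned(struct kvm *kvm, struct kvm_mmu_page *sp)
	{
		WARN_ON(sp->role.invalid);
		sp->pinned = true;
		list_add(&sp->pinned_link, &kvm->arch.pinned_mmu_pages);
	}

	static void clear_page_pinned(struct kvm *kvm, struct kvm_mmu_page *sp)
	{
		WARN_ON(!sp->pinned);
		sp->pinned = false;
		list_del(&sp->pinned_link);
	}

Then the zap/sync paths could check sp->pinned the same way they already check
sp->unsync, instead of walking the spte hierarchy and setting a bit at every level.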

