On Tue, 2021-11-16 at 16:49 +0100, Paolo Bonzini wrote:
> On 11/16/21 16:09, David Woodhouse wrote:
> > On Tue, 2021-11-16 at 15:57 +0100, Paolo Bonzini wrote:
> > > This should not be needed, should it? As long as the gfn-to-pfn
> > > cache's vcpu field is handled properly, the request will just cause
> > > the vCPU not to enter.
> >
> > If the MMU mappings never change, the request never happens. But the
> > memslots *can* change, so it does need to be revalidated each time
> > through I think?
>
> That needs to be done on KVM_SET_USER_MEMORY_REGION, using the same
> request (or even the same list walking code) as the MMU notifiers.

Hm.... kvm_arch_memslots_updated() is already kicking every vCPU after
the update, and although that was asynchronous it was actually OK
because unlike in the MMU notifier case, that page wasn't actually
going away -- and if that HVA *did* subsequently go away, our HVA-based
notifier check would still catch that and kill it synchronously.

But yes, we *could* come up with a wakeup mechanism which does it that
way.

> > > It would have to take the gpc->lock around
> > > changes to gpc->vcpu though (meaning: it's probably best to add a
> > > function gfn_to_pfn_cache_set_vcpu).
> >
> > Hm, in my head that was never going to *change* for a given gpc; it
> > *belongs* to that vCPU for ever (and was even part of vmx->nested. for
> > that vCPU, to replace e.g. vmx->nested.pi_desc_map).
>
> Ah okay, I thought it would be set in nested vmentry and cleared in
> nested vmexit.

I don't think it needs to be proactively cleared; we just don't
*refresh* it until we need it again.

> > +static void nested_vmx_check_guest_maps(struct kvm_vcpu *vcpu)
> > +{
> > +        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> > +        struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +        struct gfn_to_pfn_cache *gpc;
> > +
> > +        bool valid;
> > +
> > +        if (nested_cpu_has_posted_intr(vmcs12)) {
> > +                gpc = &vmx->nested.pi_desc_cache;
> > +
> > +                read_lock(&gpc->lock);
> > +                valid = kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc,
> > +                                                   vmcs12->posted_intr_desc_addr,
> > +                                                   PAGE_SIZE);
> > +                read_unlock(&gpc->lock);
> > +                if (!valid) {
> > +                        /* XX: This isn't idempotent. Make it so, or use a different
> > +                         * req for the 'refresh'. */
> > +                        kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
> > +                        return;
> > +                }
> > +        }
> > +}
> That's really slow to do on every vmentry.

It probably doesn't have to be. Ultimately, all it *has* to check is
that kvm->memslots->generation hasn't changed since the caches were
valid. That can surely be made fast enough? If we *know* the GPA and
size haven't changed, and if we know that gpc->valid becoming false
would have been handled differently, then we could optimise that whole
thing away quite effectively to a single check on ->generations?

> > So nested_get_vmcs12_pages() certainly isn't idempotent right now
> > because of all the kvm_vcpu_map() calls, which would just end up
> > leaking -- but I suppose the point is to kill all those, and then maybe
> > it will be?
>
> Yes, exactly. That might be a larger than normal patch, but it should
> not be one too hard to review. Once there's something that works, we
> can think of how to split (if it's worth it).

This one actually compiles. Not sure we have any test cases that will
actually exercise it though, do we?
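(To illustrate the ->generations point above: the fast path I have in
mind is roughly the sketch below. It's only a sketch, not part of the
patch -- the helper name is made up, the locking is ignored for
brevity, and it assumes the gfn_to_pfn_cache grows a 'generation'
field recording kvm->memslots->generation at the last successful
refresh; no such field exists yet.)

static bool gpc_fast_check(struct kvm_vcpu *vcpu,
                           struct gfn_to_pfn_cache *gpc, gpa_t gpa)
{
        /* L1 moved the page (different GPA in vmcs12): take the slow path. */
        if (gpc->gpa != gpa)
                return false;

        /* Invalidated by the MMU notifier: take the slow path. */
        if (!gpc->valid)
                return false;

        /*
         * No memslot update since the last refresh, so the gpa->pfn
         * translation cannot have changed behind our back.
         * (gpc->generation is hypothetical here.)
         */
        return gpc->generation == kvm_memslots(vcpu->kvm)->generation;
}

With something like that, the check on every vmentry only falls back to
the full kvm_gfn_to_pfn_cache_check() / KVM_REQ_GET_NESTED_STATE_PAGES
dance when one of those conditions actually fails.

Anyway, the patch: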
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 465455334c0c..9f279d08e570 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1510,6 +1510,7 @@ struct kvm_x86_nested_ops {
 	int (*enable_evmcs)(struct kvm_vcpu *vcpu,
 			    uint16_t *vmcs_version);
 	uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
+	void (*check_guest_maps)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_x86_init_ops {
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 280f34ea02c3..f67751112633 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -309,7 +309,7 @@ static void free_nested(struct kvm_vcpu *vcpu)
 		kvm_release_page_clean(vmx->nested.apic_access_page);
 		vmx->nested.apic_access_page = NULL;
 	}
-	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
+	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vmx->nested.virtual_apic_cache);
 	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
 	vmx->nested.pi_desc = NULL;
 
@@ -3170,10 +3170,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 	}
 
 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-		map = &vmx->nested.virtual_apic_map;
+		struct gfn_to_pfn_cache *gpc = &vmx->nested.virtual_apic_cache;
 
-		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
-			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
+		if (!kvm_gfn_to_pfn_cache_init(vcpu->kvm, gpc, vcpu, true,
+					       vmcs12->virtual_apic_page_addr,
+					       PAGE_SIZE, true)) {
+			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(gpc->pfn));
 		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
 			   nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
 			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
@@ -3198,6 +3200,9 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 	if (nested_cpu_has_posted_intr(vmcs12)) {
 		map = &vmx->nested.pi_desc_map;
 
+		if (kvm_vcpu_mapped(map))
+			kvm_vcpu_unmap(vcpu, map, true);
+
 		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
 			vmx->nested.pi_desc =
 				(struct pi_desc *)(((void *)map->hva) +
@@ -3242,6 +3247,29 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 	return true;
 }
 
+static void nested_vmx_check_guest_maps(struct kvm_vcpu *vcpu)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct gfn_to_pfn_cache *gpc;
+
+	int valid;
+
+	if (nested_cpu_has_posted_intr(vmcs12)) {
+		gpc = &vmx->nested.virtual_apic_cache;
+
+		read_lock(&gpc->lock);
+		valid = kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc,
+						   vmcs12->virtual_apic_page_addr,
+						   PAGE_SIZE);
+		read_unlock(&gpc->lock);
+		if (!valid) {
+			kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+			return;
+		}
+	}
+}
+
 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
 {
 	struct vmcs12 *vmcs12;
@@ -3737,9 +3765,15 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 
 	max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
 	if (max_irr != 256) {
-		vapic_page = vmx->nested.virtual_apic_map.hva;
-		if (!vapic_page)
+		struct gfn_to_pfn_cache *gpc = &vmx->nested.virtual_apic_cache;
+
+		read_lock(&gpc->lock);
+		if (!kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc, gpc->gpa, PAGE_SIZE)) {
+			read_unlock(&gpc->lock);
 			goto mmio_needed;
+		}
+
+		vapic_page = gpc->khva;
 
 		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
 			vapic_page, &max_irr);
@@ -3749,6 +3783,7 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 			status |= (u8)max_irr;
 			vmcs_write16(GUEST_INTR_STATUS, status);
 		}
+		read_unlock(&gpc->lock);
 	}
 
 	nested_mark_vmcs12_pages_dirty(vcpu);
@@ -4569,7 +4604,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		kvm_release_page_clean(vmx->nested.apic_access_page);
 		vmx->nested.apic_access_page = NULL;
 	}
-	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
+	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vmx->nested.virtual_apic_cache);
 	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
 	vmx->nested.pi_desc = NULL;
 
@@ -6744,4 +6779,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
 	.write_log_dirty = nested_vmx_write_pml_buffer,
 	.enable_evmcs = nested_enable_evmcs,
 	.get_evmcs_version = nested_get_evmcs_version,
+	.check_guest_maps = nested_vmx_check_guest_maps,
 };
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ba66c171d951..6c61faef86d3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3839,19 +3839,23 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	void *vapic_page;
+	struct gfn_to_pfn_cache *gpc = &vmx->nested.virtual_apic_cache;
 	u32 vppr;
 	int rvi;
 
 	if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
 	    !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
-	    WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
+	    WARN_ON_ONCE(gpc->gpa == GPA_INVALID))
 		return false;
 
 	rvi = vmx_get_rvi();
 
-	vapic_page = vmx->nested.virtual_apic_map.hva;
-	vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
+	read_lock(&gpc->lock);
+	if (kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc, gpc->gpa, PAGE_SIZE))
+		vppr = *((u32 *)(gpc->khva + APIC_PROCPRI));
+	else
+		vppr = 0xff;
+	read_unlock(&gpc->lock);
 
 	return ((rvi & 0xf0) > (vppr & 0xf0));
 }
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 4df2ac24ffc1..8364e7fc92a0 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -195,7 +195,7 @@ struct nested_vmx {
 	 * pointers, so we must keep them pinned while L2 runs.
 	 */
 	struct page *apic_access_page;
-	struct kvm_host_map virtual_apic_map;
+	struct gfn_to_pfn_cache virtual_apic_cache;
 	struct kvm_host_map pi_desc_map;
 
 	struct kvm_host_map msr_bitmap_map;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0a689bb62e9e..a879e4d08758 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9735,6 +9735,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
 			static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
+		if (kvm_check_request(KVM_REQ_GPC_INVALIDATE, vcpu))
+			; /* Nothing to do. It just wanted to wake us */
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
@@ -9781,6 +9783,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	local_irq_disable();
 	vcpu->mode = IN_GUEST_MODE;
 
+	/*
+	 * If the guest requires direct access to mapped L1 pages, check
+	 * the caches are valid. Will raise KVM_REQ_GET_NESTED_STATE_PAGES
+	 * to go and revalidate them, if necessary.
+	 */
+	if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->check_guest_maps)
+		kvm_x86_ops.nested_ops->check_guest_maps(vcpu);
+
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
 	/*
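FWIW the pattern that consumers of the cache are expected to follow,
as I understand the locking, is spelled out below. This is a standalone
sketch, not part of the patch, and the helper is made up purely for
illustration:

/*
 * Read a 32-bit field from the cached virtual-APIC page. The pfn
 * cannot be invalidated while the read lock is held, because the
 * invalidation paths take gpc->lock for write first.
 */
static int nested_read_vapic_u32(struct kvm_vcpu *vcpu, unsigned int off,
                                 u32 *val)
{
        struct gfn_to_pfn_cache *gpc = &to_vmx(vcpu)->nested.virtual_apic_cache;
        int ret = 0;

        read_lock(&gpc->lock);
        if (kvm_gfn_to_pfn_cache_check(vcpu->kvm, gpc, gpc->gpa, PAGE_SIZE))
                *val = *(u32 *)(gpc->khva + off);
        else
                ret = -EAGAIN; /* stale; caller punts to KVM_REQ_GET_NESTED_STATE_PAGES */
        read_unlock(&gpc->lock);

        return ret;
}

i.e. gpc->khva is only ever dereferenced under the read lock, and the
refresh never happens in that context; it happens from
vmx_get_nested_state_pages() before the next vmentry, which is why that
path needs to end up idempotent.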