[RFC PATCH part-6 08/13] pkvm: x86: Add support for pKVM to handle the nested EPT violation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Chuanxiao Dong <chuanxiao.dong@xxxxxxxxx>

Shadow EPT is used as the EPTP for nested VMs and it is initialized with
empty entries. So when an EPT violation happens, pKVM shall shadow the EPT
page table entry from the guest vEPT under well-designed rules.

pKVM walks the guest vEPT to find out whether the L2 GPA already has a
proper mapping in the vEPT. If it already has one, pKVM maps it in the
shadow EPT - for now it simply duplicates the vEPT mapping; in the future,
pKVM shall follow additional rules before doing the mapping, e.g. only
after confirming that the page's ownership belongs to this VM.

For the other cases, where the vEPT has no proper mapping, handling is
deferred to the host VM (or, more accurately, to KVM-high):

- for the case where the vEPT entry is not present, it is actually a #PF
  requesting page allocation from the host, so pKVM directly forwards the
  violation back to KVM-high for proper handling.

- and for the case where the vEPT contains a misconfig mapping, such a
  misconfig is set by KVM-high, so KVM-high is expected to handle it. pKVM
  modifies the VM_EXIT_REASON field to EPT_MISCONFIG if VMX supports
  read-only field modification, then delivers the exit back to KVM-high.
  Otherwise it delivers EPT_VIOLATION to KVM-high directly, as KVM handles
  EPT_MISCONFIG in its EPT_VIOLATION handler as well.

- another case is when the EPT violation vmexit has a pending event to be
  injected; such an event also requires host handling for further event
  injection, so pKVM shall return to KVM-high with the corresponding
  pending-event vmexit info.

Signed-off-by: Chuanxiao Dong <chuanxiao.dong@xxxxxxxxx>
Signed-off-by: Jason Chen CJ <jason.cj.chen@xxxxxxxxx>
---
 arch/x86/kvm/vmx/pkvm/hyp/ept.c    | 60 ++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/pkvm/hyp/ept.h    |  8 ++++
 arch/x86/kvm/vmx/pkvm/hyp/nested.c | 40 ++++++++++++++++++++
 arch/x86/kvm/vmx/pkvm/hyp/vmx.h    |  6 +++
 arch/x86/kvm/vmx/pkvm/pkvm_host.c  |  2 +
 5 files changed, 116 insertions(+)

diff --git a/arch/x86/kvm/vmx/pkvm/hyp/ept.c b/arch/x86/kvm/vmx/pkvm/hyp/ept.c
index 823e255de155..65f3a39210db 100644
--- a/arch/x86/kvm/vmx/pkvm/hyp/ept.c
+++ b/arch/x86/kvm/vmx/pkvm/hyp/ept.c
@@ -377,3 +377,63 @@ void pkvm_guest_ept_init(struct shadow_vcpu_state *shadow_vcpu, u64 guest_eptp)
 	pkvm_pgtable_init(&shadow_vcpu->vept, &virtual_ept_mm_ops, &ept_ops, &cap, false);
 	shadow_vcpu->vept.root_pa = host_gpa2hpa(guest_eptp & SPTE_BASE_ADDR_MASK);
 }
+
+static bool is_access_violation(u64 ept_entry, u64 exit_qual)
+{
+	bool access_violation = false;
+
+	if (/* Caused by data read */
+	    (((exit_qual & 0x1UL) != 0UL) && ((ept_entry & VMX_EPT_READABLE_MASK) == 0)) ||
+	    /* Caused by data write */
+	    (((exit_qual & 0x2UL) != 0UL) && ((ept_entry & VMX_EPT_WRITABLE_MASK) == 0)) ||
+	    /* Caused by instruction fetch */
+	    (((exit_qual & 0x4UL) != 0UL) && ((ept_entry & VMX_EPT_EXECUTABLE_MASK) == 0))) {
+		access_violation = true;
+	}
+
+	return access_violation;
+}
+
+enum sept_handle_ret
+pkvm_handle_shadow_ept_violation(struct shadow_vcpu_state *shadow_vcpu, u64 l2_gpa, u64 exit_quali)
+{
+	struct pkvm_shadow_vm *vm = shadow_vcpu->vm;
+	struct shadow_ept_desc *desc = &vm->sept_desc;
+	struct pkvm_pgtable *sept = &desc->sept;
+	struct pkvm_pgtable_ops *pgt_ops = sept->pgt_ops;
+	struct pkvm_pgtable *vept = &shadow_vcpu->vept;
+	enum sept_handle_ret ret = PKVM_NOT_HANDLED;
+	unsigned long phys;
+	int level;
+	u64 gprot, rsvd_chk_gprot;
+
+	pkvm_spin_lock(&vm->lock);
+
+	pkvm_pgtable_lookup(vept, l2_gpa, &phys, &gprot, &level);
+	if (phys == INVALID_ADDR)
+		/* Guest EPT not valid, back to kvm-high */
+		goto out;
+
+	if (is_access_violation(gprot, exit_quali))
+		/* Guest EPT error, refuse to handle in shadow ept */
+		goto out;
+
+	rsvd_chk_gprot = gprot;
+	/* is_rsvd_spte() needs the PAGE_SIZE bit set for huge mappings */
+	if (level != PG_LEVEL_4K)
+		pgt_ops->pgt_entry_mkhuge(&rsvd_chk_gprot);
+
+	if (is_rsvd_spte(&ept_zero_check, rsvd_chk_gprot, level)) {
+		ret = PKVM_INJECT_EPT_MISC;
+	} else {
+		unsigned long level_size = pgt_ops->pgt_level_to_size(level);
+		unsigned long gpa = ALIGN_DOWN(l2_gpa, level_size);
+		unsigned long hpa = ALIGN_DOWN(host_gpa2hpa(phys), level_size);
+
+		if (!pkvm_pgtable_map(sept, gpa, hpa, level_size, 0, gprot))
+			ret = PKVM_HANDLED;
+	}
+out:
+	pkvm_spin_unlock(&vm->lock);
+	return ret;
+}
diff --git a/arch/x86/kvm/vmx/pkvm/hyp/ept.h b/arch/x86/kvm/vmx/pkvm/hyp/ept.h
index 420c9c5816e9..92a4f18535ea 100644
--- a/arch/x86/kvm/vmx/pkvm/hyp/ept.h
+++ b/arch/x86/kvm/vmx/pkvm/hyp/ept.h
@@ -12,6 +12,12 @@
 #define HOST_EPT_DEF_MMIO_PROT	(VMX_EPT_RWX_MASK |				\
 				(MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT))
 
+enum sept_handle_ret {
+	PKVM_NOT_HANDLED,
+	PKVM_HANDLED,
+	PKVM_INJECT_EPT_MISC,
+};
+
 int pkvm_host_ept_map(unsigned long vaddr_start, unsigned long phys_start,
 		unsigned long size, int pgsz_mask, u64 prot);
 int pkvm_host_ept_unmap(unsigned long vaddr_start, unsigned long phys_start,
@@ -24,6 +30,8 @@ int pkvm_shadow_ept_init(struct shadow_ept_desc *desc);
 void pkvm_shadow_ept_deinit(struct shadow_ept_desc *desc);
 void pkvm_guest_ept_init(struct shadow_vcpu_state *shadow_vcpu, u64 guest_eptp);
 void pkvm_guest_ept_deinit(struct shadow_vcpu_state *shadow_vcpu);
+enum sept_handle_ret
+pkvm_handle_shadow_ept_violation(struct shadow_vcpu_state *shadow_vcpu, u64 l2_gpa, u64 exit_quali);
 
 static inline bool is_valid_eptp(u64 eptp)
 {
diff --git a/arch/x86/kvm/vmx/pkvm/hyp/nested.c b/arch/x86/kvm/vmx/pkvm/hyp/nested.c
index 68eddb459cfa..22c161100145 100644
--- a/arch/x86/kvm/vmx/pkvm/hyp/nested.c
+++ b/arch/x86/kvm/vmx/pkvm/hyp/nested.c
@@ -1007,6 +1007,39 @@ int handle_vmlaunch(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static bool nested_handle_ept_violation(struct shadow_vcpu_state *shadow_vcpu,
+					u64 l2_gpa, u64 exit_quali)
+{
+	enum sept_handle_ret ret = pkvm_handle_shadow_ept_violation(shadow_vcpu,
+								    l2_gpa, exit_quali);
+	bool handled = false;
+
+	switch (ret) {
+	case PKVM_INJECT_EPT_MISC: {
+		/*
+		 * Inject EPT_MISCONFIG vmexit reason if can directly modify
+		 * the read-only fields. Otherwise still deliver EPT_VIOLATION
+		 * for simplification.
+		 */
+		if (vmx_has_vmwrite_any_field())
+			vmcs_write32(VM_EXIT_REASON, EXIT_REASON_EPT_MISCONFIG);
+		break;
+	}
+	case PKVM_HANDLED:
+		handled = true;
+		break;
+	default:
+		break;
+	}
+
+	if (handled && (vmcs_read32(IDT_VECTORING_INFO_FIELD) &
+			VECTORING_INFO_VALID_MASK))
+		/* pending interrupt, back to kvm-high to inject */
+		handled = false;
+
+	return handled;
+}
+
 int nested_vmexit(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1015,6 +1048,13 @@ int nested_vmexit(struct kvm_vcpu *vcpu)
 	struct vmcs *vmcs02 = (struct vmcs *)cur_shadow_vcpu->vmcs02;
 	struct vmcs12 *vmcs12 = (struct vmcs12 *)cur_shadow_vcpu->cached_vmcs12;
 
+	if ((vmx->exit_reason.full == EXIT_REASON_EPT_VIOLATION) &&
+		nested_handle_ept_violation(cur_shadow_vcpu,
+					    vmcs_read64(GUEST_PHYSICAL_ADDRESS),
+					    vmx->exit_qualification))
+		/* EPT violation can be handled by pkvm, no need back to kvm-high */
+		return 0;
+
 	/* clear guest mode if need switch back to host */
 	vcpu->arch.hflags &= ~HF_GUEST_MASK;
 
diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmx.h b/arch/x86/kvm/vmx/pkvm/hyp/vmx.h
index 3282f228964d..463780776666 100644
--- a/arch/x86/kvm/vmx/pkvm/hyp/vmx.h
+++ b/arch/x86/kvm/vmx/pkvm/hyp/vmx.h
@@ -40,6 +40,12 @@ static inline bool vmx_has_ept_execute_only(void)
 	return vmx_ept_capability_check(VMX_EPT_EXECUTE_ONLY_BIT);
 }
 
+static inline bool vmx_has_vmwrite_any_field(void)
+{
+	return !!(pkvm_hyp->vmcs_config.nested.misc_low &
+			MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS);
+}
+
 static inline u64 pkvm_construct_eptp(unsigned long root_hpa, int level)
 {
 	u64 eptp = 0;
diff --git a/arch/x86/kvm/vmx/pkvm/pkvm_host.c b/arch/x86/kvm/vmx/pkvm/pkvm_host.c
index e5eab94f3e5e..498e304cfb94 100644
--- a/arch/x86/kvm/vmx/pkvm/pkvm_host.c
+++ b/arch/x86/kvm/vmx/pkvm/pkvm_host.c
@@ -380,6 +380,8 @@ static __init void pkvm_host_setup_nested_vmx_cap(struct pkvm_hyp *pkvm)
 	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
 		msrs->entry_ctls_low,
 		msrs->entry_ctls_high);
+
+	rdmsr(MSR_IA32_VMX_MISC, msrs->misc_low, msrs->misc_high);
 }
 
 static __init int pkvm_host_check_and_setup_vmx_cap(struct pkvm_hyp *pkvm)
-- 
2.25.1




[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux