Hi Paolo,

Paolo Bonzini <pbonzini@xxxxxxxxxx> writes:

> ----- Original Message -----
>> From: "Bandan Das" <bsd@xxxxxxxxxx>
>> To: kvm@xxxxxxxxxxxxxxx
>> Cc: pbonzini@xxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx
>> Sent: Friday, June 30, 2017 1:29:55 AM
>> Subject: [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor
>>
>> This is a mix of emulation/passthrough to implement EPTP
>> switching for the nested hypervisor.
>>
>> If the shadow EPT are absent, a vmexit occurs with reason 59.
>> L0 can then create shadow structures based on the entry that the
>> guest calls with to obtain a new root_hpa that can be written to
>> the shadow list and subsequently, reload the mmu to resume L2.
>> On the next vmfunc(0, index) however, the processor will load the
>> entry without an exit.
>
> What happens if the root_hpa is dropped by the L0 MMU?  I'm not sure
> what removes it from the shadow EPT list.

That would result in a vmfunc vmexit, which jumps to handle_vmfunc() and
then calls mmu_alloc_shadow_roots(), which overwrites the shadow eptp
entry with the new one. I believe part of your question is also about
the case where root_hpa is still valid but no longer tracks the current
L1 eptp; the processor could then jump to some other guest's page
tables, which is a problem. For that case, I think it should be
possible to invalidate the corresponding entry in the shadow eptp list.

> For a first version of the patch, I would prefer a less optimized
> version that always goes through L0 when L2 executes VMFUNC.
> To achieve this effect, you can copy "enable VM functions" secondary
> execution control from vmcs12 to vmcs02, but leave the VM-function
> controls to 0 in vmcs02.

Is the current approach prone to other undesired corner cases like the
one you pointed out above? :) I would be uncomfortable having this
merged if you think letting the CPU switch to a new eptp without an
exit is an exploitable interface; on the other hand, it would be nice
to let L2 execute vmfunc without exiting to L0. To make sure I follow
your suggestion for a first version, I have sketched below what I
understand it to look like.

Thanks for the quick review!
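Something along these lines in prepare_vmcs02() (untested sketch;
nested_vmx_setup_vmfunc() is just a made-up helper for illustration, and
the VM_FUNCTION_CONTROL field name is my assumption, it isn't defined by
this patch)?

static void nested_vmx_setup_vmfunc(struct vcpu_vmx *vmx,
				    struct vmcs12 *vmcs12,
				    u32 *exec_control)
{
	if (!nested_cpu_has_vmfunc(vmcs12))
		return;

	/*
	 * Advertise "enable VM functions" in vmcs02 so that VMFUNC does
	 * not #UD in L2 ...
	 */
	*exec_control |= SECONDARY_EXEC_ENABLE_VMFUNC;

	/*
	 * ... but enable no functions at all, so every VMFUNC executed
	 * by L2 causes a VM exit to L0, where handle_vmfunc() emulates
	 * the EPTP switch on behalf of L1.
	 */
	vmcs_write64(VM_FUNCTION_CONTROL, 0);
}

If that matches what you have in mind, the non-exiting pass-through path
could come later as a follow-up once the corner cases are sorted out.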
> Paolo
>
>> Signed-off-by: Bandan Das <bsd@xxxxxxxxxx>
>> ---
>>  arch/x86/include/asm/vmx.h |   5 +++
>>  arch/x86/kvm/vmx.c         | 104 +++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 109 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index 35cd06f..e06783e 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -72,6 +72,7 @@
>>  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING       0x00000400
>>  #define SECONDARY_EXEC_RDRAND                   0x00000800
>>  #define SECONDARY_EXEC_ENABLE_INVPCID           0x00001000
>> +#define SECONDARY_EXEC_ENABLE_VMFUNC            0x00002000
>>  #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
>>  #define SECONDARY_EXEC_RDSEED                   0x00010000
>>  #define SECONDARY_EXEC_ENABLE_PML               0x00020000
>> @@ -114,6 +115,10 @@
>>  #define VMX_MISC_SAVE_EFER_LMA                  0x00000020
>>  #define VMX_MISC_ACTIVITY_HLT                   0x00000040
>>
>> +/* VMFUNC functions */
>> +#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
>> +#define VMFUNC_EPTP_ENTRIES                     512
>> +
>>  static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
>>  {
>>  	return vmx_basic & GENMASK_ULL(30, 0);
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index ca5d2b9..75049c0 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -240,11 +240,13 @@ struct __packed vmcs12 {
>>  	u64 virtual_apic_page_addr;
>>  	u64 apic_access_addr;
>>  	u64 posted_intr_desc_addr;
>> +	u64 vm_function_control;
>>  	u64 ept_pointer;
>>  	u64 eoi_exit_bitmap0;
>>  	u64 eoi_exit_bitmap1;
>>  	u64 eoi_exit_bitmap2;
>>  	u64 eoi_exit_bitmap3;
>> +	u64 eptp_list_address;
>>  	u64 xss_exit_bitmap;
>>  	u64 guest_physical_address;
>>  	u64 vmcs_link_pointer;
>> @@ -441,6 +443,7 @@ struct nested_vmx {
>>  	struct page *apic_access_page;
>>  	struct page *virtual_apic_page;
>>  	struct page *pi_desc_page;
>> +	struct page *shadow_eptp_list;
>>  	struct pi_desc *pi_desc;
>>  	bool pi_pending;
>>  	u16 posted_intr_nv;
>> @@ -481,6 +484,7 @@ struct nested_vmx {
>>  	u64 nested_vmx_cr4_fixed0;
>>  	u64 nested_vmx_cr4_fixed1;
>>  	u64 nested_vmx_vmcs_enum;
>> +	u64 nested_vmx_vmfunc_controls;
>>  };
>>
>>  #define POSTED_INTR_ON  0
>> @@ -1314,6 +1318,22 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
>>  		SECONDARY_EXEC_TSC_SCALING;
>>  }
>>
>> +static inline bool cpu_has_vmx_vmfunc(void)
>> +{
>> +	return vmcs_config.cpu_based_exec_ctrl &
>> +		SECONDARY_EXEC_ENABLE_VMFUNC;
>> +}
>> +
>> +static inline u64 vmx_vmfunc_controls(void)
>> +{
>> +	u64 controls = 0;
>> +
>> +	if (cpu_has_vmx_vmfunc())
>> +		rdmsrl(MSR_IA32_VMX_VMFUNC, controls);
>> +
>> +	return controls;
>> +}
>> +
>>  static inline bool report_flexpriority(void)
>>  {
>>  	return flexpriority_enabled;
>> @@ -1388,6 +1408,18 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
>>  	return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
>>  }
>>
>> +static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
>> +{
>> +	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
>> +}
>> +
>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>> +{
>> +	return nested_cpu_has_vmfunc(vmcs12) &&
>> +		(vmcs12->vm_function_control &
>> +		 VMX_VMFUNC_EPTP_SWITCHING);
>> +}
>> +
>>  static inline bool is_nmi(u32 intr_info)
>>  {
>>  	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>> @@ -3143,6 +3175,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
>>  		*pdata = vmx->nested.nested_vmx_ept_caps |
>>  			((u64)vmx->nested.nested_vmx_vpid_caps << 32);
>>  		break;
>> +	case MSR_IA32_VMX_VMFUNC:
>> +		*pdata = vmx->nested.nested_vmx_vmfunc_controls;
>> +		break;
>>  	default:
>>  		return 1;
>>  	}
>> @@ -6959,6 +6994,14 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
>>  		vmx->vmcs01.shadow_vmcs = shadow_vmcs;
>>  	}
>>
>> +	if (vmx_vmfunc_controls() & 1) {
>> +		struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
>> +
>> +		if (!page)
>> +			goto out_shadow_vmcs;
>> +		vmx->nested.shadow_eptp_list = page;
>> +	}
>> +
>>  	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
>>  	vmx->nested.vmcs02_num = 0;
>>
>> @@ -7128,6 +7171,11 @@ static void free_nested(struct vcpu_vmx *vmx)
>>  		vmx->vmcs01.shadow_vmcs = NULL;
>>  	}
>>  	kfree(vmx->nested.cached_vmcs12);
>> +
>> +	if (vmx->nested.shadow_eptp_list) {
>> +		__free_page(vmx->nested.shadow_eptp_list);
>> +		vmx->nested.shadow_eptp_list = NULL;
>> +	}
>>  	/* Unpin physical memory we referred to in current vmcs02 */
>>  	if (vmx->nested.apic_access_page) {
>>  		nested_release_page(vmx->nested.apic_access_page);
>> @@ -7740,6 +7788,61 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
>>  	return 1;
>>  }
>>
>> +static int handle_vmfunc(struct kvm_vcpu *vcpu)
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +	struct vmcs12 *vmcs12;
>> +	struct page *page = NULL,
>> +		*shadow_page = vmx->nested.shadow_eptp_list;
>> +	u64 *l1_eptp_list, *shadow_eptp_list;
>> +	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>> +	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
>> +
>> +	if (is_guest_mode(vcpu)) {
>> +		vmcs12 = get_vmcs12(vcpu);
>> +		if (!nested_cpu_has_ept(vmcs12) ||
>> +		    !nested_cpu_has_eptp_switching(vmcs12))
>> +			goto fail;
>> +
>> +		/*
>> +		 * Only function 0 is valid, everything upto 63 injects VMFUNC
>> +		 * exit reason to L1, and #UD thereafter
>> +		 */
>> +		if (function || !vmcs12->eptp_list_address ||
>> +		    index >= VMFUNC_EPTP_ENTRIES)
>> +			goto fail;
>> +
>> +		page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>> +		if (!page)
>> +			goto fail;
>> +
>> +		l1_eptp_list = kmap(page);
>> +		if (!l1_eptp_list[index])
>> +			goto fail;
>> +
>> +		kvm_mmu_unload(vcpu);
>> +		/*
>> +		 * TODO: Verify that guest ept satisfies vmentry prereqs
>> +		 */
>> +		vmcs12->ept_pointer = l1_eptp_list[index];
>> +		shadow_eptp_list = phys_to_virt(page_to_phys(shadow_page));
>> +		kvm_mmu_reload(vcpu);
>> +		shadow_eptp_list[index] =
>> +			construct_eptp(vcpu->arch.mmu.root_hpa);
>> +		kunmap(page);
>> +
>> +		return kvm_skip_emulated_instruction(vcpu);
>> +	}
>> +
>> +fail:
>> +	if (page)
>> +		kunmap(page);
>> +	nested_vmx_vmexit(vcpu, vmx->exit_reason,
>> +			  vmcs_read32(VM_EXIT_INTR_INFO),
>> +			  vmcs_readl(EXIT_QUALIFICATION));
>> +	return 1;
>> +}
>> +
>>  /*
>>   * The exit handlers return 1 if the exit was handled fully and guest execution
>>   * may resume. Otherwise they set the kvm_run parameter to indicate what needs
>> @@ -7790,6 +7893,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>>  	[EXIT_REASON_XSAVES]                  = handle_xsaves,
>>  	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
>>  	[EXIT_REASON_PML_FULL]                = handle_pml_full,
>> +	[EXIT_REASON_VMFUNC]                  = handle_vmfunc,
>>  	[EXIT_REASON_PREEMPTION_TIMER]        = handle_preemption_timer,
>>  };
>>
>> --
>> 2.9.4
>>
>>