Re: [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor

Bandan Das <bsd@xxxxxxxxxx> · Fri, 30 Jun 2017 13:20:51 -0400

Hi Paolo,

Paolo Bonzini <pbonzini@xxxxxxxxxx> writes:

> ----- Original Message -----
>> From: "Bandan Das" <bsd@xxxxxxxxxx>
>> To: kvm@xxxxxxxxxxxxxxx
>> Cc: pbonzini@xxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx
>> Sent: Friday, June 30, 2017 1:29:55 AM
>> Subject: [PATCH 1/2] KVM: nVMX: Implement EPTP switching for the L1 hypervisor
>> 
>> This is a mix of emulation/passthrough to implement EPTP
>> switching for the nested hypervisor.
>> 
>> If the shadow EPT are absent, a vmexit occurs with reason 59.
>> L0 can then create shadow structures based on the entry that the
>> guest calls with to obtain a new root_hpa that can be written to
>> the shadow list and subsequently, reload the mmu to resume L2.
>> On the next vmfunc(0, index) however, the processor will load the
>> entry without an exit.
>
> What happens if the root_hpa is dropped by the L0 MMU?  I'm not sure
> what removes it from the shadow EPT list.

That would result in a vmfunc vmexit, which will jump to handle_vmfunc
and then a call to mmu_alloc_shadow_roots() that will overwrite the shadow
eptp entry with the new one.

I believe part of your question is also that root_hpa is valid, just not
tracking the current l1 eptp and in that case, the processor can jump to
some other guest's page tables which is a problem. In that case, I think it should
be possible to invalidate that entry in the shadow eptp list.

> For a first version of the patch, I would prefer a less optimized
> version that always goes through L0 when L2 executes VMFUNC.
> To achieve this effect, you can copy "enable VM functions" secondary
> execution control from vmcs12 to vmcs02, but leave the VM-function
> controls to 0 in vmcs02.

Is the current approach prone to other undesired corner cases like the one
you pointed above ? :) I would be uncomfortable having this in if you feel
having the cpu jump to a new eptp feels like an interesting exploitable
interface; however, I think it would be nice to have l2 execute vmfunc without
exiting to L0. 

Thanks for the quick review!

> Paolo
>
>> Signed-off-by: Bandan Das <bsd@xxxxxxxxxx>
>> ---
>>  arch/x86/include/asm/vmx.h |   5 +++
>>  arch/x86/kvm/vmx.c         | 104
>>  +++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 109 insertions(+)
>> 
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index 35cd06f..e06783e 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -72,6 +72,7 @@
>>  #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
>>  #define SECONDARY_EXEC_RDRAND			0x00000800
>>  #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
>> +#define SECONDARY_EXEC_ENABLE_VMFUNC            0x00002000
>>  #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
>>  #define SECONDARY_EXEC_RDSEED			0x00010000
>>  #define SECONDARY_EXEC_ENABLE_PML               0x00020000
>> @@ -114,6 +115,10 @@
>>  #define VMX_MISC_SAVE_EFER_LMA			0x00000020
>>  #define VMX_MISC_ACTIVITY_HLT			0x00000040
>>  
>> +/* VMFUNC functions */
>> +#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
>> +#define VMFUNC_EPTP_ENTRIES  512
>> +
>>  static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
>>  {
>>  	return vmx_basic & GENMASK_ULL(30, 0);
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index ca5d2b9..75049c0 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -240,11 +240,13 @@ struct __packed vmcs12 {
>>  	u64 virtual_apic_page_addr;
>>  	u64 apic_access_addr;
>>  	u64 posted_intr_desc_addr;
>> +	u64 vm_function_control;
>>  	u64 ept_pointer;
>>  	u64 eoi_exit_bitmap0;
>>  	u64 eoi_exit_bitmap1;
>>  	u64 eoi_exit_bitmap2;
>>  	u64 eoi_exit_bitmap3;
>> +	u64 eptp_list_address;
>>  	u64 xss_exit_bitmap;
>>  	u64 guest_physical_address;
>>  	u64 vmcs_link_pointer;
>> @@ -441,6 +443,7 @@ struct nested_vmx {
>>  	struct page *apic_access_page;
>>  	struct page *virtual_apic_page;
>>  	struct page *pi_desc_page;
>> +	struct page *shadow_eptp_list;
>>  	struct pi_desc *pi_desc;
>>  	bool pi_pending;
>>  	u16 posted_intr_nv;
>> @@ -481,6 +484,7 @@ struct nested_vmx {
>>  	u64 nested_vmx_cr4_fixed0;
>>  	u64 nested_vmx_cr4_fixed1;
>>  	u64 nested_vmx_vmcs_enum;
>> +	u64 nested_vmx_vmfunc_controls;
>>  };
>>  
>>  #define POSTED_INTR_ON  0
>> @@ -1314,6 +1318,22 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
>>  		SECONDARY_EXEC_TSC_SCALING;
>>  }
>>  
>> +static inline bool cpu_has_vmx_vmfunc(void)
>> +{
>> +	return vmcs_config.cpu_based_exec_ctrl &
>> +		SECONDARY_EXEC_ENABLE_VMFUNC;
>> +}
>> +
>> +static inline u64 vmx_vmfunc_controls(void)
>> +{
>> +	u64 controls = 0;
>> +
>> +	if (cpu_has_vmx_vmfunc())
>> +		rdmsrl(MSR_IA32_VMX_VMFUNC, controls);
>> +
>> +	return controls;
>> +}
>> +
>>  static inline bool report_flexpriority(void)
>>  {
>>  	return flexpriority_enabled;
>> @@ -1388,6 +1408,18 @@ static inline bool nested_cpu_has_posted_intr(struct
>> vmcs12 *vmcs12)
>>  	return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
>>  }
>>  
>> +static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
>> +{
>> +	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
>> +}
>> +
>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>> +{
>> +	return nested_cpu_has_vmfunc(vmcs12) &&
>> +		(vmcs12->vm_function_control &
>> +		 VMX_VMFUNC_EPTP_SWITCHING);
>> +}
>> +
>>  static inline bool is_nmi(u32 intr_info)
>>  {
>>  	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>> @@ -3143,6 +3175,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32
>> msr_index, u64 *pdata)
>>  		*pdata = vmx->nested.nested_vmx_ept_caps |
>>  			((u64)vmx->nested.nested_vmx_vpid_caps << 32);
>>  		break;
>> +	case MSR_IA32_VMX_VMFUNC:
>> +		*pdata = vmx->nested.nested_vmx_vmfunc_controls;
>> +		break;
>>  	default:
>>  		return 1;
>>  	}
>> @@ -6959,6 +6994,14 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
>>  		vmx->vmcs01.shadow_vmcs = shadow_vmcs;
>>  	}
>>  
>> +	if (vmx_vmfunc_controls() & 1) {
>> +		struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
>> +
>> +		if (!page)
>> +			goto out_shadow_vmcs;
>> +		vmx->nested.shadow_eptp_list = page;
>> +	}
>> +
>>  	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
>>  	vmx->nested.vmcs02_num = 0;
>>  
>> @@ -7128,6 +7171,11 @@ static void free_nested(struct vcpu_vmx *vmx)
>>  		vmx->vmcs01.shadow_vmcs = NULL;
>>  	}
>>  	kfree(vmx->nested.cached_vmcs12);
>> +
>> +	if (vmx->nested.shadow_eptp_list) {
>> +		__free_page(vmx->nested.shadow_eptp_list);
>> +		vmx->nested.shadow_eptp_list = NULL;
>> +	}
>>  	/* Unpin physical memory we referred to in current vmcs02 */
>>  	if (vmx->nested.apic_access_page) {
>>  		nested_release_page(vmx->nested.apic_access_page);
>> @@ -7740,6 +7788,61 @@ static int handle_preemption_timer(struct kvm_vcpu
>> *vcpu)
>>  	return 1;
>>  }
>>  
>> +static int handle_vmfunc(struct kvm_vcpu *vcpu)
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +	struct vmcs12 *vmcs12;
>> +	struct page *page = NULL,
>> +		*shadow_page = vmx->nested.shadow_eptp_list;
>> +	u64 *l1_eptp_list, *shadow_eptp_list;
>> +	u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>> +	u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
>> +
>> +	if (is_guest_mode(vcpu)) {
>> +		vmcs12 = get_vmcs12(vcpu);
>> +		if (!nested_cpu_has_ept(vmcs12) ||
>> +		    !nested_cpu_has_eptp_switching(vmcs12))
>> +			goto fail;
>> +
>> +		/*
>> +		 * Only function 0 is valid, everything upto 63 injects VMFUNC
>> +		 * exit reason to L1, and #UD thereafter
>> +		 */
>> +		if (function || !vmcs12->eptp_list_address ||
>> +		    index >= VMFUNC_EPTP_ENTRIES)
>> +			goto fail;
>> +
>> +		page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>> +		if (!page)
>> +			goto fail;
>> +
>> +		l1_eptp_list = kmap(page);
>> +		if (!l1_eptp_list[index])
>> +			goto fail;
>> +
>> +		kvm_mmu_unload(vcpu);
>> +		/*
>> +		 * TODO: Verify that guest ept satisfies vmentry prereqs
>> +		 */
>> +		vmcs12->ept_pointer = l1_eptp_list[index];
>> +		shadow_eptp_list = phys_to_virt(page_to_phys(shadow_page));
>> +		kvm_mmu_reload(vcpu);
>> +		shadow_eptp_list[index] =
>> +			construct_eptp(vcpu->arch.mmu.root_hpa);
>> +		kunmap(page);
>> +
>> +		return kvm_skip_emulated_instruction(vcpu);
>> +	}
>> +
>> +fail:
>> +	if (page)
>> +		kunmap(page);
>> +	nested_vmx_vmexit(vcpu, vmx->exit_reason,
>> +			  vmcs_read32(VM_EXIT_INTR_INFO),
>> +			  vmcs_readl(EXIT_QUALIFICATION));
>> +	return 1;
>> +}
>> +
>>  /*
>>   * The exit handlers return 1 if the exit was handled fully and guest
>>   execution
>>   * may resume.  Otherwise they set the kvm_run parameter to indicate what
>>   needs
>> @@ -7790,6 +7893,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct
>> kvm_vcpu *vcpu) = {
>>  	[EXIT_REASON_XSAVES]                  = handle_xsaves,
>>  	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
>>  	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
>> +	[EXIT_REASON_VMFUNC]                  = handle_vmfunc,
>>  	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
>>  };
>>  
>> --
>> 2.9.4
>> 
>>