Re: [PATCH 3.2 087/102] nEPT: Nested INVEPT

Ben Hutchings <ben@xxxxxxxxxxxxxxx> · Mon, 03 Nov 2014 13:44:21 +0000

On Sun, 2014-11-02 at 10:03 +0100, Paolo Bonzini wrote:
> You can just use the same scheme as your patch 88/102:

Why is that?  Why should I not use the upstream version?

Ben.

> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 685b8448d6e2..bd8cc9055fe2 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -6740,6 +6740,12 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> +static int handle_invept(struct kvm_vcpu *vcpu)
> +{
> +	kvm_queue_exception(vcpu, UD_VECTOR);
> +	return 1;
> +}
> +
>  /*
>   * The exit handlers return 1 if the exit was handled fully and guest execution
>   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
> @@ -6785,6 +6791,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
>  	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op,
>  	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
> +	[EXIT_REASON_INVEPT]                  = handle_invept,
>  };
>  
>  static const int kvm_vmx_max_exit_handlers =
> @@ -7020,6 +7027,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
>  	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
>  	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
>  	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
> +	case EXIT_REASON_INVEPT:
>  		/*
>  		 * VMX instructions trap unconditionally. This allows L1 to
>  		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
> 
> 
> Paolo
> 
> On 01/11/2014 23:28, Ben Hutchings wrote:
> > 3.2.64-rc1 review patch.  If anyone has any objections, please let me know.
> > 
> > ------------------
> > 
> > From: Nadav Har'El <nyh@xxxxxxxxxx>
> > 
> > commit bfd0a56b90005f8c8a004baf407ad90045c2b11e upstream.
> > 
> > If we let L1 use EPT, we should probably also support the INVEPT instruction.
> > 
> > In our current nested EPT implementation, when L1 changes its EPT table
> > for L2 (i.e., EPT12), L0 modifies the shadow EPT table (EPT02), and in
> > the course of this modification already calls INVEPT. But if last level
> > of shadow page is unsync not all L1's changes to EPT12 are intercepted,
> > which means roots need to be synced when L1 calls INVEPT. Global INVEPT
> > should not be different since roots are synced by kvm_mmu_load() each
> > time EPTP02 changes.
> > 
> > Reviewed-by: Xiao Guangrong <xiaoguangrong@xxxxxxxxxxxxxxxxxx>
> > Signed-off-by: Nadav Har'El <nyh@xxxxxxxxxx>
> > Signed-off-by: Jun Nakajima <jun.nakajima@xxxxxxxxx>
> > Signed-off-by: Xinhao Xu <xinhao.xu@xxxxxxxxx>
> > Signed-off-by: Yang Zhang <yang.z.zhang@xxxxxxxxx>
> > Signed-off-by: Gleb Natapov <gleb@xxxxxxxxxx>
> > Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> > [bwh: Backported to 3.2:
> >  - Adjust context, filename
> >  - Add definition of nested_ept_get_cr3(), added upstream by commit
> >    155a97a3d7c7 ("nEPT: MMU context for nested EPT")]
> > Signed-off-by: Ben Hutchings <ben@xxxxxxxxxxxxxxx>
> > ---
> > --- a/arch/x86/include/asm/vmx.h
> > +++ b/arch/x86/include/asm/vmx.h
> > @@ -279,6 +279,7 @@ enum vmcs_field {
> >  #define EXIT_REASON_APIC_ACCESS         44
> >  #define EXIT_REASON_EPT_VIOLATION       48
> >  #define EXIT_REASON_EPT_MISCONFIG       49
> > +#define EXIT_REASON_INVEPT              50
> >  #define EXIT_REASON_WBINVD		54
> >  #define EXIT_REASON_XSETBV		55
> >  
> > @@ -397,6 +398,7 @@ enum vmcs_field {
> >  #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR		0
> >  #define VMX_EPT_EXTENT_CONTEXT			1
> >  #define VMX_EPT_EXTENT_GLOBAL			2
> > +#define VMX_EPT_EXTENT_SHIFT			24
> >  
> >  #define VMX_EPT_EXECUTE_ONLY_BIT		(1ull)
> >  #define VMX_EPT_PAGE_WALK_4_BIT			(1ull << 6)
> > @@ -404,6 +406,7 @@ enum vmcs_field {
> >  #define VMX_EPTP_WB_BIT				(1ull << 14)
> >  #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
> >  #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
> > +#define VMX_EPT_INVEPT_BIT			(1ull << 20)
> >  #define VMX_EPT_EXTENT_INDIVIDUAL_BIT		(1ull << 24)
> >  #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
> >  #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -2869,6 +2869,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu
> >  	mmu_sync_roots(vcpu);
> >  	spin_unlock(&vcpu->kvm->mmu_lock);
> >  }
> > +EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
> >  
> >  static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
> >  				  u32 access, struct x86_exception *exception)
> > @@ -3131,6 +3132,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *
> >  	++vcpu->stat.tlb_flush;
> >  	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
> >  }
> > +EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
> >  
> >  static void paging_new_cr3(struct kvm_vcpu *vcpu)
> >  {
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -602,6 +602,7 @@ static void nested_release_page_clean(st
> >  	kvm_release_page_clean(page);
> >  }
> >  
> > +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
> >  static u64 construct_eptp(unsigned long root_hpa);
> >  static void kvm_cpu_vmxon(u64 addr);
> >  static void kvm_cpu_vmxoff(void);
> > @@ -1899,6 +1900,7 @@ static u32 nested_vmx_secondary_ctls_low
> >  static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
> >  static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
> >  static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
> > +static u32 nested_vmx_ept_caps;
> >  static __init void nested_vmx_setup_ctls_msrs(void)
> >  {
> >  	/*
> > @@ -5550,6 +5552,74 @@ static int handle_vmptrst(struct kvm_vcp
> >  	return 1;
> >  }
> >  
> > +/* Emulate the INVEPT instruction */
> > +static int handle_invept(struct kvm_vcpu *vcpu)
> > +{
> > +	u32 vmx_instruction_info, types;
> > +	unsigned long type;
> > +	gva_t gva;
> > +	struct x86_exception e;
> > +	struct {
> > +		u64 eptp, gpa;
> > +	} operand;
> > +	u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
> > +
> > +	if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
> > +	    !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
> > +		kvm_queue_exception(vcpu, UD_VECTOR);
> > +		return 1;
> > +	}
> > +
> > +	if (!nested_vmx_check_permission(vcpu))
> > +		return 1;
> > +
> > +	if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
> > +		kvm_queue_exception(vcpu, UD_VECTOR);
> > +		return 1;
> > +	}
> > +
> > +	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> > +	type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
> > +
> > +	types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
> > +
> > +	if (!(types & (1UL << type))) {
> > +		nested_vmx_failValid(vcpu,
> > +				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
> > +		return 1;
> > +	}
> > +
> > +	/* According to the Intel VMX instruction reference, the memory
> > +	 * operand is read even if it isn't needed (e.g., for type==global)
> > +	 */
> > +	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
> > +			vmx_instruction_info, &gva))
> > +		return 1;
> > +	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
> > +				sizeof(operand), &e)) {
> > +		kvm_inject_page_fault(vcpu, &e);
> > +		return 1;
> > +	}
> > +
> > +	switch (type) {
> > +	case VMX_EPT_EXTENT_CONTEXT:
> > +		if ((operand.eptp & eptp_mask) !=
> > +				(nested_ept_get_cr3(vcpu) & eptp_mask))
> > +			break;
> > +	case VMX_EPT_EXTENT_GLOBAL:
> > +		kvm_mmu_sync_roots(vcpu);
> > +		kvm_mmu_flush_tlb(vcpu);
> > +		nested_vmx_succeed(vcpu);
> > +		break;
> > +	default:
> > +		BUG_ON(1);
> > +		break;
> > +	}
> > +
> > +	skip_emulated_instruction(vcpu);
> > +	return 1;
> > +}
> > +
> >  /*
> >   * The exit handlers return 1 if the exit was handled fully and guest execution
> >   * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
> > @@ -5591,6 +5661,7 @@ static int (*kvm_vmx_exit_handlers[])(st
> >  	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
> >  	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op,
> >  	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
> > +	[EXIT_REASON_INVEPT]                  = handle_invept,
> >  };
> >  
> >  static const int kvm_vmx_max_exit_handlers =
> > @@ -5775,6 +5846,7 @@ static bool nested_vmx_exit_handled(stru
> >  	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
> >  	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
> >  	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
> > +	case EXIT_REASON_INVEPT:
> >  		/*
> >  		 * VMX instructions trap unconditionally. This allows L1 to
> >  		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
> > @@ -6436,6 +6508,12 @@ static void vmx_set_supported_cpuid(u32
> >  		entry->ecx |= bit(X86_FEATURE_VMX);
> >  }
> >  
> > +static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
> > +{
> > +	/* return the page table to be shadowed - in our case, EPT12 */
> > +	return get_vmcs12(vcpu)->ept_pointer;
> > +}
> > +
> >  /*
> >   * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
> >   * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
> > 

-- 
Ben Hutchings
Power corrupts.  Absolute power is kind of neat.
                           - John Lehman, Secretary of the US Navy 1981-1987
Attachment:
signature.asc

Description: This is a digitally signed message part