Re: [RFC] Nested VMX support - kernel

On 08/17/2009 04:48 PM, oritw@xxxxxxxxxx wrote:
From: Orit Wasserman <oritw@xxxxxxxxxx>

This patch implements nested VMX support. It enables a guest to use the
VMX APIs in order to run its own nested guest (i.e., it enables
running other hypervisors which use VMX under KVM). The current patch
supports running Linux under a nested KVM. Additional patches for
running Windows under nested KVM, and Linux and Windows under nested
VMware server(!), are currently running in the lab. We are in the
process of forward-porting those patches to -tip.


Very impressive stuff.

The current patch only supports a single nested hypervisor, which can
only run a single guest.  SMP is not supported yet when running a nested
hypervisor (work in progress). Only 64 bit nested hypervisors are
supported. Currently only EPT mode in both host and nested hypervisor
is supported (i.e., both hypervisors must use EPT).


Can you explain what is missing wrt SMP and multiguest support?

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 33901be..fad3577 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -389,7 +389,8 @@ struct kvm_arch{
  	unsigned int n_free_mmu_pages;
  	unsigned int n_requested_mmu_pages;
  	unsigned int n_alloc_mmu_pages;
-	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+	struct hlist_head _mmu_page_hash[KVM_NUM_MMU_PAGES];
+	struct hlist_head *mmu_page_hash;

This leads to exploding memory use when the guest has multiple eptp contexts. You should put all shadow pages in the existing mmu_page_hash, and tag them with the eptp pointer.
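Roughly along these lines -- one hash, with the nested eptp made part of what identifies a page (the nested_eptp field and the helper below are only illustrative, not existing code):

	static struct kvm_mmu_page *nested_mmu_lookup_page(struct kvm *kvm,
							   gfn_t gfn, u64 eptp)
	{
		unsigned index = kvm_page_table_hashfn(gfn);
		struct hlist_head *bucket = &kvm->arch.mmu_page_hash[index];
		struct kvm_mmu_page *sp;
		struct hlist_node *node;

		hlist_for_each_entry(sp, node, bucket, hash_link)
			if (sp->gfn == gfn && sp->nested_eptp == eptp) /* new field */
				return sp;
		return NULL;
	}

That keeps the footprint bounded by n_alloc_mmu_pages no matter how many eptp contexts the guest creates.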

Nested EPT is just like ordinary shadow pagetables. Shadow folds the gva->gpa translation with the gpa->hpa translation; nested EPT folds the ngpa->gpa translation with the gpa->hpa translation so you should be able to reuse the code.


  #include <linux/kvm_host.h>
  #include <linux/types.h>
@@ -2042,7 +2043,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
  		ASSERT(!VALID_PAGE(root));
  		if (tdp_enabled)
  			direct = 1;
-		if (mmu_check_root(vcpu, root_gfn))
+		if (!is_nested_tdp() && mmu_check_root(vcpu, root_gfn))
  			return 1;

Why remove the check? It's still needed (presuming root_gfn refers to the guest eptp).


-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
+static int __tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, pfn_t pfn,
  				u32 error_code)
  {
-	pfn_t pfn;
  	int r;
  	int level;
  	gfn_t gfn = gpa >> PAGE_SHIFT;
@@ -2159,11 +2159,6 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,

  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
  	smp_rmb();
-	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	if (is_error_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
  	spin_lock(&vcpu->kvm->mmu_lock);
  	if (mmu_notifier_retry(vcpu, mmu_seq))
  		goto out_unlock;
@@ -2180,6 +2175,30 @@ out_unlock:
  	return 0;
  }

Why do you need a variant of tdp_page_fault() that doesn't do error checking?


+int nested_tdp_page_fault(struct kvm_vcpu *vcpu,
+			  gpa_t gpa2,
+			  gpa_t ept12)
+{
+	gpa_t gpa1;
+	pfn_t pfn;
+	int r;
+	u64 data = 0;
+
+	ASSERT(vcpu);
+	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	r = mmu_topup_memory_caches(vcpu);
+	if (r)
+		return r;
+
+	gpa1 = paging64_nested_ept_walk(vcpu, gpa2, ept12);
+
+	if (gpa1 == UNMAPPED_GVA)
+		return 1;
+
+	kvm_read_guest(vcpu->kvm, gpa1, &data, sizeof(data));

Missing error check.

+
+	pfn = gfn_to_pfn(vcpu->kvm, gpa1 >> PAGE_SHIFT);
+
+	if (is_error_pfn(pfn)) {
+		kvm_release_pfn_clean(pfn);
+		return 1;
+	}
+
+	r = __tdp_page_fault(vcpu, gpa2 & PAGE_MASK, pfn, 0);
+	if (r)
+		return r;
+
+	return 0;
+
+}
+EXPORT_SYMBOL_GPL(nested_tdp_page_fault);

This should be part of the normal kvm_mmu_page_fault(). It needs to emulate if the nested guest has mmio access (for example if you use device assignment in the guest with an emulated device).

+#if PTTYPE == 64
+static gpa_t paging64_nested_ept_walk(struct kvm_vcpu *vcpu, gpa_t addr,
+				    gpa_t ept12)
+{
+	pt_element_t pte;
+	gfn_t table_gfn;
+	unsigned index;
+	gpa_t pte_gpa;
+	gpa_t gpa1 = UNMAPPED_GVA;
+
+	struct guest_walker walk;
+	struct guest_walker *walker = &walk;
+
+	walker->level = vcpu->arch.mmu.shadow_root_level;;
+	pte = ept12;
+
+	for (;;) {
+		index = PT_INDEX(addr, walker->level);
+
+		table_gfn = gpte_to_gfn(pte);
+		pte_gpa = gfn_to_gpa(table_gfn);
+		pte_gpa += index * sizeof(pt_element_t);
+		walker->table_gfn[walker->level - 1] = table_gfn;
+		walker->pte_gpa[walker->level - 1] = pte_gpa;
+
+		kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
+
+		if (pte == shadow_trap_nonpresent_pte)
+			return UNMAPPED_GVA;
+
+		walker->ptes[walker->level - 1] = pte;
+
+		if (walker->level == PT_PAGE_TABLE_LEVEL) {
+			walker->gfn = gpte_to_gfn(pte);
+			break;
+		}
+
+		if (walker->level == PT_DIRECTORY_LEVEL
+		&& (pte & PT_PAGE_SIZE_MASK)
+		&& (PTTYPE == 64 || is_pse(vcpu))) {
+			walker->gfn = gpte_to_gfn_lvl(pte, PT_DIRECTORY_LEVEL);
+			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
+			if (PTTYPE == 32 && is_cpuid_PSE36())
+				walker->gfn += pse36_gfn_delta(pte);
+			break;
+		}
+
+		--walker->level;
+	}
+
+	gpa1 = gfn_to_gpa(walker->gfn);
+
+	return gpa1;
+}
+#endif

This duplicates walk_addr(). Instead, please add a third compilation of paging_tmpl.h with the #defines set to the EPT format. You can use recursion to implement the nesting.

We'll have the following walk scenarios:

  32, 64 - the existing non-nested walks
  32-on-ept, 64-on-ept - nested vmx
  32-on-64, 32-on-32, 64-on-64, 64-on-32 - nested svm

What's needed is to replace kvm_read_guest() with a function pointer that does a nested walk.
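For reference, mmu.c already instantiates paging_tmpl.h twice; the third instantiation would look roughly like this (PTTYPE_EPT is an assumption, not existing code -- it would have to make the template use the EPT present/large-page bits instead of the ia32 ones):

	#define PTTYPE 64
	#include "paging_tmpl.h"
	#undef PTTYPE

	#define PTTYPE 32
	#include "paging_tmpl.h"
	#undef PTTYPE

	/* new: EPT-format walker generated from the same template */
	#define PTTYPE PTTYPE_EPT
	#include "paging_tmpl.h"
	#undef PTTYPE

The nested composition then falls out of routing the template's gpte reads through a per-mmu hook which, when L2 is running, itself performs the nested walk instead of a plain kvm_read_guest().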



+
+struct __attribute__ ((__packed__)) level_state {
+	struct shadow_vmcs *shadow_vmcs;
+
+	u16 vpid;
+	u64 shadow_efer;
+	unsigned long cr2;
+	unsigned long cr3;
+	unsigned long cr4;
+	unsigned long cr8;

cr0?

@@ -114,6 +282,34 @@ struct vcpu_vmx {
  	ktime_t entry_time;
  	s64 vnmi_blocked_time;
  	u32 exit_reason;
+
+	/* Nested vmx */
+
+	/* Has the level1 guest done vmon? */
+	int vmon;
+	/* Has the level1 guest done vmclear? */
+	int vmclear;
+
+	/* Are we running nested guest */
+	int nested_mode;
+
+	/* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
+	int nested_run_pending;
+
+	/* flag indicating if there was a valid IDT after exiting from l2 */
+	int nested_pending_valid_idt;
+
+	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
+	u64 l1_cur_vmcs;


Put all these into a 'nested' structure.  Use bool for booleans.
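Something like this (only a sketch; mirror whatever fields you actually end up needing):

	struct nested_vmx {
		bool vmxon;		/* has L1 executed VMXON? */
		bool vmclear;		/* has L1 executed VMCLEAR? */
		bool nested_mode;	/* are we running L2 right now? */
		bool nested_run_pending; /* VMLAUNCH/VMRESUME seen, L2 not entered yet */
		bool pending_valid_idt;	/* valid IDT-vectoring info after L2 exit */
		u64  current_vmcs;	/* L1 gpa of the VMCS it uses for L2 */
	};

embedded in struct vcpu_vmx as 'struct nested_vmx nested;', so callers read vmx->nested.nested_mode and friends.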


+static inline bool is_nested(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested_mode;
+}

Is this really necessary? You'll have the vmx variable available most places.

+
+static inline int vmcs_field_to_offset(unsigned long field)
+{
+	switch (field) {
+	case VIRTUAL_PROCESSOR_ID:
+		return offsetof(struct shadow_vmcs, virtual_processor_id);
+	case GUEST_ES_SELECTOR:
+		return offsetof(struct shadow_vmcs, guest_es_selector);
+	case GUEST_CS_SELECTOR:
+		return offsetof(struct shadow_vmcs, guest_cs_selector);
+	case GUEST_SS_SELECTOR:
+		return offsetof(struct shadow_vmcs, guest_ss_selector);
+	case GUEST_DS_SELECTOR:
+		return offsetof(struct shadow_vmcs, guest_ds_selector);
+	case GUEST_FS_SELECTOR:
+		return offsetof(struct shadow_vmcs, guest_fs_selector);
+	case GUEST_GS_SELECTOR:
+		return offsetof(struct shadow_vmcs, guest_gs_selector);
+	case GUEST_LDTR_SELECTOR:
+		return offsetof(struct shadow_vmcs, guest_ldtr_selector);
+	case GUEST_TR_SELECTOR:
+		return offsetof(struct shadow_vmcs, guest_tr_selector);
+	case HOST_ES_SELECTOR:
+		return offsetof(struct shadow_vmcs, host_es_selector);
+	case HOST_CS_SELECTOR:
+		return offsetof(struct shadow_vmcs, host_cs_selector);
+	case HOST_SS_SELECTOR:
+		return offsetof(struct shadow_vmcs, host_ss_selector);
+	case HOST_DS_SELECTOR:
+		return offsetof(struct shadow_vmcs, host_ds_selector);
+	case HOST_FS_SELECTOR:
+		return offsetof(struct shadow_vmcs, host_fs_selector);
+	case HOST_GS_SELECTOR:
+		return offsetof(struct shadow_vmcs, host_gs_selector);
+	case HOST_TR_SELECTOR:
+		return offsetof(struct shadow_vmcs, host_tr_selector);
+	case IO_BITMAP_A:
+		return offsetof(struct shadow_vmcs, io_bitmap_a);
+	case IO_BITMAP_A_HIGH:
+		return offsetof(struct shadow_vmcs, io_bitmap_a)+4;
+	case IO_BITMAP_B:
+		return offsetof(struct shadow_vmcs, io_bitmap_b);
+	case IO_BITMAP_B_HIGH:
+		return offsetof(struct shadow_vmcs, io_bitmap_b)+4;
+	case MSR_BITMAP:
+		return offsetof(struct shadow_vmcs, msr_bitmap);
+	case MSR_BITMAP_HIGH:
+		return offsetof(struct shadow_vmcs, msr_bitmap)+4;
+	case VM_EXIT_MSR_STORE_ADDR:
+		return offsetof(struct shadow_vmcs, vm_exit_msr_store_addr);
+	case VM_EXIT_MSR_STORE_ADDR_HIGH:
+		return offsetof(struct shadow_vmcs, vm_exit_msr_store_addr)+4;
+	case VM_EXIT_MSR_LOAD_ADDR:
+		return offsetof(struct shadow_vmcs, vm_exit_msr_load_addr);
+	case VM_EXIT_MSR_LOAD_ADDR_HIGH:
+		return offsetof(struct shadow_vmcs, vm_exit_msr_load_addr)+4;
+	case VM_ENTRY_MSR_LOAD_ADDR:
+		return offsetof(struct shadow_vmcs, vm_entry_msr_load_addr);
+	case VM_ENTRY_MSR_LOAD_ADDR_HIGH:
+		return offsetof(struct shadow_vmcs, vm_entry_msr_load_addr)+4;
+	case TSC_OFFSET:
+		return offsetof(struct shadow_vmcs, tsc_offset);
+	case TSC_OFFSET_HIGH:
+		return offsetof(struct shadow_vmcs, tsc_offset)+4;
+	case VIRTUAL_APIC_PAGE_ADDR:
+		return offsetof(struct shadow_vmcs, virtual_apic_page_addr);
+	case VIRTUAL_APIC_PAGE_ADDR_HIGH:
+		return offsetof(struct shadow_vmcs, virtual_apic_page_addr)+4;
+	case APIC_ACCESS_ADDR:
+		return offsetof(struct shadow_vmcs, apic_access_addr);
+	case APIC_ACCESS_ADDR_HIGH:
+		return offsetof(struct shadow_vmcs, apic_access_addr)+4;
+	case EPT_POINTER:
+		return offsetof(struct shadow_vmcs, ept_pointer);
+	case EPT_POINTER_HIGH:
+		return offsetof(struct shadow_vmcs, ept_pointer)+4;
+	case GUEST_PHYSICAL_ADDRESS:
+		return offsetof(struct shadow_vmcs, guest_physical_address);
+	case GUEST_PHYSICAL_ADDRESS_HIGH:
+		return offsetof(struct shadow_vmcs, guest_physical_address)+4;
+	case VMCS_LINK_POINTER:
+		return offsetof(struct shadow_vmcs, vmcs_link_pointer);
+	case VMCS_LINK_POINTER_HIGH:
+		return offsetof(struct shadow_vmcs, vmcs_link_pointer)+4;
+	case GUEST_IA32_DEBUGCTL:
+		return offsetof(struct shadow_vmcs, guest_ia32_debugctl);
+	case GUEST_IA32_DEBUGCTL_HIGH:
+		return offsetof(struct shadow_vmcs, guest_ia32_debugctl)+4;
+	case GUEST_IA32_PAT:
+		return offsetof(struct shadow_vmcs, guest_ia32_pat);
+	case GUEST_IA32_PAT_HIGH:
+		return offsetof(struct shadow_vmcs, guest_ia32_pat)+4;
+	case GUEST_PDPTR0:
+		return offsetof(struct shadow_vmcs, guest_pdptr0);
+	case GUEST_PDPTR0_HIGH:
+		return offsetof(struct shadow_vmcs, guest_pdptr0)+4;
+	case GUEST_PDPTR1:
+		return offsetof(struct shadow_vmcs, guest_pdptr1);
+	case GUEST_PDPTR1_HIGH:
+		return offsetof(struct shadow_vmcs, guest_pdptr1)+4;
+	case GUEST_PDPTR2:
+		return offsetof(struct shadow_vmcs, guest_pdptr2);
+	case GUEST_PDPTR2_HIGH:
+		return offsetof(struct shadow_vmcs, guest_pdptr2)+4;
+	case GUEST_PDPTR3:
+		return offsetof(struct shadow_vmcs, guest_pdptr3);
+	case GUEST_PDPTR3_HIGH:
+		return offsetof(struct shadow_vmcs, guest_pdptr3)+4;
+	case HOST_IA32_PAT:
+		return offsetof(struct shadow_vmcs, host_ia32_pat);
+	case HOST_IA32_PAT_HIGH:
+		return offsetof(struct shadow_vmcs, host_ia32_pat)+4;
+	case PIN_BASED_VM_EXEC_CONTROL:
+		return offsetof(struct shadow_vmcs, pin_based_vm_exec_control);
+	case CPU_BASED_VM_EXEC_CONTROL:
+		return offsetof(struct shadow_vmcs, cpu_based_vm_exec_control);
+	case EXCEPTION_BITMAP:
+		return offsetof(struct shadow_vmcs, exception_bitmap);
+	case PAGE_FAULT_ERROR_CODE_MASK:
+		return offsetof(struct shadow_vmcs, page_fault_error_code_mask);
+	case PAGE_FAULT_ERROR_CODE_MATCH:
+		return offsetof(struct shadow_vmcs,
+				page_fault_error_code_match);
+	case CR3_TARGET_COUNT:
+		return offsetof(struct shadow_vmcs, cr3_target_count);
+	case VM_EXIT_CONTROLS:
+		return offsetof(struct shadow_vmcs, vm_exit_controls);
+	case VM_EXIT_MSR_STORE_COUNT:
+		return offsetof(struct shadow_vmcs, vm_exit_msr_store_count);
+	case VM_EXIT_MSR_LOAD_COUNT:
+		return offsetof(struct shadow_vmcs, vm_exit_msr_load_count);
+	case VM_ENTRY_CONTROLS:
+		return offsetof(struct shadow_vmcs, vm_entry_controls);
+	case VM_ENTRY_MSR_LOAD_COUNT:
+		return offsetof(struct shadow_vmcs, vm_entry_msr_load_count);
+	case VM_ENTRY_INTR_INFO_FIELD:
+		return offsetof(struct shadow_vmcs, vm_entry_intr_info_field);
+	case VM_ENTRY_EXCEPTION_ERROR_CODE:
+		return offsetof(struct shadow_vmcs,
+				vm_entry_exception_error_code);
+	case VM_ENTRY_INSTRUCTION_LEN:
+		return offsetof(struct shadow_vmcs, vm_entry_instruction_len);
+	case TPR_THRESHOLD:
+		return offsetof(struct shadow_vmcs, tpr_threshold);
+	case SECONDARY_VM_EXEC_CONTROL:
+		return offsetof(struct shadow_vmcs, secondary_vm_exec_control);
+	case VM_INSTRUCTION_ERROR:
+		return offsetof(struct shadow_vmcs, vm_instruction_error);
+	case VM_EXIT_REASON:
+		return offsetof(struct shadow_vmcs, vm_exit_reason);
+	case VM_EXIT_INTR_INFO:
+		return offsetof(struct shadow_vmcs, vm_exit_intr_info);
+	case VM_EXIT_INTR_ERROR_CODE:
+		return offsetof(struct shadow_vmcs, vm_exit_intr_error_code);
+	case IDT_VECTORING_INFO_FIELD:
+		return offsetof(struct shadow_vmcs, idt_vectoring_info_field);
+	case IDT_VECTORING_ERROR_CODE:
+		return offsetof(struct shadow_vmcs, idt_vectoring_error_code);
+	case VM_EXIT_INSTRUCTION_LEN:
+		return offsetof(struct shadow_vmcs, vm_exit_instruction_len);
+	case VMX_INSTRUCTION_INFO:
+		return offsetof(struct shadow_vmcs, vmx_instruction_info);
+	case GUEST_ES_LIMIT:
+		return offsetof(struct shadow_vmcs, guest_es_limit);
+	case GUEST_CS_LIMIT:
+		return offsetof(struct shadow_vmcs, guest_cs_limit);
+	case GUEST_SS_LIMIT:
+		return offsetof(struct shadow_vmcs, guest_ss_limit);
+	case GUEST_DS_LIMIT:
+		return offsetof(struct shadow_vmcs, guest_ds_limit);
+	case GUEST_FS_LIMIT:
+		return offsetof(struct shadow_vmcs, guest_fs_limit);
+	case GUEST_GS_LIMIT:
+		return offsetof(struct shadow_vmcs, guest_gs_limit);
+	case GUEST_LDTR_LIMIT:
+		return offsetof(struct shadow_vmcs, guest_ldtr_limit);
+	case GUEST_TR_LIMIT:
+		return offsetof(struct shadow_vmcs, guest_tr_limit);
+	case GUEST_GDTR_LIMIT:
+		return offsetof(struct shadow_vmcs, guest_gdtr_limit);
+	case GUEST_IDTR_LIMIT:
+		return offsetof(struct shadow_vmcs, guest_idtr_limit);
+	case GUEST_ES_AR_BYTES:
+		return offsetof(struct shadow_vmcs, guest_es_ar_bytes);
+	case GUEST_CS_AR_BYTES:
+		return offsetof(struct shadow_vmcs, guest_cs_ar_bytes);
+	case GUEST_SS_AR_BYTES:
+		return offsetof(struct shadow_vmcs, guest_ss_ar_bytes);
+	case GUEST_DS_AR_BYTES:
+		return offsetof(struct shadow_vmcs, guest_ds_ar_bytes);
+	case GUEST_FS_AR_BYTES:
+		return offsetof(struct shadow_vmcs, guest_fs_ar_bytes);
+	case GUEST_GS_AR_BYTES:
+		return offsetof(struct shadow_vmcs, guest_gs_ar_bytes);
+	case GUEST_LDTR_AR_BYTES:
+		return offsetof(struct shadow_vmcs, guest_ldtr_ar_bytes);
+	case GUEST_TR_AR_BYTES:
+		return offsetof(struct shadow_vmcs, guest_tr_ar_bytes);
+	case GUEST_INTERRUPTIBILITY_INFO:
+		return offsetof(struct shadow_vmcs,
+				guest_interruptibility_info);
+	case GUEST_ACTIVITY_STATE:
+		return offsetof(struct shadow_vmcs, guest_activity_state);
+	case GUEST_SYSENTER_CS:
+		return offsetof(struct shadow_vmcs, guest_sysenter_cs);
+	case HOST_IA32_SYSENTER_CS:
+		return offsetof(struct shadow_vmcs, host_ia32_sysenter_cs);
+	case CR0_GUEST_HOST_MASK:
+		return offsetof(struct shadow_vmcs, cr0_guest_host_mask);
+	case CR4_GUEST_HOST_MASK:
+		return offsetof(struct shadow_vmcs, cr4_guest_host_mask);
+	case CR0_READ_SHADOW:
+		return offsetof(struct shadow_vmcs, cr0_read_shadow);
+	case CR4_READ_SHADOW:
+		return offsetof(struct shadow_vmcs, cr4_read_shadow);
+	case CR3_TARGET_VALUE0:
+		return offsetof(struct shadow_vmcs, cr3_target_value0);
+	case CR3_TARGET_VALUE1:
+		return offsetof(struct shadow_vmcs, cr3_target_value1);
+	case CR3_TARGET_VALUE2:
+		return offsetof(struct shadow_vmcs, cr3_target_value2);
+	case CR3_TARGET_VALUE3:
+		return offsetof(struct shadow_vmcs, cr3_target_value3);
+	case EXIT_QUALIFICATION:
+		return offsetof(struct shadow_vmcs, exit_qualification);
+	case GUEST_LINEAR_ADDRESS:
+		return offsetof(struct shadow_vmcs, guest_linear_address);
+	case GUEST_CR0:
+		return offsetof(struct shadow_vmcs, guest_cr0);
+	case GUEST_CR3:
+		return offsetof(struct shadow_vmcs, guest_cr3);
+	case GUEST_CR4:
+		return offsetof(struct shadow_vmcs, guest_cr4);
+	case GUEST_ES_BASE:
+		return offsetof(struct shadow_vmcs, guest_es_base);
+	case GUEST_CS_BASE:
+		return offsetof(struct shadow_vmcs, guest_cs_base);
+	case GUEST_SS_BASE:
+		return offsetof(struct shadow_vmcs, guest_ss_base);
+	case GUEST_DS_BASE:
+		return offsetof(struct shadow_vmcs, guest_ds_base);
+	case GUEST_FS_BASE:
+		return offsetof(struct shadow_vmcs, guest_fs_base);
+	case GUEST_GS_BASE:
+		return offsetof(struct shadow_vmcs, guest_gs_base);
+	case GUEST_LDTR_BASE:
+		return offsetof(struct shadow_vmcs, guest_ldtr_base);
+	case GUEST_TR_BASE:
+		return offsetof(struct shadow_vmcs, guest_tr_base);
+	case GUEST_GDTR_BASE:
+		return offsetof(struct shadow_vmcs, guest_gdtr_base);
+	case GUEST_IDTR_BASE:
+		return offsetof(struct shadow_vmcs, guest_idtr_base);
+	case GUEST_DR7:
+		return offsetof(struct shadow_vmcs, guest_dr7);
+	case GUEST_RSP:
+		return offsetof(struct shadow_vmcs, guest_rsp);
+	case GUEST_RIP:
+		return offsetof(struct shadow_vmcs, guest_rip);
+	case GUEST_RFLAGS:
+		return offsetof(struct shadow_vmcs, guest_rflags);
+	case GUEST_PENDING_DBG_EXCEPTIONS:
+		return offsetof(struct shadow_vmcs,
+				guest_pending_dbg_exceptions);
+	case GUEST_SYSENTER_ESP:
+		return offsetof(struct shadow_vmcs, guest_sysenter_esp);
+	case GUEST_SYSENTER_EIP:
+		return offsetof(struct shadow_vmcs, guest_sysenter_eip);
+	case HOST_CR0:
+		return offsetof(struct shadow_vmcs, host_cr0);
+	case HOST_CR3:
+		return offsetof(struct shadow_vmcs, host_cr3);
+	case HOST_CR4:
+		return offsetof(struct shadow_vmcs, host_cr4);
+	case HOST_FS_BASE:
+		return offsetof(struct shadow_vmcs, host_fs_base);
+	case HOST_GS_BASE:
+		return offsetof(struct shadow_vmcs, host_gs_base);
+	case HOST_TR_BASE:
+		return offsetof(struct shadow_vmcs, host_tr_base);
+	case HOST_GDTR_BASE:
+		return offsetof(struct shadow_vmcs, host_gdtr_base);
+	case HOST_IDTR_BASE:
+		return offsetof(struct shadow_vmcs, host_idtr_base);
+	case HOST_IA32_SYSENTER_ESP:
+		return offsetof(struct shadow_vmcs, host_ia32_sysenter_esp);
+	case HOST_IA32_SYSENTER_EIP:
+		return offsetof(struct shadow_vmcs, host_ia32_sysenter_eip);
+	case HOST_RSP:
+		return offsetof(struct shadow_vmcs, host_rsp);
+	case HOST_RIP:
+		return offsetof(struct shadow_vmcs, host_rip);
+	default:
+		printk(KERN_ERR "invalid vmcs encoding 0x%lx\n", field);
+		return -1;
+	}
+}

An array is better here if it can be reasonably packed.
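E.g. a table indexed directly by the field encoding (a sketch; note the host-state encodings run up to 0x6c16, so a naive u16 table is tens of kilobytes -- hence the packing caveat -- and an absent encoding reads back as 0, which needs a sentinel in real code):

	static const unsigned short vmcs_field_to_offset_table[] = {
		[VIRTUAL_PROCESSOR_ID] =
			offsetof(struct shadow_vmcs, virtual_processor_id),
		[GUEST_ES_SELECTOR] =
			offsetof(struct shadow_vmcs, guest_es_selector),
		/* ... one initializer per supported encoding ... */
		[HOST_RIP] = offsetof(struct shadow_vmcs, host_rip),
	};

	static inline int vmcs_field_to_offset(unsigned long field)
	{
		if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
			return -1;
		return vmcs_field_to_offset_table[field];
	}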

+
+static inline unsigned long nested_vmcs_readl(struct kvm_vcpu *vcpu,
+					      unsigned long field)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long entry = (unsigned long)(vmx->l2_state->shadow_vmcs);
+	if (!vmx->l2_state->shadow_vmcs) {
+		printk(KERN_ERR "%s invalid nested vmcs\n", __func__);
+		return -1;
+	}
+
+	entry += vmcs_field_to_offset(field);
+	return *(unsigned long *)(entry);
+}

Best to use the real size (also, don't you need to zero fill excess bytes)?

+
+static inline u64 nested_vmcs_read64(struct kvm_vcpu *vcpu, unsigned long field)
+{
+#ifdef CONFIG_X86_64
+	return nested_vmcs_readl(vcpu, field);
+#else   /* nested: 32 bit not actually tested */
+	return nested_vmcs_readl(vcpu, field) |
+		((u64)nested_vmcs_readl(vcpu, field+1) << 32);
+#endif
+}

Given it's pure software, why not read a u64?
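I.e. just index into the software structure with the field's natural width, something like (sketch):

	static inline u64 nested_vmcs_read64(struct kvm_vcpu *vcpu,
					     unsigned long field)
	{
		char *svmcs = (char *)to_vmx(vcpu)->l2_state->shadow_vmcs;

		return *(u64 *)(svmcs + vmcs_field_to_offset(field));
	}

with read16/read32 variants for the narrower fields, so nothing depends on sizeof(unsigned long) and no zero-filling games are needed.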

  static inline int cpu_has_vmx_invept_individual_addr(void)
  {
  	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -836,6 +1512,50 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
  	/* skipping an emulated instruction also counts */
  	vmx_set_interrupt_shadow(vcpu, 0);
  }
+static int vmptrld(struct kvm_vcpu *vcpu,
+		   u64 phys_addr)
+{
+	u8 error;
+
+	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+		      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+		      : "cc");
+	if (error) {
+		printk(KERN_ERR "kvm: %s vmptrld %llx failed\n",
+		       __func__, phys_addr);
+		return 1;
+	}
+
+	return 0;
+}

Please adjust vmx_vcpu_load() to use this. Also need to make the host vmclear machinery aware of it, so it can migrate vcpus, no?

+
+static void clear_rflags(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_ZF);
+	vmx_set_rflags(vcpu, rflags);
+}

This just clears two flags; make the name reflect it.

+
+static void vmfailInvalid_rflags(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags |= X86_EFLAGS_CF;
+	rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_ZF &
+		~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
+	vmx_set_rflags(vcpu, rflags);
+}
+
+static void vmfailValid_rflags(struct kvm_vcpu *vcpu)
+{
+	unsigned long rflags;
+	rflags = vmx_get_rflags(vcpu);
+	rflags |= X86_EFLAGS_ZF;
+	rflags &= ~X86_EFLAGS_PF & ~X86_EFLAGS_AF & ~X86_EFLAGS_CF &
+		~X86_EFLAGS_SF & ~X86_EFLAGS_OF;
+	vmx_set_rflags(vcpu, rflags);
+}


Naming...


+
+/*
+ * Writes msr value for nested virtualization
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:

Don't we need to do something here?

+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
+


@@ -2543,6 +3367,9 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
  {
  	u32 cpu_based_vm_exec_control;

+	if (is_nested(vcpu))
+		return;
+

Doesn't seem right.

+	r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
+	if (r) {
+		printk(KERN_ERR "%s cannot read guest vmcs addr %llx : %d\n",
+		       __func__, gpa, r);
+		goto out;

You can triple-fault the guest if it puts a vmcs in mmio.
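I.e. something along these lines instead of printing and failing the emulation (a sketch, using the existing request mechanism):

	r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
	if (r) {
		/* the guest pointed its vmcs at unreadable memory (e.g. mmio) */
		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
		return 1;
	}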

+
+static int handle_vmptrld(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct page *vmcs_page;
+	u64 guest_vmcs_addr;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;

Hmm, what if the nested guest runs vmptrld? You need to inject a vmexit in this case.

+	vmx->l2_state->vmcs = alloc_vmcs();
+	if (!vmx->l2_state->vmcs) {
+		printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
+		return 1;
+	}
+
+	if (vmx->l1_cur_vmcs != guest_vmcs_addr) {
+		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
+		if (vmcs_page == NULL)
+			return 1;
+
+		/* load nested vmcs to processor */
+		if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
+			printk(KERN_INFO "%s error in vmptrld \n",
+			       __func__);
+			kvm_release_page_clean(vmcs_page);
+			return 1;
+		}

Isn't it dangerous to vmptrld a guest page? Another vcpu could play with it, corrupting its state.


  	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
-	trace_kvm_page_fault(gpa, exit_qualification);
-	return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+
+	if (is_nested(vcpu) && nested_cpu_has_vmx_ept(vcpu)) {
+		/* get gpa1 from EPT12 */
+		r  = nested_tdp_page_fault(vcpu, gpa,
+					   to_vmx(vcpu)->
+					   l2_state->shadow_vmcs->ept_pointer);
+
+		if (r < 0) {
+			printk(KERN_ERR "EPT: Not enough memory!\n");
+			return -ENOMEM;

Just return r.

+		} else if (r) {
+			nested_vmx_vmexit(vcpu, false);
+			return 1;
+		}
+		return 1;
+	} else	{
+		trace_kvm_page_fault(gpa, exit_qualification);
+		return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+	}
+}

Again, I think the nested EPT code should be part of the normal kvm_mmu_page_fault() path.

+
+static int handle_invept(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	unsigned long type = kvm_register_read(vcpu, VCPU_REGS_RCX);
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	switch (type) {
+	case VMX_EPT_EXTENT_INDIVIDUAL_ADDR:
+		if (!cpu_has_vmx_invept_individual_addr())
+			vmfailValid_rflags(vcpu);
+		else {
+			struct {
+				u64 eptp, gpa;
+			} operand;
+			kvm_read_guest(vcpu->kvm,
+				       kvm_register_read(vcpu, VCPU_REGS_RAX),
+				&operand, sizeof(operand));
+			if (vmx->l2_state->ept_pointer)
+				ept_sync_individual_addr(vmx->l2_state->
+							 ept_pointer,
+							 operand.gpa);


You need to use the translated gpa here, no?

+	case VMX_EPT_EXTENT_GLOBAL:
+		if (!cpu_has_vmx_invept_global())
+			vmfailValid_rflags(vcpu);
+		else
+			ept_sync_global();
+		break;

Even if the guest requests a global flush, we can do a context flush for the current nested eptp, and use a generation counter to flush other contexts when they're switched to. Will reduce guests affecting each other, though probably not important.
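A sketch of that idea (the global counter and the per-vmx generation field are hypothetical, not existing code):

	static atomic_t nested_invept_gen = ATOMIC_INIT(0);

	static void nested_handle_invept_global(struct vcpu_vmx *vmx)
	{
		/* flush only the context that is loaded right now... */
		if (vmx->l2_state->ept_pointer)
			ept_sync_context(vmx->l2_state->ept_pointer);
		/* ...and mark the others stale; they flush lazily */
		vmx->nested_invept_gen = atomic_inc_return(&nested_invept_gen);
	}

then compare vmx->nested_invept_gen against the global counter whenever a nested eptp is loaded, and ept_sync_context() it if it lags.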


+void save_vmcs(struct shadow_vmcs *dst)
+{
+	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
+	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
+	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
+	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
+	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
+	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
+	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
+	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);

IO_BITMAP_A can only change with vmwrite, which we trap, so why do we need to read it? Ditto for many other fields.
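I.e. fields that L1 can only change through VMWRITE should be captured once in the vmwrite handler and never read back from the hardware VMCS on exit; roughly (helper name made up, and a real version would honour the field width):

	static void nested_cache_vmwrite(struct vcpu_vmx *vmx,
					 unsigned long field, u64 value)
	{
		int offset = vmcs_field_to_offset(field);

		if (offset >= 0)
			*(u64 *)((char *)vmx->l2_state->shadow_vmcs + offset)
				= value;
	}

save_vmcs() then only needs the fields the processor itself modifies (exit information, guest state).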

Please split into smaller patches for easier future review.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html
