Re: [PATCH 15/27] nVMX: Prepare vmcs02 from vmcs01 and vmcs12

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



 On 10/17/2010 12:11 PM, Nadav Har'El wrote:
This patch contains code to prepare the VMCS which can be used to actually
run the L2 guest, vmcs02. prepare_vmcs02 appropriately merges the information
in vmcs12 (the vmcs that L1 built for L2) and in vmcs01 (the vmcs that we
built for L1).

VMREAD/WRITE can only access one VMCS at a time (the "current" VMCS), which
makes it difficult for us to read from vmcs01 while writing to vmcs12. This
is why we first make a copy of vmcs01 in memory (vmcs01_fields) and then
read that memory copy while writing to vmcs12.


I believe I commented on this before - you can call the same functions kvm uses to initialize the normal vmcs to get the common parts filled in.+int load_vmcs_host_state(struct vmcs_fields *src)
+{
+	vmcs_write16(HOST_ES_SELECTOR, src->host_es_selector);
+	vmcs_write16(HOST_CS_SELECTOR, src->host_cs_selector);
+	vmcs_write16(HOST_SS_SELECTOR, src->host_ss_selector);
+	vmcs_write16(HOST_DS_SELECTOR, src->host_ds_selector);
+	vmcs_write16(HOST_FS_SELECTOR, src->host_fs_selector);
+	vmcs_write16(HOST_GS_SELECTOR, src->host_gs_selector);
+	vmcs_write16(HOST_TR_SELECTOR, src->host_tr_selector);

vmx_vcpu_setup() - you can extract the common parts and call them from here.

+
+	if (vmcs_config.vmexit_ctrl&  VM_EXIT_LOAD_IA32_PAT)
+		vmcs_write64(HOST_IA32_PAT, src->host_ia32_pat);
+
+	vmcs_write32(HOST_IA32_SYSENTER_CS, src->host_ia32_sysenter_cs);
+
+	vmcs_writel(HOST_CR0, src->host_cr0);
+	vmcs_writel(HOST_CR3, src->host_cr3);
+	vmcs_writel(HOST_CR4, src->host_cr4);

Ditto.

+	vmcs_writel(HOST_FS_BASE, src->host_fs_base);
+	vmcs_writel(HOST_GS_BASE, src->host_gs_base);

These change on vcpu migration. Perhaps the cause of smp failures? Check that vmx_vcpu_load() updates the correct vmcs.

+	vmcs_writel(HOST_TR_BASE, src->host_tr_base);
+	vmcs_writel(HOST_GDTR_BASE, src->host_gdtr_base);

Both per-cpu, again updated on vcpu mirgation.

+	vmcs_writel(HOST_IDTR_BASE, src->host_idtr_base);

Not per-cpu, unfortunately.

+	vmcs_writel(HOST_RSP, src->host_rsp);
+	vmcs_writel(HOST_RIP, src->host_rip);
+	vmcs_writel(HOST_IA32_SYSENTER_ESP, src->host_ia32_sysenter_esp);
+	vmcs_writel(HOST_IA32_SYSENTER_EIP, src->host_ia32_sysenter_eip);

Constant, can use vmx_vcpu_setup().

+
+	return 0;
+}
+
  /*
   * Switches to specified vcpu, until a matching vcpu_put(), but assumes
   * vcpu mutex is already taken.
@@ -5359,6 +5412,361 @@ static void vmx_set_supported_cpuid(u32
  		entry->ecx |= bit(X86_FEATURE_VMX);
  }

+/*
+ * Make a copy of the current VMCS to ordinary memory. This is needed because
+ * in VMX you cannot read and write to two VMCS at the same time, so when we
+ * want to do this (in prepare_vmcs02, which needs to read from vmcs01 while
+ * preparing vmcs02), we need to first save a copy of one VMCS's fields in
+ * memory, and then use that copy.
+ */
+void save_vmcs(struct vmcs_fields *dst)
+{
+	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
+	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
+	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
+	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
+	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
+	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
+	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
+	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
+	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
+	if (cpu_has_vmx_msr_bitmap())
+		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);

Do we support io bitmaps and msr bitmaps in this version? If not, please drop (also ept).

+	dst->tsc_offset = vmcs_read64(TSC_OFFSET);
+	dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
+	dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
+	if (enable_ept)
+		dst->ept_pointer = vmcs_read64(EPT_POINTER);
+	dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
+	dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs_config.vmentry_ctrl&  VM_ENTRY_LOAD_IA32_PAT)
+		dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	if (enable_ept) {
+		dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+		dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+		dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+		dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+	}
+	dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+	dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
+	dst->page_fault_error_code_mask =
+		vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
+	dst->page_fault_error_code_match =
+		vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
+	dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
+	dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
+	dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
+	dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+	dst->vm_entry_exception_error_code =
+		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+	dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
+	dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
+	dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+	if (enable_vpid&&  dst->secondary_vm_exec_control&
+	    SECONDARY_EXEC_ENABLE_VPID)
+		dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
+	dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+	dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+	dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	dst->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+	dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+	dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
+	dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
+	dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
+	dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+	dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
+	dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
+	dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
+	dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
+	dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
+	dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+	dst->guest_cr0 = vmcs_readl(GUEST_CR0);
+	dst->guest_cr3 = vmcs_readl(GUEST_CR3);
+	dst->guest_cr4 = vmcs_readl(GUEST_CR4);
+	dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+	dst->guest_dr7 = vmcs_readl(GUEST_DR7);
+	dst->guest_rsp = vmcs_readl(GUEST_RSP);
+	dst->guest_rip = vmcs_readl(GUEST_RIP);
+	dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+	dst->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+	dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+	dst->host_cr0 = vmcs_readl(HOST_CR0);
+	dst->host_cr3 = vmcs_readl(HOST_CR3);
+	dst->host_cr4 = vmcs_readl(HOST_CR4);
+	dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
+	dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
+	dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
+	dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
+	dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
+	dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
+	dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
+	dst->host_rsp = vmcs_readl(HOST_RSP);
+	dst->host_rip = vmcs_readl(HOST_RIP);
+	if (vmcs_config.vmexit_ctrl&  VM_EXIT_LOAD_IA32_PAT)
+		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);

I think this should be broken up:

- guest-state fields, obviously needed
- host-state fields, not needed (can reuse current kvm code to setup, never need to read them)
- control fields not modified by hardware - no need to save
- read-only control fields - probably no need to load

+
+	vmcs_write64(VMCS_LINK_POINTER, vmcs12->vmcs_link_pointer);
+	vmcs_write64(IO_BITMAP_A, vmcs01->io_bitmap_a);
+	vmcs_write64(IO_BITMAP_B, vmcs01->io_bitmap_b);

Reuse vmx_vcpu_setup()

+	if (cpu_has_vmx_msr_bitmap())
+		vmcs_write64(MSR_BITMAP, vmcs01->msr_bitmap);

Have setup_msrs() cache the value of msr_bitmap somewhere, so we don't need to vmcs_read64() it.


+
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+		     (vmcs01->pin_based_vm_exec_control |
+		      vmcs12->pin_based_vm_exec_control));

Reuse vmx_vcpu_setup()

+
+	if (vm_need_tpr_shadow(vcpu->kvm)&&
+	    nested_cpu_has_vmx_tpr_shadow(vcpu))
+		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+
+	exec_control = vmcs01->cpu_based_vm_exec_control;
+	exec_control&= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+	exec_control&= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+	exec_control&= ~CPU_BASED_TPR_SHADOW;
+	exec_control |= vmcs12->cpu_based_vm_exec_control;
+	if (!vm_need_tpr_shadow(vcpu->kvm) ||
+	    vmcs12->virtual_apic_page_addr == 0) {
+		exec_control&= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_STORE_EXITING |
+			CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	} else if (exec_control&  CPU_BASED_TPR_SHADOW) {
+#ifdef CONFIG_X86_64
+		exec_control&= ~CPU_BASED_CR8_STORE_EXITING;
+		exec_control&= ~CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	}
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);

Reuse vmx_vcpu_setup() code instead of vmread().

Note you have to set KVM_REQ_EVENT so INTR_PENDING is recalculated on vmexit.

+
+	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
+	 * bitwise-or of what L1 wants to trap for L2, and what we want to
+	 * trap. However, vmx_fpu_activate/deactivate may have happened after
+	 * we saved vmcs01, so we shouldn't trust its TS and NM_VECTOR bits
+	 * and need to base them again on fpu_active. Note that CR0.TS also
+	 * needs updating - we do this after this function returns (in
+	 * nested_vmx_run).
+	 */
+	vmcs_write32(EXCEPTION_BITMAP,
+		     ((vmcs01->exception_bitmap&~(1u<<NM_VECTOR)) |
+		      (vcpu->fpu_active ? 0 : (1u<<NM_VECTOR)) |
+		      vmcs12->exception_bitmap));

I think you can reuse update_exception_bitmap() here. Also, may need to enable #PF interception when enable_ept? not sure.

This reuses the fpu_active and guest debugging logic in update_exception_bitmap(). Again, needs to happen after we see the L2 state.

+	vmcs_writel(CR0_GUEST_HOST_MASK, vmcs12->cr0_guest_host_mask |
+			(vcpu->fpu_active ? 0 : X86_CR0_TS));

Please use ~cr0_guest_owned_bits instead, equivalent information. Should be something like

vmcs_writel(CR0_GUEST_HOST_MASK, vmcs12->cr0_guest_host_mask | ~cr0_guest_owned_bits);

+	vcpu->arch.cr0_guest_owned_bits = ~(vmcs12->cr0_guest_host_mask |
+			(vcpu->fpu_active ? 0 : X86_CR0_TS));

Here, too ( |= is natural for updating).

+
+	vmcs_write32(VM_EXIT_CONTROLS,
+		     (vmcs01->vm_exit_controls&
+			(~(VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT)))
+		       | vmcs12->vm_exit_controls);

vmx_vcpu_setup

+
+	vmcs_write32(VM_ENTRY_CONTROLS,
+		     (vmcs01->vm_entry_controls&
+			(~(VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE)))
+		      | vmcs12->vm_entry_controls);

vmx_vcpu_setup; IA32E mode will be updated by enter_lmode() or exit_lmode() which you'll need to call.

+
+	vmcs_writel(CR4_GUEST_HOST_MASK,
+		    (vmcs01->cr4_guest_host_mask&
+		     vmcs12->cr4_guest_host_mask));
+	

~cr4_guest_owned_bits

+	vmcs_write64(TSC_OFFSET, vmcs01->tsc_offset + vmcs12->tsc_offset);

Zachary Amsden

+
+	return 0;
+}
+
  static struct kvm_x86_ops vmx_x86_ops = {
  	.cpu_has_kvm_support = cpu_has_kvm_support,
  	.disabled_by_bios = vmx_disabled_by_bios,


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux