Re: [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume


 




Avi Kivity <avi@xxxxxxxxxx> wrote on 03/09/2009 00:38:16:

> From: Avi Kivity <avi@xxxxxxxxxx>
> To: Orit Wasserman/Haifa/IBM@IBMIL
> Cc: kvm@xxxxxxxxxxxxxxx, Ben-Ami Yassour1/Haifa/IBM@IBMIL, Muli Ben-Yehuda/Haifa/IBM@IBMIL, Abel Gordon/Haifa/IBM@IBMIL, aliguori@xxxxxxxxxx, mmday@xxxxxxxxxx
> Date: 03/09/2009 00:38
> Subject: Re: [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume
>
> On 09/02/2009 06:38 PM, oritw@xxxxxxxxxx wrote:
> > -struct nested_vmx {
> > -   /* Has the level1 guest done vmon? */
> > +struct nested_vmx {   /* Has the level1 guest done vmon? */
> >
>
> A \n died here.
I will fix it.
>
> >      bool vmon;
> >      /* Has the level1 guest done vmclear? */
> >      bool vmclear;
> > +
> > +   /* Are we running nested guest */
> > +   bool nested_mode;
> > +
> > +   /* L1 requested VMLAUNCH or VMRESUME but we didn't run L2 yet */
> > +   bool nested_run_pending;
> > +
> > +   /* flag indicating if there was a valid IDT after exiting from l2 */
> > +   bool nested_pending_valid_idt;
> >
>
> What does this mean?  pending event?
I will rename it.
> >
> > +
> > +static inline int nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu)
> > +{
> > +   return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
> > +      cpu_based_vm_exec_control & CPU_BASED_TPR_SHADOW;
> > +}
> >
>
> Don't we need to check if the host supports it too?
We check it separately, but I can add the check here as well.
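For example, a minimal sketch (this assumes we can reuse the existing
cpu_has_vmx_tpr_shadow() capability helper in vmx.c):

   static inline int nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu)
   {
           /* require host support in addition to L1 requesting the control */
           return cpu_has_vmx_tpr_shadow() &&
                  (to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
                   cpu_based_vm_exec_control & CPU_BASED_TPR_SHADOW);
   }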
>
> > +static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
> > +                        *vcpu)
> > +{
> > +   struct shadow_vmcs *shadow = to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
> > +
> > +   return (shadow->secondary_vm_exec_control &
> > +      SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
> > +      to_vmx(vcpu)->nested.l2_state->shadow_vmcs->apic_access_addr != 0;
> > +}
> >
>
> Why check apic_access_addr?
I will remove it.
>
> > +
> > +static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
> > +{
> > +   return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
> > +      secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT;
> > +}
> >
>
> Need to check if secondary controls enabled?
If the secondary controls are not enabled, this field is zero.
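If we still want the check to be explicit, something like this sketch would
do it (the nested_cpu_has_secondary_exec_ctrls() name is made up here):

   static inline int nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu)
   {
           return to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
                   cpu_based_vm_exec_control &
                   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
   }

   static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
   {
           return nested_cpu_has_secondary_exec_ctrls(vcpu) &&
                  (to_vmx(vcpu)->nested.l2_state->shadow_vmcs->
                   secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT);
   }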
>
> > +static void vmx_set_irq(struct kvm_vcpu *vcpu)
> > +{
> > +   if (to_vmx(vcpu)->nested.nested_mode)
> > +      return;
> >
>
> Why?
>
> Note if the guest didn't enable external interrupt exiting, we need to
> inject as usual.
I will look into it.
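One direction I am considering (only a sketch, not tested): skip the
injection only when L1 asked for external-interrupt exiting, so the normal
path still runs otherwise:

   static void vmx_set_irq(struct kvm_vcpu *vcpu)
   {
           struct vcpu_vmx *vmx = to_vmx(vcpu);

           /* defer to an exit into L1 only if L1 wants ext-intr exiting */
           if (vmx->nested.nested_mode &&
               (vmx->nested.l2_state->shadow_vmcs->pin_based_vm_exec_control &
                PIN_BASED_EXT_INTR_MASK))
                   return;

           /* ... existing injection code ... */
   }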
>
> >
> > +static int nested_handle_pending_idt(struct kvm_vcpu *vcpu)
> > +{
> >
>
> Again the name is confusing.  pending_event_injection?
I will rename it.
>
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   int irq;
> > +   int type;
> > +   int errCodeValid;
> > +   u32 idt_vectoring_info;
> > +   u32 guest_intr;
> > +   bool nmi_window_open;
> > +   bool interrupt_window_open;
> > +
> > +   if (vmx->nested.nested_mode && vmx->nested.nested_pending_valid_idt) {
> > +      idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> > +      irq  = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
> > +      type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
> > +      errCodeValid = idt_vectoring_info &
> > +         VECTORING_INFO_DELIVER_CODE_MASK;
> > +
> > +      guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> > +      nmi_window_open =
> > +         !(guest_intr & (GUEST_INTR_STATE_STI |
> > +               GUEST_INTR_STATE_MOV_SS |
> > +               GUEST_INTR_STATE_NMI));
> > +
> > +      interrupt_window_open =
> > +         ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
> > +          !(guest_intr & (GUEST_INTR_STATE_STI |
> > +                GUEST_INTR_STATE_MOV_SS)));
> > +
> > +      if (type == INTR_TYPE_EXT_INTR && !interrupt_window_open) {
> > +         printk(KERN_INFO "IDT ignored, l2 interrupt window closed!\n");
> > +         return 0;
> > +      }
> >
>
> How can this happen?  Unless it's on nested entry, in which case we need
> to abort the entry.
OK, I will fix it. To be honest, I have never actually seen it happen.
>
> > +
> >   #ifdef CONFIG_X86_64
> >   #define R "r"
> >   #define Q "q"
> > @@ -4646,6 +4842,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
> >   {
> >      struct vcpu_vmx *vmx = to_vmx(vcpu);
> >
> > +   nested_handle_pending_idt(vcpu);
> >
>
> You're not checking the return code (need to do that on entry).
I will fix it.
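Roughly like this, I suppose (a sketch; what exactly to do when the event
cannot be injected still needs to be decided):

   if (nested_handle_pending_idt(vcpu)) {
           /* assumption: abort this entry and retry the injection later */
           return;
   }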
>
> > +
> > +   if (vmx->nested.nested_mode) {
> > +      vmcs_writel(GUEST_CR0, vmx->nested.l2_state->shadow_vmcs->guest_cr0);
> >
>
> Might not be legal.  We may also want to force-enable caching.  Lastly,
> don't we need to handle cr0.ts and cr0.mp specially to manage the fpu
> state?
We are working on implementing this correctly. KVM seems to handle it fine,
but VMware doesn't like it.
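A rough sketch of the direction we are looking at (it assumes keying off
vcpu->fpu_active, as the non-nested path does):

   unsigned long guest_cr0 = vmx->nested.l2_state->shadow_vmcs->guest_cr0;

   /* keep TS set while the FPU is lazily switched out, to trap first use */
   if (!vcpu->fpu_active)
           guest_cr0 |= X86_CR0_TS;
   /* VMX requires NE; force-enable caching by clearing CD/NW */
   guest_cr0 |= X86_CR0_NE;
   guest_cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
   vmcs_writel(GUEST_CR0, guest_cr0);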
>
> >
> > +   if (vmx->nested.nested_mode)
> > +      vmx->nested.vmclear = 0;
> > +
> >
>
> Why?
I will check it.
>
> >   free_vmcs:
> > @@ -5122,6 +5339,228 @@ static int shadow_vmcs_load(struct kvm_vcpu *vcpu)
> >      return 0;
> >   }
> >
> > +void prepare_vmcs_12(struct kvm_vcpu *vcpu)
> > +{
> > +   struct shadow_vmcs *l2_shadow_vmcs =
> > +      to_vmx(vcpu)->nested.l2_state->shadow_vmcs;
> > +   struct shadow_vmcs *l1_shadow_vmcs =
> > +      to_vmx(vcpu)->nested.l1_state->shadow_vmcs;
> > +
> > +   l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> > +   l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> > +   l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> > +   l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> > +   l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> > +   l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> > +
> > +   l1_shadow_vmcs->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> > +   l1_shadow_vmcs->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> > +   l1_shadow_vmcs->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> > +   l1_shadow_vmcs->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> > +   l1_shadow_vmcs->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> > +   l1_shadow_vmcs->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> > +   l1_shadow_vmcs->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> > +
> > +   l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
> > +   l2_shadow_vmcs->guest_physical_address =
> > +      vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> > +   l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> > +   l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> > +   if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> > +      l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> > +   l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> > +   l2_shadow_vmcs->vm_entry_intr_info_field =
> > +      vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> > +   l2_shadow_vmcs->vm_entry_exception_error_code =
> > +      vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> > +   l2_shadow_vmcs->vm_entry_instruction_len =
> > +      vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> > +   l2_shadow_vmcs->vm_instruction_error =
> > +      vmcs_read32(VM_INSTRUCTION_ERROR);
> > +   l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> > +   l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> > +   l2_shadow_vmcs->vm_exit_intr_error_code =
> > +      vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> > +   l2_shadow_vmcs->idt_vectoring_info_field =
> > +      vmcs_read32(IDT_VECTORING_INFO_FIELD);
> > +   l2_shadow_vmcs->idt_vectoring_error_code =
> > +      vmcs_read32(IDT_VECTORING_ERROR_CODE);
> > +   l2_shadow_vmcs->vm_exit_instruction_len =
> > +      vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> > +   l2_shadow_vmcs->vmx_instruction_info =
> > +      vmcs_read32(VMX_INSTRUCTION_INFO);
> > +   l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> > +   l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> > +   l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> > +   l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> > +   l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> > +   l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> > +   l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> > +   l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> > +   l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> > +   l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> > +   l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> > +   l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> > +   l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> > +   l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> > +   l2_shadow_vmcs->guest_interruptibility_info =
> > +      vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> > +   l2_shadow_vmcs->guest_activity_state =
> > +      vmcs_read32(GUEST_ACTIVITY_STATE);
> > +   l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> > +
> > +   l1_shadow_vmcs->host_ia32_sysenter_cs =
> > +      vmcs_read32(HOST_IA32_SYSENTER_CS);
> > +
> > +   l2_shadow_vmcs->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> > +   l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> > +   l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> > +   l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> > +   l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0);
> > +
> > +   l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
> > +   l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> > +   l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> > +   l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> > +   l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> > +   l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> > +   l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> > +   l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> > +   l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> > +   l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> > +   l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> > +   l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
> > +   l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
> > +   l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
> > +   l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> > +   l2_shadow_vmcs->guest_pending_dbg_exceptions =
> > +      vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> > +   l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> > +   l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> > +
> > +   l1_shadow_vmcs->host_cr0 = vmcs_readl(HOST_CR0);
> > +   l1_shadow_vmcs->host_cr3 = vmcs_readl(HOST_CR3);
> > +   l1_shadow_vmcs->host_cr4 = vmcs_readl(HOST_CR4);
> > +   l1_shadow_vmcs->host_fs_base = vmcs_readl(HOST_FS_BASE);
> > +   l1_shadow_vmcs->host_gs_base = vmcs_readl(HOST_GS_BASE);
> > +   l1_shadow_vmcs->host_tr_base = vmcs_readl(HOST_TR_BASE);
> > +   l1_shadow_vmcs->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> > +   l1_shadow_vmcs->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> > +   l1_shadow_vmcs->host_ia32_sysenter_esp =
> > +      vmcs_readl(HOST_IA32_SYSENTER_ESP);
> > +   l1_shadow_vmcs->host_ia32_sysenter_eip =
> > +      vmcs_readl(HOST_IA32_SYSENTER_EIP);
> > +   l1_shadow_vmcs->host_rsp = vmcs_readl(HOST_RSP);
> > +   l1_shadow_vmcs->host_rip = vmcs_readl(HOST_RIP);
> > +}
> >
>
> Can't we do it lazily?  Only read these on demand?
We can optimize by reading some fields only when they change (after switching
to qemu, for example); we do that in our performance version. Also, some
fields are written by KVM only once, so we could read them only once.
This can be risky with other hypervisors, though, since they may change those
fields more frequently.

>
> > +
> > +int load_vmcs_common(struct shadow_vmcs *src)
> > +{
> > +   vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
> > +   vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
> > +   vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
> > +   vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
> > +   vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
> > +   vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
> > +   vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
> > +   vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
> > +
> > +   vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
> > +   vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
> > +
> > +   if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
> > +      vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
> > +
> > +   vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
> > +   vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, src->vm_entry_intr_info_field);
> > +   vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
> > +           src->vm_entry_exception_error_code);
> > +   vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, src->vm_entry_instruction_len);
> > +
> > +   vmcs_write32(GUEST_ES_LIMIT, src->guest_es_limit);
> > +   vmcs_write32(GUEST_CS_LIMIT, src->guest_cs_limit);
> > +   vmcs_write32(GUEST_SS_LIMIT, src->guest_ss_limit);
> > +   vmcs_write32(GUEST_DS_LIMIT, src->guest_ds_limit);
> > +   vmcs_write32(GUEST_FS_LIMIT, src->guest_fs_limit);
> > +   vmcs_write32(GUEST_GS_LIMIT, src->guest_gs_limit);
> > +   vmcs_write32(GUEST_LDTR_LIMIT, src->guest_ldtr_limit);
> > +   vmcs_write32(GUEST_TR_LIMIT, src->guest_tr_limit);
> > +   vmcs_write32(GUEST_GDTR_LIMIT, src->guest_gdtr_limit);
> > +   vmcs_write32(GUEST_IDTR_LIMIT, src->guest_idtr_limit);
> > +   vmcs_write32(GUEST_ES_AR_BYTES, src->guest_es_ar_bytes);
> > +   vmcs_write32(GUEST_CS_AR_BYTES, src->guest_cs_ar_bytes);
> > +   vmcs_write32(GUEST_SS_AR_BYTES, src->guest_ss_ar_bytes);
> > +   vmcs_write32(GUEST_DS_AR_BYTES, src->guest_ds_ar_bytes);
> > +   vmcs_write32(GUEST_FS_AR_BYTES, src->guest_fs_ar_bytes);
> > +   vmcs_write32(GUEST_GS_AR_BYTES, src->guest_gs_ar_bytes);
> > +   vmcs_write32(GUEST_LDTR_AR_BYTES, src->guest_ldtr_ar_bytes);
> > +   vmcs_write32(GUEST_TR_AR_BYTES, src->guest_tr_ar_bytes);
> > +   vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
> > +           src->guest_interruptibility_info);
> > +   vmcs_write32(GUEST_ACTIVITY_STATE, src->guest_activity_state);
> > +   vmcs_write32(GUEST_SYSENTER_CS, src->guest_sysenter_cs);
> > +
> > +   vmcs_writel(GUEST_ES_BASE, src->guest_es_base);
> > +   vmcs_writel(GUEST_CS_BASE, src->guest_cs_base);
> > +   vmcs_writel(GUEST_SS_BASE, src->guest_ss_base);
> > +   vmcs_writel(GUEST_DS_BASE, src->guest_ds_base);
> > +   vmcs_writel(GUEST_FS_BASE, src->guest_fs_base);
> > +   vmcs_writel(GUEST_GS_BASE, src->guest_gs_base);
> > +   vmcs_writel(GUEST_LDTR_BASE, src->guest_ldtr_base);
> > +   vmcs_writel(GUEST_TR_BASE, src->guest_tr_base);
> > +   vmcs_writel(GUEST_GDTR_BASE, src->guest_gdtr_base);
> > +   vmcs_writel(GUEST_IDTR_BASE, src->guest_idtr_base);
> > +   vmcs_writel(GUEST_DR7, src->guest_dr7);
> > +   vmcs_writel(GUEST_RSP, src->guest_rsp);
> > +   vmcs_writel(GUEST_RIP, src->guest_rip);
> > +   vmcs_writel(GUEST_RFLAGS, src->guest_rflags);
> > +   vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
> > +          src->guest_pending_dbg_exceptions);
> > +   vmcs_writel(GUEST_SYSENTER_ESP, src->guest_sysenter_esp);
> > +   vmcs_writel(GUEST_SYSENTER_EIP, src->guest_sysenter_eip);
> > +
> > +   return 0;
> > +}
> >
>
> If we do it lazily, we'll only need to reload bits that have changed.
True, we can add a bitmap and reload only the fields that were actually written.
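A very rough sketch of that idea (the names dirty_fields and NESTED_FIELD_*
are hypothetical, just to show the shape):

   /* one bit per shadow_vmcs field that L1 wrote since the last reload */
   #define NESTED_FIELD_GUEST_RIP 0
   #define NESTED_FIELD_GUEST_RSP 1
   /* ... */
   DECLARE_BITMAP(dirty_fields, 64);

   static void nested_reload_dirty(struct shadow_vmcs *src,
                                   unsigned long *dirty)
   {
           if (test_and_clear_bit(NESTED_FIELD_GUEST_RIP, dirty))
                   vmcs_writel(GUEST_RIP, src->guest_rip);
           if (test_and_clear_bit(NESTED_FIELD_GUEST_RSP, dirty))
                   vmcs_writel(GUEST_RSP, src->guest_rsp);
           /* ... remaining fields follow the same pattern ... */
   }

The vmwrite handler would then mark the matching bit with __set_bit().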
>
> >   struct level_state *create_state(void)
> >   {
> >      struct level_state *state = NULL;
> > @@ -5176,6 +5615,685 @@ int create_l2_state(struct kvm_vcpu *vcpu)
> >
> >      return 0;
> >   }
> > +int prepare_vmcs_02(struct kvm_vcpu *vcpu)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   struct shadow_vmcs *src = vmx->nested.l2_state->shadow_vmcs;
> > +   u32 exec_control;
> > +
> > +   if (!src) {
> > +      printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
> > +      return 1;
> > +   }
> > +
> > +   load_vmcs_common(src);
> > +
> > +   if (cpu_has_vmx_vpid() && vmx->nested.l2_state->vpid != 0)
> > +      vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.l2_state->vpid);
> > +
> > +   if (vmx->nested.l2_state->io_bitmap_a)
> > +      vmcs_write64(IO_BITMAP_A, vmx->nested.l2_state->io_bitmap_a);
> > +
> > +   if (vmx->nested.l2_state->io_bitmap_b)
> > +      vmcs_write64(IO_BITMAP_B, vmx->nested.l2_state->io_bitmap_b);
> > +
> > +   if (vmx->nested.l2_state->msr_bitmap)
> > +      vmcs_write64(MSR_BITMAP, vmx->nested.l2_state->msr_bitmap);
> >
>
> Don't we need to combine the host and guest msr bitmaps and I/O
> bitmaps?  If the host doesn't allow an msr or I/O access to the guest,
> it shouldn't allow it to nested guests.
Yes, we haven't implemented that yet.
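When we do, the merge itself should be simple; a sketch (assuming both levels
use the usual one-page bitmaps and that a set bit means intercept):

   static void nested_merge_bitmaps(unsigned long *dst,
                                    const unsigned long *l0_bitmap,
                                    const unsigned long *l1_bitmap)
   {
           int i;

           /* L2 gets direct access only if both L0 and L1 allow it */
           for (i = 0; i < PAGE_SIZE / sizeof(unsigned long); i++)
                   dst[i] = l0_bitmap[i] | l1_bitmap[i];
   }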
>
> > +
> > +   if (src->vm_entry_msr_load_count > 0) {
> > +      struct page *page;
> > +
> > +      page = nested_get_page(vcpu,
> > +                   src->vm_entry_msr_load_addr);
> > +      if (!page)
> > +         return 1;
> > +
> > +      vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
> >
>
> Luckily we don't use the msr autoload stuff.  If we did we'd have to
> merge it too.  But we have to emulate those loads (via vmx_set_msr);
> the guest can easily load bad msrs which would kill the host.
Ok.
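A sketch of what the emulation could look like (the vmx_msr_entry layout is
the index/reserved/value format from the SDM; failing the nested entry on
error is an assumption):

   struct vmx_msr_entry {
           u32 index;
           u32 reserved;
           u64 value;
   };

   static int nested_load_guest_msrs(struct kvm_vcpu *vcpu,
                                     struct vmx_msr_entry *e, u32 count)
   {
           u32 i;

           for (i = 0; i < count; i++) {
                   /* vmx_set_msr() applies the same checks as a guest WRMSR,
                    * so a bad MSR in the list cannot harm the host */
                   if (vmx_set_msr(vcpu, e[i].index, e[i].value))
                           return 1;
           }
           return 0;
   }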
>
> > +   if (src->virtual_apic_page_addr != 0) {
> > +      struct page *page;
> > +
> > +      page = nested_get_page(vcpu,
> > +                   src->virtual_apic_page_addr);
> > +      if (!page)
> > +         return 1;
> > +
> > +      vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
> > +
> > +      kvm_release_page_clean(page);
> > +   }  else {
> > +      vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
> > +              src->virtual_apic_page_addr);
> > +   }
> >
>
> Don't understand the special zero value.
I will look into it.
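If the zero check turns out to be unnecessary, the simpler form would always
translate L1's guest-physical address (a sketch, reusing the helpers already
in this patch):

   struct page *page = nested_get_page(vcpu, src->virtual_apic_page_addr);

   if (!page)
           return 1;
   vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(page));
   kvm_release_page_clean(page);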
>
> > +
> > +   vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
> > +           (vmx->nested.l1_state->shadow_vmcs->pin_based_vm_exec_control |
> > +            src->pin_based_vm_exec_control));
> > +
> > +   exec_control = vmx->nested.l1_state->shadow_vmcs->cpu_based_vm_exec_control;
> > +
> > +   exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> > +
> > +   exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> > +
> > +   exec_control &= ~CPU_BASED_TPR_SHADOW;
> >
>
> Why?
We always use the values from VMCS12 for those controls.
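Stated as code, the intent is roughly this (a sketch; it assumes the VMCS12
bits are OR'ed back in right after the lines quoted above):

   /* interrupt/NMI-window and TPR-shadow policy is taken from VMCS12 only */
   exec_control &= ~(CPU_BASED_VIRTUAL_INTR_PENDING |
                     CPU_BASED_VIRTUAL_NMI_PENDING |
                     CPU_BASED_TPR_SHADOW);
   exec_control |= src->cpu_based_vm_exec_control &
                   (CPU_BASED_VIRTUAL_INTR_PENDING |
                    CPU_BASED_VIRTUAL_NMI_PENDING |
                    CPU_BASED_TPR_SHADOW);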
>
> > +   if (enable_vpid) {
> > +      if (vmx->nested.l2_state->vpid == 0) {
> > +         allocate_vpid(vmx);
> > +         vmx->nested.l2_state->vpid = vmx->vpid;
> >
>
> What if the guest has a nonzero vpid?
>
> > +static int nested_vmx_vmexit(struct kvm_vcpu *vcpu,
> > +              bool is_interrupt)
> > +{
> > +   struct vcpu_vmx *vmx = to_vmx(vcpu);
> > +   int initial_pfu_active = vcpu->fpu_active;
> > +
> > +   if (!vmx->nested.nested_mode) {
> > +      printk(KERN_INFO "WARNING: %s called but not in nested mode\n",
> > +             __func__);
> > +      return 0;
> > +   }
> > +
> > +   save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
> > +
> > +   sync_cached_regs_to_vmcs(vcpu);
> > +
> > +   prepare_vmcs_12(vcpu);
> > +   if (is_interrupt)
> > +      vmx->nested.l2_state->shadow_vmcs->vm_exit_reason =
> > +         EXIT_REASON_EXTERNAL_INTERRUPT;
> >
>
> Need to auto-ack the interrupt if requested by the guest.
is_interrupt means L1 has a pending interrupt; KVM's regular code will handle it.
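If we do end up needing the auto-ack, something like this could go into
nested_vmx_vmexit() (a sketch; it assumes shadow_vmcs also mirrors the
VM-exit controls, and VM_EXIT_ACK_INTR_ON_EXIT stands for the SDM's
acknowledge-interrupt-on-exit bit, which is not in our headers yet):

   if (is_interrupt &&
       (vmx->nested.l2_state->shadow_vmcs->vm_exit_controls &
        VM_EXIT_ACK_INTR_ON_EXIT)) {
           /* ack the interrupt now and report its vector to L1 */
           int vector = kvm_cpu_get_interrupt(vcpu);

           vmx->nested.l2_state->shadow_vmcs->vm_exit_intr_info =
                   vector | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK;
   }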
>
>
>
> --
> I have a truly marvellous patch that fixes the bug which this
> signature is too narrow to contain.
>

