Jim Mattson <jmattson@xxxxxxxxxx> writes:

> My replies are inline.
>
> On Fri, Jul 15, 2016 at 5:50 PM, Bandan Das <bsd@xxxxxxxxxx> wrote:
>>
>> Hi Jim,
>>
>> Some comments below.
>>
>> Jim Mattson <jmattson@xxxxxxxxxx> writes:
>>
>>> If a shadow VMCS is referenced by the VMCS link pointer in the
>>> current VMCS, then VM-entry makes the shadow VMCS active on the
>>> current processor. No VMCS should ever be active on more than one
>>> processor. If a VMCS is to be migrated from one processor to
>>> another, software should execute a VMCLEAR for the VMCS on the
>>> first processor before the VMCS is made active on the second
>>> processor.
>>>
>>> We already keep track of ordinary VMCSs that are made active by
>>> VMPTRLD. Extend that tracking to handle shadow VMCSs that are
>>> made active by VM-entry to their parents.
>>>
>>> Signed-off-by: Jim Mattson <jmattson@xxxxxxxxxx>
>>>
>>>  arch/x86/kvm/vmx.c | 73 +++++++++++++++++++++++++++++++++++++++---------------
>>>  1 file changed, 53 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index 64a79f2..dd38521 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -398,7 +398,7 @@ struct nested_vmx {
>>>  	/* The host-usable pointer to the above */
>>>  	struct page *current_vmcs12_page;
>>>  	struct vmcs12 *current_vmcs12;
>>> -	struct vmcs *current_shadow_vmcs;
>>> +	struct loaded_vmcs current_shadow_vmcs;
>>>  	/*
>>>  	 * Indicates if the shadow vmcs must be updated with the
>>>  	 * data hold by vmcs12
>>> @@ -2113,6 +2113,15 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
>>>  			new.control) != old.control);
>>>  }
>>>
>>> +static void record_loaded_vmcs(struct loaded_vmcs *loaded_vmcs, int cpu)
>>> +{
>>> +	if (loaded_vmcs->vmcs) {
>>> +		list_add(&loaded_vmcs->loaded_vmcss_on_cpu_link,
>>> +			 &per_cpu(loaded_vmcss_on_cpu, cpu));
>>> +		loaded_vmcs->cpu = cpu;
>>> +	}
>>> +}
>>> +
>>>  /*
>>>   * Switches to specified vcpu, until a matching vcpu_put(), but assumes
>>>   * vcpu mutex is already taken.
>>> @@ -2124,8 +2133,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>
>>>  	if (!vmm_exclusive)
>>>  		kvm_cpu_vmxon(phys_addr);
>>> -	else if (vmx->loaded_vmcs->cpu != cpu)
>>> +	else if (vmx->loaded_vmcs->cpu != cpu) {
>>>  		loaded_vmcs_clear(vmx->loaded_vmcs);
>>> +		if (vmx->nested.current_shadow_vmcs.vmcs)
>>> +			loaded_vmcs_clear(&vmx->nested.current_shadow_vmcs);
>>> +	}
>>>
>>>  	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
>>>  		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
>>> @@ -2147,8 +2159,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>  	 */
>>>  	smp_rmb();
>>>
>>> -	list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
>>> -		 &per_cpu(loaded_vmcss_on_cpu, cpu));
>>> +	record_loaded_vmcs(vmx->loaded_vmcs, cpu);
>>> +	record_loaded_vmcs(&vmx->nested.current_shadow_vmcs, cpu);
>>>  	crash_enable_local_vmclear(cpu);
>>>  	local_irq_enable();
>>>
>>> @@ -2161,8 +2173,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>
>>>  	rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
>>>  	vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
>>> -
>>> -	vmx->loaded_vmcs->cpu = cpu;
>>>  }
>>
>> Is the order of setting loaded_vmcs->cpu important here? Your patch
>> changed it, though I can't think of anything wrong with that.
>>
>
> I can't find any references to loaded_vmcs->cpu between the new
> assignment and the original assignment. I believe this change is
> okay.
>
> However, on a related note, I have preserved the existing ordering of
> VMPTRLD followed by adding the VMCS to the loaded_vmcss_on_cpu list.
> Is this ordering safe in the event of a KEXEC?

So there might be a window where kexec doesn't clear the recently
loaded VMCS, since it is not yet on the list? That looks like an issue
to me, but I have cc-ed Paolo to comment/confirm.
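For illustration only, closing that window would mean publishing the
VMCS on the per-cpu list before the VMPTRLD, along these lines (an
untested sketch reusing the patch's record_loaded_vmcs() helper; this
is not what the patch currently does):

	/*
	 * Publish the VMCS on loaded_vmcss_on_cpu first, so that a
	 * crash/kexec VMCLEAR sweep of the list cannot miss it. A
	 * VMCLEAR of a VMCS that is not yet active is harmless.
	 */
	local_irq_disable();
	crash_disable_local_vmclear(cpu);
	record_loaded_vmcs(vmx->loaded_vmcs, cpu);
	crash_enable_local_vmclear(cpu);
	local_irq_enable();

	/* Only now make the VMCS active on this CPU. */
	per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
	vmcs_load(vmx->loaded_vmcs->vmcs);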
>>>  	/* Setup TSC multiplier */
>>> @@ -6812,6 +6822,34 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
>>>  	return 0;
>>>  }
>>>
>>> +static int setup_shadow_vmcs(struct vcpu_vmx *vmx)
>>> +{
>>> +	struct vmcs *shadow_vmcs;
>>> +	int cpu;
>>> +
>>> +	shadow_vmcs = alloc_vmcs();
>>> +	if (!shadow_vmcs)
>>> +		return -ENOMEM;
>>> +
>>> +	/* mark vmcs as shadow */
>>> +	shadow_vmcs->revision_id |= (1u << 31);
>>> +	/* init shadow vmcs */
>>> +	vmx->nested.current_shadow_vmcs.vmcs = shadow_vmcs;
>>> +	loaded_vmcs_init(&vmx->nested.current_shadow_vmcs);
>>> +
>>> +	cpu = get_cpu();
>>> +	local_irq_disable();
>>> +	crash_disable_local_vmclear(cpu);
>>> +
>>> +	record_loaded_vmcs(&vmx->nested.current_shadow_vmcs, cpu);
>>> +
>>> +	crash_enable_local_vmclear(cpu);
>>> +	local_irq_enable();
>>> +	put_cpu();
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>  /*
>>>   * Emulate the VMXON instruction.
>>>   * Currently, we just remember that VMX is active, and do not save or even
>>> @@ -6824,7 +6862,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>>  {
>>>  	struct kvm_segment cs;
>>>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>>> -	struct vmcs *shadow_vmcs;
>>>  	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
>>>  		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
>>>
>>> @@ -6867,14 +6904,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>>  	}
>>>
>>>  	if (enable_shadow_vmcs) {
>>> -		shadow_vmcs = alloc_vmcs();
>>> -		if (!shadow_vmcs)
>>> -			return -ENOMEM;
>>> -		/* mark vmcs as shadow */
>>> -		shadow_vmcs->revision_id |= (1u << 31);
>>> -		/* init shadow vmcs */
>>> -		vmcs_clear(shadow_vmcs);
>>> -		vmx->nested.current_shadow_vmcs = shadow_vmcs;
>>> +		int ret = setup_shadow_vmcs(vmx);
>>> +		if (ret)
>>> +			return ret;
>>
>> Nit:
>> 	if (setup_shadow_vmcs(vmx))
>> 		return -ENOMEM;
>>
>> Also, isn't it better to actually add the shadow vmcs to the list
>> in handle_vmptrld rather than in handle_vmon? That is where shadowing
>> is actually enabled and the link pointer is set.
>>
>
> Are you suggesting that the invariant should be that the
> shadow VMCS is on the loaded_vmcss_on_cpu list iff it is
> referenced by the link pointer in the parent? Maintaining that
> invariant will require some extra work. The shadow VMCS will have
> to be removed from the loaded_vmcss_on_cpu list in
> nested_release_vmcs12, and vmx_vcpu_load will have to check
> vmx->nested.current_vmptr before the loaded_vmcss_on_cpu
> operations involving the shadow VMCS.

Hmm... It just seemed like managing the shadow VMCS is more of a
vmptrld/vmclear operation, but that also means removing/adding VMCSs
every time VMCLEAR is called, and IIRC the current code does a good
job of reusing VMCSs. So, I think this is fine.
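For completeness, the extra work would look something like the
following (a rough, untested sketch; the current_vmptr check is my
guess at the right condition):

	/*
	 * In nested_release_vmcs12(): take the shadow VMCS off the
	 * per-cpu list once the parent vmcs12 goes away.
	 * loaded_vmcs_clear() already does the VMCLEAR and list_del.
	 */
	if (enable_shadow_vmcs)
		loaded_vmcs_clear(&vmx->nested.current_shadow_vmcs);

	/*
	 * In vmx_vcpu_load(): only track the shadow VMCS while a
	 * vmcs12 is current, i.e. while the link pointer can
	 * reference it.
	 */
	if (vmx->nested.current_vmptr != -1ull)
		record_loaded_vmcs(&vmx->nested.current_shadow_vmcs, cpu);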
Bandan

>>>  	}
>>>
>>>  	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
>>> @@ -6959,7 +6991,7 @@ static void free_nested(struct vcpu_vmx *vmx)
>>>  		free_vpid(vmx->nested.vpid02);
>>>  	nested_release_vmcs12(vmx);
>>>  	if (enable_shadow_vmcs)
>>> -		free_vmcs(vmx->nested.current_shadow_vmcs);
>>> +		free_loaded_vmcs(&vmx->nested.current_shadow_vmcs);
>>>  	/* Unpin physical memory we referred to in current vmcs02 */
>>>  	if (vmx->nested.apic_access_page) {
>>>  		nested_release_page(vmx->nested.apic_access_page);
>>> @@ -7135,7 +7167,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
>>>  	int i;
>>>  	unsigned long field;
>>>  	u64 field_value;
>>> -	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
>>> +	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs.vmcs;
>>>  	const unsigned long *fields = shadow_read_write_fields;
>>>  	const int num_fields = max_shadow_read_write_fields;
>>>
>>> @@ -7184,7 +7216,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
>>>  	int i, q;
>>>  	unsigned long field;
>>>  	u64 field_value = 0;
>>> -	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
>>> +	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs.vmcs;
>>>
>>>  	vmcs_load(shadow_vmcs);
>>>
>>> @@ -7364,10 +7396,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
>>>  		vmx->nested.current_vmcs12 = new_vmcs12;
>>>  		vmx->nested.current_vmcs12_page = page;
>>>  		if (enable_shadow_vmcs) {
>>> +			struct vmcs *shadow_vmcs;
>>> +			shadow_vmcs = vmx->nested.current_shadow_vmcs.vmcs;
>>
>> Nit:
>> 	struct vmcs *shadow_vmcs =
>> 		vmx->nested.current_shadow_vmcs.vmcs;
>>
>> Bandan
>>
>>>  			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
>>>  				      SECONDARY_EXEC_SHADOW_VMCS);
>>> -			vmcs_write64(VMCS_LINK_POINTER,
>>> -				     __pa(vmx->nested.current_shadow_vmcs));
>>> +			vmcs_write64(VMCS_LINK_POINTER, __pa(shadow_vmcs));
>>>  			vmx->nested.sync_shadow_vmcs = true;
>>>  		}
>>>  	}