Jim Mattson <jmattson@xxxxxxxxxx> writes:

> My replies are inline.
>
> On Fri, Jul 15, 2016 at 5:50 PM, Bandan Das <bsd@xxxxxxxxxx> wrote:
>>
>> Hi Jim,
>>
>> Some comments below.
>>
>> Jim Mattson <jmattson@xxxxxxxxxx> writes:
>>
>>> If a shadow VMCS is referenced by the VMCS link pointer in the
>>> current VMCS, then VM-entry makes the shadow VMCS active on the
>>> current processor. No VMCS should ever be active on more than one
>>> processor. If a VMCS is to be migrated from one processor to
>>> another, software should execute a VMCLEAR for the VMCS on the
>>> first processor before the VMCS is made active on the second
>>> processor.
>>>
>>> We already keep track of ordinary VMCSs that are made active by
>>> VMPTRLD. Extend that tracking to handle shadow VMCSs that are
>>> made active by VM-entry to their parents.
>>>
>>> Signed-off-by: Jim Mattson <jmattson@xxxxxxxxxx>
>>>
>>>  arch/x86/kvm/vmx.c | 73 +++++++++++++++++++++++++++++++++++++++---------------
>>>  1 file changed, 53 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index 64a79f2..dd38521 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -398,7 +398,7 @@ struct nested_vmx {
>>>  	/* The host-usable pointer to the above */
>>>  	struct page *current_vmcs12_page;
>>>  	struct vmcs12 *current_vmcs12;
>>> -	struct vmcs *current_shadow_vmcs;
>>> +	struct loaded_vmcs current_shadow_vmcs;
>>>  	/*
>>>  	 * Indicates if the shadow vmcs must be updated with the
>>>  	 * data hold by vmcs12
>>> @@ -2113,6 +2113,15 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
>>>  			new.control) != old.control);
>>>  }
>>>
>>> +static void record_loaded_vmcs(struct loaded_vmcs *loaded_vmcs, int cpu)
>>> +{
>>> +	if (loaded_vmcs->vmcs) {
>>> +		list_add(&loaded_vmcs->loaded_vmcss_on_cpu_link,
>>> +			 &per_cpu(loaded_vmcss_on_cpu, cpu));
>>> +		loaded_vmcs->cpu = cpu;
>>> +	}
>>> +}
>>> +
>>>  /*
>>>   * Switches to specified vcpu, until a matching vcpu_put(), but assumes
>>>   * vcpu mutex is already taken.
>>> @@ -2124,8 +2133,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>
>>>  	if (!vmm_exclusive)
>>>  		kvm_cpu_vmxon(phys_addr);
>>> -	else if (vmx->loaded_vmcs->cpu != cpu)
>>> +	else if (vmx->loaded_vmcs->cpu != cpu) {
>>>  		loaded_vmcs_clear(vmx->loaded_vmcs);
>>> +		if (vmx->nested.current_shadow_vmcs.vmcs)
>>> +			loaded_vmcs_clear(&vmx->nested.current_shadow_vmcs);
>>> +	}
>>>
>>>  	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
>>>  		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
>>> @@ -2147,8 +2159,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>  	 */
>>>  	smp_rmb();
>>>
>>> -	list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
>>> -		 &per_cpu(loaded_vmcss_on_cpu, cpu));
>>> +	record_loaded_vmcs(vmx->loaded_vmcs, cpu);
>>> +	record_loaded_vmcs(&vmx->nested.current_shadow_vmcs, cpu);
>>>  	crash_enable_local_vmclear(cpu);
>>>  	local_irq_enable();
>>>
>>> @@ -2161,8 +2173,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>>>
>>>  	rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
>>>  	vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
>>> -
>>> -	vmx->loaded_vmcs->cpu = cpu;
>>>  }
>>
>> Is the order of setting loaded_vmcs->cpu important here? Your patch
>> changed it, though I can't think of anything wrong with that.
>>
>
> I can't find any references to loaded_vmcs->cpu between the new
> assignment and the original assignment. I believe this change is
> okay.
>
> However, on a related note, I have preserved the existing ordering of
> VMPTRLD followed by adding the VMCS to the loaded_vmcss_on_cpu list.
> Is this ordering safe in the event of a KEXEC?

So there might be a window where kexec doesn't clear the recently
loaded VMCS, since it is not yet on the list? That looks like an issue
to me, but I have cc-ed Paolo to comment/confirm.
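For illustration only, closing that window would mean publishing the
VMCS on the per-cpu list before the VMPTRLD, along these lines (an
untested sketch reusing the patch's record_loaded_vmcs() helper; this
is not what the patch currently does):

	/*
	 * Publish the VMCS on loaded_vmcss_on_cpu first, so that a
	 * crash/kexec VMCLEAR sweep of the list cannot miss it. A
	 * VMCLEAR of a VMCS that is not yet active is harmless.
	 */
	local_irq_disable();
	crash_disable_local_vmclear(cpu);
	record_loaded_vmcs(vmx->loaded_vmcs, cpu);
	crash_enable_local_vmclear(cpu);
	local_irq_enable();

	/* Only now make the VMCS active on this CPU. */
	per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
	vmcs_load(vmx->loaded_vmcs->vmcs);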
>>>  	/* Setup TSC multiplier */
>>> @@ -6812,6 +6822,34 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
>>>  	return 0;
>>>  }
>>>
>>> +static int setup_shadow_vmcs(struct vcpu_vmx *vmx)
>>> +{
>>> +	struct vmcs *shadow_vmcs;
>>> +	int cpu;
>>> +
>>> +	shadow_vmcs = alloc_vmcs();
>>> +	if (!shadow_vmcs)
>>> +		return -ENOMEM;
>>> +
>>> +	/* mark vmcs as shadow */
>>> +	shadow_vmcs->revision_id |= (1u << 31);
>>> +	/* init shadow vmcs */
>>> +	vmx->nested.current_shadow_vmcs.vmcs = shadow_vmcs;
>>> +	loaded_vmcs_init(&vmx->nested.current_shadow_vmcs);
>>> +
>>> +	cpu = get_cpu();
>>> +	local_irq_disable();
>>> +	crash_disable_local_vmclear(cpu);
>>> +
>>> +	record_loaded_vmcs(&vmx->nested.current_shadow_vmcs, cpu);
>>> +
>>> +	crash_enable_local_vmclear(cpu);
>>> +	local_irq_enable();
>>> +	put_cpu();
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>  /*
>>>   * Emulate the VMXON instruction.
>>>   * Currently, we just remember that VMX is active, and do not save or even
>>> @@ -6824,7 +6862,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>>  {
>>>  	struct kvm_segment cs;
>>>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>>> -	struct vmcs *shadow_vmcs;
>>>  	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
>>>  		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
>>>
>>> @@ -6867,14 +6904,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>>>  	}
>>>
>>>  	if (enable_shadow_vmcs) {
>>> -		shadow_vmcs = alloc_vmcs();
>>> -		if (!shadow_vmcs)
>>> -			return -ENOMEM;
>>> -		/* mark vmcs as shadow */
>>> -		shadow_vmcs->revision_id |= (1u << 31);
>>> -		/* init shadow vmcs */
>>> -		vmcs_clear(shadow_vmcs);
>>> -		vmx->nested.current_shadow_vmcs = shadow_vmcs;
>>> +		int ret = setup_shadow_vmcs(vmx);
>>> +		if (ret)
>>> +			return ret;
>>
>> Nit:
>> 	if (setup_shadow_vmcs(vmx))
>> 		return -ENOMEM;
>>
>> Also, isn't it better to actually add the shadow vmcs to the list
>> in handle_vmptrld rather than in handle_vmon? That is where shadowing
>> is actually enabled and the link pointer is set.
>>
>
> Are you suggesting that the invariant should be that the
> shadow VMCS is on the loaded_vmcss_on_cpu list iff it is
> referenced by the link pointer in the parent? Maintaining that
> invariant will require some extra work. The shadow VMCS will have
> to be removed from the loaded_vmcss_on_cpu list in
> nested_release_vmcs12, and vmx_vcpu_load will have to check
> vmx->nested.current_vmptr before the loaded_vmcss_on_cpu
> operations involving the shadow VMCS.

Hmm... It just seemed like managing the shadow VMCS is more of a
vmptrld/vmclear operation, but that also means removing/adding VMCSs
every time VMCLEAR is called, and IIRC the current code does a good
job of reusing VMCSs. So, I think this is fine.
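For completeness, the extra work would look something like the
following (a rough, untested sketch; the current_vmptr check is my
guess at the right condition):

	/*
	 * In nested_release_vmcs12(): take the shadow VMCS off the
	 * per-cpu list once the parent vmcs12 goes away.
	 * loaded_vmcs_clear() already does the VMCLEAR and list_del.
	 */
	if (enable_shadow_vmcs)
		loaded_vmcs_clear(&vmx->nested.current_shadow_vmcs);

	/*
	 * In vmx_vcpu_load(): only track the shadow VMCS while a
	 * vmcs12 is current, i.e. while the link pointer can
	 * reference it.
	 */
	if (vmx->nested.current_vmptr != -1ull)
		record_loaded_vmcs(&vmx->nested.current_shadow_vmcs, cpu);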
Bandan

>>>  	}
>>>
>>>  	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
>>> @@ -6959,7 +6991,7 @@ static void free_nested(struct vcpu_vmx *vmx)
>>>  		free_vpid(vmx->nested.vpid02);
>>>  	nested_release_vmcs12(vmx);
>>>  	if (enable_shadow_vmcs)
>>> -		free_vmcs(vmx->nested.current_shadow_vmcs);
>>> +		free_loaded_vmcs(&vmx->nested.current_shadow_vmcs);
>>>  	/* Unpin physical memory we referred to in current vmcs02 */
>>>  	if (vmx->nested.apic_access_page) {
>>>  		nested_release_page(vmx->nested.apic_access_page);
>>> @@ -7135,7 +7167,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
>>>  	int i;
>>>  	unsigned long field;
>>>  	u64 field_value;
>>> -	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
>>> +	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs.vmcs;
>>>  	const unsigned long *fields = shadow_read_write_fields;
>>>  	const int num_fields = max_shadow_read_write_fields;
>>>
>>> @@ -7184,7 +7216,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
>>>  	int i, q;
>>>  	unsigned long field;
>>>  	u64 field_value = 0;
>>> -	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
>>> +	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs.vmcs;
>>>
>>>  	vmcs_load(shadow_vmcs);
>>>
>>> @@ -7364,10 +7396,11 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
>>>  		vmx->nested.current_vmcs12 = new_vmcs12;
>>>  		vmx->nested.current_vmcs12_page = page;
>>>  		if (enable_shadow_vmcs) {
>>> +			struct vmcs *shadow_vmcs;
>>> +			shadow_vmcs = vmx->nested.current_shadow_vmcs.vmcs;
>>
>> Nit:
>> 	struct vmcs *shadow_vmcs =
>> 		vmx->nested.current_shadow_vmcs.vmcs;
>>
>> Bandan
>>
>>>  			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
>>>  				      SECONDARY_EXEC_SHADOW_VMCS);
>>> -			vmcs_write64(VMCS_LINK_POINTER,
>>> -				     __pa(vmx->nested.current_shadow_vmcs));
>>> +			vmcs_write64(VMCS_LINK_POINTER, __pa(shadow_vmcs));
>>>  			vmx->nested.sync_shadow_vmcs = true;
>>>  		}
>>>  	}