Re: [PATCH v4 3/3] KVM: nVMX: Emulate EPTP switching for the L1 hypervisor

Jim Mattson <jmattson@xxxxxxxxxx> · Tue, 11 Jul 2017 11:22:37 -0700

On Tue, Jul 11, 2017 at 10:58 AM, Bandan Das <bsd@xxxxxxxxxx> wrote:
> David Hildenbrand <david@xxxxxxxxxx> writes:
>
>> On 10.07.2017 22:49, Bandan Das wrote:
>>> When L2 uses vmfunc, L0 utilizes the associated vmexit to
>>> emulate a switching of the ept pointer by reloading the
>>> guest MMU.
>>>
>>> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
>>> Signed-off-by: Bandan Das <bsd@xxxxxxxxxx>
>>> ---
>>>  arch/x86/include/asm/vmx.h |  6 +++++
>>>  arch/x86/kvm/vmx.c         | 58 +++++++++++++++++++++++++++++++++++++++++++---
>>>  2 files changed, 61 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>>> index da5375e..5f63a2e 100644
>>> --- a/arch/x86/include/asm/vmx.h
>>> +++ b/arch/x86/include/asm/vmx.h
>>> @@ -115,6 +115,10 @@
>>>  #define VMX_MISC_SAVE_EFER_LMA                      0x00000020
>>>  #define VMX_MISC_ACTIVITY_HLT                       0x00000040
>>>
>>> +/* VMFUNC functions */
>>> +#define VMX_VMFUNC_EPTP_SWITCHING               0x00000001
>>> +#define VMFUNC_EPTP_ENTRIES  512
>>> +
>>>  static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
>>>  {
>>>      return vmx_basic & GENMASK_ULL(30, 0);
>>> @@ -200,6 +204,8 @@ enum vmcs_field {
>>>      EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
>>>      EOI_EXIT_BITMAP3                = 0x00002022,
>>>      EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
>>> +    EPTP_LIST_ADDRESS               = 0x00002024,
>>> +    EPTP_LIST_ADDRESS_HIGH          = 0x00002025,
>>>      VMREAD_BITMAP                   = 0x00002026,
>>>      VMWRITE_BITMAP                  = 0x00002028,
>>>      XSS_EXIT_BITMAP                 = 0x0000202C,
>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>>> index fe8f5fc..0a969fb 100644
>>> --- a/arch/x86/kvm/vmx.c
>>> +++ b/arch/x86/kvm/vmx.c
>>> @@ -246,6 +246,7 @@ struct __packed vmcs12 {
>>>      u64 eoi_exit_bitmap1;
>>>      u64 eoi_exit_bitmap2;
>>>      u64 eoi_exit_bitmap3;
>>> +    u64 eptp_list_address;
>>>      u64 xss_exit_bitmap;
>>>      u64 guest_physical_address;
>>>      u64 vmcs_link_pointer;
>>> @@ -771,6 +772,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
>>>      FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
>>>      FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
>>>      FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
>>> +    FIELD64(EPTP_LIST_ADDRESS, eptp_list_address),
>>>      FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
>>>      FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
>>>      FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
>>> @@ -1402,6 +1404,13 @@ static inline bool nested_cpu_has_vmfunc(struct vmcs12 *vmcs12)
>>>      return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VMFUNC);
>>>  }
>>>
>>> +static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
>>> +{
>>> +    return nested_cpu_has_vmfunc(vmcs12) &&
>>> +            (vmcs12->vm_function_control &
>>
>> I wonder if it makes sense to rename vm_function_control to
>> - vmfunc_control
>> - vmfunc_controls (so it matches nested_vmx_vmfunc_controls)
>> - vmfunc_ctrl
>
> I tend to follow the SDM names because it's easy to look for them.
>
>>> +             VMX_VMFUNC_EPTP_SWITCHING);
>>> +}
>>> +
>>>  static inline bool is_nmi(u32 intr_info)
>>>  {
>>>      return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
>>> @@ -2791,7 +2800,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
>>>      if (cpu_has_vmx_vmfunc()) {
>>>              vmx->nested.nested_vmx_secondary_ctls_high |=
>>>                      SECONDARY_EXEC_ENABLE_VMFUNC;
>>> -            vmx->nested.nested_vmx_vmfunc_controls = 0;
>>> +            /*
>>> +             * Advertise EPTP switching unconditionally
>>> +             * since we emulate it
>>> +             */
>>> +            vmx->nested.nested_vmx_vmfunc_controls =
>>> +                    VMX_VMFUNC_EPTP_SWITCHING;>     }
>>>
>>>      /*
>>> @@ -7772,6 +7786,9 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>>      struct vcpu_vmx *vmx = to_vmx(vcpu);
>>>      struct vmcs12 *vmcs12;
>>>      u32 function = vcpu->arch.regs[VCPU_REGS_RAX];
>>> +    u32 index = vcpu->arch.regs[VCPU_REGS_RCX];
>>> +    struct page *page = NULL;
>>> +    u64 *l1_eptp_list, address;
>>>
>>>      /*
>>>       * VMFUNC is only supported for nested guests, but we always enable the
>>> @@ -7784,11 +7801,46 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
>>>      }
>>>
>>>      vmcs12 = get_vmcs12(vcpu);
>>> -    if ((vmcs12->vm_function_control & (1 << function)) == 0)
>>> +    if (((vmcs12->vm_function_control & (1 << function)) == 0) ||
>>> +        WARN_ON_ONCE(function))
>>
>> "... instruction causes a VM exit if the bit at position EAX is 0 in the
>> VM-function controls (the selected VM function is
>> not enabled)."
>>
>> So g2 can trigger this WARN_ON_ONCE, no? I think we should drop it then
>> completely.
>
> It's a good hint to see if L2 misbehaved and WARN_ON_ONCE makes sure it's
> not misused.
>
>>> +            goto fail;
>>> +
>>> +    if (!nested_cpu_has_ept(vmcs12) ||
>>> +        !nested_cpu_has_eptp_switching(vmcs12))
>>> +            goto fail;
>>> +
>>> +    if (!vmcs12->eptp_list_address || index >= VMFUNC_EPTP_ENTRIES)
>>> +            goto fail;
>>
>> I can find the definition for an vmexit in case of index >=
>> VMFUNC_EPTP_ENTRIES, but not for !vmcs12->eptp_list_address in the SDM.
>>
>> Can you give me a hint?
>
> I don't think there is. Since, we are basically emulating eptp switching
> for L2, this is a good check to have.

There is nothing wrong with a hypervisor using physical page 0 for
whatever purpose it likes, including an EPTP list.

>>> +
>>> +    page = nested_get_page(vcpu, vmcs12->eptp_list_address);
>>> +    if (!page)
>>>              goto fail;
>>> -    WARN_ONCE(1, "VMCS12 VM function control should have been zero");
>>> +
>>> +    l1_eptp_list = kmap(page);
>>> +    address = l1_eptp_list[index];
>>> +    if (!address)
>>> +            goto fail;
>>
>> Can you move that check to the other address checks below? (or rework if
>> this make sense, see below)
>>
>>> +    /*
>>> +     * If the (L2) guest does a vmfunc to the currently
>>> +     * active ept pointer, we don't have to do anything else
>>> +     */
>>> +    if (vmcs12->ept_pointer != address) {
>>> +            if (address >> cpuid_maxphyaddr(vcpu) ||
>>> +                !IS_ALIGNED(address, 4096))
>>
>> Couldn't the pfn still be invalid and make kvm_mmu_reload() fail?
>> (triggering a KVM_REQ_TRIPLE_FAULT)
>
> If there's a triple fault, I think it's a good idea to inject it
> back. Basically, there's no need to take care of damage control
> that L1 is intentionally doing.
>
>>> +                    goto fail;
>>> +            kvm_mmu_unload(vcpu);
>>> +            vmcs12->ept_pointer = address;
>>> +            kvm_mmu_reload(vcpu);
>>
>> I was thinking about something like this:
>>
>> kvm_mmu_unload(vcpu);
>> old = vmcs12->ept_pointer;
>> vmcs12->ept_pointer = address;
>> if (kvm_mmu_reload(vcpu)) {
>>       /* pointer invalid, restore previous state */
>>       kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
>>       vmcs12->ept_pointer = old;
>>       kvm_mmu_reload(vcpu);
>>       goto fail;
>> }
>>
>> The you can inherit the checks from mmu_check_root().
>>
>>
>> Wonder why I can't spot checks for cpuid_maxphyaddr() /
>> IS_ALIGNED(address, 4096) for ordinary use of vmcs12->ept_pointer. The
>> checks should be identical.
>
> I think the reason is vmcs12->ept_pointer is never used directly. It's
> used to create a shadow table but nevertheless, the check should be there.
>
>>
>>> +            kunmap(page);
>>> +            nested_release_page_clean(page);
>>
>> shouldn't the kunmap + nested_release_page_clean go outside the if clause?
>
> :) Indeed, thanks for the catch.
>
> Bandan
>
>>> +    }
>>> +    return kvm_skip_emulated_instruction(vcpu);
>>>
>>>  fail:
>>> +    if (page) {
>>> +            kunmap(page);
>>> +            nested_release_page_clean(page);
>>> +    }
>>>      nested_vmx_vmexit(vcpu, vmx->exit_reason,
>>>                        vmcs_read32(VM_EXIT_INTR_INFO),
>>>                        vmcs_readl(EXIT_QUALIFICATION));
>>>
>>
>> David and mmu code are not yet best friends. So sorry if I am missing
>> something.