Re: [PATCH v4] KVM: nVMX: Fully support of nested VMX preemption timer

Arthur Chunqi Li <yzt356@xxxxxxxxx> · Mon, 16 Sep 2013 13:42:04 +0800



On Sat, Sep 14, 2013 at 1:15 AM, Paolo Bonzini <pbonzini@xxxxxxxxxx> wrote:
> Il 06/09/2013 04:04, Arthur Chunqi Li ha scritto:
>> This patch contains the following two changes:
>> 1. Fix the bug in nested preemption timer support. If vmexit L2->L0
>> with some reasons not emulated by L1, preemption timer value should
>> be save in such exits.
>> 2. Add support of "Save VMX-preemption timer value" VM-Exit controls
>> to nVMX.
>>
>> With this patch, nested VMX preemption timer features are fully
>> supported.
>>
>> Signed-off-by: Arthur Chunqi Li <yzt356@xxxxxxxxx>
>> ---
>> ChangeLog to v3:
>> Move nested_adjust_preemption_timer to the latest place just before vmenter.
>> Some minor changes.
>>
>>  arch/x86/include/uapi/asm/msr-index.h |    1 +
>>  arch/x86/kvm/vmx.c                    |   49 +++++++++++++++++++++++++++++++--
>>  2 files changed, 48 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
>> index bb04650..b93e09a 100644
>> --- a/arch/x86/include/uapi/asm/msr-index.h
>> +++ b/arch/x86/include/uapi/asm/msr-index.h
>> @@ -536,6 +536,7 @@
>>
>>  /* MSR_IA32_VMX_MISC bits */
>>  #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
>> +#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
>>  /* AMD-V MSRs */
>>
>>  #define MSR_VM_CR                       0xc0010114
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 1f1da43..f364d16 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -374,6 +374,8 @@ struct nested_vmx {
>>        */
>>       struct page *apic_access_page;
>>       u64 msr_ia32_feature_control;
>> +     /* Set if vmexit is L2->L1 */
>> +     bool nested_vmx_exit;
>>  };
>>
>>  #define POSTED_INTR_ON  0
>> @@ -2204,7 +2206,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
>>  #ifdef CONFIG_X86_64
>>               VM_EXIT_HOST_ADDR_SPACE_SIZE |
>>  #endif
>> -             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
>> +             VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
>> +             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> +     if (!(nested_vmx_pinbased_ctls_high &
>> +                             PIN_BASED_VMX_PREEMPTION_TIMER) ||
>> +                     !(nested_vmx_exit_ctls_high &
>> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
>
> Align this under the other "!".  Also, I prefer to have one long line
> for the whole "!(... & ...) ||" (and likewise below), but I don't know
> if Gleb agrees
>
>> +             nested_vmx_exit_ctls_high &=
>> +                     (~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
>
> Please remove parentheses around ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER, and
> likewise elsewhere in the patch.
>
>> +             nested_vmx_pinbased_ctls_high &=
>> +                     (~PIN_BASED_VMX_PREEMPTION_TIMER);
>> +     }
>>       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
>>                                     VM_EXIT_LOAD_IA32_EFER);
>>
>> @@ -6707,6 +6719,24 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>>       *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
>>  }
>>
>> +static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
>> +{
>> +     u64 delta_tsc_l1;
>> +     u32 preempt_val_l1, preempt_val_l2, preempt_scale;
>
> Should this exit immediately if the preemption timer pin-based control
> is disabled?
Hi Paolo,
How can I get pin-based control here from "struct kvm_vcpu *vcpu"?

Arthur
>
>> +     preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
>> +                     MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
>> +     preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> +     delta_tsc_l1 = kvm_x86_ops->read_l1_tsc(vcpu,
>> +                     native_read_tsc()) - vcpu->arch.last_guest_tsc;
>
> Please format this like:
>
>         delta_tsc_l1 =
>                 kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc())
>                 - vcpu->arch.last_guest_tsc;
>
>> +     preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
>> +     if (preempt_val_l2 <= preempt_val_l1)
>> +             preempt_val_l2 = 0;
>> +     else
>> +             preempt_val_l2 -= preempt_val_l1;
>> +     vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
>
> Did you test that a value of 0 triggers an immediate exit, rather than
> counting down by 2^32?  Perhaps it's safer to limit the value to 1
> instead of 0.
>
>> +}
>> +
>>  /*
>>   * The guest has exited.  See if we can fix it or if we need userspace
>>   * assistance.
>> @@ -6736,9 +6766,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>>               vmx->nested.nested_run_pending = 0;
>>
>>       if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
>> +             vmx->nested.nested_vmx_exit = true;
>
> I think this assignment should be in nested_vmx_vmexit, since it is
> called from other places as well.
>
>>               nested_vmx_vmexit(vcpu);
>>               return 1;
>>       }
>> +     vmx->nested.nested_vmx_exit = false;
>>
>>       if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
>>               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
>> @@ -7132,6 +7164,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>       debugctlmsr = get_debugctlmsr();
>>
>>       vmx->__launched = vmx->loaded_vmcs->launched;
>> +     if (is_guest_mode(vcpu) && !(vmx->nested.nested_vmx_exit))
>> +             nested_adjust_preemption_timer(vcpu);
>
> Please leave the assignment to __launched last, since it's already
> initializing the asm below.
>
> I don't like the is_guest_mode check here... Maybe it's
> micro-optimizing, but I wonder if we already do too many checks in
> vmx_vcpu_run...  For example, is_guest_mode could be changed (I think)
> to a check for "vmx->loaded_vmcs == &vmx->vmcs1".
>
> Alternatively, we could change nested_vmx_exit to an enum in struct
> vcpu_vmx (with values for L0->L1, L0->L2, L1->L2) that is initialized in
> vmx_handle_exit.  Then we could check directly for L0->L2 and not adjust
> the preemption timer in other cases.  In fact, I suspect this enum could
> replace HF_GUEST_MASK altogether.  However, this would require some
> other, more complicated, changes to svm.c.
>
> Gleb, what do you think?
>
> Paolo
>
>>       asm(
>>               /* Store host registers */
>>               "push %%" _ASM_DX "; push %%" _ASM_BP ";"
>> @@ -7518,6 +7552,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>  {
>>       struct vcpu_vmx *vmx = to_vmx(vcpu);
>>       u32 exec_control;
>> +     u32 exit_control;
>>
>>       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
>>       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
>> @@ -7691,7 +7726,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
>>        * bits are further modified by vmx_set_efer() below.
>>        */
>> -     vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
>> +     exit_control = vmcs_config.vmexit_ctrl;
>> +     if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
>> +             exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
>> +     vmcs_write32(VM_EXIT_CONTROLS, exit_control);
>>
>>       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
>>        * emulated by vmx_set_efer(), below.
>> @@ -8090,6 +8128,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>>       vmcs12->guest_pending_dbg_exceptions =
>>               vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
>>
>> +     if ((vmcs12->pin_based_vm_exec_control &
>> +                             PIN_BASED_VMX_PREEMPTION_TIMER) &&
>> +                     (vmcs12->vm_exit_controls &
>> +                             VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
>> +             vmcs12->vmx_preemption_timer_value =
>> +                     vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
>> +
>>       /*
>>        * In some cases (usually, nested EPT), L2 is allowed to change its
>>        * own CR3 without exiting. If it has changed it, we must keep it.
>>
>


-- 
Arthur Chunqi Li
Department of Computer Science
School of EECS
Peking University
Beijing, China
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html