Re: [PATCH v3 7/7] x86/kvm: use Enlightened VMCS when running on Hyper-V

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Paolo Bonzini <pbonzini@xxxxxxxxxx> writes:

> On 09/03/2018 15:02, Vitaly Kuznetsov wrote:
>> Enlightened VMCS is just a structure in memory, the main benefit
>> besides avoiding somewhat slower VMREAD/VMWRITE is using clean field
>> mask: we tell the underlying hypervisor which fields were modified
>> since VMEXIT so there's no need to inspect them all.
>> 
>> Tight CPUID loop test shows significant speedup:
>> Before: 18890 cycles
>> After: 8304 cycles
>> 
>> Static key is being used to avoid performance penalty for non-Hyper-V
>> deployments. Tests show we add around 3 (three) CPU cycles on each
>> VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID
>> loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run()
>> but I don't see a clean way to use static key in assembly.
>
> If you want to live dangerously, you can use text_poke_early to change
> the vmwrite to mov.  It's just a single instruction, so it's probably
> not too hard.
>

I'd say it's not worth it ...

>> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
>> ---
>> Changes since v2:
>> - define KVM_EVMCS_VERSION [Radim Krčmář]
>> - WARN_ONCE in get_evmcs_offset[,_cf] [Radim Krčmář]
>> - add evmcs_sanitize_exec_ctrls() and use it in hardware_setup() and
>>   dump_vmcs() [Radim Krčmář]
>> ---
>>  arch/x86/kvm/vmx.c | 625 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 615 insertions(+), 10 deletions(-)
>> 
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 051dab74e4e9..44b6efa7d54e 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -53,6 +53,7 @@
>>  #include <asm/mmu_context.h>
>>  #include <asm/microcode.h>
>>  #include <asm/nospec-branch.h>
>> +#include <asm/mshyperv.h>
>>  
>>  #include "trace.h"
>>  #include "pmu.h"
>> @@ -1000,6 +1001,484 @@ static const u32 vmx_msr_index[] = {
>>  	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
>>  };
>>  
>> +DEFINE_STATIC_KEY_FALSE(enable_evmcs);
>> +
>> +#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +static bool __read_mostly enlightened_vmcs = true;
>> +module_param(enlightened_vmcs, bool, 0444);
>> +
>> +#define KVM_EVMCS_VERSION 1
>> +
>> +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
>> +#define EVMCS1_FIELD(number, name, clean_mask)[ROL16(number, 6)] = \
>> +		(u32)EVMCS1_OFFSET(name) | ((u32)clean_mask << 16)
>> +
>> +/*
>> + * Lower 16 bits encode offset of the field in struct hv_enlightened_vmcs,
>> + * upped 16 bits hold clean field mask.
>> + */
>> +static const u32 vmcs_field_to_evmcs_1[] = {
>> +	/* 64 bit rw */
>> +	EVMCS1_FIELD(GUEST_RIP, guest_rip,
>> +		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE),
>
> Maybe we should use a single "#include"d file (like vmx_shadow_fields.h)
> and share it between HV-on-KVM and KVM-on-HV.
>
> ...

Actually, yes, looking at 13k+ lines of code in vmx.c makes me think
it's time we start doing something about it :-)

>
>> +	EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr,
>> +		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> +	EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr,
>> +		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> +	EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
>> +		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> +	EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
>> +		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> +	EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
>> +		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> +	EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
>> +		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>> +	EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
>> +		     HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
>
> We shouldn't use these on Hyper-V, should we (that is, shouldn't the
> WARN below fire if you try---and so why include them at all)?
>

True, these shouldn't be used and that's why there's no clean field
assigned to them. They, however, do have a corresponding eVMCS field.
I will try removing them in next version.

>> +
>> +static inline u16 get_evmcs_offset(unsigned long field)
>> +{
>> +	unsigned int index = ROL16(field, 6);
>> +
>> +	if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) {
>> +		WARN_ONCE(1, "kvm: reading unsupported EVMCS field %lx\n",
>> +			  field);
>> +		return 0;
>> +	}
>> +
>> +	return (u16)vmcs_field_to_evmcs_1[index];
>> +}
>> +
>> +static inline u16 get_evmcs_offset_cf(unsigned long field, u16 *clean_field)
>> +{
>> +	unsigned int index = ROL16(field, 6);
>> +	u32 evmcs_field;
>> +
>> +	if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) {
>> +		WARN_ONCE(1, "kvm: writing unsupported EVMCS field %lx\n",
>> +			  field);
>> +		return 0;
>> +	}
>> +
>> +	evmcs_field = vmcs_field_to_evmcs_1[index];
>> +
>> +	*clean_field = evmcs_field >> 16;
>> +
>> +	return (u16)evmcs_field;
>> +}
>
> You can mark this __always_inline, and make it
>
> 	if (clean_field)
> 		*clean_field = evmcs_field >> 16;
>
> or alternatively, use a two-element struct and do
>
> 	evmcs_field = &vmcs_field_to_evmcs_1[index];
> 	if (clean_field)
> 		*clean_field = evmcs_field->clean_field;
> 	return evmcs_field->offset;
>
> Also, if you return int and make the WARN_ONCE case return -ENOENT, GCC
> should be able to optimize out the "if (!offset)" (which becomes "if
> (offset < 0)") in the callers.  Nitpicking, but...
>

Ok, good suggestion, I'll try.

>> +static void vmcs_load_enlightened(u64 phys_addr)
>> +{
>> +	struct hv_vp_assist_page *vp_ap =
>> +		hv_get_vp_assist_page(smp_processor_id());
>> +
>> +	vp_ap->current_nested_vmcs = phys_addr;
>> +	vp_ap->enlighten_vmentry = 1;
>> +}
>
> evmcs_load?
>

Works for me,

>> +static void evmcs_sanitize_exec_ctrls(u32 *cpu_based_2nd_exec_ctrl,
>> +				      u32 *pin_based_exec_ctrl)
>> +{
>> +	*pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;> +	*cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>> +	*cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
>> +	*cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT;
>> +	*cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML;
>> +	*cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
>> +	*cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS;
>> +	*cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING;
>> +	*cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
>> +	*pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
>
> How can these be set?
>

They can not if Hyper-V behaves but Radim didn't want to trust it -- so
the suggestion was to forcefully disable unsupported controls.

>> @@ -3596,6 +4104,14 @@ static int hardware_enable(void)
>>  	if (cr4_read_shadow() & X86_CR4_VMXE)
>>  		return -EBUSY;
>>  
>> +	/*
>> +	 * This can happen if we hot-added a CPU but failed to allocate
>> +	 * VP assist page for it.
>> +	 */
>> +	if (static_branch_unlikely(&enable_evmcs) &&
>> +	    !hv_get_vp_assist_page(cpu))
>> +		return -EFAULT;
>
> -ENODEV?  Maybe add a printk, because this is really rare.
>

Ok,

>>  	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
>>  	INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
>>  	spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
>> @@ -3829,7 +4345,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>>  	vmcs_conf->size = vmx_msr_high & 0x1fff;
>>  	vmcs_conf->order = get_order(vmcs_conf->size);
>>  	vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
>> -	vmcs_conf->revision_id = vmx_msr_low;
>> +
>> +	/* KVM supports Enlightened VMCS v1 only */
>> +	if (static_branch_unlikely(&enable_evmcs))
>> +		vmcs_conf->revision_id = KVM_EVMCS_VERSION;
>> +	else
>> +		vmcs_conf->revision_id = vmx_msr_low;
>>  
>>  	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
>>  	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
>> @@ -6990,6 +7511,17 @@ static __init int hardware_setup(void)
>>  		goto out;
>>  	}
>>  
>> +	if (static_branch_unlikely(&enable_evmcs)) {
>> +		evmcs_sanitize_exec_ctrls(&vmcs_config.cpu_based_2nd_exec_ctrl,
>> +					  &vmcs_config.pin_based_exec_ctrl);
>
> Why not do it in setup_vmcs_config after the vmcs_conf->vmentry_ctrl
> assignment (and pass &vmcs_config, which there is "vmcs_conf", directly
> to the function)?  And if sanitizing clears the bits in vmentry_ctl and
> vmexit_ctl, there's no need to clear cpu_has_load_perf_global_ctrl.
>

Ok, if we decide to keep 'sanitization' in place.

>> +		/*
>> +		 * Enlightened VMCSv1 doesn't support these:
>> +		 *	GUEST_IA32_PERF_GLOBAL_CTRL	= 0x00002808,
>> +		 *	HOST_IA32_PERF_GLOBAL_CTRL	= 0x00002c04,
>> +		 */
>> +		cpu_has_load_perf_global_ctrl = false;> +	}
>> +
>>  	if (boot_cpu_has(X86_FEATURE_NX))
>>  		kvm_enable_efer_bits(EFER_NX);
>>  
>> @@ -8745,6 +9277,10 @@ static void dump_vmcs(void)
>>  	if (cpu_has_secondary_exec_ctrls())
>>  		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
>>  
>> +	if (static_branch_unlikely(&enable_evmcs))
>> +		evmcs_sanitize_exec_ctrls(&secondary_exec_control,
>> +					  &pin_based_exec_ctrl);
>
> This is wrong, we're reading the VMCS so the values must already be
> sanitized (and if not, that's the bug and we want dump_vmcs to print the
> "wrong" values).

The problem is that we vmcs_read these fields later in the function and
this will now WARN(). Initally, there was no WARN() for non-existent
fields so this could work (we would just print zeroes for unsupported
fields). Maybe, additional WARN_ON() is not a big deal here.

In reality, these controls should never be set.

>
>>  	pr_err("*** Guest State ***\n");
>>  	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
>>  	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
>> @@ -8784,7 +9320,8 @@ static void dump_vmcs(void)
>>  	pr_err("DebugCtl = 0x%016llx  DebugExceptions = 0x%016lx\n",
>>  	       vmcs_read64(GUEST_IA32_DEBUGCTL),
>>  	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
>> -	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
>> +	if (cpu_has_load_perf_global_ctrl &&
>> +	    vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
>>  		pr_err("PerfGlobCtl = 0x%016llx\n",
>>  		       vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
>>  	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
>> @@ -8820,7 +9357,8 @@ static void dump_vmcs(void)
>>  		pr_err("EFER = 0x%016llx  PAT = 0x%016llx\n",
>>  		       vmcs_read64(HOST_IA32_EFER),
>>  		       vmcs_read64(HOST_IA32_PAT));
>> -	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
>> +	if (cpu_has_load_perf_global_ctrl &&
>> +	    vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
>>  		pr_err("PerfGlobCtl = 0x%016llx\n",
>>  		       vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
>>  
>> @@ -9397,7 +9935,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
>>  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>  {
>>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> -	unsigned long cr3, cr4;
>> +	unsigned long cr3, cr4, evmcs_rsp;
>>  
>>  	/* Record the guest's net vcpu time for enforced NMI injections. */
>>  	if (unlikely(!enable_vnmi &&
>> @@ -9463,6 +10001,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>  		native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
>>  
>>  	vmx->__launched = vmx->loaded_vmcs->launched;
>> +
>> +	evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
>> +		(unsigned long)&current_evmcs->host_rsp : 0;
>
> (If you use text_poke_early, you can do this assignment unconditionally,
> since it's just a single lea instruction).
>

Something to take a look at)

>> @@ -9604,6 +10152,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>  	/* Eliminate branch target predictions from guest mode */
>>  	vmexit_fill_RSB();
>>  
>> +	/* All fields are clean at this point */
>> +	if (static_branch_unlikely(&enable_evmcs))
>> +		current_evmcs->hv_clean_fields |=
>> +			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
>> +
>>  	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
>>  	if (vmx->host_debugctlmsr)
>>  		update_debugctlmsr(vmx->host_debugctlmsr);
>> @@ -12419,7 +12972,36 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>>  
>>  static int __init vmx_init(void)
>>  {
>> -	int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
>> +	int r;
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +	/*
>> +	 * Enlightened VMCS usage should be recommended and the host needs
>> +	 * to support eVMCS v1 or above. We can also disable eVMCS support
>> +	 * with module parameter.
>> +	 */
>> +	if (enlightened_vmcs &&
>> +	    ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
>> +	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
>> +	    KVM_EVMCS_VERSION) {
>> +		int cpu;
>> +
>> +		/* Check that we have assist pages on all online CPUs */
>> +		for_each_online_cpu(cpu) {
>> +			if (!hv_get_vp_assist_page(cpu)) {
>> +				enlightened_vmcs = false;
>> +				break;
>> +			}
>> +		}
>> +		if (enlightened_vmcs) {
>> +			pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
>> +			static_branch_enable(&enable_evmcs);
>> +		}
>> +	}
>
> A bit nicer to clear enlightened_vmcs in the "else" branch?

Yes, as a precaution, why not. (But we should solely rely on
'enable_evmcs' later on).

>
> That's it.  Nice work!
>
> Paolo
>
>> +#endif
>> +
>> +	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
>>                       __alignof__(struct vcpu_vmx), THIS_MODULE);
>>  	if (r)
>>  		return r;
>> @@ -12440,6 +13022,29 @@ static void __exit vmx_exit(void)
>>  #endif
>>  
>>  	kvm_exit();
>> +
>> +#if IS_ENABLED(CONFIG_HYPERV)
>> +	if (static_branch_unlikely(&enable_evmcs)) {
>> +		int cpu;
>> +		struct hv_vp_assist_page *vp_ap;
>> +		/*
>> +		 * Reset everything to support using non-enlightened VMCS
>> +		 * access later (e.g. when we reload the module with
>> +		 * enlightened_vmcs=0)
>> +		 */
>> +		for_each_online_cpu(cpu) {
>> +			vp_ap =	hv_get_vp_assist_page(cpu);
>> +
>> +			if (!vp_ap)
>> +				continue;
>> +
>> +			vp_ap->current_nested_vmcs = 0;
>> +			vp_ap->enlighten_vmentry = 0;
>> +		}
>> +
>> +		static_branch_disable(&enable_evmcs);
>> +	}
>> +#endif
>>  }
>>  
>>  module_init(vmx_init)
>> 

-- 
  Vitaly



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux