On 09/03/2018 15:02, Vitaly Kuznetsov wrote: > Enlightened VMCS is just a structure in memory, the main benefit > besides avoiding somewhat slower VMREAD/VMWRITE is using clean field > mask: we tell the underlying hypervisor which fields were modified > since VMEXIT so there's no need to inspect them all. > > Tight CPUID loop test shows significant speedup: > Before: 18890 cycles > After: 8304 cycles > > Static key is being used to avoid performance penalty for non-Hyper-V > deployments. Tests show we add around 3 (three) CPU cycles on each > VMEXIT (1077.5 cycles before, 1080.7 cycles after for the same CPUID > loop on bare metal). We can probably avoid one test/jmp in vmx_vcpu_run() > but I don't see a clean way to use static key in assembly. If you want to live dangerously, you can use text_poke_early to change the vmwrite to mov. It's just a single instruction, so it's probably not too hard. > Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx> > --- > Changes since v2: > - define KVM_EVMCS_VERSION [Radim Krčmář] > - WARN_ONCE in get_evmcs_offset[,_cf] [Radim Krčmář] > - add evmcs_sanitize_exec_ctrls() and use it in hardware_setup() and > dump_vmcs() [Radim Krčmář] > --- > arch/x86/kvm/vmx.c | 625 ++++++++++++++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 615 insertions(+), 10 deletions(-) > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 051dab74e4e9..44b6efa7d54e 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -53,6 +53,7 @@ > #include <asm/mmu_context.h> > #include <asm/microcode.h> > #include <asm/nospec-branch.h> > +#include <asm/mshyperv.h> > > #include "trace.h" > #include "pmu.h" > @@ -1000,6 +1001,484 @@ static const u32 vmx_msr_index[] = { > MSR_EFER, MSR_TSC_AUX, MSR_STAR, > }; > > +DEFINE_STATIC_KEY_FALSE(enable_evmcs); > + > +#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs)) > + > +#if IS_ENABLED(CONFIG_HYPERV) > +static bool __read_mostly enlightened_vmcs = true; > +module_param(enlightened_vmcs, bool, 0444); > + > +#define KVM_EVMCS_VERSION 1 > + > +#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) > +#define EVMCS1_FIELD(number, name, clean_mask)[ROL16(number, 6)] = \ > + (u32)EVMCS1_OFFSET(name) | ((u32)clean_mask << 16) > + > +/* > + * Lower 16 bits encode offset of the field in struct hv_enlightened_vmcs, > + * upped 16 bits hold clean field mask. > + */ > +static const u32 vmcs_field_to_evmcs_1[] = { > + /* 64 bit rw */ > + EVMCS1_FIELD(GUEST_RIP, guest_rip, > + HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE), Maybe we should use a single "#include"d file (like vmx_shadow_fields.h) and share it between HV-on-KVM and KVM-on-HV. ... > + EVMCS1_FIELD(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr, > + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), > + EVMCS1_FIELD(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr, > + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), > + EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr, > + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), > + EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0, > + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), > + EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1, > + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), > + EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2, > + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), > + EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3, > + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL), We shouldn't use these on Hyper-V, should we (that is, shouldn't the WARN below fire if you try---and so why include them at all)? > + > +static inline u16 get_evmcs_offset(unsigned long field) > +{ > + unsigned int index = ROL16(field, 6); > + > + if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) { > + WARN_ONCE(1, "kvm: reading unsupported EVMCS field %lx\n", > + field); > + return 0; > + } > + > + return (u16)vmcs_field_to_evmcs_1[index]; > +} > + > +static inline u16 get_evmcs_offset_cf(unsigned long field, u16 *clean_field) > +{ > + unsigned int index = ROL16(field, 6); > + u32 evmcs_field; > + > + if (index >= ARRAY_SIZE(vmcs_field_to_evmcs_1)) { > + WARN_ONCE(1, "kvm: writing unsupported EVMCS field %lx\n", > + field); > + return 0; > + } > + > + evmcs_field = vmcs_field_to_evmcs_1[index]; > + > + *clean_field = evmcs_field >> 16; > + > + return (u16)evmcs_field; > +} You can mark this __always_inline, and make it if (clean_field) *clean_field = evmcs_field >> 16; or alternatively, use a two-element struct and do evmcs_field = &vmcs_field_to_evmcs_1[index]; if (clean_field) *clean_field = evmcs_field->clean_field; return evmcs_field->offset; Also, if you return int and make the WARN_ONCE case return -ENOENT, GCC should be able to optimize out the "if (!offset)" (which becomes "if (offset < 0)") in the callers. Nitpicking, but... > +static void vmcs_load_enlightened(u64 phys_addr) > +{ > + struct hv_vp_assist_page *vp_ap = > + hv_get_vp_assist_page(smp_processor_id()); > + > + vp_ap->current_nested_vmcs = phys_addr; > + vp_ap->enlighten_vmentry = 1; > +} evmcs_load? > +static void evmcs_sanitize_exec_ctrls(u32 *cpu_based_2nd_exec_ctrl, > + u32 *pin_based_exec_ctrl) > +{ > + *pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;> + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; > + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; > + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_APIC_REGISTER_VIRT; > + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML; > + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC; > + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS; > + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING; > + *cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; > + *pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; How can these be set? > @@ -3596,6 +4104,14 @@ static int hardware_enable(void) > if (cr4_read_shadow() & X86_CR4_VMXE) > return -EBUSY; > > + /* > + * This can happen if we hot-added a CPU but failed to allocate > + * VP assist page for it. > + */ > + if (static_branch_unlikely(&enable_evmcs) && > + !hv_get_vp_assist_page(cpu)) > + return -EFAULT; -ENODEV? Maybe add a printk, because this is really rare. > INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); > INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); > spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); > @@ -3829,7 +4345,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) > vmcs_conf->size = vmx_msr_high & 0x1fff; > vmcs_conf->order = get_order(vmcs_conf->size); > vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; > - vmcs_conf->revision_id = vmx_msr_low; > + > + /* KVM supports Enlightened VMCS v1 only */ > + if (static_branch_unlikely(&enable_evmcs)) > + vmcs_conf->revision_id = KVM_EVMCS_VERSION; > + else > + vmcs_conf->revision_id = vmx_msr_low; > > vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; > vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; > @@ -6990,6 +7511,17 @@ static __init int hardware_setup(void) > goto out; > } > > + if (static_branch_unlikely(&enable_evmcs)) { > + evmcs_sanitize_exec_ctrls(&vmcs_config.cpu_based_2nd_exec_ctrl, > + &vmcs_config.pin_based_exec_ctrl); Why not do it in setup_vmcs_config after the vmcs_conf->vmentry_ctrl assignment (and pass &vmcs_config, which there is "vmcs_conf", directly to the function)? And if sanitizing clears the bits in vmentry_ctl and vmexit_ctl, there's no need to clear cpu_has_load_perf_global_ctrl. > + /* > + * Enlightened VMCSv1 doesn't support these: > + * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, > + * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, > + */ > + cpu_has_load_perf_global_ctrl = false;> + } > + > if (boot_cpu_has(X86_FEATURE_NX)) > kvm_enable_efer_bits(EFER_NX); > > @@ -8745,6 +9277,10 @@ static void dump_vmcs(void) > if (cpu_has_secondary_exec_ctrls()) > secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); > > + if (static_branch_unlikely(&enable_evmcs)) > + evmcs_sanitize_exec_ctrls(&secondary_exec_control, > + &pin_based_exec_ctrl); This is wrong, we're reading the VMCS so the values must already be sanitized (and if not, that's the bug and we want dump_vmcs to print the "wrong" values). > pr_err("*** Guest State ***\n"); > pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", > vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), > @@ -8784,7 +9320,8 @@ static void dump_vmcs(void) > pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", > vmcs_read64(GUEST_IA32_DEBUGCTL), > vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); > - if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) > + if (cpu_has_load_perf_global_ctrl && > + vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) > pr_err("PerfGlobCtl = 0x%016llx\n", > vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); > if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) > @@ -8820,7 +9357,8 @@ static void dump_vmcs(void) > pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", > vmcs_read64(HOST_IA32_EFER), > vmcs_read64(HOST_IA32_PAT)); > - if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) > + if (cpu_has_load_perf_global_ctrl && > + vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) > pr_err("PerfGlobCtl = 0x%016llx\n", > vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); > > @@ -9397,7 +9935,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) > static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) > { > struct vcpu_vmx *vmx = to_vmx(vcpu); > - unsigned long cr3, cr4; > + unsigned long cr3, cr4, evmcs_rsp; > > /* Record the guest's net vcpu time for enforced NMI injections. */ > if (unlikely(!enable_vnmi && > @@ -9463,6 +10001,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) > native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); > > vmx->__launched = vmx->loaded_vmcs->launched; > + > + evmcs_rsp = static_branch_unlikely(&enable_evmcs) ? > + (unsigned long)¤t_evmcs->host_rsp : 0; (If you use text_poke_early, you can do this assignment unconditionally, since it's just a single lea instruction). > @@ -9604,6 +10152,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) > /* Eliminate branch target predictions from guest mode */ > vmexit_fill_RSB(); > > + /* All fields are clean at this point */ > + if (static_branch_unlikely(&enable_evmcs)) > + current_evmcs->hv_clean_fields |= > + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; > + > /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ > if (vmx->host_debugctlmsr) > update_debugctlmsr(vmx->host_debugctlmsr); > @@ -12419,7 +12972,36 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { > > static int __init vmx_init(void) > { > - int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), > + int r; > + > +#if IS_ENABLED(CONFIG_HYPERV) > + /* > + * Enlightened VMCS usage should be recommended and the host needs > + * to support eVMCS v1 or above. We can also disable eVMCS support > + * with module parameter. > + */ > + if (enlightened_vmcs && > + ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && > + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= > + KVM_EVMCS_VERSION) { > + int cpu; > + > + /* Check that we have assist pages on all online CPUs */ > + for_each_online_cpu(cpu) { > + if (!hv_get_vp_assist_page(cpu)) { > + enlightened_vmcs = false; > + break; > + } > + } > + if (enlightened_vmcs) { > + pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); > + static_branch_enable(&enable_evmcs); > + } > + } A bit nicer to clear enlightened_vmcs in the "else" branch? That's it. Nice work! Paolo > +#endif > + > + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), > __alignof__(struct vcpu_vmx), THIS_MODULE); > if (r) > return r; > @@ -12440,6 +13022,29 @@ static void __exit vmx_exit(void) > #endif > > kvm_exit(); > + > +#if IS_ENABLED(CONFIG_HYPERV) > + if (static_branch_unlikely(&enable_evmcs)) { > + int cpu; > + struct hv_vp_assist_page *vp_ap; > + /* > + * Reset everything to support using non-enlightened VMCS > + * access later (e.g. when we reload the module with > + * enlightened_vmcs=0) > + */ > + for_each_online_cpu(cpu) { > + vp_ap = hv_get_vp_assist_page(cpu); > + > + if (!vp_ap) > + continue; > + > + vp_ap->current_nested_vmcs = 0; > + vp_ap->enlighten_vmentry = 0; > + } > + > + static_branch_disable(&enable_evmcs); > + } > +#endif > } > > module_init(vmx_init) >