From: Yunhong Jiang <yunhong.jiang@xxxxxxxxx> Adding the basic VMX preemption timer functionality, including checking if the feature is supported, if the feature is broken on the CPU, setup/clean the VMX preemption timer. Also adds a parameter to state if the VMX preemption timer should be utilized. Signed-off-by: Yunhong Jiang <yunhong.jiang@xxxxxxxxx> --- arch/x86/include/asm/kvm_host.h | 6 ++ arch/x86/kvm/vmx.c | 142 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 147 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e0fbe7e70dc1..2410a86ef2f3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -655,6 +655,9 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; + + /* apic deadline value in host tsc */ + u64 hv_deadline_tsc; }; struct kvm_lpage_info { @@ -1005,6 +1008,9 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); + + int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc); + void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 51b08cd43bb7..6393bbc6aef9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -110,6 +110,10 @@ module_param_named(pml, enable_pml, bool, S_IRUGO); #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL +static int cpu_preemption_timer_multi; +static bool __read_mostly enable_hv_timer; +module_param_named(enable_hv_timer, enable_hv_timer, bool, S_IRUGO); + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -1056,6 +1060,61 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } +/* + * Comment's format: document - errata name - stepping - processor name. + * Refer from + * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp + */ +static u32 vmx_preemption_cpu_tfms[] = { +/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ +0x000206E6, +/* 323056.pdf - AAX65 - C2 - Xeon L3406 */ +/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ +/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020652, +/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020655, +/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ +/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ +/* + * 320767.pdf - AAP86 - B1 - + * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile + */ +0x000106E5, +/* 321333.pdf - AAM126 - C0 - Xeon 3500 */ +0x000106A0, +/* 321333.pdf - AAM126 - C1 - Xeon 3500 */ +0x000106A1, +/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ +0x000106A4, + /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ + /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ + /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ +0x000106A5, +}; + +static inline bool cpu_has_broken_vmx_preemption_timer(void) +{ + u32 eax = cpuid_eax(0x00000001), i; + + /* Clear the reserved bits */ + eax &= ~(0x3U << 14 | 0xfU << 28); + for (i = 0; i < sizeof(vmx_preemption_cpu_tfms)/sizeof(u32); i++) + if (eax == vmx_preemption_cpu_tfms[i]) + return true; + + return false; +} + +static inline bool cpu_has_vmx_preemption_timer(void) +{ + if (cpu_has_broken_vmx_preemption_timer()) + return false; + + return vmcs_config.pin_based_exec_ctrl & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + static inline bool cpu_has_vmx_posted_intr(void) { return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && @@ -3308,7 +3367,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return -EIO; min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | + PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; @@ -4781,6 +4841,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) if (!kvm_vcpu_apicv_active(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + /* Enable the preemption timer dynamically */ + pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; return pin_based_exec_ctrl; } @@ -6389,6 +6451,23 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + /* + * We support only x86_64 platform now because guest_tsc and host_tsc + * conversion is only done there yet. + */ +#ifdef CONFIG_X86_64 + if (cpu_has_vmx_preemption_timer() && enable_hv_timer) { + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + cpu_preemption_timer_multi = + vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + } else { + kvm_x86_ops->set_hv_timer = NULL; + kvm_x86_ops->cancel_hv_timer = NULL; + } +#endif + kvm_set_posted_intr_wakeup_handler(wakeup_handler); return alloc_kvm_area(); @@ -10662,6 +10741,62 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, return X86EMUL_CONTINUE; } +#ifdef CONFIG_X86_64 +/* (a << shift) / divisor, return 1 if overflow otherwise 0 */ +static inline int u64_shl_div_u64(u64 a, unsigned int shift, + u64 divisor, u64 *result) +{ + u64 low = a << shift, high = a >> (64 - shift); + + /* To avoid the overflow on divq */ + if (high > divisor) + return 1; + + /* Low hold the result, high hold rem which is discarded */ + asm("divq %2\n\t" : "=a" (low), "=d" (high) : + "rm" (divisor), "0" (low), "1" (high)); + *result = low; + + return 0; +} + +static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) +{ + u64 tscl = rdtsc(), delta_tsc; + + delta_tsc = guest_deadline_tsc - kvm_read_l1_tsc(vcpu, tscl); + + /* Convert to host delta tsc if tsc scaling is enabled */ + if (vcpu->arch.tsc_scaling_ratio && + u64_shl_div_u64(delta_tsc, + kvm_tsc_scaling_ratio_frac_bits, + vcpu->arch.tsc_scaling_ratio, + &delta_tsc)) + return -ERANGE; + + /* + * If the delta tsc can't be fit in the 32 bit after the multi shift, + * we can't use the preemption timer. + * It's possible that it can be fit when vmentry happens late, but + * checking on every vmentry is costly, so fail earilier. + */ + if (delta_tsc >> (cpu_preemption_timer_multi + 32)) + return -ERANGE; + + vcpu->arch.hv_deadline_tsc = tscl + delta_tsc; + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + return 0; +} + +static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hv_deadline_tsc = 0; + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); +} +#endif + static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) { if (ple_gap) @@ -11038,6 +11173,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .pmu_ops = &intel_pmu_ops, .update_pi_irte = vmx_update_pi_irte, + +#ifdef CONFIG_X86_64 + .set_hv_timer = vmx_set_hv_timer, + .cancel_hv_timer = vmx_cancel_hv_timer, +#endif }; static int __init vmx_init(void) -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html