From: Yunhong Jiang <yunhong.jiang@xxxxxxxxx>

Utilize the VMX preemption timer for TSC deadline timer virtualization.
The VMX preemption timer is armed when the vCPU is running, and a VMExit
happens when the virtual TSC deadline timer expires.

When the vCPU thread is blocked because of HLT, TSC deadline timer
virtualization is switched back to the existing solution, i.e. an
hrtimer is used for it. It is switched to the VMX preemption timer again
when the vCPU thread is unblocked.

This solution replaces the complex OS hrtimer system and the host timer
interrupt handling cost with a preemption timer VMExit. It fits well for
some NFV usage scenarios, where the vCPU is bound to a pCPU and the pCPU
is isolated, or for similar scenarios.

Signed-off-by: Yunhong Jiang <yunhong.jiang@xxxxxxxxx>
---
Review fixes folded into this version:
 - kvm_lapic_switch_to_hv_timer(): clear hv_timer_in_use when the hv
   timer is cancelled because the sw timer already fired.
 - kvm_arch_vcpu_load(): adjust hv_deadline_tsc before overwriting
   hv_orig_tsc; the old order always subtracted zero.
 - vmx_arm_hv_timer() is now static; the kvm_hv_timer_state trace
   format no longer ends in "\n".
 - Added short comments on the new functions.
Open question (not changed here): start_apic_timer() appears to leave
hv_timer_in_use set when set_hv_timer() fails while the hv timer is
already in use; please confirm against set_hv_timer().

 arch/x86/kvm/lapic.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/lapic.h |  5 +++
 arch/x86/kvm/trace.h | 16 ++++++++++
 arch/x86/kvm/vmx.c   | 40 +++++++++++++++++++++++
 arch/x86/kvm/x86.c   | 17 +++++++++-
 5 files changed, 165 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f1cf8a5ede11..aedbf60846c4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1343,6 +1343,84 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 	local_irq_restore(flags);
 }
 
+/* Return the lapic timer's hv_timer_in_use flag for @vcpu. */
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+
+/*
+ * Called from the VMX preemption timer exit handler: cancel the hv
+ * timer, clear the in-use flag and report the expiry through
+ * apic_timer_expired().
+ */
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+	WARN_ON(swait_active(&vcpu->wq));
+	kvm_x86_ops->cancel_hv_timer(vcpu);
+	apic->lapic_timer.hv_timer_in_use = false;
+	apic_timer_expired(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+/*
+ * Move a TSC deadline timer that is not pending yet from the hrtimer to
+ * the hv timer. If the hrtimer fired while switching, the hv timer is
+ * cancelled again and the in-use flag is cleared.
+ */
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	WARN_ON(apic->lapic_timer.hv_timer_in_use);
+
+	if (apic_lvtt_tscdeadline(apic) &&
+	    !atomic_read(&apic->lapic_timer.pending)) {
+		u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+		if (!kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+			apic->lapic_timer.hv_timer_in_use = true;
+			hrtimer_cancel(&apic->lapic_timer.timer);
+		}
+
+		/* In case the sw timer triggered in the small window above */
+		if (atomic_read(&apic->lapic_timer.pending) &&
+		    apic->lapic_timer.hv_timer_in_use) {
+			kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+			apic->lapic_timer.hv_timer_in_use = false;
+		}
+		trace_kvm_hv_timer_state(vcpu->vcpu_id,
+				apic->lapic_timer.hv_timer_in_use);
+	}
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+
+/*
+ * Cancel the hv timer if it is in use and start the software TSC
+ * deadline timer instead, unless the timer is already pending.
+ */
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	/* Possibly the TSC deadline timer is not enabled yet */
+	if (!apic->lapic_timer.hv_timer_in_use)
+		return;
+
+	kvm_x86_ops->cancel_hv_timer(vcpu);
+	apic->lapic_timer.hv_timer_in_use = false;
+
+	if (atomic_read(&apic->lapic_timer.pending))
+		return;
+
+	start_sw_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	ktime_t now;
@@ -1389,7 +1467,16 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			   ktime_to_ns(ktime_add_ns(now,
 					apic->lapic_timer.period)));
 	} else if (apic_lvtt_tscdeadline(apic)) {
-		start_sw_tscdeadline(apic);
+		/* lapic timer in tsc deadline mode */
+		u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+		if (kvm_x86_ops->set_hv_timer &&
+		    !kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+			apic->lapic_timer.hv_timer_in_use = true;
+			trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+					apic->lapic_timer.hv_timer_in_use);
+		} else
+			start_sw_tscdeadline(apic);
 	}
 }
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 891c6da7d4aa..336ba51bb16e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -20,6 +20,7 @@ struct kvm_timer {
 	u64 tscdeadline;
 	u64 expired_tscdeadline;
 	atomic_t pending;			/* accumulated triggered timers */
+	bool hv_timer_in_use;
 };
 
 struct kvm_lapic {
@@ -212,4 +213,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			struct kvm_vcpu **dest_vcpu);
 int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
 			const unsigned long *bitmap, u32 bitmap_size);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 #endif
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 8de925031b5c..58bc0d68e933 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -6,6 +6,7 @@
 #include <asm/svm.h>
 #include <asm/clocksource.h>
 #include <asm/pvclock-abi.h>
+#include <lapic.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
@@ -1348,6 +1349,21 @@ TRACE_EVENT(kvm_avic_unaccelerated_access,
 		  __entry->vec)
 );
 
+TRACE_EVENT(kvm_hv_timer_state,
+		TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
+		TP_ARGS(vcpu_id, hv_timer_in_use),
+		TP_STRUCT__entry(
+			__field(unsigned int, vcpu_id)
+			__field(unsigned int, hv_timer_in_use)
+			),
+		TP_fast_assign(
+			__entry->vcpu_id = vcpu_id;
+			__entry->hv_timer_in_use = hv_timer_in_use;
+			),
+		TP_printk("vcpu_id %x hv_timer %x",
+			__entry->vcpu_id,
+			__entry->hv_timer_in_use)
+);
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3e407c6be171..9948797b65e5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7644,6 +7644,12 @@ static int handle_pcommit(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/* VMX preemption timer exit: let the lapic code handle the expiry. */
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+	kvm_lapic_expired_hv_timer(vcpu);
+	return 1;
+}
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7695,6 +7701,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
 	[EXIT_REASON_PCOMMIT]                 = handle_pcommit,
+	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -8703,6 +8710,31 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
 					msrs[i].host);
 }
 
+/*
+ * If the lapic hv timer is in use, load the VMX preemption timer with the
+ * TSC distance to hv_deadline_tsc, shifted right by
+ * cpu_preemption_timer_multi, or with 0 if the deadline is not in the future.
+ */
+static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u64 tscl;
+	u32 delta_tsc;
+
+	if (!apic->lapic_timer.hv_timer_in_use)
+		return;
+
+	tscl = rdtsc();
+	if (vcpu->arch.hv_deadline_tsc > tscl)
+		/* sure to be 32 bit only because checked on set_hv_timer */
+		delta_tsc = (u32)((vcpu->arch.hv_deadline_tsc - tscl) >>
+			cpu_preemption_timer_multi);
+	else
+		delta_tsc = 0;
+
+	vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+}
+
 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8752,6 +8784,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	atomic_switch_perf_msrs(vmx);
 	debugctlmsr = get_debugctlmsr();
 
+	vmx_arm_hv_timer(vcpu);
+
 	vmx->__launched = vmx->loaded_vmcs->launched;
 	asm(
 		/* Store host registers */
@@ -10892,6 +10926,9 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
 	if (pi_pre_block(vcpu))
 		return 1;
 
+	if (kvm_lapic_hv_timer_in_use(vcpu))
+		kvm_lapic_switch_to_sw_timer(vcpu);
+
 	return 0;
 }
 
@@ -10938,6 +10975,9 @@ static void pi_post_block(struct kvm_vcpu *vcpu)
 
 static void vmx_post_block(struct kvm_vcpu *vcpu)
 {
+	if (kvm_x86_ops->set_hv_timer)
+		kvm_lapic_switch_to_hv_timer(vcpu);
+
 	pi_post_block(vcpu);
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 902d9da12392..a75d1437426c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2735,10 +2735,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
+		u64 tscl = rdtsc();
 		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
-				rdtsc() - vcpu->arch.last_host_tsc;
+				tscl - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
+
+		/*
+		 * If the TSC went backwards, pull hv_deadline_tsc back by the
+		 * same amount; otherwise (hv_deadline_tsc - tsc) >>
+		 * cpu_preemption_timer_multi may not fit in 32 bits on
+		 * vcpu_run(). The deadline becomes less accurate, but that
+		 * is unavoidable. Adjust before hv_orig_tsc is overwritten.
+		 */
+		if (tscl < vcpu->arch.hv_orig_tsc) {
+			vcpu->arch.hv_deadline_tsc -=
+				vcpu->arch.hv_orig_tsc - tscl;
+			vcpu->arch.hv_orig_tsc = tscl;
+		}
+
 		if (check_tsc_unstable()) {
 			u64 offset = kvm_compute_tsc_offset(vcpu,
 						vcpu->arch.last_guest_tsc);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html