On Tue, May 24, 2016 at 3:27 PM, Yunhong Jiang <yunhong.jiang@xxxxxxxxxxxxxxx> wrote: > From: Yunhong Jiang <yunhong.jiang@xxxxxxxxx> > > Utilizing the VMX preemption timer for tsc deadline timer > virtualization. The VMX preemption timer is armed when the vCPU is > running, and a VMExit will happen if the virtual TSC deadline timer > expires. > > When the vCPU thread is scheduled out, the tsc deadline timer > virtualization will be switched to use the current solution, i.e. use > the timer for it. It's switched back to VMX preemption timer when the > vCPU thread is scheduled int. > > This solution avoids the complex OS's hrtimer system, and also the host > timer interrupt handling cost, with a preemption_timer VMexit. It fits > well for some NFV usage scenario, when the vCPU is bound to a pCPU and > the pCPU is isolated, or some similar scenario. > > However, it possibly has impact if the vCPU thread is scheduled in/out > very frequently, because it switches from/to the hrtimer emulation a lot. What is the cost of the extra sched-in/out hooks? > > Signed-off-by: Yunhong Jiang <yunhong.jiang@xxxxxxxxx> > --- > arch/x86/kvm/lapic.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++-- > arch/x86/kvm/lapic.h | 11 ++++++ > arch/x86/kvm/trace.h | 22 ++++++++++++ > arch/x86/kvm/vmx.c | 8 +++++ > arch/x86/kvm/x86.c | 4 +++ > 5 files changed, 139 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index f1cf8a5ede11..93679db7ce0f 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -1313,7 +1313,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) > __delay(tsc_deadline - guest_tsc); > } > > -static void start_sw_tscdeadline(struct kvm_lapic *apic) > +static void start_sw_tscdeadline(struct kvm_lapic *apic, int no_expire) > { > u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; > u64 ns = 0; > @@ -1330,7 +1330,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) > > now = apic->lapic_timer.timer.base->get_time(); > guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); > - if (likely(tscdeadline > guest_tsc)) { > + if (no_expire || likely(tscdeadline > guest_tsc)) { > ns = (tscdeadline - guest_tsc) * 1000000ULL; > do_div(ns, this_tsc_khz); > expire = ktime_add_ns(now, ns); > @@ -1343,6 +1343,85 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) > local_irq_restore(flags); > } > > +void kvm_lapic_arm_hv_timer(struct kvm_vcpu *vcpu) > +{ > + struct kvm_lapic *apic = vcpu->arch.apic; > + u64 tscdeadline, guest_tsc; > + > + if (apic->lapic_timer.hv_timer_state == HV_TIMER_NOT_USED) > + return; > + > + tscdeadline = apic->lapic_timer.tscdeadline; > + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); > + > + if (tscdeadline >= guest_tsc) > + kvm_x86_ops->set_hv_timer(vcpu, tscdeadline - guest_tsc); Does this interact correctly with TSC scaling? IIUC this programs the VMX preemption timer with a delay in guest cycles, rather than host cycles. > + else > + kvm_x86_ops->set_hv_timer(vcpu, 0); > + > + apic->lapic_timer.hv_timer_state = HV_TIMER_ARMED; > + trace_kvm_hv_timer_state(vcpu->vcpu_id, > + apic->lapic_timer.hv_timer_state); > +} > +EXPORT_SYMBOL_GPL(kvm_lapic_arm_hv_timer); > + > +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) > +{ > + struct kvm_lapic *apic = vcpu->arch.apic; > + > + WARN_ON(apic->lapic_timer.hv_timer_state != HV_TIMER_ARMED); > + WARN_ON(swait_active(&vcpu->wq)); > + kvm_x86_ops->cancel_hv_timer(vcpu); > + apic->lapic_timer.hv_timer_state = HV_TIMER_NOT_USED; > + apic_timer_expired(apic); > +} > +EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); > + > +void switch_to_hv_lapic_timer(struct kvm_vcpu *vcpu) > +{ > + struct kvm_lapic *apic = vcpu->arch.apic; > + > + WARN_ON(apic->lapic_timer.hv_timer_state != HV_TIMER_NOT_USED); > + > + if (apic_lvtt_tscdeadline(apic) && > + !atomic_read(&apic->lapic_timer.pending)) { > + hrtimer_cancel(&apic->lapic_timer.timer); > + /* In case the timer triggered in above small window */ > + if (!atomic_read(&apic->lapic_timer.pending)) { > + apic->lapic_timer.hv_timer_state = > + HV_TIMER_NEEDS_ARMING; > + trace_kvm_hv_timer_state(vcpu->vcpu_id, > + apic->lapic_timer.hv_timer_state); > + } > + } > +} > +EXPORT_SYMBOL_GPL(switch_to_hv_lapic_timer); > + > +void switch_to_sw_lapic_timer(struct kvm_vcpu *vcpu) > +{ > + struct kvm_lapic *apic = vcpu->arch.apic; > + > + /* Possibly the TSC deadline timer is not enabled yet */ > + if (apic->lapic_timer.hv_timer_state == HV_TIMER_NOT_USED) > + return; > + > + if (apic->lapic_timer.hv_timer_state == HV_TIMER_ARMED) > + kvm_x86_ops->cancel_hv_timer(vcpu); > + apic->lapic_timer.hv_timer_state = HV_TIMER_NOT_USED; > + > + if (atomic_read(&apic->lapic_timer.pending)) > + return; > + > + /* > + * Don't trigger the apic_timer_expired() for deadlock, > + * because the swake_up() from apic_timer_expired() will > + * try to get the run queue lock, which has been held here > + * since we are in context switch procedure already. > + */ > + start_sw_tscdeadline(apic, 1); > +} > +EXPORT_SYMBOL_GPL(switch_to_sw_lapic_timer); > + > static void start_apic_timer(struct kvm_lapic *apic) > { > ktime_t now; > @@ -1389,7 +1468,19 @@ static void start_apic_timer(struct kvm_lapic *apic) > ktime_to_ns(ktime_add_ns(now, > apic->lapic_timer.period))); > } else if (apic_lvtt_tscdeadline(apic)) { > - start_sw_tscdeadline(apic); > + /* lapic timer in tsc deadline mode */ > + if (kvm_x86_ops->set_hv_timer) { > + if (unlikely(!apic->lapic_timer.tscdeadline || > + !apic->vcpu->arch.virtual_tsc_khz)) > + return; > + > + /* Expired timer will be checked on vcpu_run() */ > + apic->lapic_timer.hv_timer_state = > + HV_TIMER_NEEDS_ARMING; > + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, > + apic->lapic_timer.hv_timer_state); > + } else > + start_sw_tscdeadline(apic, 0); > } > } > > diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h > index 891c6da7d4aa..dc4fd8eea04d 100644 > --- a/arch/x86/kvm/lapic.h > +++ b/arch/x86/kvm/lapic.h > @@ -12,6 +12,12 @@ > #define KVM_APIC_SHORT_MASK 0xc0000 > #define KVM_APIC_DEST_MASK 0x800 > > +enum { > + HV_TIMER_NOT_USED, > + HV_TIMER_NEEDS_ARMING, > + HV_TIMER_ARMED, > +}; > + > struct kvm_timer { > struct hrtimer timer; > s64 period; /* unit: ns */ > @@ -20,6 +26,7 @@ struct kvm_timer { > u64 tscdeadline; > u64 expired_tscdeadline; > atomic_t pending; /* accumulated triggered timers */ > + int hv_timer_state; > }; > > struct kvm_lapic { > @@ -212,4 +219,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, > struct kvm_vcpu **dest_vcpu); > int kvm_vector_to_index(u32 vector, u32 dest_vcpus, > const unsigned long *bitmap, u32 bitmap_size); > +void switch_to_sw_lapic_timer(struct kvm_vcpu *vcpu); > +void switch_to_hv_lapic_timer(struct kvm_vcpu *vcpu); > +void kvm_lapic_arm_hv_timer(struct kvm_vcpu *vcpu); > +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); > #endif > diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h > index 8de925031b5c..8e1b5e9c2e78 100644 > --- a/arch/x86/kvm/trace.h > +++ b/arch/x86/kvm/trace.h > @@ -6,6 +6,7 @@ > #include <asm/svm.h> > #include <asm/clocksource.h> > #include <asm/pvclock-abi.h> > +#include <lapic.h> > > #undef TRACE_SYSTEM > #define TRACE_SYSTEM kvm > @@ -1348,6 +1349,27 @@ TRACE_EVENT(kvm_avic_unaccelerated_access, > __entry->vec) > ); > > +#define kvm_trace_symbol_hv_timer \ > + {HV_TIMER_NOT_USED, "no"}, \ > + {HV_TIMER_NEEDS_ARMING, "need_arming"}, \ > + {HV_TIMER_ARMED, "armed"} > + > +TRACE_EVENT(kvm_hv_timer_state, > + TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_state), > + TP_ARGS(vcpu_id, hv_timer_state), > + TP_STRUCT__entry( > + __field(unsigned int, vcpu_id) > + __field(unsigned int, hv_timer_state) > + ), > + TP_fast_assign( > + __entry->vcpu_id = vcpu_id; > + __entry->hv_timer_state = hv_timer_state; > + ), > + TP_printk("vcpu_id %x hv_timer %s\n", > + __entry->vcpu_id, > + __print_symbolic(__entry->hv_timer_state, > + kvm_trace_symbol_hv_timer)) > + ); > #endif /* _TRACE_KVM_H */ > > #undef TRACE_INCLUDE_PATH > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 2b29afa61715..dc0b8cae02b8 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -7576,6 +7576,11 @@ static int handle_pcommit(struct kvm_vcpu *vcpu) > return 1; > } > > +static int handle_preemption_timer(struct kvm_vcpu *vcpu) > +{ > + kvm_lapic_expired_hv_timer(vcpu); > + return 1; > +} > /* > * The exit handlers return 1 if the exit was handled fully and guest execution > * may resume. Otherwise they set the kvm_run parameter to indicate what needs > @@ -7627,6 +7632,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { > [EXIT_REASON_XRSTORS] = handle_xrstors, > [EXIT_REASON_PML_FULL] = handle_pml_full, > [EXIT_REASON_PCOMMIT] = handle_pcommit, > + [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, > }; > > static const int kvm_vmx_max_exit_handlers = > @@ -8678,6 +8684,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) > if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) > vmx_set_interrupt_shadow(vcpu, 0); > > + kvm_lapic_arm_hv_timer(vcpu); > + > if (vmx->guest_pkru_valid) > __write_pkru(vmx->guest_pkru); > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 269d576520ba..f4b50608568a 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -7724,11 +7724,15 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) > > void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) > { > + if (kvm_x86_ops->set_hv_timer) > + switch_to_hv_lapic_timer(vcpu); > kvm_x86_ops->sched_in(vcpu, cpu); > } > > void kvm_arch_sched_out(struct kvm_vcpu *vcpu) > { > + if (kvm_x86_ops->set_hv_timer) > + switch_to_sw_lapic_timer(vcpu); > } > > int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) > -- > 1.8.3.1 > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html