From: Ben Luo <bn0418@xxxxxxxxx> In general, a KVM guest programs the tsc-deadline timestamp to the MSR_IA32_TSC_DEADLINE MSR. When pvtimer is enabled, we introduce a new mechanism to reprogram the KVM guest timer. A periodically running kthread scans the shared page and synchronizes the timer setting for the guest on a dedicated CPU. The next event time of this periodically running kthread is used as a threshold to decide whether to program the tsc-deadline timestamp to the MSR_IA32_TSC_DEADLINE MSR or to the shared page. Signed-off-by: Yang Zhang <yang.zhang.wz@xxxxxxxxx> Signed-off-by: Quan Xu <quan.xu0@xxxxxxxxx> Signed-off-by: Ben Luo <bn0418@xxxxxxxxx> --- arch/x86/include/asm/kvm_para.h | 9 +++++++++ arch/x86/kernel/apic/apic.c | 9 ++++++--- arch/x86/kernel/kvm.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c373e44..109e706 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -5,6 +5,7 @@ #include <asm/processor.h> #include <asm/alternative.h> #include <uapi/asm/kvm_para.h> +#include <linux/hrtimer.h> extern void kvmclock_init(void); extern int kvm_register_clock(char *txt); @@ -92,6 +93,8 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); +int kvm_pv_timer_next_event(unsigned long tsc, + struct clock_event_device *evt); extern void kvm_disable_steal_time(void); #ifdef CONFIG_PARAVIRT_SPINLOCKS @@ -126,6 +129,12 @@ static inline void kvm_disable_steal_time(void) { return; } + +static inline int kvm_pv_timer_next_event(unsigned long tsc, + struct clock_event_device *evt) +{ + return 0; +} #endif #endif /* _ASM_X86_KVM_PARA_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ff89177..286c1b3 100644 --- a/arch/x86/kernel/apic/apic.c +++ 
b/arch/x86/kernel/apic/apic.c
@@ -471,10 +471,18 @@ static int lapic_next_event(unsigned long delta,
 static int lapic_next_deadline(unsigned long delta,
 			       struct clock_event_device *evt)
 {
-	u64 tsc;
+	u64 tsc = rdtsc() + (((u64) delta) * TSC_DIVISOR);
 
-	tsc = rdtsc();
-	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
+	/*
+	 * A non-zero return means kvm_pv_timer_next_event() accepted the
+	 * deadline, so the MSR write below is skipped.
+	 *
+	 * TODO: undisciplined function call
+	 */
+	if (kvm_pv_timer_next_event(tsc, evt))
+		return 0;
+
+	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc);
 	return 0;
 }
 
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8bb9594..ec7aff1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -328,6 +328,58 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
 	apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
 }
 
+/* Set after this CPU has written MSR_KVM_PV_TIMER_EN. */
+static DEFINE_PER_CPU(int, pvtimer_enabled);
+/* Per-CPU buffer whose physical address is handed to the host. */
+static DEFINE_PER_CPU(struct pvtimer_vcpu_event_info, pvtimer_shared_buf);
+
+/*
+ * A deadline closer to the current TSC than this is not left to the shared
+ * page. NOTE(review): the patch does not say how 25000 was chosen - confirm.
+ */
+#define PVTIMER_PADDING		25000
+
+/*
+ * kvm_pv_timer_next_event - try to arm the next timer event via pvtimer
+ * @tsc: absolute TSC deadline of the event
+ * @evt: clock event device being programmed (not used here)
+ *
+ * Publishes @tsc in this CPU's shared buffer, then decides whether that is
+ * enough.
+ *
+ * Return: non-zero if the caller may skip its own timer programming; 0 if
+ * pvtimer is not enabled on this CPU, @tsc is earlier than next_sync_tsc, or
+ * @tsc is already in the past or less than PVTIMER_PADDING away.
+ */
+int kvm_pv_timer_next_event(unsigned long tsc,
+		struct clock_event_device *evt)
+{
+	struct pvtimer_vcpu_event_info *src;
+	u64 now;
+
+	if (!this_cpu_read(pvtimer_enabled))
+		return 0;
+
+	src = this_cpu_ptr(&pvtimer_shared_buf);
+
+	/*
+	 * xchg() is fully ordered, so the deadline is published before
+	 * next_sync_tsc is sampled below.
+	 */
+	xchg((u64 *)&src->expire_tsc, tsc);
+
+	/* Presumably updated by the host side; read it exactly once. */
+	if (tsc < READ_ONCE(src->next_sync_tsc))
+		return 0;
+
+	now = rdtsc();
+	if (tsc < now || tsc - now < PVTIMER_PADDING)
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(kvm_pv_timer_next_event);
+
 static void kvm_guest_cpu_init(void)
 {
 	if (!kvm_para_available())
@@ -362,6 +414,16 @@ static void kvm_guest_cpu_init(void)
 
 	if (has_steal_clock)
 		kvm_register_steal_time();
+
+	if (kvm_para_has_feature(KVM_FEATURE_PV_TIMER)) {
+		u64 data;
+
+		data = slow_virt_to_phys(this_cpu_ptr(&pvtimer_shared_buf))
+			| KVM_MSR_ENABLED;
+		wrmsrl(MSR_KVM_PV_TIMER_EN, data);
+		/* Flag the buffer as registered only after the MSR write. */
+		this_cpu_write(pvtimer_enabled, 1);
+	}
 }
 
 static void kvm_pv_disable_apf(void)
-- 
1.7.1