From: Ben Luo <bn0418@xxxxxxxxx>

In general, a KVM guest programs its tsc-deadline timestamp into the
MSR_IA32_TSC_DEADLINE MSR. Each write causes a VM-exit, after which
KVM handles the timer on the guest's behalf.

With pvtimer, the tsc-deadline timestamp is instead mostly recorded
in a shared page, reducing the number of VM-exits. We introduce a
kthread that runs periodically on a dedicated CPU to scan the shared
page and synchronize the timer settings for the guest.

Signed-off-by: Yang Zhang <yang.zhang.wz@xxxxxxxxx>
Signed-off-by: Quan Xu <quan.xu0@xxxxxxxxx>
Signed-off-by: Ben Luo <bn0418@xxxxxxxxx>
---
 arch/x86/kvm/lapic.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/lapic.h |   5 ++
 2 files changed, 143 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 55c9ba3..20a23bb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -36,6 +36,10 @@
 #include <asm/delay.h>
 #include <linux/atomic.h>
 #include <linux/jump_label.h>
+#include <linux/ktime.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/mmu_context.h>
 #include "kvm_cache_regs.h"
 #include "irq.h"
 #include "trace.h"
@@ -70,6 +74,12 @@
 #define APIC_BROADCAST			0xFF
 #define X2APIC_BROADCAST		0xFFFFFFFFul
 
+static struct hrtimer pv_sync_timer;
+static long pv_timer_period_ns = PVTIMER_PERIOD_NS;
+static struct task_struct *pv_timer_polling_worker;
+
+module_param(pv_timer_period_ns, long, 0644);
+
 static inline int apic_test_vector(int vec, void *bitmap)
 {
 	return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -2542,8 +2552,130 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
 	}
 }
 
+static enum hrtimer_restart pv_sync_timer_callback(struct hrtimer *timer)
+{
+	hrtimer_forward_now(timer, ns_to_ktime(pv_timer_period_ns));
+	wake_up_process(pv_timer_polling_worker);
+
+	return HRTIMER_RESTART;
+}
+
+void kvm_apic_sync_pv_timer(void *data)
+{
+	struct kvm_vcpu *vcpu = data;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	unsigned long flags, this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+	u64 guest_tsc, expire_tsc;
+	long rem_tsc;
+
+	if (!lapic_in_kernel(vcpu) || !pv_timer_enabled(vcpu))
+		return;
+
+	local_irq_save(flags);
+	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+	rem_tsc = ktime_to_ns(hrtimer_get_remaining(&pv_sync_timer))
+			* this_tsc_khz;
+	if (rem_tsc <= 0)
+		rem_tsc += pv_timer_period_ns * this_tsc_khz;
+	do_div(rem_tsc, 1000000L);
+
+	/*
+	 * Make sure guest_tsc and rem_tsc are assigned before
+	 * next_sync_tsc is updated.
+	 */
+	smp_wmb();
+	kvm_xchg_guest_cached(vcpu->kvm, &vcpu->arch.pv_timer.data,
+		offsetof(struct pvtimer_vcpu_event_info, next_sync_tsc),
+		guest_tsc + rem_tsc, 8);
+
+	/* make sure next_sync_tsc is visible */
+	smp_wmb();
+
+	expire_tsc = kvm_xchg_guest_cached(vcpu->kvm, &vcpu->arch.pv_timer.data,
+			offsetof(struct pvtimer_vcpu_event_info, expire_tsc),
+			0UL, 8);
+
+	/* make sure expire_tsc is visible */
+	smp_wmb();
+
+	if (expire_tsc) {
+		if (expire_tsc > guest_tsc)
+			/*
+			 * As we bind this thread to a dedicated CPU through
+			 * IPI, the timer is registered on that dedicated
+			 * CPU here.
+			 */
+			kvm_set_lapic_tscdeadline_msr(apic->vcpu, expire_tsc);
+		else
+			/* deliver immediately if expired */
+			kvm_apic_local_deliver(apic, APIC_LVTT);
+	}
+	local_irq_restore(flags);
+}
+
+static int pv_timer_polling(void *arg)
+{
+	struct kvm *kvm;
+	struct kvm_vcpu *vcpu;
+	int i;
+	mm_segment_t oldfs = get_fs();
+
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			break;
+		}
+
+		spin_lock(&kvm_lock);
+		__set_current_state(TASK_RUNNING);
+		list_for_each_entry(kvm, &vm_list, vm_list) {
+			set_fs(USER_DS);
+			use_mm(kvm->mm);
+			kvm_for_each_vcpu(i, vcpu, kvm) {
+				kvm_apic_sync_pv_timer(vcpu);
+			}
+			unuse_mm(kvm->mm);
+			set_fs(oldfs);
+		}
+
+		spin_unlock(&kvm_lock);
+
+		schedule();
+	}
+
+	return 0;
+}
+
+static void kvm_pv_timer_init(void)
+{
+	ktime_t ktime = ktime_set(0, pv_timer_period_ns);
+
+	hrtimer_init(&pv_sync_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	pv_sync_timer.function = &pv_sync_timer_callback;
+
+	/* kthread for pv_timer sync buffer */
+	pv_timer_polling_worker = kthread_create(pv_timer_polling, NULL,
+						 "pv_timer_polling_worker/%d",
+						 PVTIMER_SYNC_CPU);
+	if (IS_ERR(pv_timer_polling_worker)) {
+		pr_warn_once("kvm: failed to create thread for pv_timer\n");
+		pv_timer_polling_worker = NULL;
+		hrtimer_cancel(&pv_sync_timer);
+
+		return;
+	}
+
+	kthread_bind(pv_timer_polling_worker, PVTIMER_SYNC_CPU);
+	wake_up_process(pv_timer_polling_worker);
+	hrtimer_start(&pv_sync_timer, ktime, HRTIMER_MODE_REL);
+}
+
 void kvm_lapic_init(void)
 {
+	kvm_pv_timer_init();
+
 	/* do not patch jump label more than once per second */
 	jump_label_rate_limit(&apic_hw_disabled, HZ);
 	jump_label_rate_limit(&apic_sw_disabled, HZ);
@@ -2551,6 +2683,12 @@ void kvm_lapic_init(void)
 
 void kvm_lapic_exit(void)
 {
+	if (pv_timer_polling_worker) {
+		hrtimer_cancel(&pv_sync_timer);
+		kthread_stop(pv_timer_polling_worker);
+		pv_timer_polling_worker = NULL;
+	}
+
 	static_key_deferred_flush(&apic_hw_disabled);
 	static_key_deferred_flush(&apic_sw_disabled);
 }
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 539a738..4588d59 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -16,6 +16,9 @@
 #define APIC_BUS_CYCLE_NS	1
 #define APIC_BUS_FREQUENCY	(1000000000ULL / APIC_BUS_CYCLE_NS)
 
+#define PVTIMER_SYNC_CPU	(NR_CPUS - 1)	/* dedicated CPU */
+#define PVTIMER_PERIOD_NS	250000L		/* pvtimer default period */
+
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period;				/* unit: ns */
@@ -213,6 +216,8 @@ static inline bool pv_timer_enabled(struct kvm_vcpu *vcpu)
 	return vcpu->arch.pv_timer.msr_val & KVM_MSR_ENABLED;
 }
 
+void kvm_apic_sync_pv_timer(void *data);
+
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 void wait_lapic_expire(struct kvm_vcpu *vcpu);
 
-- 
1.7.1
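
P.S.: The guest-side counterpart is not part of this patch. For
reviewers, here is a minimal, hypothetical sketch of how a guest might
use the shared page, assuming the struct pvtimer_vcpu_event_info
fields (next_sync_tsc, expire_tsc) operated on above; the per-CPU
buffer and the function name are made up for illustration:

	/*
	 * Hypothetical guest-side sketch (not part of this patch).
	 * Instead of writing MSR_IA32_TSC_DEADLINE directly (one
	 * VM-exit per write), the guest records the deadline in the
	 * shared page and falls back to the MSR only when the deadline
	 * would fire before the host's next scan.
	 */
	static DEFINE_PER_CPU(struct pvtimer_vcpu_event_info, pvtimer_shared_buf);

	static int pvtimer_set_tscdeadline(u64 deadline)
	{
		struct pvtimer_vcpu_event_info *buf =
				this_cpu_ptr(&pvtimer_shared_buf);

		/*
		 * next_sync_tsc is the guest TSC value at which the host
		 * kthread will next scan this page; an earlier deadline
		 * cannot be served from the shared page in time.
		 */
		if (deadline <= READ_ONCE(buf->next_sync_tsc))
			return 0;	/* caller should take the MSR path */

		WRITE_ONCE(buf->expire_tsc, deadline);
		return 1;		/* recorded; no VM-exit needed */
	}

The host side pairs with this through the two kvm_xchg_guest_cached()
calls in kvm_apic_sync_pv_timer(): the kthread publishes the guest TSC
value of its next scan in next_sync_tsc and atomically consumes any
pending expire_tsc, either programming the tsc-deadline timer or
delivering APIC_LVTT immediately if the deadline has already passed.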