From: Wanpeng Li <wanpengli@xxxxxxxxxxx> Make lapic timer unpinned when timer is injected by posted-interrupt, the emulated timer can be offload to the housekeeping cpus. This patch introduces a new kvm module parameter, it is false by default. The host admin can enable it after fine tuned, e.g. dedicated instances scenario w/ nohz_full cover the pCPUs which vCPUs resident, several pCPUs surplus for housekeeping, disable mwait/hlt/pause vmexits to occupy the pCPUs, fortunately preemption timer is disabled after mwait is exposed to guest which makes emulated timer offload can be possible. Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx> Cc: Radim Krčmář <rkrcmar@xxxxxxxxxx> Signed-off-by: Wanpeng Li <wanpengli@xxxxxxxxxxx> --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/lapic.c | 20 ++++++++++++++++---- arch/x86/kvm/svm.c | 5 +++++ arch/x86/kvm/vmx/vmx.c | 9 +++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aeadbc7..ccb3d61 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1090,6 +1090,7 @@ struct kvm_x86_ops { void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); + bool (*pi_inject_timer_enabled)(struct kvm_vcpu *vcpu); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fcf42a3..8c9c14d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -127,6 +127,12 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) return apic->vcpu->vcpu_id; } +static inline bool posted_interrupt_inject_timer(struct kvm_vcpu *vcpu) +{ + return (kvm_x86_ops->pi_inject_timer_enabled(vcpu) && + kvm_mwait_in_guest(vcpu->kvm)); +} + static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->mode) { @@ -1581,7 +1587,9 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) likely(ns > apic->lapic_timer.timer_advance_ns)) { expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); - hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED); + hrtimer_start(&ktimer->timer, expire, + posted_interrupt_inject_timer(vcpu) ? + HRTIMER_MODE_ABS : HRTIMER_MODE_ABS_PINNED); } else apic_timer_expired(apic); @@ -1683,7 +1691,8 @@ static void start_sw_period(struct kvm_lapic *apic) hrtimer_start(&apic->lapic_timer.timer, apic->lapic_timer.target_expiration, - HRTIMER_MODE_ABS_PINNED); + posted_interrupt_inject_timer(apic->vcpu) ? + HRTIMER_MODE_ABS : HRTIMER_MODE_ABS_PINNED); } bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) @@ -2320,7 +2329,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED); + posted_interrupt_inject_timer(vcpu) ? + HRTIMER_MODE_ABS : HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { apic->lapic_timer.timer_advance_ns = 1000; @@ -2509,7 +2519,9 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(timer, + posted_interrupt_inject_timer(vcpu) ? + HRTIMER_MODE_ABS : HRTIMER_MODE_ABS_PINNED); } /* diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 302cb40..aee1d91 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5173,6 +5173,11 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_vcpu_wake_up(vcpu); } +static inline bool svm_posted_interrupt_inject_timer(struct kvm_vcpu *vcpu) +{ + return false; +} + static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) { unsigned long flags; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index da24f18..2b4fd61 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -101,6 +101,9 @@ module_param(fasteoi, bool, S_IRUGO); static bool __read_mostly enable_apicv = 1; module_param(enable_apicv, bool, S_IRUGO); +bool __read_mostly pi_inject_timer = 0; +module_param(pi_inject_timer, bool, S_IRUGO); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -3724,6 +3727,11 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) kvm_vcpu_kick(vcpu); } +static bool vmx_pi_inject_timer_enabled(struct kvm_vcpu *vcpu) +{ + return pi_inject_timer && kvm_vcpu_apicv_active(vcpu); +} + /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -7671,6 +7679,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .pi_inject_timer_enabled = vmx_pi_inject_timer_enabled, .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, -- 2.7.4