PIN (Posted interrupt notification) is useless to host as KVM always syncs pending guest interrupts in PID to guest's vAPIC before each VM entry. In fact, Sending PINs to a CPU running in host will lead to additional overhead due to interrupt handling. Currently, software path, vmx_deliver_posted_interrupt(), is optimized to issue PINs only if target vCPU is in IN_GUEST_MODE. But hardware paths (VT-d and Intel IPI virtualization) aren't optimized. Suppress notification unless notification is necessary (when vCPU is blocked or running in guest). Specifically, set PID.SN when kvm sets OUTSIDE_GUEST_MODE and vCPUs wake up from blocking, and clear PID.SN when kvm sets IN_GUEST_MODE and vCPUs starts to block. Also, don't toggle PID.SN at any other places, e.g., vmx_vcpu_pi_load(). To allow x86 common code to toggle SN bit when kvm sets vcpu->mode, a new kvm_x86_ops is added. When IPI virtualization is enabled, this patch increases "perf bench" [*] by 8.10%, and PIN count in 1 second drops from tens of thousands to tens. But cpuid loop test shows this patch causes 0.94% overhead in VM-exit round-trip latency. [*] test cmd: perf bench sched pipe -T. Note that we change the source code to pin two threads to two different vCPUs so that it can reproduce stable results. Signed-off-by: Chao Gao <chao.gao@xxxxxxxxx> --- Changes in v3: - add a dedicated hook to toggle SN - clear/set SN when kvm sets IN/OUTSIDE_GUEST_MODE - use kvm_arch_vcpu_{un}blocking instead of vcpu_put/load to set/clear SN for blocked vCPUs - recollect perf data and update the figures in commit message arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/posted_intr.c | 63 +++++++++++++++++------------- arch/x86/kvm/vmx/posted_intr.h | 1 + arch/x86/kvm/vmx/vmx.c | 22 ++++++++++- arch/x86/kvm/x86.c | 20 ++++++++++ 6 files changed, 80 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 51f777071584..a2d77302d7a8 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -86,6 +86,7 @@ KVM_X86_OP_OPTIONAL(set_virtual_apic_mode) KVM_X86_OP_OPTIONAL(set_apic_access_page_addr) KVM_X86_OP(deliver_interrupt) KVM_X86_OP_OPTIONAL(sync_pir_to_irr) +KVM_X86_OP_OPTIONAL(pi_suppress_notification) KVM_X86_OP_OPTIONAL_RET0(set_tss_addr) KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2c96c43c313a..66d67b198021 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1543,6 +1543,7 @@ struct kvm_x86_ops { void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode, int trig_mode, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); + void (*pi_suppress_notification)(struct kvm_vcpu *vcpu, bool suppress); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 1b56c5e5c9fb..ae1794a213d7 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -48,6 +48,38 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) return 0; } +/* + * Toggle Suppress Notification (SN) bit in PID. + * + * Note that after clearing SN, the check of PIR and setting ON bit + * ensure pending IRQs in PIR are not ignored. + */ +void vmx_pi_suppress_notification(struct kvm_vcpu *vcpu, bool suppress) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (suppress) { + pi_set_sn(pi_desc); + } else { + pi_clear_sn(pi_desc); + /* + * Clear SN before reading the bitmap. The VT-d firmware + * writes the bitmap and reads SN atomically (5.2.3 in the + * spec), so it doesn't really have a memory barrier that + * pairs with this, but we cannot do that and we need one. + */ + smp_mb__after_atomic(); + + /* + * ON may be out of sync with PIR. Set ON if there are + * pending IRQs in PIR to ensure subsequent + * ->sync_pir_to_irr() won't leave the pending IRQs behind. + */ + if (!pi_test_on(pi_desc) && !pi_is_pir_empty(pi_desc)) + pi_set_on(pi_desc); + } +} + void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -69,15 +101,8 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * full update can be skipped as neither the vector nor the destination * needs to be changed. */ - if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { - /* - * Clear SN if it was set due to being preempted. Again, do - * this even if there is no assigned device for simplicity. - */ - if (pi_test_and_clear_sn(pi_desc)) - goto after_clear_sn; + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) return; - } local_irq_save(flags); @@ -101,11 +126,10 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.control = old.control; /* - * Clear SN (as above) and refresh the destination APIC ID to - * handle task migration (@cpu != vcpu->cpu). + * Refresh the destination APIC ID to handle task migration + * (@cpu != vcpu->cpu). */ new.ndst = dest; - new.sn = 0; /* * Restore the notification vector; in the blocking case, the @@ -115,19 +139,6 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) } while (pi_try_set_control(pi_desc, &old.control, new.control)); local_irq_restore(flags); - -after_clear_sn: - - /* - * Clear SN before reading the bitmap. The VT-d firmware - * writes the bitmap and reads SN atomically (5.2.3 in the - * spec), so it doesn't really have a memory barrier that - * pairs with this, but we cannot do that and we need one. - */ - smp_mb__after_atomic(); - - if (!pi_is_pir_empty(pi_desc)) - pi_set_on(pi_desc); } static bool vmx_can_use_vtd_pi(struct kvm *kvm) @@ -193,8 +204,6 @@ static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!vmx_needs_pi_wakeup(vcpu)) return; @@ -207,7 +216,7 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) * its wait state and manually scheduling out. */ if (vcpu->preempted) - pi_set_sn(pi_desc); + vmx_pi_suppress_notification(vcpu, true); } /* diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 26992076552e..9232b87bf63e 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -102,5 +102,6 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void vmx_pi_start_assignment(struct kvm *kvm); +void vmx_pi_suppress_notification(struct kvm_vcpu *vcpu, bool suppress); #endif /* __KVM_X86_VMX_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c9b49a09e6b5..084894c50b64 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6697,6 +6697,16 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) vmx_set_rvi(max_irr); } +static void vmx_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + vmx_pi_suppress_notification(vcpu, false); +} + +static void vmx_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + vmx_pi_suppress_notification(vcpu, true); +} + static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6706,7 +6716,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; - if (pi_test_on(&vmx->pi_desc)) { + /* + * Check PIR if ON=1 || SN=1. This is required to avoid breaking + * halt-polling (and other call sites with SN=1). Notification + * during halt-polling is undesired. But VT-d engine and IPI + * virtualization don't set ON if SN=1. To ensure KVM can detect + * pending IRQs in time, check PIR as well if SN=1. + */ + if (pi_test_on(&vmx->pi_desc) || pi_test_sn(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* * IOMMU can write to PID.ON, so the barrier matters even on UP. @@ -8030,6 +8047,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .prepare_switch_to_guest = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, + .vcpu_blocking = vmx_vcpu_blocking, + .vcpu_unblocking = vmx_vcpu_unblocking, .update_exception_bitmap = vmx_update_exception_bitmap, .get_msr_feature = vmx_get_msr_feature, @@ -8089,6 +8108,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .hwapic_isr_update = vmx_hwapic_isr_update, .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, + .pi_suppress_notification = vmx_pi_suppress_notification, .deliver_interrupt = vmx_deliver_interrupt, .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d7374d768296..ce744e840423 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10426,6 +10426,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* Store vcpu->apicv_active before vcpu->mode. */ smp_store_release(&vcpu->mode, IN_GUEST_MODE); + /* + * Notification are necessary to trigger the delivery of posted + * IRQs when vCPU is running in guest. Enable notification before + * guest entry. + * + * Do this even if apicv is disabled for simplicity. + */ + if (kvm_lapic_enabled(vcpu)) + static_call_cond(kvm_x86_pi_suppress_notification)(vcpu, false); kvm_vcpu_srcu_read_unlock(vcpu); @@ -10538,6 +10547,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + /* + * Suppress notification when CPU is OUTSIDE_GUEST_MODE to avoid + * wasting time on handling interrupts. A notification to host/kvm + * just indicates some interrupts are posted for a vCPU. Since KVM + * always syncs pending interrupts in PIR to vAPIC IRR before guest + * entry (in ->sync_pir_to_irr()), notification isn't needed. + * + * Do this even if apicv is disabled for simplicity. + */ + if (kvm_lapic_enabled(vcpu)) + static_call_cond(kvm_x86_pi_suppress_notification)(vcpu, true); /* * Sync xfd before calling handle_exit_irqoff() which may base-commit: 372d07084593dc7a399bf9bee815711b1fb1bcf2 -- 2.25.1