> On 21 Nov 2019, at 1:36, Liran Alon <liran.alon@xxxxxxxxxx> wrote:
>
>
>
>> On 20 Nov 2019, at 5:42, Wanpeng Li <kernellwp@xxxxxxxxx> wrote:
>>
>> From: Wanpeng Li <wanpengli@xxxxxxxxxxx>
>>
>> In our product observations, ICR and TSCDEADLINE MSR writes cause the
>> majority of MSR-write vmexits; multicast IPIs are not as common as
>> unicast IPIs like RESCHEDULE_VECTOR and CALL_FUNCTION_SINGLE_VECTOR etc.
>
> Have you also had the chance to measure non-Linux workloads, such as Windows?
>
>>
>> This patch tries to optimize the x2apic physical destination mode,
>> fixed delivery mode, single target IPI case. The fast path is invoked
>> at ->handle_exit_irqoff() to emulate only the effect of the ICR write
>> itself, i.e. the sending of IPIs. Sending IPIs early in the VM-Exit
>> flow reduces the latency of virtual IPIs by avoiding the expensive bits
>> of transitioning from guest to host, e.g. reacquiring KVM's SRCU lock.
>> Especially when the guest is run with the KVM_CAP_X86_DISABLE_EXITS
>> capability enabled, or the target vCPU otherwise keeps running, the IPI
>> can be injected into it by posted-interrupt immediately.
>
> May I suggest an alternative phrasing? Something such as:
>
> “””
> This patch introduces a mechanism to handle certain performance-critical
> WRMSRs at a very early stage of the KVM VMExit handler.
>
> This mechanism is specifically used to accelerate writes to the x2APIC ICR
> that attempt to send a virtual IPI with physical destination-mode, fixed
> delivery-mode and a single target, which was found to be one of the main
> causes of VMExits for Linux workloads.
>
> This mechanism significantly reduces the latency of such virtual IPIs
> because the physical IPI is sent to the target vCPU at a very early stage
> of the KVM VMExit handler, before host interrupts are enabled and before
> expensive operations such as reacquiring KVM’s SRCU lock.
> Latency is reduced even further when KVM is able to use the APICv
> posted-interrupt mechanism (which allows delivering the virtual IPI
> directly to the target vCPU without the need to kick it to the host).
> “””
>
>>
>> Testing on Xeon Skylake server:
>>
>> The virtual IPI latency, from the sender sending to the receiver
>> receiving, is reduced by more than 200 cpu cycles.
>>
>> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
>> Cc: Radim Krčmář <rkrcmar@xxxxxxxxxx>
>> Cc: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
>> Cc: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
>> Cc: Liran Alon <liran.alon@xxxxxxxxxx>
>> Signed-off-by: Wanpeng Li <wanpengli@xxxxxxxxxxx>
>
> I see you used the code I provided in my reply to v2. :)
> I had only some very minor comments inline below. Therefore:
> Reviewed-by: Liran Alon <liran.alon@xxxxxxxxxx>

Oh, there is a small thing pointed out by Sean which I agree with (and missed
in my review) that should be addressed before this Reviewed-by can be given.

You should avoid performing accelerated WRMSR handling in case the vCPU is in
guest-mode. (For obvious reasons, as L1 for example could intercept this
WRMSR…) See the rough sketch at the bottom of this mail for one way to do it.

-Liran

>
> Thanks for doing this optimisation.
> -Liran
>
>> ---
>> v2 -> v3:
>> * for both VMX and SVM
>> * vmx_handle_exit() gets second parameter by value and not by pointer
>> * rename parameter to “accel_exit_completion”
>> * preserve tracepoint ordering
>> * rename handler to handle_accel_set_msr_irqoff and make it more generic
>> * add comments above handle_accel_set_msr_irqoff
>> * msr index APIC_BASE_MSR + (APIC_ICR >> 4)
>> v1 -> v2:
>> * add tracepoint
>> * Instead of a separate vcpu->fast_vmexit, set vmx->exit_reason
>> to -1 if the fast path succeeds.
>> * move the "kvm_skip_emulated_instruction(vcpu)" to vmx_handle_exit
>> * moving the handling into vmx_handle_exit_irqoff()
>>
>> arch/x86/include/asm/kvm_host.h | 11 ++++++++--
>> arch/x86/kvm/svm.c              | 14 ++++++++----
>> arch/x86/kvm/vmx/vmx.c          | 13 ++++++++---
>> arch/x86/kvm/x86.c              | 48 +++++++++++++++++++++++++++++++++++++++--
>> arch/x86/kvm/x86.h              |  1 +
>> 5 files changed, 76 insertions(+), 11 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 898ab9e..67c7889 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -175,6 +175,11 @@ enum {
>> VCPU_SREG_LDTR,
>> };
>>
>> +enum accel_exit_completion {
>> + ACCEL_EXIT_NONE,
>> + ACCEL_EXIT_SKIP_EMUL_INS = -1,
>
> You should remove the “= -1”.
>
>> +};
>> +
>> #include <asm/kvm_emulate.h>
>>
>> #define KVM_NR_MEM_OBJS 40
>> @@ -1084,7 +1089,8 @@ struct kvm_x86_ops {
>> void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
>>
>> void (*run)(struct kvm_vcpu *vcpu);
>> - int (*handle_exit)(struct kvm_vcpu *vcpu);
>> + int (*handle_exit)(struct kvm_vcpu *vcpu,
>> + enum accel_exit_completion accel_exit);
>> int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
>> void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
>> u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
>> @@ -1134,7 +1140,8 @@ struct kvm_x86_ops {
>> int (*check_intercept)(struct kvm_vcpu *vcpu,
>> struct x86_instruction_info *info,
>> enum x86_intercept_stage stage);
>> - void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
>> + void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu,
>> + enum accel_exit_completion *accel_exit);
>> bool (*mpx_supported)(void);
>> bool (*xsaves_supported)(void);
>> bool (*umip_emulated)(void);
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index d02a73a..060b11d 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -4929,7 +4929,8 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>> *info2 = control->exit_info_2;
>> }
>>
>> -static int handle_exit(struct kvm_vcpu *vcpu)
>> +static int handle_exit(struct kvm_vcpu *vcpu,
>> + enum accel_exit_completion accel_exit)
>> {
>> struct vcpu_svm *svm = to_svm(vcpu);
>> struct kvm_run *kvm_run = vcpu->run;
>> @@ -4987,7 +4988,10 @@ static int handle_exit(struct kvm_vcpu *vcpu)
>> __func__, svm->vmcb->control.exit_int_info,
>> exit_code);
>>
>> - if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
>> + if (accel_exit == ACCEL_EXIT_SKIP_EMUL_INS) {
>> + kvm_skip_emulated_instruction(vcpu);
>> + return 1;
>> + } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
>> || !svm_exit_handlers[exit_code]) {
>> vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
>> dump_vmcb(vcpu);
>> @@ -6187,9 +6191,11 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
>> return ret;
>> }
>>
>> -static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
>> +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu,
>> + enum accel_exit_completion *accel_exit)
>> {
>> -
>> + if (to_svm(vcpu)->vmcb->control.exit_code == EXIT_REASON_MSR_WRITE)
>> + *accel_exit = handle_accel_set_msr_irqoff(vcpu);
>> }
>>
>> static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
>> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
>> index 621142e5..86c0a23 100644
>> --- a/arch/x86/kvm/vmx/vmx.c
>> +++ b/arch/x86/kvm/vmx/vmx.c
>> @@ -5792,7 +5792,8 @@ void dump_vmcs(void)
>> * The guest has exited. See if we can fix it or if we need userspace
>> * assistance.
>> */
>> -static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>> +static int vmx_handle_exit(struct kvm_vcpu *vcpu,
>> + enum accel_exit_completion accel_exit)
>> {
>> struct vcpu_vmx *vmx = to_vmx(vcpu);
>> u32 exit_reason = vmx->exit_reason;
>> @@ -5878,7 +5879,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>> }
>> }
>>
>> - if (exit_reason < kvm_vmx_max_exit_handlers
>> + if (accel_exit == ACCEL_EXIT_SKIP_EMUL_INS) {
>> + kvm_skip_emulated_instruction(vcpu);
>> + return 1;
>> + } else if (exit_reason < kvm_vmx_max_exit_handlers
>> && kvm_vmx_exit_handlers[exit_reason]) {
>> #ifdef CONFIG_RETPOLINE
>> if (exit_reason == EXIT_REASON_MSR_WRITE)
>> @@ -6223,7 +6227,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
>> }
>> STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
>>
>> -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
>> +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
>> + enum accel_exit_completion *accel_exit)
>> {
>> struct vcpu_vmx *vmx = to_vmx(vcpu);
>>
>> @@ -6231,6 +6236,8 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
>> handle_external_interrupt_irqoff(vcpu);
>> else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
>> handle_exception_nmi_irqoff(vmx);
>> + else if (vmx->exit_reason == EXIT_REASON_MSR_WRITE)
>> + *accel_exit = handle_accel_set_msr_irqoff(vcpu);
>> }
>>
>> static bool vmx_has_emulated_msr(int index)
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 991dd01..966c659 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -1509,6 +1509,49 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
>> }
>> EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
>>
>> +static int handle_accel_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
>> +{
>> + if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) &&
>> + ((data & KVM_APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
>> + ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
>> +
>> + kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
>> + return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
>> + }
>> +
>> + return 1;
>> +}
>> +
>> +/*
>> + * The fast path for frequent and performance sensitive wrmsr emulation,
>> + * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
>> + * the latency of virtual IPI by avoiding the expensive bits of transitioning
>> + * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
>> + * other cases which must be called after interrupts are enabled on the host.
>> + */
>
> This comment belongs better on top of handle_accel_set_x2apic_icr_irqoff(),
> as handle_accel_set_msr_irqoff() is in theory written so it could be used for other MSRs as well.
>
>> +enum accel_exit_completion handle_accel_set_msr_irqoff(struct kvm_vcpu *vcpu)
>> +{
>> + u32 msr = kvm_rcx_read(vcpu);
>> + u64 data = kvm_read_edx_eax(vcpu);
>> + int ret = 0;
>> +
>> + switch (msr) {
>> + case APIC_BASE_MSR + (APIC_ICR >> 4):
>> + ret = handle_accel_set_x2apic_icr_irqoff(vcpu, data);
>> + break;
>> + default:
>> + return ACCEL_EXIT_NONE;
>> + }
>> +
>> + if (!ret) {
>> + trace_kvm_msr_write(msr, data);
>> + return ACCEL_EXIT_SKIP_EMUL_INS;
>> + }
>> +
>> + return ACCEL_EXIT_NONE;
>> +}
>> +EXPORT_SYMBOL_GPL(handle_accel_set_msr_irqoff);
>> +
>> /*
>> * Adapt set_msr() to msr_io()'s calling convention
>> */
>> @@ -7984,6 +8027,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>> bool req_int_win =
>> dm_request_for_irq_injection(vcpu) &&
>> kvm_cpu_accept_dm_intr(vcpu);
>> + enum accel_exit_completion accel_exit = ACCEL_EXIT_NONE;
>>
>> bool req_immediate_exit = false;
>>
>> @@ -8226,7 +8270,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>> vcpu->mode = OUTSIDE_GUEST_MODE;
>> smp_wmb();
>>
>> - kvm_x86_ops->handle_exit_irqoff(vcpu);
>> + kvm_x86_ops->handle_exit_irqoff(vcpu, &accel_exit);
>>
>> /*
>> * Consume any pending interrupts, including the possible source of
>> @@ -8270,7 +8314,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>> kvm_lapic_sync_from_vapic(vcpu);
>>
>> vcpu->arch.gpa_available = false;
>> - r = kvm_x86_ops->handle_exit(vcpu);
>> + r = kvm_x86_ops->handle_exit(vcpu, accel_exit);
>> return r;
>>
>> cancel_injection:
>> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
>> index 29391af..f14ec14 100644
>> --- a/arch/x86/kvm/x86.h
>> +++ b/arch/x86/kvm/x86.h
>> @@ -291,6 +291,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
>> bool kvm_vector_hashing_enabled(void);
>> int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
>> int emulation_type, void *insn, int insn_len);
>> +enum accel_exit_completion handle_accel_set_msr_irqoff(struct kvm_vcpu *vcpu);
>>
>> #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
>> | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
>> --
>> 2.7.4
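
For clarity, below is a rough, untested sketch of the guest-mode bail-out mentioned
above. It only adds an is_guest_mode() check at the top of the
handle_accel_set_msr_irqoff() introduced by this patch; whether the check belongs here
or in the per-vendor handle_exit_irqoff() callbacks is of course up to Wanpeng.

/*
 * Sketch only: never take the accelerated WRMSR path while the vCPU is in
 * guest-mode, so a WRMSR that L1 may be intercepting still goes through the
 * normal emulation path instead of being completed on L1's behalf.
 */
enum accel_exit_completion handle_accel_set_msr_irqoff(struct kvm_vcpu *vcpu)
{
	u32 msr = kvm_rcx_read(vcpu);
	u64 data = kvm_read_edx_eax(vcpu);
	int ret = 0;

	if (is_guest_mode(vcpu))
		return ACCEL_EXIT_NONE;

	switch (msr) {
	case APIC_BASE_MSR + (APIC_ICR >> 4):
		ret = handle_accel_set_x2apic_icr_irqoff(vcpu, data);
		break;
	default:
		return ACCEL_EXIT_NONE;
	}

	if (!ret) {
		trace_kvm_msr_write(msr, data);
		return ACCEL_EXIT_SKIP_EMUL_INS;
	}

	return ACCEL_EXIT_NONE;
}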