On entry to the guest, secondary threads now wait for the primary to switch the MMU after loading up most of their state, rather than before. This means that the secondary threads get into the guest sooner, in the common case where the secondary threads get to kvmppc_hv_entry before the primary thread. On exit, the first thread out increments the exit count and interrupts the other threads (to get them out of the guest) before saving most of its state, rather than after. That means that the other threads exit sooner and means that the first thread doesn't spend so much time waiting for the other threads at the point where the MMU gets switched back to the host. This pulls out the code that increments the exit count and interrupts other threads into a separate function, kvmhv_commence_exit(). This also makes sure that r12 and vcpu->arch.trap are set correctly in some corner cases. Statistics from /sys/kernel/debug/kvm/vm*/vcpu*/timings show the improvement. Aggregating across vcpus for a guest with 32 vcpus, 8 threads/vcore, running on a POWER8, gives this before the change: rm_entry: avg 3919.3ns (244 - 56492, 742665 samples) rm_exit: avg 4102.5ns (130 - 36272, 704056 samples) rm_intr: avg 1006.0ns (12 - 75040, 2819905 samples) and this after the change: rm_entry: avg 2979.8ns (258 - 83740, 836403 samples) rm_exit: avg 3992.9ns (12 - 45572, 838034 samples) rm_intr: avg 922.2ns (12 - 66694, 3127066 samples) showing a substantial reduction in the time spent in the real-mode guest entry code, and smaller reductions in the real mode guest exit and interrupt handling times. (The test was to start the guest and boot Fedora 20 big-endian to the login prompt.) Signed-off-by: Paul Mackerras <paulus@xxxxxxxxx> --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 240 +++++++++++++++++++------------- 1 file changed, 141 insertions(+), 99 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 04728ce..ff1461d 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -175,6 +175,19 @@ kvmppc_primary_no_guest: /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ mfspr r3, SPRN_HDEC mtspr SPRN_DEC, r3 + /* + * Make sure the primary has finished the MMU switch. + * We should never get here on a secondary thread, but + * check it for robustness' sake. + */ + ld r5, HSTATE_KVM_VCORE(r13) +65: lbz r0, VCORE_IN_GUEST(r5) + cmpwi r0, 0 + beq 65b + /* Set LPCR. */ + ld r8,VCORE_LPCR(r5) + mtspr SPRN_LPCR,r8 + isync /* set our bit in napping_threads */ ld r5, HSTATE_KVM_VCORE(r13) lbz r7, HSTATE_PTID(r13) @@ -206,7 +219,7 @@ kvm_novcpu_wakeup: /* check the wake reason */ bl kvmppc_check_wake_reason - + /* see if any other thread is already exiting */ lwz r0, VCORE_ENTRY_EXIT(r5) cmpwi r0, 0x100 @@ -243,7 +256,12 @@ kvm_novcpu_wakeup: kvm_novcpu_exit: ld r4, HSTATE_KVM_VCPU(r13) - b hdec_soon + cmpdi r4, 0 + beq 13f + addi r3, r4, VCPU_TB_RMEXIT + bl kvmhv_accumulate_time +13: bl kvmhv_commence_exit + b kvmhv_switch_to_host /* * We come in here when wakened from nap mode. @@ -417,7 +435,7 @@ kvmppc_hv_entry: ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ lbz r6,HSTATE_PTID(r13) cmpwi r6,0 - bne 20f + bne 10f ld r6,KVM_SDR1(r9) lwz r7,KVM_LPID(r9) li r0,LPID_RSVD /* switch to reserved LPID */ @@ -488,26 +506,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) li r0,1 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ - b 10f - - /* Secondary threads wait for primary to have done partition switch */ -20: lbz r0,VCORE_IN_GUEST(r5) - cmpwi r0,0 - beq 20b - - /* Set LPCR. */ -10: ld r8,VCORE_LPCR(r5) - mtspr SPRN_LPCR,r8 - isync - - /* Check if HDEC expires soon */ - mfspr r3,SPRN_HDEC - cmpwi r3,512 /* 1 microsecond */ - li r12,BOOK3S_INTERRUPT_HV_DECREMENTER - blt hdec_soon /* Do we have a guest vcpu to run? */ - cmpdi r4, 0 +10: cmpdi r4, 0 beq kvmppc_primary_no_guest kvmppc_got_guest: @@ -832,6 +833,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) clrrdi r6,r6,1 mtspr SPRN_CTRLT,r6 4: + /* Secondary threads wait for primary to have done partition switch */ + ld r5, HSTATE_KVM_VCORE(r13) + lbz r6, HSTATE_PTID(r13) + cmpwi r6, 0 + beq 21f + lbz r0, VCORE_IN_GUEST(r5) + cmpwi r0, 0 + bne 21f + HMT_LOW +20: lbz r0, VCORE_IN_GUEST(r5) + cmpwi r0, 0 + beq 20b + HMT_MEDIUM +21: + /* Set LPCR. */ + ld r8,VCORE_LPCR(r5) + mtspr SPRN_LPCR,r8 + isync + + /* Check if HDEC expires soon */ + mfspr r3, SPRN_HDEC + cmpwi r3, 512 /* 1 microsecond */ + blt hdec_soon + ld r6, VCPU_CTR(r4) lwz r7, VCPU_XER(r4) @@ -936,18 +961,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) b . secondary_too_late: + li r12, 0 cmpdi r4, 0 beq 11f + stw r12, VCPU_TRAP(r4) addi r3, r4, VCPU_TB_RMEXIT bl kvmhv_accumulate_time 11: b kvmhv_switch_to_host hdec_soon: - cmpdi r4, 0 - beq 12f + li r12, BOOK3S_INTERRUPT_HV_DECREMENTER + stw r12, VCPU_TRAP(r4) + mr r9, r4 addi r3, r4, VCPU_TB_RMEXIT bl kvmhv_accumulate_time -12: b kvmhv_do_exit + b guest_exit_cont /****************************************************************************** * * @@ -1108,7 +1136,7 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ stw r7, VCPU_DSISR(r9) /* don't overwrite fault_dar/fault_dsisr if HDSI */ cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE - beq 6f + beq mc_cont std r6, VCPU_FAULT_DAR(r9) stw r7, VCPU_FAULT_DSISR(r9) @@ -1120,8 +1148,11 @@ mc_cont: mr r4, r9 bl kvmhv_accumulate_time + /* Increment exit count, poke other threads to exit */ + bl kvmhv_commence_exit + /* Save guest CTRL register, set runlatch to 1 */ -6: mfspr r6,SPRN_CTRLF + mfspr r6,SPRN_CTRLF stw r6,VCPU_CTRL(r9) andi. r0,r6,1 bne 4f @@ -1463,83 +1494,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) slbia ptesync -kvmhv_do_exit: /* r12 = trap, r13 = paca */ /* * POWER7/POWER8 guest -> host partition switch code. * We don't have to lock against tlbies but we do * have to coordinate the hardware threads. */ - /* Increment the threads-exiting-guest count in the 0xff00 - bits of vcore->entry_exit_count */ - ld r5,HSTATE_KVM_VCORE(r13) - addi r6,r5,VCORE_ENTRY_EXIT -41: lwarx r3,0,r6 - addi r0,r3,0x100 - stwcx. r0,0,r6 - bne 41b - isync /* order stwcx. vs. reading napping_threads */ - - /* - * At this point we have an interrupt that we have to pass - * up to the kernel or qemu; we can't handle it in real mode. - * Thus we have to do a partition switch, so we have to - * collect the other threads, if we are the first thread - * to take an interrupt. To do this, we set the HDEC to 0, - * which causes an HDEC interrupt in all threads within 2ns - * because the HDEC register is shared between all 4 threads. - * However, we don't need to bother if this is an HDEC - * interrupt, since the other threads will already be on their - * way here in that case. - */ - cmpwi r3,0x100 /* Are we the first here? */ - bge 43f - cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER - beq 43f - li r0,0 - mtspr SPRN_HDEC,r0 - - /* - * Send a message or IPI to any napping threads, since an HDEC interrupt - * doesn't wake CPUs up from nap. - */ - lwz r3,VCORE_NAPPING_THREADS(r5) - lbz r4,HSTATE_PTID(r13) - li r0,1 - sld r0,r0,r4 - andc. r3,r3,r0 /* no sense IPI'ing ourselves */ - beq 43f - /* Order entry/exit update vs. IPIs */ - sync -BEGIN_FTR_SECTION - b 45f -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) - /* Use msgsnd on POWER8 */ - lhz r6, VCORE_PCPU(r5) - clrldi r6, r6, 64-3 - oris r6, r6, (PPC_DBELL_SERVER << (63-36))@h -42: andi. r0, r3, 1 - beq 44f - PPC_MSGSND(6) -44: srdi. r3, r3, 1 - addi r6, r6, 1 - bne 42b - b kvmhv_switch_to_host - /* Use IPIs on POWER7 */ -45: mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ - subf r6,r4,r13 -42: andi. r0,r3,1 - beq 44f - ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ - li r0,IPI_PRIORITY - li r7,XICS_MFRR - stbcix r0,r7,r8 /* trigger the IPI */ -44: srdi. r3,r3,1 - addi r6,r6,PACA_SIZE - bne 42b - kvmhv_switch_to_host: /* Secondary threads wait for primary to do partition switch */ -43: ld r5,HSTATE_KVM_VCORE(r13) + ld r5,HSTATE_KVM_VCORE(r13) ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ lbz r3,HSTATE_PTID(r13) cmpwi r3,0 @@ -1639,6 +1601,84 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtlr r0 blr +kvmhv_commence_exit: /* r12 = trap, r13 = paca, doesn't trash r9 */ + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -PPC_MIN_STKFRM(r1) + + /* Increment the threads-exiting-guest count in the 0xff00 + bits of vcore->entry_exit_count */ + ld r5,HSTATE_KVM_VCORE(r13) + addi r6,r5,VCORE_ENTRY_EXIT +41: lwarx r3,0,r6 + addi r0,r3,0x100 + stwcx. r0,0,r6 + bne 41b + isync /* order stwcx. vs. reading napping_threads */ + + /* + * At this point we have an interrupt that we have to pass + * up to the kernel or qemu; we can't handle it in real mode. + * Thus we have to do a partition switch, so we have to + * collect the other threads, if we are the first thread + * to take an interrupt. To do this, we set the HDEC to 0, + * which causes an HDEC interrupt in all threads within 2ns + * because the HDEC register is shared between all 4 threads. + * However, we don't need to bother if this is an HDEC + * interrupt, since the other threads will already be on their + * way here in that case. + */ + cmpwi r3,0x100 /* Are we the first here? */ + bge 43f + cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER + beq 43f + li r0,0 + mtspr SPRN_HDEC,r0 + + /* + * Send a message or IPI to any napping threads, since an HDEC interrupt + * doesn't wake CPUs up from nap. + */ + lwz r3,VCORE_NAPPING_THREADS(r5) + lbz r4,HSTATE_PTID(r13) + li r0,1 + sld r0,r0,r4 + andc. r3,r3,r0 /* no sense IPI'ing ourselves */ + beq 43f + /* Order entry/exit update vs. IPIs */ + sync +BEGIN_FTR_SECTION + b 45f +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + /* Use msgsnd on POWER8 */ + lhz r6, VCORE_PCPU(r5) + clrldi r6, r6, 64-3 + oris r6, r6, (PPC_DBELL_SERVER << (63-36))@h +42: andi. r0, r3, 1 + beq 44f + PPC_MSGSND(6) +44: srdi. r3, r3, 1 + addi r6, r6, 1 + bne 42b + b 43f + /* Use IPIs on POWER7 */ +45: mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ + subf r6,r4,r13 +42: andi. r0,r3,1 + beq 44f + ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ + li r0,IPI_PRIORITY + li r7,XICS_MFRR + stbcix r0,r7,r8 /* trigger the IPI */ +44: srdi. r3,r3,1 + addi r6,r6,PACA_SIZE + bne 42b + +43: ld r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1) + addi r1, r1, PPC_MIN_STKFRM + mtlr r0 + blr + /* * Check whether an HDSI is an HPTE not found fault or something else. * If it is an HPTE not found fault that is due to the guest accessing @@ -2074,8 +2114,8 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ lbz r5,VCPU_PRODDED(r3) cmpwi r5,0 bne kvm_cede_prodded - li r0,0 /* set trap to 0 to say hcall is handled */ - stw r0,VCPU_TRAP(r3) + li r12,0 /* set trap to 0 to say hcall is handled */ + stw r12,VCPU_TRAP(r3) li r0,H_SUCCESS std r0,VCPU_GPR(R3)(r3) @@ -2279,7 +2319,8 @@ kvm_cede_prodded: /* we've ceded but we want to give control to the host */ kvm_cede_exit: - b hcall_real_fallback + ld r9, HSTATE_KVM_VCPU(r13) + b guest_exit_cont /* Try to handle a machine check in real mode */ machine_check_realmode: @@ -2417,6 +2458,7 @@ kvmppc_read_intr: bne- 43f /* OK, it's an IPI for us */ + li r12, 0 li r3, -1 1: blr -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html