On Mon, Oct 08, 2018 at 04:30:55PM +1100, Paul Mackerras wrote:
> This creates an alternative guest entry/exit path which is used for
> radix guests on POWER9 systems when we have indep_threads_mode=Y.  In
> these circumstances there is exactly one vcpu per vcore and there is
> no coordination required between vcpus or vcores; the vcpu can enter
> the guest without needing to synchronize with anything else.
> 
> The new fast path is implemented almost entirely in C in book3s_hv.c
> and runs with the MMU on until the guest is entered.  On guest exit
> we use the existing path until the point where we are committed to
> exiting the guest (as distinct from handling an interrupt in the
> low-level code and returning to the guest) and we have pulled the
> guest context from the XIVE.  At that point we check a flag in the
> stack frame to see whether we came in via the old path or the new
> path; if we came in via the new path then we go back to C code to do
> the rest of the process of saving the guest context and restoring the
> host context.
> 
> The C code is split into separate functions for handling the
> OS-accessible state and the hypervisor state, with the idea that the
> latter can be replaced by a hypercall when we implement nested
> virtualization.
> 
> Signed-off-by: Paul Mackerras <paulus@xxxxxxxxxx>

Reviewed-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>

> ---
>  arch/powerpc/include/asm/asm-prototypes.h |   2 +
>  arch/powerpc/include/asm/kvm_ppc.h        |   2 +
>  arch/powerpc/kvm/book3s_hv.c              | 429 +++++++++++++++++++++++++++++-
>  arch/powerpc/kvm/book3s_hv_ras.c          |   2 +
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  95 ++++++-
>  arch/powerpc/kvm/book3s_xive.c            |  63 +++++
>  6 files changed, 589 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
> index 0c1a2b0..5c9b00c 100644
> --- a/arch/powerpc/include/asm/asm-prototypes.h
> +++ b/arch/powerpc/include/asm/asm-prototypes.h
> @@ -165,4 +165,6 @@ void kvmhv_load_host_pmu(void);
>  void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
>  void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
>  
> +int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
> +
>  #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index 83d61b8..245e564 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -585,6 +585,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
>  
>  extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
>  			       int level, bool line_status);
> +extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
>  #else
>  static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
>  				       u32 priority) { return -1; }
> @@ -607,6 +608,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur
>  
>  static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
>  				      int level, bool line_status) { return -ENODEV; }
> +static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
>  #endif /* CONFIG_KVM_XIVE */
>  
>  /*
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 0e17593..0c1dd76 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -3080,6 +3080,269 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
>  }
>  
>  /*
> + * Load up hypervisor-mode registers on P9.
> + */
> +static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +	s64 hdec;
> +	u64 tb, purr, spurr;
> +	int trap;
> +	unsigned long host_hfscr = mfspr(SPRN_HFSCR);
> +	unsigned long host_ciabr = mfspr(SPRN_CIABR);
> +	unsigned long host_dawr = mfspr(SPRN_DAWR);
> +	unsigned long host_dawrx = mfspr(SPRN_DAWRX);
> +	unsigned long host_psscr = mfspr(SPRN_PSSCR);
> +	unsigned long host_pidr = mfspr(SPRN_PID);
> +
> +	hdec = time_limit - mftb();
> +	if (hdec < 0)
> +		return BOOK3S_INTERRUPT_HV_DECREMENTER;
> +	mtspr(SPRN_HDEC, hdec);
> +
> +	if (vc->tb_offset) {
> +		u64 new_tb = mftb() + vc->tb_offset;
> +		mtspr(SPRN_TBU40, new_tb);
> +		tb = mftb();
> +		if ((tb & 0xffffff) < (new_tb & 0xffffff))
> +			mtspr(SPRN_TBU40, new_tb + 0x1000000);
> +		vc->tb_offset_applied = vc->tb_offset;
> +	}
> +
> +	if (vc->pcr)
> +		mtspr(SPRN_PCR, vc->pcr);
> +	mtspr(SPRN_DPDES, vc->dpdes);
> +	mtspr(SPRN_VTB, vc->vtb);
> +
> +	local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
> +	local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
> +	mtspr(SPRN_PURR, vcpu->arch.purr);
> +	mtspr(SPRN_SPURR, vcpu->arch.spurr);
> +
> +	if (cpu_has_feature(CPU_FTR_DAWR)) {
> +		mtspr(SPRN_DAWR, vcpu->arch.dawr);
> +		mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
> +	}
> +	mtspr(SPRN_CIABR, vcpu->arch.ciabr);
> +	mtspr(SPRN_IC, vcpu->arch.ic);
> +	mtspr(SPRN_PID, vcpu->arch.pid);
> +
> +	mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
> +	      (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
> +
> +	mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
> +
> +	mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
> +	mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
> +	mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
> +	mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
> +
> +	mtspr(SPRN_AMOR, ~0UL);
> +
> +	mtspr(SPRN_LPCR, vc->lpcr);
> +	isync();
> +
> +	kvmppc_xive_push_vcpu(vcpu);
> +
> +	mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
> +	mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
> +
> +	trap = __kvmhv_vcpu_entry_p9(vcpu);
> +
> +	/* Advance host PURR/SPURR by the amount used by guest */
> +	purr = mfspr(SPRN_PURR);
> +	spurr = mfspr(SPRN_SPURR);
> +	mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
> +	      purr - vcpu->arch.purr);
> +	mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
> +	      spurr - vcpu->arch.spurr);
> +	vcpu->arch.purr = purr;
> +	vcpu->arch.spurr = spurr;
> +
> +	vcpu->arch.ic = mfspr(SPRN_IC);
> +	vcpu->arch.pid = mfspr(SPRN_PID);
> +	vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
> +
> +	vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
> +	vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
> +	vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
> +	vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
> +
> +	mtspr(SPRN_PSSCR, host_psscr);
> +	mtspr(SPRN_HFSCR, host_hfscr);
> +	mtspr(SPRN_CIABR, host_ciabr);
> +	mtspr(SPRN_DAWR, host_dawr);
> +	mtspr(SPRN_DAWRX, host_dawrx);
> +	mtspr(SPRN_PID, host_pidr);
> +
> +	/*
> +	 * Since this is radix, do a eieio; tlbsync; ptesync sequence in
> +	 * case we interrupted the guest between a tlbie and a ptesync.
> +	 */
> +	asm volatile("eieio; tlbsync; ptesync");
> +
> +	mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid);	/* restore host LPID */
> +	isync();
> +
> +	vc->dpdes = mfspr(SPRN_DPDES);
> +	vc->vtb = mfspr(SPRN_VTB);
> +	mtspr(SPRN_DPDES, 0);
> +	if (vc->pcr)
> +		mtspr(SPRN_PCR, 0);
> +
> +	if (vc->tb_offset_applied) {
> +		u64 new_tb = mftb() - vc->tb_offset_applied;
> +		mtspr(SPRN_TBU40, new_tb);
> +		tb = mftb();
> +		if ((tb & 0xffffff) < (new_tb & 0xffffff))
> +			mtspr(SPRN_TBU40, new_tb + 0x1000000);
> +		vc->tb_offset_applied = 0;
> +	}
> +
> +	mtspr(SPRN_HDEC, 0x7fffffff);
> +	mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
> +
> +	return trap;
> +}
> +
> +/*
> + * Virtual-mode guest entry for POWER9 and later when the host and
> + * guest are both using the radix MMU.  The LPIDR has already been set.
> + */
> +int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +	unsigned long host_dscr = mfspr(SPRN_DSCR);
> +	unsigned long host_tidr = mfspr(SPRN_TIDR);
> +	unsigned long host_iamr = mfspr(SPRN_IAMR);
> +	s64 dec;
> +	u64 tb;
> +	int trap, save_pmu;
> +
> +	dec = mfspr(SPRN_DEC);
> +	tb = mftb();
> +	if (dec < 512)
> +		return BOOK3S_INTERRUPT_HV_DECREMENTER;
> +	local_paca->kvm_hstate.dec_expires = dec + tb;
> +	if (local_paca->kvm_hstate.dec_expires < time_limit)
> +		time_limit = local_paca->kvm_hstate.dec_expires;
> +
> +	vcpu->arch.ceded = 0;
> +
> +	kvmhv_save_host_pmu();		/* saves it to PACA kvm_hstate */
> +
> +	kvmppc_subcore_enter_guest();
> +
> +	vc->entry_exit_map = 1;
> +	vc->in_guest = 1;
> +
> +	if (vcpu->arch.vpa.pinned_addr) {
> +		struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
> +		u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
> +		lp->yield_count = cpu_to_be32(yield_count);
> +		vcpu->arch.vpa.dirty = 1;
> +	}
> +
> +	if (cpu_has_feature(CPU_FTR_TM) ||
> +	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
> +		kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
> +
> +	kvmhv_load_guest_pmu(vcpu);
> +
> +	msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
> +	load_fp_state(&vcpu->arch.fp);
> +	load_vr_state(&vcpu->arch.vr);
> +
> +	mtspr(SPRN_DSCR, vcpu->arch.dscr);
> +	mtspr(SPRN_IAMR, vcpu->arch.iamr);
> +	mtspr(SPRN_PSPB, vcpu->arch.pspb);
> +	mtspr(SPRN_FSCR, vcpu->arch.fscr);
> +	mtspr(SPRN_TAR, vcpu->arch.tar);
> +	mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
> +	mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
> +	mtspr(SPRN_BESCR, vcpu->arch.bescr);
> +	mtspr(SPRN_WORT, vcpu->arch.wort);
> +	mtspr(SPRN_TIDR, vcpu->arch.tid);
> +	mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
> +	mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
> +	mtspr(SPRN_AMR, vcpu->arch.amr);
> +	mtspr(SPRN_UAMOR, vcpu->arch.uamor);
> +
> +	if (!(vcpu->arch.ctrl & 1))
> +		mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
> +
> +	mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
> +
> +	if (vcpu->arch.doorbell_request) {
> +		vc->dpdes = 1;
> +		smp_wmb();
> +		vcpu->arch.doorbell_request = 0;
> +	}
> +
> +	trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit);
> +
> +	vcpu->arch.slb_max = 0;
> +	dec = mfspr(SPRN_DEC);
> +	tb = mftb();
> +	vcpu->arch.dec_expires = dec + tb;
> +	vcpu->cpu = -1;
> +	vcpu->arch.thread_cpu = -1;
> +	vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
> +
> +	vcpu->arch.iamr = mfspr(SPRN_IAMR);
> +	vcpu->arch.pspb = mfspr(SPRN_PSPB);
> +	vcpu->arch.fscr = mfspr(SPRN_FSCR);
> +	vcpu->arch.tar = mfspr(SPRN_TAR);
> +	vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
> +	vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
> +	vcpu->arch.bescr = mfspr(SPRN_BESCR);
> +	vcpu->arch.wort = mfspr(SPRN_WORT);
> +	vcpu->arch.tid = mfspr(SPRN_TIDR);
> +	vcpu->arch.amr = mfspr(SPRN_AMR);
> +	vcpu->arch.uamor = mfspr(SPRN_UAMOR);
> +	vcpu->arch.dscr = mfspr(SPRN_DSCR);
> +
> +	mtspr(SPRN_PSPB, 0);
> +	mtspr(SPRN_WORT, 0);
> +	mtspr(SPRN_AMR, 0);
> +	mtspr(SPRN_UAMOR, 0);
> +	mtspr(SPRN_DSCR, host_dscr);
> +	mtspr(SPRN_TIDR, host_tidr);
> +	mtspr(SPRN_IAMR, host_iamr);
> +	mtspr(SPRN_PSPB, 0);
> +
> +	msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
> +	store_fp_state(&vcpu->arch.fp);
> +	store_vr_state(&vcpu->arch.vr);
> +
> +	if (cpu_has_feature(CPU_FTR_TM) ||
> +	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
> +		kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
> +
> +	save_pmu = 1;
> +	if (vcpu->arch.vpa.pinned_addr) {
> +		struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
> +		u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
> +		lp->yield_count = cpu_to_be32(yield_count);
> +		vcpu->arch.vpa.dirty = 1;
> +		save_pmu = lp->pmcregs_in_use;
> +	}
> +
> +	kvmhv_save_guest_pmu(vcpu, save_pmu);
> +
> +	vc->entry_exit_map = 0x101;
> +	vc->in_guest = 0;
> +
> +	mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
> +
> +	kvmhv_load_host_pmu();
> +
> +	kvmppc_subcore_exit_guest();
> +
> +	return trap;
> +}
> +
> +/*
>   * Wait for some other vcpu thread to execute us, and
>   * wake us up when we need to handle something in the host.
>   */
> @@ -3405,6 +3668,167 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
>  	return vcpu->arch.ret;
>  }
>  
> +static int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
> +				 struct kvm_vcpu *vcpu, u64 time_limit)
> +{
> +	int trap, r, pcpu, pcpu0;
> +	int srcu_idx;
> +	struct kvmppc_vcore *vc;
> +	struct kvm *kvm = vcpu->kvm;
> +
> +	trace_kvmppc_run_vcpu_enter(vcpu);
> +
> +	kvm_run->exit_reason = 0;
> +	vcpu->arch.ret = RESUME_GUEST;
> +	vcpu->arch.trap = 0;
> +
> +	vc = vcpu->arch.vcore;
> +	vcpu->arch.ceded = 0;
> +	vcpu->arch.run_task = current;
> +	vcpu->arch.kvm_run = kvm_run;
> +	vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
> +	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
> +	vcpu->arch.busy_preempt = TB_NIL;
> +	vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
> +	vc->runnable_threads[0] = vcpu;
> +	vc->n_runnable = 1;
> +	vc->runner = vcpu;
> +
> +	/* See if the MMU is ready to go */
> +	if (!kvm->arch.mmu_ready) {
> +		r = kvmhv_setup_mmu(vcpu);
> +		if (r) {
> +			kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> +			kvm_run->fail_entry.
> +				hardware_entry_failure_reason = 0;
> +			vcpu->arch.ret = r;
> +			goto out;
> +		}
> +	}
> +
> +	if (need_resched())
> +		cond_resched();
> +
> +	kvmppc_update_vpas(vcpu);
> +
> +	init_vcore_to_run(vc);
> +	vc->preempt_tb = TB_NIL;
> +
> +	preempt_disable();
> +	pcpu = smp_processor_id();
> +	vc->pcpu = pcpu;
> +	kvmppc_prepare_radix_vcpu(vcpu, pcpu);
> +
> +	local_irq_disable();
> +	hard_irq_disable();
> +	if (signal_pending(current))
> +		goto sigpend;
> +	if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
> +		goto out;
> +
> +	kvmppc_core_prepare_to_enter(vcpu);
> +
> +	kvmppc_clear_host_core(pcpu);
> +
> +	local_paca->kvm_hstate.tid = 0;
> +	local_paca->kvm_hstate.napping = 0;
> +	local_paca->kvm_hstate.kvm_split_mode = NULL;
> +	kvmppc_start_thread(vcpu, vc);
> +	kvmppc_create_dtl_entry(vcpu, vc);
> +	trace_kvm_guest_enter(vcpu);
> +
> +	vc->vcore_state = VCORE_RUNNING;
> +	trace_kvmppc_run_core(vc, 0);
> +
> +	mtspr(SPRN_LPID, vc->kvm->arch.lpid);
> +	isync();
> +
> +	/* See comment above in kvmppc_run_core() about this */
> +	pcpu0 = pcpu;
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
> +		pcpu0 &= ~0x3UL;
> +
> +	if (cpumask_test_cpu(pcpu0, &kvm->arch.need_tlb_flush)) {
> +		radix__local_flush_tlb_lpid_guest(kvm->arch.lpid);
> +		/* Clear the bit after the TLB flush */
> +		cpumask_clear_cpu(pcpu0, &kvm->arch.need_tlb_flush);
> +	}
> +
> +	trace_hardirqs_on();
> +	guest_enter_irqoff();
> +
> +	srcu_idx = srcu_read_lock(&kvm->srcu);
> +
> +	this_cpu_disable_ftrace();
> +
> +	trap = kvmhv_p9_guest_entry(vcpu, time_limit);
> +	vcpu->arch.trap = trap;
> +
> +	this_cpu_enable_ftrace();
> +
> +	srcu_read_unlock(&kvm->srcu, srcu_idx);
> +
> +	mtspr(SPRN_LPID, kvm->arch.host_lpid);
> +	isync();
> +
> +	trace_hardirqs_off();
> +	set_irq_happened(trap);
> +
> +	kvmppc_set_host_core(pcpu);
> +
> +	local_irq_enable();
> +	guest_exit();
> +
> +	cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
> +
> +	preempt_enable();
> +
> +	/* cancel pending decrementer exception if DEC is now positive */
> +	if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
> +		kvmppc_core_dequeue_dec(vcpu);
> +
> +	trace_kvm_guest_exit(vcpu);
> +	r = RESUME_GUEST;
> +	if (trap)
> +		r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
> +	vcpu->arch.ret = r;
> +
> +	if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
> +	    !kvmppc_vcpu_woken(vcpu)) {
> +		kvmppc_set_timer(vcpu);
> +		while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
> +			if (signal_pending(current)) {
> +				vcpu->stat.signal_exits++;
> +				kvm_run->exit_reason = KVM_EXIT_INTR;
> +				vcpu->arch.ret = -EINTR;
> +				break;
> +			}
> +			spin_lock(&vc->lock);
> +			kvmppc_vcore_blocked(vc);
> +			spin_unlock(&vc->lock);
> +		}
> +	}
> +	vcpu->arch.ceded = 0;
> +
> +	vc->vcore_state = VCORE_INACTIVE;
> +	trace_kvmppc_run_core(vc, 1);
> +
> + done:
> +	kvmppc_remove_runnable(vc, vcpu);
> +	trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
> +
> +	return vcpu->arch.ret;
> +
> + sigpend:
> +	vcpu->stat.signal_exits++;
> +	kvm_run->exit_reason = KVM_EXIT_INTR;
> +	vcpu->arch.ret = -EINTR;
> + out:
> +	local_irq_enable();
> +	preempt_enable();
> +	goto done;
> +}
> +
>  static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
>  {
>  	int r;
> @@ -3480,7 +3904,10 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
>  	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
>  
>  	do {
> -		r = kvmppc_run_vcpu(run, vcpu);
> +		if (kvm->arch.threads_indep && kvm_is_radix(kvm))
> +			r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0);
> +		else
> +			r = kvmppc_run_vcpu(run, vcpu);
>  
>  		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
>  		    !(vcpu->arch.shregs.msr & MSR_PR)) {
> diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
> index ee564b6..0787f12 100644
> --- a/arch/powerpc/kvm/book3s_hv_ras.c
> +++ b/arch/powerpc/kvm/book3s_hv_ras.c
> @@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void)
>  
>  	local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
>  }
> +EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest);
>  
>  void kvmppc_subcore_exit_guest(void)
>  {
> @@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void)
>  
>  	local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
>  }
> +EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest);
>  
>  static bool kvmppc_tb_resync_required(void)
>  {
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 45dd637..ea84696 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -47,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
>  #define NAPPING_NOVCPU	2
>  
>  /* Stack frame offsets for kvmppc_hv_entry */
> -#define SFS			160
> +#define SFS			208
>  #define STACK_SLOT_TRAP		(SFS-4)
> +#define STACK_SLOT_SHORT_PATH	(SFS-8)
>  #define STACK_SLOT_TID		(SFS-16)
>  #define STACK_SLOT_PSSCR	(SFS-24)
>  #define STACK_SLOT_PID		(SFS-32)
> @@ -57,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
>  #define STACK_SLOT_DAWR		(SFS-56)
>  #define STACK_SLOT_DAWRX	(SFS-64)
>  #define STACK_SLOT_HFSCR	(SFS-72)
> +/* the following is used by the P9 short path */
> +#define STACK_SLOT_NVGPRS	(SFS-152)	/* 18 gprs */
>  
>  /*
>   * Call kvmppc_hv_entry in real mode.
> @@ -1020,6 +1023,9 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
>  no_xive:
>  #endif /* CONFIG_KVM_XICS */
>  
> +	li	r0, 0
> +	stw	r0, STACK_SLOT_SHORT_PATH(r1)
> +
>  deliver_guest_interrupt:	/* r4 = vcpu, r13 = paca */
>  	/* Check if we can deliver an external or decrementer interrupt now */
>  	ld	r0, VCPU_PENDING_EXC(r4)
> @@ -1034,13 +1040,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
>  	bl	kvmppc_guest_entry_inject_int
>  	ld	r4, HSTATE_KVM_VCPU(r13)
>  71:
> -	ld	r10, VCPU_PC(r4)
> -	ld	r11, VCPU_MSR(r4)
>  	ld	r6, VCPU_SRR0(r4)
>  	ld	r7, VCPU_SRR1(r4)
>  	mtspr	SPRN_SRR0, r6
>  	mtspr	SPRN_SRR1, r7
>  
> +fast_guest_entry_c:
> +	ld	r10, VCPU_PC(r4)
> +	ld	r11, VCPU_MSR(r4)
>  	/* r11 = vcpu->arch.msr & ~MSR_HV */
>  	rldicl	r11, r11, 63 - MSR_HV_LG, 1
>  	rotldi	r11, r11, 1 + MSR_HV_LG
> @@ -1117,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
>  	HRFI_TO_GUEST
>  	b	.
>  
> +/*
> + * Enter the guest on a P9 or later system where we have exactly
> + * one vcpu per vcore and we don't need to go to real mode
> + * (which implies that host and guest are both using radix MMU mode).
> + * r3 = vcpu pointer
> + * Most SPRs and all the VSRs have been loaded already.
> + */
> +_GLOBAL(__kvmhv_vcpu_entry_p9)
> +EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
> +	mflr	r0
> +	std	r0, PPC_LR_STKOFF(r1)
> +	stdu	r1, -SFS(r1)
> +
> +	li	r0, 1
> +	stw	r0, STACK_SLOT_SHORT_PATH(r1)
> +
> +	std	r3, HSTATE_KVM_VCPU(r13)
> +	mfcr	r4
> +	stw	r4, SFS+8(r1)
> +
> +	std	r1, HSTATE_HOST_R1(r13)
> +
> +	reg = 14
> +	.rept	18
> +	std	reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
> +	reg = reg + 1
> +	.endr
> +
> +	reg = 14
> +	.rept	18
> +	ld	reg, __VCPU_GPR(reg)(r3)
> +	reg = reg + 1
> +	.endr
> +
> +	mfmsr	r10
> +	std	r10, HSTATE_HOST_MSR(r13)
> +
> +	mr	r4, r3
> +	b	fast_guest_entry_c
> +guest_exit_short_path:
> +
> +	li	r0, KVM_GUEST_MODE_NONE
> +	stb	r0, HSTATE_IN_GUEST(r13)
> +
> +	reg = 14
> +	.rept	18
> +	std	reg, __VCPU_GPR(reg)(r9)
> +	reg = reg + 1
> +	.endr
> +
> +	reg = 14
> +	.rept	18
> +	ld	reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
> +	reg = reg + 1
> +	.endr
> +
> +	lwz	r4, SFS+8(r1)
> +	mtcr	r4
> +
> +	mr	r3, r12		/* trap number */
> +
> +	addi	r1, r1, SFS
> +	ld	r0, PPC_LR_STKOFF(r1)
> +	mtlr	r0
> +
> +	/* If we are in real mode, do a rfid to get back to the caller */
> +	mfmsr	r4
> +	andi.	r5, r4, MSR_IR
> +	bnelr
> +	rldicl	r5, r4, 64 - MSR_TS_S_LG, 62	/* extract TS field */
> +	mtspr	SPRN_SRR0, r0
> +	ld	r10, HSTATE_HOST_MSR(r13)
> +	rldimi	r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG
> +	mtspr	SPRN_SRR1, r10
> +	RFI_TO_KERNEL
> +	b	.
> +
>  secondary_too_late:
>  	li	r12, 0
>  	stw	r12, STACK_SLOT_TRAP(r1)
> @@ -1377,6 +1461,11 @@ guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
>  1:
>  #endif /* CONFIG_KVM_XICS */
>  
> +	/* If we came in through the P9 short path, go back out to C now */
> +	lwz	r0, STACK_SLOT_SHORT_PATH(r1)
> +	cmpwi	r0, 0
> +	bne	guest_exit_short_path
> +
>  	/* For hash guest, read the guest SLB and save it away */
>  	ld	r5, VCPU_KVM(r9)
>  	lbz	r0, KVM_RADIX(r5)
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> index 30c2eb7..ad4a370 100644
> --- a/arch/powerpc/kvm/book3s_xive.c
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -62,6 +62,69 @@
>  #define XIVE_Q_GAP	2
>  
>  /*
> + * Push a vcpu's context to the XIVE on guest entry.
> + * This assumes we are in virtual mode (MMU on)
> + */
> +void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
> +{
> +	void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
> +	u64 pq;
> +
> +	if (!tima)
> +		return;
> +	eieio();
> +	__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
> +	__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
> +	vcpu->arch.xive_pushed = 1;
> +	eieio();
> +
> +	/*
> +	 * We clear the irq_pending flag. There is a small chance of a
> +	 * race vs. the escalation interrupt happening on another
> +	 * processor setting it again, but the only consequence is to
> +	 * cause a spurious wakeup on the next H_CEDE, which is not an
> +	 * issue.
> +	 */
> +	vcpu->arch.irq_pending = 0;
> +
> +	/*
> +	 * In single escalation mode, if the escalation interrupt is
> +	 * on, we mask it.
> +	 */
> +	if (vcpu->arch.xive_esc_on) {
> +		pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
> +						  XIVE_ESB_SET_PQ_01));
> +		mb();
> +
> +		/*
> +		 * We have a possible subtle race here: The escalation
> +		 * interrupt might have fired and be on its way to the
> +		 * host queue while we mask it, and if we unmask it
> +		 * early enough (re-cede right away), there is a
> +		 * theorical possibility that it fires again, thus
> +		 * landing in the target queue more than once which is
> +		 * a big no-no.
> +		 *
> +		 * Fortunately, solving this is rather easy.  If the
> +		 * above load setting PQ to 01 returns a previous
> +		 * value where P is set, then we know the escalation
> +		 * interrupt is somewhere on its way to the host. In
> +		 * that case we simply don't clear the xive_esc_on
> +		 * flag below. It will be eventually cleared by the
> +		 * handler for the escalation interrupt.
> +		 *
> +		 * Then, when doing a cede, we check that flag again
> +		 * before re-enabling the escalation interrupt, and if
> +		 * set, we abort the cede.
> +		 */
> +		if (!(pq & XIVE_ESB_VAL_P))
> +			/* Now P is 0, we can clear the flag */
> +			vcpu->arch.xive_esc_on = 0;
> +	}
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
> +
> +/*
>   * This is a simple trigger for a generic XIVE IRQ. This must
>   * only be called for interrupts that support a trigger page
>   */

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
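For readers following the TBU40 sequence in kvmhv_load_hv_regs_and_go() above, here is a minimal user-space sketch of why the low-24-bit comparison and the second write of new_tb + 0x1000000 are needed. It only models the hardware behaviour (a TBU40 write replaces the top 40 bits of the timebase while the low 24 bits keep ticking); the mttbu40() and apply_tb_offset() helpers and the constants are invented for illustration and are not part of the patch.

	/* Stand-alone model of the TBU40 timebase-offset update. */
	#include <stdio.h>
	#include <stdint.h>

	static uint64_t tb;			/* modelled timebase register */

	static uint64_t mftb(void) { return tb; }

	/* Model of mtspr(SPRN_TBU40, val): write bits 63:24, keep low 24. */
	static void mttbu40(uint64_t val)
	{
		tb = (val & ~0xffffffULL) | (tb & 0xffffffULL);
	}

	/* The patch's sequence; offset is assumed to have low 24 bits clear. */
	static void apply_tb_offset(uint64_t offset)
	{
		uint64_t new_tb = mftb() + offset;

		/* simulate the low 24 bits wrapping before the write lands */
		tb += 0x20;

		mttbu40(new_tb);
		if ((mftb() & 0xffffffULL) < (new_tb & 0xffffffULL))
			mttbu40(new_tb + 0x1000000);	/* re-add the lost carry */
	}

	int main(void)
	{
		uint64_t offset = 0x1234ULL << 24;

		tb = 0x00000abcdefffff0ULL;	/* low 24 bits about to wrap */
		uint64_t expect = tb + 0x20 + offset;

		apply_tb_offset(offset);
		printf("tb=%#llx expected=%#llx %s\n",
		       (unsigned long long)tb, (unsigned long long)expect,
		       tb == expect ? "ok" : "MISSED CARRY");
		return 0;
	}

Without the second mttbu40() the carry out of the low 24 bits that happened between computing new_tb and the write would be lost, and the guest timebase would read 2^24 ticks too low.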
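Similarly, a small stand-alone model of the escalation-interrupt masking decision in kvmppc_xive_push_vcpu(): the ESB load at the "set PQ to 01" offset returns the previous PQ bits, and the P bit tells us whether an escalation is already in flight, in which case xive_esc_on must be left set for the escalation handler to clear. The struct and the esb_set_pq_01() helper below are invented for the sketch; only the decision logic mirrors the patch.

	/* Stand-alone model of the PQ check when masking the escalation IRQ. */
	#include <stdio.h>
	#include <stdint.h>

	#define ESB_VAL_P	0x2	/* intended to match XIVE_ESB_VAL_P */
	#define ESB_VAL_Q	0x1

	struct esc_irq {
		uint8_t pq;		/* modelled ESB PQ state */
		int esc_on;		/* modelled vcpu->arch.xive_esc_on */
	};

	/* Model of the XIVE_ESB_SET_PQ_01 load: set PQ=01, return old PQ. */
	static uint8_t esb_set_pq_01(struct esc_irq *irq)
	{
		uint8_t old = irq->pq;

		irq->pq = ESB_VAL_Q;	/* P=0, Q=1: masked */
		return old;
	}

	static void mask_escalation(struct esc_irq *irq)
	{
		uint8_t pq = esb_set_pq_01(irq);

		/* Only clear the flag if no escalation was already in flight. */
		if (!(pq & ESB_VAL_P))
			irq->esc_on = 0;
	}

	int main(void)
	{
		struct esc_irq idle = { .pq = 0, .esc_on = 1 };
		struct esc_irq in_flight = { .pq = ESB_VAL_P, .esc_on = 1 };

		mask_escalation(&idle);
		mask_escalation(&in_flight);
		printf("idle: esc_on=%d, in flight: esc_on=%d\n",
		       idle.esc_on, in_flight.esc_on);	/* expect 0 and 1 */
		return 0;
	}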