Nicholas Piggin <npiggin@xxxxxxxxx> writes:

Hi, thanks for this. It helped me clarify a bunch of details that I
hadn't understood while reading the asm code. Some comments below:

> Almost all logic is moved to C, by introducing a new in_guest mode that
> selects and branches very early in the interrupt handler to the P9 exit
> code.
>
> The remaining assembly is only about 160 lines of low level stack setup,
> with VCPU vs host register save and restore, plus a small shim to the
> legacy paths in the interrupt handler.
>
> There are two motivations for this, the first is just make the code more
> maintainable being in C. The second is to reduce the amount of code
> running in a special KVM mode, "realmode". I put that in quotes because
> with radix it is no longer necessarily real-mode in the MMU, but it
> still has to be treated specially because it may be in real-mode, and
> has various important registers like PID, DEC, TB, etc set to guest.
> This is hostile to the rest of Linux and can't use arbitrary kernel
> functionality or be instrumented well.
>
> This initial patch is a reasonably faithful conversion of the asm code.
> It does lack any loop to return quickly back into the guest without
> switching out of realmode in the case of unimportant or easily handled
> interrupts, as explained in the previous change, handling HV interrupts
> in real mode is not so important for P9.
>
> Signed-off-by: Nicholas Piggin <npiggin@xxxxxxxxx>
> ---

<snip>

> diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S
> index f826c8dc2e19..cc7b76865a16 100644
> --- a/arch/powerpc/kvm/book3s_64_entry.S
> +++ b/arch/powerpc/kvm/book3s_64_entry.S
> @@ -1,10 +1,14 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
>  #include <asm/asm-offsets.h>
>  #include <asm/cache.h>
> +#include <asm/code-patching-asm.h>
>  #include <asm/exception-64s.h>
> +#include <asm/export.h>
>  #include <asm/kvm_asm.h>
>  #include <asm/kvm_book3s_asm.h>
>  #include <asm/ppc_asm.h>
>  #include <asm/reg.h>
> +#include <asm/ultravisor-api.h>
>
>  /*
>   * These are branched to from interrupt handlers in exception-64s.S which set
> @@ -13,13 +17,24 @@
>  .global kvmppc_hcall
>  .balign IFETCH_ALIGN_BYTES
>  kvmppc_hcall:
> +	lbz	r10,HSTATE_IN_GUEST(r13)
> +	cmpwi	r10,KVM_GUEST_MODE_GUEST_HV_FAST
> +	beq	kvmppc_p9_exit_hcall
>  	ld	r10,PACA_EXGEN+EX_R13(r13)
>  	SET_SCRATCH0(r10)
>  	li	r10,0xc00
> +	li	r11,PACA_EXGEN
> +	b	1f
>
>  .global kvmppc_interrupt
>  .balign IFETCH_ALIGN_BYTES
>  kvmppc_interrupt:
> +	std	r10,HSTATE_SCRATCH0(r13)
> +	lbz	r10,HSTATE_IN_GUEST(r13)
> +	cmpwi	r10,KVM_GUEST_MODE_GUEST_HV_FAST
> +	beq	kvmppc_p9_exit_interrupt
> +	ld	r10,HSTATE_SCRATCH0(r13)
> +	lbz	r11,HSTATE_IN_GUEST(r13)
>  	li	r11,PACA_EXGEN
>  	cmpdi	r10,0x200
>  	bgt+	1f
> @@ -114,3 +129,169 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
>  	GET_SCRATCH0(r13)
>  	HRFI_TO_KERNEL
>  #endif
> +
> +/* Stack frame offsets for kvmppc_hv_entry */
> +#define SFS			208
> +#define STACK_SLOT_VCPU		(SFS-8)
> +#define STACK_SLOT_NVGPRS	(SFS-152)	/* 18 gprs */
> +
> +/*
> + * void kvmppc_p9_enter_guest(struct vcpu *vcpu);
> + *
> + * Enter the guest on a ISAv3.0 or later system where we have exactly
> + * one vcpu per vcore, and both the host and guest are radix, and threads
> + * are set to "indepdent mode".
> + */
> +.balign IFETCH_ALIGN_BYTES
> +_GLOBAL(kvmppc_p9_enter_guest)
> +EXPORT_SYMBOL_GPL(kvmppc_p9_enter_guest)
> +	mflr	r0
> +	std	r0,PPC_LR_STKOFF(r1)
> +	stdu	r1,-SFS(r1)
> +
> +	std	r1,HSTATE_HOST_R1(r13)
> +	std	r3,STACK_SLOT_VCPU(r1)

The vcpu was stored on the paca previously. I don't get the change,
could you explain?

> +
> +	mfcr	r4
> +	stw	r4,SFS+8(r1)
> +
> +	reg = 14
> +	.rept	18
> +	std	reg,STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
> +	reg = reg + 1
> +	.endr
> +
> +	ld	r4,VCPU_LR(r3)
> +	mtlr	r4
> +	ld	r4,VCPU_CTR(r3)
> +	mtctr	r4
> +	ld	r4,VCPU_XER(r3)
> +	mtspr	SPRN_XER,r4
> +
> +	ld	r1,VCPU_CR(r3)
> +
> +BEGIN_FTR_SECTION
> +	ld	r4,VCPU_CFAR(r3)
> +	mtspr	SPRN_CFAR,r4
> +END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
> +BEGIN_FTR_SECTION
> +	ld	r4,VCPU_PPR(r3)
> +	mtspr	SPRN_PPR,r4
> +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
> +
> +	reg = 4
> +	.rept	28
> +	ld	reg,__VCPU_GPR(reg)(r3)
> +	reg = reg + 1
> +	.endr
> +
> +	ld	r4,VCPU_KVM(r3)
> +	lbz	r4,KVM_SECURE_GUEST(r4)
> +	cmpdi	r4,0
> +	ld	r4,VCPU_GPR(R4)(r3)
> +	bne	.Lret_to_ultra
> +
> +	mtcr	r1
> +
> +	ld	r0,VCPU_GPR(R0)(r3)
> +	ld	r1,VCPU_GPR(R1)(r3)
> +	ld	r2,VCPU_GPR(R2)(r3)
> +	ld	r3,VCPU_GPR(R3)(r3)
> +
> +	HRFI_TO_GUEST
> +	b	.
> +
> +	/*
> +	 * Use UV_RETURN ultracall to return control back to the Ultravisor
> +	 * after processing an hypercall or interrupt that was forwarded
> +	 * (a.k.a. reflected) to the Hypervisor.
> +	 *
> +	 * All registers have already been reloaded except the ucall requires:
> +	 * R0 = hcall result
> +	 * R2 = SRR1, so UV can detect a synthesized interrupt (if any)
> +	 * R3 = UV_RETURN
> +	 */
> +.Lret_to_ultra:
> +	mtcr	r1
> +	ld	r1,VCPU_GPR(R1)(r3)
> +
> +	ld	r0,VCPU_GPR(R3)(r3)
> +	mfspr	r2,SPRN_SRR1
> +	LOAD_REG_IMMEDIATE(r3, UV_RETURN)
> +	sc	2
> +
> +/*
> + * kvmppc_p9_exit_hcall and kvmppc_p9_exit_interrupt are branched to from
> + * above if the interrupt was taken for a guest that was entered via
> + * kvmppc_p9_enter_guest().
> + *
> + * This code recovers the host stack and vcpu pointer, saves all GPRs and
> + * CR, LR, CTR, XER as well as guest MSR and NIA into the VCPU, then re-
> + * establishes the host stack and registers to return from the
> + * kvmppc_p9_enter_guest() function.
> + */
> +.balign IFETCH_ALIGN_BYTES
> +kvmppc_p9_exit_hcall:
> +	mfspr	r11,SPRN_SRR0
> +	mfspr	r12,SPRN_SRR1
> +	li	r10,0xc00
> +	std	r10,HSTATE_SCRATCH0(r13)
> +
> +.balign IFETCH_ALIGN_BYTES
> +kvmppc_p9_exit_interrupt:
> +	std	r1,HSTATE_SCRATCH1(r13)
> +	std	r3,HSTATE_SCRATCH2(r13)
> +	ld	r1,HSTATE_HOST_R1(r13)
> +	ld	r3,STACK_SLOT_VCPU(r1)
> +
> +	std	r9,VCPU_CR(r3)
> +
> +1:
> +	std	r11,VCPU_PC(r3)
> +	std	r12,VCPU_MSR(r3)
> +
> +	reg = 14
> +	.rept	18
> +	std	reg,__VCPU_GPR(reg)(r3)
> +	reg = reg + 1
> +	.endr
> +
> +	/* r1, r3, r9-r13 are saved to vcpu by C code */

If we just saved r1 and r3, why don't we put them in the vcpu right now?
That would avoid having the C code know about scratch areas.

> +	std	r0,VCPU_GPR(R0)(r3)
> +	std	r2,VCPU_GPR(R2)(r3)
> +	reg = 4
> +	.rept	5
> +	std	reg,__VCPU_GPR(reg)(r3)
> +	reg = reg + 1
> +	.endr
> +
> +	ld	r2,PACATOC(r13)
> +
> +	mflr	r4
> +	std	r4,VCPU_LR(r3)
> +	mfspr	r4,SPRN_XER
> +	std	r4,VCPU_XER(r3)
> +
> +	reg = 14
> +	.rept	18
> +	ld	reg,STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
> +	reg = reg + 1
> +	.endr
> +
> +	lwz	r4,SFS+8(r1)
> +	mtcr	r4
> +
> +	/*
> +	 * Flush the link stack here, before executing the first blr on the
> +	 * way out of the guest.
> +	 *
> +	 * The link stack won't match coming out of the guest anyway so the
> +	 * only cost is the flush itself. The call clobbers r0.
> +	 */
> +1:	nop
> +	patch_site 1b patch__call_kvm_flush_link_stack_2
> +
> +	addi	r1,r1,SFS
> +	ld	r0,PPC_LR_STKOFF(r1)
> +	mtlr	r0
> +	blr
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 1997cf347d3e..28a2761515e3 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1421,6 +1421,8 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
>  	 */
>  	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
>  		r = RESUME_PAGE_FAULT;
> +		if (vcpu->arch.fault_dsisr == HDSISR_CANARY)
> +			r = RESUME_GUEST; /* Just retry if it's the canary */
>  		break;
>  	case BOOK3S_INTERRUPT_H_INST_STORAGE:
>  		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
> @@ -3736,14 +3738,14 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
>  		vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
>  		vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
>  		mtspr(SPRN_PSSCR_PR, host_psscr);
> -
>  		/* H_CEDE has to be handled now, not later */
> -		if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
> +		if (trap == BOOK3S_INTERRUPT_SYSCALL &&
>  		    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
>  			kvmppc_cede(vcpu);
>  			kvmppc_set_gpr(vcpu, 3, 0);
>  			trap = 0;
>  		}
> +
>  	} else {
>  		kvmppc_xive_push_vcpu(vcpu);
>  		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
> @@ -3768,9 +3770,10 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
>  		}
>  	}
>  	kvmppc_xive_pull_vcpu(vcpu);
> +
> +	vcpu->arch.slb_max = 0;
>  }
>
> -	vcpu->arch.slb_max = 0;
>  	dec = mfspr(SPRN_DEC);
>  	if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
>  		dec = (s32) dec;
> @@ -4429,11 +4432,19 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
>  	else
>  		r = kvmppc_run_vcpu(vcpu);
>
> -	if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
> -	    !(vcpu->arch.shregs.msr & MSR_PR)) {
> -		trace_kvm_hcall_enter(vcpu);
> -		r = kvmppc_pseries_do_hcall(vcpu);
> -		trace_kvm_hcall_exit(vcpu, r);
> +	if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
> +		if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
> +			/*
> +			 * Guest userspace executed sc 1, reflect it
> +			 * back as a privileged program check interrupt.
> +			 */
> +			kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
> +			r = RESUME_GUEST;

This is in conflict with this snippet in kvmppc_handle_exit_hv:

	case BOOK3S_INTERRUPT_SYSCALL:
	{
		/* hcall - punt to userspace */
		int i;

		/* hypercall with MSR_PR has already been handled in rmode,
		 * and never reaches here.
		 */

That function already queues some 0x700s, so maybe we could move this
one in there as well.
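Roughly something like this, completely untested and only meant to show
where the check could live, reusing the same kvmppc_core_queue_program()
call this patch adds:

	case BOOK3S_INTERRUPT_SYSCALL:
	{
		/* hcall - punt to userspace */
		int i;

		if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
			/*
			 * sc 1 from guest userspace: reflect it back as a
			 * privileged program check, as this patch does in
			 * kvmppc_vcpu_run_hv().
			 */
			kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
			r = RESUME_GUEST;
			break;
		}

		/* existing hcall-to-userspace code stays as it is */
		...

That would also let the now-stale "has already been handled in rmode"
comment go away.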
> +		} else {
> +			trace_kvm_hcall_enter(vcpu);
> +			r = kvmppc_pseries_do_hcall(vcpu);
> +			trace_kvm_hcall_exit(vcpu, r);
> +		}
>  		kvmppc_core_prepare_to_enter(vcpu);
>  	} else if (r == RESUME_PAGE_FAULT) {
>  		srcu_idx = srcu_read_lock(&kvm->srcu);
> diff --git a/arch/powerpc/kvm/book3s_hv_interrupt.c b/arch/powerpc/kvm/book3s_hv_interrupt.c
> new file mode 100644
> index 000000000000..5a7b036c447f
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_interrupt.c
> @@ -0,0 +1,221 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/kernel.h>
> +#include <linux/kvm_host.h>
> +#include <asm/asm-prototypes.h>
> +#include <asm/dbell.h>
> +#include <asm/kvm_ppc.h>
> +
> +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
> +static void __start_timing(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +	u64 tb = mftb() - vc->tb_offset_applied;
> +
> +	vcpu->arch.cur_activity = next;
> +	vcpu->arch.cur_tb_start = tb;
> +}
> +
> +static void __accumulate_time(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +	struct kvmhv_tb_accumulator *curr;
> +	u64 tb = mftb() - vc->tb_offset_applied;
> +	u64 prev_tb;
> +	u64 delta;
> +	u64 seq;
> +
> +	curr = vcpu->arch.cur_activity;
> +	vcpu->arch.cur_activity = next;
> +	prev_tb = vcpu->arch.cur_tb_start;
> +	vcpu->arch.cur_tb_start = tb;
> +
> +	if (!curr)
> +		return;
> +
> +	delta = tb - prev_tb;
> +
> +	seq = curr->seqcount;
> +	curr->seqcount = seq + 1;
> +	smp_wmb();
> +	curr->tb_total += delta;
> +	if (seq == 0 || delta < curr->tb_min)
> +		curr->tb_min = delta;
> +	if (delta > curr->tb_max)
> +		curr->tb_max = delta;
> +	smp_wmb();
> +	curr->seqcount = seq + 2;
> +}
> +
> +#define start_timing(vcpu, next) __start_timing(vcpu, next)
> +#define end_timing(vcpu) __start_timing(vcpu, NULL)
> +#define accumulate_time(vcpu, next) __accumulate_time(vcpu, next)
> +#else
> +#define start_timing(vcpu, next) do {} while (0)
> +#define end_timing(vcpu) do {} while (0)
> +#define accumulate_time(vcpu, next) do {} while (0)
> +#endif
> +
> +static inline void mfslb(unsigned int idx, u64 *slbee, u64 *slbev)
> +{
> +	asm volatile("slbmfev %0,%1" : "=r" (*slbev) : "r" (idx));
> +	asm volatile("slbmfee %0,%1" : "=r" (*slbee) : "r" (idx));
> +}
> +
> +static inline void mtslb(unsigned int idx, u64 slbee, u64 slbev)
> +{
> +	BUG_ON((slbee & 0xfff) != idx);
> +
> +	asm volatile("slbmte %0,%1" :: "r" (slbev), "r" (slbee));
> +}
> +
> +static inline void slb_invalidate(unsigned int ih)
> +{
> +	asm volatile("slbia %0" :: "i"(ih));
> +}
> +
> +/*
> + * Malicious or buggy radix guests may have inserted SLB entries
> + * (only 0..3 because radix always runs with UPRT=1), so these must
> + * be cleared here to avoid side-channels. slbmte is used rather
> + * than slbia, as it won't clear cached translations.
> + */
> +static void radix_clear_slb(void)
> +{
> +	u64 slbee, slbev;
> +	int i;
> +
> +	for (i = 0; i < 4; i++) {
> +		mfslb(i, &slbee, &slbev);
> +		if (unlikely(slbee || slbev)) {
> +			slbee = i;
> +			slbev = 0;
> +			mtslb(i, slbee, slbev);
> +		}
> +	}
> +}
> +
> +int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu)
> +{
> +	u64 *exsave;
> +	unsigned long msr = mfmsr();
> +	int trap;
> +
> +	start_timing(vcpu, &vcpu->arch.rm_entry);
> +
> +	vcpu->arch.ceded = 0;
> +
> +	WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_HV);
> +	WARN_ON_ONCE(!(vcpu->arch.shregs.msr & MSR_ME));
> +
> +	mtspr(SPRN_HSRR0, vcpu->arch.regs.nip);
> +	mtspr(SPRN_HSRR1, (vcpu->arch.shregs.msr & ~MSR_HV) | MSR_ME);
> +
> +	/*
> +	 * On POWER9 DD2.1 and below, sometimes on a Hypervisor Data Storage
> +	 * Interrupt (HDSI) the HDSISR is not be updated at all.

s/be //

> +	 *
> +	 * To work around this we put a canary value into the HDSISR before
> +	 * returning to a guest and then check for this canary when we take a
> +	 * HDSI. If we find the canary on a HDSI, we know the hardware didn't
> +	 * update the HDSISR. In this case we return to the guest to retake the
> +	 * HDSI which should correctly update the HDSISR the second time HDSI
> +	 * entry.
> +	 *
> +	 * Just do this on all p9 processors for now.
> +	 */
> +	mtspr(SPRN_HDSISR, HDSISR_CANARY);
> +
> +	accumulate_time(vcpu, &vcpu->arch.guest_time);
> +
> +	local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST_HV_FAST;
> +	kvmppc_p9_enter_guest(vcpu);
> +	// Radix host and guest means host never runs with guest MMU state
> +	local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE;
> +
> +	accumulate_time(vcpu, &vcpu->arch.rm_intr);
> +
> +	/* Get these from r11/12 and paca exsave */
> +	vcpu->arch.shregs.srr0 = mfspr(SPRN_SRR0);
> +	vcpu->arch.shregs.srr1 = mfspr(SPRN_SRR1);
> +	vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
> +	vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
> +
> +	trap = local_paca->kvm_hstate.scratch0 & ~0x2;

Couldn't we return the trap from kvmppc_p9_enter_guest? Seems like a
nice pattern to have, and it avoids exposing the IVEC+0x2 magic, which
is already hidden in the asm.
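Completely untested sketch, just to show the shape I mean. The asm exit
path would do the ~0x2 masking and leave the masked vector in r3 before
returning, so the prototype would become:

	int kvmppc_p9_enter_guest(struct kvm_vcpu *vcpu);

and the call site here would turn into:

	local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST_HV_FAST;
	trap = kvmppc_p9_enter_guest(vcpu);
	// Radix host and guest means host never runs with guest MMU state
	local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE;

with the "trap = local_paca->kvm_hstate.scratch0 & ~0x2;" line going
away entirely.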
> +	if (likely(trap > BOOK3S_INTERRUPT_MACHINE_CHECK)) {
> +		exsave = local_paca->exgen;
> +	} else if (trap == BOOK3S_INTERRUPT_SYSTEM_RESET) {
> +		exsave = local_paca->exnmi;
> +	} else { /* trap == 0x200 */
> +		exsave = local_paca->exmc;
> +	}
> +
> +	vcpu->arch.regs.gpr[1] = local_paca->kvm_hstate.scratch1;
> +	vcpu->arch.regs.gpr[3] = local_paca->kvm_hstate.scratch2;
> +	vcpu->arch.regs.gpr[9] = exsave[EX_R9/sizeof(u64)];
> +	vcpu->arch.regs.gpr[10] = exsave[EX_R10/sizeof(u64)];
> +	vcpu->arch.regs.gpr[11] = exsave[EX_R11/sizeof(u64)];
> +	vcpu->arch.regs.gpr[12] = exsave[EX_R12/sizeof(u64)];
> +	vcpu->arch.regs.gpr[13] = exsave[EX_R13/sizeof(u64)];
> +	vcpu->arch.ppr = exsave[EX_PPR/sizeof(u64)];
> +	vcpu->arch.cfar = exsave[EX_CFAR/sizeof(u64)];
> +	vcpu->arch.regs.ctr = exsave[EX_CTR/sizeof(u64)];
> +
> +	vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
> +
> +	if (unlikely(trap == BOOK3S_INTERRUPT_MACHINE_CHECK)) {
> +		vcpu->arch.fault_dar = exsave[EX_DAR/sizeof(u64)];
> +		vcpu->arch.fault_dsisr = exsave[EX_DSISR/sizeof(u64)];
> +		kvmppc_realmode_machine_check(vcpu);
> +
> +	} else if (unlikely(trap == BOOK3S_INTERRUPT_HMI)) {
> +		kvmppc_realmode_hmi_handler();
> +
> +	} else if (trap == BOOK3S_INTERRUPT_H_EMUL_ASSIST) {
> +		vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
> +
> +	} else if (trap == BOOK3S_INTERRUPT_H_DATA_STORAGE) {
> +		vcpu->arch.fault_dar = exsave[EX_DAR/sizeof(u64)];
> +		vcpu->arch.fault_dsisr = exsave[EX_DSISR/sizeof(u64)];
> +		vcpu->arch.fault_gpa = mfspr(SPRN_ASDR);
> +
> +	} else if (trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
> +		vcpu->arch.fault_gpa = mfspr(SPRN_ASDR);
> +
> +	} else if (trap == BOOK3S_INTERRUPT_H_FAC_UNAVAIL) {
> +		vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
> +
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> +	/*
> +	 * Softpatch interrupt for transactional memory emulation cases
> +	 * on POWER9 DD2.2. This is early in the guest exit path - we
> +	 * haven't saved registers or done a treclaim yet.
> +	 */
> +	} else if (trap == BOOK3S_INTERRUPT_HV_SOFTPATCH) {
> +		vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
> +
> +		/*
> +		 * The cases we want to handle here are those where the guest
> +		 * is in real suspend mode and is trying to transition to
> +		 * transactional mode.
> +		 */
> +		if (local_paca->kvm_hstate.fake_suspend &&
> +		    (vcpu->arch.shregs.msr & MSR_TS_S)) {
> +			if (kvmhv_p9_tm_emulation_early(vcpu)) {
> +				/* Prevent it being handled again. */
> +				trap = 0;
> +			}
> +		}
> +#endif
> +	}
> +
> +	radix_clear_slb();
> +
> +	__mtmsrd(msr, 0);

Isn't this the same as mtmsr(msr)?

> +
> +	accumulate_time(vcpu, &vcpu->arch.rm_exit);
> +
> +	end_timing(vcpu);
> +
> +	return trap;
> +}
> +EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9);

<snip>