On Fri, Sep 21, 2018 at 08:01:51PM +1000, Paul Mackerras wrote:
> This adds a new hypercall, H_ENTER_NESTED, which is used by a nested
> hypervisor to enter one of its nested guests. The hypercall supplies
> register values in two structs. Those values are copied by the level 0
> (L0) hypervisor (the one which is running in hypervisor mode) into the
> vcpu struct of the L1 guest, and then the guest is run until an
> interrupt or error occurs which needs to be reported to L1 via the
> hypercall return value.
> 
> Currently this assumes that the L0 and L1 hypervisors are the same
> endianness, and the structs passed as arguments are in native
> endianness.

That's nasty. It'd be good to at least detect this and bail.
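Even just extending the existing version check in kvmhv_enter_nested_guest()
so the byte-swapped case is caught explicitly would do; something along
these lines (completely untested sketch):

	if (l2_hv.version != HV_GUEST_STATE_VERSION) {
		/*
		 * Untested sketch: a version that only matches after
		 * swab64() suggests an opposite-endian L1 rather than a
		 * stale structure layout, so detect it and bail cleanly.
		 */
		if (swab64(l2_hv.version) == HV_GUEST_STATE_VERSION)
			pr_warn_ratelimited("H_ENTER_NESTED: L1 appears to be the opposite endianness\n");
		return H_P2;
	}

Actually byte-swapping the structs for a mismatched-endian L1 could always
come later.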
> 
> Nested hypervisors do not support indep_threads_mode=N, so this adds
> code to print a warning message if the administrator has set
> indep_threads_mode=N, and treat it as Y.
> 
> Signed-off-by: Paul Mackerras <paulus@xxxxxxxxxx>
> ---
>  arch/powerpc/include/asm/hvcall.h       |  36 +++++
>  arch/powerpc/include/asm/kvm_book3s.h   |   7 +
>  arch/powerpc/include/asm/kvm_host.h     |   5 +
>  arch/powerpc/kernel/asm-offsets.c       |   1 +
>  arch/powerpc/kvm/book3s_hv.c            | 194 +++++++++++++++++++++++----
>  arch/powerpc/kvm/book3s_hv_nested.c     | 230 ++++++++++++++++++++++++++++++++
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S |   4 +
>  7 files changed, 452 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
> index 9afaa82..dfcf43d 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -487,6 +487,42 @@ struct h_cpu_char_result {
>  	u64 behaviour;
>  };
>  
> +/* Register state for entering a nested guest with H_ENTER_NESTED */
> +struct hv_guest_state {
> +	u64 version;		/* version of this structure layout */
> +	u32 lpid;
> +	u32 vcpu_token;
> +	/* These registers are hypervisor privileged (at least for writing) */
> +	u64 lpcr;
> +	u64 pcr;
> +	u64 amor;
> +	u64 dpdes;
> +	u64 hfscr;
> +	s64 tb_offset;
> +	u64 dawr0;
> +	u64 dawrx0;
> +	u64 ciabr;
> +	u64 hdec_expiry;
> +	u64 purr;
> +	u64 spurr;
> +	u64 ic;
> +	u64 vtb;
> +	u64 hdar;
> +	u64 hdsisr;
> +	u64 heir;
> +	u64 asdr;
> +	/* These are OS privileged but need to be set late in guest entry */
> +	u64 srr0;
> +	u64 srr1;
> +	u64 sprg[4];
> +	u64 pidr;
> +	u64 cfar;
> +	u64 ppr;
> +};
> +
> +/* Latest version of hv_guest_state structure */
> +#define HV_GUEST_STATE_VERSION	1
> +
>  #endif /* __ASSEMBLY__ */
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_POWERPC_HVCALL_H */
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 7719ca5..125bc5b 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -280,6 +280,13 @@ void kvmhv_vm_nested_init(struct kvm *kvm);
>  long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
>  void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
>  void kvmhv_release_all_nested(struct kvm *kvm);
> +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
> +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu,
> +			  u64 time_limit);
> +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
> +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
> +				   struct hv_guest_state *hr);
> +long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
>  
>  void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
>  
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index c35d4f2..ceb9f20 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -95,6 +95,7 @@ struct dtl_entry;
>  
>  struct kvmppc_vcpu_book3s;
>  struct kvmppc_book3s_shadow_vcpu;
> +struct kvm_nested_guest;
>  
>  struct kvm_vm_stat {
>  	ulong remote_tlb_flush;
> @@ -786,6 +787,10 @@ struct kvm_vcpu_arch {
>  	u32 emul_inst;
>  
>  	u32 online;
> +
> +	/* For support of nested guests */
> +	struct kvm_nested_guest *nested;
> +	u32 nested_vcpu_id;
>  #endif
>  
>  #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index 7c3738d..d0abcbb 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -503,6 +503,7 @@ int main(void)
>  	OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
>  	OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty);
>  	OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst);
> +	OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested);
>  	OFFSET(VCPU_CPU, kvm_vcpu, cpu);
>  	OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu);
>  #endif
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 82c9a1e..da380cb 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -942,6 +942,13 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>  		break;
>  	case H_ENTER_NESTED:
>  		ret = H_FUNCTION;
> +		if (!vcpu->kvm->arch.nested_enable)
> +			break;

Wouldn't H_AUTHORITY make more sense than H_FUNCTION for the
no-nested-allowed case?

> +		ret = kvmhv_enter_nested_guest(vcpu);
> +		if (ret == H_INTERRUPT) {
> +			kvmppc_set_gpr(vcpu, 3, 0);
> +			return RESUME_HOST;
> +		}
>  		break;
>  
>  	default:
> @@ -1269,6 +1276,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  	return r;
>  }
>  
> +static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)

Might be nice to rename this to make it clear whether it is the L0 or L1
handling for a nested exit.

> +{
> +	int r;
> +	int srcu_idx;
> +
> +	vcpu->stat.sum_exits++;
> +
> +	/*
> +	 * This can happen if an interrupt occurs in the last stages
> +	 * of guest entry or the first stages of guest exit (i.e. after
> +	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
> +	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
> +	 * That can happen due to a bug, or due to a machine check
> +	 * occurring at just the wrong time.
> +	 */
> +	if (vcpu->arch.shregs.msr & MSR_HV) {
> +		printk(KERN_EMERG "KVM trap in HV mode while nested!\n");
> +		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
> +			vcpu->arch.trap, kvmppc_get_pc(vcpu),
> +			vcpu->arch.shregs.msr);
> +		kvmppc_dump_regs(vcpu);
> +		BUG();
> +	}
> +	switch (vcpu->arch.trap) {
> +	/* We're good on these - the host merely wanted to get our attention */
> +	case BOOK3S_INTERRUPT_HV_DECREMENTER:
> +		vcpu->stat.dec_exits++;
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_EXTERNAL:
> +		vcpu->stat.ext_intr_exits++;
> +		r = RESUME_HOST;
> +		break;
> +	case BOOK3S_INTERRUPT_H_DOORBELL:
> +	case BOOK3S_INTERRUPT_H_VIRT:
> +		vcpu->stat.ext_intr_exits++;
> +		r = RESUME_GUEST;
> +		break;
> +	/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
> +	case BOOK3S_INTERRUPT_HMI:
> +	case BOOK3S_INTERRUPT_PERFMON:
> +	case BOOK3S_INTERRUPT_SYSTEM_RESET:
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_MACHINE_CHECK:
> +		/* Pass the machine check to the L1 guest */
> +		r = RESUME_HOST;
> +		/* Print the MCE event to host console. */
> +		machine_check_print_event_info(&vcpu->arch.mce_evt, false);
> +		break;
> +	/*
> +	 * We get these next two if the guest accesses a page which it thinks
> +	 * it has mapped but which is not actually present, either because
> +	 * it is for an emulated I/O device or because the corresponding
> +	 * host page has been paged out.
> +	 */
> +	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
> +		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> +		r = kvmhv_nested_page_fault(vcpu);
> +		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
> +		break;
> +	case BOOK3S_INTERRUPT_H_INST_STORAGE:
> +		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
> +		vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
> +			DSISR_SRR1_MATCH_64S;
> +		if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
> +			vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
> +		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> +		r = kvmhv_nested_page_fault(vcpu);
> +		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
> +		break;
> +
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> +	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
> +		/*
> +		 * This occurs for various TM-related instructions that
> +		 * we need to emulate on POWER9 DD2.2. We have already
> +		 * handled the cases where the guest was in real-suspend
> +		 * mode and was transitioning to transactional state.
> +		 */
> +		r = kvmhv_p9_tm_emulation(vcpu);
> +		break;
> +#endif
> +
> +	case BOOK3S_INTERRUPT_HV_RM_HARD:
> +		vcpu->arch.trap = 0;
> +		r = RESUME_GUEST;
> +		if (!xive_enabled())
> +			kvmppc_xics_rm_complete(vcpu, 0);
> +		break;
> +	default:
> +		r = RESUME_HOST;
> +		break;
> +	}
> +
> +	return r;
> +}
> +
>  static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
>  					struct kvm_sregs *sregs)
>  {
> @@ -3095,7 +3200,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
>  /*
>   * Load up hypervisor-mode registers on P9.
>   */
> -static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu)
> +static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit)
>  {
>  	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>  	s64 hdec;
> @@ -3108,7 +3213,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu)
>  	unsigned long host_psscr = mfspr(SPRN_PSSCR);
>  	unsigned long host_pidr = mfspr(SPRN_PID);
>  
> -	hdec = local_paca->kvm_hstate.dec_expires - mftb();
> +	hdec = time_limit - mftb();

Looks like this change might better belong in the earlier patch creating
kvmhv_load_hv_regs_and_go().

>  	if (hdec < 0)
>  		return BOOK3S_INTERRUPT_HV_DECREMENTER;
>  	mtspr(SPRN_HDEC, hdec);
> @@ -3222,7 +3327,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu)
>   * Virtual-mode guest entry for POWER9 and later when the host and
>   * guest are both using the radix MMU. The LPIDR has already been set.
>   */
> -int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu)
> +int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit)
>  {
>  	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>  	unsigned long host_dscr = mfspr(SPRN_DSCR);
> @@ -3237,6 +3342,8 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu)
>  	if (dec < 512)
>  		return BOOK3S_INTERRUPT_HV_DECREMENTER;
>  	local_paca->kvm_hstate.dec_expires = dec + tb;
> +	if (local_paca->kvm_hstate.dec_expires < time_limit)
> +		time_limit = local_paca->kvm_hstate.dec_expires;
>  
>  	vcpu->arch.ceded = 0;
>  
> @@ -3290,7 +3397,28 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu)
>  		vcpu->arch.doorbell_request = 0;
>  	}
>  
> -	trap = kvmhv_load_hv_regs_and_go(vcpu);
> +	if (!cpu_has_feature(CPU_FTR_HVMODE)) {
> +		/* call our hypervisor to load up HV regs and go */
> +		struct hv_guest_state hvregs;
> +
> +		kvmhv_save_hv_regs(vcpu, &hvregs);
> +		vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
> +		hvregs.version = HV_GUEST_STATE_VERSION;
> +		hvregs.lpid = vcpu->kvm->arch.lpid;
> +		hvregs.vcpu_token = vcpu->vcpu_id;
> +		hvregs.hdec_expiry = time_limit;
> +		if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
> +			     &vcpu->arch.pending_exceptions))
> +			hvregs.lpcr |= LPCR_MER;
> +		trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
> +					  __pa(&vcpu->arch.regs));
> +		kvmhv_restore_hv_return_state(vcpu, &hvregs);
> +		vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
> +		vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
> +		vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
> +	} else {
> +		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit);
> +	}
>  
>  	vcpu->arch.slb_max = 0;
>  	dec = mfspr(SPRN_DEC);
> @@ -3530,6 +3658,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
>  	trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
>  }
>  
> +/*
> + * This is assumed not to be able to fail for a radix guest in
> + * kvmhv_run_single_vcpu().
> + */

Might be clearer to split that change out with a rationale as to why it's
correct.

>  static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
>  {
>  	int r = 0;
> @@ -3679,12 +3811,14 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
>  	return vcpu->arch.ret;
>  }
>  
> -static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
> -				  struct kvm_vcpu *vcpu)
> +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
> +			  struct kvm_vcpu *vcpu, u64 time_limit)
>  {
>  	int trap, r, pcpu, pcpu0;
>  	int srcu_idx;
>  	struct kvmppc_vcore *vc;
> +	struct kvm_nested_guest *nested = vcpu->arch.nested;
> +	unsigned long lpid;
>  
>  	trace_kvmppc_run_vcpu_enter(vcpu);
>  
> @@ -3705,16 +3839,8 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
>  	vc->runner = vcpu;
>  
>  	/* See if the MMU is ready to go */
> -	if (!vcpu->kvm->arch.mmu_ready) {
> -		r = kvmhv_setup_mmu(vcpu);
> -		if (r) {
> -			kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> -			kvm_run->fail_entry.
> -				hardware_entry_failure_reason = 0;
> -			vcpu->arch.ret = r;
> -			goto out;
> -		}
> -	}
> +	if (!vcpu->kvm->arch.mmu_ready)
> +		kvmhv_setup_mmu(vcpu);
>  
>  	if (need_resched())
>  		cond_resched();
> @@ -3736,7 +3862,12 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
>  	if (lazy_irq_pending() || need_resched() || !vcpu->kvm->arch.mmu_ready)
>  		goto out;
>  
> -	kvmppc_core_prepare_to_enter(vcpu);
> +	if (!nested) {
> +		kvmppc_core_prepare_to_enter(vcpu);
> +	} else if (vcpu->arch.pending_exceptions) {
> +		vcpu->arch.ret = RESUME_HOST;
> +		goto out;
> +	}
>  
>  	kvmppc_clear_host_core(pcpu);
>  
> @@ -3750,7 +3881,10 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
>  	vc->vcore_state = VCORE_RUNNING;
>  	trace_kvmppc_run_core(vc, 0);
>  
> -	mtspr(SPRN_LPID, vc->kvm->arch.lpid);
> +	lpid = vc->kvm->arch.lpid;
> +	if (nested)
> +		lpid = nested->shadow_lpid;
> +	mtspr(SPRN_LPID, lpid);
>  	isync();
>  
>  	/* See comment above in kvmppc_run_core() about this */
> @@ -3759,7 +3893,7 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
>  	pcpu0 &= ~0x3UL;
>  
>  	if (cpumask_test_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush)) {
> -		radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
> +		radix__local_flush_tlb_lpid_guest(lpid);
>  		/* Clear the bit after the TLB flush */
>  		cpumask_clear_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush);
>  	}
> @@ -3771,7 +3905,7 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
>  
>  	this_cpu_disable_ftrace();
>  
> -	trap = kvmhv_p9_guest_entry(vcpu);
> +	trap = kvmhv_p9_guest_entry(vcpu, time_limit);
>  	vcpu->arch.trap = trap;
>  
>  	this_cpu_enable_ftrace();
> @@ -3796,8 +3930,12 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
>  
>  	trace_kvm_guest_exit(vcpu);
>  	r = RESUME_GUEST;
> -	if (trap)
> -		r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
> +	if (trap) {
> +		if (!nested)
> +			r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
> +		else
> +			r = kvmppc_handle_nested_exit(vcpu);
> +	}
>  	vcpu->arch.ret = r;
>  
>  	if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
> @@ -3912,7 +4050,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
>  
>  	do {
>  		if (kvm->arch.threads_indep && kvm_is_radix(kvm))
> -			r = kvmppc_run_single_vcpu(run, vcpu);
> +			r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0);
>  		else
>  			r = kvmppc_run_vcpu(run, vcpu);
>  
> @@ -4462,8 +4600,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
>  	 * On POWER9, we only need to do this if the "indep_threads_mode"
>  	 * module parameter has been set to N.
>  	 */
> -	if (cpu_has_feature(CPU_FTR_ARCH_300))
> -		kvm->arch.threads_indep = indep_threads_mode;
> +	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> +		if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
> +			pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
> +			kvm->arch.threads_indep = true;

Wouldn't it be cleaner to enforce this at the point indep_threads_mode is
set, rather than altering the value here?
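e.g. a set hook on the module parameter, something like this (untested
sketch, replacing the existing module_param() for indep_threads_mode):

	/*
	 * Untested sketch: reject indep_threads_mode=N at the point it is
	 * set, when we are ourselves running under a hypervisor.
	 */
	static int indep_threads_mode_set(const char *val,
					  const struct kernel_param *kp)
	{
		bool v;
		int err;

		err = kstrtobool(val, &v);
		if (err)
			return err;
		if (!v && !cpu_has_feature(CPU_FTR_HVMODE)) {
			pr_warn("KVM: indep_threads_mode=N not supported in a nested hypervisor\n");
			return -EINVAL;
		}
		*(bool *)kp->arg = v;
		return 0;
	}

	static const struct kernel_param_ops indep_threads_mode_ops = {
		.set = indep_threads_mode_set,
		.get = param_get_bool,
	};

	module_param_cb(indep_threads_mode, &indep_threads_mode_ops,
			&indep_threads_mode, 0644);

Then kvmppc_core_init_vm_hv() could keep using the parameter value as-is.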
> +		} else {
> +			kvm->arch.threads_indep = indep_threads_mode;
> +		}
> +	}
>  	if (!kvm->arch.threads_indep)
>  		kvm_hv_vm_activated();
>  
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> index 5fe3ea4..a7f3da9 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -20,6 +20,231 @@ static struct patb_entry *pseries_partition_tb;
>  
>  static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
>  
> +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +	hr->lpcr = vc->lpcr;
> +	hr->pcr = vc->pcr;
> +	hr->dpdes = vc->dpdes;
> +	hr->hfscr = vcpu->arch.hfscr;
> +	hr->tb_offset = vc->tb_offset;
> +	hr->dawr0 = vcpu->arch.dawr;
> +	hr->dawrx0 = vcpu->arch.dawrx;
> +	hr->ciabr = vcpu->arch.ciabr;
> +	hr->purr = vcpu->arch.purr;
> +	hr->spurr = vcpu->arch.spurr;
> +	hr->ic = vcpu->arch.ic;
> +	hr->vtb = vc->vtb;
> +	hr->srr0 = vcpu->arch.shregs.srr0;
> +	hr->srr1 = vcpu->arch.shregs.srr1;
> +	hr->sprg[0] = vcpu->arch.shregs.sprg0;
> +	hr->sprg[1] = vcpu->arch.shregs.sprg1;
> +	hr->sprg[2] = vcpu->arch.shregs.sprg2;
> +	hr->sprg[3] = vcpu->arch.shregs.sprg3;
> +	hr->pidr = vcpu->arch.pid;
> +	hr->cfar = vcpu->arch.cfar;
> +	hr->ppr = vcpu->arch.ppr;
> +}
> +
> +static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
> +				 struct hv_guest_state *hr)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +	hr->dpdes = vc->dpdes;
> +	hr->hfscr = vcpu->arch.hfscr;
> +	hr->purr = vcpu->arch.purr;
> +	hr->spurr = vcpu->arch.spurr;
> +	hr->ic = vcpu->arch.ic;
> +	hr->vtb = vc->vtb;
> +	hr->srr0 = vcpu->arch.shregs.srr0;
> +	hr->srr1 = vcpu->arch.shregs.srr1;
> +	hr->sprg[0] = vcpu->arch.shregs.sprg0;
> +	hr->sprg[1] = vcpu->arch.shregs.sprg1;
> +	hr->sprg[2] = vcpu->arch.shregs.sprg2;
> +	hr->sprg[3] = vcpu->arch.shregs.sprg3;
> +	hr->pidr = vcpu->arch.pid;
> +	hr->cfar = vcpu->arch.cfar;
> +	hr->ppr = vcpu->arch.ppr;
> +	switch (trap) {
> +	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
> +		hr->hdar = vcpu->arch.fault_dar;
> +		hr->hdsisr = vcpu->arch.fault_dsisr;
> +		hr->asdr = vcpu->arch.fault_gpa;
> +		break;
> +	case BOOK3S_INTERRUPT_H_INST_STORAGE:
> +		hr->asdr = vcpu->arch.fault_gpa;
> +		break;
> +	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
> +		hr->heir = vcpu->arch.emul_inst;
> +		break;
> +	}
> +}
> +
> +static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +	vc->pcr = hr->pcr;
> +	vc->dpdes = hr->dpdes;
> +	vcpu->arch.hfscr = hr->hfscr;
> +	vcpu->arch.dawr = hr->dawr0;
> +	vcpu->arch.dawrx = hr->dawrx0;
> +	vcpu->arch.ciabr = hr->ciabr;
> +	vcpu->arch.purr = hr->purr;
> +	vcpu->arch.spurr = hr->spurr;
> +	vcpu->arch.ic = hr->ic;
> +	vc->vtb = hr->vtb;
> +	vcpu->arch.shregs.srr0 = hr->srr0;
> +	vcpu->arch.shregs.srr1 = hr->srr1;
> +	vcpu->arch.shregs.sprg0 = hr->sprg[0];
> +	vcpu->arch.shregs.sprg1 = hr->sprg[1];
> +	vcpu->arch.shregs.sprg2 = hr->sprg[2];
> +	vcpu->arch.shregs.sprg3 = hr->sprg[3];
> +	vcpu->arch.pid = hr->pidr;
> +	vcpu->arch.cfar = hr->cfar;
> +	vcpu->arch.ppr = hr->ppr;
> +}
> +
> +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
> +				   struct hv_guest_state *hr)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +	vc->dpdes = hr->dpdes;
> +	vcpu->arch.hfscr = hr->hfscr;
> +	vcpu->arch.purr = hr->purr;
> +	vcpu->arch.spurr = hr->spurr;
> +	vcpu->arch.ic = hr->ic;
> +	vc->vtb = hr->vtb;
> +	vcpu->arch.fault_dar = hr->hdar;
> +	vcpu->arch.fault_dsisr = hr->hdsisr;
> +	vcpu->arch.fault_gpa = hr->asdr;
> +	vcpu->arch.emul_inst = hr->heir;
> +	vcpu->arch.shregs.srr0 = hr->srr0;
> +	vcpu->arch.shregs.srr1 = hr->srr1;
> +	vcpu->arch.shregs.sprg0 = hr->sprg[0];
> +	vcpu->arch.shregs.sprg1 = hr->sprg[1];
> +	vcpu->arch.shregs.sprg2 = hr->sprg[2];
> +	vcpu->arch.shregs.sprg3 = hr->sprg[3];
> +	vcpu->arch.pid = hr->pidr;
> +	vcpu->arch.cfar = hr->cfar;
> +	vcpu->arch.ppr = hr->ppr;
> +}
> +
> +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
> +{
> +	long int err, r;
> +	struct kvm_nested_guest *l2;
> +	struct pt_regs l2_regs, saved_l1_regs;
> +	struct hv_guest_state l2_hv, saved_l1_hv;
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +	u64 hv_ptr, regs_ptr;
> +	u64 hdec_exp;
> +	s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
> +	u64 mask;
> +
> +	if (!kvm_is_radix(vcpu->kvm))
> +		return H_FUNCTION;

Would it be safer / cleaner to have this instead check that the L1 has
completed an H_SET_PARTITION_TABLE, which wouldn't be allowed for an HPT
guest?

> +
> +	/* copy parameters in */
> +	hv_ptr = kvmppc_get_gpr(vcpu, 4);
> +	err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
> +				  sizeof(struct hv_guest_state));
> +	if (err)
> +		return H_PARAMETER;
> +	if (l2_hv.version != HV_GUEST_STATE_VERSION)
> +		return H_P2;
> +
> +	regs_ptr = kvmppc_get_gpr(vcpu, 5);
> +	err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
> +				  sizeof(struct pt_regs));
> +	if (err)
> +		return H_PARAMETER;
> +
> +	/* translate lpid */
> +	l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
> +	if (!l2)
> +		return H_PARAMETER;
> +	if (!l2->l1_gr_to_hr) {
> +		mutex_lock(&l2->tlb_lock);
> +		kvmhv_update_ptbl_cache(l2);
> +		mutex_unlock(&l2->tlb_lock);
> +	}
> +
> +	/* save l1 values of things */
> +	vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
> +	saved_l1_regs = vcpu->arch.regs;
> +	kvmhv_save_hv_regs(vcpu, &saved_l1_hv);
> +
> +	/* convert TB values/offsets to host (L0) values */
> +	hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
> +	vc->tb_offset += l2_hv.tb_offset;
> +
> +	/* set L1 state to L2 state */
> +	vcpu->arch.nested = l2;
> +	vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
> +	vcpu->arch.regs = l2_regs;
> +	vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
> +	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
> +		LPCR_LPES | LPCR_MER;
> +	vc->lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
> +	restore_hv_regs(vcpu, &l2_hv);
> +
> +	vcpu->arch.ret = RESUME_GUEST;
> +	vcpu->arch.trap = 0;
> +	do {
> +		if (mftb() >= hdec_exp) {
> +			vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
> +			r = RESUME_HOST;
> +			break;
> +		}
> +		r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp);
> +	} while (is_kvmppc_resume_guest(r));
> +
> +	/* save L2 state for return */
> +	l2_regs = vcpu->arch.regs;
> +	l2_regs.msr = vcpu->arch.shregs.msr;
> +	delta_purr = vcpu->arch.purr - l2_hv.purr;
> +	delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
> +	delta_ic = vcpu->arch.ic - l2_hv.ic;
> +	delta_vtb = vc->vtb - l2_hv.vtb;
> +	save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
> +
> +	/* restore L1 state */
> +	vcpu->arch.nested = NULL;
> +	vcpu->arch.regs = saved_l1_regs;
> +	vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
> +	/* set L1 MSR TS field according to L2 transaction state */
> +	if (l2_regs.msr & MSR_TS_MASK)
> +		vcpu->arch.shregs.msr |= MSR_TS_S;
> +	vc->lpcr = saved_l1_hv.lpcr;
> +	vc->tb_offset = saved_l1_hv.tb_offset;
> +	restore_hv_regs(vcpu, &saved_l1_hv);
> +	vcpu->arch.purr += delta_purr;
> +	vcpu->arch.spurr += delta_spurr;
> +	vcpu->arch.ic += delta_ic;
> +	vc->vtb += delta_vtb;
> +
> +	kvmhv_put_nested(l2);
> +
> +	/* copy l2_hv_state and regs back to guest */
> +	err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
> +				   sizeof(struct hv_guest_state));
> +	if (err)
> +		return H_AUTHORITY;
> +	err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
> +				   sizeof(struct pt_regs));
> +	if (err)
> +		return H_AUTHORITY;
> +
> +	if (r == -EINTR)
> +		return H_INTERRUPT;
> +
> +	return vcpu->arch.trap;
> +}
> +
>  /* Only called when we're not in hypervisor mode */
>  bool kvmhv_nested_init(void)
>  {
> @@ -284,3 +509,8 @@ struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
>  		return NULL;
>  	return kvm->arch.nested_guests[lpid];
>  }
> +
> +long kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
> +{
> +	return RESUME_HOST;
> +}
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 83efc13..04fcaa4 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -2199,6 +2199,10 @@ hcall_try_real_mode:
>  	andi.	r0,r11,MSR_PR
>  	/* sc 1 from userspace - reflect to guest syscall */
>  	bne	sc_1_fast_return
> +	/* sc 1 from nested guest - give it to L1 to handle */
> +	ld	r0, VCPU_NESTED(r9)
> +	cmpdi	r0, 0
> +	bne	guest_exit_cont
>  	clrrdi	r3,r3,2
>  	cmpldi	r3,hcall_real_table_end - hcall_real_table
>  	bge	guest_exit_cont

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson