On 2/4/19 12:07 PM, Paul Mackerras wrote: > Currently, the KVM code assumes that if the host kernel is using the > XIVE interrupt controller (the new interrupt controller that first > appeared in POWER9 systems), then the in-kernel XICS emulation will > use the XIVE hardware to deliver interrupts to the guest. However, > this only works when the host is running in hypervisor mode and has > full access to all of the XIVE functionality. It doesn't work in any > nested virtualization scenario, either with PR KVM or nested-HV KVM, > because the XICS-on-XIVE code calls directly into the native-XIVE > routines, which are not initialized and cannot function correctly > because they use OPAL calls, and OPAL is not available in a guest. > > This means that using the in-kernel XICS emulation in a nested > hypervisor that is using XIVE as its interrupt controller will cause a > (nested) host kernel crash. To fix this, we change most of the places > where the current code calls xive_enabled() to select between the > XICS-on-XIVE emulation and the plain XICS emulation to call a new > function, xics_on_xive(), which returns false in a guest. > > However, there is a further twist. The plain XICS emulation has some > functions which are used in real mode and access the underlying XICS > controller (the interrupt controller of the host) directly. In the > case of a nested hypervisor, this means doing XICS hypercalls > directly. When the nested host is using XIVE as its interrupt > controller, these hypercalls will fail. Therefore this also adds > checks in the places where the XICS emulation wants to access the > underlying interrupt controller directly, and if that is XIVE, makes > the code use the virtual mode fallback paths, which call generic > kernel infrastructure rather than doing direct XICS access. I gave the patch a try on a L1 hypervisor running with a XICS-on-XIVE KVM device. Performance is good. Without any tuning (CPU pinning is required to get all the juice), we have ~33 Gb/s between L0 and L2, a little more between L1 and L2. Reviewed-by: Cédric Le Goater <clg@xxxxxxxx> Thanks, C. > Signed-off-by: Paul Mackerras <paulus@xxxxxxxxxx> > --- > arch/powerpc/include/asm/kvm_ppc.h | 14 ++++++++++++++ > arch/powerpc/kvm/book3s.c | 10 +++++----- > arch/powerpc/kvm/book3s_hv.c | 16 ++++++++-------- > arch/powerpc/kvm/book3s_hv_builtin.c | 14 +++++++------- > arch/powerpc/kvm/book3s_hv_rm_xics.c | 7 +++++++ > arch/powerpc/kvm/book3s_rtas.c | 8 ++++---- > arch/powerpc/kvm/powerpc.c | 4 ++-- > 7 files changed, 47 insertions(+), 26 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h > index eb0d79f..b3bf4f6 100644 > --- a/arch/powerpc/include/asm/kvm_ppc.h > +++ b/arch/powerpc/include/asm/kvm_ppc.h > @@ -36,6 +36,8 @@ > #endif > #ifdef CONFIG_KVM_BOOK3S_64_HANDLER > #include <asm/paca.h> > +#include <asm/xive.h> > +#include <asm/cpu_has_feature.h> > #endif > > /* > @@ -616,6 +618,18 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir > static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } > #endif /* CONFIG_KVM_XIVE */ > > +#ifdef CONFIG_PPC_POWERNV > +static inline bool xics_on_xive(void) > +{ > + return xive_enabled() && cpu_has_feature(CPU_FTR_HVMODE); > +} > +#else > +static inline bool xics_on_xive(void) > +{ > + return false; > +} > +#endif > > /* > * Prototypes for functions called only from assembler code. > * Having prototypes reduces sparse errors. > diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c > index bd1a677..601c094 100644 > --- a/arch/powerpc/kvm/book3s.c > +++ b/arch/powerpc/kvm/book3s.c > @@ -635,7 +635,7 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, > r = -ENXIO; > break; > } > - if (xive_enabled()) > + if (xics_on_xive()) > *val = get_reg_val(id, kvmppc_xive_get_icp(vcpu)); > else > *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); > @@ -708,7 +708,7 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, > r = -ENXIO; > break; > } > - if (xive_enabled()) > + if (xics_on_xive()) > r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val)); > else > r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val)); > @@ -984,7 +984,7 @@ int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall) > int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, > bool line_status) > { > - if (xive_enabled()) > + if (xics_on_xive()) > return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level, > line_status); > else > @@ -1037,7 +1037,7 @@ static int kvmppc_book3s_init(void) > > #ifdef CONFIG_KVM_XICS > #ifdef CONFIG_KVM_XIVE > - if (xive_enabled()) { > + if (xics_on_xive()) { > kvmppc_xive_init_module(); > kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); > } else > @@ -1050,7 +1050,7 @@ static int kvmppc_book3s_init(void) > static void kvmppc_book3s_exit(void) > { > #ifdef CONFIG_KVM_XICS > - if (xive_enabled()) > + if (xics_on_xive()) > kvmppc_xive_exit_module(); > #endif > #ifdef CONFIG_KVM_BOOK3S_32_HANDLER > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 5a066fc..94b620b 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -922,7 +922,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) > case H_IPOLL: > case H_XIRR_X: > if (kvmppc_xics_enabled(vcpu)) { > - if (xive_enabled()) { > + if (xics_on_xive()) { > ret = H_NOT_AVAILABLE; > return RESUME_GUEST; > } > @@ -1431,7 +1431,7 @@ static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) > case BOOK3S_INTERRUPT_HV_RM_HARD: > vcpu->arch.trap = 0; > r = RESUME_GUEST; > - if (!xive_enabled()) > + if (!xics_on_xive()) > kvmppc_xics_rm_complete(vcpu, 0); > break; > default: > @@ -3649,7 +3649,7 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) > #ifdef CONFIG_KVM_XICS > static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) > { > - if (!xive_enabled()) > + if (!xics_on_xive()) > return false; > return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr < > vcpu->arch.xive_saved_state.cppr; > @@ -4209,7 +4209,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) > vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); > srcu_read_unlock(&kvm->srcu, srcu_idx); > } else if (r == RESUME_PASSTHROUGH) { > - if (WARN_ON(xive_enabled())) > + if (WARN_ON(xics_on_xive())) > r = H_SUCCESS; > else > r = kvmppc_xics_rm_complete(vcpu, 0); > @@ -4733,7 +4733,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) > * If xive is enabled, we route 0x500 interrupts directly > * to the guest. > */ > - if (xive_enabled()) > + if (xics_on_xive()) > lpcr |= LPCR_LPES; > } > > @@ -4969,7 +4969,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) > if (i == pimap->n_mapped) > pimap->n_mapped++; > > - if (xive_enabled()) > + if (xics_on_xive()) > rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); > else > kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); > @@ -5010,7 +5010,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) > return -ENODEV; > } > > - if (xive_enabled()) > + if (xics_on_xive()) > rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc); > else > kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); > @@ -5389,7 +5389,7 @@ static int kvmppc_book3s_init_hv(void) > * indirectly, via OPAL. > */ > #ifdef CONFIG_SMP > - if (!xive_enabled() && !kvmhv_on_pseries() && > + if (!xics_on_xive() && !kvmhv_on_pseries() && > !local_paca->kvm_hstate.xics_phys) { > struct device_node *np; > > diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c > index a71e2fc..b0cf224 100644 > --- a/arch/powerpc/kvm/book3s_hv_builtin.c > +++ b/arch/powerpc/kvm/book3s_hv_builtin.c > @@ -257,7 +257,7 @@ void kvmhv_rm_send_ipi(int cpu) > } > > /* We should never reach this */ > - if (WARN_ON_ONCE(xive_enabled())) > + if (WARN_ON_ONCE(xics_on_xive())) > return; > > /* Else poke the target with an IPI */ > @@ -577,7 +577,7 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) > { > if (!kvmppc_xics_enabled(vcpu)) > return H_TOO_HARD; > - if (xive_enabled()) { > + if (xics_on_xive()) { > if (is_rm()) > return xive_rm_h_xirr(vcpu); > if (unlikely(!__xive_vm_h_xirr)) > @@ -592,7 +592,7 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) > if (!kvmppc_xics_enabled(vcpu)) > return H_TOO_HARD; > vcpu->arch.regs.gpr[5] = get_tb(); > - if (xive_enabled()) { > + if (xics_on_xive()) { > if (is_rm()) > return xive_rm_h_xirr(vcpu); > if (unlikely(!__xive_vm_h_xirr)) > @@ -606,7 +606,7 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) > { > if (!kvmppc_xics_enabled(vcpu)) > return H_TOO_HARD; > - if (xive_enabled()) { > + if (xics_on_xive()) { > if (is_rm()) > return xive_rm_h_ipoll(vcpu, server); > if (unlikely(!__xive_vm_h_ipoll)) > @@ -621,7 +621,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, > { > if (!kvmppc_xics_enabled(vcpu)) > return H_TOO_HARD; > - if (xive_enabled()) { > + if (xics_on_xive()) { > if (is_rm()) > return xive_rm_h_ipi(vcpu, server, mfrr); > if (unlikely(!__xive_vm_h_ipi)) > @@ -635,7 +635,7 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) > { > if (!kvmppc_xics_enabled(vcpu)) > return H_TOO_HARD; > - if (xive_enabled()) { > + if (xics_on_xive()) { > if (is_rm()) > return xive_rm_h_cppr(vcpu, cppr); > if (unlikely(!__xive_vm_h_cppr)) > @@ -649,7 +649,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) > { > if (!kvmppc_xics_enabled(vcpu)) > return H_TOO_HARD; > - if (xive_enabled()) { > + if (xics_on_xive()) { > if (is_rm()) > return xive_rm_h_eoi(vcpu, xirr); > if (unlikely(!__xive_vm_h_eoi)) > diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c > index b3f5786..3b9662a 100644 > --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c > +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c > @@ -144,6 +144,13 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, > return; > } > > + if (xive_enabled() && kvmhv_on_pseries()) { > + /* No XICS access or hypercalls available, too hard */ > + this_icp->rm_action |= XICS_RM_KICK_VCPU; > + this_icp->rm_kick_target = vcpu; > + return; > + } > + > /* > * Check if the core is loaded, > * if not, find an available host core to post to wake the VCPU, > diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c > index 2d3b2b1..4e178c4 100644 > --- a/arch/powerpc/kvm/book3s_rtas.c > +++ b/arch/powerpc/kvm/book3s_rtas.c > @@ -33,7 +33,7 @@ static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) > server = be32_to_cpu(args->args[1]); > priority = be32_to_cpu(args->args[2]); > > - if (xive_enabled()) > + if (xics_on_xive()) > rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority); > else > rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); > @@ -56,7 +56,7 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) > irq = be32_to_cpu(args->args[0]); > > server = priority = 0; > - if (xive_enabled()) > + if (xics_on_xive()) > rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority); > else > rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); > @@ -83,7 +83,7 @@ static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args) > > irq = be32_to_cpu(args->args[0]); > > - if (xive_enabled()) > + if (xics_on_xive()) > rc = kvmppc_xive_int_off(vcpu->kvm, irq); > else > rc = kvmppc_xics_int_off(vcpu->kvm, irq); > @@ -105,7 +105,7 @@ static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args) > > irq = be32_to_cpu(args->args[0]); > > - if (xive_enabled()) > + if (xics_on_xive()) > rc = kvmppc_xive_int_on(vcpu->kvm, irq); > else > rc = kvmppc_xics_int_on(vcpu->kvm, irq); > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c > index b90a7d1..8c69af1 100644 > --- a/arch/powerpc/kvm/powerpc.c > +++ b/arch/powerpc/kvm/powerpc.c > @@ -748,7 +748,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) > kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); > break; > case KVMPPC_IRQ_XICS: > - if (xive_enabled()) > + if (xics_on_xive()) > kvmppc_xive_cleanup_vcpu(vcpu); > else > kvmppc_xics_free_icp(vcpu); > @@ -1931,7 +1931,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, > r = -EPERM; > dev = kvm_device_from_filp(f.file); > if (dev) { > - if (xive_enabled()) > + if (xics_on_xive()) > r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]); > else > r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); >