Re: [PATCH] KVM: PPC: Book3S: Allow XICS emulation to work in nested hosts using XIVE

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 2/4/19 12:07 PM, Paul Mackerras wrote:
> Currently, the KVM code assumes that if the host kernel is using the
> XIVE interrupt controller (the new interrupt controller that first
> appeared in POWER9 systems), then the in-kernel XICS emulation will
> use the XIVE hardware to deliver interrupts to the guest.  However,
> this only works when the host is running in hypervisor mode and has
> full access to all of the XIVE functionality.  It doesn't work in any
> nested virtualization scenario, either with PR KVM or nested-HV KVM,
> because the XICS-on-XIVE code calls directly into the native-XIVE
> routines, which are not initialized and cannot function correctly
> because they use OPAL calls, and OPAL is not available in a guest.
> 
> This means that using the in-kernel XICS emulation in a nested
> hypervisor that is using XIVE as its interrupt controller will cause a
> (nested) host kernel crash.  To fix this, we change most of the places
> where the current code calls xive_enabled() to select between the
> XICS-on-XIVE emulation and the plain XICS emulation to call a new
> function, xics_on_xive(), which returns false in a guest.
> 
> However, there is a further twist.  The plain XICS emulation has some
> functions which are used in real mode and access the underlying XICS
> controller (the interrupt controller of the host) directly.  In the
> case of a nested hypervisor, this means doing XICS hypercalls
> directly.  When the nested host is using XIVE as its interrupt
> controller, these hypercalls will fail.  Therefore this also adds
> checks in the places where the XICS emulation wants to access the
> underlying interrupt controller directly, and if that is XIVE, makes
> the code use the virtual mode fallback paths, which call generic
> kernel infrastructure rather than doing direct XICS access.

I gave the patch a try on a L1 hypervisor running with a XICS-on-XIVE
KVM device. 

Performance is good. Without any tuning (CPU pinning is required to
get all the juice), we have ~33 Gb/s between L0 and L2, a little more
between L1 and L2. 


Reviewed-by: Cédric Le Goater <clg@xxxxxxxx>

Thanks,

C.



> Signed-off-by: Paul Mackerras <paulus@xxxxxxxxxx>
> ---
>  arch/powerpc/include/asm/kvm_ppc.h   | 14 ++++++++++++++
>  arch/powerpc/kvm/book3s.c            | 10 +++++-----
>  arch/powerpc/kvm/book3s_hv.c         | 16 ++++++++--------
>  arch/powerpc/kvm/book3s_hv_builtin.c | 14 +++++++-------
>  arch/powerpc/kvm/book3s_hv_rm_xics.c |  7 +++++++
>  arch/powerpc/kvm/book3s_rtas.c       |  8 ++++----
>  arch/powerpc/kvm/powerpc.c           |  4 ++--
>  7 files changed, 47 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index eb0d79f..b3bf4f6 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -36,6 +36,8 @@
>  #endif
>  #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
>  #include <asm/paca.h>
> +#include <asm/xive.h>
> +#include <asm/cpu_has_feature.h>
>  #endif
>  
>  /*
> @@ -616,6 +618,18 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir
>  static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
>  #endif /* CONFIG_KVM_XIVE */
>  
> +#ifdef CONFIG_PPC_POWERNV
> +static inline bool xics_on_xive(void)
> +{
> +	return xive_enabled() && cpu_has_feature(CPU_FTR_HVMODE);
> +}
> +#else
> +static inline bool xics_on_xive(void)
> +{
> +	return false;
> +}
> +#endif
>
>  /*
>   * Prototypes for functions called only from assembler code.
>   * Having prototypes reduces sparse errors.
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index bd1a677..601c094 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -635,7 +635,7 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
>  				r = -ENXIO;
>  				break;
>  			}
> -			if (xive_enabled())
> +			if (xics_on_xive())
>  				*val = get_reg_val(id, kvmppc_xive_get_icp(vcpu));
>  			else
>  				*val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
> @@ -708,7 +708,7 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
>  				r = -ENXIO;
>  				break;
>  			}
> -			if (xive_enabled())
> +			if (xics_on_xive())
>  				r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val));
>  			else
>  				r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
> @@ -984,7 +984,7 @@ int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall)
>  int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
>  		bool line_status)
>  {
> -	if (xive_enabled())
> +	if (xics_on_xive())
>  		return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level,
>  					   line_status);
>  	else
> @@ -1037,7 +1037,7 @@ static int kvmppc_book3s_init(void)
>  
>  #ifdef CONFIG_KVM_XICS
>  #ifdef CONFIG_KVM_XIVE
> -	if (xive_enabled()) {
> +	if (xics_on_xive()) {
>  		kvmppc_xive_init_module();
>  		kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
>  	} else
> @@ -1050,7 +1050,7 @@ static int kvmppc_book3s_init(void)
>  static void kvmppc_book3s_exit(void)
>  {
>  #ifdef CONFIG_KVM_XICS
> -	if (xive_enabled())
> +	if (xics_on_xive())
>  		kvmppc_xive_exit_module();
>  #endif
>  #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 5a066fc..94b620b 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -922,7 +922,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>  	case H_IPOLL:
>  	case H_XIRR_X:
>  		if (kvmppc_xics_enabled(vcpu)) {
> -			if (xive_enabled()) {
> +			if (xics_on_xive()) {
>  				ret = H_NOT_AVAILABLE;
>  				return RESUME_GUEST;
>  			}
> @@ -1431,7 +1431,7 @@ static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
>  	case BOOK3S_INTERRUPT_HV_RM_HARD:
>  		vcpu->arch.trap = 0;
>  		r = RESUME_GUEST;
> -		if (!xive_enabled())
> +		if (!xics_on_xive())
>  			kvmppc_xics_rm_complete(vcpu, 0);
>  		break;
>  	default:
> @@ -3649,7 +3649,7 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
>  #ifdef CONFIG_KVM_XICS
>  static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
>  {
> -	if (!xive_enabled())
> +	if (!xics_on_xive())
>  		return false;
>  	return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr <
>  		vcpu->arch.xive_saved_state.cppr;
> @@ -4209,7 +4209,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
>  				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
>  			srcu_read_unlock(&kvm->srcu, srcu_idx);
>  		} else if (r == RESUME_PASSTHROUGH) {
> -			if (WARN_ON(xive_enabled()))
> +			if (WARN_ON(xics_on_xive()))
>  				r = H_SUCCESS;
>  			else
>  				r = kvmppc_xics_rm_complete(vcpu, 0);
> @@ -4733,7 +4733,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
>  		 * If xive is enabled, we route 0x500 interrupts directly
>  		 * to the guest.
>  		 */
> -		if (xive_enabled())
> +		if (xics_on_xive())
>  			lpcr |= LPCR_LPES;
>  	}
>  
> @@ -4969,7 +4969,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
>  	if (i == pimap->n_mapped)
>  		pimap->n_mapped++;
>  
> -	if (xive_enabled())
> +	if (xics_on_xive())
>  		rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
>  	else
>  		kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
> @@ -5010,7 +5010,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
>  		return -ENODEV;
>  	}
>  
> -	if (xive_enabled())
> +	if (xics_on_xive())
>  		rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc);
>  	else
>  		kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
> @@ -5389,7 +5389,7 @@ static int kvmppc_book3s_init_hv(void)
>  	 * indirectly, via OPAL.
>  	 */
>  #ifdef CONFIG_SMP
> -	if (!xive_enabled() && !kvmhv_on_pseries() &&
> +	if (!xics_on_xive() && !kvmhv_on_pseries() &&
>  	    !local_paca->kvm_hstate.xics_phys) {
>  		struct device_node *np;
>  
> diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
> index a71e2fc..b0cf224 100644
> --- a/arch/powerpc/kvm/book3s_hv_builtin.c
> +++ b/arch/powerpc/kvm/book3s_hv_builtin.c
> @@ -257,7 +257,7 @@ void kvmhv_rm_send_ipi(int cpu)
>  	}
>  
>  	/* We should never reach this */
> -	if (WARN_ON_ONCE(xive_enabled()))
> +	if (WARN_ON_ONCE(xics_on_xive()))
>  	    return;
>  
>  	/* Else poke the target with an IPI */
> @@ -577,7 +577,7 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
>  {
>  	if (!kvmppc_xics_enabled(vcpu))
>  		return H_TOO_HARD;
> -	if (xive_enabled()) {
> +	if (xics_on_xive()) {
>  		if (is_rm())
>  			return xive_rm_h_xirr(vcpu);
>  		if (unlikely(!__xive_vm_h_xirr))
> @@ -592,7 +592,7 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
>  	if (!kvmppc_xics_enabled(vcpu))
>  		return H_TOO_HARD;
>  	vcpu->arch.regs.gpr[5] = get_tb();
> -	if (xive_enabled()) {
> +	if (xics_on_xive()) {
>  		if (is_rm())
>  			return xive_rm_h_xirr(vcpu);
>  		if (unlikely(!__xive_vm_h_xirr))
> @@ -606,7 +606,7 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
>  {
>  	if (!kvmppc_xics_enabled(vcpu))
>  		return H_TOO_HARD;
> -	if (xive_enabled()) {
> +	if (xics_on_xive()) {
>  		if (is_rm())
>  			return xive_rm_h_ipoll(vcpu, server);
>  		if (unlikely(!__xive_vm_h_ipoll))
> @@ -621,7 +621,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
>  {
>  	if (!kvmppc_xics_enabled(vcpu))
>  		return H_TOO_HARD;
> -	if (xive_enabled()) {
> +	if (xics_on_xive()) {
>  		if (is_rm())
>  			return xive_rm_h_ipi(vcpu, server, mfrr);
>  		if (unlikely(!__xive_vm_h_ipi))
> @@ -635,7 +635,7 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
>  {
>  	if (!kvmppc_xics_enabled(vcpu))
>  		return H_TOO_HARD;
> -	if (xive_enabled()) {
> +	if (xics_on_xive()) {
>  		if (is_rm())
>  			return xive_rm_h_cppr(vcpu, cppr);
>  		if (unlikely(!__xive_vm_h_cppr))
> @@ -649,7 +649,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
>  {
>  	if (!kvmppc_xics_enabled(vcpu))
>  		return H_TOO_HARD;
> -	if (xive_enabled()) {
> +	if (xics_on_xive()) {
>  		if (is_rm())
>  			return xive_rm_h_eoi(vcpu, xirr);
>  		if (unlikely(!__xive_vm_h_eoi))
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
> index b3f5786..3b9662a 100644
> --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
> +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
> @@ -144,6 +144,13 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
>  		return;
>  	}
>  
> +	if (xive_enabled() && kvmhv_on_pseries()) {
> +		/* No XICS access or hypercalls available, too hard */
> +		this_icp->rm_action |= XICS_RM_KICK_VCPU;
> +		this_icp->rm_kick_target = vcpu;
> +		return;
> +	}
> +
>  	/*
>  	 * Check if the core is loaded,
>  	 * if not, find an available host core to post to wake the VCPU,
> diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
> index 2d3b2b1..4e178c4 100644
> --- a/arch/powerpc/kvm/book3s_rtas.c
> +++ b/arch/powerpc/kvm/book3s_rtas.c
> @@ -33,7 +33,7 @@ static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
>  	server = be32_to_cpu(args->args[1]);
>  	priority = be32_to_cpu(args->args[2]);
>  
> -	if (xive_enabled())
> +	if (xics_on_xive())
>  		rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority);
>  	else
>  		rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
> @@ -56,7 +56,7 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
>  	irq = be32_to_cpu(args->args[0]);
>  
>  	server = priority = 0;
> -	if (xive_enabled())
> +	if (xics_on_xive())
>  		rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority);
>  	else
>  		rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
> @@ -83,7 +83,7 @@ static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args)
>  
>  	irq = be32_to_cpu(args->args[0]);
>  
> -	if (xive_enabled())
> +	if (xics_on_xive())
>  		rc = kvmppc_xive_int_off(vcpu->kvm, irq);
>  	else
>  		rc = kvmppc_xics_int_off(vcpu->kvm, irq);
> @@ -105,7 +105,7 @@ static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args)
>  
>  	irq = be32_to_cpu(args->args[0]);
>  
> -	if (xive_enabled())
> +	if (xics_on_xive())
>  		rc = kvmppc_xive_int_on(vcpu->kvm, irq);
>  	else
>  		rc = kvmppc_xics_int_on(vcpu->kvm, irq);
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index b90a7d1..8c69af1 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -748,7 +748,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
>  		kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
>  		break;
>  	case KVMPPC_IRQ_XICS:
> -		if (xive_enabled())
> +		if (xics_on_xive())
>  			kvmppc_xive_cleanup_vcpu(vcpu);
>  		else
>  			kvmppc_xics_free_icp(vcpu);
> @@ -1931,7 +1931,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>  		r = -EPERM;
>  		dev = kvm_device_from_filp(f.file);
>  		if (dev) {
> -			if (xive_enabled())
> +			if (xics_on_xive())
>  				r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]);
>  			else
>  				r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
> 




[Index of Archives]     [KVM Development]     [KVM ARM]     [KVM ia64]     [Linux Virtualization]     [Linux USB Devel]     [Linux Video]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [Big List of Linux Books]

  Powered by Linux