Re: [PATCH] powerpc/kvm/xive: Enable use of the new "single escalation" feature

Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx> · Tue, 21 Nov 2017 20:37:26 +1100

On Tue, 2017-11-21 at 14:57 +1100, Benjamin Herrenschmidt wrote:
> That feature, provided by Power9 DD2.0 and later, when supported
> by newer OPAL versions, allows sacrificing a queue (priority 7)
> in favor of merging all the escalation interrupts of the queues
> of a single VP into a single interrupt.
> 
> This reduces the number of host interrupts used up by KVM guests
> especially when those guests use multiple priorities.
> 
> It will also enable a future change to control the masking of the
> escalation interrupts more precisely to avoid spurious ones.
> 
> Signed-off-by: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
> ---
> 
> To test, you need a DD2.x chip and this series applied to
> your skiboot firmware:
> 
> https://patchwork.ozlabs.org/project/skiboot/list/?series=14500

Or better, this one:

https://patchwork.ozlabs.org/project/skiboot/list/?series=14526

> 
>  arch/powerpc/include/asm/opal-api.h |  1 +
>  arch/powerpc/include/asm/xive.h     |  3 ++-
>  arch/powerpc/kvm/book3s_xive.c      | 48 ++++++++++++++++++++++++-------------
>  arch/powerpc/kvm/book3s_xive.h      | 15 +++++-------
>  arch/powerpc/sysdev/xive/native.c   | 18 ++++++++++++--
>  5 files changed, 57 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
> index 450a60b81d2a..4df668a32ab4 100644
> --- a/arch/powerpc/include/asm/opal-api.h
> +++ b/arch/powerpc/include/asm/opal-api.h
> @@ -1070,6 +1070,7 @@ enum {
>  /* Flags for OPAL_XIVE_GET/SET_VP_INFO */
>  enum {
>  	OPAL_XIVE_VP_ENABLED		= 0x00000001,
> +	OPAL_XIVE_VP_SINGLE_ESCALATION	= 0x00000002,
>  };
>  
>  /* "Any chip" replacement for chip ID for allocation functions */
> diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
> index 371fbebf1ec9..11d5edeb5c22 100644
> --- a/arch/powerpc/include/asm/xive.h
> +++ b/arch/powerpc/include/asm/xive.h
> @@ -143,9 +143,10 @@ extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
>  
>  extern void xive_native_sync_source(u32 hw_irq);
>  extern bool is_xive_irq(struct irq_chip *chip);
> -extern int xive_native_enable_vp(u32 vp_id);
> +extern int xive_native_enable_vp(u32 vp_id, bool single_escalation);
>  extern int xive_native_disable_vp(u32 vp_id);
>  extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id);
> +extern bool xive_native_has_single_escalation(void);
>  
>  #else
>  
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> index 6cff5bdfd6b7..a102efeabf05 100644
> --- a/arch/powerpc/kvm/book3s_xive.c
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -112,19 +112,21 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
>  		return -EIO;
>  	}
>  
> -	/*
> -	 * Future improvement: start with them disabled
> -	 * and handle DD2 and later scheme of merged escalation
> -	 * interrupts
> -	 */
> -	name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d",
> -			 vcpu->kvm->arch.lpid, xc->server_num, prio);
> +	if (xc->xive->single_escalation)
> +		name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
> +				 vcpu->kvm->arch.lpid, xc->server_num);
> +	else
> +		name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d",
> +				 vcpu->kvm->arch.lpid, xc->server_num, prio);
>  	if (!name) {
>  		pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n",
>  		       prio, xc->server_num);
>  		rc = -ENOMEM;
>  		goto error;
>  	}
> +
> +	pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio);
> +
>  	rc = request_irq(xc->esc_virq[prio], xive_esc_irq,
>  			 IRQF_NO_THREAD, name, vcpu);
>  	if (rc) {
> @@ -191,12 +193,12 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio)
>  
>  	pr_devel("Provisioning prio... %d\n", prio);
>  
> -	/* Provision each VCPU and enable escalations */
> +	/* Provision each VCPU and enable escalations if needed */
>  	kvm_for_each_vcpu(i, vcpu, kvm) {
>  		if (!vcpu->arch.xive_vcpu)
>  			continue;
>  		rc = xive_provision_queue(vcpu, prio);
> -		if (rc == 0)
> +		if (rc == 0 && !xive->single_escalation)
>  			xive_attach_escalation(vcpu, prio);
>  		if (rc)
>  			return rc;
> @@ -1081,6 +1083,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>  	/* Allocate IPI */
>  	xc->vp_ipi = xive_native_alloc_irq();
>  	if (!xc->vp_ipi) {
> +		pr_err("Failed to allocate xive irq for VCPU IPI\n");
>  		r = -EIO;
>  		goto bail;
>  	}
> @@ -1090,19 +1093,34 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>  	if (r)
>  		goto bail;
>  
> +	/*
> +	 * Enable the VP first as the single escalation mode will
> +	 * affect escalation interrupts numbering
> +	 */
> +	r = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
> +	if (r) {
> +		pr_err("Failed to enable VP in OPAL, err %d\n", r);
> +		goto bail;
> +	}
> +
>  	/*
>  	 * Initialize queues. Initially we set them all for no queueing
>  	 * and we enable escalation for queue 0 only which we'll use for
>  	 * our mfrr change notifications. If the VCPU is hot-plugged, we
> -	 * do handle provisioning however.
> +	 * do handle provisioning however based on the existing "map"
> +	 * of enabled queues.
>  	 */
>  	for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
>  		struct xive_q *q = &xc->queues[i];
>  
> +		/* Single escalation, no queue 7 */
> +		if (i == 7 && xive->single_escalation)
> +			break;
> +
>  		/* Is queue already enabled ? Provision it */
>  		if (xive->qmap & (1 << i)) {
>  			r = xive_provision_queue(vcpu, i);
> -			if (r == 0)
> +			if (r == 0 && !xive->single_escalation)
>  				xive_attach_escalation(vcpu, i);
>  			if (r)
>  				goto bail;
> @@ -1122,11 +1140,6 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>  	if (r)
>  		goto bail;
>  
> -	/* Enable the VP */
> -	r = xive_native_enable_vp(xc->vp_id);
> -	if (r)
> -		goto bail;
> -
>  	/* Route the IPI */
>  	r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI);
>  	if (!r)
> @@ -1473,6 +1486,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
>  
>  	pr_devel("  val=0x016%llx (server=0x%x, guest_prio=%d)\n",
>  		 val, server, guest_prio);
> +
>  	/*
>  	 * If the source doesn't already have an IPI, allocate
>  	 * one and get the corresponding data
> @@ -1761,6 +1775,8 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
>  	if (xive->vp_base == XIVE_INVALID_VP)
>  		ret = -ENOMEM;
>  
> +	xive->single_escalation = xive_native_has_single_escalation();
> +
>  	if (ret) {
>  		kfree(xive);
>  		return ret;
> diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
> index 6ba63f8e8a61..a08ae6fd4c51 100644
> --- a/arch/powerpc/kvm/book3s_xive.h
> +++ b/arch/powerpc/kvm/book3s_xive.h
> @@ -120,6 +120,8 @@ struct kvmppc_xive {
>  	u32	q_order;
>  	u32	q_page_order;
>  
> +	/* Flags */
> +	u8	single_escalation;
>  };
>  
>  #define KVMPPC_XIVE_Q_COUNT	8
> @@ -201,25 +203,20 @@ static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmpp
>   * is as follow.
>   *
>   * Guest request for 0...6 are honored. Guest request for anything
> - * higher results in a priority of 7 being applied.
> - *
> - * However, when XIRR is returned via H_XIRR, 7 is translated to 0xb
> - * in order to match AIX expectations
> + * higher results in a priority of 6 being applied.
>   *
>   * Similar mapping is done for CPPR values
>   */
>  static inline u8 xive_prio_from_guest(u8 prio)
>  {
> -	if (prio == 0xff || prio < 8)
> +	if (prio == 0xff || prio < 6)
>  		return prio;
> -	return 7;
> +	return 6;
>  }
>  
>  static inline u8 xive_prio_to_guest(u8 prio)
>  {
> -	if (prio == 0xff || prio < 7)
> -		return prio;
> -	return 0xb;
> +	return prio;
>  }
>  
>  static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle)
> diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
> index ebc244b08d67..d22aeb0b69e1 100644
> --- a/arch/powerpc/sysdev/xive/native.c
> +++ b/arch/powerpc/sysdev/xive/native.c
> @@ -42,6 +42,7 @@ static u32 xive_provision_chip_count;
>  static u32 xive_queue_shift;
>  static u32 xive_pool_vps = XIVE_INVALID_VP;
>  static struct kmem_cache *xive_provision_cache;
> +static bool xive_has_single_esc;
>  
>  int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
>  {
> @@ -571,6 +572,10 @@ bool __init xive_native_init(void)
>  			break;
>  	}
>  
> +	/* Do we support single escalation */
> +	if (of_get_property(np, "single-escalation-support", NULL) != NULL)
> +		xive_has_single_esc = true;
> +
>  	/* Configure Thread Management areas for KVM */
>  	for_each_possible_cpu(cpu)
>  		kvmppc_set_xive_tima(cpu, r.start, tima);
> @@ -667,12 +672,15 @@ void xive_native_free_vp_block(u32 vp_base)
>  }
>  EXPORT_SYMBOL_GPL(xive_native_free_vp_block);
>  
> -int xive_native_enable_vp(u32 vp_id)
> +int xive_native_enable_vp(u32 vp_id, bool single_escalation)
>  {
>  	s64 rc;
> +	u64 flags = OPAL_XIVE_VP_ENABLED;
>  
> +	if (single_escalation)
> +		flags |= OPAL_XIVE_VP_SINGLE_ESCALATION;
>  	for (;;) {
> -		rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0);
> +		rc = opal_xive_set_vp_info(vp_id, flags, 0);
>  		if (rc != OPAL_BUSY)
>  			break;
>  		msleep(1);
> @@ -710,3 +718,9 @@ int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id)
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(xive_native_get_vp_info);
> +
> +bool xive_native_has_single_escalation(void)
> +{
> +	return xive_has_single_esc;
> +}
> +EXPORT_SYMBOL_GPL(xive_native_has_single_escalation);