On Tue, 2017-11-21 at 14:57 +1100, Benjamin Herrenschmidt wrote: > That feature, provided by Power9 DD2.0 and later, when supported > by newer OPAL versions, allows sacrificing a queue (priority 7) > in favor of merging all the escalation interrupts of the queues > of a single VP into a single interrupt. > > This reduces the number of host interrupts used up by KVM guests > especially when those guests use multiple priorities. > > It will also enable a future change to control the masking of the > escalation interrupts more precisely to avoid spurious ones. > > Signed-off-by: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx> > --- > > To test, you need a DD2.x chip and this series applied to > your skiboot firmware: > > https://patchwork.ozlabs.org/project/skiboot/list/?series=14500 Or better, this one: https://patchwork.ozlabs.org/project/skiboot/list/?series=14526 > > arch/powerpc/include/asm/opal-api.h | 1 + > arch/powerpc/include/asm/xive.h | 3 ++- > arch/powerpc/kvm/book3s_xive.c | 48 ++++++++++++++++++++++++------------- > arch/powerpc/kvm/book3s_xive.h | 15 +++++------- > arch/powerpc/sysdev/xive/native.c | 18 ++++++++++++-- > 5 files changed, 57 insertions(+), 28 deletions(-) > > diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h > index 450a60b81d2a..4df668a32ab4 100644 > --- a/arch/powerpc/include/asm/opal-api.h > +++ b/arch/powerpc/include/asm/opal-api.h > @@ -1070,6 +1070,7 @@ enum { > /* Flags for OPAL_XIVE_GET/SET_VP_INFO */ > enum { > OPAL_XIVE_VP_ENABLED = 0x00000001, > + OPAL_XIVE_VP_SINGLE_ESCALATION = 0x00000002, > }; > > /* "Any chip" replacement for chip ID for allocation functions */ > diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h > index 371fbebf1ec9..11d5edeb5c22 100644 > --- a/arch/powerpc/include/asm/xive.h > +++ b/arch/powerpc/include/asm/xive.h > @@ -143,9 +143,10 @@ extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); > > extern void 
xive_native_sync_source(u32 hw_irq); > extern bool is_xive_irq(struct irq_chip *chip); > -extern int xive_native_enable_vp(u32 vp_id); > +extern int xive_native_enable_vp(u32 vp_id, bool single_escalation); > extern int xive_native_disable_vp(u32 vp_id); > extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); > +extern bool xive_native_has_single_escalation(void); > > #else > > diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c > index 6cff5bdfd6b7..a102efeabf05 100644 > --- a/arch/powerpc/kvm/book3s_xive.c > +++ b/arch/powerpc/kvm/book3s_xive.c > @@ -112,19 +112,21 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) > return -EIO; > } > > - /* > - * Future improvement: start with them disabled > - * and handle DD2 and later scheme of merged escalation > - * interrupts > - */ > - name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d", > - vcpu->kvm->arch.lpid, xc->server_num, prio); > + if (xc->xive->single_escalation) > + name = kasprintf(GFP_KERNEL, "kvm-%d-%d", > + vcpu->kvm->arch.lpid, xc->server_num); > + else > + name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d", > + vcpu->kvm->arch.lpid, xc->server_num, prio); > if (!name) { > pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n", > prio, xc->server_num); > rc = -ENOMEM; > goto error; > } > + > + pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio); > + > rc = request_irq(xc->esc_virq[prio], xive_esc_irq, > IRQF_NO_THREAD, name, vcpu); > if (rc) { > @@ -191,12 +193,12 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio) > > pr_devel("Provisioning prio... 
%d\n", prio); > > - /* Provision each VCPU and enable escalations */ > + /* Provision each VCPU and enable escalations if needed */ > kvm_for_each_vcpu(i, vcpu, kvm) { > if (!vcpu->arch.xive_vcpu) > continue; > rc = xive_provision_queue(vcpu, prio); > - if (rc == 0) > + if (rc == 0 && !xive->single_escalation) > xive_attach_escalation(vcpu, prio); > if (rc) > return rc; > @@ -1081,6 +1083,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > /* Allocate IPI */ > xc->vp_ipi = xive_native_alloc_irq(); > if (!xc->vp_ipi) { > + pr_err("Failed to allocate xive irq for VCPU IPI\n"); > r = -EIO; > goto bail; > } > @@ -1090,19 +1093,34 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > if (r) > goto bail; > > + /* > + * Enable the VP first as the single escalation mode will > + * affect escalation interrupts numbering > + */ > + r = xive_native_enable_vp(xc->vp_id, xive->single_escalation); > + if (r) { > + pr_err("Failed to enable VP in OPAL, err %d\n", r); > + goto bail; > + } > + > /* > * Initialize queues. Initially we set them all for no queueing > * and we enable escalation for queue 0 only which we'll use for > * our mfrr change notifications. If the VCPU is hot-plugged, we > - * do handle provisioning however. > + * do handle provisioning however based on the existing "map" > + * of enabled queues. > */ > for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { > struct xive_q *q = &xc->queues[i]; > > + /* Single escalation, no queue 7 */ > + if (i == 7 && xive->single_escalation) > + break; > + > /* Is queue already enabled ? 
Provision it */ > if (xive->qmap & (1 << i)) { > r = xive_provision_queue(vcpu, i); > - if (r == 0) > + if (r == 0 && !xive->single_escalation) > xive_attach_escalation(vcpu, i); > if (r) > goto bail; > @@ -1122,11 +1140,6 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, > if (r) > goto bail; > > - /* Enable the VP */ > - r = xive_native_enable_vp(xc->vp_id); > - if (r) > - goto bail; > - > /* Route the IPI */ > r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI); > if (!r) > @@ -1473,6 +1486,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) > > pr_devel(" val=0x016%llx (server=0x%x, guest_prio=%d)\n", > val, server, guest_prio); > + > /* > * If the source doesn't already have an IPI, allocate > * one and get the corresponding data > @@ -1761,6 +1775,8 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) > if (xive->vp_base == XIVE_INVALID_VP) > ret = -ENOMEM; > > + xive->single_escalation = xive_native_has_single_escalation(); > + > if (ret) { > kfree(xive); > return ret; > diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h > index 6ba63f8e8a61..a08ae6fd4c51 100644 > --- a/arch/powerpc/kvm/book3s_xive.h > +++ b/arch/powerpc/kvm/book3s_xive.h > @@ -120,6 +120,8 @@ struct kvmppc_xive { > u32 q_order; > u32 q_page_order; > > + /* Flags */ > + u8 single_escalation; > }; > > #define KVMPPC_XIVE_Q_COUNT 8 > @@ -201,25 +203,20 @@ static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmpp > * is as follow. > * > * Guest request for 0...6 are honored. Guest request for anything > - * higher results in a priority of 7 being applied. > - * > - * However, when XIRR is returned via H_XIRR, 7 is translated to 0xb > - * in order to match AIX expectations > + * higher results in a priority of 6 being applied. 
> * > * Similar mapping is done for CPPR values > */ > static inline u8 xive_prio_from_guest(u8 prio) > { > - if (prio == 0xff || prio < 8) > + if (prio == 0xff || prio < 6) > return prio; > - return 7; > + return 6; > } > > static inline u8 xive_prio_to_guest(u8 prio) > { > - if (prio == 0xff || prio < 7) > - return prio; > - return 0xb; > + return prio; > } > > static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle) > diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c > index ebc244b08d67..d22aeb0b69e1 100644 > --- a/arch/powerpc/sysdev/xive/native.c > +++ b/arch/powerpc/sysdev/xive/native.c > @@ -42,6 +42,7 @@ static u32 xive_provision_chip_count; > static u32 xive_queue_shift; > static u32 xive_pool_vps = XIVE_INVALID_VP; > static struct kmem_cache *xive_provision_cache; > +static bool xive_has_single_esc; > > int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) > { > @@ -571,6 +572,10 @@ bool __init xive_native_init(void) > break; > } > > + /* Do we support single escalation */ > + if (of_get_property(np, "single-escalation-support", NULL) != NULL) > + xive_has_single_esc = true; > + > /* Configure Thread Management areas for KVM */ > for_each_possible_cpu(cpu) > kvmppc_set_xive_tima(cpu, r.start, tima); > @@ -667,12 +672,15 @@ void xive_native_free_vp_block(u32 vp_base) > } > EXPORT_SYMBOL_GPL(xive_native_free_vp_block); > > -int xive_native_enable_vp(u32 vp_id) > +int xive_native_enable_vp(u32 vp_id, bool single_escalation) > { > s64 rc; > + u64 flags = OPAL_XIVE_VP_ENABLED; > > + if (single_escalation) > + flags |= OPAL_XIVE_VP_SINGLE_ESCALATION; > for (;;) { > - rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0); > + rc = opal_xive_set_vp_info(vp_id, flags, 0); > if (rc != OPAL_BUSY) > break; > msleep(1); > @@ -710,3 +718,9 @@ int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id) > return 0; > } > EXPORT_SYMBOL_GPL(xive_native_get_vp_info); > + > +bool 
xive_native_has_single_escalation(void) > +{ > + return xive_has_single_esc; > +} > +EXPORT_SYMBOL_GPL(xive_native_has_single_escalation);