On 3/10/20 4:09 PM, Greg Kurz wrote: > On Fri, 6 Mar 2020 16:01:40 +0100 > Cédric Le Goater <clg@xxxxxxxx> wrote: > >> When a CPU is brought up, an IPI number is allocated and recorded >> under the XIVE CPU structure. Invalid IPI numbers are tracked with >> interrupt number 0x0. >> >> On the PowerNV platform, the interrupt number space starts at 0x10 and >> this works fine. However, on the sPAPR platform, it is possible to >> allocate the interrupt number 0x0 and this raises an issue when CPU 0 >> is unplugged. The XIVE spapr driver tracks allocated interrupt numbers >> in a bitmask and it is not correctly updated when interrupt number 0x0 >> is freed. It stays allocated and it is then impossible to reallocate. >> >> Fix by using the XIVE_BAD_IRQ value instead of zero on both platforms. >> >> Reported-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx> >> Fixes: eac1e731b59e ("powerpc/xive: guest exploitation of the XIVE interrupt controller") >> Cc: stable@xxxxxxxxxxxxxxx # v4.14+ >> Signed-off-by: Cédric Le Goater <clg@xxxxxxxx> >> --- > > This looks mostly good. I'm juste wondering about potential overlooks: Yes. Thanks for doing so. There are some non-obvious use of interrupt numbers under the hood. One thing we should be adding is a comment on the different interrupt number spaces. The HW interrupt numbers, the EISN numbers found on the XIVE even queue and the Linux interrupt numbers are different spaces and have different limits. XIVE_BAD_IRQ was introduced for the EISN numbers and the patch is using this value for HW numbers. This is ok because it's more a tag than a limit. > $ git grep 'if.*hw_i' arch/powerpc/ | egrep -v 'xics|XIVE_BAD_IRQ' > > > arch/powerpc/kvm/book3s_xive.h: if (out_hw_irq) > arch/powerpc/kvm/book3s_xive.h: if (out_hw_irq) > arch/powerpc/kvm/book3s_xive_template.c: else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) > arch/powerpc/sysdev/xive/common.c: else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { > > This hw_irq check in xive_do_source_eoi() for example is related to: > > /* > * Note: We pass "0" to the hw_irq argument in order to > * avoid calling into the backend EOI code which we don't > * want to do in the case of a re-trigger. Backends typically > * only do EOI for LSIs anyway. > */ > xive_do_source_eoi(0, xd); that's a hack to skip the following part of the code in case of EOI being done through FW: else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { /* * The FW told us to call it. This happens for some * interrupt sources that need additional HW whacking * beyond the ESB manipulation. For example LPC interrupts * on P9 DD1.0 needed a latch to be clared in the LPC bridge * itself. The Firmware will take care of it. */ if (WARN_ON_ONCE(!xive_ops->eoi)) return; xive_ops->eoi(hw_irq); That was the case for PHB4 LSIs on P9 DD1 only. We could probably drop the code in Linux unless similar bugs show up on new platforms. > but it can get hw_irq from: > > xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data); That part is fine. It's an IPI EOI. > > It seems that these should use XIVE_BAD_IRQ as well or I'm missing > something ? > > arch/powerpc/sysdev/xive/common.c: if (hw_irq) This tests the XIVE IPI number which is mapped to 0 for all CPUs. See xive_request_ipi() and xive_irq_domain_map() C. > arch/powerpc/sysdev/xive/common.c: if (d->domain != xive_irq_domain || hw_irq == 0) > > > >> arch/powerpc/sysdev/xive/xive-internal.h | 7 +++++++ >> arch/powerpc/sysdev/xive/common.c | 12 +++--------- >> arch/powerpc/sysdev/xive/native.c | 4 ++-- >> arch/powerpc/sysdev/xive/spapr.c | 4 ++-- >> 4 files changed, 14 insertions(+), 13 deletions(-) >> >> diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h >> index 59cd366e7933..382980f4de2d 100644 >> --- a/arch/powerpc/sysdev/xive/xive-internal.h >> +++ b/arch/powerpc/sysdev/xive/xive-internal.h >> @@ -5,6 +5,13 @@ >> #ifndef __XIVE_INTERNAL_H >> #define __XIVE_INTERNAL_H >> >> +/* >> + * A "disabled" interrupt should never fire, to catch problems >> + * we set its logical number to this >> + */ >> +#define XIVE_BAD_IRQ 0x7fffffff >> +#define XIVE_MAX_IRQ (XIVE_BAD_IRQ - 1) >> + >> /* Each CPU carry one of these with various per-CPU state */ >> struct xive_cpu { >> #ifdef CONFIG_SMP >> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c >> index fa49193206b6..550baba98ec9 100644 >> --- a/arch/powerpc/sysdev/xive/common.c >> +++ b/arch/powerpc/sysdev/xive/common.c >> @@ -68,13 +68,6 @@ static u32 xive_ipi_irq; >> /* Xive state for each CPU */ >> static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu); >> >> -/* >> - * A "disabled" interrupt should never fire, to catch problems >> - * we set its logical number to this >> - */ >> -#define XIVE_BAD_IRQ 0x7fffffff >> -#define XIVE_MAX_IRQ (XIVE_BAD_IRQ - 1) >> - >> /* An invalid CPU target */ >> #define XIVE_INVALID_TARGET (-1) >> >> @@ -1153,7 +1146,7 @@ static int xive_setup_cpu_ipi(unsigned int cpu) >> xc = per_cpu(xive_cpu, cpu); >> >> /* Check if we are already setup */ >> - if (xc->hw_ipi != 0) >> + if (xc->hw_ipi != XIVE_BAD_IRQ) >> return 0; >> >> /* Grab an IPI from the backend, this will populate xc->hw_ipi */ >> @@ -1190,7 +1183,7 @@ static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc) >> /* Disable the IPI and free the IRQ data */ >> >> /* Already cleaned up ? */ >> - if (xc->hw_ipi == 0) >> + if (xc->hw_ipi == XIVE_BAD_IRQ) >> return; >> >> /* Mask the IPI */ >> @@ -1346,6 +1339,7 @@ static int xive_prepare_cpu(unsigned int cpu) >> if (np) >> xc->chip_id = of_get_ibm_chip_id(np); >> of_node_put(np); >> + xc->hw_ipi = XIVE_BAD_IRQ; >> >> per_cpu(xive_cpu, cpu) = xc; >> } >> diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c >> index 0ff6b739052c..50e1a8e02497 100644 >> --- a/arch/powerpc/sysdev/xive/native.c >> +++ b/arch/powerpc/sysdev/xive/native.c >> @@ -312,7 +312,7 @@ static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc) >> s64 rc; >> >> /* Free the IPI */ >> - if (!xc->hw_ipi) >> + if (xc->hw_ipi == XIVE_BAD_IRQ) >> return; >> for (;;) { >> rc = opal_xive_free_irq(xc->hw_ipi); >> @@ -320,7 +320,7 @@ static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc) >> msleep(OPAL_BUSY_DELAY_MS); >> continue; >> } >> - xc->hw_ipi = 0; >> + xc->hw_ipi = XIVE_BAD_IRQ; >> break; >> } >> } >> diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c >> index 55dc61cb4867..3f15615712b5 100644 >> --- a/arch/powerpc/sysdev/xive/spapr.c >> +++ b/arch/powerpc/sysdev/xive/spapr.c >> @@ -560,11 +560,11 @@ static int xive_spapr_get_ipi(unsigned int cpu, struct xive_cpu *xc) >> >> static void xive_spapr_put_ipi(unsigned int cpu, struct xive_cpu *xc) >> { >> - if (!xc->hw_ipi) >> + if (xc->hw_ipi == XIVE_BAD_IRQ) >> return; >> >> xive_irq_bitmap_free(xc->hw_ipi); >> - xc->hw_ipi = 0; >> + xc->hw_ipi = XIVE_BAD_IRQ; >> } >> #endif /* CONFIG_SMP */ >> >