Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx> writes: > The XIVE interrupt controller is the new interrupt controller > found in POWER9. It supports advanced virtualization capabilities > among other things. > > Currently we use a set of firmware calls that simulate the old > "XICS" interrupt controller but this is fairly inefficient. > > This adds the framework for using XIVE along with a native > backend which OPAL for configuration. Later, a backend allowing ^ calls? > the use in a KVM or PowerVM guest will also be provided. > > This disables some fast path for interrupts in KVM when XIVE is > enabled as these rely on the firmware emulation code which is no > longer available when the XIVE is used natively by Linux. > > A latter patch will make KVM also directly exploit the XIVE, thus > recovering the lost performance (and more). > > Signed-off-by: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx> > --- > arch/powerpc/include/asm/xive.h | 116 +++ > arch/powerpc/include/asm/xmon.h | 2 + > arch/powerpc/platforms/powernv/Kconfig | 2 + > arch/powerpc/platforms/powernv/setup.c | 15 +- > arch/powerpc/platforms/powernv/smp.c | 39 +- > arch/powerpc/sysdev/Kconfig | 1 + > arch/powerpc/sysdev/Makefile | 1 + > arch/powerpc/sysdev/xive/Kconfig | 7 + > arch/powerpc/sysdev/xive/Makefile | 4 + > arch/powerpc/sysdev/xive/common.c | 1175 ++++++++++++++++++++++++++++++ > arch/powerpc/sysdev/xive/native.c | 604 +++++++++++++++ > arch/powerpc/sysdev/xive/xive-internal.h | 51 ++ > arch/powerpc/sysdev/xive/xive-regs.h | 88 +++ > arch/powerpc/xmon/xmon.c | 93 ++- > 14 files changed, 2186 insertions(+), 12 deletions(-) I'm not going to review this in one go, given it's 10:30pm already. So just a few things that hit me straight away. > diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h > new file mode 100644 > index 0000000..b1604b73 > --- /dev/null > +++ b/arch/powerpc/include/asm/xive.h > @@ -0,0 +1,116 @@ Copyright missing. > +#ifndef _ASM_POWERPC_XIVE_H > +#define _ASM_POWERPC_XIVE_H > + > +#define XIVE_INVALID_VP 0xffffffff > + > +#ifdef CONFIG_PPC_XIVE > + > +extern void __iomem *xive_tm_area; I think Paul already commented on "tm" being an overly used acronym. > +extern u32 xive_tm_offset; > + > +/* > + * Per-irq data (irq_get_handler_data for normal IRQs), IPIs > + * have it stored in the xive_cpu structure. We also cache > + * for normal interrupts the current target CPU. > + */ > +struct xive_irq_data { > + /* Setup by backend */ > + u64 flags; > +#define XIVE_IRQ_FLAG_STORE_EOI 0x01 > +#define XIVE_IRQ_FLAG_LSI 0x02 > +#define XIVE_IRQ_FLAG_SHIFT_BUG 0x04 > +#define XIVE_IRQ_FLAG_MASK_FW 0x08 > +#define XIVE_IRQ_FLAG_EOI_FW 0x10 I don't love that style, prefer them just prior to the struct. > + u64 eoi_page; > + void __iomem *eoi_mmio; > + u64 trig_page; > + void __iomem *trig_mmio; > + u32 esb_shift; > + int src_chip; Why not space out the members like you do in xive_q below, I think that looks better given you have the long __iomem lines. > + > + /* Setup/used by frontend */ > + int target; > + bool saved_p; > +}; > +#define XIVE_INVALID_CHIP_ID -1 > + > +/* A queue tracking structure in a CPU */ > +struct xive_q { > + __be32 *qpage; > + u32 msk; > + u32 idx; > + u32 toggle; > + u64 eoi_phys; > + void __iomem *eoi_mmio; > + u32 esc_irq; > + atomic_t count; > + atomic_t pending_count; > +}; > + > +/* > + * "magic" ESB MMIO offsets What's an ESB? 
> + */ > +#define XIVE_ESB_GET 0x800 > +#define XIVE_ESB_SET_PQ_00 0xc00 > +#define XIVE_ESB_SET_PQ_01 0xd00 > +#define XIVE_ESB_SET_PQ_10 0xe00 > +#define XIVE_ESB_SET_PQ_11 0xf00 > +#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01 > + > +extern bool __xive_enabled; > + > +static inline bool xive_enabled(void) { return __xive_enabled; } > + > +extern bool xive_native_init(void); > +extern void xive_smp_probe(void); > +extern int xive_smp_prepare_cpu(unsigned int cpu); > +extern void xive_smp_setup_cpu(void); > +extern void xive_smp_disable_cpu(void); > +extern void xive_kexec_teardown_cpu(int secondary); > +extern void xive_shutdown(void); > +extern void xive_flush_interrupt(void); > + > +/* xmon hook */ > +extern void xmon_xive_do_dump(int cpu); > + > +/* APIs used by KVM */ > +extern u32 xive_native_default_eq_shift(void); > +extern u32 xive_native_alloc_vp_block(u32 max_vcpus); > +extern void xive_native_free_vp_block(u32 vp_base); > +extern int xive_native_populate_irq_data(u32 hw_irq, > + struct xive_irq_data *data); > +extern void xive_cleanup_irq_data(struct xive_irq_data *xd); > +extern u32 xive_native_alloc_irq(void); > +extern void xive_native_free_irq(u32 irq); > +extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); > + > +extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, > + __be32 *qpage, u32 order, bool can_escalate); > +extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); > + > +extern bool __xive_irq_trigger(struct xive_irq_data *xd); > +extern bool __xive_irq_retrigger(struct xive_irq_data *xd); > +extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd); > + > +extern bool is_xive_irq(struct irq_chip *chip); > + > +#else > + > +static inline bool xive_enabled(void) { return false; } > + > +static inline bool xive_native_init(void) { return false; } > +static inline void xive_smp_probe(void) { } > +extern inline int xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; } > +static inline void xive_smp_setup_cpu(void) { } > +static inline void xive_smp_disable_cpu(void) { } > +static inline void xive_kexec_teardown_cpu(int secondary) { } > +static inline void xive_shutdown(void) { } > +static inline void xive_flush_interrupt(void) { } > + > +static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) > + { return XIVE_INVALID_VP; } > +static inline void xive_native_free_vp_block(u32 vp_base) { } > + > +#endif > + > +#endif /* _ASM_POWERPC_XIVE_H */ > diff --git a/arch/powerpc/include/asm/xmon.h b/arch/powerpc/include/asm/xmon.h > index 5eb8e59..eb42a0c 100644 > --- a/arch/powerpc/include/asm/xmon.h > +++ b/arch/powerpc/include/asm/xmon.h > @@ -29,5 +29,7 @@ static inline void xmon_register_spus(struct list_head *list) { }; > extern int cpus_are_in_xmon(void); > #endif > > +extern void xmon_printf(const char *format, ...); > + > #endif /* __KERNEL __ */ > #endif /* __ASM_POWERPC_XMON_H */ > diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig > index 3a07e4d..81ee2ed 100644 > --- a/arch/powerpc/platforms/powernv/Kconfig > +++ b/arch/powerpc/platforms/powernv/Kconfig > @@ -4,6 +4,8 @@ config PPC_POWERNV > select PPC_NATIVE > select PPC_XICS > select PPC_ICP_NATIVE > + select PPC_XIVE > + select PPC_XIVE_NATIVE > select PPC_P7_NAP > select PCI > select PCI_MSI > diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c > index d50c7d9..adceac9 100644 > --- a/arch/powerpc/platforms/powernv/setup.c > +++ 
b/arch/powerpc/platforms/powernv/setup.c > @@ -32,6 +32,7 @@ > #include <asm/machdep.h> > #include <asm/firmware.h> > #include <asm/xics.h> > +#include <asm/xive.h> > #include <asm/opal.h> > #include <asm/kexec.h> > #include <asm/smp.h> > @@ -76,7 +77,9 @@ static void __init pnv_init(void) > > static void __init pnv_init_IRQ(void) > { > - xics_init(); > + /* Try using a XIVE if available, otherwise use a XICS */ > + if (!xive_native_init()) > + xics_init(); > > WARN_ON(!ppc_md.get_irq); > } > @@ -218,10 +221,12 @@ static void pnv_kexec_wait_secondaries_down(void) > > static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) > { > - xics_kexec_teardown_cpu(secondary); > + if (xive_enabled()) > + xive_kexec_teardown_cpu(secondary); > + else > + xics_kexec_teardown_cpu(secondary); > > /* On OPAL, we return all CPUs to firmware */ > - > if (!firmware_has_feature(FW_FEATURE_OPAL)) > return; > > @@ -237,6 +242,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) > /* Primary waits for the secondaries to have reached OPAL */ > pnv_kexec_wait_secondaries_down(); > > + /* Switch XIVE back to emulation mode */ > + if (xive_enabled()) > + xive_shutdown(); > + > /* > * We might be running as little-endian - now that interrupts > * are disabled, reset the HILE bit to big-endian so we don't > diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c > index 8b67e1e..f571955 100644 > --- a/arch/powerpc/platforms/powernv/smp.c > +++ b/arch/powerpc/platforms/powernv/smp.c > @@ -29,6 +29,7 @@ > #include <asm/vdso_datapage.h> > #include <asm/cputhreads.h> > #include <asm/xics.h> > +#include <asm/xive.h> > #include <asm/opal.h> > #include <asm/runlatch.h> > #include <asm/code-patching.h> > @@ -47,7 +48,9 @@ > > static void pnv_smp_setup_cpu(int cpu) > { > - if (cpu != boot_cpuid) > + if (xive_enabled()) > + xive_smp_setup_cpu(); > + else if (cpu != boot_cpuid) > xics_setup_cpu(); > > #ifdef CONFIG_PPC_DOORBELL > @@ -132,7 +135,10 @@ static int pnv_smp_cpu_disable(void) > vdso_data->processorCount--; > if (cpu == boot_cpuid) > boot_cpuid = cpumask_any(cpu_online_mask); > - xics_migrate_irqs_away(); > + if (xive_enabled()) > + xive_smp_disable_cpu(); > + else > + xics_migrate_irqs_away(); > return 0; > } > > @@ -213,9 +219,12 @@ static void pnv_smp_cpu_kill_self(void) > if (((srr1 & wmask) == SRR1_WAKEEE) || > ((srr1 & wmask) == SRR1_WAKEHVI) || > (local_paca->irq_happened & PACA_IRQ_EE)) { > - if (cpu_has_feature(CPU_FTR_ARCH_300)) > - icp_opal_flush_interrupt(); > - else > + if (cpu_has_feature(CPU_FTR_ARCH_300)) { > + if (xive_enabled()) > + xive_flush_interrupt(); > + else > + icp_opal_flush_interrupt(); > + } else > icp_native_flush_interrupt(); > } else if ((srr1 & wmask) == SRR1_WAKEHDBELL) { > unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); > @@ -252,10 +261,26 @@ static int pnv_cpu_bootable(unsigned int nr) > return smp_generic_cpu_bootable(nr); > } > > +static int pnv_smp_prepare_cpu(int cpu) > +{ > + if (xive_enabled()) > + return xive_smp_prepare_cpu(cpu); > + return 0; > +} > + > +static void __init pnv_smp_probe(void) > +{ > + if (xive_enabled()) > + xive_smp_probe(); > + else > + xics_smp_probe(); > +} > + > static struct smp_ops_t pnv_smp_ops = { > .message_pass = smp_muxed_ipi_message_pass, > - .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */ > - .probe = xics_smp_probe, > + .cause_ipi = NULL, /* Filled at runtime by xi{cs,ve}_smp_probe() */ > + .probe = pnv_smp_probe, > + .prepare_cpu = pnv_smp_prepare_cpu, > 
.kick_cpu = pnv_smp_kick_cpu, > .setup_cpu = pnv_smp_setup_cpu, > .cpu_bootable = pnv_cpu_bootable, > diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig > index 52dc165..caf882e 100644 > --- a/arch/powerpc/sysdev/Kconfig > +++ b/arch/powerpc/sysdev/Kconfig > @@ -28,6 +28,7 @@ config PPC_MSI_BITMAP > default y if PPC_POWERNV > > source "arch/powerpc/sysdev/xics/Kconfig" > +source "arch/powerpc/sysdev/xive/Kconfig" > > config PPC_SCOM > bool > diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile > index a254824..c0ae11d 100644 > --- a/arch/powerpc/sysdev/Makefile > +++ b/arch/powerpc/sysdev/Makefile > @@ -71,5 +71,6 @@ obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS) += udbg_memcons.o > subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror > > obj-$(CONFIG_PPC_XICS) += xics/ > +obj-$(CONFIG_PPC_XIVE) += xive/ > > obj-$(CONFIG_GE_FPGA) += ge/ > diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig > new file mode 100644 > index 0000000..c8816c8 > --- /dev/null > +++ b/arch/powerpc/sysdev/xive/Kconfig > @@ -0,0 +1,7 @@ > +config PPC_XIVE > + def_bool n > + select PPC_SMP_MUXED_IPI > + select HARDIRQS_SW_RESEND > + > +config PPC_XIVE_NATIVE > + def_bool n > diff --git a/arch/powerpc/sysdev/xive/Makefile b/arch/powerpc/sysdev/xive/Makefile > new file mode 100644 > index 0000000..3fab303 > --- /dev/null > +++ b/arch/powerpc/sysdev/xive/Makefile > @@ -0,0 +1,4 @@ > +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror > + > +obj-y += common.o > +obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o > diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c > new file mode 100644 > index 0000000..96037e0 > --- /dev/null > +++ b/arch/powerpc/sysdev/xive/common.c > @@ -0,0 +1,1175 @@ > +/* > + * Copyright 2016,2017 IBM Corporation. > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License > + * as published by the Free Software Foundation; either version > + * 2 of the License, or (at your option) any later version. > + */ If here you put: #define pr_fmt(fmt) "xive: " fmt Then you can drop the prefix from every pr_xxx() in the whole file. > +#include <linux/types.h> > +#include <linux/threads.h> > +#include <linux/kernel.h> > +#include <linux/irq.h> > +#include <linux/debugfs.h> Unused? > +#include <linux/smp.h> > +#include <linux/interrupt.h> > +#include <linux/seq_file.h> Unused? > +#include <linux/init.h> > +#include <linux/cpu.h> > +#include <linux/of.h> > +#include <linux/slab.h> > +#include <linux/spinlock.h> > +#include <linux/msi.h> > + > +#include <asm/prom.h> > +#include <asm/io.h> > +#include <asm/smp.h> > +#include <asm/machdep.h> > +#include <asm/irq.h> > +#include <asm/errno.h> > +#include <asm/xive.h> > +#include <asm/xmon.h> > + > +#include "xive-regs.h" > +#include "xive-internal.h" > + > +#undef DEBUG_FLUSH > +#undef DEBUG_ALL > + > +#define DBG(fmt...) pr_devel("XIVE: " fmt) > + > +#ifdef DEBUG_ALL > +#define DBG_VERBOSE(fmt...) pr_devel("XIVE: " fmt) > +#else > +#define DBG_VERBOSE(fmt...) 
do { } while(0) > +#endif > + > +bool __xive_enabled; > +bool xive_cmdline_disabled; > + > +/* We use only one priority for now */ > +static u8 xive_irq_priority; > + > +void __iomem *xive_tm_area; > +u32 xive_tm_offset; > +static const struct xive_ops *xive_ops; > +static struct irq_domain *xive_irq_domain; > + > +/* The IPIs all use the same logical irq number */ > +static u32 xive_ipi_irq; > + > +/* Xive state for each CPU */ > +static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu); > + > +/* > + * A "disabled" interrupt should never fire, to catch problems > + * we set its logical number to this > + */ > +#define XIVE_BAD_IRQ 0x7fffffff Can it be anything? How about 0x7fbadbad ? > +#define XIVE_MAX_IRQ (XIVE_BAD_IRQ - 1) > + > +/* An invalid CPU target */ > +#define XIVE_INVALID_TARGET (-1) > + > +static u32 xive_read_eq(struct xive_q *q, u8 prio, bool just_peek) Can it have a doc comment? And tell me what an EQ is? > +{ > + u32 cur; > + > + if (!q->qpage) > + return 0; A newline or .. > + cur = be32_to_cpup(q->qpage + q->idx); > + if ((cur >> 31) == q->toggle) > + return 0; .. two wouldn't hurt here. > + if (!just_peek) { > + q->idx = (q->idx + 1) & q->msk; > + if (q->idx == 0) > + q->toggle ^= 1; > + } > + return cur & 0x7fffffff; Is that XIVE_BAD_IRQ ? > +} > + > +static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) > +{ > + u32 hirq = 0; Is that a hwirq or something different? > + u8 prio; > + > + /* Find highest pending priority */ > + while (xc->pending_prio != 0) { > + struct xive_q *q; > + > + prio = ffs(xc->pending_prio) - 1; > + DBG_VERBOSE("scan_irq: trying prio %d\n", prio); > + > + /* Try to fetch */ > + hirq = xive_read_eq(&xc->queue[prio], prio, just_peek); > + > + /* Found something ? That's it */ > + if (hirq) > + break; > + > + /* Clear pending bits */ > + xc->pending_prio &= ~(1 << prio); > + > + /* > + * Check if the queue count needs adjusting due to > + * interrupts being moved away. > + */ > + q = &xc->queue[prio]; > + if (atomic_read(&q->pending_count)) { > + int p = atomic_xchg(&q->pending_count, 0); > + if (p) { > + WARN_ON(p > atomic_read(&q->count)); > + atomic_sub(p, &q->count); I am not sure what's going on there. > + } > + } > + } > + > + /* If nothing was found, set CPPR to 0xff */ Would be nice to spell out CPPR somewhere. > + if (hirq == 0) > + prio = 0xff; > + > + /* Update HW CPPR to match if necessary */ > + if (prio != xc->cppr) { > + DBG_VERBOSE("scan_irq: adjusting CPPR to %d\n", prio); > + xc->cppr = prio; > + out_8(xive_tm_area + xive_tm_offset + TM_CPPR, prio); What's the out_8() doing? I was expecting it to use xc, or something per-cpu. > + } > + > + return hirq; > +} > + > +#ifdef CONFIG_XMON > +static void xive_dump_eq(const char *name, struct xive_q *q) > +{ > + u32 i0, i1, idx; > + > + if (!q->qpage) > + return; > + idx = q->idx; > + i0 = be32_to_cpup(q->qpage + idx); > + idx = (idx + 1) & q->msk; > + i1 = be32_to_cpup(q->qpage + idx); > + xmon_printf(" %s Q T=%d %08x %08x ...\n", name, > + q->toggle, i0, i1); > +} > + > +void xmon_xive_do_dump(int cpu) > +{ > + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); > + struct xive_irq_data *xd; > + uint64_t val, offset; u64 ? 
> + > + xmon_printf("XIVE state for CPU %d:\n", cpu); > + xmon_printf(" pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr); > + xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]); > + xd = &xc->ipi_data; > + offset = 0x800; > + if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG) > + offset |= offset << 4; > + val = in_be64(xd->eoi_mmio + offset); > + xmon_printf(" IPI state: %x:%c%c\n", xc->hw_ipi, > + val & 2 ? 'P' : 'p', > + val & 1 ? 'Q' : 'q'); > +} > +#endif /* CONFIG_XMON */ > + > +static void xive_update_pending_irqs(struct xive_cpu *xc) > +{ > + u8 he, cppr; > + u16 ack; > + > + /* Perform the acknowledge hypervisor to register cycle */ > + ack = be16_to_cpu(__raw_readw(xive_tm_area + TM_SPC_ACK_HV_REG)); > + > + /* Synchronize subsequent queue accesses */ > + mb(); > + > + DBG_VERBOSE("CPU %d get_irq, ack=%04x\n", smp_processor_id(), ack); > + > + /* Check the HE field */ > + cppr = ack & 0xff; > + he = GETFIELD(TM_QW3_NSR_HE, (ack >> 8)); > + switch(he) { > + case TM_QW3_NSR_HE_NONE: > + break; > + case TM_QW3_NSR_HE_PHYS: > + if (cppr == 0xff) > + return; > + xc->pending_prio |= 1 << cppr; > + if (cppr >= xc->cppr) > + pr_err("XIVE: CPU %d odd ack CPPR, got %d at %d\n", > + smp_processor_id(), cppr, xc->cppr); > + xc->cppr = cppr; > + break; > + case TM_QW3_NSR_HE_POOL: > + case TM_QW3_NSR_HE_LSI: > + pr_err("XIVE: CPU %d got unexpected interrupt type HE=%d\n", > + smp_processor_id(), he); > + return; > + } > +} > + > +static unsigned int xive_get_irq(void) > +{ > + struct xive_cpu *xc = __this_cpu_read(xive_cpu); > + u32 hirq; > + > + /* > + * This can be called either as a result of a HW interrupt or > + * as a "replay" because EOI decided there was still something > + * in one of the queues. > + * > + * First we perform an ACK cycle in order to update our mask > + * of pending priorities. This will also have the effect of > + * updating the CPPR to the most favored pending interrupts. > + * > + * In the future, if we have a way to differenciate a first > + * entry (on HW interrupt) from a replay triggered by EOI, > + * we could skip this on replays unless we soft-mask tells us > + * that a new HW interrupt occurred. > + */ > + xive_update_pending_irqs(xc); > + > + DBG_VERBOSE("get_irq: pending=%02x\n", xc->pending_prio); > + > + hirq = xive_scan_interrupts(xc, false); > + > + DBG_VERBOSE("get_irq: got irq 0x%x, new pending=0x%02x\n", > + hirq, xc->pending_prio); > + > + /* Return pending interrupt if any */ > + if (hirq == XIVE_BAD_IRQ) > + return 0; > + return hirq; > +} > + > + > +static void xive_do_queue_eoi(struct xive_cpu *xc) > +{ > + if (xive_scan_interrupts(xc, true) != 0) { > + DBG_VERBOSE("eoi: pending=0x%02x\n", xc->pending_prio); > + force_external_irq_replay(); > + } > +} > + > +static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset) > +{ > + u64 val; > + > + if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG) > + offset |= offset << 4; > + > + val = in_be64(xd->eoi_mmio + offset); > + > + return (u8)val; > +} > + > +static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) > +{ > + /* If the XIVE supports the new "store EOI facility, use it */ > + if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) > + out_be64(xd->eoi_mmio, 0); > + else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { > + if (WARN_ON_ONCE(!xive_ops->eoi)) > + return; > + xive_ops->eoi(hw_irq); > + } else { > + uint8_t eoi_val; u8? > + > + /* > + * Otherwise for EOI, we use the special MMIO that does > + * a clear of both P and Q and returns the old Q. 
> + * > + * This allows us to then do a re-trigger if Q was set > + * rather than synthetizing an interrupt in software > + */ > + eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00); > + DBG_VERBOSE("eoi_val=%x\n", offset, eoi_val); > + > + if ((xd->flags & XIVE_IRQ_FLAG_LSI) || !(eoi_val & 1)) > + return; > + > + /* Re-trigger */ > + if (xd->trig_mmio) > + out_be64(xd->trig_mmio, 0); > + } > + > +} > + > +static void xive_irq_eoi(struct irq_data *d) > +{ > + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); > + struct xive_cpu *xc = __this_cpu_read(xive_cpu); > + > + DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n", > + d->irq, irqd_to_hwirq(d), xc->pending_prio); > + > + if (!irqd_irq_disabled(d)) > + xive_do_source_eoi(irqd_to_hwirq(d), xd); > + > + /* > + * Clear saved_p to indicate that it's no longer occupying > + * a queue slot on the target queue > + */ > + xd->saved_p = false; > + > + xive_do_queue_eoi(xc); > +} > + > +static void xive_do_source_set_mask(struct xive_irq_data *xd, > + bool masked) > +{ > + if (masked) > + xive_poke_esb(xd, XIVE_ESB_SET_PQ_01); > + else > + xive_poke_esb(xd, XIVE_ESB_SET_PQ_00); > +} > + > +static bool xive_try_pick_target(int cpu) > +{ > + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); > + struct xive_q *q = &xc->queue[xive_irq_priority]; > + int max; > + > + /* Calculate max number of interrupts in that queue. > + * > + * We leave a gap of 1 just in case... > + */ > + max = (q->msk + 1) - 1; > + return !!atomic_add_unless(&q->count, 1, max); > +} > + > +static void xive_dec_target_count(int cpu) > +{ > + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); > + struct xive_q *q = &xc->queue[xive_irq_priority]; > + > + if (WARN_ON(cpu < 0)) > + return; > + > + /* > + * We increment the "pending count" which will be used > + * to decrement the target queue count whenever it's next > + * processed and found empty. This ensure that we don't > + * decrement while we still have the interrupt there > + * occupying a slot. > + */ > + atomic_inc(&q->pending_count); > +} > + > +static int xive_find_target_in_mask(const struct cpumask *mask, > + unsigned int fuzz) > +{ > + int cpu, first, num, i; > + > + /* Pick up a starting point CPU in the mask based on fuzz */ > + num = cpumask_weight(mask); > + first = (fuzz++) % num; > + > + /* Locate it */ > + cpu = cpumask_first(mask); > + for (i = 0; i < first; i++) > + cpu = cpumask_next(cpu, mask); > + first = cpu; > + > + /* > + * Now go through the entire mask until we find a valid > + * target. > + */ > + for (;;) { > + /* > + * We re-check online as the fallback case passes us > + * an untested affinity mask > + */ > + if (cpu_online(cpu) && xive_try_pick_target(cpu)) > + return cpu; > + cpu = cpumask_next(cpu, mask); > + if (cpu == first) > + break; > + } > + return -1; > +} > + > +static int xive_pick_irq_target(struct irq_data *d, > + const struct cpumask *affinity) > +{ > + static unsigned int fuzz; > + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); > + cpumask_var_t mask; > + int cpu = -1; > + > + /* > + * Pick a target CPU for an interrupt. This is done at > + * startup or if the affinity is changed in a way that > + * invalidates the current target. 
> + */ > + > + /* If we have chip IDs, first we try to build a mask of > + * CPUs matching ther CPU and find a target in there > + */ > + if (xd->src_chip != XIVE_INVALID_CHIP_ID && > + zalloc_cpumask_var(&mask, GFP_ATOMIC)) { > + /* Build a mask of matching chip IDs */ > + for_each_cpu_and(cpu, affinity, cpu_online_mask) { > + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); > + if (xc->chip_id == xd->src_chip) > + cpumask_set_cpu(cpu, mask); > + } > + /* Try to find a target */ > + if (!cpumask_empty(mask)) > + cpu = xive_find_target_in_mask(mask, fuzz++); > + free_cpumask_var(mask); > + if (cpu >= 0) > + return cpu; > + fuzz--; > + } > + > + /* No chip IDs, fallback to using the affinity mask */ > + return xive_find_target_in_mask(affinity, fuzz++); > +} > + > +static unsigned int xive_irq_startup(struct irq_data *d) > +{ > + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); > + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); > + int target, rc; > + > + DBG("xive_irq_startup: irq %d [0x%x] data @%p\n", > + d->irq, hw_irq, d); > + > +#ifdef CONFIG_PCI_MSI > + /* > + * The generic MSI code returns with the interrupt disabled on the > + * card, using the MSI mask bits. Firmware doesn't appear to unmask > + * at that level, so we do it here by hand. > + */ > + if (irq_data_get_msi_desc(d)) > + pci_msi_unmask_irq(d); > +#endif > + > + /* Pick a target */ > + target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d)); > + if (target == XIVE_INVALID_TARGET) { > + /* Try again breaking affinity */ > + target = xive_pick_irq_target(d, cpu_online_mask); > + if (target == XIVE_INVALID_TARGET) > + return -ENXIO; > + pr_warn("XIVE: irq %d started with broken affinity\n", > + d->irq); > + } > + xd->target = target; > + > + /* > + * Configure the logical number to be the Linux IRQ number > + * and set the target queue > + */ > + rc = xive_ops->configure_irq(hw_irq, > + get_hard_smp_processor_id(target), > + xive_irq_priority, d->irq); > + if (rc) > + return rc; > + > + /* Unmask the ESB */ > + xive_do_source_set_mask(xd, false); > + > + return 0; > +} > + > +static void xive_irq_shutdown(struct irq_data *d) > +{ > + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); > + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); > + > + DBG("xive_irq_shutdown: irq %d [0x%x] data @%p\n", > + d->irq, hw_irq, d); > + > + if (WARN_ON(xd->target == XIVE_INVALID_TARGET)) > + return; > + > + /* Mask the interrupt at the source */ > + xive_do_source_set_mask(xd, true); > + > + /* Mask the interrupt in HW in the IVT/EAS */ > + xive_ops->configure_irq(hw_irq, > + get_hard_smp_processor_id(xd->target), > + 0xff, hw_irq); > + > + xive_dec_target_count(xd->target); > + xd->target = XIVE_INVALID_TARGET; > +} > + > +static void xive_irq_unmask(struct irq_data *d) > +{ > + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); > + > + DBG("xive_irq_unmask: irq %d data @%p\n", d->irq, xd); > + > + /* > + * This is a workaround for PCI LSI problems on P9, for > + * these, we call FW to set the mask. 
The problems might > + * be fixed by P9 DD2.0, if that is the case, we will make > + * this a DD1 workaround only > + */ > + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) { > + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); > + xive_ops->configure_irq(hw_irq, > + get_hard_smp_processor_id(xd->target), > + xive_irq_priority, d->irq); > + return; > + } > + > + xive_do_source_set_mask(xd, false); > +} > + > +static void xive_irq_mask(struct irq_data *d) > +{ > + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); > + > + DBG("xive_irq_mask: irq %d data @%p\n", d->irq, xd); > + > + /* > + * This is a workaround for PCI LSI problems on P9, for > + * these, we call OPAL to set the mask. The problems might > + * be fixed by P9 DD2.0, if that is the case, we will make > + * this a DD1 workaround only > + */ > + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) { > + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); > + xive_ops->configure_irq(hw_irq, > + get_hard_smp_processor_id(xd->target), > + 0xff, d->irq); > + return; > + } > + > + xive_do_source_set_mask(xd, true); > +} > + > +static int xive_irq_set_affinity(struct irq_data *d, > + const struct cpumask *cpumask, > + bool force) > +{ > + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); > + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); > + u32 target, old_target; > + int rc = 0; > + > + DBG("xive_irq_set_affinity: irq %d\n", d->irq); > + > + /* Is this valid ? */ > + if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) > + return -EINVAL; > + > + /* If existing target is already in the new mask, and is > + * online then do nothing. > + */ > + if (cpu_online(xd->target) && > + cpumask_test_cpu(xd->target, cpumask)) > + return IRQ_SET_MASK_OK; > + > + /* Pick a new target */ > + target = xive_pick_irq_target(d, cpumask); > + > + /* No target found */ > + if (target == XIVE_INVALID_TARGET) > + return -ENXIO; > + > + old_target = xd->target; > + > + /* > + * Only configure the irq if it's not currently passed-through to > + * a KVM guest > + */ > + rc = xive_ops->configure_irq(hw_irq, > + get_hard_smp_processor_id(target), > + xive_irq_priority, d->irq); > + if (rc < 0) { > + pr_err("XIVE: Error %d reconfiguring irq %d\n", rc, d->irq); > + return rc; > + } > + > + DBG(" target: 0x%x\n", target); > + xd->target = target; > + > + /* Give up previous target */ > + if (old_target != XIVE_INVALID_TARGET) > + xive_dec_target_count(old_target); > + > + return IRQ_SET_MASK_OK; > +} > + > +static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type) > +{ > + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); > + > + /* > + * We only support these. This has really no effect other than setting > + * the corresponding descriptor bits mind you but those will in turn > + * affect the resend function when re-enabling an edge interrupt. > + * > + * Set set the default to edge as explained in map(). 
> + */ > + if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE) > + flow_type = IRQ_TYPE_EDGE_RISING; > + > + if (flow_type != IRQ_TYPE_EDGE_RISING && > + flow_type != IRQ_TYPE_LEVEL_LOW) > + return -EINVAL; > + > + irqd_set_trigger_type(d, flow_type); > + > + /* > + * Double check it matches what the FW thinks > + * > + * NOTE: We don't know yet if the PAPR interface will provide > + * the LSI vs MSI information appart from the device-tree so > + * this check might have to move into an optional backend call > + * that is specific to the native backend > + */ > + if ((flow_type == IRQ_TYPE_LEVEL_LOW) != > + !!(xd->flags & XIVE_IRQ_FLAG_LSI)) > + pr_warn("XIVE: Interrupt %d (HW 0x%x) type mismatch," > + " Linux says %s, FW says %s\n", > + d->irq, (u32)irqd_to_hwirq(d), > + (flow_type == IRQ_TYPE_LEVEL_LOW) ? "Level" : "Edge", > + (xd->flags & XIVE_IRQ_FLAG_LSI) ? "Level" : "Edge"); > + > + return IRQ_SET_MASK_OK_NOCOPY; > +} > + > +static int xive_irq_retrigger(struct irq_data *d) > +{ > + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); > + > + /* This should be only for MSIs */ > + if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) > + return 0; > + > + /* > + * To perform a retrigger, we first set the PQ bits to > + * 11, then perform an EOI. > + */ > + xive_poke_esb(xd, XIVE_ESB_SET_PQ_11); > + > + /* > + * Note: We pass "0" to the hw_irq argument in order to > + * avoid calling into the backend EOI code which we don't > + * want to do in the case of a re-trigger. Backends typically > + * only do EOI for LSIs anyway. > + */ > + xive_do_source_eoi(0, xd); > + > + return 1; > +} > + > +static struct irq_chip xive_irq_chip = { > + .name = "XIVE-IRQ", > + .irq_startup = xive_irq_startup, > + .irq_shutdown = xive_irq_shutdown, > + .irq_eoi = xive_irq_eoi, > + .irq_mask = xive_irq_mask, > + .irq_unmask = xive_irq_unmask, > + .irq_set_affinity = xive_irq_set_affinity, > + .irq_set_type = xive_irq_set_type, > + .irq_retrigger = xive_irq_retrigger, > +}; > + > +bool is_xive_irq(struct irq_chip *chip) > +{ > + return chip == &xive_irq_chip; > +} > + > +void xive_cleanup_irq_data(struct xive_irq_data *xd) > +{ > + if (xd->eoi_mmio) { > + iounmap(xd->eoi_mmio); > + if (xd->eoi_mmio == xd->trig_mmio) > + xd->trig_mmio = NULL; > + xd->eoi_mmio = NULL; > + } > + if (xd->trig_mmio) { > + iounmap(xd->trig_mmio); > + xd->trig_mmio = NULL; > + } > +} > + > +static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) > +{ > + struct xive_irq_data *xd; > + int rc; > + > + xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL); > + if (!xd) > + return -ENOMEM; > + rc = xive_ops->populate_irq_data(hw, xd); > + if (rc) { > + kfree(xd); > + return rc; > + } > + xd->target = XIVE_INVALID_TARGET; > + irq_set_handler_data(virq, xd); > + > + return 0; > +} > + > +static void xive_irq_free_data(unsigned int virq) > +{ > + struct xive_irq_data *xd = irq_get_handler_data(virq); > + > + if (!xd) > + return; > + irq_set_handler_data(virq, NULL); > + xive_cleanup_irq_data(xd); > + kfree(xd); > +} > + > +#ifdef CONFIG_SMP > + > +static void xive_cause_ipi(int cpu, unsigned long msg) > +{ > + struct xive_cpu *xc; > + struct xive_irq_data *xd; > + > + xc = per_cpu(xive_cpu, cpu); > + > + DBG_VERBOSE("IPI msg#%ld CPU %d -> %d (HW IRQ 0x%x)\n", > + msg, smp_processor_id(), cpu, xc->hw_ipi); > + > + xd = &xc->ipi_data; > + if (WARN_ON(!xd->trig_mmio)) > + return; > + out_be64(xd->trig_mmio, 0); > +} > + > +static irqreturn_t xive_muxed_ipi_action(int irq, void *dev_id) > +{ > + return 
smp_ipi_demux(); > +} > + > +static void xive_ipi_eoi(struct irq_data *d) > +{ > + struct xive_cpu *xc = __this_cpu_read(xive_cpu); > + > + /* Handle possible race with unplug and drop stale IPIs */ > + if (!xc) > + return; > + xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data); > + xive_do_queue_eoi(xc); > +} > + > +static void xive_ipi_unmask(struct irq_data *d) > +{ > + /* Nothing to do, we never mask IPIs, but the callback > + * must exist > + */ > +} > + > +static void xive_ipi_mask(struct irq_data *d) > +{ > + /* Nothing to do, we never mask IPIs, but the callback > + * must exist > + */ > +} > + > +static struct irq_chip xive_ipi_chip = { > + .name = "XIVE-IPI", > + .irq_eoi = xive_ipi_eoi, > + .irq_mask = xive_ipi_mask, > + .irq_unmask = xive_ipi_unmask, > +}; > + > +static void __init xive_request_ipi(void) > +{ > + unsigned int virq; > + > + /* Initialize it */ > + virq = irq_create_mapping(xive_irq_domain, 0); > + xive_ipi_irq = virq; > + > + BUG_ON(request_irq(virq, xive_muxed_ipi_action, > + IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); > +} > + > +static int xive_setup_cpu_ipi(unsigned int cpu) > +{ > + struct xive_cpu *xc; > + int rc; > + > + pr_debug("XIVE: Setting up IPI for CPU %d\n", cpu); > + > + xc = per_cpu(xive_cpu, cpu); > + > + /* Check if we are already setup */ > + if (xc->hw_ipi != 0) > + return 0; > + > + /* Grab an IPI from the backend, this will populate xc->hw_ipi */ > + if (xive_ops->get_ipi(cpu, xc)) > + return -EIO; > + > + /* Populate the IRQ data in the xive_cpu structure and > + * configure the HW / enable the IPIs > + */ > + rc = xive_ops->populate_irq_data(xc->hw_ipi, &xc->ipi_data); > + if (rc) { > + pr_err("XIVE: Failed to populate IPI data on CPU %d\n", cpu); > + return -EIO; > + } > + rc = xive_ops->configure_irq(xc->hw_ipi, > + get_hard_smp_processor_id(cpu), > + xive_irq_priority, xive_ipi_irq); > + if (rc) { > + pr_err("XIVE: Failed to map IPI CPU %d\n", cpu); > + return -EIO; > + } > + DBG("XIVE: CPU %d HW IPI %x, virq %d, trig_mmio=%p\n", cpu, > + xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio); > + > + /* Unmask it */ > + xive_do_source_set_mask(&xc->ipi_data, false); > + > + return 0; > +} > + > +static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc) > +{ > + /* Disable the IPI and free the IRQ data */ > + > + /* Already cleaned up ? */ > + if (xc->hw_ipi == 0) > + return; > + > + /* Mask the IPI */ > + xive_do_source_set_mask(&xc->ipi_data, true); > + > + /* > + * Note: We don't call xive_cleanup_irq_data() to free > + * the mappings as this is called from an IPI on kexec > + * which is not a safe environment to call iounmap() > + */ > + > + /* Deconfigure/mask in the backend */ > + xive_ops->configure_irq(xc->hw_ipi, hard_smp_processor_id(), > + 0xff, xive_ipi_irq); > + > + /* Free the IPIs in the backend */ > + xive_ops->put_ipi(cpu, xc); > +} > + > +void __init xive_smp_probe(void) > +{ > + smp_ops->cause_ipi = xive_cause_ipi; > + > + /* Register the IPI */ > + xive_request_ipi(); > + > + /* Allocate and setup IPI for the boot CPU */ > + xive_setup_cpu_ipi(smp_processor_id()); > +} > + > +#endif /* CONFIG_SMP */ > + > +static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq, > + irq_hw_number_t hw) > +{ > + int rc; > + > + /* > + * Mark interrupts as edge sensitive by default so that resend > + * actually works. Will fix that up below if needed. 
> + */ > + irq_clear_status_flags(virq, IRQ_LEVEL); > + > + /* IPIs are special and come up with HW number 0 */ > + if (hw == 0) { > + /* > + * IPIs are marked per-cpu. We use separate HW interrupts under > + * the hood but associated with the same "linux" interrupt > + */ > + irq_set_chip_and_handler(virq, &xive_ipi_chip, > + handle_percpu_irq); > + return 0; > + } > + > + rc = xive_irq_alloc_data(virq, hw); > + if (rc) > + return rc; > + > + irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq); > + > + return 0; > +} > + > +static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq) > +{ > + struct irq_data *data = irq_get_irq_data(virq); > + unsigned int hw_irq; > + > + if (!data) > + return; > + hw_irq = (unsigned int)irqd_to_hwirq(data); > + if (hw_irq) > + xive_irq_free_data(virq); > +} > + > +static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct, > + const u32 *intspec, unsigned int intsize, > + irq_hw_number_t *out_hwirq, unsigned int *out_flags) > + > +{ > + *out_hwirq = intspec[0]; > + > + /* > + * If intsize is at least 2, we look for the type in the second cell, > + * we assume the LSB indicates a level interrupt. > + */ > + if (intsize > 1) { > + if (intspec[1] & 1) > + *out_flags = IRQ_TYPE_LEVEL_LOW; > + else > + *out_flags = IRQ_TYPE_EDGE_RISING; > + } else > + *out_flags = IRQ_TYPE_LEVEL_LOW; > + > + return 0; > +} > + > +static int xive_irq_domain_match(struct irq_domain *h, struct device_node *node, > + enum irq_domain_bus_token bus_token) > +{ > + return xive_ops->match(node); > +} > + > +static const struct irq_domain_ops xive_irq_domain_ops = { > + .match = xive_irq_domain_match, > + .map = xive_irq_domain_map, > + .unmap = xive_irq_domain_unmap, > + .xlate = xive_irq_domain_xlate, > +}; > + > +static void __init xive_init_host(void) > +{ > + xive_irq_domain = irq_domain_add_nomap(NULL, XIVE_MAX_IRQ, > + &xive_irq_domain_ops, NULL); > + BUG_ON(xive_irq_domain == NULL); > + irq_set_default_host(xive_irq_domain); > +} > + > +static void xive_cleanup_cpu_queues(unsigned int cpu, struct xive_cpu *xc) > +{ > + if (xc->queue[xive_irq_priority].qpage) > + xive_ops->cleanup_queue(cpu, xc, xive_irq_priority); > +} > + > +static int xive_setup_cpu_queues(unsigned int cpu, struct xive_cpu *xc) > +{ > + int rc = 0; > + > + /* We setup 1 queues for now with a 64k page */ > + if (!xc->queue[xive_irq_priority].qpage) > + rc = xive_ops->setup_queue(cpu, xc, xive_irq_priority); > + > + return rc; > +} > + > +static int xive_prepare_cpu(unsigned int cpu) > +{ > + struct xive_cpu *xc; > + > + xc = per_cpu(xive_cpu, cpu); > + if (!xc) { > + struct device_node *np; > + > + xc = kzalloc_node(sizeof(struct xive_cpu), > + GFP_KERNEL, cpu_to_node(cpu)); > + if (!xc) > + return -ENOMEM; > + np = of_get_cpu_node(cpu, NULL); > + if (np) > + xc->chip_id = of_get_ibm_chip_id(np); > + of_node_put(np); > + > + per_cpu(xive_cpu, cpu) = xc; > + } > + > + /* Setup EQs if not already */ > + return xive_setup_cpu_queues(cpu, xc); > +} > + > +static void xive_setup_cpu(void) > +{ > + struct xive_cpu *xc = __this_cpu_read(xive_cpu); > + > + /* Debug: Dump the TM state */ > + DBG("CPU %d [HW 0x%02x] VT=%02x\n", > + smp_processor_id(), hard_smp_processor_id(), > + in_8(xive_tm_area + xive_tm_offset + TM_WORD2)); > + > + /* The backend might have additional things to do */ > + if (xive_ops->setup_cpu) > + xive_ops->setup_cpu(smp_processor_id(), xc); > + > + /* Set CPPR to 0xff to enable flow of interrupts */ > + xc->cppr = 0xff; > + out_8(xive_tm_area + 
xive_tm_offset + TM_CPPR, 0xff); > +} > + > +#ifdef CONFIG_SMP > +void xive_smp_setup_cpu(void) > +{ > + DBG("XIVE: SMP setup CPU %d\n", smp_processor_id()); > + > + /* This will have already been done on the boot CPU */ > + if (smp_processor_id() != boot_cpuid) > + xive_setup_cpu(); > + > +} > + > +int xive_smp_prepare_cpu(unsigned int cpu) > +{ > + int rc; > + > + /* Allocate per-CPU data and queues */ > + rc = xive_prepare_cpu(cpu); > + if (rc) > + return rc; > + > + /* Allocate and setup IPI for the new CPU */ > + return xive_setup_cpu_ipi(cpu); > +} > + > +#ifdef CONFIG_HOTPLUG_CPU > +static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc) > +{ > + u32 irq; > + > + /* We assume local irqs are disabled */ > + WARN_ON(!irqs_disabled()); > + > + /* Check what's already in the CPU queue */ > + while ((irq = xive_scan_interrupts(xc, false)) != 0) { > + /* > + * We need to re-route that interrupt to its new distination. > + * First get and lock the descriptor > + */ > + struct irq_desc *desc = irq_to_desc(irq); > + struct irq_data *d = irq_desc_get_irq_data(desc); > + struct xive_irq_data *xd; > + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); > + > + /* > + * Ignore anything that isn't a XIVE irq and ignore > + * IPIs, so can just be dropped. > + */ > + if (d->domain != xive_irq_domain || hw_irq == 0) > + continue; > +#ifdef DEBUG_FLUSH > + pr_info("CPU %d: Got irq %d while offline, re-routing...\n", > + cpu, irq); > +#endif > + raw_spin_lock(&desc->lock); > + xd = irq_desc_get_handler_data(desc); > + > + /* For LSIs, we EOI, this will cause a resend if it's > + * still asserted. Otherwise do an MSI retrigger > + */ > + if (xd->flags & XIVE_IRQ_FLAG_LSI) > + xive_do_source_eoi(irqd_to_hwirq(d), xd); > + else > + xive_irq_retrigger(d); > + raw_spin_unlock(&desc->lock); > + } > +} > + > +void xive_smp_disable_cpu(void) > +{ > + struct xive_cpu *xc = __this_cpu_read(xive_cpu); > + unsigned int cpu = smp_processor_id(); > + > + /* Migrate interrupts away from the CPU */ > + irq_migrate_all_off_this_cpu(); > + > + /* Set CPPR to 0 to disable flow of interrupts */ > + xc->cppr = 0; > + out_8(xive_tm_area + xive_tm_offset + TM_CPPR, 0); > + > + /* Flush everything still in the queue */ > + xive_flush_cpu_queue(cpu, xc); > + > + /* Re-enable CPPR */ > + xc->cppr = 0xff; > + out_8(xive_tm_area + xive_tm_offset + TM_CPPR, 0xff); > +} > + > +void xive_flush_interrupt(void) > +{ > + struct xive_cpu *xc = __this_cpu_read(xive_cpu); > + unsigned int cpu = smp_processor_id(); > + > + /* Called if an interrupt occurs while the CPU is hot unplugged */ > + xive_flush_cpu_queue(cpu, xc); > +} > + > +#endif /* CONFIG_HOTPLUG_CPU */ > + > +#endif /* CONFIG_SMP */ > + > +void xive_kexec_teardown_cpu(int secondary) > +{ > + struct xive_cpu *xc = __this_cpu_read(xive_cpu); > + unsigned int cpu = smp_processor_id(); > + > + /* Set CPPR to 0 to disable flow of interrupts */ > + xc->cppr = 0; > + out_8(xive_tm_area + xive_tm_offset + TM_CPPR, 0); > + > + /* Backend cleanup if any */ > + if (xive_ops->teardown_cpu) > + xive_ops->teardown_cpu(cpu, xc); > + > + /* Get rid of IPI */ > + xive_cleanup_cpu_ipi(cpu, xc); > + > + /* Disable and free the queues */ > + xive_cleanup_cpu_queues(cpu, xc); > +} > + > +void xive_shutdown(void) > +{ > + xive_ops->shutdown(); > +} > + > +bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, > + u8 max_prio) > +{ > + xive_tm_area = area; > + xive_tm_offset = offset; > + xive_ops = ops; > + xive_irq_priority = max_prio; > + > + 
ppc_md.get_irq = xive_get_irq; > + __xive_enabled = true; > + > + DBG("Initializing host..\n"); > + xive_init_host(); > + > + DBG("Initializing boot CPU..\n"); > + > + /* Allocate per-CPU data and queues */ > + xive_prepare_cpu(smp_processor_id()); > + > + /* Get ready for interrupts */ > + xive_setup_cpu(); > + > + pr_info("XIVE: Interrupt handling intialized with %s backend\n", > + xive_ops->name); > + pr_info("XIVE: Using priority %d for all interrupts\n", max_prio); > + > + return true; > +} > + > +static int __init xive_off(char *arg) > +{ > + xive_cmdline_disabled = true; > + return 0; > +} > +__setup("xive=off", xive_off); > diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c > new file mode 100644 > index 0000000..26cc6bf > --- /dev/null > +++ b/arch/powerpc/sysdev/xive/native.c > @@ -0,0 +1,604 @@ > +/* > + * Copyright 2016,2017 IBM Corporation. > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public License > + * as published by the Free Software Foundation; either version > + * 2 of the License, or (at your option) any later version. > + */ > +#include <linux/types.h> > +#include <linux/irq.h> > +#include <linux/debugfs.h> Unused? > +#include <linux/smp.h> > +#include <linux/interrupt.h> > +#include <linux/seq_file.h> Unused? > +#include <linux/init.h> > +#include <linux/of.h> > +#include <linux/slab.h> > +#include <linux/spinlock.h> > +#include <linux/delay.h> > +#include <linux/cpumask.h> > +#include <linux/mm.h> > + > +#include <asm/prom.h> > +#include <asm/io.h> > +#include <asm/smp.h> > +#include <asm/irq.h> > +#include <asm/errno.h> > +#include <asm/xive.h> > +#include <asm/opal.h> > + > +#include "xive-regs.h" > +#include "xive-internal.h" > + > +#define DBG(fmt...) pr_devel("XIVE: " fmt) > + > +/* Enable this for using queue MMIO page for EOI. We don't currently > + * use it as we always notify > + */ > +#undef USE_QUEUE_MMIO Dead code? Or we want to keep it? 
> +static u32 xive_provision_size; > +static u32 *xive_provision_chips; > +static u32 xive_provision_chip_count; > +static u32 xive_queue_shift; > +static u32 xive_pool_vps = XIVE_INVALID_VP; > +static struct kmem_cache *xive_provision_cache; > + > +int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) > +{ > + __be64 flags, eoi_page, trig_page; > + __be32 esb_shift, src_chip; > + u64 opal_flags; > + s64 rc; > + > + memset(data, 0, sizeof(*data)); > + > + rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page, > + &esb_shift, &src_chip); > + if (rc) { > + pr_err("XIVE: opal_xive_get_irq_info(0x%x) returned %lld\n", > + hw_irq, rc); > + return -EINVAL; > + } > + > + opal_flags = be64_to_cpu(flags); > + if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI) > + data->flags |= XIVE_IRQ_FLAG_STORE_EOI; > + if (opal_flags & OPAL_XIVE_IRQ_LSI) > + data->flags |= XIVE_IRQ_FLAG_LSI; > + if (opal_flags & OPAL_XIVE_IRQ_SHIFT_BUG) > + data->flags |= XIVE_IRQ_FLAG_SHIFT_BUG; > + if (opal_flags & OPAL_XIVE_IRQ_MASK_VIA_FW) > + data->flags |= XIVE_IRQ_FLAG_MASK_FW; > + if (opal_flags & OPAL_XIVE_IRQ_EOI_VIA_FW) > + data->flags |= XIVE_IRQ_FLAG_EOI_FW; > + data->eoi_page = be64_to_cpu(eoi_page); > + data->trig_page = be64_to_cpu(trig_page); > + data->esb_shift = be32_to_cpu(esb_shift); > + data->src_chip = be32_to_cpu(src_chip); > + > + data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift); > + if (!data->eoi_mmio) { > + pr_err("XIVE: Failed to map EOI page for irq 0x%x\n", hw_irq); > + return -ENOMEM; > + } > + > + if (!data->trig_page) > + return 0; > + if (data->trig_page == data->eoi_page) { > + data->trig_mmio = data->eoi_mmio; > + return 0; > + } > + > + data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift); > + if (!data->trig_mmio) { > + pr_err("XIVE: Failed to map trigger page for irq 0x%x\n", hw_irq); > + return -ENOMEM; > + } > + return 0; > +} > + > +int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) > +{ > + s64 rc; > + > + for (;;) { > + rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq); > + if (rc != OPAL_BUSY) > + break; > + msleep(1); > + } > + return rc == 0 ? 0 : -ENXIO; > +} > + > +/* This can be called multiple time to change a queue configuration */ > +int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, > + __be32 *qpage, u32 order, bool can_escalate) > +{ > + s64 rc = 0; > + __be64 qeoi_page_be; > + __be32 esc_irq_be; > + u64 flags, qpage_phys; > + > + /* If there's an actual queue page, clean it */ > + if (order) { > + BUG_ON(!qpage); Can't we just return an error? > + qpage_phys = __pa(qpage); > + } else > + qpage_phys = 0; > + > + /* Initialize the rest of the fields */ > + q->msk = order ? ((1u << (order - 2)) - 1) : 0; > + q->idx = 0; > + q->toggle = 0; > + > + rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL, > + &qeoi_page_be, > + &esc_irq_be, > + NULL); > + if (rc) { > + pr_err("XIVE: Error %lld getting queue info prio %d\n", > + rc, prio); > + rc = -EIO; > + goto fail; > + } > + q->eoi_phys = be64_to_cpu(qeoi_page_be); > + > +#ifdef USE_QUEUE_MMIO > + if (!q->eoi_mmio) > + q->eoi_mmio = ioremap(q->eoi_phys, PAGE_SIZE); > + if (!q->eoi_mmio) { > + pr_err("XIVE: Failed to map queue MMIO prio %d CPU %d\n", > + rc, prio, cpu); > + rc = -ENOMEM; > + goto fail; > + } > +#endif /* USE_QUEUE_MMIO */ > + > + ... 
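Coming back to the BUG_ON(!qpage) above - to be clear what I had in mind, a rough sketch (untested) would be:

	/* If there's an actual queue page, clean it */
	if (order) {
		if (WARN_ON(!qpage))
			return -EINVAL;
		qpage_phys = __pa(qpage);
	} else
		qpage_phys = 0;

ie. warn and fail the queue configuration rather than bringing the whole box down.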
> +static bool xive_parse_provisioning(struct device_node *np) > +{ > + int rc; > + > + if (of_property_read_u32(np, "ibm,xive-provision-page-size", > + &xive_provision_size) < 0) > + return true; > + rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4); > + if (rc < 0) { > + pr_err("XIVE: Error %d getting provision chips array\n", rc); > + return false; > + } > + xive_provision_chip_count = rc; > + if (rc == 0) > + return true; > + > + xive_provision_chips = kzalloc(4 * xive_provision_chip_count, > + GFP_KERNEL); > + BUG_ON(!xive_provision_chips); return false? > + > + rc = of_property_read_u32_array(np, "ibm,xive-provision-chips", > + xive_provision_chips, > + xive_provision_chip_count); ... > diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h > new file mode 100644 > index 0000000..e736fc5 > --- /dev/null > +++ b/arch/powerpc/sysdev/xive/xive-internal.h > @@ -0,0 +1,51 @@ Copyright missing. > +#ifndef __XIVE_INTERNAL_H > +#define __XIVE_INTERNAL_H ... > diff --git a/arch/powerpc/sysdev/xive/xive-regs.h b/arch/powerpc/sysdev/xive/xive-regs.h > new file mode 100644 > index 0000000..f1edb23 > --- /dev/null > +++ b/arch/powerpc/sysdev/xive/xive-regs.h > @@ -0,0 +1,88 @@ Copyright missing. > +#ifndef __XIVE_REGS_H__ > +#define __XIVE_REGS_H__ ... > diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c > index 16321ad..c71e919 100644 > --- a/arch/powerpc/xmon/xmon.c > +++ b/arch/powerpc/xmon/xmon.c ... > + > +static void dump_one_xive_irq(uint32_t num) u32? > +{ > + int64_t rc; > + __be64 vp; > + uint8_t prio; u8? zzzzz ... cheers