Hi Jianjun, On Fri, 25 Dec 2020 10:03:07 +0000, Jianjun Wang <jianjun.wang@xxxxxxxxxxxx> wrote: > > MediaTek's PCIe host controller has three generation HWs, the new > generation HW is an individual bridge, it supports Gen3 speed and > up to 256 MSI interrupt numbers for multi-function devices. > > Add support for new Gen3 controller which can be found on MT8192. > > Signed-off-by: Jianjun Wang <jianjun.wang@xxxxxxxxxxxx> > Acked-by: Ryder Lee <ryder.lee@xxxxxxxxxxxx> > --- > drivers/pci/controller/Kconfig | 13 + > drivers/pci/controller/Makefile | 1 + > drivers/pci/controller/pcie-mediatek-gen3.c | 1084 +++++++++++++++++++ > 3 files changed, 1098 insertions(+) > create mode 100644 drivers/pci/controller/pcie-mediatek-gen3.c This is a pretty large patch, and it'd be great if you would split it into at least 4 parts (core PCIe, PM, MSI, INTx). > > diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig > index 64e2f5e379aa..b242b17025b3 100644 > --- a/drivers/pci/controller/Kconfig > +++ b/drivers/pci/controller/Kconfig > @@ -242,6 +242,19 @@ config PCIE_MEDIATEK > Say Y here if you want to enable PCIe controller support on > MediaTek SoCs. > > +config PCIE_MEDIATEK_GEN3 > + tristate "MediaTek Gen3 PCIe controller" > + depends on ARCH_MEDIATEK || COMPILE_TEST > + depends on PCI_MSI_IRQ_DOMAIN > + help > + Adds support for PCIe Gen3 MAC controller for MediaTek SoCs. > + This PCIe controller is compatible with Gen3, Gen2 and Gen1 speed, > + and support up to 256 MSI interrupt numbers for > + multi-function devices. > + > + Say Y here if you want to enable Gen3 PCIe controller support on > + MediaTek SoCs. 
> + > config PCIE_TANGO_SMP8759 > bool "Tango SMP8759 PCIe controller (DANGEROUS)" > depends on ARCH_TANGO && PCI_MSI && OF > diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile > index 04c6edc285c5..df5d77d72a9d 100644 > --- a/drivers/pci/controller/Makefile > +++ b/drivers/pci/controller/Makefile > @@ -27,6 +27,7 @@ obj-$(CONFIG_PCIE_ROCKCHIP) += pcie-rockchip.o > obj-$(CONFIG_PCIE_ROCKCHIP_EP) += pcie-rockchip-ep.o > obj-$(CONFIG_PCIE_ROCKCHIP_HOST) += pcie-rockchip-host.o > obj-$(CONFIG_PCIE_MEDIATEK) += pcie-mediatek.o > +obj-$(CONFIG_PCIE_MEDIATEK_GEN3) += pcie-mediatek-gen3.o > obj-$(CONFIG_PCIE_TANGO_SMP8759) += pcie-tango.o > obj-$(CONFIG_VMD) += vmd.o > obj-$(CONFIG_PCIE_BRCMSTB) += pcie-brcmstb.o > diff --git a/drivers/pci/controller/pcie-mediatek-gen3.c b/drivers/pci/controller/pcie-mediatek-gen3.c > new file mode 100644 > index 000000000000..00cdb598a9f5 > --- /dev/null > +++ b/drivers/pci/controller/pcie-mediatek-gen3.c > @@ -0,0 +1,1084 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * MediaTek PCIe host controller driver. > + * > + * Copyright (c) 2020 MediaTek Inc. 
> + * Author: Jianjun Wang <jianjun.wang@xxxxxxxxxxxx> > + */ > + > +#include <linux/clk.h> > +#include <linux/delay.h> > +#include <linux/iopoll.h> > +#include <linux/irq.h> > +#include <linux/irqchip/chained_irq.h> > +#include <linux/irqdomain.h> > +#include <linux/kernel.h> > +#include <linux/module.h> > +#include <linux/msi.h> > +#include <linux/of_address.h> > +#include <linux/of_clk.h> > +#include <linux/of_pci.h> > +#include <linux/of_platform.h> > +#include <linux/pci.h> > +#include <linux/phy/phy.h> > +#include <linux/platform_device.h> > +#include <linux/pm_domain.h> > +#include <linux/pm_runtime.h> > +#include <linux/reset.h> > + > +#include "../pci.h" > + > +#define PCIE_SETTING_REG 0x80 > +#define PCIE_PCI_IDS_1 0x9c > +#define PCI_CLASS(class) (class << 8) > +#define PCIE_RC_MODE BIT(0) > + > +#define PCIE_CFGNUM_REG 0x140 > +#define PCIE_CFG_DEVFN(devfn) ((devfn) & GENMASK(7, 0)) > +#define PCIE_CFG_BUS(bus) (((bus) << 8) & GENMASK(15, 8)) > +#define PCIE_CFG_BYTE_EN(bytes) (((bytes) << 16) & GENMASK(19, 16)) > +#define PCIE_CFG_FORCE_BYTE_EN BIT(20) > +#define PCIE_CFG_OFFSET_ADDR 0x1000 > +#define PCIE_CFG_HEADER(bus, devfn) \ > + (PCIE_CFG_BUS(bus) | PCIE_CFG_DEVFN(devfn)) > + > +#define PCIE_RST_CTRL_REG 0x148 > +#define PCIE_MAC_RSTB BIT(0) > +#define PCIE_PHY_RSTB BIT(1) > +#define PCIE_BRG_RSTB BIT(2) > +#define PCIE_PE_RSTB BIT(3) > + > +#define PCIE_LTSSM_STATUS_REG 0x150 > +#define PCIE_LTSSM_STATE_MASK GENMASK(28, 24) > +#define PCIE_LTSSM_STATE(val) ((val & PCIE_LTSSM_STATE_MASK) >> 24) > +#define PCIE_LTSSM_STATE_L2_IDLE 0x14 > + > +#define PCIE_LINK_STATUS_REG 0x154 > +#define PCIE_PORT_LINKUP BIT(8) > + > +#define PCIE_MSI_SET_NUM 8 > +#define PCIE_MSI_IRQS_PER_SET 32 > +#define PCIE_MSI_IRQS_NUM \ > + (PCIE_MSI_IRQS_PER_SET * (PCIE_MSI_SET_NUM)) > + > +#define PCIE_INT_ENABLE_REG 0x180 > +#define PCIE_MSI_MASK GENMASK(PCIE_MSI_SET_NUM + 8 - 1, 8) > +#define PCIE_MSI_SHIFT 8 > +#define PCIE_INTX_SHIFT 24 > +#define PCIE_INTX_MASK 
GENMASK(27, 24) > + > +#define PCIE_INT_STATUS_REG 0x184 > +#define PCIE_MSI_SET_ENABLE_REG 0x190 > + > +#define PCIE_ICMD_PM_REG 0x198 > +#define PCIE_TURN_OFF_LINK BIT(4) > + > +#define PCIE_MSI_ADDR_BASE_REG 0xc00 > +#define PCIE_MSI_SET_OFFSET 0x10 > +#define PCIE_MSI_STATUS_OFFSET 0x04 > +#define PCIE_MSI_ENABLE_OFFSET 0x08 > + > +#define PCIE_TRANS_TABLE_BASE_REG 0x800 > +#define PCIE_ATR_SRC_ADDR_MSB_OFFSET 0x4 > +#define PCIE_ATR_TRSL_ADDR_LSB_OFFSET 0x8 > +#define PCIE_ATR_TRSL_ADDR_MSB_OFFSET 0xc > +#define PCIE_ATR_TRSL_PARAM_OFFSET 0x10 > +#define PCIE_ATR_TLB_SET_OFFSET 0x20 > + > +#define PCIE_MAX_TRANS_TABLES 8 > +#define PCIE_ATR_EN BIT(0) > +#define PCIE_ATR_SIZE(size) \ > + (((((size) - 1) << 1) & GENMASK(6, 1)) | PCIE_ATR_EN) > +#define PCIE_ATR_ID(id) ((id) & GENMASK(3, 0)) > +#define PCIE_ATR_TYPE_MEM PCIE_ATR_ID(0) > +#define PCIE_ATR_TYPE_IO PCIE_ATR_ID(1) > +#define PCIE_ATR_TLP_TYPE(type) (((type) << 16) & GENMASK(18, 16)) > +#define PCIE_ATR_TLP_TYPE_MEM PCIE_ATR_TLP_TYPE(0) > +#define PCIE_ATR_TLP_TYPE_IO PCIE_ATR_TLP_TYPE(2) > + > +/** > + * struct mtk_pcie_msi - MSI information for each set > + * @base: IO mapped register base > + * @irq: MSI set Interrupt number > + * @index: MSI set number > + * @msg_addr: MSI message address > + * @domain: IRQ domain > + */ > +struct mtk_pcie_msi { > + void __iomem *base; > + unsigned int irq; > + int index; > + phys_addr_t msg_addr; > + struct irq_domain *domain; > +}; This looks odd. You seem to say that this covers a set of MSIs, and yet the irq field here clearly isn't part of a set. Is that per MSI instead? Either way, something is not quite as it should be. 
> + > +/** > + * struct mtk_pcie_port - PCIe port information > + * @dev: PCIe device > + * @base: IO mapped register base > + * @reg_base: Physical register base > + * @mac_reset: mac reset control > + * @phy_reset: phy reset control > + * @phy: PHY controller block > + * @clks: PCIe clocks > + * @num_clks: PCIe clocks count for this port > + * @irq: PCIe controller interrupt number > + * @intx_domain: legacy INTx IRQ domain > + * @msi_domain: MSI IRQ domain > + * @msi_top_domain: MSI IRQ top domain > + * @msi_info: MSI sets information > + * @lock: lock protecting IRQ bit map > + * @msi_irq_in_use: bit map for assigned MSI IRQ > + */ > +struct mtk_pcie_port { > + struct device *dev; > + void __iomem *base; > + phys_addr_t reg_base; > + struct reset_control *mac_reset; > + struct reset_control *phy_reset; > + struct phy *phy; > + struct clk_bulk_data *clks; > + int num_clks; > + > + int irq; > + struct irq_domain *intx_domain; > + struct irq_domain *msi_domain; > + struct irq_domain *msi_top_domain; > + struct mtk_pcie_msi **msi_info; > + struct mutex lock; > + DECLARE_BITMAP(msi_irq_in_use, PCIE_MSI_IRQS_NUM); > +}; > + > +/** > + * mtk_pcie_config_tlp_header > + * @bus: PCI bus to query > + * @devfn: device/function number > + * @where: offset in config space > + * @size: data size in TLP header > + * > + * Set byte enable field and device information in configuration TLP header. > + */ > +static void mtk_pcie_config_tlp_header(struct pci_bus *bus, unsigned int devfn, > + int where, int size) > +{ > + struct mtk_pcie_port *port = bus->sysdata; > + int bytes; > + u32 val; > + > + bytes = (GENMASK(size - 1, 0) & 0xf) << (where & 0x3); > + > + val = PCIE_CFG_FORCE_BYTE_EN | PCIE_CFG_BYTE_EN(bytes) | > + PCIE_CFG_HEADER(bus->number, devfn); > + > + writel(val, port->base + PCIE_CFGNUM_REG); Please convert all instances of writel/readl to their relaxed version, unless you can show that you need the extra memory ordering enforcement. 
This is especially valuable on an arm64 system (which MT8192 seems to be). > +} > + > +static void __iomem *mtk_pcie_map_bus(struct pci_bus *bus, unsigned int devfn, > + int where) > +{ > + struct mtk_pcie_port *port = bus->sysdata; > + > + return port->base + PCIE_CFG_OFFSET_ADDR + where; > +} > + > +static int mtk_pcie_config_read(struct pci_bus *bus, unsigned int devfn, > + int where, int size, u32 *val) > +{ > + mtk_pcie_config_tlp_header(bus, devfn, where, size); > + > + return pci_generic_config_read32(bus, devfn, where, size, val); > +} > + > +static int mtk_pcie_config_write(struct pci_bus *bus, unsigned int devfn, > + int where, int size, u32 val) > +{ > + mtk_pcie_config_tlp_header(bus, devfn, where, size); > + > + if (size <= 2) > + val <<= (where & 0x3) * 8; > + > + return pci_generic_config_write32(bus, devfn, where, 4, val); > +} > + > +static struct pci_ops mtk_pcie_ops = { > + .map_bus = mtk_pcie_map_bus, > + .read = mtk_pcie_config_read, > + .write = mtk_pcie_config_write, > +}; > + > +static int mtk_pcie_set_trans_table(struct mtk_pcie_port *port, > + resource_size_t cpu_addr, > + resource_size_t pci_addr, > + resource_size_t size, > + unsigned long type, int num) > +{ > + void __iomem *table; > + u32 val; > + > + if (num >= PCIE_MAX_TRANS_TABLES) { > + dev_err(port->dev, "not enough translate table[%d] for addr: %#llx, limited to [%d]\n", > + num, (unsigned long long) cpu_addr, > + PCIE_MAX_TRANS_TABLES); > + return -ENODEV; > + } > + > + table = port->base + PCIE_TRANS_TABLE_BASE_REG + > + num * PCIE_ATR_TLB_SET_OFFSET; > + > + writel(lower_32_bits(cpu_addr) | PCIE_ATR_SIZE(fls(size) - 1), table); > + writel(upper_32_bits(cpu_addr), table + PCIE_ATR_SRC_ADDR_MSB_OFFSET); > + writel(lower_32_bits(pci_addr), table + PCIE_ATR_TRSL_ADDR_LSB_OFFSET); > + writel(upper_32_bits(pci_addr), table + PCIE_ATR_TRSL_ADDR_MSB_OFFSET); > + > + if (type == IORESOURCE_IO) > + val = PCIE_ATR_TYPE_IO | PCIE_ATR_TLP_TYPE_IO; > + else > + val = PCIE_ATR_TYPE_MEM | 
PCIE_ATR_TLP_TYPE_MEM; > + > + writel(val, table + PCIE_ATR_TRSL_PARAM_OFFSET); > + > + return 0; > +} > + > +static int mtk_pcie_startup_port(struct mtk_pcie_port *port) > +{ > + struct resource_entry *entry; > + struct pci_host_bridge *host = pci_host_bridge_from_priv(port); > + unsigned int table_index = 0; > + int err; > + u32 val; > + > + /* Set as RC mode */ > + val = readl(port->base + PCIE_SETTING_REG); > + val |= PCIE_RC_MODE; > + writel(val, port->base + PCIE_SETTING_REG); > + > + /* Set class code */ > + val = readl(port->base + PCIE_PCI_IDS_1); > + val &= ~GENMASK(31, 8); > + val |= PCI_CLASS(PCI_CLASS_BRIDGE_PCI << 8); > + writel(val, port->base + PCIE_PCI_IDS_1); > + > + /* Assert all reset signals */ > + val = readl(port->base + PCIE_RST_CTRL_REG); > + val |= PCIE_MAC_RSTB | PCIE_PHY_RSTB | PCIE_BRG_RSTB | PCIE_PE_RSTB; > + writel(val, port->base + PCIE_RST_CTRL_REG); > + > + /* De-assert reset signals */ > + val &= ~(PCIE_MAC_RSTB | PCIE_PHY_RSTB | PCIE_BRG_RSTB); > + writel(val, port->base + PCIE_RST_CTRL_REG); > + > + /* Delay 100ms to wait the reference clocks become stable */ > + msleep(100); > + > + /* De-assert PERST# signal */ > + val &= ~PCIE_PE_RSTB; > + writel(val, port->base + PCIE_RST_CTRL_REG); > + > + /* Check if the link is up or not */ > + err = readl_poll_timeout(port->base + PCIE_LINK_STATUS_REG, val, > + !!(val & PCIE_PORT_LINKUP), 20, > + 50 * USEC_PER_MSEC); > + if (err) { > + val = readl(port->base + PCIE_LTSSM_STATUS_REG); > + dev_err(port->dev, "PCIe link down, ltssm reg val: %#x\n", val); > + return err; > + } > + > + /* Set PCIe translation windows */ > + resource_list_for_each_entry(entry, &host->windows) { > + struct resource *res = entry->res; > + unsigned long type = resource_type(res); > + resource_size_t cpu_addr; > + resource_size_t pci_addr; > + resource_size_t size; > + const char *range_type; > + > + if (type == IORESOURCE_IO) { > + cpu_addr = pci_pio_to_address(res->start); > + range_type = "IO"; > + } else if 
(type == IORESOURCE_MEM) { > + cpu_addr = res->start; > + range_type = "MEM"; > + } else { > + continue; > + } > + > + pci_addr = res->start - entry->offset; > + size = resource_size(res); > + err = mtk_pcie_set_trans_table(port, cpu_addr, pci_addr, size, > + type, table_index); > + if (err) > + return err; > + > + dev_dbg(port->dev, "set %s trans window[%d]: cpu_addr = %#llx, pci_addr = %#llx, size = %#llx\n", > + range_type, table_index, (unsigned long long) cpu_addr, > + (unsigned long long) pci_addr, > + (unsigned long long) size); > + > + table_index++; > + } > + > + return 0; > +} > + > +static inline struct mtk_pcie_msi *mtk_get_msi_info(struct mtk_pcie_port *port, > + unsigned long hwirq) > +{ > + return port->msi_info[hwirq / PCIE_MSI_IRQS_PER_SET]; > +} > + > +static int mtk_pcie_set_affinity(struct irq_data *data, > + const struct cpumask *mask, bool force) > +{ > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > + struct irq_data *port_data = irq_get_irq_data(port->irq); NAK. Either you are resolving an irqdata that is the same as the one passed as a parameter (and it is thus pointless), or you are poking into some other interrupt, and it is a terrible layering violation. Either way, this must go. > + struct irq_chip *port_chip = irq_data_get_irq_chip(port_data); > + int ret; > + > + if (!port_chip || !port_chip->irq_set_affinity) > + return -EINVAL; In what circumstances can this fail? > + > + ret = port_chip->irq_set_affinity(port_data, mask, force); You really have to explain this indirection, and possibly remove it. 
> + > + irq_data_update_effective_affinity(data, mask); > + > + return ret; > +} > + > +static void mtk_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) > +{ > + struct mtk_pcie_msi *msi_info; > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > + unsigned long hwirq; > + > + msi_info = mtk_get_msi_info(port, data->hwirq); So msi_info *is* per interrupt, and yet contains information that is seemingly global. Please address this. > + hwirq = data->hwirq % PCIE_MSI_IRQS_PER_SET; > + > + msg->address_hi = 0; What guarantees this? > + msg->address_lo = lower_32_bits(msi_info->msg_addr); > + msg->data = hwirq; > + dev_dbg(port->dev, "msi#%#lx address_hi %#x address_lo %#x data %d\n", > + hwirq, msg->address_hi, msg->address_lo, msg->data); If address_hi is always 0, what is the point of printing it? > +} > + > +static void mtk_msi_irq_ack(struct irq_data *data) > +{ > + struct mtk_pcie_msi *msi_info; > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > + unsigned long hwirq; > + > + msi_info = mtk_get_msi_info(port, data->hwirq); > + hwirq = data->hwirq % PCIE_MSI_IRQS_PER_SET; > + > + writel(BIT(hwirq), msi_info->base + PCIE_MSI_STATUS_OFFSET); > +} > + > +static void mtk_msi_irq_mask(struct irq_data *data) > +{ > + struct mtk_pcie_msi *msi_info; > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > + unsigned long hwirq; > + u32 val; > + > + msi_info = mtk_get_msi_info(port, data->hwirq); > + hwirq = data->hwirq % PCIE_MSI_IRQS_PER_SET; > + > + val = readl(msi_info->base + PCIE_MSI_ENABLE_OFFSET); > + val &= ~BIT(hwirq); > + writel(val, msi_info->base + PCIE_MSI_ENABLE_OFFSET); > + > + pci_msi_mask_irq(data); This is at the wrong location. The common idiom is to split PCI and generic MSI. Please follow the same idiom in order to keep the code maintainable. 
> +} > + > +static void mtk_msi_irq_unmask(struct irq_data *data) > +{ > + struct mtk_pcie_msi *msi_info; > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > + unsigned long hwirq; > + u32 val; > + > + msi_info = mtk_get_msi_info(port, data->hwirq); > + hwirq = data->hwirq % PCIE_MSI_IRQS_PER_SET; > + > + val = readl(msi_info->base + PCIE_MSI_ENABLE_OFFSET); > + val |= BIT(hwirq); > + writel(val, msi_info->base + PCIE_MSI_ENABLE_OFFSET); > + > + pci_msi_unmask_irq(data); Same thing here. > +} > + > +static struct irq_chip mtk_msi_irq_chip = { > + .irq_ack = mtk_msi_irq_ack, > + .irq_compose_msi_msg = mtk_compose_msi_msg, > + .irq_mask = mtk_msi_irq_mask, > + .irq_unmask = mtk_msi_irq_unmask, > + .irq_set_affinity = mtk_pcie_set_affinity, > + .name = "PCIe", > +}; > + > +static irq_hw_number_t mtk_pcie_get_hwirq(struct msi_domain_info *info, > + msi_alloc_info_t *arg) > +{ > + struct msi_desc *desc = arg->desc; > + irq_hw_number_t hwirq = arg->hwirq; > + > + arg->hwirq += desc->nvec_used; > + > + return hwirq; > +} Please follow the common MSI flow. There is absolutely no need to reinvent the wheel. 
> + > +static void mtk_pcie_msi_free(struct irq_domain *domain, > + struct msi_domain_info *info, unsigned int virq) > +{ > + struct irq_data *data = irq_domain_get_irq_data(domain, virq); > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > + > + mutex_lock(&port->lock); > + > + bitmap_clear(port->msi_irq_in_use, data->hwirq, 1); > + > + mutex_unlock(&port->lock); > +} > + > +static int mtk_pcie_msi_prepare(struct irq_domain *domain, struct device *dev, > + int nvec, msi_alloc_info_t *arg) > +{ > + struct msi_domain_info *info = domain->host_data; > + struct mtk_pcie_port *port = info->chip_data; > + struct msi_desc *desc = first_pci_msi_entry(to_pci_dev(dev)); > + int hwirq, ret = 0; > + > + mutex_lock(&port->lock); > + > + if (desc->msi_attrib.is_msix) { > + int i; > + unsigned long bit; > + > + for (i = 0; i < nvec; i++) { > + bit = find_first_zero_bit(port->msi_irq_in_use, > + PCIE_MSI_IRQS_NUM); > + if (bit >= PCIE_MSI_IRQS_NUM) { > + ret = -ENOSPC; > + goto msi_prepare_out; > + } else { > + set_bit(bit, port->msi_irq_in_use); > + } > + } > + > + hwirq = bit - nvec; > + } else { > + hwirq = bitmap_find_free_region(port->msi_irq_in_use, > + PCIE_MSI_IRQS_NUM, > + order_base_2(nvec)); > + if (hwirq < 0) > + ret = -ENOSPC; > + } > + > +msi_prepare_out: > + mutex_unlock(&port->lock); > + > + if (ret) > + return ret; > + > + memset(arg, 0, sizeof(*arg)); > + arg->hwirq = hwirq; > + > + return 0; > +} NAK. This should take place in the irqdomain alloc path. .prepare is reserved for HW that require per-device setup, and this doesn't interact with the HW at all. Your distinction between MSI-X and non-MSI-X also becomes moot once you place the allocator at the place where allocations take place. > + > +static void mtk_pcie_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) > +{ > + arg->desc = desc; > +} NAK. Please use the common PCI flow. 
> + > +static struct msi_domain_ops mtk_msi_domain_ops = { > + .get_hwirq = mtk_pcie_get_hwirq, > + .msi_free = mtk_pcie_msi_free, > + .msi_prepare = mtk_pcie_msi_prepare, > + .set_desc = mtk_pcie_msi_set_desc, > +}; > + > +static struct msi_domain_info mtk_msi_domain_info = { > + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_PCI_MSIX | > + MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI), > + .chip = &mtk_msi_irq_chip, > + .ops = &mtk_msi_domain_ops, > + .handler = handle_edge_irq, > + .handler_name = "MSI", > +}; > + > +static void mtk_msi_top_irq_eoi(struct irq_data *data) > +{ > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > + unsigned long msi_irq = data->hwirq + PCIE_MSI_SHIFT; > + > + writel(BIT(msi_irq), port->base + PCIE_INT_STATUS_REG); > +} > + > +static struct irq_chip mtk_msi_top_irq_chip = { > + .irq_eoi = mtk_msi_top_irq_eoi, > + .name = "PCIe", > +}; > + > +static void mtk_pcie_msi_handler(struct irq_desc *desc) > +{ > + struct mtk_pcie_msi *msi_info = irq_desc_get_handler_data(desc); > + struct irq_chip *irqchip = irq_desc_get_chip(desc); > + unsigned long msi_enable, msi_status; > + unsigned int virq; > + irq_hw_number_t bit, hwirq; > + > + chained_irq_enter(irqchip, desc); > + > + msi_enable = readl(msi_info->base + PCIE_MSI_ENABLE_OFFSET); > + while ((msi_status = readl(msi_info->base + PCIE_MSI_STATUS_OFFSET) & > + msi_enable)) { > + for_each_set_bit(bit, &msi_status, PCIE_MSI_IRQS_PER_SET) { > + hwirq = bit + msi_info->index * PCIE_MSI_IRQS_PER_SET; > + virq = irq_find_mapping(msi_info->domain, hwirq); > + generic_handle_irq(virq); > + } > + } > + > + chained_irq_exit(irqchip, desc); If your MSIs are behind a muxing interrupt you *cannot* implement an affinity setting callback, as this changes the affinity of *all* MSIs in one go, which is a userspace ABI violation. I can't believe that this is almost 2021, and people are still shipping HW that is that broken :-(. 
> +} > + > +static int mtk_msi_top_domain_map(struct irq_domain *domain, > + unsigned int virq, irq_hw_number_t hwirq) > +{ > + struct mtk_pcie_port *port = domain->host_data; > + struct mtk_pcie_msi *msi_info = port->msi_info[hwirq]; > + > + irq_domain_set_info(domain, virq, hwirq, > + &mtk_msi_top_irq_chip, domain->host_data, > + mtk_pcie_msi_handler, msi_info, NULL); > + > + return 0; > +} > + > +static const struct irq_domain_ops mtk_msi_top_domain_ops = { > + .map = mtk_msi_top_domain_map, NAK. MSIs do not need a mapping callback. This belongs to the irqdomain alloc callback, which you don't implement. > +}; > + > +static void mtk_intx_mask(struct irq_data *data) > +{ > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > + u32 val; > + > + val = readl(port->base + PCIE_INT_ENABLE_REG); > + val &= ~BIT(data->hwirq + PCIE_INTX_SHIFT); > + writel(val, port->base + PCIE_INT_ENABLE_REG); > +} > + > +static void mtk_intx_unmask(struct irq_data *data) > +{ > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > + u32 val; > + > + val = readl(port->base + PCIE_INT_ENABLE_REG); > + val |= BIT(data->hwirq + PCIE_INTX_SHIFT); > + writel(val, port->base + PCIE_INT_ENABLE_REG); > +} > + > +static void mtk_intx_eoi(struct irq_data *data) > +{ > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > + unsigned long hwirq; > + > + /** > + * As an emulated level IRQ, its interrupt status will remain > + * until the corresponding de-assert message is received; hence that > + * the status can only be cleared when the interrupt has been serviced. > + */ > + hwirq = data->hwirq + PCIE_INTX_SHIFT; > + writel(BIT(hwirq), port->base + PCIE_INT_STATUS_REG); All of this is the description of a level interrupt, so this is pretty much devoid of any information as to *why* you need to write to clear this bit. What happens if the interrupt is still asserted because nothing has handled it? Without any further information, this looks terribly wrong. 
> +} > + > +static struct irq_chip mtk_intx_irq_chip = { > + .irq_mask = mtk_intx_mask, > + .irq_unmask = mtk_intx_unmask, > + .irq_eoi = mtk_intx_eoi, > + .irq_set_affinity = mtk_pcie_set_affinity, > + .name = "PCIe", > +}; > + > +static int mtk_pcie_intx_map(struct irq_domain *domain, unsigned int irq, > + irq_hw_number_t hwirq) > +{ > + irq_set_chip_and_handler_name(irq, &mtk_intx_irq_chip, > + handle_fasteoi_irq, "INTx"); > + irq_set_chip_data(irq, domain->host_data); > + > + return 0; > +} > + > +static const struct irq_domain_ops intx_domain_ops = { > + .map = mtk_pcie_intx_map, > +}; > + > +static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port, > + struct device_node *node) > +{ > + struct device *dev = port->dev; > + struct device_node *intc_node; > + struct fwnode_handle *fwnode = of_node_to_fwnode(node); > + struct mtk_pcie_msi *msi_info; > + struct msi_domain_info *info; > + int i, ret; > + > + /* Setup INTx */ > + intc_node = of_get_child_by_name(node, "interrupt-controller"); > + if (!intc_node) { > + dev_err(dev, "missing PCIe Intc node\n"); > + return -ENODEV; > + } > + > + port->intx_domain = irq_domain_add_linear(intc_node, PCI_NUM_INTX, > + &intx_domain_ops, port); > + if (!port->intx_domain) { > + dev_err(dev, "failed to get INTx IRQ domain\n"); > + return -ENODEV; > + } > + > + /* Setup MSI */ > + mutex_init(&port->lock); > + > + info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL); > + if (!info) > + return -ENOMEM; > + > + memcpy(info, &mtk_msi_domain_info, sizeof(*info)); > + info->chip_data = port; > + > + port->msi_domain = pci_msi_create_irq_domain(fwnode, info, NULL); > + if (!port->msi_domain) { > + dev_info(dev, "failed to create MSI domain\n"); > + ret = -ENODEV; > + goto err_msi_domain; > + } > + > + /* Enable MSI and setup PCIe domains */ > + port->msi_top_domain = irq_domain_add_hierarchy(NULL, 0, 0, node, > + &mtk_msi_top_domain_ops, > + port); > + if (!port->msi_top_domain) { > + dev_info(dev, "failed to create MSI top 
domain\n"); > + ret = -ENODEV; > + goto err_msi_top_domain; > + } > + > + port->msi_info = devm_kzalloc(dev, PCIE_MSI_SET_NUM, GFP_KERNEL); > + if (!port->msi_info) { > + ret = -ENOMEM; > + goto err_msi_info; > + } > + > + for (i = 0; i < PCIE_MSI_SET_NUM; i++) { > + int offset = i * PCIE_MSI_SET_OFFSET; > + u32 val; > + > + msi_info = devm_kzalloc(dev, sizeof(*msi_info), GFP_KERNEL); > + if (!msi_info) { > + ret = -ENOMEM; > + goto err_msi_set; > + } > + > + msi_info->base = port->base + PCIE_MSI_ADDR_BASE_REG + offset; > + msi_info->msg_addr = port->reg_base + PCIE_MSI_ADDR_BASE_REG + > + offset; All of this can be derived from i (and this hwirq) at runtime. Why do we need to cache this information? > + > + writel(lower_32_bits(msi_info->msg_addr), msi_info->base); > + > + msi_info->index = i; > + msi_info->domain = port->msi_domain; If this is common to all sets, why isn't it global? > + > + port->msi_info[i] = msi_info; > + > + /* Alloc IRQ for each MSI set */ > + msi_info->irq = irq_create_mapping(port->msi_top_domain, i); > + if (!msi_info->irq) { > + dev_info(dev, "allocate MSI top IRQ failed\n"); > + ret = -ENOSPC; > + goto err_msi_set; > + } > + > + val = readl(port->base + PCIE_INT_ENABLE_REG); > + val |= BIT(i + PCIE_MSI_SHIFT); > + writel(val, port->base + PCIE_INT_ENABLE_REG); > + > + val = readl(port->base + PCIE_MSI_SET_ENABLE_REG); > + val |= BIT(i); > + writel(val, port->base + PCIE_MSI_SET_ENABLE_REG); > + } > + > + return 0; > + > +err_msi_set: > + while (i-- > 0) { > + msi_info = port->msi_info[i]; > + irq_dispose_mapping(msi_info->irq); > + } > +err_msi_info: > + irq_domain_remove(port->msi_top_domain); > +err_msi_top_domain: > + irq_domain_remove(port->msi_domain); > +err_msi_domain: > + irq_domain_remove(port->intx_domain); > + > + return ret; > +} At this stage, I stopped, because there is really a lot to change in this driver before it is in an acceptable shape. Please use the common idioms established for existing MSI implementations. 
I'd appreciate it if you kept me posted on future versions of this driver. Thanks, M. -- Without deviation from the norm, progress is not possible.