On Fri, 2020-12-25 at 19:22 +0000, Marc Zyngier wrote: > Hi Jianjun, > > On Fri, 25 Dec 2020 10:03:07 +0000, > Jianjun Wang <jianjun.wang@xxxxxxxxxxxx> wrote: > > > > MediaTek's PCIe host controller has three generation HWs, the new > > generation HW is an individual bridge, it supports Gen3 speed and > > up to 256 MSI interrupt numbers for multi-function devices. > > > > Add support for new Gen3 controller which can be found on MT8192. > > > > Signed-off-by: Jianjun Wang <jianjun.wang@xxxxxxxxxxxx> > > Acked-by: Ryder Lee <ryder.lee@xxxxxxxxxxxx> > > --- > > drivers/pci/controller/Kconfig | 13 + > > drivers/pci/controller/Makefile | 1 + > > drivers/pci/controller/pcie-mediatek-gen3.c | 1084 +++++++++++++++++++ > > 3 files changed, 1098 insertions(+) > > create mode 100644 drivers/pci/controller/pcie-mediatek-gen3.c > > This is a pretty large patch, and it'd be great if you would split it > into at least 4 parts (core PCIe, PM, MSI, INTx). Hi Marc, Thanks for your review, I will try to split it to those parts in the next version. > > > > > diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig > > index 64e2f5e379aa..b242b17025b3 100644 > > --- a/drivers/pci/controller/Kconfig > > +++ b/drivers/pci/controller/Kconfig > > @@ -242,6 +242,19 @@ config PCIE_MEDIATEK > > Say Y here if you want to enable PCIe controller support on > > MediaTek SoCs. > > > > +config PCIE_MEDIATEK_GEN3 > > + tristate "MediaTek Gen3 PCIe controller" > > + depends on ARCH_MEDIATEK || COMPILE_TEST > > + depends on PCI_MSI_IRQ_DOMAIN > > + help > > + Adds support for PCIe Gen3 MAC controller for MediaTek SoCs. > > + This PCIe controller is compatible with Gen3, Gen2 and Gen1 speed, > > + and support up to 256 MSI interrupt numbers for > > + multi-function devices. > > + > > + Say Y here if you want to enable Gen3 PCIe controller support on > > + MediaTek SoCs. 
> > + > > config PCIE_TANGO_SMP8759 > > bool "Tango SMP8759 PCIe controller (DANGEROUS)" > > depends on ARCH_TANGO && PCI_MSI && OF > > diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile > > index 04c6edc285c5..df5d77d72a9d 100644 > > --- a/drivers/pci/controller/Makefile > > +++ b/drivers/pci/controller/Makefile > > @@ -27,6 +27,7 @@ obj-$(CONFIG_PCIE_ROCKCHIP) += pcie-rockchip.o > > obj-$(CONFIG_PCIE_ROCKCHIP_EP) += pcie-rockchip-ep.o > > obj-$(CONFIG_PCIE_ROCKCHIP_HOST) += pcie-rockchip-host.o > > obj-$(CONFIG_PCIE_MEDIATEK) += pcie-mediatek.o > > +obj-$(CONFIG_PCIE_MEDIATEK_GEN3) += pcie-mediatek-gen3.o > > obj-$(CONFIG_PCIE_TANGO_SMP8759) += pcie-tango.o > > obj-$(CONFIG_VMD) += vmd.o > > obj-$(CONFIG_PCIE_BRCMSTB) += pcie-brcmstb.o > > diff --git a/drivers/pci/controller/pcie-mediatek-gen3.c b/drivers/pci/controller/pcie-mediatek-gen3.c > > new file mode 100644 > > index 000000000000..00cdb598a9f5 > > --- /dev/null > > +++ b/drivers/pci/controller/pcie-mediatek-gen3.c > > @@ -0,0 +1,1084 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > +/* > > + * MediaTek PCIe host controller driver. > > + * > > + * Copyright (c) 2020 MediaTek Inc. 
> > + * Author: Jianjun Wang <jianjun.wang@xxxxxxxxxxxx> > > + */ > > + > > +#include <linux/clk.h> > > +#include <linux/delay.h> > > +#include <linux/iopoll.h> > > +#include <linux/irq.h> > > +#include <linux/irqchip/chained_irq.h> > > +#include <linux/irqdomain.h> > > +#include <linux/kernel.h> > > +#include <linux/module.h> > > +#include <linux/msi.h> > > +#include <linux/of_address.h> > > +#include <linux/of_clk.h> > > +#include <linux/of_pci.h> > > +#include <linux/of_platform.h> > > +#include <linux/pci.h> > > +#include <linux/phy/phy.h> > > +#include <linux/platform_device.h> > > +#include <linux/pm_domain.h> > > +#include <linux/pm_runtime.h> > > +#include <linux/reset.h> > > + > > +#include "../pci.h" > > + > > +#define PCIE_SETTING_REG 0x80 > > +#define PCIE_PCI_IDS_1 0x9c > > +#define PCI_CLASS(class) (class << 8) > > +#define PCIE_RC_MODE BIT(0) > > + > > +#define PCIE_CFGNUM_REG 0x140 > > +#define PCIE_CFG_DEVFN(devfn) ((devfn) & GENMASK(7, 0)) > > +#define PCIE_CFG_BUS(bus) (((bus) << 8) & GENMASK(15, 8)) > > +#define PCIE_CFG_BYTE_EN(bytes) (((bytes) << 16) & GENMASK(19, 16)) > > +#define PCIE_CFG_FORCE_BYTE_EN BIT(20) > > +#define PCIE_CFG_OFFSET_ADDR 0x1000 > > +#define PCIE_CFG_HEADER(bus, devfn) \ > > + (PCIE_CFG_BUS(bus) | PCIE_CFG_DEVFN(devfn)) > > + > > +#define PCIE_RST_CTRL_REG 0x148 > > +#define PCIE_MAC_RSTB BIT(0) > > +#define PCIE_PHY_RSTB BIT(1) > > +#define PCIE_BRG_RSTB BIT(2) > > +#define PCIE_PE_RSTB BIT(3) > > + > > +#define PCIE_LTSSM_STATUS_REG 0x150 > > +#define PCIE_LTSSM_STATE_MASK GENMASK(28, 24) > > +#define PCIE_LTSSM_STATE(val) ((val & PCIE_LTSSM_STATE_MASK) >> 24) > > +#define PCIE_LTSSM_STATE_L2_IDLE 0x14 > > + > > +#define PCIE_LINK_STATUS_REG 0x154 > > +#define PCIE_PORT_LINKUP BIT(8) > > + > > +#define PCIE_MSI_SET_NUM 8 > > +#define PCIE_MSI_IRQS_PER_SET 32 > > +#define PCIE_MSI_IRQS_NUM \ > > + (PCIE_MSI_IRQS_PER_SET * (PCIE_MSI_SET_NUM)) > > + > > +#define PCIE_INT_ENABLE_REG 0x180 > > +#define PCIE_MSI_MASK 
GENMASK(PCIE_MSI_SET_NUM + 8 - 1, 8) > > +#define PCIE_MSI_SHIFT 8 > > +#define PCIE_INTX_SHIFT 24 > > +#define PCIE_INTX_MASK GENMASK(27, 24) > > + > > +#define PCIE_INT_STATUS_REG 0x184 > > +#define PCIE_MSI_SET_ENABLE_REG 0x190 > > + > > +#define PCIE_ICMD_PM_REG 0x198 > > +#define PCIE_TURN_OFF_LINK BIT(4) > > + > > +#define PCIE_MSI_ADDR_BASE_REG 0xc00 > > +#define PCIE_MSI_SET_OFFSET 0x10 > > +#define PCIE_MSI_STATUS_OFFSET 0x04 > > +#define PCIE_MSI_ENABLE_OFFSET 0x08 > > + > > +#define PCIE_TRANS_TABLE_BASE_REG 0x800 > > +#define PCIE_ATR_SRC_ADDR_MSB_OFFSET 0x4 > > +#define PCIE_ATR_TRSL_ADDR_LSB_OFFSET 0x8 > > +#define PCIE_ATR_TRSL_ADDR_MSB_OFFSET 0xc > > +#define PCIE_ATR_TRSL_PARAM_OFFSET 0x10 > > +#define PCIE_ATR_TLB_SET_OFFSET 0x20 > > + > > +#define PCIE_MAX_TRANS_TABLES 8 > > +#define PCIE_ATR_EN BIT(0) > > +#define PCIE_ATR_SIZE(size) \ > > + (((((size) - 1) << 1) & GENMASK(6, 1)) | PCIE_ATR_EN) > > +#define PCIE_ATR_ID(id) ((id) & GENMASK(3, 0)) > > +#define PCIE_ATR_TYPE_MEM PCIE_ATR_ID(0) > > +#define PCIE_ATR_TYPE_IO PCIE_ATR_ID(1) > > +#define PCIE_ATR_TLP_TYPE(type) (((type) << 16) & GENMASK(18, 16)) > > +#define PCIE_ATR_TLP_TYPE_MEM PCIE_ATR_TLP_TYPE(0) > > +#define PCIE_ATR_TLP_TYPE_IO PCIE_ATR_TLP_TYPE(2) > > + > > +/** > > + * struct mtk_pcie_msi - MSI information for each set > > + * @base: IO mapped register base > > + * @irq: MSI set Interrupt number > > + * @index: MSI set number > > + * @msg_addr: MSI message address > > + * @domain: IRQ domain > > + */ > > +struct mtk_pcie_msi { > > + void __iomem *base; > > + unsigned int irq; > > + int index; > > + phys_addr_t msg_addr; > > + struct irq_domain *domain; > > +}; > > This looks odd. You seem to say that this covers a set if MSIs, and > yet the irq field here clearly isn't part of a set. Is that per MSI > instead? Either way, something is not quite as it should be. > Appreciate all these comments, please allow me to explain the MSI interrupt design in this HW. 
The HW design of the MSI interrupts is as follows:

                  +-----+
                  | GIC |
                  +-----+
                     ^
                     |
                     |[port->irq]
                     |
             +-+-+-+-+-+-+-+-+
             |0|1|2|3|4|5|6|7|[PCIe intc]
             +-+-+-+-+-+-+-+-+
              ^ ^           ^
              | |    ...    |[msi_info->irq]
      +-------+ +------+    +-----------+
      |                |                |
+-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
|0|1|...|30|31|  |0|1|...|30|31|  |0|1|...|30|31|[MSI sets]
+-+-+---+--+--+  +-+-+---+--+--+  +-+-+---+--+--+
 ^ ^      ^  ^    ^ ^      ^  ^    ^ ^      ^  ^
 | |      |  |    | |      |  |    | |      |  |[MSI irq]
 | |      |  |    | |      |  |    | |      |  |

  (MSI SET0)       (MSI SET1)  ...   (MSI SET7)

On the software side, port->msi_top_domain is created to manage the 8 MSI set IRQs from the PCIe intc layer; each of its hardware IRQs is mapped to msi_info->irq by irq_create_mapping(). port->msi_domain contains 256 MSI IRQs in total: it consists of 8 MSI sets, and each MSI set contains 32 MSI IRQs.

The mtk_pcie_msi structure is used to describe an MSI set. I think using this structure is more convenient and matches the HW design: we can get the information for an MSI set directly, instead of calculating it from port->base.

When an MSI interrupt is received, the interrupt handling flow is as follows:

mtk_pcie_irq_handler (port->irq)
          |
          |(find mapping in msi_top_domain)
          |
          v
mtk_pcie_msi_handler (msi_info->irq)
          |
          |(find mapping in msi_domain)
          |
          v
handle_edge_irq (MSI irq)
          |
          |
          v
(dispatch to device handler)

Yes, I have to admit that this is not a very good use of irqdomains, since the local irq domain is partially coupled with the standard PCI MSI irqdomain. Should I create another irqdomain to manage the MSI sets layer, and set it as the parent domain of the PCI MSI domain?

I really need your suggestions, thanks a lot.
> > + > > +/** > > + * struct mtk_pcie_port - PCIe port information > > + * @dev: PCIe device > > + * @base: IO mapped register base > > + * @reg_base: Physical register base > > + * @mac_reset: mac reset control > > + * @phy_reset: phy reset control > > + * @phy: PHY controller block > > + * @clks: PCIe clocks > > + * @num_clks: PCIe clocks count for this port > > + * @irq: PCIe controller interrupt number > > + * @intx_domain: legacy INTx IRQ domain > > + * @msi_domain: MSI IRQ domain > > + * @msi_top_domain: MSI IRQ top domain > > + * @msi_info: MSI sets information > > + * @lock: lock protecting IRQ bit map > > + * @msi_irq_in_use: bit map for assigned MSI IRQ > > + */ > > +struct mtk_pcie_port { > > + struct device *dev; > > + void __iomem *base; > > + phys_addr_t reg_base; > > + struct reset_control *mac_reset; > > + struct reset_control *phy_reset; > > + struct phy *phy; > > + struct clk_bulk_data *clks; > > + int num_clks; > > + > > + int irq; > > + struct irq_domain *intx_domain; > > + struct irq_domain *msi_domain; > > + struct irq_domain *msi_top_domain; > > + struct mtk_pcie_msi **msi_info; > > + struct mutex lock; > > + DECLARE_BITMAP(msi_irq_in_use, PCIE_MSI_IRQS_NUM); > > +}; > > + > > +/** > > + * mtk_pcie_config_tlp_header > > + * @bus: PCI bus to query > > + * @devfn: device/function number > > + * @where: offset in config space > > + * @size: data size in TLP header > > + * > > + * Set byte enable field and device information in configuration TLP header. 
> > + */ > > +static void mtk_pcie_config_tlp_header(struct pci_bus *bus, unsigned int devfn, > > + int where, int size) > > +{ > > + struct mtk_pcie_port *port = bus->sysdata; > > + int bytes; > > + u32 val; > > + > > + bytes = (GENMASK(size - 1, 0) & 0xf) << (where & 0x3); > > + > > + val = PCIE_CFG_FORCE_BYTE_EN | PCIE_CFG_BYTE_EN(bytes) | > > + PCIE_CFG_HEADER(bus->number, devfn); > > + > > + writel(val, port->base + PCIE_CFGNUM_REG); > > Please convert all instances of writel/readl to their relaxed version, > unless you can show that you need the extra memory ordering > enforcement. This is specially valuable on an arm64 system (which > MT8192 seems to be). Sure, I will fix it in the next version. > > > +} > > + > > +static void __iomem *mtk_pcie_map_bus(struct pci_bus *bus, unsigned int devfn, > > + int where) > > +{ > > + struct mtk_pcie_port *port = bus->sysdata; > > + > > + return port->base + PCIE_CFG_OFFSET_ADDR + where; > > +} > > + > > +static int mtk_pcie_config_read(struct pci_bus *bus, unsigned int devfn, > > + int where, int size, u32 *val) > > +{ > > + mtk_pcie_config_tlp_header(bus, devfn, where, size); > > + > > + return pci_generic_config_read32(bus, devfn, where, size, val); > > +} > > + > > +static int mtk_pcie_config_write(struct pci_bus *bus, unsigned int devfn, > > + int where, int size, u32 val) > > +{ > > + mtk_pcie_config_tlp_header(bus, devfn, where, size); > > + > > + if (size <= 2) > > + val <<= (where & 0x3) * 8; > > + > > + return pci_generic_config_write32(bus, devfn, where, 4, val); > > +} > > + > > +static struct pci_ops mtk_pcie_ops = { > > + .map_bus = mtk_pcie_map_bus, > > + .read = mtk_pcie_config_read, > > + .write = mtk_pcie_config_write, > > +}; > > + > > +static int mtk_pcie_set_trans_table(struct mtk_pcie_port *port, > > + resource_size_t cpu_addr, > > + resource_size_t pci_addr, > > + resource_size_t size, > > + unsigned long type, int num) > > +{ > > + void __iomem *table; > > + u32 val; > > + > > + if (num >= 
PCIE_MAX_TRANS_TABLES) { > > + dev_err(port->dev, "not enough translate table[%d] for addr: %#llx, limited to [%d]\n", > > + num, (unsigned long long) cpu_addr, > > + PCIE_MAX_TRANS_TABLES); > > + return -ENODEV; > > + } > > + > > + table = port->base + PCIE_TRANS_TABLE_BASE_REG + > > + num * PCIE_ATR_TLB_SET_OFFSET; > > + > > + writel(lower_32_bits(cpu_addr) | PCIE_ATR_SIZE(fls(size) - 1), table); > > + writel(upper_32_bits(cpu_addr), table + PCIE_ATR_SRC_ADDR_MSB_OFFSET); > > + writel(lower_32_bits(pci_addr), table + PCIE_ATR_TRSL_ADDR_LSB_OFFSET); > > + writel(upper_32_bits(pci_addr), table + PCIE_ATR_TRSL_ADDR_MSB_OFFSET); > > + > > + if (type == IORESOURCE_IO) > > + val = PCIE_ATR_TYPE_IO | PCIE_ATR_TLP_TYPE_IO; > > + else > > + val = PCIE_ATR_TYPE_MEM | PCIE_ATR_TLP_TYPE_MEM; > > + > > + writel(val, table + PCIE_ATR_TRSL_PARAM_OFFSET); > > + > > + return 0; > > +} > > + > > +static int mtk_pcie_startup_port(struct mtk_pcie_port *port) > > +{ > > + struct resource_entry *entry; > > + struct pci_host_bridge *host = pci_host_bridge_from_priv(port); > > + unsigned int table_index = 0; > > + int err; > > + u32 val; > > + > > + /* Set as RC mode */ > > + val = readl(port->base + PCIE_SETTING_REG); > > + val |= PCIE_RC_MODE; > > + writel(val, port->base + PCIE_SETTING_REG); > > + > > + /* Set class code */ > > + val = readl(port->base + PCIE_PCI_IDS_1); > > + val &= ~GENMASK(31, 8); > > + val |= PCI_CLASS(PCI_CLASS_BRIDGE_PCI << 8); > > + writel(val, port->base + PCIE_PCI_IDS_1); > > + > > + /* Assert all reset signals */ > > + val = readl(port->base + PCIE_RST_CTRL_REG); > > + val |= PCIE_MAC_RSTB | PCIE_PHY_RSTB | PCIE_BRG_RSTB | PCIE_PE_RSTB; > > + writel(val, port->base + PCIE_RST_CTRL_REG); > > + > > + /* De-assert reset signals */ > > + val &= ~(PCIE_MAC_RSTB | PCIE_PHY_RSTB | PCIE_BRG_RSTB); > > + writel(val, port->base + PCIE_RST_CTRL_REG); > > + > > + /* Delay 100ms to wait the reference clocks become stable */ > > + msleep(100); > > + > > + /* De-assert 
PERST# signal */ > > + val &= ~PCIE_PE_RSTB; > > + writel(val, port->base + PCIE_RST_CTRL_REG); > > + > > + /* Check if the link is up or not */ > > + err = readl_poll_timeout(port->base + PCIE_LINK_STATUS_REG, val, > > + !!(val & PCIE_PORT_LINKUP), 20, > > + 50 * USEC_PER_MSEC); > > + if (err) { > > + val = readl(port->base + PCIE_LTSSM_STATUS_REG); > > + dev_err(port->dev, "PCIe link down, ltssm reg val: %#x\n", val); > > + return err; > > + } > > + > > + /* Set PCIe translation windows */ > > + resource_list_for_each_entry(entry, &host->windows) { > > + struct resource *res = entry->res; > > + unsigned long type = resource_type(res); > > + resource_size_t cpu_addr; > > + resource_size_t pci_addr; > > + resource_size_t size; > > + const char *range_type; > > + > > + if (type == IORESOURCE_IO) { > > + cpu_addr = pci_pio_to_address(res->start); > > + range_type = "IO"; > > + } else if (type == IORESOURCE_MEM) { > > + cpu_addr = res->start; > > + range_type = "MEM"; > > + } else { > > + continue; > > + } > > + > > + pci_addr = res->start - entry->offset; > > + size = resource_size(res); > > + err = mtk_pcie_set_trans_table(port, cpu_addr, pci_addr, size, > > + type, table_index); > > + if (err) > > + return err; > > + > > + dev_dbg(port->dev, "set %s trans window[%d]: cpu_addr = %#llx, pci_addr = %#llx, size = %#llx\n", > > + range_type, table_index, (unsigned long long) cpu_addr, > > + (unsigned long long) pci_addr, > > + (unsigned long long) size); > > + > > + table_index++; > > + } > > + > > + return 0; > > +} > > + > > +static inline struct mtk_pcie_msi *mtk_get_msi_info(struct mtk_pcie_port *port, > > + unsigned long hwirq) > > +{ > > + return port->msi_info[hwirq / PCIE_MSI_IRQS_PER_SET]; > > +} > > + > > +static int mtk_pcie_set_affinity(struct irq_data *data, > > + const struct cpumask *mask, bool force) > > +{ > > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > > + struct irq_data *port_data = irq_get_irq_data(port->irq); > > NAK. 
Either you are resolving an irqdata that is the same as the one > as the one passed as a parameter (and it is thus pointless), or you > are poking into some other interrupt, and it is a terrible layering > violation. Either way, this must go. > > > + struct irq_chip *port_chip = irq_data_get_irq_chip(port_data); > > + int ret; > > + > > + if (!port_chip || !port_chip->irq_set_affinity) > > + return -EINVAL; > > In what circumstances can this fail? > > > + > > + ret = port_chip->irq_set_affinity(port_data, mask, force); > > You really have to explain this indirection, and possibly remove it. As described in the MSI HW design above, port->irq is the only interrupt that is connected to the GIC, so the only way to set the IRQ affinity is to use the irq_set_affinity callback of port_chip. It seems there is no proper way to implement this callback, so I will replace it with "return -EINVAL" in the next version. > > > + > > + irq_data_update_effective_affinity(data, mask); > > + > > + return ret; > > +} > > + > > +static void mtk_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) > > +{ > > + struct mtk_pcie_msi *msi_info; > > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > > + unsigned long hwirq; > > + > > + msi_info = mtk_get_msi_info(port, data->hwirq); > > So msi_info *is* per interrupt, and yet contains information that is > seemingly global. Please address this. I will fix it in the next version. > > > + hwirq = data->hwirq % PCIE_MSI_IRQS_PER_SET; > > + > > + msg->address_hi = 0; > > What guarantees this? It should not be set to 0 directly; I will replace it with upper_32_bits(msi_info->msg_addr) in the next version. > > > + msg->address_lo = lower_32_bits(msi_info->msg_addr); > > + msg->data = hwirq; > > + dev_dbg(port->dev, "msi#%#lx address_hi %#x address_lo %#x data %d\n", > > + hwirq, msg->address_hi, msg->address_lo, msg->data); > > If address_hi is always 0, what is the point of printing it? 
> > > +} > > + > > +static void mtk_msi_irq_ack(struct irq_data *data) > > +{ > > + struct mtk_pcie_msi *msi_info; > > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > > + unsigned long hwirq; > > + > > + msi_info = mtk_get_msi_info(port, data->hwirq); > > + hwirq = data->hwirq % PCIE_MSI_IRQS_PER_SET; > > + > > + writel(BIT(hwirq), msi_info->base + PCIE_MSI_STATUS_OFFSET); > > +} > > + > > +static void mtk_msi_irq_mask(struct irq_data *data) > > +{ > > + struct mtk_pcie_msi *msi_info; > > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > > + unsigned long hwirq; > > + u32 val; > > + > > + msi_info = mtk_get_msi_info(port, data->hwirq); > > + hwirq = data->hwirq % PCIE_MSI_IRQS_PER_SET; > > + > > + val = readl(msi_info->base + PCIE_MSI_ENABLE_OFFSET); > > + val &= ~BIT(hwirq); > > + writel(val, msi_info->base + PCIE_MSI_ENABLE_OFFSET); > > + > > + pci_msi_mask_irq(data); > > This is at the wrong location. The common idiom is to split PCI and > generic MSI. Please follow the same idiom in order to keep the code > maintainable. > > > +} > > + > > +static void mtk_msi_irq_unmask(struct irq_data *data) > > +{ > > + struct mtk_pcie_msi *msi_info; > > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > > + unsigned long hwirq; > > + u32 val; > > + > > + msi_info = mtk_get_msi_info(port, data->hwirq); > > + hwirq = data->hwirq % PCIE_MSI_IRQS_PER_SET; > > + > > + val = readl(msi_info->base + PCIE_MSI_ENABLE_OFFSET); > > + val |= BIT(hwirq); > > + writel(val, msi_info->base + PCIE_MSI_ENABLE_OFFSET); > > + > > + pci_msi_unmask_irq(data); > > Same thing here. 
> > > +} > > + > > +static struct irq_chip mtk_msi_irq_chip = { > > + .irq_ack = mtk_msi_irq_ack, > > + .irq_compose_msi_msg = mtk_compose_msi_msg, > > + .irq_mask = mtk_msi_irq_mask, > > + .irq_unmask = mtk_msi_irq_unmask, > > + .irq_set_affinity = mtk_pcie_set_affinity, > > + .name = "PCIe", > > +}; > > + > > +static irq_hw_number_t mtk_pcie_get_hwirq(struct msi_domain_info *info, > > + msi_alloc_info_t *arg) > > +{ > > + struct msi_desc *desc = arg->desc; > > + irq_hw_number_t hwirq = arg->hwirq; > > + > > + arg->hwirq += desc->nvec_used; > > + > > + return hwirq; > > +} > > Please follow the common MSI flow. There is absolutely no need to > reinvent the wheel. > > > + > > +static void mtk_pcie_msi_free(struct irq_domain *domain, > > + struct msi_domain_info *info, unsigned int virq) > > +{ > > + struct irq_data *data = irq_domain_get_irq_data(domain, virq); > > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > > + > > + mutex_lock(&port->lock); > > + > > + bitmap_clear(port->msi_irq_in_use, data->hwirq, 1); > > + > > + mutex_unlock(&port->lock); > > +} > > + > > +static int mtk_pcie_msi_prepare(struct irq_domain *domain, struct device *dev, > > + int nvec, msi_alloc_info_t *arg) > > +{ > > + struct msi_domain_info *info = domain->host_data; > > + struct mtk_pcie_port *port = info->chip_data; > > + struct msi_desc *desc = first_pci_msi_entry(to_pci_dev(dev)); > > + int hwirq, ret = 0; > > + > > + mutex_lock(&port->lock); > > + > > + if (desc->msi_attrib.is_msix) { > > + int i; > > + unsigned long bit; > > + > > + for (i = 0; i < nvec; i++) { > > + bit = find_first_zero_bit(port->msi_irq_in_use, > > + PCIE_MSI_IRQS_NUM); > > + if (bit >= PCIE_MSI_IRQS_NUM) { > > + ret = -ENOSPC; > > + goto msi_prepare_out; > > + } else { > > + set_bit(bit, port->msi_irq_in_use); > > + } > > + } > > + > > + hwirq = bit - nvec; > > + } else { > > + hwirq = bitmap_find_free_region(port->msi_irq_in_use, > > + PCIE_MSI_IRQS_NUM, > > + order_base_2(nvec)); > > + if 
(hwirq < 0) > > + ret = -ENOSPC; > > + } > > + > > +msi_prepare_out: > > + mutex_unlock(&port->lock); > > + > > + if (ret) > > + return ret; > > + > > + memset(arg, 0, sizeof(*arg)); > > + arg->hwirq = hwirq; > > + > > + return 0; > > +} > > NAK. This should take place in the irqdomain alloc path. .prepare is > reserved for HW that require per-device setup, and this doesn't > interact with the HW at all. > > Your distinction between MSI-X and non-MSI-X also becomes moot once > you place the allocator at the place where allocations take place. > > > + > > +static void mtk_pcie_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) > > +{ > > + arg->desc = desc; > > +} > > NAK. Please use the common PCI flow. > > > + > > +static struct msi_domain_ops mtk_msi_domain_ops = { > > + .get_hwirq = mtk_pcie_get_hwirq, > > + .msi_free = mtk_pcie_msi_free, > > + .msi_prepare = mtk_pcie_msi_prepare, > > + .set_desc = mtk_pcie_msi_set_desc, > > +}; > > + > > +static struct msi_domain_info mtk_msi_domain_info = { > > + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_PCI_MSIX | > > + MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI), > > + .chip = &mtk_msi_irq_chip, > > + .ops = &mtk_msi_domain_ops, > > + .handler = handle_edge_irq, > > + .handler_name = "MSI", > > +}; > > + > > +static void mtk_msi_top_irq_eoi(struct irq_data *data) > > +{ > > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > > + unsigned long msi_irq = data->hwirq + PCIE_MSI_SHIFT; > > + > > + writel(BIT(msi_irq), port->base + PCIE_INT_STATUS_REG); > > +} > > + > > +static struct irq_chip mtk_msi_top_irq_chip = { > > + .irq_eoi = mtk_msi_top_irq_eoi, > > + .name = "PCIe", > > +}; > > + > > +static void mtk_pcie_msi_handler(struct irq_desc *desc) > > +{ > > + struct mtk_pcie_msi *msi_info = irq_desc_get_handler_data(desc); > > + struct irq_chip *irqchip = irq_desc_get_chip(desc); > > + unsigned long msi_enable, msi_status; > > + unsigned int virq; > > + irq_hw_number_t bit, hwirq; > > + > > 
+ chained_irq_enter(irqchip, desc); > > + > > + msi_enable = readl(msi_info->base + PCIE_MSI_ENABLE_OFFSET); > > + while ((msi_status = readl(msi_info->base + PCIE_MSI_STATUS_OFFSET) & > > + msi_enable)) { > > + for_each_set_bit(bit, &msi_status, PCIE_MSI_IRQS_PER_SET) { > > + hwirq = bit + msi_info->index * PCIE_MSI_IRQS_PER_SET; > > + virq = irq_find_mapping(msi_info->domain, hwirq); > > + generic_handle_irq(virq); > > + } > > + } > > + > > + chained_irq_exit(irqchip, desc); > > If your MSIs are behind a muxing interrupt you *cannot* implement an > affinity setting callback, as this changes the affinity of *all* MSIs > in one go, which is a userspace ABI violation. I can't believe that > this is almost 2021, and people are still shipping HW that is that > broken :-(. Sorry, I haven't noticed the potential risk of userspace ABI violation, thanks for pointing out. > > > +} > > + > > +static int mtk_msi_top_domain_map(struct irq_domain *domain, > > + unsigned int virq, irq_hw_number_t hwirq) > > +{ > > + struct mtk_pcie_port *port = domain->host_data; > > + struct mtk_pcie_msi *msi_info = port->msi_info[hwirq]; > > + > > + irq_domain_set_info(domain, virq, hwirq, > > + &mtk_msi_top_irq_chip, domain->host_data, > > + mtk_pcie_msi_handler, msi_info, NULL); > > + > > + return 0; > > +} > > + > > +static const struct irq_domain_ops mtk_msi_top_domain_ops = { > > + .map = mtk_msi_top_domain_map, > > NAK. MSIs do not need a mapping callback. This belongs to the > irqdomain alloc callback, which you don't implement. 
> > > +}; > > + > > +static void mtk_intx_mask(struct irq_data *data) > > +{ > > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > > + u32 val; > > + > > + val = readl(port->base + PCIE_INT_ENABLE_REG); > > + val &= ~BIT(data->hwirq + PCIE_INTX_SHIFT); > > + writel(val, port->base + PCIE_INT_ENABLE_REG); > > +} > > + > > +static void mtk_intx_unmask(struct irq_data *data) > > +{ > > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > > + u32 val; > > + > > + val = readl(port->base + PCIE_INT_ENABLE_REG); > > + val |= BIT(data->hwirq + PCIE_INTX_SHIFT); > > + writel(val, port->base + PCIE_INT_ENABLE_REG); > > +} > > + > > +static void mtk_intx_eoi(struct irq_data *data) > > +{ > > + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); > > + unsigned long hwirq; > > + > > + /** > > + * As an emulated level IRQ, its interrupt status will remain > > + * until the corresponding de-assert message is received; hence that > > + * the status can only be cleared when the interrupt has been serviced. > > + */ > > + hwirq = data->hwirq + PCIE_INTX_SHIFT; > > + writel(BIT(hwirq), port->base + PCIE_INT_STATUS_REG); > > All of this is the description of a level interrupt, so this is pretty > much devoid of any information as to *why* you need to write to clear > this bit. What happens if the interrupt is still asserted because > nothing has handled it? Without any further information, this looks > terribly wrong. Sorry, this comment should be used to describe the mtk_intx_eoi function, it cause misunderstandings at this place. I just want to add the comment to explain that why this interrupt needs to be acked at the end of interrupt. I will move it to the front of mtk_intx_eoi in the next version. 
> > > +} > > + > > +static struct irq_chip mtk_intx_irq_chip = { > > + .irq_mask = mtk_intx_mask, > > + .irq_unmask = mtk_intx_unmask, > > + .irq_eoi = mtk_intx_eoi, > > + .irq_set_affinity = mtk_pcie_set_affinity, > > + .name = "PCIe", > > +}; > > + > > +static int mtk_pcie_intx_map(struct irq_domain *domain, unsigned int irq, > > + irq_hw_number_t hwirq) > > +{ > > + irq_set_chip_and_handler_name(irq, &mtk_intx_irq_chip, > > + handle_fasteoi_irq, "INTx"); > > + irq_set_chip_data(irq, domain->host_data); > > + > > + return 0; > > +} > > + > > +static const struct irq_domain_ops intx_domain_ops = { > > + .map = mtk_pcie_intx_map, > > +}; > > + > > +static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port, > > + struct device_node *node) > > +{ > > + struct device *dev = port->dev; > > + struct device_node *intc_node; > > + struct fwnode_handle *fwnode = of_node_to_fwnode(node); > > + struct mtk_pcie_msi *msi_info; > > + struct msi_domain_info *info; > > + int i, ret; > > + > > + /* Setup INTx */ > > + intc_node = of_get_child_by_name(node, "interrupt-controller"); > > + if (!intc_node) { > > + dev_err(dev, "missing PCIe Intc node\n"); > > + return -ENODEV; > > + } > > + > > + port->intx_domain = irq_domain_add_linear(intc_node, PCI_NUM_INTX, > > + &intx_domain_ops, port); > > + if (!port->intx_domain) { > > + dev_err(dev, "failed to get INTx IRQ domain\n"); > > + return -ENODEV; > > + } > > + > > + /* Setup MSI */ > > + mutex_init(&port->lock); > > + > > + info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL); > > + if (!info) > > + return -ENOMEM; > > + > > + memcpy(info, &mtk_msi_domain_info, sizeof(*info)); > > + info->chip_data = port; > > + > > + port->msi_domain = pci_msi_create_irq_domain(fwnode, info, NULL); > > + if (!port->msi_domain) { > > + dev_info(dev, "failed to create MSI domain\n"); > > + ret = -ENODEV; > > + goto err_msi_domain; > > + } > > + > > + /* Enable MSI and setup PCIe domains */ > > + port->msi_top_domain = 
irq_domain_add_hierarchy(NULL, 0, 0, node, > > + &mtk_msi_top_domain_ops, > > + port); > > + if (!port->msi_top_domain) { > > + dev_info(dev, "failed to create MSI top domain\n"); > > + ret = -ENODEV; > > + goto err_msi_top_domain; > > + } > > + > > + port->msi_info = devm_kzalloc(dev, PCIE_MSI_SET_NUM, GFP_KERNEL); > > + if (!port->msi_info) { > > + ret = -ENOMEM; > > + goto err_msi_info; > > + } > > + > > + for (i = 0; i < PCIE_MSI_SET_NUM; i++) { > > + int offset = i * PCIE_MSI_SET_OFFSET; > > + u32 val; > > + > > + msi_info = devm_kzalloc(dev, sizeof(*msi_info), GFP_KERNEL); > > + if (!msi_info) { > > + ret = -ENOMEM; > > + goto err_msi_set; > > + } > > + > > + msi_info->base = port->base + PCIE_MSI_ADDR_BASE_REG + offset; > > + msi_info->msg_addr = port->reg_base + PCIE_MSI_ADDR_BASE_REG + > > + offset; > > All of this can be derived from i (and this hwirq) at runtime. Why do > we need to cache this information? I am trying to use the msi_info structure to describe the MSI set; I think this complies better with the HW design and makes the interrupt handling flow easier to understand. > > > + > > + writel(lower_32_bits(msi_info->msg_addr), msi_info->base); > > + > > + msi_info->index = i; > > + msi_info->domain = port->msi_domain; > > If this is common to all sets, why isn't it global? I will fix it in the next version. 
> > > + > > + port->msi_info[i] = msi_info; > > + > > + /* Alloc IRQ for each MSI set */ > > + msi_info->irq = irq_create_mapping(port->msi_top_domain, i); > > + if (!msi_info->irq) { > > + dev_info(dev, "allocate MSI top IRQ failed\n"); > > + ret = -ENOSPC; > > + goto err_msi_set; > > + } > > + > > + val = readl(port->base + PCIE_INT_ENABLE_REG); > > + val |= BIT(i + PCIE_MSI_SHIFT); > > + writel(val, port->base + PCIE_INT_ENABLE_REG); > > + > > + val = readl(port->base + PCIE_MSI_SET_ENABLE_REG); > > + val |= BIT(i); > > + writel(val, port->base + PCIE_MSI_SET_ENABLE_REG); > > + } > > + > > + return 0; > > + > > +err_msi_set: > > + while (i-- > 0) { > > + msi_info = port->msi_info[i]; > > + irq_dispose_mapping(msi_info->irq); > > + } > > +err_msi_info: > > + irq_domain_remove(port->msi_top_domain); > > +err_msi_top_domain: > > + irq_domain_remove(port->msi_domain); > > +err_msi_domain: > > + irq_domain_remove(port->intx_domain); > > + > > + return ret; > > +} > > At this stage, I stopped, because there is really a lot to change in > this driver before it is in an acceptable shape. Please use the common > idioms established for existing MSI implementations. > > I'd appreciate if you kept me posted on the following versions of this > driver. > > Thanks, > > M. >