From: Nate Watterson <nwatterson@xxxxxxxxxx> NVIDIA's Grace SoC includes custom CMDQ-Virtualization (CMDQV) hardware, which adds multiple VCMDQ interfaces to supplement the architected SMMU_CMDQ in an effort to reduce contention. To make use of these supplemental CMDQs in the arm-smmu-v3 driver, we borrow the "implementation infrastructure" design from the arm-smmu driver and add support for implementation-defined issue_cmdlist methods. Signed-off-by: Nate Watterson <nwatterson@xxxxxxxxxx> Signed-off-by: Nicolin Chen <nicolinc@xxxxxxxxxx> --- MAINTAINERS | 2 + drivers/iommu/arm/arm-smmu-v3/Makefile | 2 +- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c | 7 + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 67 +-- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 11 + .../iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c | 425 ++++++++++++++++++ 6 files changed, 487 insertions(+), 27 deletions(-) create mode 100644 drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c diff --git a/MAINTAINERS b/MAINTAINERS index d69b2d4646be..e72e3459c9be 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18240,8 +18240,10 @@ F: drivers/i2c/busses/i2c-tegra.c TEGRA IOMMU DRIVERS M: Thierry Reding <thierry.reding@xxxxxxxxx> R: Krishna Reddy <vdumpa@xxxxxxxxxx> +R: Nicolin Chen <nicoleotsuka@xxxxxxxxx> L: linux-tegra@xxxxxxxxxxxxxxx S: Supported +F: drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c F: drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c F: drivers/iommu/tegra* diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile index 1f5838d3351b..0aa84c0a50ea 100644 --- a/drivers/iommu/arm/arm-smmu-v3/Makefile +++ b/drivers/iommu/arm/arm-smmu-v3/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o -arm_smmu_v3-objs-y += arm-smmu-v3.o arm-smmu-v3-impl.o +arm_smmu_v3-objs-y += arm-smmu-v3.o arm-smmu-v3-impl.o nvidia-smmu-v3.o arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o arm_smmu_v3-objs := $(arm_smmu_v3-objs-y) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c index 6947d28067a8..37d062e40eb5 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-impl.c @@ -4,5 +4,12 @@ struct arm_smmu_device *arm_smmu_v3_impl_init(struct arm_smmu_device *smmu) { + /* + * The NVIDIA implementation supports ACPI only, so call its init() + * unconditionally to walk the ACPI tables and probe for the device. + * It leaves the smmu pointer intact if it fails.
+ */ + smmu = nvidia_smmu_v3_impl_init(smmu); + return smmu; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index b2d23de2b207..439809e1acd4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -336,9 +336,9 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) } static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, - u32 prod) + u32 prod, struct arm_smmu_cmdq *cmdq) { - struct arm_smmu_queue *q = &smmu->cmdq.q; + struct arm_smmu_queue *q = &cmdq->q; struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC, }; @@ -575,11 +575,11 @@ static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq, /* Wait for the command queue to become non-full */ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, - struct arm_smmu_ll_queue *llq) + struct arm_smmu_ll_queue *llq, + struct arm_smmu_cmdq *cmdq) { unsigned long flags; struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; int ret = 0; /* @@ -595,7 +595,7 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, queue_poll_init(smmu, &qp); do { - llq->val = READ_ONCE(smmu->cmdq.q.llq.val); + llq->val = READ_ONCE(cmdq->q.llq.val); if (!queue_full(llq)) break; @@ -610,11 +610,11 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, * Must be called with the cmdq lock held in some capacity. */ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu, - struct arm_smmu_ll_queue *llq) + struct arm_smmu_ll_queue *llq, + struct arm_smmu_cmdq *cmdq) { int ret = 0; struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod)); queue_poll_init(smmu, &qp); @@ -634,15 +634,15 @@ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu, * Must be called with the cmdq lock held in some capacity. 
*/ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu, - struct arm_smmu_ll_queue *llq) + struct arm_smmu_ll_queue *llq, + struct arm_smmu_cmdq *cmdq) { struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; u32 prod = llq->prod; int ret = 0; queue_poll_init(smmu, &qp); - llq->val = READ_ONCE(smmu->cmdq.q.llq.val); + llq->val = READ_ONCE(cmdq->q.llq.val); do { if (queue_consumed(llq, prod)) break; @@ -684,12 +684,13 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu, } static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu, - struct arm_smmu_ll_queue *llq) + struct arm_smmu_ll_queue *llq, + struct arm_smmu_cmdq *cmdq) { if (smmu->options & ARM_SMMU_OPT_MSIPOLL) - return __arm_smmu_cmdq_poll_until_msi(smmu, llq); + return __arm_smmu_cmdq_poll_until_msi(smmu, llq, cmdq); - return __arm_smmu_cmdq_poll_until_consumed(smmu, llq); + return __arm_smmu_cmdq_poll_until_consumed(smmu, llq, cmdq); } static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, @@ -709,6 +710,14 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, } } +static int arm_smmu_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync) +{ + if (smmu->impl && smmu->impl->issue_cmdlist) + return smmu->impl->issue_cmdlist(smmu, cmds, n, sync); + + return arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, sync, &smmu->cmdq); +} + /* * This is the actual insertion function, and provides the following * ordering guarantees to callers: @@ -725,14 +734,13 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, * insert their own list of commands then all of the commands from one * CPU will appear before any of the commands from the other CPU. */ -static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, - u64 *cmds, int n, bool sync) +int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync, + struct arm_smmu_cmdq *cmdq) { u64 cmd_sync[CMDQ_ENT_DWORDS]; u32 prod; unsigned long flags; bool owner; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; struct arm_smmu_ll_queue llq = { .max_n_shift = cmdq->q.llq.max_n_shift, }, head = llq; @@ -746,7 +754,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, while (!queue_has_space(&llq, n + sync)) { local_irq_restore(flags); - if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq)) + if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq, cmdq)) dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); local_irq_save(flags); } @@ -772,7 +780,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n); if (sync) { prod = queue_inc_prod_n(&llq, n); - arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod); + arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod, cmdq); queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS); /* @@ -822,7 +830,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, /* 5. 
If we are inserting a CMD_SYNC, we must wait for it to complete */ if (sync) { llq.prod = queue_inc_prod_n(&llq, n); - ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq); + ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq, cmdq); if (ret) { dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n", @@ -856,12 +864,12 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, return -EINVAL; } - return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false); + return arm_smmu_issue_cmdlist(smmu, cmd, 1, false); } static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) { - return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true); + return arm_smmu_issue_cmdlist(smmu, NULL, 0, true); } static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, @@ -869,7 +877,7 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, struct arm_smmu_cmdq_ent *cmd) { if (cmds->num == CMDQ_BATCH_ENTRIES) { - arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false); + arm_smmu_issue_cmdlist(smmu, cmds->cmds, cmds->num, false); cmds->num = 0; } arm_smmu_cmdq_build_cmd(&cmds->cmds[cmds->num * CMDQ_ENT_DWORDS], cmd); @@ -879,7 +887,7 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu, struct arm_smmu_cmdq_batch *cmds) { - return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true); + return arm_smmu_issue_cmdlist(smmu, cmds->cmds, cmds->num, true); } static int arm_smmu_page_response(struct device *dev, @@ -2899,10 +2907,9 @@ static void arm_smmu_cmdq_free_bitmap(void *data) bitmap_free(bitmap); } -static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu) +static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, struct arm_smmu_cmdq *cmdq) { int ret = 0; - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; unsigned int nents = 1 << cmdq->q.llq.max_n_shift; atomic_long_t *bitmap; @@ -2932,7 +2939,7 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu) if (ret) return ret; - ret = arm_smmu_cmdq_init(smmu); + ret = arm_smmu_cmdq_init(smmu, &smmu->cmdq); if (ret) return ret; @@ -3416,6 +3423,14 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) return ret; } + if (smmu->impl && smmu->impl->device_reset) { + ret = smmu->impl->device_reset(smmu); + if (ret) { + dev_err(smmu->dev, "failed at implementation specific device_reset\n"); + return ret; + } + } + return 0; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 4c60ba14221b..baec2d3a46f9 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -647,6 +647,8 @@ struct arm_smmu_device { #define ARM_SMMU_OPT_MSIPOLL (1 << 2) u32 options; + const struct arm_smmu_impl *impl; + struct arm_smmu_cmdq cmdq; struct arm_smmu_evtq evtq; struct arm_smmu_priq priq; @@ -807,7 +809,16 @@ static inline u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle) static inline void arm_smmu_sva_notifier_synchronize(void) {} #endif /* CONFIG_ARM_SMMU_V3_SVA */ +int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync, + struct arm_smmu_cmdq *cmdq); + /* Implementation details */ +struct arm_smmu_impl { + int (*device_reset)(struct arm_smmu_device *smmu); + int (*issue_cmdlist)(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync); +}; + struct arm_smmu_device *arm_smmu_v3_impl_init(struct arm_smmu_device *smmu); +struct arm_smmu_device 
*nvidia_smmu_v3_impl_init(struct arm_smmu_device *smmu); #endif /* _ARM_SMMU_V3_H */ diff --git a/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c new file mode 100644 index 000000000000..ceec2a24057f --- /dev/null +++ b/drivers/iommu/arm/arm-smmu-v3/nvidia-smmu-v3.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define dev_fmt(fmt) "nvidia_smmu_cmdqv: " fmt + +#include <linux/acpi.h> +#include <linux/dma-mapping.h> +#include <linux/iommu.h> +#include <linux/iopoll.h> +#include <linux/platform_device.h> + +#include <acpi/acpixf.h> + +#include "arm-smmu-v3.h" + +#define NVIDIA_SMMU_CMDQV_HID "NVDA0600" + +/* CMDQV global config regs */ +#define NVIDIA_CMDQV_CONFIG 0x0000 +#define CMDQV_EN BIT(0) + +#define NVIDIA_CMDQV_PARAM 0x0004 +#define CMDQV_NUM_VINTF_LOG2 GENMASK(11, 8) +#define CMDQV_NUM_VCMDQ_LOG2 GENMASK(7, 4) + +#define NVIDIA_CMDQV_STATUS 0x0008 +#define CMDQV_STATUS GENMASK(2, 1) +#define CMDQV_ENABLED BIT(0) + +#define NVIDIA_CMDQV_VINTF_ERR_MAP 0x000C +#define NVIDIA_CMDQV_VINTF_INT_MASK 0x0014 +#define NVIDIA_CMDQV_VCMDQ_ERR_MAP 0x001C + +#define NVIDIA_CMDQV_CMDQ_ALLOC(q) (0x0200 + 0x4*(q)) +#define CMDQV_CMDQ_ALLOC_VINTF GENMASK(20, 15) +#define CMDQV_CMDQ_ALLOC_LVCMDQ GENMASK(7, 1) +#define CMDQV_CMDQ_ALLOCATED BIT(0) + +/* VINTF config regs */ +#define NVIDIA_CMDQV_VINTF(v) (0x1000 + 0x100*(v)) + +#define NVIDIA_VINTF_CONFIG 0x0000 +#define VINTF_HYP_OWN BIT(17) +#define VINTF_VMID GENMASK(16, 1) +#define VINTF_EN BIT(0) + +#define NVIDIA_VINTF_STATUS 0x0004 +#define VINTF_STATUS GENMASK(3, 1) +#define VINTF_ENABLED BIT(0) + +/* VCMDQ config regs */ +#define NVIDIA_CMDQV_VCMDQ(q) (0x10000 + 0x80*(q)) + +#define NVIDIA_VCMDQ_CONS 0x00000 +#define VCMDQ_CONS_ERR GENMASK(30, 24) + +#define NVIDIA_VCMDQ_PROD 0x00004 + +#define NVIDIA_VCMDQ_CONFIG 0x00008 +#define VCMDQ_EN BIT(0) + +#define NVIDIA_VCMDQ_STATUS 0x0000C +#define VCMDQ_ENABLED BIT(0) + +#define NVIDIA_VCMDQ_GERROR 0x00010 +#define NVIDIA_VCMDQ_GERRORN 0x00014 + +#define NVIDIA_VCMDQ_BASE 0x10000 +#define VCMDQ_ADDR GENMASK(63, 5) +#define VCMDQ_LOG2SIZE GENMASK(4, 0) + +struct nvidia_smmu_vintf { + u16 idx; + u32 cfg; + u32 status; + + void __iomem *base; + struct arm_smmu_cmdq *vcmdqs; +}; + +struct nvidia_smmu { + struct arm_smmu_device smmu; + + struct device *cmdqv_dev; + void __iomem *cmdqv_base; + int cmdqv_irq; + + /* CMDQV Hardware Params */ + u16 num_total_vintfs; + u16 num_total_vcmdqs; + u16 num_vcmdqs_per_vintf; + + /* CMDQV_VINTF(0) reserved for host kernel use */ + struct nvidia_smmu_vintf vintf0; +}; + +static irqreturn_t nvidia_smmu_cmdqv_isr(int irq, void *devid) +{ + struct nvidia_smmu *nsmmu = (struct nvidia_smmu *)devid; + struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0; + u32 vintf_err_map[2]; + u32 vcmdq_err_map[4]; + + vintf_err_map[0] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF_ERR_MAP); + vintf_err_map[1] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF_ERR_MAP + 0x4); + + vcmdq_err_map[0] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP); + vcmdq_err_map[1] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0x4); + vcmdq_err_map[2] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0x8); + vcmdq_err_map[3] = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ_ERR_MAP + 0xC); + + dev_warn(nsmmu->cmdqv_dev, + "Unexpected cmdqv error reported: vintf_map %08X %08X, vcmdq_map %08X %08X %08X %08X\n", + vintf_err_map[0], vintf_err_map[1], vcmdq_err_map[0], vcmdq_err_map[1], + 
vcmdq_err_map[2], vcmdq_err_map[3]); + + /* If the error was reported by vintf0, avoid using any of its VCMDQs */ + if (vintf_err_map[vintf0->idx / 32] & (1 << (vintf0->idx % 32))) { + vintf0->status = readl_relaxed(vintf0->base + NVIDIA_VINTF_STATUS); + + dev_warn(nsmmu->cmdqv_dev, "error (0x%lX) reported by host vintf0 - disabling its vcmdqs\n", + FIELD_GET(VINTF_STATUS, vintf0->status)); + } else if (vintf_err_map[0] || vintf_err_map[1]) { + dev_err(nsmmu->cmdqv_dev, "cmdqv error interrupt triggered by unassigned vintf!\n"); + } + + return IRQ_HANDLED; +} + +/* Adapt struct arm_smmu_cmdq init sequences from arm-smmu-v3.c for VCMDQs */ +static int nvidia_smmu_init_one_arm_smmu_cmdq(struct nvidia_smmu *nsmmu, + struct arm_smmu_cmdq *cmdq, + void __iomem *vcmdq_base, + u16 idx) +{ + struct arm_smmu_queue *q = &cmdq->q; + size_t qsz; + + /* struct arm_smmu_cmdq config normally done in arm_smmu_device_hw_probe() */ + q->llq.max_n_shift = ilog2(SZ_64K >> CMDQ_ENT_SZ_SHIFT); + + /* struct arm_smmu_cmdq config normally done in arm_smmu_init_one_queue() */ + qsz = (1 << q->llq.max_n_shift) << CMDQ_ENT_SZ_SHIFT; + q->base = dmam_alloc_coherent(nsmmu->cmdqv_dev, qsz, &q->base_dma, GFP_KERNEL); + if (!q->base) { + dev_err(nsmmu->cmdqv_dev, "failed to allocate 0x%zX bytes for VCMDQ%u\n", + qsz, idx); + return -ENOMEM; + } + dev_dbg(nsmmu->cmdqv_dev, "allocated %u entries for VCMDQ%u @ 0x%llX [%pad] ++ %zX", + 1 << q->llq.max_n_shift, idx, (u64)q->base, &q->base_dma, qsz); + + q->prod_reg = vcmdq_base + NVIDIA_VCMDQ_PROD; + q->cons_reg = vcmdq_base + NVIDIA_VCMDQ_CONS; + q->ent_dwords = CMDQ_ENT_DWORDS; + + q->q_base = q->base_dma & VCMDQ_ADDR; + q->q_base |= FIELD_PREP(VCMDQ_LOG2SIZE, q->llq.max_n_shift); + + q->llq.prod = q->llq.cons = 0; + + /* struct arm_smmu_cmdq config normally done in arm_smmu_cmdq_init() */ + atomic_set(&cmdq->owner_prod, 0); + atomic_set(&cmdq->lock, 0); + + cmdq->valid_map = (atomic_long_t *)bitmap_zalloc(1 << q->llq.max_n_shift, GFP_KERNEL); + if (!cmdq->valid_map) { + dev_err(nsmmu->cmdqv_dev, "failed to allocate valid_map for VCMDQ%u\n", idx); + return -ENOMEM; + } + + return 0; +} + +static int nvidia_smmu_cmdqv_init(struct nvidia_smmu *nsmmu) +{ + struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0; + u32 regval; + u16 idx; + int ret; + + /* Setup vintf0 for host kernel */ + vintf0->idx = 0; + vintf0->base = nsmmu->cmdqv_base + NVIDIA_CMDQV_VINTF(0); + + regval = FIELD_PREP(VINTF_HYP_OWN, nsmmu->num_total_vintfs > 1); + writel_relaxed(regval, vintf0->base + NVIDIA_VINTF_CONFIG); + + regval |= FIELD_PREP(VINTF_EN, 1); + writel_relaxed(regval, vintf0->base + NVIDIA_VINTF_CONFIG); + + vintf0->cfg = regval; + + ret = readl_relaxed_poll_timeout(vintf0->base + NVIDIA_VINTF_STATUS, + regval, regval == VINTF_ENABLED, + 1, ARM_SMMU_POLL_TIMEOUT_US); + vintf0->status = regval; + if (ret) { + dev_err(nsmmu->cmdqv_dev, "failed to enable VINTF[%u]: STATUS = 0x%08X\n", + vintf0->idx, regval); + return ret; + } + + /* Allocate vcmdqs to vintf0 */ + for (idx = 0; idx < nsmmu->num_vcmdqs_per_vintf; idx++) { + regval = FIELD_PREP(CMDQV_CMDQ_ALLOC_VINTF, vintf0->idx); + regval |= FIELD_PREP(CMDQV_CMDQ_ALLOC_LVCMDQ, idx); + regval |= CMDQV_CMDQ_ALLOCATED; + writel_relaxed(regval, nsmmu->cmdqv_base + NVIDIA_CMDQV_CMDQ_ALLOC(idx)); + } + + /* Build an arm_smmu_cmdq for each vcmdq allocated to vintf0 */ + vintf0->vcmdqs = devm_kcalloc(nsmmu->cmdqv_dev, nsmmu->num_vcmdqs_per_vintf, + sizeof(*vintf0->vcmdqs), GFP_KERNEL); + if (!vintf0->vcmdqs) + return -ENOMEM; + + for (idx = 0; idx < 
nsmmu->num_vcmdqs_per_vintf; idx++) { + void __iomem *vcmdq_base = nsmmu->cmdqv_base + NVIDIA_CMDQV_VCMDQ(idx); + struct arm_smmu_cmdq *cmdq = &vintf0->vcmdqs[idx]; + + /* Setup struct arm_smmu_cmdq data members */ + nvidia_smmu_init_one_arm_smmu_cmdq(nsmmu, cmdq, vcmdq_base, idx); + + /* Configure and enable the vcmdq */ + writel_relaxed(0, vcmdq_base + NVIDIA_VCMDQ_PROD); + writel_relaxed(0, vcmdq_base + NVIDIA_VCMDQ_CONS); + + writeq_relaxed(cmdq->q.q_base, vcmdq_base + NVIDIA_VCMDQ_BASE); + + writel_relaxed(VCMDQ_EN, vcmdq_base + NVIDIA_VCMDQ_CONFIG); + ret = readl_poll_timeout(vcmdq_base + NVIDIA_VCMDQ_STATUS, + regval, regval == VCMDQ_ENABLED, + 1, ARM_SMMU_POLL_TIMEOUT_US); + if (ret) { + u32 gerror = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_GERROR); + u32 gerrorn = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_GERRORN); + u32 cons = readl_relaxed(vcmdq_base + NVIDIA_VCMDQ_CONS); + + dev_err(nsmmu->cmdqv_dev, + "failed to enable VCMDQ[%u]: GERROR=0x%X, GERRORN=0x%X, CONS=0x%X\n", + idx, gerror, gerrorn, cons); + return ret; + } + + dev_info(nsmmu->cmdqv_dev, "VCMDQ%u allocated to VINTF%u as CMDQ%u\n", + idx, vintf0->idx, idx); + } + + return 0; +} + +static int nvidia_smmu_probe(struct nvidia_smmu *nsmmu) +{ + struct platform_device *cmdqv_pdev = to_platform_device(nsmmu->cmdqv_dev); + struct resource *res; + u32 regval; + + /* Base address */ + res = platform_get_resource(cmdqv_pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENXIO; + + nsmmu->cmdqv_base = devm_ioremap_resource(nsmmu->cmdqv_dev, res); + if (IS_ERR(nsmmu->cmdqv_base)) + return PTR_ERR(nsmmu->cmdqv_base); + + /* Interrupt */ + nsmmu->cmdqv_irq = platform_get_irq(cmdqv_pdev, 0); + if (nsmmu->cmdqv_irq < 0) { + dev_warn(nsmmu->cmdqv_dev, "no cmdqv interrupt - errors will not be reported\n"); + nsmmu->cmdqv_irq = 0; + } + + /* Probe the h/w */ + regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_CONFIG); + if (!FIELD_GET(CMDQV_EN, regval)) { + dev_err(nsmmu->cmdqv_dev, "CMDQV h/w is disabled: CMDQV_CONFIG=0x%08X\n", regval); + return -ENODEV; + } + + regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_STATUS); + if (!FIELD_GET(CMDQV_ENABLED, regval) || FIELD_GET(CMDQV_STATUS, regval)) { + dev_err(nsmmu->cmdqv_dev, "CMDQV h/w not ready: CMDQV_STATUS=0x%08X\n", regval); + return -ENODEV; + } + + regval = readl_relaxed(nsmmu->cmdqv_base + NVIDIA_CMDQV_PARAM); + nsmmu->num_total_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval); + nsmmu->num_total_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval); + nsmmu->num_vcmdqs_per_vintf = nsmmu->num_total_vcmdqs / nsmmu->num_total_vintfs; + + return 0; +} + +static int nvidia_smmu_issue_cmdlist(struct arm_smmu_device *smmu, u64 *cmds, int n, bool sync) +{ + struct nvidia_smmu *nsmmu = (struct nvidia_smmu *)smmu; + struct nvidia_smmu_vintf *vintf0 = &nsmmu->vintf0; + struct arm_smmu_cmdq *cmdq = &smmu->cmdq; + u16 idx; + + /* Make sure vintf0 is enabled and healthy */ + if (vintf0->status != VINTF_ENABLED) + goto issue_cmdlist; + + /* Check for illegal CMDs */ + if (!FIELD_GET(VINTF_HYP_OWN, vintf0->cfg)) { + u64 opcode = (n) ? FIELD_GET(CMDQ_0_OP, cmds[0]) : CMDQ_OP_CMD_SYNC; + + switch (opcode) { + case CMDQ_OP_TLBI_NH_ASID: + case CMDQ_OP_TLBI_NH_VA: + case CMDQ_OP_TLBI_S12_VMALL: + case CMDQ_OP_TLBI_S2_IPA: + case CMDQ_OP_ATC_INV: + break; + default: + goto issue_cmdlist; + } + } + + /* + * Select a vcmdq to use. 
Here we use a temporary solution to + balance out traffic on cmdq issuing: each cmdq has its own + lock, so if all CPUs issue cmdlists via the same cmdq, only + one CPU at a time can make progress while the others + spin on the same lock. + */ + idx = smp_processor_id() % nsmmu->num_vcmdqs_per_vintf; + cmdq = &vintf0->vcmdqs[idx]; + +issue_cmdlist: + return arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, sync, cmdq); +} + +static int nvidia_smmu_device_reset(struct arm_smmu_device *smmu) +{ + struct nvidia_smmu *nsmmu = (struct nvidia_smmu *)smmu; + int ret; + + ret = nvidia_smmu_cmdqv_init(nsmmu); + if (ret) + return ret; + + if (nsmmu->cmdqv_irq) { + ret = devm_request_irq(nsmmu->cmdqv_dev, nsmmu->cmdqv_irq, nvidia_smmu_cmdqv_isr, + IRQF_SHARED, "nvidia-smmu-cmdqv", nsmmu); + if (ret) { + dev_err(nsmmu->cmdqv_dev, "failed to claim irq (%d): %d\n", + nsmmu->cmdqv_irq, ret); + return ret; + } + } + + /* Disable FEAT_MSI and OPT_MSIPOLL since VCMDQs only support CMD_SYNC w/CS_NONE */ + smmu->features &= ~ARM_SMMU_FEAT_MSI; + smmu->options &= ~ARM_SMMU_OPT_MSIPOLL; + + return 0; +} + +const struct arm_smmu_impl nvidia_smmu_impl = { + .device_reset = nvidia_smmu_device_reset, + .issue_cmdlist = nvidia_smmu_issue_cmdlist, +}; + +#ifdef CONFIG_ACPI +struct nvidia_smmu *nvidia_smmu_create(struct arm_smmu_device *smmu) +{ + struct nvidia_smmu *nsmmu = NULL; + struct acpi_iort_node *node; + struct acpi_device *adev; + struct device *cmdqv_dev; + const char *match_uid; + + if (acpi_disabled) + return NULL; + + /* Look for a device in the DSDT whose _UID matches the SMMU's iort_node identifier */ + node = *(struct acpi_iort_node **)dev_get_platdata(smmu->dev); + match_uid = kasprintf(GFP_KERNEL, "%u", node->identifier); + adev = acpi_dev_get_first_match_dev(NVIDIA_SMMU_CMDQV_HID, match_uid, -1); + kfree(match_uid); + + if (!adev) + return NULL; + + cmdqv_dev = bus_find_device_by_acpi_dev(&platform_bus_type, adev); + if (!cmdqv_dev) + return NULL; + + dev_info(smmu->dev, "found companion CMDQV device, %s", dev_name(cmdqv_dev)); + + nsmmu = devm_krealloc(smmu->dev, smmu, sizeof(*nsmmu), GFP_KERNEL); + if (!nsmmu) + return ERR_PTR(-ENOMEM); + + nsmmu->cmdqv_dev = cmdqv_dev; + + return nsmmu; +} +#else +struct nvidia_smmu *nvidia_smmu_create(struct arm_smmu_device *smmu) +{ + return NULL; +} +#endif + +struct arm_smmu_device *nvidia_smmu_v3_impl_init(struct arm_smmu_device *smmu) +{ + struct nvidia_smmu *nsmmu; + int ret; + + nsmmu = nvidia_smmu_create(smmu); + if (!nsmmu) + return smmu; + + ret = nvidia_smmu_probe(nsmmu); + if (ret) + return ERR_PTR(ret); + + nsmmu->smmu.impl = &nvidia_smmu_impl; + + return &nsmmu->smmu; +} -- 2.17.1
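
As an aside for readers new to the "implementation infrastructure" pattern this series borrows from arm-smmu, below is a minimal sketch (not part of the patch) of how another vendor hook could plug into the new arm_smmu_impl ops. The example_* names are hypothetical; only struct arm_smmu_impl, arm_smmu_cmdq_issue_cmdlist(), arm_smmu_v3_impl_init() and smmu->cmdq/smmu->impl come from this patch.

/* Illustrative sketch only -- all example_* symbols are hypothetical. */
#include "arm-smmu-v3.h"

static int example_smmu_device_reset(struct arm_smmu_device *smmu)
{
	/* Vendor-specific bring-up, run at the end of arm_smmu_device_reset() */
	return 0;
}

static int example_smmu_issue_cmdlist(struct arm_smmu_device *smmu,
				      u64 *cmds, int n, bool sync)
{
	/*
	 * Pick a command queue (here simply the architected SMMU_CMDQ) and
	 * reuse the core insertion helper exposed by this patch.
	 */
	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds, n, sync, &smmu->cmdq);
}

static const struct arm_smmu_impl example_smmu_impl = {
	.device_reset	= example_smmu_device_reset,
	.issue_cmdlist	= example_smmu_issue_cmdlist,
};

/* Would be chained from arm_smmu_v3_impl_init(), like nvidia_smmu_v3_impl_init() */
struct arm_smmu_device *example_smmu_v3_impl_init(struct arm_smmu_device *smmu)
{
	smmu->impl = &example_smmu_impl;
	return smmu;
}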