On Mon, Apr 29, 2024 at 09:43:49PM -0700, Nicolin Chen wrote:

> -struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
> +static bool tegra241_vintf_support_cmds(struct tegra241_vintf *vintf,
> +					u64 *cmds, int n)
> +{
> +	int i;
> +
> +	/* VINTF owned by hypervisor can execute any command */
> +	if (vintf->hyp_own)
> +		return true;
> +
> +	/* Guest-owned VINTF must Check against the list of supported CMDs */
> +	for (i = 0; i < n; i++) {
> +		switch (FIELD_GET(CMDQ_0_OP, cmds[i * CMDQ_ENT_DWORDS])) {
> +		case CMDQ_OP_TLBI_NH_ASID:
> +		case CMDQ_OP_TLBI_NH_VA:
> +		case CMDQ_OP_ATC_INV:

So CMDQ only works if not ARM_SMMU_FEAT_E2H? Probably worth mentioning
that too along with the discussion about HYP.

> +			continue;
> +		default:
> +			return false;
> +		}
> +	}
> +
> +	return true;
> +}

For a performance path this looping seems disappointing.. The callers
don't actually mix different command types. Is there something
preventing adding a parameter at the callers?

Actually, looking at this more closely, isn't the command queue
selection in the wrong place? Ie this batch stuff:

static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
				    struct arm_smmu_cmdq_batch *cmds,
				    struct arm_smmu_cmdq_ent *cmd)
{
	int index;

	if (cmds->num == CMDQ_BATCH_ENTRIES - 1 &&
	    (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) {
		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
		cmds->num = 0;
	}

	if (cmds->num == CMDQ_BATCH_ENTRIES) {
		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
		cmds->num = 0;
	}

	index = cmds->num * CMDQ_ENT_DWORDS;
	if (unlikely(arm_smmu_cmdq_build_cmd(&cmds->cmds[index], cmd))) {
		dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
			 cmd->opcode);
		return;
	}

has to push everything, across all the iterations of add/submit, onto
the same CMDQ, otherwise the SYNC won't properly flush everything?

But each arm_smmu_cmdq_issue_cmdlist() calls its own get-queue function.
Yes, they probably return the same Q since we are probably on the same
CPU, but it seems logically wrong (and slower!) to organize it like
this.

I would expect the Q to be selected when the struct arm_smmu_cmdq_batch
is allocated on the stack, and be the same for the entire batch
operation. Not only do we spend less time trying to compute the Q to
use, we have a built-in guarantee that every command will be on the same
Q as the fencing SYNC.

Something sort of like this as another patch?
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 268da20baa4e9c..d8c9597878315a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -357,11 +357,22 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
 	return 0;
 }
 
-static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu,
-					       u64 *cmds, int n)
+enum required_cmds {
+	CMDS_ALL,
+	/*
+	 * Commands will be one of:
+	 * CMDQ_OP_ATC_INV, CMDQ_OP_TLBI_EL2_VA, CMDQ_OP_TLBI_NH_VA,
+	 * CMDQ_OP_TLBI_EL2_ASID, CMDQ_OP_TLBI_NH_ASID, CMDQ_OP_TLBI_S2_IPA,
+	 * CMDQ_OP_TLBI_S12_VMALL, CMDQ_OP_SYNC
+	 */
+	CMDS_INVALIDATION,
+};
+
+static struct arm_smmu_cmdq *
+arm_smmu_get_cmdq(struct arm_smmu_device *smmu, enum required_cmds required)
 {
 	if (smmu->tegra241_cmdqv)
-		return tegra241_cmdqv_get_cmdq(smmu, cmds, n);
+		return tegra241_cmdqv_get_cmdq(smmu, required);
 
 	return &smmu->cmdq;
 }
@@ -766,13 +777,13 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds,
  * CPU will appear before any of the commands from the other CPU.
  */
 static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
-				       u64 *cmds, int n, bool sync)
+				       struct arm_smmu_cmdq *cmdq, u64 *cmds,
+				       int n, bool sync)
 {
 	u64 cmd_sync[CMDQ_ENT_DWORDS];
 	u32 prod;
 	unsigned long flags;
 	bool owner;
-	struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu, cmds, n);
 	struct arm_smmu_ll_queue llq, head;
 	int ret = 0;
 
@@ -897,7 +908,8 @@ static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 		return -EINVAL;
 	}
 
-	return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, sync);
+	return arm_smmu_cmdq_issue_cmdlist(
+		smmu, arm_smmu_get_cmdq(smmu, CMDS_ALL), cmd, 1, sync);
 }
 
 static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
@@ -912,6 +924,14 @@ static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu,
 	return __arm_smmu_cmdq_issue_cmd(smmu, ent, true);
 }
 
+static void arm_smmu_cmdq_batch_init(struct arm_smmu_device *smmu,
+				     struct arm_smmu_cmdq_batch *cmds,
+				     enum required_cmds required)
+{
+	cmds->num = 0;
+	cmds->q = arm_smmu_get_cmdq(smmu, required);
+}
+
 static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 				    struct arm_smmu_cmdq_batch *cmds,
 				    struct arm_smmu_cmdq_ent *cmd)
@@ -920,12 +940,14 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 
 	if (cmds->num == CMDQ_BATCH_ENTRIES - 1 &&
 	    (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) {
-		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->q, cmds->cmds,
+					    cmds->num, true);
 		cmds->num = 0;
 	}
 
 	if (cmds->num == CMDQ_BATCH_ENTRIES) {
-		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
+		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->q, cmds->cmds,
+					    cmds->num, false);
 		cmds->num = 0;
 	}
 
@@ -942,7 +964,8 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
 				      struct arm_smmu_cmdq_batch *cmds)
 {
-	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
+	return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->q, cmds->cmds, cmds->num,
+					   true);
 }
 
 static void arm_smmu_page_response(struct device *dev, struct iopf_fault *unused,
@@ -1181,7 +1204,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master,
 		},
 	};
 
-	cmds.num = 0;
+	arm_smmu_cmdq_batch_init(smmu, &cmds, CMDS_ALL);
 	for (i = 0; i < master->num_streams; i++) {
 		cmd.cfgi.sid = master->streams[i].id;
 		arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
@@ -2045,7 +2068,7 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
 
 	arm_smmu_atc_inv_to_cmd(ssid, 0, 0, &cmd);
 
-	cmds.num = 0;
+	arm_smmu_cmdq_batch_init(master->smmu, &cmds, CMDS_INVALIDATION);
 	for (i = 0; i < master->num_streams; i++) {
 		cmd.atc.sid = master->streams[i].id;
 		arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd);
@@ -2083,7 +2106,7 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
 	if (!atomic_read(&smmu_domain->nr_ats_masters))
 		return 0;
 
-	cmds.num = 0;
+	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, CMDS_INVALIDATION);
 
 	spin_lock_irqsave(&smmu_domain->devices_lock, flags);
 	list_for_each_entry(master_domain, &smmu_domain->devices,
@@ -2161,7 +2184,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
 			num_pages++;
 	}
 
-	cmds.num = 0;
+	arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, CMDS_INVALIDATION);
 
 	while (iova < end) {
 		if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 9412fa4ff5e045..5651ea2541a0a2 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -576,6 +576,7 @@ struct arm_smmu_cmdq {
 
 struct arm_smmu_cmdq_batch {
 	u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
+	struct arm_smmu_cmdq *q;
 	int num;
 };
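
With that, the tegra241 side wouldn't need to walk the command list at
all. A rough, untested sketch of what the check could collapse to,
assuming enum required_cmds from the diff above is made visible to the
tegra241 file (and still leaving the E2H question aside):

static bool tegra241_vintf_support_cmds(struct tegra241_vintf *vintf,
					enum required_cmds required)
{
	/* VINTF owned by hypervisor can execute any command */
	if (vintf->hyp_own)
		return true;

	/*
	 * A guest-owned VINTF only takes the commands the callers mark as
	 * CMDS_INVALIDATION; everything else falls back to the main SMMU
	 * CMDQ via arm_smmu_get_cmdq().
	 */
	return required == CMDS_INVALIDATION;
}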