Re: [PATCH v6 5/6] iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV

Jason Gunthorpe <jgg@xxxxxxxxxx> · Tue, 30 Apr 2024 13:35:45 -0300

On Mon, Apr 29, 2024 at 09:43:48PM -0700, Nicolin Chen wrote:

> static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu)
> {
> +	if (smmu->tegra241_cmdqv)
> +		return tegra241_cmdqv_get_cmdq(smmu);

Since it is compile time optional it would make some sense to optimize
this (in all the places) too:

if (arm_smmu_has_smmu_tegra241_cmdqv(smmu))
   [..]

static inline bool arm_smmu_has_smmu_tegra241_cmdqv(struct arm_smmu_device *smmu)
{
	return IS_ENABLED(CONFIG_TEGRA241_CMDQV) && smmu->tegra241_cmdqv;
}

> @@ -3105,12 +3108,10 @@ static struct iommu_ops arm_smmu_ops = {
>  };
>  
>  /* Probing and initialisation functions */
> -static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
> -				   struct arm_smmu_queue *q,
> -				   void __iomem *page,
> -				   unsigned long prod_off,
> -				   unsigned long cons_off,
> -				   size_t dwords, const char *name)
> +int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
> +			    struct arm_smmu_queue *q, void __iomem *page,
> +			    unsigned long prod_off, unsigned long cons_off,
> +			    size_t dwords, const char *name)
>  {
>  	size_t qsz;

This hunk and the .h file part should be moved to the prior patch that
is de-exporting things.

> +/* MMIO helpers */
> +#define cmdqv_readl(reg) \
> +	readl(cmdqv->base + TEGRA241_CMDQV_##reg)
> +#define cmdqv_readl_relaxed(reg) \
> +	readl_relaxed(cmdqv->base + TEGRA241_CMDQV_##reg)
> +#define cmdqv_writel(val, reg) \
> +	writel((val), cmdqv->base + TEGRA241_CMDQV_##reg)
> +#define cmdqv_writel_relaxed(val, reg) \
> +	writel_relaxed((val), cmdqv->base + TEGRA241_CMDQV_##reg)

Please don't hide access to a stack variable in a macro, and I'm not
keen on the ##reg scheme either - it makes it much harder to search
for things.

Really this all seems like alot of overkill to make a little bit of
shorthand. It is not so wordy just to type it out:

  readl(vintf->base + TEGRA241_VINTF_CONFIG) 

> +/* Logging helpers */
> +#define cmdqv_warn(fmt, ...) \
> +	dev_warn(cmdqv->dev, "CMDQV: " fmt, ##__VA_ARGS__)
> +#define cmdqv_err(fmt, ...) \
> +	dev_err(cmdqv->dev, "CMDQV: " fmt, ##__VA_ARGS__)
> +#define cmdqv_info(fmt, ...) \
> +	dev_info(cmdqv->dev, "CMDQV: " fmt, ##__VA_ARGS__)
> +#define cmdqv_dbg(fmt, ...) \
> +	dev_dbg(cmdqv->dev, "CMDQV: " fmt, ##__VA_ARGS__)

Really not sure these are necessary, same remark about the stack
variable.

Also cmdqv->dev is the wrong thing to print, this is part of the smmu driver,
it should print cmdqv->smmu->dev for consistency

> +#define vintf_warn(fmt, ...) \
> +	dev_warn(vintf->cmdqv->dev, "VINTF%u: " fmt, vintf->idx, ##__VA_ARGS__)
> +#define vintf_err(fmt, ...) \
> +	dev_err(vintf->cmdqv->dev, "VINTF%u: " fmt, vintf->idx, ##__VA_ARGS__)
> +#define vintf_info(fmt, ...) \
> +	dev_info(vintf->cmdqv->dev, "VINTF%u: " fmt, vintf->idx, ##__VA_ARGS__)
> +#define vintf_dbg(fmt, ...) \
> +	dev_dbg(vintf->cmdqv->dev, "VINTF%u: " fmt, vintf->idx, ##__VA_ARGS__)
> +
> +#define vcmdq_warn(fmt, ...)                                                   \
> +	({                                                                     \
> +		struct tegra241_vintf *vintf = vcmdq->vintf;                   \
> +		if (vintf)                                                     \
> +			vintf_warn("VCMDQ%u/LVCMDQ%u: " fmt,                   \
> +				   vcmdq->idx, vcmdq->lidx,                    \
> +				   ##__VA_ARGS__);                             \
> +		else                                                           \
> +			dev_warn(vcmdq->cmdqv->dev, "VCMDQ%u: " fmt,           \
> +				 vcmdq->idx, ##__VA_ARGS__);                   \
> +	})
> +#define vcmdq_err(fmt, ...)                                                    \
> +	({                                                                     \
> +		struct tegra241_vintf *vintf = vcmdq->vintf;                   \
> +		if (vintf)                                                     \
> +			vintf_err("VCMDQ%u/LVCMDQ%u: " fmt,                    \
> +				  vcmdq->idx, vcmdq->lidx,                     \
> +				  ##__VA_ARGS__);                              \
> +		else                                                           \
> +			dev_err(vcmdq->cmdqv->dev, "VCMDQ%u: " fmt,            \
> +				vcmdq->idx, ##__VA_ARGS__);                    \
> +	})
> +#define vcmdq_info(fmt, ...)                                                   \
> +	({                                                                     \
> +		struct tegra241_vintf *vintf = vcmdq->vintf;                   \
> +		if (vintf)                                                     \
> +			vintf_info("VCMDQ%u/LVCMDQ%u: " fmt,                   \
> +				   vcmdq->idx, vcmdq->lidx,                    \
> +				   ##__VA_ARGS__);                             \
> +		else                                                           \
> +			dev_info(vcmdq->cmdqv->dev, "VCMDQ%u: " fmt,           \
> +				 vcmdq->idx, ##__VA_ARGS__);                   \
> +	})
> +#define vcmdq_dbg(fmt, ...)                                                    \
> +	({                                                                     \
> +		struct tegra241_vintf *vintf = vcmdq->vintf;                   \
> +		if (vintf)                                                     \
> +			vintf_dbg("VCMDQ%u/LVCMDQ%u: " fmt,                    \
> +				  vcmdq->idx, vcmdq->lidx,                     \
> +				  ##__VA_ARGS__);                              \
> +		else                                                           \
> +			dev_dbg(vcmdq->cmdqv->dev, "VCMDQ%u: " fmt,            \
> +				vcmdq->idx, ##__VA_ARGS__);                    \
> +	})

Some of these are barely used, is it worth all these macros??

> +
> +/* Configuring and polling helpers */
> +#define tegra241_cmdqv_write_config(_owner, _OWNER, _regval)                   \
> +	({                                                                     \
> +		bool _en = (_regval) & _OWNER##_EN;                            \
> +		u32 _status;                                                   \
> +		int _ret;                                                      \
> +		writel((_regval), _owner->base + TEGRA241_##_OWNER##_CONFIG);  \
> +		_ret = readl_poll_timeout(                                     \
> +			_owner->base + TEGRA241_##_OWNER##_STATUS, _status,    \
> +			_en ? (_regval) & _OWNER##_ENABLED :                   \
> +			      !((_regval) & _OWNER##_ENABLED),                 \
> +			1, ARM_SMMU_POLL_TIMEOUT_US);                          \
> +		if (_ret)                                                      \
> +			_owner##_err("failed to %sable, STATUS = 0x%08X\n",    \
> +				     _en ? "en" : "dis", _status);             \
> +		atomic_set(&_owner->status, _status);                          \
> +		_ret;                                                          \
> +	})

I feel like this could be an actual inline function without the macro
wrapper with a little fiddling.

> +
> +#define cmdqv_write_config(_regval) \
> +	tegra241_cmdqv_write_config(cmdqv, CMDQV, _regval)
> +#define vintf_write_config(_regval) \
> +	tegra241_cmdqv_write_config(vintf, VINTF, _regval)
> +#define vcmdq_write_config(_regval) \
> +	tegra241_cmdqv_write_config(vcmdq, VCMDQ, _regval)

More hidden access to stack values

> +/**
> + * struct tegra241_cmdqv - CMDQ-V for SMMUv3
> + * @smmu: SMMUv3 pointer
> + * @dev: Device pointer

This should probably be clarified as the device pointer to the ACPI
companion device

> +static void tegra241_cmdqv_handle_vintf0_error(struct tegra241_cmdqv *cmdqv)
> +{
> +	struct tegra241_vintf *vintf = cmdqv->vintfs[0];
> +	int i;
> +
> +	/* Cache status to bypass VCMDQs until error is recovered */
> +	atomic_set(&vintf->status, vintf_readl(STATUS));
> +
> +	for (i = 0; i < 4; i++) {
> +		u32 lvcmdq_err_map = vintf_readl_relaxed(CMDQ_ERR_MAP(i));
> +
> +		while (lvcmdq_err_map) {
> +			int lidx = ffs(lvcmdq_err_map) - 1;
> +			struct tegra241_vcmdq *vcmdq = vintf->vcmdqs[lidx];
> +			u32 gerrorn, gerror;
> +
> +			lvcmdq_err_map &= ~BIT(lidx);
> +
> +			__arm_smmu_cmdq_skip_err(cmdqv->smmu, &vcmdq->cmdq.q);
> +
> +			gerrorn = vcmdq_page0_readl_relaxed(GERRORN);
> +			gerror = vcmdq_page0_readl_relaxed(GERROR);
> +
> +			vcmdq_page0_writel(gerror, GERRORN);
> +		}
> +	}
> +
> +	/* Now error status should be clean, cache it again */
> +	atomic_set(&vintf->status, vintf_readl(STATUS));
> +}
> +
> +static irqreturn_t tegra241_cmdqv_isr(int irq, void *devid)
> +{
> +	struct tegra241_cmdqv *cmdqv = (struct tegra241_cmdqv *)devid;
> +	u32 vintf_errs[2];
> +	u32 vcmdq_errs[4];
> +
> +	vintf_errs[0] = cmdqv_readl_relaxed(VINTF_ERR_MAP);
> +	vintf_errs[1] = cmdqv_readl_relaxed(VINTF_ERR_MAP + 0x4);
> +
> +	vcmdq_errs[0] = cmdqv_readl_relaxed(VCMDQ_ERR_MAP(0));
> +	vcmdq_errs[1] = cmdqv_readl_relaxed(VCMDQ_ERR_MAP(1));
> +	vcmdq_errs[2] = cmdqv_readl_relaxed(VCMDQ_ERR_MAP(2));
> +	vcmdq_errs[3] = cmdqv_readl_relaxed(VCMDQ_ERR_MAP(3));
> +
> +	cmdqv_warn("unexpected cmdqv error reported\n");
> +	cmdqv_warn(" vintf_map: 0x%08X%08X\n", vintf_errs[1], vintf_errs[0]);
> +	cmdqv_warn(" vcmdq_map: 0x%08X%08X%08X%08X\n",
> +		   vcmdq_errs[3], vcmdq_errs[2], vcmdq_errs[1], vcmdq_errs[0]);

Put warnings in one print only, spreading them like this just
increases the risk of tearing.. It doesn't need to be all pretty.

> +struct arm_smmu_cmdq *tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu)
> +{
> +	struct tegra241_cmdqv *cmdqv = smmu->tegra241_cmdqv;
> +	struct tegra241_vintf *vintf = cmdqv->vintfs[0];
> +	struct tegra241_vcmdq *vcmdq;
> +	u16 lidx;
> +
> +	if (bypass_vcmdq)

READ_ONCE 

> +		return &smmu->cmdq;
> +
> +	/* Use SMMU CMDQ if vintfs[0] is uninitialized */
> +	if (!FIELD_GET(VINTF_ENABLED, atomic_read(&vintf->status)))
> +		return &smmu->cmdq;
> +
> +	/* Use SMMU CMDQ if vintfs[0] has error status */
> +	if (FIELD_GET(VINTF_STATUS, atomic_read(&vintf->status)))
> +		return &smmu->cmdq;

Why atomic_read? The unlocked interaction with
tegra241_cmdqv_handle_vintf0_error() doesn't seem especially sane IMHO

> +static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq)
> +{
> +	u32 gerrorn, gerror;
> +
> +	if (vcmdq_write_config(0)) {
> +		vcmdq_err("GERRORN=0x%X\n", vcmdq_page0_readl_relaxed(GERRORN));
> +		vcmdq_err("GERROR=0x%X\n", vcmdq_page0_readl_relaxed(GERROR));
> +		vcmdq_err("CONS=0x%X\n", vcmdq_page0_readl_relaxed(CONS));

Less prints, include a unique message about why this is being
printed..

> +	}
> +	vcmdq_page0_writel_relaxed(0, PROD);
> +	vcmdq_page0_writel_relaxed(0, CONS);
> +	vcmdq_page1_writeq_relaxed(0, BASE);
> +	vcmdq_page1_writeq_relaxed(0, CONS_INDX_BASE);
> +
> +	gerrorn = vcmdq_page0_readl_relaxed(GERRORN);
> +	gerror = vcmdq_page0_readl_relaxed(GERROR);
> +	if (gerror != gerrorn) {
> +		vcmdq_info("Uncleared error detected, resetting\n");
> +		vcmdq_page0_writel(gerror, GERRORN);
> +	}
> +
> +	vcmdq_dbg("deinited\n");
> +}
> +
> +static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq)
> +{
> +	int ret;
> +
> +	/* Configure and enable the vcmdq */
> +	tegra241_vcmdq_hw_deinit(vcmdq);
> +
> +	vcmdq_page1_writeq_relaxed(vcmdq->cmdq.q.q_base, BASE);
> +
> +	ret = vcmdq_write_config(VCMDQ_EN);
> +	if (ret) {
> +		vcmdq_err("GERRORN=0x%X\n", vcmdq_page0_readl_relaxed(GERRORN));
> +		vcmdq_err("GERROR=0x%X\n", vcmdq_page0_readl_relaxed(GERROR));
> +		vcmdq_err("CONS=0x%X\n", vcmdq_page0_readl_relaxed(CONS));
> +		return ret;

Same print?

> +static void tegra241_vcmdq_free_smmu_cmdq(struct tegra241_vcmdq *vcmdq)
> +{
> +	struct tegra241_cmdqv *cmdqv = vcmdq->cmdqv;
> +	struct arm_smmu_queue *q = &vcmdq->cmdq.q;
> +	size_t nents = 1 << q->llq.max_n_shift;
> +
> +	dmam_free_coherent(cmdqv->smmu->dev, (nents * CMDQ_ENT_DWORDS) << 3,
> +			   q->base, q->base_dma);

If we are calling dmam_free, do we really need devm at all?

> +static struct tegra241_vcmdq *
> +tegra241_vintf_lvcmdq_alloc(struct tegra241_vintf *vintf, u16 lidx)
> +{
> +	struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
> +	struct tegra241_vcmdq *vcmdq;
> +	int ret;
> +
> +	vcmdq = devm_kzalloc(cmdqv->dev, sizeof(*vcmdq), GFP_KERNEL);
> +	if (!vcmdq)
> +		return ERR_PTR(-ENOMEM);
> +
> +	ret = tegra241_vintf_lvcmdq_init(vintf, lidx, vcmdq);
> +	if (ret)
> +		goto free_vcmdq;
> +
> +	/* Setup struct arm_smmu_cmdq data members */
> +	ret = tegra241_vcmdq_alloc_smmu_cmdq(vcmdq);
> +	if (ret)
> +		goto deinit_lvcmdq;
> +
> +	ret = tegra241_vcmdq_hw_init(vcmdq);
> +	if (ret)
> +		goto free_queue;
> +
> +	vcmdq_dbg("allocated\n");
> +	return vcmdq;
> +free_queue:
> +	tegra241_vcmdq_free_smmu_cmdq(vcmdq);
> +deinit_lvcmdq:
> +	tegra241_vintf_lvcmdq_deinit(vcmdq);
> +free_vcmdq:
> +	devm_kfree(cmdqv->dev, vcmdq);
> +	return ERR_PTR(ret);
> +}
> +
> +static void tegra241_vintf_lvcmdq_free(struct tegra241_vcmdq *vcmdq)
> +{
> +	tegra241_vcmdq_hw_deinit(vcmdq);
> +	tegra241_vcmdq_free_smmu_cmdq(vcmdq);
> +	tegra241_vintf_lvcmdq_deinit(vcmdq);
> +	devm_kfree(vcmdq->cmdqv->dev, vcmdq);

Ditto for devm_kfree.

> +struct tegra241_cmdqv *
> +tegra241_cmdqv_acpi_probe(struct arm_smmu_device *smmu, int id)

id is a u32. 

It might be clearer to just pass in the struct
acpi_iort_node *?

> +{
> +	struct tegra241_cmdqv *cmdqv;
> +
> +	cmdqv = tegra241_cmdqv_find_resource(smmu, id);
> +	if (!cmdqv)
> +		return NULL;
> +
> +	if (tegra241_cmdqv_probe(cmdqv)) {
> +		if (cmdqv->irq > 0)
> +			devm_free_irq(smmu->dev, cmdqv->irq, cmdqv);
> +		devm_iounmap(smmu->dev, cmdqv->base);
> +		devm_kfree(smmu->dev, cmdqv);
> +		return NULL;

Oh. Please don't use devm at all in this code then, it is not attached
to a probed driver with the proper scope, devm isn't going to work in
sensible way.

Jason