Re: [RFC PATCH v8 08/10] ACPI:RAS2: Add ACPI RAS2 driver

Daniel Ferguson <danielf@xxxxxxxxxxxxxxxxxxxxxx> · Wed, 5 Jun 2024 14:32:55 -0700

> +static int ras2_check_pcc_chan(struct ras2_pcc_subspace *pcc_subspace)
> +{
> +	struct acpi_ras2_shared_memory __iomem *generic_comm_base =pcc_subspace->pcc_comm_addr;
> +	ktime_t next_deadline = ktime_add(ktime_get(), pcc_subspace->deadline);
> +	u16 status;
> +
> +	while (!ktime_after(ktime_get(), next_deadline)) {
> +		/*
> +		 * As per ACPI spec, the PCC space will be initialized by
> +		 * platform and should have set the command completion bit when
> +		 * PCC can be used by OSPM
> +		 */
> +		status = readw_relaxed(&generic_comm_base->status);
> +		if (status & RAS2_PCC_CMD_ERROR)
> +			return -EIO;

We need to clear the error bit before reporting an error.
Maybe the error was transient, or specific to the last transaction.
So clearing it here, lets us try again later. Like This:
		if (status & RAS2_PCC_CMD_ERROR) {
			status &= ~RAS2_PCC_CMD_ERROR;
			writew_relaxed(status, &generic_comm_base->status);
			return -EIO;
		}

Also, we are thinking that using the "Set RAS Capability Status" as a
way to communicate RAS2 Scrub specific error conditions is appropriate.
If we agree on that idea, then we can additionally check the
set_capabilities_status and return an appropriate error..
for example, we can add a new error mapping function:
static int report_cap_error(u32 cap_status)
{
	switch (cap_status) {
		case 1:  /* Not Valid */
		case 2:  /* Not Supported */
			return -EPERM;
		case 3:  /* Busy */
			return -EBUSY;
		case 4:  /* FailedF */
		case 5:  /* Aborted */
		case 6:  /* Invalid Data */
			return -EINVAL;
		default: /* 0 or other, Success */
			return 0;
	}
}

and then instead, modify ras2_check_pcc_chan in this way:
		if (status & RAS2_PCC_CMD_ERROR) {
			cap_status = readw_relaxed(&generic_comm_base->set_capabilities_status);
			ret = report_cap_error(cap_status);

			status &= ~RAS2_PCC_CMD_ERROR;
			writew_relaxed(status, &generic_comm_base->status);
			return ret;
		}

> +		if (status & RAS2_PCC_CMD_COMPLETE)
> +			return 0;
> +		/*
> +		 * Reducing the bus traffic in case this loop takes longer than
> +		 * a few retries.
> +		 */
> +		msleep(10);
> +	}
> +
> +	return -EIO;
> +}
> +
> +/**
> + * ras2_send_pcc_cmd() - Send RAS2 command via PCC channel
> + * @ras2_ctx:	pointer to the ras2 context structure
> + * @cmd:	command to send
> + *
> + * Returns: 0 on success, an error otherwise
> + */
> +int ras2_send_pcc_cmd(struct ras2_scrub_ctx *ras2_ctx, u16 cmd)
> +{
> +	struct ras2_pcc_subspace *pcc_subspace = ras2_ctx->pcc_subspace;
> +	struct acpi_ras2_shared_memory *generic_comm_base = pcc_subspace->pcc_comm_addr;
> +	static ktime_t last_cmd_cmpl_time, last_mpar_reset;
> +	struct mbox_chan *pcc_channel;
> +	unsigned int time_delta;
> +	static int mpar_count;
> +	int ret;
> +
> +	guard(mutex)(&ras2_pcc_subspace_lock);
> +	ret = ras2_check_pcc_chan(pcc_subspace);
> +	if (ret)
> +		return ret;
> +	pcc_channel = pcc_subspace->pcc_chan->mchan;
> +
> +	/*
> +	 * Handle the Minimum Request Turnaround Time(MRTT)
> +	 * "The minimum amount of time that OSPM must wait after the completion
> +	 * of a command before issuing the next command, in microseconds"
> +	 */
> +	if (pcc_subspace->pcc_mrtt) {
> +		time_delta = ktime_us_delta(ktime_get(), last_cmd_cmpl_time);
> +		if (pcc_subspace->pcc_mrtt > time_delta)
> +			udelay(pcc_subspace->pcc_mrtt - time_delta);
> +	}
> +
> +	/*
> +	 * Handle the non-zero Maximum Periodic Access Rate(MPAR)
> +	 * "The maximum number of periodic requests that the subspace channel can
> +	 * support, reported in commands per minute. 0 indicates no limitation."
> +	 *
> +	 * This parameter should be ideally zero or large enough so that it can
> +	 * handle maximum number of requests that all the cores in the system can
> +	 * collectively generate. If it is not, we will follow the spec and just
> +	 * not send the request to the platform after hitting the MPAR limit in
> +	 * any 60s window
> +	 */
> +	if (pcc_subspace->pcc_mpar) {
> +		if (mpar_count == 0) {
> +			time_delta = ktime_ms_delta(ktime_get(), last_mpar_reset);
> +			if (time_delta < 60 * MSEC_PER_SEC) {
> +				dev_dbg(ras2_ctx->dev,
> +					"PCC cmd not sent due to MPAR limit");
> +				return -EIO;
> +			}
> +			last_mpar_reset = ktime_get();
> +			mpar_count = pcc_subspace->pcc_mpar;
> +		}
> +		mpar_count--;
> +	}
> +
> +	/* Write to the shared comm region. */
> +	writew_relaxed(cmd, &generic_comm_base->command);
> +
> +	/* Flip CMD COMPLETE bit */
> +	writew_relaxed(0, &generic_comm_base->status);
> +
> +	/* Ring doorbell */
> +	ret = mbox_send_message(pcc_channel, &cmd);
> +	if (ret < 0) {
> +		dev_err(ras2_ctx->dev,
> +			"Err sending PCC mbox message. cmd:%d, ret:%d\n",
> +			cmd, ret);
> +		return ret;
> +	}
> +
> +	/*
> +	 * If Minimum Request Turnaround Time is non-zero, we need
> +	 * to record the completion time of both READ and WRITE
> +	 * command for proper handling of MRTT, so we need to check
> +	 * for pcc_mrtt in addition to CMD_READ
> +	 */
> +	if (cmd == RAS2_PCC_CMD_EXEC || pcc_subspace->pcc_mrtt) {
> +		ret = ras2_check_pcc_chan(pcc_subspace);
> +		if (pcc_subspace->pcc_mrtt)
> +			last_cmd_cmpl_time = ktime_get();
> +	}
> +
> +	if (pcc_channel->mbox->txdone_irq)
> +		mbox_chan_txdone(pcc_channel, ret);
> +	else
> +		mbox_client_txdone(pcc_channel, ret);
> +
> +	return 0;

As of now, this code doesn't consider errors occurring after the
mbox_send_message.
Checking errors is, probably, necessary to take appropriate action - such as
reporting an error to the user.  Also, error or no, it is important to
call the txdone bits.

One simple solution that works is to modify the previous "return 0" to
look like:
	return ret >= 0 ? 0 : ret;

> +}
> +EXPORT_SYMBOL_GPL(ras2_send_pcc_cmd);