Re: [PATCH v8 3/3] usb: typec: cros_ec_ucsi: Recover from write timeouts

Heikki Krogerus <heikki.krogerus@xxxxxxxxxxxxxxx> · Mon, 2 Dec 2024 16:06:31 +0200

Hi Łukasz,

On Thu, Nov 28, 2024 at 11:20:35PM +0000, Łukasz Bartosik wrote:
> From: Abhishek Pandit-Subedi <abhishekpandit@xxxxxxxxxxxx>
> 
> In a suspend-resume edge case, the OPM is timing out in ucsi_resume and
> the PPM is getting stuck waiting for a command complete ack. Add a write
> timeout recovery task that will get us out of this state.

Sorry, I did not realise this before, but this is in practice a fix
(or workaround) to the driver that you just introduced in the previous
patch. Please merge it into that.

> Signed-off-by: Abhishek Pandit-Subedi <abhishekpandit@xxxxxxxxxxxx>
> Signed-off-by: Łukasz Bartosik <ukaszb@xxxxxxxxxxxx>
> ---
>  drivers/usb/typec/ucsi/cros_ec_ucsi.c | 88 ++++++++++++++++++++++++++-
>  1 file changed, 87 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/usb/typec/ucsi/cros_ec_ucsi.c b/drivers/usb/typec/ucsi/cros_ec_ucsi.c
> index c588d9a57643..6daf61e7e62a 100644
> --- a/drivers/usb/typec/ucsi/cros_ec_ucsi.c
> +++ b/drivers/usb/typec/ucsi/cros_ec_ucsi.c
> @@ -7,6 +7,7 @@
>  
>  #include <linux/container_of.h>
>  #include <linux/dev_printk.h>
> +#include <linux/jiffies.h>
>  #include <linux/mod_devicetable.h>
>  #include <linux/module.h>
>  #include <linux/platform_data/cros_ec_commands.h>
> @@ -29,6 +30,9 @@
>   */
>  #define WRITE_TMO_MS		5000
>  
> +/* Number of times to attempt recovery from a write timeout before giving up. */
> +#define WRITE_TMO_CTR_MAX	5
> +
>  struct cros_ucsi_data {
>  	struct device *dev;
>  	struct ucsi *ucsi;
> @@ -36,6 +40,8 @@ struct cros_ucsi_data {
>  	struct cros_ec_device *ec;
>  	struct notifier_block nb;
>  	struct work_struct work;
> +	struct delayed_work write_tmo;
> +	int tmo_counter;
>  
>  	struct completion complete;
>  	unsigned long flags;
> @@ -99,12 +105,43 @@ static int cros_ucsi_async_control(struct ucsi *ucsi, u64 cmd)
>  	return 0;
>  }
>  
> +static int cros_ucsi_sync_control(struct ucsi *ucsi, u64 cmd)
> +{
> +	struct cros_ucsi_data *udata = ucsi_get_drvdata(ucsi);
> +	int ret;
> +
> +	ret = ucsi_sync_control_common(ucsi, cmd);
> +	if (ret)
> +		goto out;
> +
> +	/* Successful write. Cancel any pending recovery work. */
> +	cancel_delayed_work_sync(&udata->write_tmo);
> +
> +	return 0;
> +out:

This label looks a bit unnecessary to me. How about a switch statement?

	ret = ucsi_sync_control_common(ucsi, cmd);
        switch (ret) {
        case -EBUSY:
        case -ETIMEDOUT:
		cancel_delayed_work_sync(&udata->write_tmo);
		schedule_delayed_work(&udata->write_tmo, msecs_to_jiffies(WRITE_TMO_MS));
                break;
        case 0:
                cancel_delayed_work_sync(&udata->write_tmo);
                break;
        }

        return ret;

> +	/* EC may return -EBUSY if CCI.busy is set. Convert this to a timeout.
> +	 */
> +	if (ret == -EBUSY)
> +		ret = -ETIMEDOUT;
> +
> +	/* Schedule recovery attempt when we timeout or tried to send a command
> +	 * while still busy.
> +	 */
> +	if (ret == -ETIMEDOUT) {
> +		cancel_delayed_work_sync(&udata->write_tmo);
> +		schedule_delayed_work(&udata->write_tmo,
> +				      msecs_to_jiffies(WRITE_TMO_MS));
> +	}
> +
> +	return ret;
> +}
> +
>  struct ucsi_operations cros_ucsi_ops = {
>  	.read_version = cros_ucsi_read_version,
>  	.read_cci = cros_ucsi_read_cci,
>  	.read_message_in = cros_ucsi_read_message_in,
>  	.async_control = cros_ucsi_async_control,
> -	.sync_control = ucsi_sync_control_common,
> +	.sync_control = cros_ucsi_sync_control,
>  };
>  
>  static void cros_ucsi_work(struct work_struct *work)
> @@ -118,6 +155,54 @@ static void cros_ucsi_work(struct work_struct *work)
>  	ucsi_notify_common(udata->ucsi, cci);
>  }
>  
> +static void cros_ucsi_write_timeout(struct work_struct *work)
> +{
> +	struct cros_ucsi_data *udata =
> +		container_of(work, struct cros_ucsi_data, write_tmo.work);
> +	u32 cci;
> +	u64 cmd;
> +
> +	if (cros_ucsi_read(udata->ucsi, UCSI_CCI, &cci, sizeof(cci))) {
> +		dev_err(udata->dev,
> +			"Reading CCI failed; no write timeout recovery possible.");
> +		return;
> +	}
> +
> +	if (cci & UCSI_CCI_BUSY) {
> +		udata->tmo_counter++;
> +
> +		if (udata->tmo_counter <= WRITE_TMO_CTR_MAX)
> +			schedule_delayed_work(&udata->write_tmo,
> +					      msecs_to_jiffies(WRITE_TMO_MS));
> +		else
> +			dev_err(udata->dev,
> +				"PPM unresponsive - too many write timeouts.");
> +
> +		return;
> +	}
> +
> +	/* No longer busy means we can reset our timeout counter. */
> +	udata->tmo_counter = 0;
> +
> +	/* Need to ack previous command which may have timed out. */
> +	if (cci & UCSI_CCI_COMMAND_COMPLETE) {
> +		cmd = UCSI_ACK_CC_CI | UCSI_ACK_COMMAND_COMPLETE;
> +		cros_ucsi_async_control(udata->ucsi, &cmd);
> +
> +		/* Check again after a few seconds that the system has
> +		 * recovered to make sure our async write above was successful.
> +		 */
> +		schedule_delayed_work(&udata->write_tmo,
> +				      msecs_to_jiffies(WRITE_TMO_MS));
> +		return;
> +	}
> +
> +	/* We recovered from a previous timeout. Treat this as a recovery from
> +	 * suspend and call resume.
> +	 */
> +	ucsi_resume(udata->ucsi);
> +}
> +
>  static int cros_ucsi_event(struct notifier_block *nb,
>  			   unsigned long host_event, void *_notify)
>  {
> @@ -162,6 +247,7 @@ static int cros_ucsi_probe(struct platform_device *pdev)
>  	platform_set_drvdata(pdev, udata);
>  
>  	INIT_WORK(&udata->work, cros_ucsi_work);
> +	INIT_DELAYED_WORK(&udata->write_tmo, cros_ucsi_write_timeout);
>  	init_completion(&udata->complete);
>  
>  	udata->ucsi = ucsi_create(dev, &cros_ucsi_ops);

thanks,

-- 
heikki