Re: [PATCH 3/3] drm/i915/hwmon: Block waiting for GuC reset to complete

Rodrigo Vivi <rodrigo.vivi@xxxxxxxxx> · Tue, 18 Apr 2023 01:35:58 -0400

On Mon, Apr 10, 2023 at 03:35:09PM -0700, Ashutosh Dixit wrote:
> Instead of erroring out when GuC reset is in progress, block waiting for
> GuC reset to complete which is a more reasonable uapi behavior.
> 
> v2: Avoid race between wake_up_all and waiting for wakeup (Rodrigo)
> 
> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_hwmon.c | 38 +++++++++++++++++++++++++++----
>  1 file changed, 33 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_hwmon.c b/drivers/gpu/drm/i915/i915_hwmon.c
> index 9ab8971679fe3..8471a667dfc71 100644
> --- a/drivers/gpu/drm/i915/i915_hwmon.c
> +++ b/drivers/gpu/drm/i915/i915_hwmon.c
> @@ -51,6 +51,7 @@ struct hwm_drvdata {
>  	char name[12];
>  	int gt_n;
>  	bool reset_in_progress;
> +	wait_queue_head_t waitq;
>  };
>  
>  struct i915_hwmon {
> @@ -395,16 +396,41 @@ hwm_power_max_read(struct hwm_drvdata *ddat, long *val)
>  static int
>  hwm_power_max_write(struct hwm_drvdata *ddat, long val)
>  {
> +#define GUC_RESET_TIMEOUT msecs_to_jiffies(2000)
> +
> +	int ret = 0, timeout = GUC_RESET_TIMEOUT;
>  	struct i915_hwmon *hwmon = ddat->hwmon;
>  	intel_wakeref_t wakeref;
> -	int ret = 0;
> +	DEFINE_WAIT(wait);
>  	u32 nval;
>  
> -	mutex_lock(&hwmon->hwmon_lock);
> -	if (hwmon->ddat.reset_in_progress) {
> -		ret = -EAGAIN;
> -		goto unlock;
> +	/* Block waiting for GuC reset to complete when needed */
> +	for (;;) {
> +		mutex_lock(&hwmon->hwmon_lock);

I'm really worried about how this mutex is handled together with the wait
queue. At first glance, it looks like it is trying to reimplement ww_mutex?

All the other examples of wait_queue usage like this that I found either
didn't use locks or had them in a totally different flow that I could not
correlate with this one.

> +
> +		prepare_to_wait(&ddat->waitq, &wait, TASK_INTERRUPTIBLE);
> +
> +		if (!hwmon->ddat.reset_in_progress)
> +			break;

If this breaks we never unlock it?

> +
> +		if (signal_pending(current)) {
> +			ret = -EINTR;
> +			break;
> +		}
> +
> +		if (!timeout) {
> +			ret = -ETIME;
> +			break;
> +		}
> +
> +		mutex_unlock(&hwmon->hwmon_lock);

Do we need to hold the lock around the signal_pending and timeout checks
as well? Or would wrapping it only around the hwmon->ddat access be
enough?

> +
> +		timeout = schedule_timeout(timeout);
>  	}
> +	finish_wait(&ddat->waitq, &wait);
> +	if (ret)
> +		goto unlock;
> +
>  	wakeref = intel_runtime_pm_get(ddat->uncore->rpm);
>  
>  	/* Disable PL1 limit and verify, because the limit cannot be disabled on all platforms */
> @@ -508,6 +534,7 @@ void i915_hwmon_power_max_restore(struct drm_i915_private *i915, bool old)
>  	intel_uncore_rmw(hwmon->ddat.uncore, hwmon->rg.pkg_rapl_limit,
>  			 PKG_PWR_LIM_1_EN, old ? PKG_PWR_LIM_1_EN : 0);
>  	hwmon->ddat.reset_in_progress = false;
> +	wake_up_all(&hwmon->ddat.waitq);
>  
>  	mutex_unlock(&hwmon->hwmon_lock);
>  }
> @@ -784,6 +811,7 @@ void i915_hwmon_register(struct drm_i915_private *i915)
>  	ddat->uncore = &i915->uncore;
>  	snprintf(ddat->name, sizeof(ddat->name), "i915");
>  	ddat->gt_n = -1;
> +	init_waitqueue_head(&ddat->waitq);
>  
>  	for_each_gt(gt, i915, i) {
>  		ddat_gt = hwmon->ddat_gt + i;
> -- 
> 2.38.0
>