Re: [PATCH v3] drm/amdgpu/gfx9.4.3: Implement compute pipe reset

"Lazar, Lijo" <lijo.lazar@xxxxxxx> · Tue, 27 Aug 2024 13:31:54 +0530

On 8/22/2024 3:08 PM, Prike Liang wrote:
> Implement the compute pipe reset; the driver will
> fall back to pipe reset when the queue reset fails.
> 
> Signed-off-by: Prike Liang <Prike.Liang@xxxxxxx>
> ---
> v3: Use the dev log and filter out the gfx9.4.4 pipe reset support.
> v2: Convert the GC logical instance to the physical instance when
>     accessing registers, and use the dev_* prints to identify
>     the failed device.
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   5 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 154 +++++++++++++++++++++---
>  2 files changed, 139 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index e28c1ebfa98f..d4d74ba2bc27 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -143,6 +143,11 @@ struct kiq_pm4_funcs {
>  				   uint32_t queue_type, uint32_t me_id,
>  				   uint32_t pipe_id, uint32_t queue_id,
>  				   uint32_t xcc_id, uint32_t vmid);
> +	int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring,
> +				   uint32_t queue_type, uint32_t me,
> +				   uint32_t pipe, uint32_t queue,
> +				   uint32_t xcc_id);
> +
>  	/* Packet sizes */
>  	int set_resources_size;
>  	int map_queues_size;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 2067f26d3a9d..aa0c76eed452 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev,
>  				struct amdgpu_cu_info *cu_info);
>  static void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev, int xcc_id);
>  static void gfx_v9_4_3_xcc_unset_safe_mode(struct amdgpu_device *adev, int xcc_id);
> +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> +					uint32_t queue_type, uint32_t me,
> +					uint32_t pipe, uint32_t queue,
> +					uint32_t xcc_id);
>  
>  static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring,
>  				uint64_t queue_mask)
> @@ -323,6 +327,7 @@ static const struct kiq_pm4_funcs gfx_v9_4_3_kiq_pm4_funcs = {
>  	.kiq_query_status = gfx_v9_4_3_kiq_query_status,
>  	.kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs,
>  	.kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue,
> +	.kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe,
>  	.set_resources_size = 8,
>  	.map_queues_size = 7,
>  	.unmap_queues_size = 6,
> @@ -3466,6 +3471,116 @@ static void gfx_v9_4_3_emit_wave_limit(struct amdgpu_ring *ring, bool enable)
>  	}
>  }
>  
> +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t me,
> +				uint32_t pipe, uint32_t queue,
> +				uint32_t xcc_id)
> +{
> +	int i, r;
> +	/* make sure dequeue is complete*/
> +	gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> +	mutex_lock(&adev->srbm_mutex);
> +	soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> +	for (i = 0; i < adev->usec_timeout; i++) {
> +		if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) & 1))
> +			break;
> +		udelay(1);
> +	}
> +	if (i >= adev->usec_timeout)
> +		r = -ETIMEDOUT;
> +	else
> +		r = 0;
> +	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> +	mutex_unlock(&adev->srbm_mutex);
> +	gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> +
> +	return r;
> +
> +}
> +
> +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev)
> +{
> +	/*TODO: Need check gfx9.4.4 mec fw whether supports pipe reset as well.*/
> +	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) &&
> +			adev->gfx.mec_fw_version >= 0x0000009b)
> +		return true;
> +	else
> +		dev_warn_once(adev->dev, "Please use the latest MEC version to see whether support pipe reset\n");
> +
> +	return false;
> +}
> +
> +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> +					uint32_t queue_type, uint32_t me,
> +					uint32_t pipe, uint32_t queue,
> +					uint32_t xcc_id)
> +{
> +	struct amdgpu_device *adev = kiq_ring->adev;
> +	uint32_t reset_pipe, clean_pipe;
> +	int r;
> +
> +	if (!gfx_v9_4_3_pipe_reset_support(adev))
> +		return -EINVAL;
> +
> +	gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> +	mutex_lock(&adev->srbm_mutex);
> +	soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));

CP_MEC_CNTL is not a per-queue register, so I'm not sure the
soc15_grbm_select() call makes sense here. Taking the mutex may still be
useful so that resets for other queues in the same XCC don't run in parallel.

> +
> +	reset_pipe = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL);
> +	clean_pipe = reset_pipe;

I think the saved value could be written back (skipping the SET_FIELD
steps below) as the only change that's done here is to set the
PIPEx_RESET field.

Apart from those, looks fine.

Thanks,
Lijo

> +
> +	if (me == 1) {
> +		switch (pipe) {
> +		case 0:
> +			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE0_RESET, 1);
> +			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE0_RESET, 0);
> +			break;
> +		case 1:
> +			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE1_RESET, 1);
> +			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE1_RESET, 0);
> +			break;
> +		case 2:
> +			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE2_RESET, 1);
> +			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE2_RESET, 0);
> +			break;
> +		case 3:
> +			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE3_RESET, 1);
> +			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE3_RESET, 0);
> +			break;
> +		default:
> +			break;
> +		}
> +	} else {
> +		if (pipe) {
> +			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +						   MEC_ME2_PIPE1_RESET, 1);
> +			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> +						   MEC_ME2_PIPE1_RESET, 0);
> +		} else {
> +			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +						   MEC_ME2_PIPE0_RESET, 1);
> +			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> +						   MEC_ME2_PIPE0_RESET, 0);
> +		}
> +	}
> +
> +	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, reset_pipe);
> +	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, clean_pipe);
> +	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> +	mutex_unlock(&adev->srbm_mutex);
> +	gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> +
> +	r = gfx_v9_4_3_unmap_done(adev, me, pipe, queue, xcc_id);
> +	return r;
> +}
> +
>  static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
>  				unsigned int vmid)
>  {
> @@ -3473,7 +3588,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
>  	struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
>  	struct amdgpu_ring *kiq_ring = &kiq->ring;
>  	unsigned long flags;
> -	int r, i;
> +	int r;
>  
>  	if (amdgpu_sriov_vf(adev))
>  		return -EINVAL;
> @@ -3495,26 +3610,25 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
>  	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>  
>  	r = amdgpu_ring_test_ring(kiq_ring);
> -	if (r)
> -		return r;
> -
> -	/* make sure dequeue is complete*/
> -	amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id);
> -	mutex_lock(&adev->srbm_mutex);
> -	soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, ring->xcc_id));
> -	for (i = 0; i < adev->usec_timeout; i++) {
> -		if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
> -			break;
> -		udelay(1);
> -	}
> -	if (i >= adev->usec_timeout)
> -		r = -ETIMEDOUT;
> -	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id));
> -	mutex_unlock(&adev->srbm_mutex);
> -	amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id);
>  	if (r) {
> -		dev_err(adev->dev, "fail to wait on hqd deactive\n");
> -		return r;
> +		dev_err(adev->dev, "kiq ring test failed after ring: %s queue reset\n",
> +				ring->name);
> +		goto pipe_reset;
> +	}
> +
> +	r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue, ring->xcc_id);
> +	if (r)
> +		dev_err(adev->dev, "fail to wait on hqd deactive and will try pipe reset\n");
> +
> +pipe_reset:
> +	if(r) {
> +		r = gfx_v9_4_3_kiq_reset_hw_pipe(kiq_ring, ring->funcs->type,
> +						ring->me, ring->pipe,
> +						ring->queue, ring->xcc_id);
> +		dev_info(adev->dev, "ring: %s pipe reset :%s\n", ring->name,
> +				r ? "failed" : "successfully");
> +		if (r)
> +			return r;
>  	}
>  
>  	r = amdgpu_bo_reserve(ring->mqd_obj, false);