Re: [PATCH v2] drm/amdgpu/gfx9.4.3: Implement compute pipe reset

"Lazar, Lijo" <lijo.lazar@xxxxxxx> · Tue, 20 Aug 2024 18:12:47 +0530

On 8/20/2024 4:01 PM, Prike Liang wrote:
> Implement the compute pipe reset and driver will
> fallback to pipe reset when queue reset failed.
> 
> Signed-off-by: Prike Liang <Prike.Liang@xxxxxxx>
> ---
> v2: Convert the GC logic instance to physical instance in the
>     register accessing process and 

> use the dev_* print to specify the failed device.

This is not fully done, marked below.

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |   5 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 153 ++++++++++++++++++++----
>  2 files changed, 138 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index e28c1ebfa98f..d4d74ba2bc27 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -143,6 +143,11 @@ struct kiq_pm4_funcs {
>  				   uint32_t queue_type, uint32_t me_id,
>  				   uint32_t pipe_id, uint32_t queue_id,
>  				   uint32_t xcc_id, uint32_t vmid);
> +	int (*kiq_reset_hw_pipe)(struct amdgpu_ring *kiq_ring,
> +				   uint32_t queue_type, uint32_t me,
> +				   uint32_t pipe, uint32_t queue,
> +				   uint32_t xcc_id);
> +
>  	/* Packet sizes */
>  	int set_resources_size;
>  	int map_queues_size;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 2067f26d3a9d..ab9d5adbbfe8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -166,6 +166,10 @@ static int gfx_v9_4_3_get_cu_info(struct amdgpu_device *adev,
>  				struct amdgpu_cu_info *cu_info);
>  static void gfx_v9_4_3_xcc_set_safe_mode(struct amdgpu_device *adev, int xcc_id);
>  static void gfx_v9_4_3_xcc_unset_safe_mode(struct amdgpu_device *adev, int xcc_id);
> +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> +					uint32_t queue_type, uint32_t me,
> +					uint32_t pipe, uint32_t queue,
> +					uint32_t xcc_id);
>  
>  static void gfx_v9_4_3_kiq_set_resources(struct amdgpu_ring *kiq_ring,
>  				uint64_t queue_mask)
> @@ -323,6 +327,7 @@ static const struct kiq_pm4_funcs gfx_v9_4_3_kiq_pm4_funcs = {
>  	.kiq_query_status = gfx_v9_4_3_kiq_query_status,
>  	.kiq_invalidate_tlbs = gfx_v9_4_3_kiq_invalidate_tlbs,
>  	.kiq_reset_hw_queue = gfx_v9_4_3_kiq_reset_hw_queue,
> +	.kiq_reset_hw_pipe = gfx_v9_4_3_kiq_reset_hw_pipe,
>  	.set_resources_size = 8,
>  	.map_queues_size = 7,
>  	.unmap_queues_size = 6,
> @@ -3466,6 +3471,115 @@ static void gfx_v9_4_3_emit_wave_limit(struct amdgpu_ring *ring, bool enable)
>  	}
>  }
>  
> +static int gfx_v9_4_3_unmap_done(struct amdgpu_device *adev, uint32_t me,
> +				uint32_t pipe, uint32_t queue,
> +				uint32_t xcc_id)
> +{
> +	int i, r;
> +	/* make sure dequeue is complete*/
> +	gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> +	mutex_lock(&adev->srbm_mutex);
> +	soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> +	for (i = 0; i < adev->usec_timeout; i++) {
> +		if (!(RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_HQD_ACTIVE) & 1))
> +			break;
> +		udelay(1);
> +	}
> +	if (i >= adev->usec_timeout)
> +		r = -ETIMEDOUT;
> +	else
> +		r = 0;
> +	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> +	mutex_unlock(&adev->srbm_mutex);
> +	gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> +
> +	return r;
> +
> +}
> +
> +static bool gfx_v9_4_3_pipe_reset_support(struct amdgpu_device *adev)
> +{
> +
> +	if (unlikely(adev->gfx.mec_fw_version < 0x0000009b)) {
> +		DRM_WARN_ONCE("MEC firmware version too old, please use FW no older than 155!\n");
> +		return false;
> +	}

This path will be taken GCv9.4.3 and GCv9.4.4. GCv9.4.4 has a different
FW version. If FW is not yet ready for 9.4.4, better check that and
return false for that.

> +
> +	return true;
> +}
> +
> +static int gfx_v9_4_3_kiq_reset_hw_pipe(struct amdgpu_ring *kiq_ring,
> +					uint32_t queue_type, uint32_t me,
> +					uint32_t pipe, uint32_t queue,
> +					uint32_t xcc_id)
> +{
> +	struct amdgpu_device *adev = kiq_ring->adev;
> +	uint32_t reset_pipe, clean_pipe;
> +	int r;
> +
> +	if (!gfx_v9_4_3_pipe_reset_support(adev))
> +		return -EINVAL;
> +
> +	gfx_v9_4_3_xcc_set_safe_mode(adev, xcc_id);
> +	mutex_lock(&adev->srbm_mutex);
> +	soc15_grbm_select(adev, me, pipe, queue, 0, GET_INST(GC, xcc_id));
> +
> +	reset_pipe = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL);
> +	clean_pipe = reset_pipe;
> +
> +	if (me == 1) {
> +		switch (pipe) {
> +		case 0:
> +			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE0_RESET, 1);
> +			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE0_RESET, 0);
> +			break;
> +		case 1:
> +			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE1_RESET, 1);
> +			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE1_RESET, 0);
> +			break;
> +		case 2:
> +			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE2_RESET, 1);
> +			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE2_RESET, 0);
> +			break;
> +		case 3:
> +			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE3_RESET, 1);
> +			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> +						   MEC_ME1_PIPE3_RESET, 0);
> +			break;
> +		default:
> +			break;
> +		}
> +	} else {
> +		if (pipe) {
> +			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +						   MEC_ME2_PIPE1_RESET, 1);
> +			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> +						   MEC_ME2_PIPE1_RESET, 0);
> +		} else {
> +			reset_pipe = REG_SET_FIELD(reset_pipe, CP_MEC_CNTL,
> +						   MEC_ME2_PIPE0_RESET, 1);
> +			clean_pipe = REG_SET_FIELD(clean_pipe, CP_MEC_CNTL,
> +						   MEC_ME2_PIPE0_RESET, 0);
> +		}
> +	}
> +
> +	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, reset_pipe);
> +	WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, clean_pipe);
> +	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, xcc_id));
> +	mutex_unlock(&adev->srbm_mutex);
> +	gfx_v9_4_3_xcc_unset_safe_mode(adev, xcc_id);
> +
> +	r = gfx_v9_4_3_unmap_done(adev, me, pipe, queue, xcc_id);
> +	return r;
> +}
> +
>  static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
>  				unsigned int vmid)
>  {
> @@ -3473,7 +3587,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
>  	struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
>  	struct amdgpu_ring *kiq_ring = &kiq->ring;
>  	unsigned long flags;
> -	int r, i;
> +	int r;
>  
>  	if (amdgpu_sriov_vf(adev))
>  		return -EINVAL;
> @@ -3495,26 +3609,25 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
>  	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>  
>  	r = amdgpu_ring_test_ring(kiq_ring);
> -	if (r)
> -		return r;
> -
> -	/* make sure dequeue is complete*/
> -	amdgpu_gfx_rlc_enter_safe_mode(adev, ring->xcc_id);
> -	mutex_lock(&adev->srbm_mutex);
> -	soc15_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0, GET_INST(GC, ring->xcc_id));
> -	for (i = 0; i < adev->usec_timeout; i++) {
> -		if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
> -			break;
> -		udelay(1);
> -	}
> -	if (i >= adev->usec_timeout)
> -		r = -ETIMEDOUT;
> -	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, ring->xcc_id));
> -	mutex_unlock(&adev->srbm_mutex);
> -	amdgpu_gfx_rlc_exit_safe_mode(adev, ring->xcc_id);
>  	if (r) {
> -		dev_err(adev->dev, "fail to wait on hqd deactive\n");
> -		return r;
> +		DRM_ERROR("kiq ring test failed after ring: %s queue reset\n",
> +				ring->name);

dev_err here

> +		goto pipe_reset;
> +	}
> +
> +	r = gfx_v9_4_3_unmap_done(adev, ring->me, ring->pipe, ring->queue, ring->xcc_id);
> +	if (r)
> +		dev_err(adev->dev,"fail to wait on hqd deactive and will try pipe reset\n");
> +
> +pipe_reset:
> +	if(r) {
> +		r = gfx_v9_4_3_kiq_reset_hw_pipe(kiq_ring, ring->funcs->type,
> +						ring->me, ring->pipe,
> +						ring->queue, ring->xcc_id);

A side question about this - what's the expectation about the other
queue(s) on this pipe at this point? Are they expected to be also hung
at this point?

In short, checking if this should be done from a higher level code or
from here itself. Ex:

Queue reset -> Failed -> Try dequeue of all other active queues -> Do a
pipe reset

> +		DRM_INFO("ring: %s pipe reset :%s\n", ring->name,
> +				r ? "failed" : "successfully");
dev_info here

Thanks,
Lijo

> +		if (r)
> +			return r;
>  	}
>  
>  	r = amdgpu_bo_reserve(ring->mqd_obj, false);