Re: [PATCH 1/4] drm/amdgpu/kfd: Add shared SDMA reset functionality with callback support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 2/10/2025 1:01 PM, Jesse.zhang@xxxxxxx wrote:
> From: "Jesse.zhang@xxxxxxx" <Jesse.zhang@xxxxxxx>
> 
> This patch introduces shared SDMA reset functionality between AMDGPU and KFD.
> The implementation includes the following key changes:
> 
> 1. Added `amdgpu_sdma_reset_queue`:
>    - Resets a specific SDMA queue by instance ID.
>    - Invokes registered pre-reset and post-reset callbacks to allow KFD and AMDGPU
>      to save/restore their state during the reset process.
> 
> 2. Added `amdgpu_set_on_reset_callbacks`:
>    - Allows KFD and AMDGPU to register callback functions for pre-reset and
>      post-reset operations.
>    - Callbacks are stored in a global linked list and invoked in the correct order
>      during SDMA reset.
> 
> This patch ensures that both AMDGPU and KFD can handle SDMA reset events
> gracefully, with proper state saving and restoration. It also provides a flexible
> callback mechanism for future extensions.
> 
> v2: fix CamelCase and put the SDMA helper into amdgpu_sdma.c (Alex)
> v3: rename the `amdgpu_register_on_reset_callbacks` function to
>       `amdgpu_sdma_register_on_reset_callbacks`
>     move global reset_callback_list to struct amdgpu_sdma (Alex)
> 
> Suggested-by: Alex Deucher <alexander.deucher@xxxxxxx>
> Suggested-by: Jiadong Zhu <Jiadong.Zhu@xxxxxxx>
> Signed-off-by: Jesse Zhang <jesse.zhang@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 72 ++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 11 ++++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  2 +-
>  3 files changed, 84 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 174badca27e7..19c8be7d72e2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -460,3 +460,75 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
>  			device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
>  	}
>  }
> +
> +/**
> + * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks
> + * @funcs: Pointer to the callback structure containing pre_reset and post_reset functions
> + *
> + * This function allows KFD and AMDGPU to register their own callbacks for handling
> + * pre-reset and post-reset operations. The callbacks are added to a global list.
> + */
> +void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct sdma_on_reset_funcs *funcs)

Maybe simpler to keep it sdma_register_reset_callbacks?
> +{
> +	if (!funcs)
> +		return;
> +
> +	/* Initialize the list node in the callback structure */
> +	INIT_LIST_HEAD(&funcs->list);
> +
> +	/* Add the callback structure to the global list */
> +	list_add_tail(&funcs->list, &adev->sdma.reset_callback_list);
> +}
> +
> +/**
> + * amdgpu_sdma_reset_instance - Reset a specific SDMA instance
> + * @adev: Pointer to the AMDGPU device
> + * @instance_id: ID of the SDMA engine instance to reset
> + *
> + * This function performs the following steps:
> + * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save their state.
> + * 2. Resets the specified SDMA engine instance.
> + * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to restore their state.
> + *
> + * Returns: 0 on success, or a negative error code on failure.
> + */
> +int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t instance_id)
> +{
> +	struct sdma_on_reset_funcs *funcs;
> +	int ret;
> +
> +	/* Invoke all registered pre_reset callbacks */
> +	list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> +		if (funcs->pre_reset) {
> +			ret = funcs->pre_reset(adev, instance_id);
> +			if (ret) {
> +				dev_err(adev->dev,
> +				"beforeReset callback failed for instance %u: %d\n",
> +					instance_id, ret);

Please add more context like "SDMA pre-reset failed" etc. ('SDMA' needs
to be there). Same for the message down also.

Thanks,
Lijo

> +				return ret;
> +			}
> +		}
> +	}
> +
> +	/* Perform the SDMA reset for the specified instance */
> +	ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
> +	if (ret) {
> +		dev_err(adev->dev, "Failed to reset SDMA instance %u\n", instance_id);
> +		return ret;
> +	}
> +
> +	/* Invoke all registered post_reset callbacks */
> +	list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> +		if (funcs->post_reset) {
> +			ret = funcs->post_reset(adev, instance_id);
> +			if (ret) {
> +				dev_err(adev->dev,
> +				"afterReset callback failed for instance %u: %d\n",
> +					instance_id, ret);
> +				return ret;
> +			}
> +		}
> +	}
> +
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> index 5f60736051d1..fbb8b04ef2cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> @@ -98,6 +98,13 @@ struct amdgpu_sdma_ras {
>  	struct amdgpu_ras_block_object ras_block;
>  };
>  
> +struct sdma_on_reset_funcs {
> +	int (*pre_reset)(struct amdgpu_device *adev, uint32_t instance_id);
> +	int (*post_reset)(struct amdgpu_device *adev, uint32_t instance_id);
> +	/* Linked list node to store this structure in a list; */
> +	struct list_head list;
> +};
> +
>  struct amdgpu_sdma {
>  	struct amdgpu_sdma_instance instance[AMDGPU_MAX_SDMA_INSTANCES];
>  	struct amdgpu_irq_src	trap_irq;
> @@ -118,6 +125,7 @@ struct amdgpu_sdma {
>  	struct amdgpu_sdma_ras	*ras;
>  	uint32_t		*ip_dump;
>  	uint32_t 		supported_reset;
> +	struct list_head	reset_callback_list;
>  };
>  
>  /*
> @@ -157,6 +165,9 @@ struct amdgpu_buffer_funcs {
>  				 uint32_t byte_count);
>  };
>  
> +void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct sdma_on_reset_funcs *funcs);
> +int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t instance_id);
> +
>  #define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) (adev)->mman.buffer_funcs->emit_copy_buffer((ib),  (s), (d), (b), (t))
>  #define amdgpu_emit_fill_buffer(adev, ib, s, d, b) (adev)->mman.buffer_funcs->emit_fill_buffer((ib), (s), (d), (b))
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index 5e0066cd6c51..64c163dd708f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -1477,7 +1477,7 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
>  	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
>  	if (r)
>  		return r;
> -
> +	INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
>  	return r;
>  }
>  




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux