Re: [PATCH 1/4 v6] drm/amdgpu/kfd: Add shared SDMA reset functionality with callback support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Feb 11, 2025 at 12:22 AM Jesse.zhang@xxxxxxx
<jesse.zhang@xxxxxxx> wrote:
>
> From: "Jesse.zhang@xxxxxxx" <Jesse.zhang@xxxxxxx>
>
> This patch introduces shared SDMA reset functionality between AMDGPU and KFD.
> The implementation includes the following key changes:
>
> 1. Added `amdgpu_sdma_reset_queue`:
>    - Resets a specific SDMA queue by instance ID.
>    - Invokes registered pre-reset and post-reset callbacks to allow KFD and AMDGPU
>      to save/restore their state during the reset process.
>
> 2. Added `amdgpu_set_on_reset_callbacks`:
>    - Allows KFD and AMDGPU to register callback functions for pre-reset and
>      post-reset operations.
>    - Callbacks are stored in a global linked list and invoked in the correct order
>      during SDMA reset.
>
> This patch ensures that both AMDGPU and KFD can handle SDMA reset events
> gracefully, with proper state saving and restoration. It also provides a flexible
> callback mechanism for future extensions.
>
> v2: fix CamelCase and put the SDMA helper into amdgpu_sdma.c (Alex)
> v3: rename the `amdgpu_register_on_reset_callbacks` function to
>       `amdgpu_sdma_register_on_reset_callbacks`
>     move global reset_callback_list to struct amdgpu_sdma (Alex)
>
> Suggested-by: Alex Deucher <alexander.deucher@xxxxxxx>
> Suggested-by: Jiadong Zhu <Jiadong.Zhu@xxxxxxx>
> Signed-off-by: Jesse Zhang <jesse.zhang@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 72 ++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 11 ++++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  2 +-
>  3 files changed, 84 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 174badca27e7..19c8be7d72e2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -460,3 +460,75 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev)
>                         device_remove_file(adev->dev, &dev_attr_sdma_reset_mask);
>         }
>  }
> +
> +/**
> + * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks

Maybe rename this amdgpu_sdma_register_engine_on_reset_callbacks()

> + * @funcs: Pointer to the callback structure containing pre_reset and post_reset functions
> + *
> + * This function allows KFD and AMDGPU to register their own callbacks for handling
> + * pre-reset and post-reset operations. The callbacks are added to a global list.

Consider rewording to something like:

This function allows KFD and AMDGPU to register their own callbacks for handling
pre-reset and post-reset operations for engine reset.  These are
needed because engine
reset will stop all queues on that engine.

> + */
> +void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct sdma_on_reset_funcs *funcs)
> +{
> +       if (!funcs)
> +               return;
> +
> +       /* Initialize the list node in the callback structure */
> +       INIT_LIST_HEAD(&funcs->list);
> +
> +       /* Add the callback structure to the global list */
> +       list_add_tail(&funcs->list, &adev->sdma.reset_callback_list);
> +}
> +
> +/**
> + * amdgpu_sdma_reset_instance - Reset a specific SDMA instance
> + * @adev: Pointer to the AMDGPU device
> + * @instance_id: ID of the SDMA engine instance to reset
> + *
> + * This function performs the following steps:
> + * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save their state.
> + * 2. Resets the specified SDMA engine instance.
> + * 3. Calls all registered post_reset callbacks to allow KFD and AMDGPU to restore their state.
> + *
> + * Returns: 0 on success, or a negative error code on failure.
> + */
> +int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t instance_id)

Maybe rename this?  amdgpu_sdma_reset_engine()?  I'm ok either way if
you feel strongly.

> +{
> +       struct sdma_on_reset_funcs *funcs;
> +       int ret;
> +
> +       /* Invoke all registered pre_reset callbacks */
> +       list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> +               if (funcs->pre_reset) {
> +                       ret = funcs->pre_reset(adev, instance_id);
> +                       if (ret) {
> +                               dev_err(adev->dev,
> +                               "beforeReset callback failed for instance %u: %d\n",
> +                                       instance_id, ret);
> +                               return ret;
> +                       }
> +               }
> +       }
> +
> +       /* Perform the SDMA reset for the specified instance */
> +       ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
> +       if (ret) {
> +               dev_err(adev->dev, "Failed to reset SDMA instance %u\n", instance_id);
> +               return ret;
> +       }
> +
> +       /* Invoke all registered post_reset callbacks */
> +       list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> +               if (funcs->post_reset) {
> +                       ret = funcs->post_reset(adev, instance_id);
> +                       if (ret) {
> +                               dev_err(adev->dev,
> +                               "afterReset callback failed for instance %u: %d\n",
> +                                       instance_id, ret);
> +                               return ret;
> +                       }
> +               }
> +       }
> +
> +       return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> index 5f60736051d1..fbb8b04ef2cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> @@ -98,6 +98,13 @@ struct amdgpu_sdma_ras {
>         struct amdgpu_ras_block_object ras_block;
>  };
>
> +struct sdma_on_reset_funcs {

struct sdma_engine_on_reset_funcs {

> +       int (*pre_reset)(struct amdgpu_device *adev, uint32_t instance_id);
> +       int (*post_reset)(struct amdgpu_device *adev, uint32_t instance_id);
> +       /* Linked list node to store this structure in a list; */
> +       struct list_head list;
> +};
> +
>  struct amdgpu_sdma {
>         struct amdgpu_sdma_instance instance[AMDGPU_MAX_SDMA_INSTANCES];
>         struct amdgpu_irq_src   trap_irq;
> @@ -118,6 +125,7 @@ struct amdgpu_sdma {
>         struct amdgpu_sdma_ras  *ras;
>         uint32_t                *ip_dump;
>         uint32_t                supported_reset;
> +       struct list_head        reset_callback_list;

engine_reset_callback_list

>  };
>
>  /*
> @@ -157,6 +165,9 @@ struct amdgpu_buffer_funcs {
>                                  uint32_t byte_count);
>  };
>
> +void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct sdma_on_reset_funcs *funcs);
> +int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t instance_id);
> +
>  #define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) (adev)->mman.buffer_funcs->emit_copy_buffer((ib),  (s), (d), (b), (t))
>  #define amdgpu_emit_fill_buffer(adev, ib, s, d, b) (adev)->mman.buffer_funcs->emit_fill_buffer((ib), (s), (d), (b))
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index 5e0066cd6c51..64c163dd708f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -1477,7 +1477,7 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
>         r = amdgpu_sdma_sysfs_reset_mask_init(adev);
>         if (r)
>                 return r;
> -
> +       INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
>         return r;
>  }
>
> --
> 2.25.1
>




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux