On 2/21/2025 11:47 AM, jesse.zhang@xxxxxxx wrote:
> From: "Jesse.zhang@xxxxxxx" <Jesse.zhang@xxxxxxx>
>
> - Introduce a new function `sdma_v4_4_2_init_sysfs_reset_mask` to initialize the sysfs reset mask for SDMA.
> - Move the initialization of the sysfs reset mask to the `late_init` stage to ensure that the SMU initialization
>   and capability setup are completed before checking the SDMA reset capability.
> - Consolidate the logic for setting the supported reset types and initializing the sysfs reset mask into the new function.
> - For IP versions 9.4.3 and 9.4.4, enable per-queue reset if the MEC firmware version is at least 0xb0 and PMFW supports queue reset.
> - Add a TODO comment for future support of per-queue reset for IP version 9.4.5.
>
> This change ensures that per-queue reset is only enabled when the MEC and PMFW support it.
>
> Suggested-by: Jonathan Kim <Jonathan.Kim@xxxxxxx>
> Signed-off-by: Vitaly Prosyak <vitaly.prosyak@xxxxxxx>
> Signed-off-by: Jesse Zhang <jesse.zhang@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 55 ++++++++++++++++++++----
>  1 file changed, 47 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index 4fa688e00f5e..fd2884de2dc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -107,6 +107,7 @@ static void sdma_v4_4_2_set_vm_pte_funcs(struct amdgpu_device *adev);
>  static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev);
>  static void sdma_v4_4_2_set_ras_funcs(struct amdgpu_device *adev);
>  static void sdma_v4_4_2_set_engine_reset_funcs(struct amdgpu_device *adev);
> +static int sdma_v4_4_2_init_sysfs_reset_mask(struct amdgpu_device *adev);
>
>  static u32 sdma_v4_4_2_get_reg_offset(struct amdgpu_device *adev,
>  				      u32 instance, u32 offset)
> @@ -1366,6 +1367,7 @@ static int sdma_v4_4_2_process_ras_data_cb(struct amdgpu_device *adev,
>  static int sdma_v4_4_2_late_init(struct amdgpu_ip_block *ip_block)
>  {
>  	struct amdgpu_device *adev = ip_block->adev;
> +	int r;
>  #if 0
>  	struct ras_ih_if ih_info = {
>  		.cb = sdma_v4_4_2_process_ras_data_cb,
> @@ -1374,7 +1376,12 @@ static int sdma_v4_4_2_late_init(struct amdgpu_ip_block *ip_block)
>  	if (!amdgpu_persistent_edc_harvesting_supported(adev))
>  		amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__SDMA);
>
> -	return 0;
> +	/* The initialization is done in the late_init stage to ensure that the SMU
> +	 * initialization and capability setup are completed before we check the SDMA
> +	 * reset capability
> +	 */
> +	r = sdma_v4_4_2_init_sysfs_reset_mask(adev);

Late init is called after every reset. Since the sysfs file is already created, it will return something like -EEXIST.
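One way to avoid that (a rough, untested sketch; the amdgpu_in_reset()/adev->in_suspend guard is an assumption on my side, not something from the patch) would be to keep recomputing the mask on every late_init but only create the sysfs file on the first, boot-time call:

static int sdma_v4_4_2_init_sysfs_reset_mask(struct amdgpu_device *adev)
{
	/* Recompute the supported reset types on every late_init call */
	adev->sdma.supported_reset =
		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);

	/* ... per-queue reset checks as in the patch ... */

	/* Skip the sysfs creation when late_init runs again after a reset or
	 * resume; the file from the initial probe still exists and creating
	 * it again fails with -EEXIST.
	 */
	if (amdgpu_in_reset(adev) || adev->in_suspend)
		return 0;

	return amdgpu_sdma_sysfs_reset_mask_init(adev);
}

The exact condition doesn't matter much; the point is just to distinguish the initial probe from the post-reset/resume late_init calls.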
Thanks,
Lijo

> +	return r;
>  }
>
>  static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
> @@ -1481,10 +1488,6 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
>  		}
>  	}
>
> -	/* TODO: Add queue reset mask when FW fully supports it */
> -	adev->sdma.supported_reset =
> -		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> -
>  	if (amdgpu_sdma_ras_sw_init(adev)) {
>  		dev_err(adev->dev, "fail to initialize sdma ras block\n");
>  		return -EINVAL;
> @@ -1497,9 +1500,6 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
>  	else
>  		DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n");
>
> -	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> -	if (r)
> -		return r;
>  	/* Initialize guilty flags for GFX and PAGE queues */
>  	adev->sdma.gfx_guilty = false;
>  	adev->sdma.page_guilty = false;
> @@ -2328,6 +2328,45 @@ static void sdma_v4_4_2_set_vm_pte_funcs(struct amdgpu_device *adev)
>  	adev->vm_manager.vm_pte_num_scheds = adev->sdma.num_instances;
>  }
>
> +/**
> + * sdma_v4_4_2_init_sysfs_reset_mask - Initialize sysfs reset mask for SDMA
> + * @adev: Pointer to the AMDGPU device structure
> + *
> + * This function initializes the sysfs reset mask for SDMA and sets the supported
> + * reset types based on the IP version and firmware versions.
> + *
> + * Returns: 0 on success, or a negative error code on failure.
> + */
> +static int sdma_v4_4_2_init_sysfs_reset_mask(struct amdgpu_device *adev)
> +{
> +	int r = 0;
> +
> +	/* Set the supported reset types */
> +	adev->sdma.supported_reset =
> +		amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> +	/*
> +	 * the user queue relies on MEC fw and pmfw when the sdma queue do reset.
> +	 * it needs to check both of them at here to skip old mec and pmfw.
> +	 */
> +	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> +	case IP_VERSION(9, 4, 3):
> +	case IP_VERSION(9, 4, 4):
> +		if ((adev->gfx.mec_fw_version >= 0xb0) && amdgpu_dpm_reset_sdma_is_supported(adev))
> +			adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +		break;
> +	case IP_VERSION(9, 4, 5):
> +		/*TODO: enable the queue reset flag until fw supported */
> +	default:
> +		break;
> +	}
> +
> +	/* Initialize the sysfs reset mask */
> +	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> +
> +	return r;
> +
> +}
> +
>  const struct amdgpu_ip_block_version sdma_v4_4_2_ip_block = {
>  	.type = AMD_IP_BLOCK_TYPE_SDMA,
>  	.major = 4,