[Public] Hi Jesse, > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Huang, > Tim > Sent: Tuesday, October 29, 2024 12:21 PM > To: Zhang, Jesse(Jie) <Jesse.Zhang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Koenig, Christian > <Christian.Koenig@xxxxxxx>; Zhang, Jesse(Jie) <Jesse.Zhang@xxxxxxx>; > Zhang, Jesse(Jie) <Jesse.Zhang@xxxxxxx> > Subject: RE: [PATCH V3 1/5] drm/amdgpu: Add sysfs interface for gc reset > mask > > [Public] > > [Public] > > Hi Jesse, > > > -----Original Message----- > > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of > > Jesse.zhang@xxxxxxx > > Sent: Thursday, October 24, 2024 3:39 PM > > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Koenig, Christian > > <Christian.Koenig@xxxxxxx>; Zhang, Jesse(Jie) <Jesse.Zhang@xxxxxxx>; > > Zhang, Jesse(Jie) <Jesse.Zhang@xxxxxxx> > > Subject: [PATCH V3 1/5] drm/amdgpu: Add sysfs interface for gc reset > > mask > > > > Add two sysfs interfaces for gfx and compute: > > gfx_reset_mask > > compute_reset_mask > > > > These interfaces are read-only and show the resets supported by the IP. > > For example, full adapter reset (mode1/mode2/BACO/etc), soft reset, > > queue reset, and pipe reset. 
> > > > V2: the sysfs node returns a text string instead of some flags > > (Christian) > > v3: add a generic helper which takes the ring as parameter > > and print the strings in the order they are applied (Christian) > > > > check amdgpu_gpu_recovery before creating sysfs file itself, > > and initialize supported_reset_types in IP version files (Lijo) > > > > Signed-off-by: Jesse Zhang <Jesse.Zhang@xxxxxxx> Suggested-by:Alex > > Deucher <alexander.deucher@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 +++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 66 > > ++++++++++++++++++++++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 ++ > > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 6 ++ > > drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 14 +++++ > > drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 12 ++++ > > 7 files changed, 147 insertions(+) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > index 48c9b9b06905..aea1031d7b84 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > > @@ -300,6 +300,12 @@ extern int amdgpu_wbrf; > > #define AMDGPU_RESET_VCE (1 << 13) > > #define AMDGPU_RESET_VCE1 (1 << 14) > > > > +/* reset mask */ > > +#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, > > +mode1/mode2/BACO/etc. 
*/ #define AMDGPU_RESET_TYPE_SOFT_RESET > (1 > > << 1) > > +/* IP level soft reset */ #define AMDGPU_RESET_TYPE_PER_QUEUE (1 << > > +2) > > +/* per queue */ #define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per > > +pipe */ > > + > > /* max cursor sizes (in pixels) */ > > #define CIK_CURSOR_WIDTH 128 > > #define CIK_CURSOR_HEIGHT 128 > > @@ -1466,6 +1472,8 @@ struct dma_fence > *amdgpu_device_get_gang(struct > > amdgpu_device *adev); struct dma_fence > > *amdgpu_device_switch_gang(struct amdgpu_device *adev, > > struct dma_fence *gang); > > bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev); > > +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring); > > +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset); > > > > /* atpx handler */ > > #if defined(CONFIG_VGA_SWITCHEROO) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > index ef715b2bbcdb..cd1e3f018893 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > @@ -6684,3 +6684,40 @@ uint32_t amdgpu_device_wait_on_rreg(struct > > amdgpu_device *adev, > > } > > return ret; > > } > > + > > +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) { > > + ssize_t size = 0; > > + > > + if (!ring) > > + return size; > > + > > + if (amdgpu_device_should_recover_gpu(ring->adev)) > > + size |= AMDGPU_RESET_TYPE_FULL; > > + > > + if (unlikely(!ring->adev->debug_disable_soft_recovery) && > > + !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) > > + size |= AMDGPU_RESET_TYPE_SOFT_RESET; > > + > > + return size; > > +} > > + > > +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) { > > + ssize_t size = 0; > > + > > + if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) > > + size += sysfs_emit_at(buf, size, "soft "); > > + > > + if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) > > + size += sysfs_emit_at(buf, size, "queue "); > > 
+ > > + if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) > > + size += sysfs_emit_at(buf, size, "pipe "); > > + > > + if (supported_reset & AMDGPU_RESET_TYPE_FULL) > > + size += sysfs_emit_at(buf, size, "full "); > > + > > + size += sysfs_emit_at(buf, size, "\n"); > > + return size; > > +} > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > > index e96984c53e72..6de1f3bf6863 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > > @@ -1588,6 +1588,32 @@ static ssize_t > > amdgpu_gfx_set_enforce_isolation(struct device *dev, > > return count; > > } > > > > +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev, > > + struct > device_attribute *attr, > > + char *buf) { > > + struct drm_device *ddev = dev_get_drvdata(dev); > > + struct amdgpu_device *adev = drm_to_adev(ddev); > > + > > + if (!adev) > > + return -ENODEV; > > + > > + return amdgpu_show_reset_mask(buf, > > + adev->gfx.gfx_supported_reset); } > > + > > +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev, > > + struct > device_attribute *attr, > > + char *buf) { > > + struct drm_device *ddev = dev_get_drvdata(dev); > > + struct amdgpu_device *adev = drm_to_adev(ddev); > > + > > + if (!adev) > > + return -ENODEV; > > + > > + return amdgpu_show_reset_mask(buf, > > adev->gfx.compute_supported_reset); > > +} > > + > > static DEVICE_ATTR(run_cleaner_shader, 0200, > > NULL, amdgpu_gfx_set_run_cleaner_shader); > > > > @@ -1602,6 +1628,12 @@ static DEVICE_ATTR(current_compute_partition, > > 0644, static DEVICE_ATTR(available_compute_partition, 0444, > > amdgpu_gfx_get_available_compute_partition, NULL); > > > > +static DEVICE_ATTR(gfx_reset_mask, 0444, > > + amdgpu_gfx_get_gfx_reset_mask, NULL); > > + > > +static DEVICE_ATTR(compute_reset_mask, 0444, > > + amdgpu_gfx_get_compute_reset_mask, NULL); > > + > > int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) { > > struct 
amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; @@ -1702,6 > > +1734,40 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device > > *adev, > > cleaner_shader_size); } > > > > +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev) { > > + int r = 0; > > + > > + if (!amdgpu_gpu_recovery) > > + return r; > > + > > + if (adev->gfx.num_gfx_rings) { > > + r = device_create_file(adev->dev, > &dev_attr_gfx_reset_mask); > > + if (r) > > + return r; > > + } > > + > > + if (adev->gfx.num_compute_rings) { > > + r = device_create_file(adev->dev, > &dev_attr_compute_reset_mask); > > + if (r) > > + return r; > > + } > > + > > + return r; > > +} > > + > > +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev) { > > + if (!amdgpu_gpu_recovery) > > + return; > > + > > + if (adev->gfx.num_gfx_rings) > > + device_remove_file(adev->dev, &dev_attr_gfx_reset_mask); > > + > > + if (adev->gfx.num_compute_rings) > > + device_remove_file(adev->dev, > > + &dev_attr_compute_reset_mask); } > > + > > /** > > * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD > > (Graphics Driver) > > * @adev: amdgpu_device pointer > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > > index f710178a21bc..fb0e1adf6766 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > > @@ -424,6 +424,8 @@ struct amdgpu_gfx { > > /* reset mask */ > > uint32_t grbm_soft_reset; > > uint32_t srbm_soft_reset; > > + uint32_t gfx_supported_reset; > > + uint32_t compute_supported_reset; > > > > /* gfx off */ > > bool gfx_off_state; /* true: > > enabled, false: disabled */ > > @@ -582,6 +584,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct > > amdgpu_device *adev); void > > amdgpu_gfx_enforce_isolation_handler(struct > > work_struct *work); void > > amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring); > > void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring 
> > *ring); > > +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev); > > +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev); > > > > static inline const char *amdgpu_gfx_compute_mode_desc(int mode) { > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > > b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > > index 9da95b25e158..446e37768397 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > > The calls to amdgpu_gfx_sysfs_reset_mask_init and amdgpu_gfx_sysfs_reset_mask_fini appear to be missing here for gfx_v10. > > > Best Regards > Tim > > > @@ -4806,6 +4806,9 @@ static int gfx_v10_0_sw_init(struct > > amdgpu_ip_block *ip_block) > > } > > } > > } > > + /* TODO: Check the version that supports fully queue reset */ > > + adev->gfx.gfx_supported_reset |= > > + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); > > > > ring_id = 0; > > /* set up the compute queues - allocate horizontally across > > pipes */ @@ > > -4825,6 +4828,9 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block > > *ip_block) > > } > > } > > } > > + /* TODO: Check the version that supports fully queue reset */ > > + adev->gfx.compute_supported_reset |= > > + > > + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); > > > > r = amdgpu_gfx_kiq_init(adev, GFX10_MEC_HPD_SIZE, 0); > > if (r) { > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > > index 5aff8f72de9c..3b23402dfb47 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > > @@ -1560,6 +1560,11 @@ static int gfx_v11_0_sw_init(struct > > amdgpu_ip_block *ip_block) > > adev->userq_funcs[AMDGPU_HW_IP_GFX] = > > &userq_mes_v11_0_funcs; > > adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] = > > &userq_mes_v11_0_funcs; #endif > > + if ((adev->gfx.me_fw_version >= 2280) && > > + (adev->gfx.mec_fw_version >= 2410)) { > > + adev->gfx.compute_supported_reset = > > AMDGPU_RESET_TYPE_PER_QUEUE; > > + 
adev->gfx.gfx_supported_reset = > > AMDGPU_RESET_TYPE_PER_QUEUE; > > + } > > break; > > case IP_VERSION(11, 0, 1): > > case IP_VERSION(11, 0, 4): > > @@ -1663,6 +1668,8 @@ static int gfx_v11_0_sw_init(struct > > amdgpu_ip_block *ip_block) > > } > > } > > } > > + adev->gfx.gfx_supported_reset |= > > + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); > > > > ring_id = 0; > > /* set up the compute queues - allocate horizontally across > > pipes */ @@ > > -1682,6 +1689,8 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block > > *ip_block) > > } > > } > > } > > + adev->gfx.compute_supported_reset |= > > + > > + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); > > > > if (!adev->enable_mes_kiq) { > > r = amdgpu_gfx_kiq_init(adev, GFX11_MEC_HPD_SIZE, 0); > @@ > > -1721,6 +1730,10 @@ static int gfx_v11_0_sw_init(struct > > amdgpu_ip_block > > *ip_block) > > if (r) > > return r; > > > > + r = amdgpu_gfx_sysfs_reset_mask_init (adev); > > + if (r) > > + return r; > > + > > return 0; > > } > > > > @@ -1783,6 +1796,7 @@ static int gfx_v11_0_sw_fini(struct > > amdgpu_ip_block *ip_block) > > gfx_v11_0_free_microcode(adev); > > > > amdgpu_gfx_sysfs_isolation_shader_fini(adev); > > + amdgpu_gfx_sysfs_reset_mask_fini(adev); > > > > kfree(adev->gfx.ip_dump_core); > > kfree(adev->gfx.ip_dump_compute_queues); > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > index 016290f00592..b9d5a79ba85c 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > > @@ -1067,6 +1067,11 @@ static int gfx_v9_4_3_sw_init(struct > > amdgpu_ip_block *ip_block) > > dev_err(adev->dev, "Failed to initialize > cleaner shader\n"); > > } > > } > > + > > + if (adev->gfx.mec_fw_version >= 155) { > > + adev->gfx.compute_supported_reset = > > AMDGPU_RESET_TYPE_PER_QUEUE; > > + adev->gfx.compute_supported_reset |= > > AMDGPU_RESET_TYPE_PER_PIPE; > > + } > > break; > > default: > > 
adev->gfx.enable_cleaner_shader = false; @@ -1157,6 > > +1162,9 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block > *ip_block) > > return r; > > } > > > > + adev->gfx.compute_supported_reset |= > > + > > + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); Careful handling may be required for the initialization of `adev->gfx.gfx_supported_reset` and `adev->gfx.compute_supported_reset`. For instance, in `gfx_v9`, `adev->gfx.gfx_supported_reset` is not initialized, yet the sysfs file would still be created by `amdgpu_gfx_sysfs_reset_mask_init`. Additionally, `adev->gfx.compute_supported_reset` may be bitwise-ORed with an uninitialized value when `adev->gfx.mec_fw_version` < 155. Best Regards Tim Huang > > + > > r = gfx_v9_4_3_gpu_early_init(adev); > > if (r) > > return r; > > @@ -1175,6 +1183,9 @@ static int gfx_v9_4_3_sw_init(struct > > amdgpu_ip_block *ip_block) > > if (r) > > return r; > > > > + r = amdgpu_gfx_sysfs_reset_mask_init(adev); > > + if (r) > > + return r; > > return 0; > > } > > > > @@ -1200,6 +1211,7 @@ static int gfx_v9_4_3_sw_fini(struct > > amdgpu_ip_block *ip_block) > > gfx_v9_4_3_free_microcode(adev); > > amdgpu_gfx_sysfs_fini(adev); > > amdgpu_gfx_sysfs_isolation_shader_fini(adev); > > + amdgpu_gfx_sysfs_reset_mask_fini(adev); > > > > kfree(adev->gfx.ip_dump_core); > > kfree(adev->gfx.ip_dump_compute_queues); > > -- > > 2.25.1