On 10/23/2024 8:13 AM, Jesse.zhang@xxxxxxx wrote: > Add two sysfs interfaces for gfx and compute: > gfx_reset_mask > compute_reset_mask > > These interfaces are read-only and show the resets supported by the IP. > For example, full adapter reset (mode1/mode2/BACO/etc), > soft reset, queue reset, and pipe reset. > > V2: the sysfs node returns a text string instead of some flags (Christian) > > Signed-off-by: Jesse Zhang <Jesse.Zhang@xxxxxxx> > Suggested-by:Alex Deucher <alexander.deucher@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 122 ++++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 + > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 6 ++ > drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 5 + > drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 5 + > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 5 + > drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 5 + > 7 files changed, 150 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > index e96984c53e72..10d55755ee88 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > @@ -1588,6 +1588,94 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, > return count; > } > > +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev, > + struct device_attribute *attr, > + char *buf) > +{ > + struct drm_device *ddev = dev_get_drvdata(dev); > + struct amdgpu_device *adev = drm_to_adev(ddev); > + ssize_t size = 0; > + struct amdgpu_ring *ring = &adev->gfx.gfx_ring[0]; > + > + if (!adev || !ring) > + return -ENODEV; > + > + if (amdgpu_device_should_recover_gpu(adev)) > + size += sysfs_emit_at(buf, size, "full "); > + > + if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery) > + && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery) > + size += sysfs_emit_at(buf, size, "soft "); > + If amdgpu_gpu_recovery is disabled, then that check may be made before creating the sysfs file itself. 
It doesn't have to be here. > + if (amdgpu_gpu_recovery && ring->funcs->reset) { > + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { > + case IP_VERSION(9, 2, 2): //reven2 > + case IP_VERSION(9, 3, 0): //renior > + case IP_VERSION(9, 4, 0): //vega20 > + case IP_VERSION(10, 1, 0): //navi10 > + case IP_VERSION(10, 1, 1): //navi12 > + case IP_VERSION(10, 1, 2): //navi13 > + /* Skip flag setting because some cases > + * are not supported by current firmware. > + */ > + break; > + > + default: > + size += sysfs_emit_at(buf, size, "queue "); > + break; > + } > + } This kind of version check is not good. Instead, initialize supported_reset_types in the IP-version-specific files. As the compute example below shows, support sometimes also depends on FW support or other checks, not just on the existence of a callback implementation. This function can then simply iterate over the type mask and print the text version. Thanks, Lijo > + > + size += sysfs_emit_at(buf, size, "\n"); > + return size; > +} > + > +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev, > + struct device_attribute *attr, > + char *buf) > +{ > + struct drm_device *ddev = dev_get_drvdata(dev); > + struct amdgpu_device *adev = drm_to_adev(ddev); > + ssize_t size = 0; > + struct amdgpu_ring *ring = &adev->gfx.compute_ring[0]; > + > + if (!adev || !ring) > + return -ENODEV; > + > + if (amdgpu_device_should_recover_gpu(adev)) > + size += sysfs_emit_at(buf, size, "full "); > + > + if (amdgpu_gpu_recovery && unlikely(!adev->debug_disable_soft_recovery) > + && !amdgpu_sriov_vf(adev) && ring->funcs->soft_recovery) > + size += sysfs_emit_at(buf, size, "soft "); > + > + if (amdgpu_gpu_recovery && ring->funcs->reset) { > + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { > + case IP_VERSION(9, 2, 2): //reven2 > + case IP_VERSION(9, 3, 0): //renior > + case IP_VERSION(9, 4, 0): //vega20 > + case IP_VERSION(10, 1, 0): //navi10 > + case IP_VERSION(10, 1, 1): //navi12 > + case IP_VERSION(10, 1, 2): //navi13 > + /* Skip flag setting because some 
test cases > + * are not supported by current firmware. > + */ > + break; > + > + default: > + size += sysfs_emit_at(buf, size, "queue "); > + break; > + } > + } > + > + if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && > + adev->gfx.mec_fw_version >= 0x0000009b) > + size += sysfs_emit_at(buf, size, "pipe "); > + > + size += sysfs_emit_at(buf, size, "\n"); > + return size; > +} > + > static DEVICE_ATTR(run_cleaner_shader, 0200, > NULL, amdgpu_gfx_set_run_cleaner_shader); > > @@ -1602,6 +1690,12 @@ static DEVICE_ATTR(current_compute_partition, 0644, > static DEVICE_ATTR(available_compute_partition, 0444, > amdgpu_gfx_get_available_compute_partition, NULL); > > +static DEVICE_ATTR(gfx_reset_mask, 0444, > + amdgpu_gfx_get_gfx_reset_mask, NULL); > + > +static DEVICE_ATTR(compute_reset_mask, 0444, > + amdgpu_gfx_get_compute_reset_mask, NULL); > + > int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) > { > struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; > @@ -1702,6 +1796,34 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev, > cleaner_shader_size); > } > > +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev) > +{ > + int r = 0; > + > + if (adev->gfx.num_gfx_rings) { > + r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask); > + if (r) > + return r; > + } > + > + if (adev->gfx.num_compute_rings) { > + r = device_create_file(adev->dev, &dev_attr_compute_reset_mask); > + if (r) > + return r; > + } > + > + return r; > +} > + > +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev) > +{ > + if (adev->gfx.num_gfx_rings) > + device_remove_file(adev->dev, &dev_attr_gfx_reset_mask); > + > + if (adev->gfx.num_compute_rings) > + device_remove_file(adev->dev, &dev_attr_compute_reset_mask); > +} > + > /** > * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver) > * @adev: amdgpu_device pointer > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > 
index f710178a21bc..0cf2151b3cf4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > @@ -582,6 +582,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev); > void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work); > void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring); > void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring); > +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev); > +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev); > > static inline const char *amdgpu_gfx_compute_mode_desc(int mode) > { > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index 9da95b25e158..2baa76095232 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -4856,6 +4856,11 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block) > r = amdgpu_gfx_sysfs_isolation_shader_init(adev); > if (r) > return r; > + > + r = amdgpu_gfx_sysfs_reset_mask_init(adev); > + if (r) > + return r; > + > return 0; > } > > @@ -4908,6 +4913,7 @@ static int gfx_v10_0_sw_fini(struct amdgpu_ip_block *ip_block) > > gfx_v10_0_free_microcode(adev); > amdgpu_gfx_sysfs_isolation_shader_fini(adev); > + amdgpu_gfx_sysfs_reset_mask_fini(adev); > > kfree(adev->gfx.ip_dump_core); > kfree(adev->gfx.ip_dump_compute_queues); > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > index 5aff8f72de9c..32d14b9cc6e4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > @@ -1721,6 +1721,10 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) > if (r) > return r; > > + r = amdgpu_gfx_sysfs_reset_mask_init(adev); > + if (r) > + return r; > + > return 0; > } > > @@ -1783,6 +1787,7 @@ static int gfx_v11_0_sw_fini(struct amdgpu_ip_block *ip_block) > 
gfx_v11_0_free_microcode(adev); > > amdgpu_gfx_sysfs_isolation_shader_fini(adev); > + amdgpu_gfx_sysfs_reset_mask_fini(adev); > > kfree(adev->gfx.ip_dump_core); > kfree(adev->gfx.ip_dump_compute_queues); > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > index 9fec28d8a5fc..925b7ca49b2b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > @@ -1470,6 +1470,10 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) > if (r) > return r; > > + r = amdgpu_gfx_sysfs_reset_mask_init(adev); > + if (r) > + return r; > + > return 0; > } > > @@ -1530,6 +1534,7 @@ static int gfx_v12_0_sw_fini(struct amdgpu_ip_block *ip_block) > gfx_v12_0_free_microcode(adev); > > amdgpu_gfx_sysfs_isolation_shader_fini(adev); > + amdgpu_gfx_sysfs_reset_mask_fini(adev); > > kfree(adev->gfx.ip_dump_core); > kfree(adev->gfx.ip_dump_compute_queues); > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > index b4c4b9916289..0de199c1cfdd 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > @@ -2394,6 +2394,10 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block) > if (r) > return r; > > + r = amdgpu_gfx_sysfs_reset_mask_init(adev); > + if (r) > + return r; > + > return 0; > } > > @@ -2432,6 +2436,7 @@ static int gfx_v9_0_sw_fini(struct amdgpu_ip_block *ip_block) > gfx_v9_0_free_microcode(adev); > > amdgpu_gfx_sysfs_isolation_shader_fini(adev); > + amdgpu_gfx_sysfs_reset_mask_fini(adev); > > kfree(adev->gfx.ip_dump_core); > kfree(adev->gfx.ip_dump_compute_queues); > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > index 016290f00592..87cfd77e2fb4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > @@ -1175,6 +1175,10 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block) > if (r) > return 
r; > > + r = amdgpu_gfx_sysfs_reset_mask_init(adev); > + if (r) > + return r; > + > return 0; > } > > @@ -1200,6 +1204,7 @@ static int gfx_v9_4_3_sw_fini(struct amdgpu_ip_block *ip_block) > gfx_v9_4_3_free_microcode(adev); > amdgpu_gfx_sysfs_fini(adev); > amdgpu_gfx_sysfs_isolation_shader_fini(adev); > + amdgpu_gfx_sysfs_reset_mask_fini(adev); > > kfree(adev->gfx.ip_dump_core); > kfree(adev->gfx.ip_dump_compute_queues);