[AMD Official Use Only - AMD Internal Distribution Only] Hi Lijo, -----Original Message----- From: Lazar, Lijo <Lijo.Lazar@xxxxxxx> Sent: Tuesday, October 29, 2024 3:58 PM To: Zhang, Jesse(Jie) <Jesse.Zhang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>; Huang, Tim <Tim.Huang@xxxxxxx> Subject: Re: [PATCH 1/5 V4 1/5] drm/amdgpu: Add sysfs interface for gc reset mask On 10/29/2024 12:44 PM, Jesse.zhang@xxxxxxx wrote: > From: "Jesse.zhang@xxxxxxx" <Jesse.zhang@xxxxxxx> > > Add two sysfs interfaces for gfx and compute: > gfx_reset_mask > compute_reset_mask > > These interfaces are read-only and show the resets supported by the IP. > For example, full adapter reset (mode1/mode2/BACO/etc), soft reset, > queue reset, and pipe reset. > > V2: the sysfs node returns a text string instead of some flags > (Christian) > v3: add a generic helper which takes the ring as parameter > and print the strings in the order they are applied (Christian) > > check amdgpu_gpu_recovery before creating sysfs file itself, > and initialize supported_reset_types in IP version files (Lijo) > v4: Fixing uninitialized variables (Tim) > > Signed-off-by: Jesse Zhang <Jesse.Zhang@xxxxxxx> Suggested-by:Alex > Deucher <alexander.deucher@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 37 ++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 66 ++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 4 ++ > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 9 +++ > drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 23 ++++++++ > drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 10 ++++ > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 ++++ > drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 17 ++++++ > 9 files changed, 184 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 48c9b9b06905..aea1031d7b84 100644 > --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -300,6 +300,12 @@ extern int amdgpu_wbrf; > #define AMDGPU_RESET_VCE (1 << 13) > #define AMDGPU_RESET_VCE1 (1 << 14) > > +/* reset mask */ > +#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, > +mode1/mode2/BACO/etc. */ #define AMDGPU_RESET_TYPE_SOFT_RESET (1 << > +1) /* IP level soft reset */ #define AMDGPU_RESET_TYPE_PER_QUEUE (1 > +<< 2) /* per queue */ #define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* > +per pipe */ > + > /* max cursor sizes (in pixels) */ > #define CIK_CURSOR_WIDTH 128 > #define CIK_CURSOR_HEIGHT 128 > @@ -1466,6 +1472,8 @@ struct dma_fence *amdgpu_device_get_gang(struct > amdgpu_device *adev); struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, > struct dma_fence *gang); > bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev); > +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring); > +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset); > > /* atpx handler */ > #if defined(CONFIG_VGA_SWITCHEROO) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index ef715b2bbcdb..cd1e3f018893 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -6684,3 +6684,40 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, > } > return ret; > } > + > +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) { > + ssize_t size = 0; > + > + if (!ring) > + return size; > + > + if (amdgpu_device_should_recover_gpu(ring->adev)) > + size |= AMDGPU_RESET_TYPE_FULL; > + > + if (unlikely(!ring->adev->debug_disable_soft_recovery) && > + !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) > + size |= AMDGPU_RESET_TYPE_SOFT_RESET; > + > + return size; > +} > + > +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) { > + ssize_t size = 0; > + > + if 
(supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) > + size += sysfs_emit_at(buf, size, "soft "); > + > + if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) > + size += sysfs_emit_at(buf, size, "queue "); > + > + if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) > + size += sysfs_emit_at(buf, size, "pipe "); > + > + if (supported_reset & AMDGPU_RESET_TYPE_FULL) > + size += sysfs_emit_at(buf, size, "full "); > + > + size += sysfs_emit_at(buf, size, "\n"); Is there an expectation of having "Unsupported" when no reset is supported (supported_reset == 0)? Yes, will add it. Thanks, Jesse Thanks, Lijo > + return size; > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > index e96984c53e72..6de1f3bf6863 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > @@ -1588,6 +1588,32 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, > return count; > } > > +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev, > + struct device_attribute *attr, > + char *buf) > +{ > + struct drm_device *ddev = dev_get_drvdata(dev); > + struct amdgpu_device *adev = drm_to_adev(ddev); > + > + if (!adev) > + return -ENODEV; > + > + return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset); } > + > +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev, > + struct device_attribute *attr, > + char *buf) > +{ > + struct drm_device *ddev = dev_get_drvdata(dev); > + struct amdgpu_device *adev = drm_to_adev(ddev); > + > + if (!adev) > + return -ENODEV; > + > + return amdgpu_show_reset_mask(buf, > +adev->gfx.compute_supported_reset); > +} > + > static DEVICE_ATTR(run_cleaner_shader, 0200, > NULL, amdgpu_gfx_set_run_cleaner_shader); > > @@ -1602,6 +1628,12 @@ static DEVICE_ATTR(current_compute_partition, > 0644, static DEVICE_ATTR(available_compute_partition, 0444, > amdgpu_gfx_get_available_compute_partition, NULL); > > +static 
DEVICE_ATTR(gfx_reset_mask, 0444, > + amdgpu_gfx_get_gfx_reset_mask, NULL); > + > +static DEVICE_ATTR(compute_reset_mask, 0444, > + amdgpu_gfx_get_compute_reset_mask, NULL); > + > int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) { > struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; @@ -1702,6 +1734,40 > @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev, > cleaner_shader_size); > } > > +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev) { > + int r = 0; > + > + if (!amdgpu_gpu_recovery) > + return r; > + > + if (adev->gfx.num_gfx_rings) { > + r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask); > + if (r) > + return r; > + } > + > + if (adev->gfx.num_compute_rings) { > + r = device_create_file(adev->dev, &dev_attr_compute_reset_mask); > + if (r) > + return r; > + } > + > + return r; > +} > + > +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev) { > + if (!amdgpu_gpu_recovery) > + return; > + > + if (adev->gfx.num_gfx_rings) > + device_remove_file(adev->dev, &dev_attr_gfx_reset_mask); > + > + if (adev->gfx.num_compute_rings) > + device_remove_file(adev->dev, &dev_attr_compute_reset_mask); } > + > /** > * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver) > * @adev: amdgpu_device pointer > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > index f710178a21bc..fb0e1adf6766 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > @@ -424,6 +424,8 @@ struct amdgpu_gfx { > /* reset mask */ > uint32_t grbm_soft_reset; > uint32_t srbm_soft_reset; > + uint32_t gfx_supported_reset; > + uint32_t compute_supported_reset; > > /* gfx off */ > bool gfx_off_state; /* true: enabled, false: disabled */ > @@ -582,6 +584,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct > amdgpu_device *adev); void > amdgpu_gfx_enforce_isolation_handler(struct work_struct *work); void > 
amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring); > void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring > *ring); > +int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev); > +void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev); > > static inline const char *amdgpu_gfx_compute_mode_desc(int mode) { > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index 9da95b25e158..e2b2cdab423b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -4825,6 +4825,11 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block) > } > } > } > + /* TODO: Add queue reset mask when FW fully supports it */ > + adev->gfx.gfx_supported_reset = > + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); > + adev->gfx.compute_supported_reset = > + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); > > r = amdgpu_gfx_kiq_init(adev, GFX10_MEC_HPD_SIZE, 0); > if (r) { > @@ -4854,6 +4859,9 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block) > gfx_v10_0_alloc_ip_dump(adev); > > r = amdgpu_gfx_sysfs_isolation_shader_init(adev); > + if (r) > + return r; > + r = amdgpu_gfx_sysfs_reset_mask_init(adev); > if (r) > return r; > return 0; > @@ -4896,6 +4904,7 @@ static int gfx_v10_0_sw_fini(struct amdgpu_ip_block *ip_block) > amdgpu_gfx_kiq_fini(adev, 0); > > amdgpu_gfx_cleaner_shader_sw_fini(adev); > + amdgpu_gfx_sysfs_reset_mask_fini(adev); > > gfx_v10_0_pfp_fini(adev); > gfx_v10_0_ce_fini(adev); > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > index 5aff8f72de9c..ec24e8d019b3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > @@ -1683,6 +1683,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) > } > } > > + adev->gfx.gfx_supported_reset = > + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); 
> + adev->gfx.compute_supported_reset = > + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); > + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { > + case IP_VERSION(11, 0, 0): > + case IP_VERSION(11, 0, 2): > + case IP_VERSION(11, 0, 3): > + if ((adev->gfx.me_fw_version >= 2280) && > + (adev->gfx.mec_fw_version >= 2410)) { > + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; > + adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; > + } > + break; > + default: > + break; > + } > + > if (!adev->enable_mes_kiq) { > r = amdgpu_gfx_kiq_init(adev, GFX11_MEC_HPD_SIZE, 0); > if (r) { > @@ -1721,6 +1739,10 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) > if (r) > return r; > > + r = amdgpu_gfx_sysfs_reset_mask_init (adev); > + if (r) > + return r; > + > return 0; > } > > @@ -1783,6 +1805,7 @@ static int gfx_v11_0_sw_fini(struct amdgpu_ip_block *ip_block) > gfx_v11_0_free_microcode(adev); > > amdgpu_gfx_sysfs_isolation_shader_fini(adev); > + amdgpu_gfx_sysfs_reset_mask_fini(adev); > > kfree(adev->gfx.ip_dump_core); > kfree(adev->gfx.ip_dump_compute_queues); > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > index 9fec28d8a5fc..f5ffa2d8b22a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c > @@ -1437,6 +1437,12 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) > } > } > > + /* TODO: Add queue reset mask when FW fully supports it */ > + adev->gfx.gfx_supported_reset = > + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); > + adev->gfx.compute_supported_reset = > + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); > + > if (!adev->enable_mes_kiq) { > r = amdgpu_gfx_kiq_init(adev, GFX12_MEC_HPD_SIZE, 0); > if (r) { > @@ -1467,6 +1473,9 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) > gfx_v12_0_alloc_ip_dump(adev); > > r = amdgpu_gfx_sysfs_isolation_shader_init(adev); > + if 
(r) > + return r; > + r = amdgpu_gfx_sysfs_reset_mask_init(adev); > if (r) > return r; > > @@ -1530,6 +1539,7 @@ static int gfx_v12_0_sw_fini(struct amdgpu_ip_block *ip_block) > gfx_v12_0_free_microcode(adev); > > amdgpu_gfx_sysfs_isolation_shader_fini(adev); > + amdgpu_gfx_sysfs_reset_mask_fini(adev); > > kfree(adev->gfx.ip_dump_core); > kfree(adev->gfx.ip_dump_compute_queues); > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > index b4c4b9916289..94007a9ed54b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > @@ -2362,6 +2362,12 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block) > } > } > > + /* TODO: Add queue reset mask when FW fully supports it */ > + adev->gfx.gfx_supported_reset = > + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); > + adev->gfx.compute_supported_reset = > + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); > + > r = amdgpu_gfx_kiq_init(adev, GFX9_MEC_HPD_SIZE, 0); > if (r) { > DRM_ERROR("Failed to init KIQ BOs!\n"); @@ -2391,6 +2397,9 @@ > static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block) > gfx_v9_0_alloc_ip_dump(adev); > > r = amdgpu_gfx_sysfs_isolation_shader_init(adev); > + if (r) > + return r; > + r = amdgpu_gfx_sysfs_reset_mask_init(adev); > if (r) > return r; > > @@ -2419,6 +2428,7 @@ static int gfx_v9_0_sw_fini(struct amdgpu_ip_block *ip_block) > amdgpu_gfx_kiq_fini(adev, 0); > > amdgpu_gfx_cleaner_shader_sw_fini(adev); > + amdgpu_gfx_sysfs_reset_mask_fini(adev); > > gfx_v9_0_mec_fini(adev); > amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj, > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > index 016290f00592..028fda13ac50 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c > @@ -1157,6 +1157,19 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block) > return r; > } > > + 
adev->gfx.compute_supported_reset = > + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); > + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { > + case IP_VERSION(9, 4, 3): > + case IP_VERSION(9, 4, 4): > + if (adev->gfx.mec_fw_version >= 155) { > + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; > + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_PIPE; > + } > + break; > + default: > + break; > + } > r = gfx_v9_4_3_gpu_early_init(adev); > if (r) > return r; > @@ -1175,6 +1188,9 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block) > if (r) > return r; > > + r = amdgpu_gfx_sysfs_reset_mask_init(adev); > + if (r) > + return r; > return 0; > } > > @@ -1200,6 +1216,7 @@ static int gfx_v9_4_3_sw_fini(struct amdgpu_ip_block *ip_block) > gfx_v9_4_3_free_microcode(adev); > amdgpu_gfx_sysfs_fini(adev); > amdgpu_gfx_sysfs_isolation_shader_fini(adev); > + amdgpu_gfx_sysfs_reset_mask_fini(adev); > > kfree(adev->gfx.ip_dump_core); > kfree(adev->gfx.ip_dump_compute_queues);