OK -----Original Message----- From: Kuehling, Felix <Felix.Kuehling@xxxxxxx> Sent: Tuesday, February 1, 2022 12:24 AM To: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Clements, John <John.Clements@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx> Subject: Re: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop Am 2022-01-29 um 22:19 schrieb Zhou1, Tao: > [AMD Official Use Only] > > > >> -----Original Message----- >> From: Chai, Thomas <YiPeng.Chai@xxxxxxx> >> Sent: Saturday, January 29, 2022 8:34 PM >> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx >> Cc: Chai, Thomas <YiPeng.Chai@xxxxxxx>; Zhang, Hawking >> <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Clements, >> John <John.Clements@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx> >> Subject: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused >> by infinite loop >> >> 1. The infinite loop case only occurs on multiple cards support >> ras functions. >> 2. The explanation of root cause refer to 76641cbbf196523b5752c6cf68f86. >> 3. Create new node to manage each unique ras instance to guarantee >> each device .ras_list is completely independent. >> 4. Fixes:7a6b8ab3231b511915cb94cac1debabf093. >> 5. The soft locked logs are as follows: >> [ 262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G OE >> 5.13.0-27-generic #29~20.04.1-Ubuntu >> [ 262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU, >> BIOS T20200717143848 07/17/2020 [ 262.165698] Workqueue: events >> amdgpu_ras_do_recovery [amdgpu] [ 262.165980] RIP: >> 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu] [ 262.166239] Code: >> 68 >> d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 >> 89 e6 4c 89 ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 >> 4d 89 f5 48 83 e8 28 48 >> 39 d3 74 25 49 89 c6 49 8b 45 [ 262.166243] RSP: >> 0018:ffffac908fa87d80 >> EFLAGS: 00000202 [ 262.166247] RAX: ffffffffc1394248 RBX: >> ffff91e4ab8d6e20 >> RCX: ffffffffc1394248 [ 262.166249] RDX: ffff91e4aa356e20 RSI: >> 000000000000000e RDI: ffff91e4ab8c0000 [ 262.166252] RBP: >> ffffac908fa87da8 R08: 0000000000000007 R09: 0000000000000001 [ >> 262.166254] R10: ffff91e4930b64ec R11: 0000000000000000 R12: >> 000000000000000e [ 262.166256] R13: ffff91e4aa356df8 R14: >> ffffffffc1394320 >> R15: 0000000000000003 [ 262.166258] FS: 0000000000000000(0000) >> GS:ffff92238fb40000(0000) knlGS:0000000000000000 [ 262.166261] CS: >> 0010 >> DS: 0000 ES: 0000 CR0: 0000000080050033 [ 262.166264] CR2: >> 00000001004865d0 CR3: 000000406d796000 CR4: 0000000000350ee0 [ >> 262.166267] Call Trace: >> [ 262.166272] amdgpu_ras_do_recovery+0x130/0x290 [amdgpu] [ >> 262.166529] ? psi_task_switch+0xd2/0x250 [ 262.166537] ? >> __switch_to+0x11d/0x460 [ 262.166542] ? __switch_to_asm+0x36/0x70 [ >> 262.166549] process_one_work+0x220/0x3c0 [ 262.166556] >> worker_thread+0x4d/0x3f0 [ 262.166560] ? >> process_one_work+0x3c0/0x3c0 [ 262.166563] kthread+0x12b/0x150 [ 262.166568] ? >> set_kthread_struct+0x40/0x40 [ 262.166571] ret_from_fork+0x22/0x30 >> >> Signed-off-by: yipechai <YiPeng.Chai@xxxxxxx> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 >> ++++++++++++++++++++++-- >> - drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 -- >> 2 files changed, 33 insertions(+), 7 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> index 9d7c778c1a2d..b0aa67308c31 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> @@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = { >> "mca_iohc", >> }; >> >> +struct amdgpu_ras_block_list { >> + /* ras block link */ >> + struct list_head node; >> + >> + struct amdgpu_ras_block_object *ras_obj; }; >> + >> const char *get_ras_block_str(struct ras_common_if *ras_block) { >> if (!ras_block) >> @@ -880,7 +887,8 @@ static struct amdgpu_ras_block_object >> *amdgpu_ras_get_ras_block(struct amdgpu_de >> enum amdgpu_ras_block block, >> uint32_t sub_block_index) { >> int loop_cnt = 0; >> - struct amdgpu_ras_block_object *obj, *tmp; >> + struct amdgpu_ras_block_list *node, *tmp; >> + struct amdgpu_ras_block_object *obj; >> >> if (block >= AMDGPU_RAS_BLOCK__LAST) >> return NULL; >> @@ -888,7 +896,13 @@ static struct amdgpu_ras_block_object >> *amdgpu_ras_get_ras_block(struct amdgpu_de >> if (!amdgpu_ras_is_supported(adev, block)) >> return NULL; >> >> - list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) { >> + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { >> + if (!node->ras_obj) { >> + DRM_ERROR("Warning: abnormal ras list node"); > [Tao]: dev_warn is recommended. > >> + continue; >> + } >> + >> + obj = node->ras_obj; >> if (obj->ras_block_match) { >> if (obj->ras_block_match(obj, block, sub_block_index) == 0) >> return obj; >> @@ -2527,6 +2541,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device >> *adev) >> >> int amdgpu_ras_fini(struct amdgpu_device *adev) { >> + struct amdgpu_ras_block_list *ras_node, *tmp; >> struct amdgpu_ras *con = amdgpu_ras_get_context(adev); >> >> if (!adev->ras_enabled || !con) >> @@ -2545,6 +2560,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) >> amdgpu_ras_set_context(adev, NULL); >> kfree(con); >> >> + /* Clear ras blocks from ras_list and free ras block list node */ >> + list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { >> + list_del(&ras_node->node); >> + kfree(ras_node); >> + } >> + >> return 0; >> } >> >> @@ -2754,14 +2775,22 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device >> *adev) int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, >> struct amdgpu_ras_block_object *ras_block_obj) { >> + struct amdgpu_ras_block_list *ras_node; >> if (!adev || !ras_block_obj) >> return -EINVAL; >> >> if (!amdgpu_ras_asic_supported(adev)) >> return 0; >> >> - INIT_LIST_HEAD(&ras_block_obj->node); >> - list_add_tail(&ras_block_obj->node, &adev->ras_list); >> + ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL); >> + if (!ras_node) { >> + DRM_ERROR("Failed to allocate ras_node"); > [Tao] dev_err is better. You should not print error messages after failed memory allocation. Out-of-memory errors already produce noisy log messages of their own. checkpatch.pl warns about such redundant error messages with "Possible unnecessary 'out of memory' message". Regards, Felix > >> + return -EINVAL; > [Tao]: how about return -ENOMEM here? > >> + } >> + >> + INIT_LIST_HEAD(&ras_node->node); >> + ras_node->ras_obj = ras_block_obj; >> + list_add_tail(&ras_node->node, &adev->ras_list); >> >> return 0; >> } >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> index a51a281bd91a..a55743b12d57 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> @@ -493,9 +493,6 @@ struct amdgpu_ras_block_object { >> >> uint32_t sub_block_index; >> >> - /* ras block link */ >> - struct list_head node; >> - >> int (*ras_block_match)(struct amdgpu_ras_block_object *block_obj, >> enum amdgpu_ras_block block, uint32_t sub_block_index); >> int (*ras_late_init)(struct amdgpu_device *adev, void *ras_info); >> -- >> 2.25.1