Hi Felix, I just tested your patch. It works fine on my test set with the following little fix. Regards, Ma Jun diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 7ea3ec1e9e75..7d6fbfbfeb79 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1954,9 +1954,11 @@ static int kfd_topology_add_device_locked(struct kfd_dev *gpu, uint32_t gpu_id, pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", gpu_id); topology_crat_proximity_domain--; - return res; + goto err; } + INIT_LIST_HEAD(&temp_topology_device_list); + res = kfd_parse_crat_table(crat_image, &temp_topology_device_list, proximity_domain); @@ -1964,15 +1966,17 @@ static int kfd_topology_add_device_locked(struct kfd_dev *gpu, uint32_t gpu_id, pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", gpu_id); topology_crat_proximity_domain--; - return res; + goto err; } kfd_topology_update_device_list(&temp_topology_device_list, &topology_device_list); *dev = kfd_assign_gpu(gpu); - if (WARN_ON(!*dev)) - return -ENODEV; + if (WARN_ON(!*dev)) { + res = -ENODEV; + goto err; + } /* Fill the cache affinity information here for the GPUs * using VCRAT @@ -1989,6 +1993,8 @@ static int kfd_topology_add_device_locked(struct kfd_dev *gpu, uint32_t gpu_id, pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n", gpu_id, res); +err: + kfd_destroy_crat_image(crat_image); return res; } @@ -2001,8 +2007,6 @@ int kfd_topology_add_device(struct kfd_dev *gpu) int i; const char *asic_name = amdgpu_asic_name[gpu->adev->asic_type]; - INIT_LIST_HEAD(&temp_topology_device_list); - gpu_id = kfd_generate_gpu_id(gpu); pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); @@ -2018,7 +2022,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) res = kfd_topology_add_device_locked(gpu, gpu_id, &dev); up_write(&topology_lock); if (res) - goto err; + return res; dev->gpu_id = gpu_id; gpu->id = gpu_id; @@ -2141,8 +2145,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) if (!res) kfd_notify_gpu_change(gpu_id, 1); -err: - kfd_destroy_crat_image(crat_image); + return res; } On 11/17/2022 4:49 AM, Felix Kuehling wrote: > Am 2022-11-16 um 03:04 schrieb Ma Jun: >> Release the topology_lock in error case >> >> Signed-off-by: Ma Jun <Jun.Ma2@xxxxxxx> >> Reported-by: Dan Carpenter <error27@xxxxxxxxx> > Dan, did you change your email address, is this one correct? > > Ma Jun, thanks for looking into this. Some of this problem predates your > patch that was flagged by Dan. I would prefer a more consistent and > robust handling of these error cases. I think everything inside the > topology lock could be moved into another function to simplify the error > handling. I'm attaching a completely untested patch to illustrate the idea. > > Regards, > Felix > > >> --- >> drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 3 +++ >> 1 file changed, 3 insertions(+) >> >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c >> index ef9c6fdfb88d..5ea737337658 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c >> @@ -1841,6 +1841,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) >> pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", >> gpu_id); >> topology_crat_proximity_domain--; >> + up_write(&topology_lock); >> return res; >> } >> >> @@ -1851,6 +1852,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) >> pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", >> gpu_id); >> topology_crat_proximity_domain--; >> + up_write(&topology_lock); >> goto err; >> } >> >> @@ -1860,6 +1862,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) >> dev = kfd_assign_gpu(gpu); >> if (WARN_ON(!dev)) { >> res = -ENODEV; >> + up_write(&topology_lock); >> goto err; >> } >>