Am 2022-11-16 um 03:04 schrieb Ma Jun:
Release the topology_lock in error case
Signed-off-by: Ma Jun <Jun.Ma2@xxxxxxx>
Reported-by: Dan Carpenter <error27@xxxxxxxxx>
Dan, did you change your email address, is this one correct?
Ma Jun, thanks for looking into this. Some of this problem predates your
patch that was flagged by Dan. I would prefer a more consistent and
robust handling of these error cases. I think everything inside the
topology lock could be moved into another function to simplify the error
handling. I'm attaching a completely untested patch to illustrate the idea.
Regards,
Felix
---
drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index ef9c6fdfb88d..5ea737337658 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1841,6 +1841,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
gpu_id);
topology_crat_proximity_domain--;
+ up_write(&topology_lock);
return res;
}
@@ -1851,6 +1852,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
gpu_id);
topology_crat_proximity_domain--;
+ up_write(&topology_lock);
goto err;
}
@@ -1860,6 +1862,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
dev = kfd_assign_gpu(gpu);
if (WARN_ON(!dev)) {
res = -ENODEV;
+ up_write(&topology_lock);
goto err;
}
From ceb79972cdd490de181a6895836e40bf4e93c631 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <felix.kuehling@xxxxxxxxx>
Date: Wed, 16 Nov 2022 15:38:44 -0500
Subject: [PATCH] drm/amdkfd: Release the topology_lock in error case
Move the topology-locked part of kfd_topology_add_device into a separate
function to simlpify error handling and release the topology lock
consistently.
Reported-by: Dan Carpenter <error27@xxxxxxxxx>
Signed-off-by: Felix Kuehling <felix.kuehling@xxxxxxxxx>
---
drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 109 ++++++++++++----------
1 file changed, 58 insertions(+), 51 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index ef9c6fdfb88d..b56beab71c5b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1805,16 +1805,66 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct
pr_debug("Added [%d] GPU cache entries\n", num_of_entries);
}
+static int kfd_topology_add_device_locked(struct kfd_dev *gpu, uint32_t gpu_id,
+ struct kfd_topology_device **dev)
+{
+ int proximity_domain = ++topology_crat_proximity_domain;
+ struct list_head temp_topology_device_list;
+ void *crat_image = NULL;
+ size_t image_size = 0;
+ int res;
+
+ res = kfd_create_crat_image_virtual(&crat_image, &image_size,
+ COMPUTE_UNIT_GPU, gpu,
+ proximity_domain);
+ if (res) {
+ pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
+ gpu_id);
+ topology_crat_proximity_domain--;
+ return res;
+ }
+
+ res = kfd_parse_crat_table(crat_image,
+ &temp_topology_device_list,
+ proximity_domain);
+ if (res) {
+ pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
+ gpu_id);
+ topology_crat_proximity_domain--;
+ return res;
+ }
+
+ kfd_topology_update_device_list(&temp_topology_device_list,
+ &topology_device_list);
+
+ *dev = kfd_assign_gpu(gpu);
+ if (WARN_ON(!*dev))
+ return -ENODEV;
+
+ /* Fill the cache affinity information here for the GPUs
+ * using VCRAT
+ */
+ kfd_fill_cache_non_crat_info(*dev, gpu);
+
+ /* Update the SYSFS tree, since we added another topology
+ * device
+ */
+ res = kfd_topology_update_sysfs();
+ if (!res)
+ sys_props.generation_count++;
+ else
+ pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
+ gpu_id, res);
+
+ return res;
+}
+
int kfd_topology_add_device(struct kfd_dev *gpu)
{
uint32_t gpu_id;
struct kfd_topology_device *dev;
struct kfd_cu_info cu_info;
int res = 0;
- struct list_head temp_topology_device_list;
- void *crat_image = NULL;
- size_t image_size = 0;
- int proximity_domain;
int i;
const char *asic_name = amdgpu_asic_name[gpu->adev->asic_type];
@@ -1831,54 +1881,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
*/
down_write(&topology_lock);
dev = kfd_assign_gpu(gpu);
- if (!dev) {
- proximity_domain = ++topology_crat_proximity_domain;
-
- res = kfd_create_crat_image_virtual(&crat_image, &image_size,
- COMPUTE_UNIT_GPU, gpu,
- proximity_domain);
- if (res) {
- pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
- gpu_id);
- topology_crat_proximity_domain--;
- return res;
- }
-
- res = kfd_parse_crat_table(crat_image,
- &temp_topology_device_list,
- proximity_domain);
- if (res) {
- pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
- gpu_id);
- topology_crat_proximity_domain--;
- goto err;
- }
-
- kfd_topology_update_device_list(&temp_topology_device_list,
- &topology_device_list);
-
- dev = kfd_assign_gpu(gpu);
- if (WARN_ON(!dev)) {
- res = -ENODEV;
- goto err;
- }
-
- /* Fill the cache affinity information here for the GPUs
- * using VCRAT
- */
- kfd_fill_cache_non_crat_info(dev, gpu);
-
- /* Update the SYSFS tree, since we added another topology
- * device
- */
- res = kfd_topology_update_sysfs();
- if (!res)
- sys_props.generation_count++;
- else
- pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
- gpu_id, res);
- }
+ if (!dev)
+ res = kfd_topology_add_device_locked(gpu, gpu_id, &dev);
up_write(&topology_lock);
+ if (res)
+ goto err;
dev->gpu_id = gpu_id;
gpu->id = gpu_id;
--
2.34.1