On 2024-05-06 16:30, Felix Kuehling wrote: > > On 2024-05-03 18:06, Harish Kasiviswanathan wrote: >> gpu_id needs to be unique for user space to identify GPUs via KFD >> interface. In the current implementation there is a very small >> probability of having non unique gpu_ids. >> >> v2: Add check to confirm if gpu_id is unique. If not unique, find one >> Changed commit header to reflect the above >> >> Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@xxxxxxx> >> --- >> drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 26 ++++++++++++++++++++++- >> 1 file changed, 25 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c >> index b93913934b03..01d4c2e10c6d 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c >> @@ -1095,6 +1095,8 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) >> uint32_t hashout; >> uint32_t buf[8]; >> uint64_t local_mem_size; >> + struct kfd_topology_device *dev; >> + bool is_unique; >> int i; >> if (!gpu) >> @@ -1115,6 +1117,28 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu) >> for (i = 0, hashout = 0; i < 8; i++) >> hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH); >> + /* hash generated could be non-unique. Check if it is unique. >> + * If not unique increment till unique one is found. In case >> + * of overflow, restart from 1 >> + */ >> + down_read(&topology_lock); >> + do { >> + is_unique = true; >> + list_for_each_entry(dev, &topology_device_list, list) { >> + if (dev->gpu && dev->gpu_id == hashout) { >> + is_unique = false; >> + break; >> + } >> + } >> + if (unlikely(!is_unique)) { >> + hashout = (hashout + 1) & >> + ((1 << KFD_GPU_ID_HASH_WIDTH) - 1); >> + if (!hashout) >> + hashout = 1; > > This doesn't catch the case that hashout was 0 before incrementing it, and was found to be unique. I didn't actively think about this case when I sent the patch out. However, we don't have gpu_id to be 0. There are places where gpu_id=0 means it is CPU node > > Regards, > Felix > > >> + } >> + } while (!is_unique); >> + up_read(&topology_lock); >> + >> return hashout; >> } >> /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If >> @@ -1946,7 +1970,6 @@ int kfd_topology_add_device(struct kfd_node *gpu) >> struct amdgpu_gfx_config *gfx_info = &gpu->adev->gfx.config; >> struct amdgpu_cu_info *cu_info = &gpu->adev->gfx.cu_info; >> - gpu_id = kfd_generate_gpu_id(gpu); >> if (gpu->xcp && !gpu->xcp->ddev) { >> dev_warn(gpu->adev->dev, >> "Won't add GPU to topology since it has no drm node assigned."); >> @@ -1969,6 +1992,7 @@ int kfd_topology_add_device(struct kfd_node *gpu) >> if (res) >> return res; >> + gpu_id = kfd_generate_gpu_id(gpu); >> dev->gpu_id = gpu_id; >> gpu->id = gpu_id; >>