On 2019-01-07 11:33 a.m., Grodzovsky, Andrey wrote: > > > On 01/07/2019 11:16 AM, Liu, Shaoyun wrote: >> I think it's reasonable to use the hive specific lock for hive specific functions. >> The changes is acked-by Shaoyun.liu < Shaoyun.liu@xxxxxxx> >> >> -----Original Message----- >> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of StDenis, Tom >> Sent: Monday, January 7, 2019 10:16 AM >> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx >> Cc: StDenis, Tom <Tom.StDenis@xxxxxxx> >> Subject: [PATCH] add missing mutex lock to amdgpu_get_xgmi_hive() (v2) >> >> v2: Move locks around in other functions so that this function can stand on its own. Also only hold the hive specific lock for add/remove device instead of the driver global lock so you can't add/remove devices in parallel from one hive. >> >> Signed-off-by: Tom St Denis <tom.stdenis@xxxxxxx> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- >> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 36 ++++++++++++++-------- >> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 +- >> 3 files changed, 25 insertions(+), 15 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 39d5d058b2c7..13d8e2ad2f7a 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -3525,7 +3525,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, >> * by different nodes. No point also since the one node already executing >> * reset will also reset all the other nodes in the hive. >> */ >> - hive = amdgpu_get_xgmi_hive(adev); >> + hive = amdgpu_get_xgmi_hive(adev, 0); >> if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && >> !mutex_trylock(&hive->hive_lock)) >> return 0; > > Let's say i have device 0 in hive A and it just got a gpu reset and at > the same time device 1 is being added to same have though > amdgpu_xgmi_add_device, hive->hive_lock is acquired by this new device > being added and so gpu reset for device 0 bails out on > '!mutex_trylock(&hive->hive_lock))' without completing the reset. > Also in general i feel a bit uncomfortable about the confusing > acquisition scheme in the function and the fact that you take the > hive->hive_lock inside amdgpu_get_xgmi_hive but release is still outside > of the function. Is adding a device while resetting a device even a valid operation anyways? I think this means more so that the reset logic is broken. Instead there should be a per-hive reset lock that is taken and that is tested instead. Tom > > Andrey > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >> index 8a8bc60cb6b4..ebf50809485f 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c >> @@ -40,26 +40,39 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive) >> return &hive->device_list; >> } >> >> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) >> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device >> +*adev, int lock) >> { >> int i; >> struct amdgpu_hive_info *tmp; >> >> if (!adev->gmc.xgmi.hive_id) >> return NULL; >> + >> + mutex_lock(&xgmi_mutex); >> + >> for (i = 0 ; i < hive_count; ++i) { >> tmp = &xgmi_hives[i]; >> - if (tmp->hive_id == adev->gmc.xgmi.hive_id) >> + if (tmp->hive_id == adev->gmc.xgmi.hive_id) { >> + if (lock) >> + mutex_lock(&tmp->hive_lock); >> + mutex_unlock(&xgmi_mutex); >> return tmp; >> + } >> } >> - if (i >= AMDGPU_MAX_XGMI_HIVE) >> + if (i >= AMDGPU_MAX_XGMI_HIVE) { >> + mutex_unlock(&xgmi_mutex); >> return NULL; >> + } >> >> /* initialize new hive if not exist */ >> tmp = &xgmi_hives[hive_count++]; >> tmp->hive_id = adev->gmc.xgmi.hive_id; >> INIT_LIST_HEAD(&tmp->device_list); >> mutex_init(&tmp->hive_lock); >> + if (lock) >> + mutex_lock(&tmp->hive_lock); >> + >> + mutex_unlock(&xgmi_mutex); >> >> return tmp; >> } >> @@ -111,8 +124,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) >> return ret; >> } >> >> - mutex_lock(&xgmi_mutex); >> - hive = amdgpu_get_xgmi_hive(adev); >> + /* find hive and take lock */ >> + hive = amdgpu_get_xgmi_hive(adev, 1); >> if (!hive) >> goto exit; >> >> @@ -142,8 +155,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) >> break; >> } >> >> + mutex_unlock(&hive->hive_lock); >> exit: >> - mutex_unlock(&xgmi_mutex); >> return ret; >> } >> >> @@ -154,15 +167,12 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev) >> if (!adev->gmc.xgmi.supported) >> return; >> >> - mutex_lock(&xgmi_mutex); >> - >> - hive = amdgpu_get_xgmi_hive(adev); >> + hive = amdgpu_get_xgmi_hive(adev, 1); >> if (!hive) >> - goto exit; >> + return; >> >> if (!(hive->number_devices--)) >> mutex_destroy(&hive->hive_lock); >> - >> -exit: >> - mutex_unlock(&xgmi_mutex); >> + else >> + mutex_unlock(&hive->hive_lock); >> } >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h >> index 6151eb9c8ad3..8d7984844174 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h >> @@ -32,7 +32,7 @@ struct amdgpu_hive_info { >> struct mutex hive_lock; >> }; >> >> -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev); >> +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device >> +*adev, int lock); >> int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev); int amdgpu_xgmi_add_device(struct amdgpu_device *adev); void amdgpu_xgmi_remove_device(struct amdgpu_device *adev); >> -- >> 2.17.2 >> >> _______________________________________________ >> amd-gfx mailing list >> amd-gfx@xxxxxxxxxxxxxxxxxxxxx >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx >> _______________________________________________ >> amd-gfx mailing list >> amd-gfx@xxxxxxxxxxxxxxxxxxxxx >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx > _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx