Am 2021-06-15 um 11:32 a.m. schrieb Felix Kuehling: > [+Amber, moving amd-gfx to BCC] Actually didn't move it to BCC. But let's not name that NPI branch in public. ;) Thanks, Felix > > Amber worked on a related problem on an NPI branch recently in the > nocpsch version of this code. We should port that fix to > amd-staging-drm-next. Then let's come up with a common solution for the > cpsch code path as well. > > See one comment inline. > > > Am 2021-06-15 um 4:02 a.m. schrieb xinhui pan: >> We call free_mqd without dqm lock held, which causes double free of >> mqd_mem_obj. Fix it by using a tmp pointer. >> We need to walk through the queues_list with dqm lock held. Otherwise we hit >> list corruption. >> >> Signed-off-by: xinhui pan <xinhui.pan@xxxxxxx> >> --- >> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 17 +++++++++++++---- >> 1 file changed, 13 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c >> index e6366b408420..575c895fc241 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c >> @@ -1484,6 +1484,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, >> struct mqd_manager *mqd_mgr; >> uint64_t sdma_val = 0; >> struct kfd_process_device *pdd = qpd_to_pdd(qpd); >> + void *ptr = NULL; >> >> /* Get the SDMA queue stats */ >> if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) || >> @@ -1543,10 +1544,12 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, >> pr_debug("Total of %d queues are accountable so far\n", >> dqm->total_queue_count); >> >> + swap(ptr, q->mqd_mem_obj); >> dqm_unlock(dqm); >> >> /* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */ >> - mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); >> + if (ptr) >> + mqd_mgr->free_mqd(mqd_mgr, q->mqd, ptr); >> >> return retval; >> >> @@ -1709,6 +1712,7 @@ static int 
process_termination_cpsch(struct device_queue_manager *dqm, >> enum kfd_unmap_queues_filter filter = >> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES; >> bool found = false; >> + void *ptr = NULL; >> >> retval = 0; >> >> @@ -1737,8 +1741,6 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, >> qpd->mapped_gws_queue = false; >> } >> } >> - >> - dqm->total_queue_count--; >> } >> >> /* Unregister process */ >> @@ -1770,13 +1772,20 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, >> /* Lastly, free mqd resources. >> * Do free_mqd() after dqm_unlock to avoid circular locking. >> */ >> + dqm_lock(dqm); >> list_for_each_entry_safe(q, next, &qpd->queues_list, list) { >> mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( >> q->properties.type)]; >> list_del(&q->list); >> qpd->queue_count--; >> - mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); >> + dqm->total_queue_count--; >> + swap(ptr, q->mqd_mem_obj); >> + dqm_unlock(dqm); > This still risks list corruption because the list can be modified while > the lock is dropped. However, you can make it safe by changing the loop > to something like > > dqm_lock(dqm); > while (!list_empty(...)) { > q = list_first_entry(...); > dqm_unlock(dqm); > ... > mqd_mgr->free_mqd(...); > ... > dqm_lock(dqm); > } > dqm_unlock(); > > Regards, > Felix > > >> + if (ptr) >> + mqd_mgr->free_mqd(mqd_mgr, q->mqd, ptr); >> + dqm_lock(dqm); >> } >> + dqm_unlock(dqm); >> >> return retval; >> } > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx