[+Amber, moving amd-gfx to BCC] Amber worked on a related problem on an NPI branch recently in the nocpsch version of this code. We should port that fix to amd-staging-drm-next. Then lets come up with a common solution for the cpsch code path as well. See one comment inline. Am 2021-06-15 um 4:02 a.m. schrieb xinhui pan: > We call free_mqd without dqm lock hold, that causes double free of > mqd_mem_obj. Fix it by using a tmp pointer. > We need walk through the queues_list with dqm lock hold. Otherwise hit > list corruption. > > Signed-off-by: xinhui pan <xinhui.pan@xxxxxxx> > --- > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 17 +++++++++++++---- > 1 file changed, 13 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index e6366b408420..575c895fc241 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -1484,6 +1484,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, > struct mqd_manager *mqd_mgr; > uint64_t sdma_val = 0; > struct kfd_process_device *pdd = qpd_to_pdd(qpd); > + void *ptr = NULL; > > /* Get the SDMA queue stats */ > if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) || > @@ -1543,10 +1544,12 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, > pr_debug("Total of %d queues are accountable so far\n", > dqm->total_queue_count); > > + swap(ptr, q->mqd_mem_obj); > dqm_unlock(dqm); > > /* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */ > - mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); > + if (ptr) > + mqd_mgr->free_mqd(mqd_mgr, q->mqd, ptr); > > return retval; > > @@ -1709,6 +1712,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, > enum kfd_unmap_queues_filter filter = > KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES; > bool found = false; > + void *ptr = NULL; > > retval = 0; > > @@ -1737,8 +1741,6 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, > qpd->mapped_gws_queue = false; > } > } > - > - dqm->total_queue_count--; > } > > /* Unregister process */ > @@ -1770,13 +1772,20 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, > /* Lastly, free mqd resources. > * Do free_mqd() after dqm_unlock to avoid circular locking. > */ > + dqm_lock(dqm); > list_for_each_entry_safe(q, next, &qpd->queues_list, list) { > mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( > q->properties.type)]; > list_del(&q->list); > qpd->queue_count--; > - mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); > + dqm->total_queue_count--; > + swap(ptr, q->mqd_mem_obj); > + dqm_unlock(dqm); This still risks list corruption because the list can be modified while the lock is dropped. However you can make it safe by changing the loop to something like dqm_lock(dqm); while (!list_empty(...)) { q = list_first_entry(...); dqm_unlock(dqm); ... mqd_mgr->free_mqd(...); ... dqm_lock(dqm); } dqm_unlock(); Regards, Felix > + if (ptr) > + mqd_mgr->free_mqd(mqd_mgr, q->mqd, ptr); > + dqm_lock(dqm); > } > + dqm_unlock(dqm); > > return retval; > } _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx