Add queue remapping to ensure that any waves executing the PC sampling
part of the trap handler are done before kfd_pc_sample_stop returns,
and that no new waves enter that part of the trap handler afterwards.
This avoids race conditions that could lead to use-after-free.
Unmapping and remapping the queues either waits for the waves to drain,
or preempts them with CWSR, which itself executes a trap and waits for
previous traps to finish.

Signed-off-by: James Zhu <James.Zhu@xxxxxxx>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 11 +++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  5 +++++
 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c          |  4 +++-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c0e71543389a..a3f57be63f4f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -3155,6 +3155,17 @@ int debug_refresh_runlist(struct device_queue_manager *dqm)
 	return debug_map_and_unlock(dqm);
 }
 
+void remap_queue(struct device_queue_manager *dqm,
+		enum kfd_unmap_queues_filter filter,
+		uint32_t filter_param,
+		uint32_t grace_period)
+{
+	dqm_lock(dqm);
+	if (!dqm->dev->kfd->shared_resources.enable_mes)
+		execute_queues_cpsch(dqm, filter, filter_param, grace_period);
+	dqm_unlock(dqm);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index cf7e182588f8..f8aae3747a36 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -303,6 +303,11 @@ int debug_lock_and_unmap(struct device_queue_manager *dqm);
 int debug_map_and_unlock(struct device_queue_manager *dqm);
 int debug_refresh_runlist(struct device_queue_manager *dqm);
 
+void remap_queue(struct device_queue_manager *dqm,
+		enum kfd_unmap_queues_filter filter,
+		uint32_t filter_param,
+		uint32_t grace_period);
+
 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
 	return (pdd->lds_base >> 16) & 0xFF;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index 53e44e68408e..df2f4bfd0cda 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -24,6 +24,7 @@
 #include "kfd_priv.h"
 #include "amdgpu_amdkfd.h"
 #include "kfd_pc_sampling.h"
+#include "kfd_device_queue_manager.h"
 
 struct supported_pc_sample_info {
 	uint32_t ip_version;
@@ -115,9 +116,10 @@ static int kfd_pc_sample_stop(struct kfd_process_device *pdd,
 
 	kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
 		pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
+	remap_queue(pdd->dev->dqm,
+		KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
 
 	if (pc_sampling_stop) {
-
 		mutex_lock(&pdd->dev->pcs_data.mutex);
 		pdd->dev->pcs_data.hosttrap_entry.base.target_simd = 0;
 		pdd->dev->pcs_data.hosttrap_entry.base.target_wave_slot = 0;
-- 
2.25.1