On Fri, Jul 26, 2024 at 11:31 AM Jonathan Kim <Jonathan.Kim@xxxxxxx> wrote: > > Support per-queue reset for GFX9. The recommendation is for the driver > to target reset the HW queue via a SPI MMIO register write. > > Since this requires pipe and HW queue info and MEC FW is limited to > doorbell reports of hung queues after an unmap failure, scan the HW > queue slots defined by SET_RESOURCES first to identify the user queue > candidates to reset. > > Only signal reset events to processes that have had a queue reset. > > If queue reset fails, fall back to GPU reset. > > v2: move reset queue flag for house keeping to process device. > split detect and reset into separate functions. > make reset call safe during power saving modes. > clean up some other nitpicks. > > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> > --- > .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 2 + > .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 4 +- > .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c | 4 +- > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 16 ++ > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h | 9 + > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c | 4 +- > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c | 18 +- > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 85 +++++++++ > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 9 + > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 172 +++++++++++++++++- > .../drm/amd/amdkfd/kfd_device_queue_manager.h | 12 ++ > drivers/gpu/drm/amd/amdkfd/kfd_events.c | 21 +++ > .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 6 +- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 + > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 + > .../gpu/drm/amd/include/kgd_kfd_interface.h | 6 + > 16 files changed, 360 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c > index aff08321e976..8dfdb18197c4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c > @@ -191,4 +191,6 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = { > .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times, > .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info, > .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, > + .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, > + .hqd_reset = kgd_gfx_v9_hqd_reset, > }; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > index 3a3f3ce09f00..017e8a3013aa 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > @@ -418,5 +418,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = { > .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times, > .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info, > .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, > - .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings > + .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, > + .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, > + .hqd_reset = kgd_gfx_v9_hqd_reset > }; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > index a5c7259cf2a3..e2ae714a700f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > @@ -541,5 +541,7 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = { > kgd_gfx_v9_4_3_set_wave_launch_trap_override, > .set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode, > .set_address_watch = kgd_gfx_v9_4_3_set_address_watch, > - .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch > + .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch, > + .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, > + .hqd_reset = kgd_gfx_v9_hqd_reset > }; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c > index 3ab6c3aa0ad1..62176d607bef 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c > @@ -1070,6 +1070,20 @@ static void program_trap_handler_settings(struct amdgpu_device *adev, > unlock_srbm(adev); > } > > +uint64_t kgd_gfx_v10_hqd_get_pq_addr(struct amdgpu_device *adev, > + uint32_t pipe_id, uint32_t queue_id, > + uint32_t inst) > +{ > + return 0; > +} > + > +uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev, > + uint32_t pipe_id, uint32_t queue_id, > + uint32_t inst, unsigned int utimeout) > +{ > + return 0; > +} > + > const struct kfd2kgd_calls gfx_v10_kfd2kgd = { > .program_sh_mem_settings = kgd_program_sh_mem_settings, > .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, > @@ -1097,4 +1111,6 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = { > .get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times, > .build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info, > .program_trap_handler_settings = program_trap_handler_settings, > + .hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr, > + .hqd_reset = kgd_gfx_v10_hqd_reset > }; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h > index 67bcaa3d4226..9efd2dd4fdd7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h > @@ -56,3 +56,12 @@ void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev, > uint32_t grace_period, > uint32_t *reg_offset, > uint32_t *reg_data); > +uint64_t kgd_gfx_v10_hqd_get_pq_addr(struct amdgpu_device *adev, > + uint32_t pipe_id, > + uint32_t queue_id, > + uint32_t inst); > +uint64_t kgd_gfx_v10_hqd_reset(struct amdgpu_device *adev, > + uint32_t pipe_id, > + uint32_t queue_id, > + uint32_t inst, > + unsigned int utimeout); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c > index 8c8437a4383f..c718bedda0ca 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c > @@ -680,5 +680,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = { > .set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override, > .set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode, > .set_address_watch = kgd_gfx_v10_set_address_watch, > - .clear_address_watch = kgd_gfx_v10_clear_address_watch > + .clear_address_watch = kgd_gfx_v10_clear_address_watch, > + .hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr, > + .hqd_reset = kgd_gfx_v10_hqd_reset > }; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c > index b61a32d6af4b..a4ba49cb22db 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c > @@ -786,6 +786,20 @@ static uint32_t kgd_gfx_v11_clear_address_watch(struct amdgpu_device *adev, > return 0; > } > > +static uint64_t kgd_gfx_v11_hqd_get_pq_addr(struct amdgpu_device *adev, > + uint32_t pipe_id, uint32_t queue_id, > + uint32_t inst) > +{ > + return 0; > +} > + > +static uint64_t kgd_gfx_v11_hqd_reset(struct amdgpu_device *adev, > + uint32_t pipe_id, uint32_t queue_id, > + uint32_t inst, unsigned int utimeout) > +{ > + return 0; > +} > + > const struct kfd2kgd_calls gfx_v11_kfd2kgd = { > .program_sh_mem_settings = program_sh_mem_settings_v11, > .set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11, > @@ -808,5 +822,7 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = { > .set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override, > .set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode, > .set_address_watch = kgd_gfx_v11_set_address_watch, > - .clear_address_watch = kgd_gfx_v11_clear_address_watch > + .clear_address_watch = kgd_gfx_v11_clear_address_watch, > + .hqd_get_pq_addr = kgd_gfx_v11_hqd_get_pq_addr, > + .hqd_reset = kgd_gfx_v11_hqd_reset > }; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > index 5a35a8ca8922..32f28c12077b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > @@ -1144,6 +1144,89 @@ void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev, > kgd_gfx_v9_unlock_srbm(adev, inst); > } > > +uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev, > + uint32_t pipe_id, uint32_t queue_id, > + uint32_t inst) > +{ > + uint32_t low, high; > + uint64_t queue_addr = 0; > + > + kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); > + amdgpu_gfx_rlc_enter_safe_mode(adev, inst); > + > + if (!RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE)) > + goto unlock_out; > + > + low = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE); > + high = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI); > + > + /* only concerned with user queues. */ > + if (!high) > + goto unlock_out; > + > + queue_addr = (((queue_addr | high) << 32) | low) << 8; > + > +unlock_out: > + amdgpu_gfx_rlc_exit_safe_mode(adev, inst); > + kgd_gfx_v9_release_queue(adev, inst); > + > + return queue_addr; > +} > + > +uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev, > + uint32_t pipe_id, uint32_t queue_id, > + uint32_t inst, unsigned int utimeout) > +{ > + uint32_t low, high, temp; > + unsigned long end_jiffies; > + uint64_t queue_addr = 0; > + > + kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); > + amdgpu_gfx_rlc_enter_safe_mode(adev, inst); > + > + if (!RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE)) > + goto unlock_out; > + > + low = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE); > + high = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI); > + > + /* only concerned with user queues. */ > + if (!high) > + goto unlock_out; > + > + queue_addr = (((queue_addr | high) << 32) | low) << 8; > + > + pr_debug("Attempting queue reset on XCC %i pipe id %i queue id %i\n", > + inst, pipe_id, queue_id); > + > + /* assume previous dequeue request issued will take affect after reset */ > + WREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_COMPUTE_QUEUE_RESET, 0x1); > + > + end_jiffies = (utimeout * HZ / 1000) + jiffies; > + while (true) { > + temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE); > + > + if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) > + break; > + > + if (time_after(jiffies, end_jiffies)) { > + queue_addr = 0; > + break; > + } > + > + usleep_range(500, 1000); > + } > + > + pr_debug("queue reset on XCC %i pipe id %i queue id %i %s\n", > + inst, pipe_id, queue_id, !!queue_addr ? "succeeded!" : "failed!"); > + > +unlock_out: > + amdgpu_gfx_rlc_exit_safe_mode(adev, inst); > + kgd_gfx_v9_release_queue(adev, inst); > + > + return queue_addr; > +} > + > const struct kfd2kgd_calls gfx_v9_kfd2kgd = { > .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings, > .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping, > @@ -1172,4 +1255,6 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = { > .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info, > .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, > .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, > + .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, > + .hqd_reset = kgd_gfx_v9_hqd_reset > }; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > index ce424615f59b..988c50ac3be0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > @@ -101,3 +101,12 @@ void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev, > uint32_t grace_period, > uint32_t *reg_offset, > uint32_t *reg_data); > +uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev, > + uint32_t pipe_id, > + uint32_t queue_id, > + uint32_t inst); > +uint64_t kgd_gfx_v9_hqd_reset(struct amdgpu_device *adev, > + uint32_t pipe_id, > + uint32_t queue_id, > + uint32_t inst, > + unsigned int utimeout); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index fdc76c24b2e7..e335703eff84 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -153,6 +153,20 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, > > static void kfd_hws_hang(struct device_queue_manager *dqm) > { > + struct device_process_node *cur; > + struct qcm_process_device *qpd; > + struct queue *q; > + > + /* Mark all device queues as reset. */ > + list_for_each_entry(cur, &dqm->queues, list) { > + qpd = cur->qpd; > + list_for_each_entry(q, &qpd->queues_list, list) { > + struct kfd_process_device *pdd = qpd_to_pdd(qpd); > + > + pdd->has_reset_queue = true; > + } > + } > + > /* > * Issue a GPU reset if HWS is unresponsive > */ > @@ -878,6 +892,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q, > else if (prev_active) > retval = remove_queue_mes(dqm, q, &pdd->qpd); > > + /* queue is reset so inaccessable */ > + if (pdd->has_reset_queue) { > + retval = -EACCES; > + goto out_unlock; > + } > + > if (retval) { > dev_err(dev, "unmap queue failed\n"); > goto out_unlock; > @@ -1627,7 +1647,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm) > static int start_cpsch(struct device_queue_manager *dqm) > { > struct device *dev = dqm->dev->adev->dev; > - int retval; > + int retval, num_hw_queue_slots; > > retval = 0; > > @@ -1680,6 +1700,14 @@ static int start_cpsch(struct device_queue_manager *dqm) > &dqm->wait_times); > } > > + /* setup per-queue reset detection buffer */ > + num_hw_queue_slots = dqm->dev->kfd->shared_resources.num_queue_per_pipe * > + dqm->dev->kfd->shared_resources.num_pipe_per_mec * > + NUM_XCC(dqm->dev->xcc_mask); > + > + dqm->detect_hang_info_size = num_hw_queue_slots * sizeof(struct dqm_detect_hang_info); > + dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, GFP_KERNEL); > + > dqm_unlock(dqm); > > return 0; > @@ -1713,6 +1741,7 @@ static int stop_cpsch(struct device_queue_manager *dqm) > kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); > if (!dqm->dev->kfd->shared_resources.enable_mes) > pm_uninit(&dqm->packet_mgr); > + kfree(dqm->detect_hang_info); > dqm_unlock(dqm); > > return 0; > @@ -1929,6 +1958,131 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) > return retval; > } > > +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q, > + struct qcm_process_device *qpd) > +{ > + struct kfd_process_device *pdd = qpd_to_pdd(qpd); > + > + pr_err("queue id 0x%0x at pasid 0x%0x is reset\n", > + q->properties.queue_id, q->process->pasid); > + > + pdd->has_reset_queue = true; > + if (q->properties.is_active) { > + q->properties.is_active = false; > + decrement_queue_count(dqm, qpd, q); > + } > +} > + > +static int detect_queue_hang(struct device_queue_manager *dqm) > +{ > + int i; > + > + memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size); > + > + for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) { > + uint32_t mec, pipe, queue; > + int xcc_id; > + > + mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe) > + / dqm->dev->kfd->shared_resources.num_pipe_per_mec; > + > + if (mec || !test_bit(i, dqm->dev->kfd->shared_resources.cp_queue_bitmap)) > + continue; > + > + amdgpu_queue_mask_bit_to_mec_queue(dqm->dev->adev, i, &mec, &pipe, &queue); > + > + for_each_inst(xcc_id, dqm->dev->xcc_mask) { > + uint64_t queue_addr = dqm->dev->kfd2kgd->hqd_get_pq_addr( > + dqm->dev->adev, pipe, queue, xcc_id); > + struct dqm_detect_hang_info hang_info; > + > + if (!queue_addr) > + continue; > + > + hang_info.pipe_id = pipe; > + hang_info.queue_id = queue; > + hang_info.xcc_id = xcc_id; > + hang_info.queue_address = queue_addr; > + > + dqm->detect_hang_info[dqm->detect_hang_count] = hang_info; > + dqm->detect_hang_count++; > + } > + } > + > + return dqm->detect_hang_count; > +} > + > +static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uint64_t queue_address) > +{ > + struct device_process_node *cur; > + struct qcm_process_device *qpd; > + struct queue *q; > + > + list_for_each_entry(cur, &dqm->queues, list) { > + qpd = cur->qpd; > + list_for_each_entry(q, &qpd->queues_list, list) { > + if (queue_address == q->properties.queue_address) > + return q; > + } > + } > + > + return NULL; > +} > + > +/* only for compute queue */ > +static int reset_queues_on_hws_hang(struct device_queue_manager *dqm) > +{ > + int r = 0, reset_count = 0, i; > + > + if (!dqm->detect_hang_info || dqm->is_hws_hang) > + return -EIO; > + > + /* assume dqm locked. */ > + if (!detect_queue_hang(dqm)) > + return -ENOTRECOVERABLE; > + > + for (i = 0; i < dqm->detect_hang_count; i++) { > + struct dqm_detect_hang_info hang_info = dqm->detect_hang_info[i]; > + struct queue *q = find_queue_by_address(dqm, hang_info.queue_address); > + struct kfd_process_device *pdd; > + uint64_t queue_addr = 0; > + > + if (!q) { > + r = -ENOTRECOVERABLE; > + goto reset_fail; > + } > + > + pdd = kfd_get_process_device_data(dqm->dev, q->process); > + if (!pdd) { > + r = -ENOTRECOVERABLE; > + goto reset_fail; > + } > + > + queue_addr = dqm->dev->kfd2kgd->hqd_reset(dqm->dev->adev, > + hang_info.pipe_id, hang_info.queue_id, hang_info.xcc_id, > + KFD_UNMAP_LATENCY_MS); > + > + /* either reset failed or we reset an unexpected queue. */ > + if (queue_addr != q->properties.queue_address) { > + r = -ENOTRECOVERABLE; > + goto reset_fail; > + } > + > + set_queue_as_reset(dqm, q, &pdd->qpd); > + reset_count++; > + } > + > + if (reset_count == dqm->detect_hang_count) > + kfd_signal_reset_event(dqm->dev); > + else > + r = -ENOTRECOVERABLE; > + > +reset_fail: > + dqm->detect_hang_count = 0; > + > + return r; > +} > + > /* dqm->lock mutex has to be locked before calling this function */ > static int unmap_queues_cpsch(struct device_queue_manager *dqm, > enum kfd_unmap_queues_filter filter, > @@ -1979,11 +2133,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, > */ > mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; > if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) { > - while (halt_if_hws_hang) > - schedule(); > - kfd_hws_hang(dqm); > - retval = -ETIME; > - goto out; > + if (reset_queues_on_hws_hang(dqm)) { > + while (halt_if_hws_hang) > + schedule(); > + dqm->is_hws_hang = true; > + kfd_hws_hang(dqm); > + retval = -ETIME; > + goto out; > + } > } > > /* We need to reset the grace period value for this device */ > @@ -2002,8 +2159,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, > } > > /* only for compute queue */ > -static int reset_queues_cpsch(struct device_queue_manager *dqm, > - uint16_t pasid) > +static int reset_queues_cpsch(struct device_queue_manager *dqm, uint16_t pasid) > { > int retval; > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > index 3b9b8eabaacc..dfb36a246637 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > @@ -210,6 +210,13 @@ struct device_queue_manager_asic_ops { > struct kfd_node *dev); > }; > > +struct dqm_detect_hang_info { > + int pipe_id; > + int queue_id; > + int xcc_id; > + uint64_t queue_address; > +}; > + > /** > * struct device_queue_manager > * > @@ -264,6 +271,11 @@ struct device_queue_manager { > uint32_t wait_times; > > wait_queue_head_t destroy_wait; > + > + /* for per-queue reset support */ > + struct dqm_detect_hang_info *detect_hang_info; > + size_t detect_hang_info_size; > + int detect_hang_count; > }; > > void device_queue_manager_init_cik( > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > index 9b33d9d2c9ad..3a6e7a4c8895 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > @@ -31,6 +31,7 @@ > #include <linux/memory.h> > #include "kfd_priv.h" > #include "kfd_events.h" > +#include "kfd_device_queue_manager.h" > #include <linux/device.h> > > /* > @@ -1244,12 +1245,32 @@ void kfd_signal_reset_event(struct kfd_node *dev) > idx = srcu_read_lock(&kfd_processes_srcu); > hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { > int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); > + struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p); > > if (unlikely(user_gpu_id == -EINVAL)) { > WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id); > continue; > } > > + if (unlikely(!pdd)) { > + WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid); > + continue; > + } > + > + if (dev->dqm->detect_hang_count && !pdd->has_reset_queue) > + continue; > + > + if (dev->dqm->detect_hang_count) { > + struct amdgpu_task_info *ti; > + > + ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid); > + if (ti) { > + DRM_ERROR("Process info: process %s tid %d thread %s pid %d\n", > + ti->process_name, ti->tgid, ti->task_name, ti->pid); > + amdgpu_vm_put_task_info(ti); Use dev_err() here so we know which device issued the error on multi-GPU systems. With that fixed, the patch looks good to me. I'm not a KFD expert, so: Acked-by: Alex Deucher <alexander.deucher@xxxxxxx> Alex > + } > + } > + > rcu_read_lock(); > > id = KFD_FIRST_NONSIGNAL_EVENT_ID; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > index 66c73825c0a0..84e8ea3a8a0c 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > @@ -321,8 +321,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, > static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) > { > struct v9_mqd *m = (struct v9_mqd *)mqd; > + uint32_t doorbell_id = m->queue_doorbell_id0; > > - return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); > + m->queue_doorbell_id0 = 0; > + > + return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0); > } > > static int get_wave_state(struct mqd_manager *mm, void *mqd, > @@ -624,6 +627,7 @@ static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd) > m = get_mqd(mqd + hiq_mqd_size * inst); > ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev, > m->queue_doorbell_id0, inst); > + m->queue_doorbell_id0 = 0; > ++inst; > } > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index b5cae48dff66..892a85408c09 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -843,6 +843,9 @@ struct kfd_process_device { > void *proc_ctx_bo; > uint64_t proc_ctx_gpu_addr; > void *proc_ctx_cpu_ptr; > + > + /* Tracks queue reset status */ > + bool has_reset_queue; > }; > > #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index 9e29b92eb523..a902950cc060 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -1851,6 +1851,8 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger) > goto fail; > } > n_evicted++; > + > + pdd->dev->dqm->is_hws_hang = false; > } > > return r; > diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > index 6d094cf3587d..7744ca3ef4b1 100644 > --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > @@ -318,6 +318,12 @@ struct kfd2kgd_calls { > void (*program_trap_handler_settings)(struct amdgpu_device *adev, > uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, > uint32_t inst); > + uint64_t (*hqd_get_pq_addr)(struct amdgpu_device *adev, > + uint32_t pipe_id, uint32_t queue_id, > + uint32_t inst); > + uint64_t (*hqd_reset)(struct amdgpu_device *adev, > + uint32_t pipe_id, uint32_t queue_id, > + uint32_t inst, unsigned int utimeout); > }; > > #endif /* KGD_KFD_INTERFACE_H_INCLUDED */ > -- > 2.34.1 >