On Wed, Jul 24, 2024 at 4:34 PM Felix Kuehling <felix.kuehling@xxxxxxx> wrote: > > > On 2024-07-18 13:56, Jonathan Kim wrote: > > Support per-queue reset for GFX9. The recommendation is for the driver > > to target reset the HW queue via a SPI MMIO register write. > > > > Since this requires pipe and HW queue info and MEC FW is limited to > > doorbell reports of hung queues after an unmap failure, scan the HW > > queue slots defined by SET_RESOURCES first to identify the user queue > > candidates to reset. > > > > Only signal reset events to processes that have had a queue reset. > > > > If queue reset fails, fall back to GPU reset. > > > > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> > > --- > > .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 1 + > > .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 3 +- > > .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c | 3 +- > > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 9 + > > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h | 6 + > > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c | 3 +- > > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c | 11 +- > > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 56 +++++ > > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 6 + > > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 195 ++++++++++++++++-- > > .../drm/amd/amdkfd/kfd_device_queue_manager.h | 12 ++ > > drivers/gpu/drm/amd/amdkfd/kfd_events.c | 15 ++ > > .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 6 +- > > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 4 + > > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 + > > .../gpu/drm/amd/include/kgd_kfd_interface.h | 4 + > > 16 files changed, 310 insertions(+), 26 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c > > index aff08321e976..1dc601e4518a 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c > > @@ -191,4 +191,5 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = { > > .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times, > > .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info, > > .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, > > + .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset > > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > index 3a3f3ce09f00..534975c722df 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > @@ -418,5 +418,6 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = { > > .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times, > > .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info, > > .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, > > - .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings > > + .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, > > + .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset > > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > > index a5c7259cf2a3..b53c1cfa34de 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > > @@ -541,5 +541,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = { > > kgd_gfx_v9_4_3_set_wave_launch_trap_override, > > .set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode, > > .set_address_watch = kgd_gfx_v9_4_3_set_address_watch, > > - .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch > > + .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch, > > + .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset > > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c > > index 3ab6c3aa0ad1..dd449a0caba8 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c > > @@ -1070,6 +1070,14 @@ static void program_trap_handler_settings(struct amdgpu_device *adev, > > unlock_srbm(adev); > > } > > > > +uint64_t kgd_gfx_v10_hqd_detect_and_reset(struct amdgpu_device *adev, > > + uint32_t pipe_id, uint32_t queue_id, > > + uint32_t inst, unsigned int utimeout, > > + bool detect_only) > > +{ > > + return 0; > > +} > > + > > const struct kfd2kgd_calls gfx_v10_kfd2kgd = { > > .program_sh_mem_settings = kgd_program_sh_mem_settings, > > .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, > > @@ -1097,4 +1105,5 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = { > > .get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times, > > .build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info, > > .program_trap_handler_settings = program_trap_handler_settings, > > + .hqd_detect_and_reset = kgd_gfx_v10_hqd_detect_and_reset > > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h > > index 67bcaa3d4226..9eeda8808a3c 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h > > @@ -56,3 +56,9 @@ void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev, > > uint32_t grace_period, > > uint32_t *reg_offset, > > uint32_t *reg_data); > > +uint64_t kgd_gfx_v10_hqd_detect_and_reset(struct amdgpu_device *adev, > > + uint32_t pipe_id, > > + uint32_t queue_id, > > + uint32_t inst, > > + unsigned int utimeout, > > + bool detect_only); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c > > index 8c8437a4383f..9872bca4156b 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c > > @@ -680,5 +680,6 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = { > > .set_wave_launch_trap_override = kgd_gfx_v10_set_wave_launch_trap_override, > > .set_wave_launch_mode = kgd_gfx_v10_set_wave_launch_mode, > > .set_address_watch = kgd_gfx_v10_set_address_watch, > > - .clear_address_watch = kgd_gfx_v10_clear_address_watch > > + .clear_address_watch = kgd_gfx_v10_clear_address_watch, > > + .hqd_detect_and_reset = kgd_gfx_v10_hqd_detect_and_reset > > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c > > index b61a32d6af4b..483937c571ab 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c > > @@ -786,6 +786,14 @@ static uint32_t kgd_gfx_v11_clear_address_watch(struct amdgpu_device *adev, > > return 0; > > } > > > > +static uint64_t kgd_gfx_v11_hqd_detect_and_reset(struct amdgpu_device *adev, > > + uint32_t pipe_id, uint32_t queue_id, > > + uint32_t inst, unsigned int utimeout, > > + bool detect_only) > > +{ > > + return 0; > > +} > > + > > const struct kfd2kgd_calls gfx_v11_kfd2kgd = { > > .program_sh_mem_settings = program_sh_mem_settings_v11, > > .set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11, > > @@ -808,5 +816,6 @@ const struct kfd2kgd_calls gfx_v11_kfd2kgd = { > > .set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override, > > .set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode, > > .set_address_watch = kgd_gfx_v11_set_address_watch, > > - .clear_address_watch = kgd_gfx_v11_clear_address_watch > > + .clear_address_watch = kgd_gfx_v11_clear_address_watch, > > + .hqd_detect_and_reset = kgd_gfx_v11_hqd_detect_and_reset > > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > > index 5a35a8ca8922..9428e98ac12b 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > > @@ -1144,6 +1144,61 @@ void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev, > > kgd_gfx_v9_unlock_srbm(adev, inst); > > } > > > > +uint64_t kgd_gfx_v9_hqd_detect_and_reset(struct amdgpu_device *adev, > > + uint32_t pipe_id, uint32_t queue_id, > > + uint32_t inst, unsigned int utimeout, > > + bool detect_only) > > This is a bit misleading. This function doesn't detect anything. It just > returns a queue address and optionally resets the queue. And the way > it's called is like "detect and reset, but don't reset", except that > last part is not obvious if you don't know what the last parameter > means. It would make the code easier to read if you split this into two > functions: hqd_get_pq_addr and hqd_reset. > > > > +{ > > + uint32_t low, high, temp; > > + unsigned long end_jiffies; > > + uint64_t queue_addr = 0; > > + > > + kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); > > + if (!RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE)) > > + goto unlock_out; > > + > > + low = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE); > > + high = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI); > > You may need to disable GFXOFF before accessing these registers. Maybe > you're assuming that the GPU is not idle when you're getting here, and > maybe that's a good assumption, so I'm not sure. I think you should wrap any reset MMIO in amdgpu_gfx_rlc_enter_safe_mode() and amdgpu_gfx_rlc_exit_safe_mode(). Alex > > > > + > > + /* only concerned with user queues. */ > > + if (!high) > > + goto unlock_out; > > + > > + queue_addr = (((queue_addr | high) << 32) | low) << 8; > > + > > + if (detect_only) > > + goto unlock_out; > > + > > + pr_debug("Attempting queue reset on XCC %i pipe id %i queue id %i\n", > > + inst, pipe_id, queue_id); > > + > > + /* assume previous dequeue request issued will take affect after reset */ > > + WREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_COMPUTE_QUEUE_RESET, 0x1); > > + > > + end_jiffies = (utimeout * HZ / 1000) + jiffies; > > + while (true) { > > + temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE); > > + > > + if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) > > + break; > > + > > + if (time_after(jiffies, end_jiffies)) { > > + queue_addr = 0; > > + break; > > + } > > + > > + usleep_range(500, 1000); > > + } > > + > > +unlock_out: > > + if (!detect_only) > > + pr_debug("queue reset on XCC %i pipe id %i queue id %i %s\n", > > + inst, pipe_id, queue_id, !!queue_addr ? "succeeded!" : "failed!"); > > + kgd_gfx_v9_release_queue(adev, inst); > > + > > + return queue_addr; > > +} > > + > > const struct kfd2kgd_calls gfx_v9_kfd2kgd = { > > .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings, > > .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping, > > @@ -1172,4 +1227,5 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = { > > .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info, > > .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, > > .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, > > + .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset > > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > > index ce424615f59b..cf70d8fc3d10 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > > @@ -101,3 +101,9 @@ void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev, > > uint32_t grace_period, > > uint32_t *reg_offset, > > uint32_t *reg_data); > > +uint64_t kgd_gfx_v9_hqd_detect_and_reset(struct amdgpu_device *adev, > > + uint32_t pipe_id, > > + uint32_t queue_id, > > + uint32_t inst, > > + unsigned int utimeout, > > + bool detect_only); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > index 4f48507418d2..037b75a64e66 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > @@ -153,6 +153,19 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, > > > > static void kfd_hws_hang(struct device_queue_manager *dqm) > > { > > + struct device_process_node *cur; > > + struct qcm_process_device *qpd; > > + struct queue *q; > > + > > + /* Mark all device queues as reset. */ > > + list_for_each_entry(cur, &dqm->queues, list) { > > + qpd = cur->qpd; > > + list_for_each_entry(q, &qpd->queues_list, list) { > > + q->properties.is_reset = true; > > + q->process->has_reset_queue = true; > > Both seem to track the same information, but one does it per queue, the > other does it per process. Why do you need to track this per queue at > all? The only place where I see q->properties->is_reset used is in > update_queue. I think you can use the per-process tracking there. > > That said, I think it would be better to track has_reset_queue in the > pdd or qpd because GPU resets are per GPU, and these fields are accessed > under the per-GPU DQM lock and reset events sent to user mode are per > GPU as well. > > > > + } > > + } > > + > > /* > > * Issue a GPU reset if HWS is unresponsive > > */ > > @@ -880,6 +893,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q, > > else if (prev_active) > > retval = remove_queue_mes(dqm, q, &pdd->qpd); > > > > + /* queue is reset so inaccessable */ > > + if (q->properties.is_reset) { > > You could use pdd->process->has_reset_queue (or pdd->has_reset_queue) > here and get rid of is_reset. > > > > + retval = -EACCES; > > + goto out_unlock; > > + } > > + > > if (retval) { > > dev_err(dev, "unmap queue failed\n"); > > goto out_unlock; > > @@ -1629,7 +1648,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm) > > static int start_cpsch(struct device_queue_manager *dqm) > > { > > struct device *dev = dqm->dev->adev->dev; > > - int retval; > > + int retval, num_hw_queue_slots; > > > > retval = 0; > > > > @@ -1682,6 +1701,14 @@ static int start_cpsch(struct device_queue_manager *dqm) > > &dqm->wait_times); > > } > > > > + /* setup per-queue reset detection buffer */ > > + num_hw_queue_slots = dqm->dev->kfd->shared_resources.num_queue_per_pipe * > > + dqm->dev->kfd->shared_resources.num_pipe_per_mec * > > + NUM_XCC(dqm->dev->xcc_mask); > > + > > + dqm->detect_hang_info_size = num_hw_queue_slots * sizeof(struct dqm_detect_hang_info); > > + dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, GFP_KERNEL); > > + > > dqm_unlock(dqm); > > > > return 0; > > @@ -1715,6 +1742,7 @@ static int stop_cpsch(struct device_queue_manager *dqm) > > kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); > > if (!dqm->dev->kfd->shared_resources.enable_mes) > > pm_uninit(&dqm->packet_mgr); > > + kfree(dqm->detect_hang_info); > > dqm_unlock(dqm); > > > > return 0; > > @@ -1931,6 +1959,143 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) > > return retval; > > } > > > > +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q, > > + struct qcm_process_device *qpd) > > +{ > > + pr_err("queue id 0x%0x at pasid 0x%0x is reset\n", > > + q->properties.queue_id, q->process->pasid); > > + > > + q->properties.is_reset = true; > > + q->process->has_reset_queue = true; > > + if (q->properties.is_active) { > > + q->properties.is_active = false; > > + decrement_queue_count(dqm, qpd, q); > > + } > > +} > > + > > +static int detect_queue_hang(struct device_queue_manager *dqm) > > +{ > > + int i; > > + > > + memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size); > > + > > + for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) { > > + uint32_t mec, pipe, queue; > > + int xcc_id; > > + > > + mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe) > > + / dqm->dev->kfd->shared_resources.num_pipe_per_mec; > > + > > + if (mec || !test_bit(i, dqm->dev->kfd->shared_resources.cp_queue_bitmap)) > > + continue; > > + > > + amdgpu_queue_mask_bit_to_mec_queue(dqm->dev->adev, i, &mec, &pipe, &queue); > > + > > + for_each_inst(xcc_id, dqm->dev->xcc_mask) { > > + uint64_t queue_addr = dqm->dev->kfd2kgd->hqd_detect_and_reset( > > + dqm->dev->adev, pipe, queue, xcc_id, > > + 0, true); > > + struct dqm_detect_hang_info hang_info; > > + > > + if (!queue_addr) > > + continue; > > + > > + hang_info.pipe_id = pipe; > > + hang_info.queue_id = queue; > > + hang_info.xcc_id = xcc_id; > > + hang_info.queue_address = queue_addr; > > + > > + dqm->detect_hang_info[dqm->detect_hang_count] = hang_info; > > + dqm->detect_hang_count++; > > + } > > + } > > + > > + return dqm->detect_hang_count; > > +} > > + > > +static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uint64_t queue_address) > > +{ > > + struct device_process_node *cur; > > + struct qcm_process_device *qpd; > > + struct queue *q; > > + > > + list_for_each_entry(cur, &dqm->queues, list) { > > + qpd = cur->qpd; > > + list_for_each_entry(q, &qpd->queues_list, list) { > > + if (queue_address == q->properties.queue_address) > > + return q; > > + } > > + } > > + > > + return NULL; > > +} > > + > > +/* only for compute queue */ > > +static int reset_queues_cpsch(struct device_queue_manager *dqm, uint16_t pasid) > > +{ > > + int retval = 0, reset_count = 0, i; > > + > > + if (pasid) { > > + dqm_lock(dqm); > > + > > + retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID, > > + pasid, USE_DEFAULT_GRACE_PERIOD, true); > > + > > + dqm_unlock(dqm); > > + return retval; > > + } > > + > > + if (!dqm->detect_hang_info || dqm->is_hws_hang) > > + return -EIO; > > + > > + /* assume dqm locked. */ > > This is weird, the same function does different things and has different > assumptions about its locking context depending where it's called from. > Maybe keep the original function and and make this a new function with a > distinct name. > > > > + if (!detect_queue_hang(dqm)) > > + return -ENOTRECOVERABLE; > > + > > + for (i = 0; i < dqm->detect_hang_count; i++) { > > + struct dqm_detect_hang_info hang_info = dqm->detect_hang_info[i]; > > + struct queue *q = find_queue_by_address(dqm, hang_info.queue_address); > > + struct kfd_process_device *pdd; > > + uint64_t queue_addr = 0; > > + int pipe = hang_info.pipe_id; > > + int queue = hang_info.queue_id; > > + int xcc_id = hang_info.xcc_id; > > Why do you need these local variables. hang_info is already a local > variable. Now you have two copies of this on your stack. > > Regards, > Felix > > > > + > > + if (!q) { > > + retval = -ENOTRECOVERABLE; > > + goto reset_fail; > > + } > > + > > + pdd = kfd_get_process_device_data(dqm->dev, q->process); > > + if (!pdd) { > > + retval = -ENOTRECOVERABLE; > > + goto reset_fail; > > + } > > + > > + queue_addr = dqm->dev->kfd2kgd->hqd_detect_and_reset( > > + dqm->dev->adev, pipe, queue, xcc_id, > > + KFD_UNMAP_LATENCY_MS, false); > > + > > + if (queue_addr != q->properties.queue_address) { > > + retval = -ENOTRECOVERABLE; > > + goto reset_fail; > > + } > > + > > + set_queue_as_reset(dqm, q, &pdd->qpd); > > + reset_count++; > > + } > > + > > + if (reset_count == dqm->detect_hang_count) > > + kfd_signal_reset_event(dqm->dev); > > + else > > + retval = -ENOTRECOVERABLE; > > + > > +reset_fail: > > + dqm->detect_hang_count = 0; > > + > > + return retval; > > +} > > + > > /* dqm->lock mutex has to be locked before calling this function */ > > static int unmap_queues_cpsch(struct device_queue_manager *dqm, > > enum kfd_unmap_queues_filter filter, > > @@ -1981,11 +2146,14 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, > > */ > > mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; > > if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) { > > - while (halt_if_hws_hang) > > - schedule(); > > - kfd_hws_hang(dqm); > > - retval = -ETIME; > > - goto out; > > + if (reset_queues_cpsch(dqm, 0)) { > > + while (halt_if_hws_hang) > > + schedule(); > > + dqm->is_hws_hang = true; > > + kfd_hws_hang(dqm); > > + retval = -ETIME; > > + goto out; > > + } > > } > > > > /* We need to reset the grace period value for this device */ > > @@ -2003,21 +2171,6 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, > > return retval; > > } > > > > -/* only for compute queue */ > > -static int reset_queues_cpsch(struct device_queue_manager *dqm, > > - uint16_t pasid) > > -{ > > - int retval; > > - > > - dqm_lock(dqm); > > - > > - retval = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_BY_PASID, > > - pasid, USE_DEFAULT_GRACE_PERIOD, true); > > - > > - dqm_unlock(dqm); > > - return retval; > > -} > > - > > /* dqm->lock mutex has to be locked before calling this function */ > > static int execute_queues_cpsch(struct device_queue_manager *dqm, > > enum kfd_unmap_queues_filter filter, > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > > index 3b9b8eabaacc..dfb36a246637 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > > @@ -210,6 +210,13 @@ struct device_queue_manager_asic_ops { > > struct kfd_node *dev); > > }; > > > > +struct dqm_detect_hang_info { > > + int pipe_id; > > + int queue_id; > > + int xcc_id; > > + uint64_t queue_address; > > +}; > > + > > /** > > * struct device_queue_manager > > * > > @@ -264,6 +271,11 @@ struct device_queue_manager { > > uint32_t wait_times; > > > > wait_queue_head_t destroy_wait; > > + > > + /* for per-queue reset support */ > > + struct dqm_detect_hang_info *detect_hang_info; > > + size_t detect_hang_info_size; > > + int detect_hang_count; > > }; > > > > void device_queue_manager_init_cik( > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > index 9b33d9d2c9ad..30a55d3733e8 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > > @@ -31,6 +31,7 @@ > > #include <linux/memory.h> > > #include "kfd_priv.h" > > #include "kfd_events.h" > > +#include "kfd_device_queue_manager.h" > > #include <linux/device.h> > > > > /* > > @@ -1250,6 +1251,20 @@ void kfd_signal_reset_event(struct kfd_node *dev) > > continue; > > } > > > > + if (dev->dqm->detect_hang_count && !p->has_reset_queue) > > + continue; > > + > > + if (dev->dqm->detect_hang_count) { > > + struct amdgpu_task_info *ti; > > + > > + ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid); > > + if (ti) { > > + DRM_ERROR("Process info: process %s tid %d thread %s pid %d\n", > > + ti->process_name, ti->tgid, ti->task_name, ti->pid); > > + amdgpu_vm_put_task_info(ti); > > + } > > + } > > + > > rcu_read_lock(); > > > > id = KFD_FIRST_NONSIGNAL_EVENT_ID; > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > index 66c73825c0a0..84e8ea3a8a0c 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > @@ -321,8 +321,11 @@ static void update_mqd(struct mqd_manager *mm, void *mqd, > > static bool check_preemption_failed(struct mqd_manager *mm, void *mqd) > > { > > struct v9_mqd *m = (struct v9_mqd *)mqd; > > + uint32_t doorbell_id = m->queue_doorbell_id0; > > > > - return kfd_check_hiq_mqd_doorbell_id(mm->dev, m->queue_doorbell_id0, 0); > > + m->queue_doorbell_id0 = 0; > > + > > + return kfd_check_hiq_mqd_doorbell_id(mm->dev, doorbell_id, 0); > > } > > > > static int get_wave_state(struct mqd_manager *mm, void *mqd, > > @@ -624,6 +627,7 @@ static bool check_preemption_failed_v9_4_3(struct mqd_manager *mm, void *mqd) > > m = get_mqd(mqd + hiq_mqd_size * inst); > > ret |= kfd_check_hiq_mqd_doorbell_id(mm->dev, > > m->queue_doorbell_id0, inst); > > + m->queue_doorbell_id0 = 0; > > ++inst; > > } > > > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > index 2b3ec92981e8..e244242fd1b4 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > @@ -504,6 +504,7 @@ struct queue_properties { > > bool is_being_destroyed; > > bool is_active; > > bool is_gws; > > + bool is_reset; > > uint32_t pm4_target_xcc; > > bool is_dbg_wa; > > bool is_user_cu_masked; > > @@ -982,6 +983,9 @@ struct kfd_process { > > struct semaphore runtime_enable_sema; > > bool is_runtime_retry; > > struct kfd_runtime_info runtime_info; > > + > > + /* Tracks queue reset status */ > > + bool has_reset_queue; > > }; > > > > #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > > index 17e42161b015..9a1d69f83a08 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > > @@ -1851,6 +1851,8 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger) > > goto fail; > > } > > n_evicted++; > > + > > + pdd->dev->dqm->is_hws_hang = false; > > } > > > > return r; > > diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > > index 6d094cf3587d..20bb4a0c893e 100644 > > --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > > +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > > @@ -318,6 +318,10 @@ struct kfd2kgd_calls { > > void (*program_trap_handler_settings)(struct amdgpu_device *adev, > > uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, > > uint32_t inst); > > + uint64_t (*hqd_detect_and_reset)(struct amdgpu_device *adev, > > + uint32_t pipe_id, uint32_t queue_id, > > + uint32_t inst, unsigned int utimeout, > > + bool detect_only); > > }; > > > > #endif /* KGD_KFD_INTERFACE_H_INCLUDED */