Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Can we use the  dqm_lock when we try to get the dqm->is_hw_hang and  dqm->is_resetting inside function kq_uninitialize ?

I think more closer we check the status  to hqd_destroy it will be  more accurate . It does look better with this logic if the status are changed after dqm unmap_queue call and  before we call hqd_destroy .

Another comment in line.

Regards

shaoyun.liu




On 2019-12-20 11:33 a.m., Felix Kuehling wrote:
dqm->is_hws_hang is protected by the DQM lock. kq_uninitialize runs outside that lock protection. Therefore I opted to pass in the hanging flag as a parameter. It also keeps the logic that decides all of that inside the device queue manager, which I think is cleaner.

I was trying to clean this up further by moving the pm_init/pm_uninit out of the start_cpsch/stop_cpsch sequence, but gave up on that idea when I found out that I can't create the kernel queue in the DQM initialize function because dev->dqm isn't initialized at that time yet.

Regards,
  Felix

On 2019-12-20 10:56, shaoyunl wrote:
Looks like patch 2 is not related to this serial , but anyway .

Patch 1,2,3 are reviewed by shaoyunl <shaoyun.liu@xxxxxxx>

For patch 4 ,  is it possible we directly check dqm->is_hws_hang || dqm->is_resetting  inside function kq_uninitialize.  so we don't need other interface change .

I think even Inside that kq_uninitialize function , we still can get dqm as  kq->dev->dqm .


shaoyun.liu


On 2019-12-20 3:30 a.m., Felix Kuehling wrote:
Don't use the HWS if it's known to be hanging. In a reset also
don't try to destroy the HIQ because that may hang on SRIOV if the
KIQ is unresponsive.

Signed-off-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
---
  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
  .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
  5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a7e9ec1b3ce3..d7eb6ac37f62 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager *dqm)
  static int stop_nocpsch(struct device_queue_manager *dqm)
  {
      if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
-        pm_uninit(&dqm->packets);
+        pm_uninit(&dqm->packets, false);
      dqm->sched_running = false;
        return 0;
@@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager *dqm)
      return 0;
  fail_allocate_vidmem:
  fail_set_sched_resources:
-    pm_uninit(&dqm->packets);
+    pm_uninit(&dqm->packets, false);
  fail_packet_manager_init:
      return retval;
  }
    static int stop_cpsch(struct device_queue_manager *dqm)
  {
+    bool hanging;
+kq_uninitialize(

      dqm_lock(dqm);
-    unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+    if (!dqm->is_hws_hang)
[shaoyunl]  should we check is_resetting here as well . so we ignore the  unmap call even HWS still not  detect the hang but we know we currently in resetting  precedure
+        unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+    hanging = dqm->is_hws_hang || dqm->is_resetting;
      dqm->sched_running = false;
      dqm_unlock(dqm);
        kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
-    pm_uninit(&dqm->packets);
+    pm_uninit(&dqm->packets, hanging);
        return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 2d56dc534459..bae706462f96 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_dev *dev,
  }
    /* Uninitialize a kernel queue and free all its memory usages. */
-static void kq_uninitialize(struct kernel_queue *kq)
+static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
  {
-    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
+    if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
          kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
                      kq->queue->mqd,
                      KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
@@ -337,9 +337,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
      return NULL;
  }
  -void kernel_queue_uninit(struct kernel_queue *kq)
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
  {
-    kq_uninitialize(kq);
+    kq_uninitialize(kq, hanging);
      kfree(kq);
  }
  diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 6cabed06ef5d..dc406e6dee23 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
      return 0;
  }
  -void pm_uninit(struct packet_manager *pm)
+void pm_uninit(struct packet_manager *pm, bool hanging)
  {
      mutex_destroy(&pm->lock);
-    kernel_queue_uninit(pm->priv_queue);
+    kernel_queue_uninit(pm->priv_queue, hanging);
  }
    int pm_send_set_resources(struct packet_manager *pm,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h I
index 087e96838997..8ac680dc90f1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -883,7 +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
  void device_queue_manager_uninit(struct device_queue_manager *dqm);
  struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
                      enum kfd_queue_type type);
-void kernel_queue_uninit(struct kernel_queue *kq);
+void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
  int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
    /* Process Queue Manager */
@@ -974,7 +974,7 @@ extern const struct packet_manager_funcs kfd_vi_pm_funcs;
  extern const struct packet_manager_funcs kfd_v9_pm_funcs;
    int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
-void pm_uninit(struct packet_manager *pm);
+void pm_uninit(struct packet_manager *pm, bool hanging);
  int pm_send_set_resources(struct packet_manager *pm,
                  struct scheduling_resources *res);
  int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index d3eacf72e8db..8fa856e6a03f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
          /* destroy kernel queue (DIQ) */
          dqm = pqn->kq->dev->dqm;
          dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
-        kernel_queue_uninit(pqn->kq);
+        kernel_queue_uninit(pqn->kq, false);
      }
        if (pqn->q) {
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cfelix.kuehling%40amd.com%7C6df6fef2bf6a4208704e08d785652f27%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637124542109969638&amp;sdata=V0mHmgjUSP%2BauYL3r6PGU7aqFTQz8NkKMcuA5vXSkUQ%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux