[+Andrey]
Hi Shaoyun, Monk, and Andrey,
I tested this on my bare-metal Vega10 system. GPU reset
(using BACO) is
flaky on this system with and without this patch. The
first reset seems
to work OK, the second one fails in different ways. In
theory this
change should be an improvement as it eliminated at least
one failure
mode (hanging before the actual reset). A least I don't
think I'm making
anything worse.
My test is to run KFDTest and trigger a GPU reset while
it's busy with
the eviction test. This test runs multiple KFD processes
and also
performs graphics memory allocations and command
submissions to force
TTM to evict KFD processes. I trigger the reset by hanging
the HWS
through a debugfs hook:
# cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id >
/sys/kernel/debug/kfd/hang_hws
Please give this patch a try and reinstate the pre_reset
call before the
FLR for SRIOV.
Regards,
Felix
On 2019-12-19 9:09 p.m., Felix Kuehling wrote:
> The intention of the pre_reset callback is to update
the driver
> state to reflect that all user mode queues are
preempted and the
> HIQ is destroyed. However we should not actually
preempt any queues
> or otherwise touch the hardware because it is
presumably hanging.
> The impending reset will take care of actually
stopping all queues.
>
> This should prevent KIQ register access hanging on
SRIOV function
> level reset (FLR). It should also speed up the reset
by avoiding
> unnecessary timeouts from a potentially hanging GPU
scheduler.
>
> CC: shaoyunl
<shaoyun.liu@xxxxxxx>
> CC: Liu Monk
<Monk.Liu@xxxxxxx>
> Signed-off-by: Felix Kuehling
<Felix.Kuehling@xxxxxxx>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 24
++++++++++-------
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 27
++++++++++++-------
> .../drm/amd/amdkfd/kfd_device_queue_manager.h | 5
++--
> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8
+++---
> .../gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4
+--
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 8
+++---
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 11
++++----
> .../amd/amdkfd/kfd_process_queue_manager.c | 2
+-
> 8 files changed, 53 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index c6b6901bbda3..796996a0d926 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -486,6 +486,7 @@ static int kfd_gtt_sa_init(struct
kfd_dev *kfd, unsigned int buf_size,
> unsigned int
chunk_size);
> static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
>
> +static void kfd_suspend(struct kfd_dev *kfd, bool
pre_reset);
> static int kfd_resume(struct kfd_dev *kfd);
>
> struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
> @@ -728,7 +729,7 @@ int kgd2kfd_pre_reset(struct
kfd_dev *kfd)
> {
> if (!kfd->init_complete)
> return 0;
> - kgd2kfd_suspend(kfd);
> + kfd_suspend(kfd, true);
>
> kfd_signal_reset_event(kfd);
> return 0;
> @@ -767,13 +768,7 @@ void kgd2kfd_suspend(struct
kfd_dev *kfd)
> if (!kfd->init_complete)
> return;
>
> - /* For first KFD device suspend all the KFD
processes */
> - if (atomic_inc_return(&kfd_locked) == 1)
> - kfd_suspend_all_processes();
> -
> - kfd->dqm->ops.stop(kfd->dqm);
> -
> - kfd_iommu_suspend(kfd);
> + kfd_suspend(kfd, false);
> }
>
> int kgd2kfd_resume(struct kfd_dev *kfd)
> @@ -795,6 +790,17 @@ int kgd2kfd_resume(struct
kfd_dev *kfd)
> return ret;
> }
>
> +static void kfd_suspend(struct kfd_dev *kfd, bool
pre_reset)
> +{
> + /* For first KFD device suspend all the KFD
processes */
> + if (atomic_inc_return(&kfd_locked) == 1)
> + kfd_suspend_all_processes(pre_reset);
> +
> + kfd->dqm->ops.stop(kfd->dqm,
pre_reset);
> +
> + kfd_iommu_suspend(kfd);
> +}
> +
> static int kfd_resume(struct kfd_dev *kfd)
> {
> int err = 0;
> @@ -877,7 +883,7 @@ int kgd2kfd_quiesce_mm(struct
mm_struct *mm)
> if (!p)
> return -ESRCH;
>
> - r = kfd_process_evict_queues(p);
> + r = kfd_process_evict_queues(p, false);
>
> kfd_unref_process(p);
> return r;
> diff --git
a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index f7f6df40875e..3a49944164da 100644
> ---
a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -592,7 +592,8 @@ static int update_queue(struct
device_queue_manager *dqm, struct queue *q)
> }
>
> static int evict_process_queues_nocpsch(struct
device_queue_manager *dqm,
> - struct
qcm_process_device *qpd)
> + struct
qcm_process_device *qpd,
> + bool pre_reset)
> {
> struct queue *q;
> struct mqd_manager *mqd_mgr;
> @@ -622,6 +623,8 @@ static int
evict_process_queues_nocpsch(struct device_queue_manager
*dqm,
>
> if (WARN_ONCE(!dqm->sched_running,
"Evict when stopped\n"))
> continue;
> + if (pre_reset)
> + continue;
>
> retval =
mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
> @@ -639,7 +642,8 @@ static int
evict_process_queues_nocpsch(struct device_queue_manager
*dqm,
> }
>
> static int evict_process_queues_cpsch(struct
device_queue_manager *dqm,
> - struct
qcm_process_device *qpd)
> + struct
qcm_process_device *qpd,
> + bool pre_reset)
> {
> struct queue *q;
> struct kfd_process_device *pdd;
> @@ -664,6 +668,10 @@ static int
evict_process_queues_cpsch(struct device_queue_manager
*dqm,
> q->properties.is_active = false;
> dqm->queue_count--;
> }
> +
> + if (pre_reset)
> + goto out;
> +
> retval = execute_queues_cpsch(dqm,
> qpd->is_debug ?
>
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
> @@ -944,10 +952,10 @@ static int start_nocpsch(struct
device_queue_manager *dqm)
> return 0;
> }
>
> -static int stop_nocpsch(struct device_queue_manager
*dqm)
> +static int stop_nocpsch(struct device_queue_manager
*dqm, bool pre_reset)
> {
> if
(dqm->dev->device_info->asic_family ==
CHIP_HAWAII)
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets,
pre_reset);
> dqm->sched_running = false;
>
> return 0;
> @@ -1107,20 +1115,21 @@ static int start_cpsch(struct
device_queue_manager *dqm)
> return 0;
> fail_allocate_vidmem:
> fail_set_sched_resources:
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, false);
> fail_packet_manager_init:
> return retval;
> }
>
> -static int stop_cpsch(struct device_queue_manager
*dqm)
> +static int stop_cpsch(struct device_queue_manager
*dqm, bool pre_reset)
> {
> dqm_lock(dqm);
> - unmap_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> + if (!pre_reset)
> + unmap_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> dqm->sched_running = false;
> dqm_unlock(dqm);
>
> kfd_gtt_sa_free(dqm->dev,
dqm->fence_mem);
> - pm_uninit(&dqm->packets);
> + pm_uninit(&dqm->packets, pre_reset);
>
> return 0;
> }
> @@ -1891,7 +1900,7 @@ int kfd_process_vm_fault(struct
device_queue_manager *dqm,
> return -EINVAL;
> pdd = kfd_get_process_device_data(dqm->dev,
p);
> if (pdd)
> - ret =
dqm->ops.evict_process_queues(dqm, &pdd->qpd);
> + ret =
dqm->ops.evict_process_queues(dqm, &pdd->qpd,
false);
> kfd_unref_process(p);
>
> return ret;
> diff --git
a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index a8c37e6da027..9f82f95f6a92 100644
> ---
a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -103,7 +103,7 @@ struct device_queue_manager_ops {
>
> int (*initialize)(struct
device_queue_manager *dqm);
> int (*start)(struct device_queue_manager
*dqm);
> - int (*stop)(struct device_queue_manager
*dqm);
> + int (*stop)(struct device_queue_manager
*dqm, bool pre_reset);
> void (*uninitialize)(struct
device_queue_manager *dqm);
> int (*create_kernel_queue)(struct
device_queue_manager *dqm,
> struct
kernel_queue *kq,
> @@ -129,7 +129,8 @@ struct device_queue_manager_ops {
> struct qcm_process_device
*qpd);
>
> int (*evict_process_queues)(struct
device_queue_manager *dqm,
> - struct
qcm_process_device *qpd);
> + struct
qcm_process_device *qpd,
> + bool pre_reset);
> int (*restore_process_queues)(struct
device_queue_manager *dqm,
> struct
qcm_process_device *qpd);
>
> diff --git
a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2d56dc534459..dbd2e8e9ae69 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -195,9 +195,9 @@ static bool kq_initialize(struct
kernel_queue *kq, struct kfd_dev *dev,
> }
>
> /* Uninitialize a kernel queue and free all its
memory usages. */
> -static void kq_uninitialize(struct kernel_queue *kq)
> +static void kq_uninitialize(struct kernel_queue *kq,
bool pre_reset)
> {
> - if (kq->queue->properties.type ==
KFD_QUEUE_TYPE_HIQ)
> + if (kq->queue->properties.type ==
KFD_QUEUE_TYPE_HIQ && !pre_reset)
>
kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>
kq->queue->mqd,
>
KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
> @@ -337,9 +337,9 @@ struct kernel_queue
*kernel_queue_init(struct kfd_dev *dev,
> return NULL;
> }
>
> -void kernel_queue_uninit(struct kernel_queue *kq)
> +void kernel_queue_uninit(struct kernel_queue *kq,
bool pre_reset)
> {
> - kq_uninitialize(kq);
> + kq_uninitialize(kq, pre_reset);
> kfree(kq);
> }
>
> diff --git
a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 6cabed06ef5d..7732a3bbebd6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -264,10 +264,10 @@ int pm_init(struct
packet_manager *pm, struct device_queue_manager *dqm)
> return 0;
> }
>
> -void pm_uninit(struct packet_manager *pm)
> +void pm_uninit(struct packet_manager *pm, bool
pre_reset)
> {
> mutex_destroy(&pm->lock);
> - kernel_queue_uninit(pm->priv_queue);
> + kernel_queue_uninit(pm->priv_queue,
pre_reset);
> }
>
> int pm_send_set_resources(struct packet_manager
*pm,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 087e96838997..9bc4ced4acba 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -762,9 +762,9 @@ struct kfd_process
*kfd_get_process(const struct task_struct *);
> struct kfd_process
*kfd_lookup_process_by_pasid(unsigned int pasid);
> struct kfd_process *kfd_lookup_process_by_mm(const
struct mm_struct *mm);
> void kfd_unref_process(struct kfd_process *p);
> -int kfd_process_evict_queues(struct kfd_process *p);
> +int kfd_process_evict_queues(struct kfd_process *p,
bool pre_reset);
> int kfd_process_restore_queues(struct kfd_process
*p);
> -void kfd_suspend_all_processes(void);
> +void kfd_suspend_all_processes(bool pre_reset);
> int kfd_resume_all_processes(void);
>
> int kfd_process_device_init_vm(struct
kfd_process_device *pdd,
> @@ -883,7 +883,7 @@ struct device_queue_manager
*device_queue_manager_init(struct kfd_dev *dev);
> void device_queue_manager_uninit(struct
device_queue_manager *dqm);
> struct kernel_queue *kernel_queue_init(struct
kfd_dev *dev,
> enum
kfd_queue_type type);
> -void kernel_queue_uninit(struct kernel_queue *kq);
> +void kernel_queue_uninit(struct kernel_queue *kq,
bool pre_reset);
> int kfd_process_vm_fault(struct
device_queue_manager *dqm, unsigned int pasid);
>
> /* Process Queue Manager */
> @@ -974,7 +974,7 @@ extern const struct
packet_manager_funcs kfd_vi_pm_funcs;
> extern const struct packet_manager_funcs
kfd_v9_pm_funcs;
>
> int pm_init(struct packet_manager *pm, struct
device_queue_manager *dqm);
> -void pm_uninit(struct packet_manager *pm);
> +void pm_uninit(struct packet_manager *pm, bool
pre_reset);
> int pm_send_set_resources(struct packet_manager
*pm,
> struct
scheduling_resources *res);
> int pm_send_runlist(struct packet_manager *pm,
struct list_head *dqm_queues);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 536a153ac9a4..7bcadd3a1e3b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -948,7 +948,7 @@ struct kfd_process
*kfd_lookup_process_by_mm(const struct mm_struct *mm)
> * Eviction is reference-counted per
process-device. This means multiple
> * evictions from different sources can be nested
safely.
> */
> -int kfd_process_evict_queues(struct kfd_process *p)
> +int kfd_process_evict_queues(struct kfd_process *p,
bool pre_reset)
> {
> struct kfd_process_device *pdd;
> int r = 0;
> @@ -956,7 +956,8 @@ int
kfd_process_evict_queues(struct kfd_process *p)
>
> list_for_each_entry(pdd,
&p->per_device_data, per_device_list) {
> r =
pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
>
-
&pdd->qpd);
>
+
&pdd->qpd,
>
+
pre_reset);
> if (r) {
> pr_err("Failed to evict
process queues\n");
> goto fail;
> @@ -1026,7 +1027,7 @@ static void
evict_process_worker(struct work_struct *work)
> flush_delayed_work(&p->restore_work);
>
> pr_debug("Started evicting pasid 0x%x\n",
p->pasid);
> - ret = kfd_process_evict_queues(p);
> + ret = kfd_process_evict_queues(p, false);
> if (!ret) {
> dma_fence_signal(p->ef);
> dma_fence_put(p->ef);
> @@ -1082,7 +1083,7 @@ static void
restore_process_worker(struct work_struct *work)
> pr_err("Failed to restore queues of
pasid 0x%x\n", p->pasid);
> }
>
> -void kfd_suspend_all_processes(void)
> +void kfd_suspend_all_processes(bool pre_reset)
> {
> struct kfd_process *p;
> unsigned int temp;
> @@ -1092,7 +1093,7 @@ void
kfd_suspend_all_processes(void)
>
cancel_delayed_work_sync(&p->eviction_work);
>
cancel_delayed_work_sync(&p->restore_work);
>
> - if (kfd_process_evict_queues(p))
> + if (kfd_process_evict_queues(p,
pre_reset))
> pr_err("Failed to suspend
process 0x%x\n", p->pasid);
> dma_fence_signal(p->ef);
> dma_fence_put(p->ef);
> diff --git
a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index d3eacf72e8db..8fa856e6a03f 100644
> ---
a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct
process_queue_manager *pqm, unsigned int qid)
> /* destroy kernel queue (DIQ) */
> dqm = pqn->kq->dev->dqm;
> dqm->ops.destroy_kernel_queue(dqm,
pqn->kq, &pdd->qpd);
> - kernel_queue_uninit(pqn->kq);
> + kernel_queue_uninit(pqn->kq, false);
> }
>
> if (pqn->q) {