Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
Tested-by: Bai Zoy <Zoy.Bai@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 +---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 3 +
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 73 +++++++++++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +-
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 ++-
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 ++-
drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 7 ++-
8 files changed, 104 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4264abc5604d..99efd8317547 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -109,6 +109,7 @@
#include "amdgpu_fdinfo.h"
#include "amdgpu_mca.h"
#include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
#define MAX_GPU_INSTANCE 16
@@ -509,16 +510,6 @@ struct amdgpu_allowed_register_entry {
bool grbm_indexed;
};
-enum amd_reset_method {
- AMD_RESET_METHOD_NONE = -1,
- AMD_RESET_METHOD_LEGACY = 0,
- AMD_RESET_METHOD_MODE0,
- AMD_RESET_METHOD_MODE1,
- AMD_RESET_METHOD_MODE2,
- AMD_RESET_METHOD_BACO,
- AMD_RESET_METHOD_PCI,
-};
-
struct amdgpu_video_codec_info {
u32 codec_type;
u32 max_width;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e582f1044c0f..7fa82269c30f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5201,6 +5201,12 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
}
tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
+
+ /* Drop all pending resets since we will reset now anyway */
+ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+ reset_list);
+ amdgpu_reset_pending_list(tmp_adev->reset_domain);
+
/* Actual ASIC resets if needed.*/
/* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
@@ -5296,7 +5302,7 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
}
struct amdgpu_recover_work_struct {
- struct work_struct base;
+ struct amdgpu_reset_work_struct base;
struct amdgpu_device *adev;
struct amdgpu_job *job;
int ret;
@@ -5304,7 +5310,7 @@ struct amdgpu_recover_work_struct {
static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
{
- struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base);
+ struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base.base.work);
recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
}
@@ -5316,12 +5322,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
{
struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
- INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
+ INIT_DELAYED_WORK(&work.base.base, amdgpu_device_queue_gpu_recover_work);
+ INIT_LIST_HEAD(&work.base.node);
if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
return -EAGAIN;
- flush_work(&work.base);
+ flush_delayed_work(&work.base.base);
+
+ amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &work.base);
return work.ret;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index c80af0889773..ffddd419c351 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -134,6 +134,9 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
atomic_set(&reset_domain->in_gpu_reset, 0);
init_rwsem(&reset_domain->sem);
+ INIT_LIST_HEAD(&reset_domain->pending_works);
+ mutex_init(&reset_domain->reset_lock);
+
return reset_domain;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 1949dbe28a86..863ec5720fc1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -24,7 +24,18 @@
#ifndef __AMDGPU_RESET_H__
#define __AMDGPU_RESET_H__
-#include "amdgpu.h"
+
+#include <linux/atomic.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/kref.h>
+#include <linux/rwsem.h>
+#include <linux/workqueue.h>
+
+struct amdgpu_device;
+struct amdgpu_job;
+struct amdgpu_hive_info;
+
enum AMDGPU_RESET_FLAGS {
@@ -32,6 +43,17 @@ enum AMDGPU_RESET_FLAGS {
AMDGPU_SKIP_HW_RESET = 1,
};
+
+enum amd_reset_method {
+ AMD_RESET_METHOD_NONE = -1,
+ AMD_RESET_METHOD_LEGACY = 0,
+ AMD_RESET_METHOD_MODE0,
+ AMD_RESET_METHOD_MODE1,
+ AMD_RESET_METHOD_MODE2,
+ AMD_RESET_METHOD_BACO,
+ AMD_RESET_METHOD_PCI,
+};
+
struct amdgpu_reset_context {
enum amd_reset_method method;
struct amdgpu_device *reset_req_dev;
@@ -40,6 +62,8 @@ struct amdgpu_reset_context {
unsigned long flags;
};
+struct amdgpu_reset_control;
+
struct amdgpu_reset_handler {
enum amd_reset_method reset_method;
struct list_head handler_list;
@@ -76,12 +100,21 @@ enum amdgpu_reset_domain_type {
XGMI_HIVE
};
+
+struct amdgpu_reset_work_struct {
+ struct delayed_work base;
+ struct list_head node;
+};
+
struct amdgpu_reset_domain {
struct kref refcount;
struct workqueue_struct *wq;
enum amdgpu_reset_domain_type type;
struct rw_semaphore sem;
atomic_t in_gpu_reset;
+
+ struct list_head pending_works;
+ struct mutex reset_lock;
};
@@ -113,9 +146,43 @@ static inline void amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *dom
}
static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain,
- struct work_struct *work)
+ struct amdgpu_reset_work_struct *work)
{
- return queue_work(domain->wq, work);
+ mutex_lock(&domain->reset_lock);
+
+ if (!queue_delayed_work(domain->wq, &work->base, 0)) {
+ mutex_unlock(&domain->reset_lock);
+ return false;
+ }
+
+ list_add_tail(&work->node, &domain->pending_works);
+ mutex_unlock(&domain->reset_lock);
+
+ return true;
+}
+
+static inline void amdgpu_reset_domain_del_pendning_work(struct amdgpu_reset_domain *domain,
+ struct amdgpu_reset_work_struct *work)
+{
+ mutex_lock(&domain->reset_lock);
+ list_del_init(&work->node);
+ mutex_unlock(&domain->reset_lock);
+}
+
+static inline void amdgpu_reset_pending_list(struct amdgpu_reset_domain *domain)
+{
+ struct amdgpu_reset_work_struct *entry, *tmp;
+
+ mutex_lock(&domain->reset_lock);
+ list_for_each_entry_safe(entry, tmp, &domain->pending_works, node) {
+
+ list_del_init(&entry->node);
+
+ /* Stop any other related pending resets */
+ cancel_delayed_work(&entry->base);
+ }
+
+ mutex_unlock(&domain->reset_lock);
}
void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 239f232f9c02..574e870d3064 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -25,6 +25,7 @@
#define AMDGPU_VIRT_H
#include "amdgv_sriovmsg.h"
+#include "amdgpu_reset.h"
#define AMDGPU_SRIOV_CAPS_SRIOV_VBIOS (1 << 0) /* vBIOS is sr-iov ready */
#define AMDGPU_SRIOV_CAPS_ENABLE_IOV (1 << 1) /* sr-iov is enabled on this GPU */
@@ -230,7 +231,7 @@ struct amdgpu_virt {
uint32_t reg_val_offs;
struct amdgpu_irq_src ack_irq;
struct amdgpu_irq_src rcv_irq;
- struct work_struct flr_work;
+ struct amdgpu_reset_work_struct flr_work;
struct amdgpu_mm_table mm_table;
const struct amdgpu_virt_ops *ops;
struct amdgpu_vf_error_buffer vf_errors;
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index b81acf59870c..f3d1c2be9292 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -251,7 +251,7 @@ static int xgpu_ai_set_mailbox_ack_irq(struct amdgpu_device *adev,
static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
{
- struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
+ struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
@@ -380,7 +380,8 @@ int xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
return r;
}
- INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
+ INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_ai_mailbox_flr_work);
+ INIT_LIST_HEAD(&adev->virt.flr_work.node);
return 0;
}
@@ -389,6 +390,8 @@ void xgpu_ai_mailbox_put_irq(struct amdgpu_device *adev)
{
amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
+
+ amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work);
}
static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index 22c10b97ea81..927b3d5bb1d0 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -275,7 +275,7 @@ static int xgpu_nv_set_mailbox_ack_irq(struct amdgpu_device *adev,
static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
{
- struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
+ struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
@@ -407,7 +407,8 @@ int xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
return r;
}
- INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
+ INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_nv_mailbox_flr_work);
+ INIT_LIST_HEAD(&adev->virt.flr_work.node);
return 0;
}
@@ -416,6 +417,8 @@ void xgpu_nv_mailbox_put_irq(struct amdgpu_device *adev)
{
amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
+
+ amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work);
}
const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index 7b63d30b9b79..1d4ef5c70730 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -512,7 +512,7 @@ static int xgpu_vi_set_mailbox_ack_irq(struct amdgpu_device *adev,
static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
{
- struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
+ struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work.base.work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
/* wait until RCV_MSG become 3 */
@@ -610,7 +610,8 @@ int xgpu_vi_mailbox_get_irq(struct amdgpu_device *adev)
return r;
}
- INIT_WORK(&adev->virt.flr_work, xgpu_vi_mailbox_flr_work);
+ INIT_DELAYED_WORK(&adev->virt.flr_work.base, xgpu_vi_mailbox_flr_work);
+ INIT_LIST_HEAD(&adev->virt.flr_work.node);
return 0;
}
@@ -619,6 +620,8 @@ void xgpu_vi_mailbox_put_irq(struct amdgpu_device *adev)
{
amdgpu_irq_put(adev, &adev->virt.ack_irq, 0);
amdgpu_irq_put(adev, &adev->virt.rcv_irq, 0);
+
+ amdgpu_reset_domain_del_pendning_work(adev->reset_domain, &adev->virt.flr_work);
}
const struct amdgpu_virt_ops xgpu_vi_virt_ops = {