Add DPC handlers with basic recovery functionality.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 181 ++++++++++++++++++++++++++++-
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 9 +-
3 files changed, 196 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 49ea9fa..3399242 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -49,6 +49,8 @@
#include <linux/rbtree.h>
#include <linux/hashtable.h>
#include <linux/dma-fence.h>
+#include <linux/pci.h>
+#include <linux/aer.h>
#include <drm/ttm/ttm_bo_api.h>
#include <drm/ttm/ttm_bo_driver.h>
@@ -1263,6 +1265,13 @@ static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return
void amdgpu_register_gpu_instance(struct amdgpu_device *adev);
void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev);
+pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state);
+pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev);
+pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev);
+void amdgpu_pci_resume(struct pci_dev *pdev);
+
+
#include "amdgpu_object.h"
/* used by df_v3_6.c and amdgpu_pmu.c */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5a948ed..84f8d14 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -350,7 +350,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
*
* Returns the 8 bit value from the offset specified.
*/
-uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
+uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
+{
+
if (offset < adev->rmmio_size)
return (readb(adev->rmmio + offset));
BUG();
@@ -371,7 +373,9 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
*
* Writes the value specified to the offset specified.
*/
-void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
+void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
+{
+
if (offset < adev->rmmio_size)
writeb(value, adev->rmmio + offset);
else
@@ -380,6 +384,7 @@ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags)
{
+
trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
if ((reg * 4) < adev->rmmio_size)
@@ -407,6 +412,7 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
uint32_t acc_flags)
{
+
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
return amdgpu_kiq_wreg(adev, reg, v);
@@ -461,6 +467,7 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
*/
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
+
if ((reg * 4) < adev->rio_mem_size)
iowrite32(v, adev->rio_mem + (reg * 4));
else {
@@ -480,6 +487,7 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
*/
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
+
if (index < adev->doorbell.num_doorbells) {
return readl(adev->doorbell.ptr + index);
} else {
@@ -500,6 +508,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
*/
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
+
if (index < adev->doorbell.num_doorbells) {
writel(v, adev->doorbell.ptr + index);
} else {
@@ -518,6 +527,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
*/
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
+
if (index < adev->doorbell.num_doorbells) {
return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
} else {
@@ -538,6 +548,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
*/
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
+
if (index < adev->doorbell.num_doorbells) {
atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
} else {
@@ -2989,6 +3000,7 @@ static const struct attribute *amdgpu_dev_attributes[] = {
NULL
};
+
/**
* amdgpu_device_init - initialize the driver
*
@@ -3207,6 +3219,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
}
}
+ pci_enable_pcie_error_reporting(adev->ddev.pdev);
+
+
/* Post card if necessary */
if (amdgpu_device_need_post(adev)) {
if (!adev->bios) {
@@ -3359,6 +3374,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (r)
dev_err(adev->dev, "amdgpu_pmu_init failed\n");
+ if (pci_save_state(pdev))
+ DRM_ERROR("Failed to save PCI state!!\n");
+
return 0;
failed:
@@ -4701,3 +4719,162 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
return 0;
}
+
+/**
+ * amdgpu_pci_error_detected - Called when a PCI error is detected.
+ * @pdev: PCI device struct
+ * @state: PCI channel state
+ *
+ * Description: Called when a PCI error is detected.
+ *
+ * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
+ */
+pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+ struct drm_device *dev = pci_get_drvdata(pdev);
+ struct amdgpu_device *adev = drm_to_adev(dev);
+
+ DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
+
+ switch (state) {
+ case pci_channel_io_normal:
+ return PCI_ERS_RESULT_CAN_RECOVER;
+ case pci_channel_io_frozen: {
+ /* Fatal error, prepare for slot reset */
+
+ amdgpu_device_lock_adev(adev);
+ return PCI_ERS_RESULT_NEED_RESET;
+ }
+ case pci_channel_io_perm_failure:
+ /* Permanent error, prepare for device removal */
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+ return PCI_ERS_RESULT_NEED_RESET;
+}