Re: [PATCH V4 09/10] accel/amdxdna: Add error handling

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 10/18/24 14:01, Jeffrey Hugo wrote:
On 10/11/2024 5:12 PM, Lizhi Hou wrote:
When there is a hardware error, the NPU firmware notifies the host through a mailbox message. The message includes details of the error, such as the
tile and column indexes where the error occurred.

The driver starts a thread to handle the NPU error message. The thread
stops the clients that are using the column where the error occurred. Then
the driver resets that column.

Co-developed-by: Min Ma <min.ma@xxxxxxx>
Signed-off-by: Min Ma <min.ma@xxxxxxx>
Signed-off-by: Lizhi Hou <lizhi.hou@xxxxxxx>
---
  drivers/accel/amdxdna/Makefile       |   1 +
  drivers/accel/amdxdna/aie2_error.c   | 356 +++++++++++++++++++++++++++
  drivers/accel/amdxdna/aie2_message.c |  19 ++
  drivers/accel/amdxdna/aie2_pci.c     |  32 +++
  drivers/accel/amdxdna/aie2_pci.h     |   9 +
  5 files changed, 417 insertions(+)
  create mode 100644 drivers/accel/amdxdna/aie2_error.c

diff --git a/drivers/accel/amdxdna/Makefile b/drivers/accel/amdxdna/Makefile
index a688c378761f..ed6f87910880 100644
--- a/drivers/accel/amdxdna/Makefile
+++ b/drivers/accel/amdxdna/Makefile
@@ -2,6 +2,7 @@
    amdxdna-y := \
      aie2_ctx.o \
+    aie2_error.o \
      aie2_message.o \
      aie2_pci.o \
      aie2_psp.o \
diff --git a/drivers/accel/amdxdna/aie2_error.c b/drivers/accel/amdxdna/aie2_error.c
new file mode 100644
index 000000000000..d2787549f3b7
--- /dev/null
+++ b/drivers/accel/amdxdna/aie2_error.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+ */
+
+#include <drm/drm_cache.h>
+#include <drm/drm_device.h>
+#include <drm/drm_print.h>
+#include <drm/gpu_scheduler.h>
+#include <linux/dma-mapping.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+
+#include "aie2_msg_priv.h"
+#include "aie2_pci.h"
+#include "amdxdna_mailbox.h"
+#include "amdxdna_pci_drv.h"
+
+struct async_event {
+    struct amdxdna_dev_hdl        *ndev;
+    struct async_event_msg_resp    resp;
+    struct workqueue_struct        *wq;
+    struct work_struct        work;
+    u8                *buf;
+    dma_addr_t            addr;
+    u32                size;
+};
+
+struct async_events {
+    struct workqueue_struct        *wq;
+    u8                *buf;
+    dma_addr_t            addr;
+    u32                size;
+    u32                event_cnt;
+    struct async_event        event[] __counted_by(event_cnt);
+};
+
+/*
+ * The enum, struct and lookup tables below are ported from the XAIE util header file.
+ *
+ * The data below is defined by the AIE device and is used to decode error
+ * messages from the device.
+ */
+
+enum aie_module_type {
+    AIE_MEM_MOD = 0,
+    AIE_CORE_MOD,
+    AIE_PL_MOD,
+};
+
+enum aie_error_category {
+    AIE_ERROR_SATURATION = 0,
+    AIE_ERROR_FP,
+    AIE_ERROR_STREAM,
+    AIE_ERROR_ACCESS,
+    AIE_ERROR_BUS,
+    AIE_ERROR_INSTRUCTION,
+    AIE_ERROR_ECC,
+    AIE_ERROR_LOCK,
+    AIE_ERROR_DMA,
+    AIE_ERROR_MEM_PARITY,
+    /* Unknown is not from XAIE, added for better category */
+    AIE_ERROR_UNKNOWN,
+};
+
+/* Don't pack, unless XAIE side changed */
+struct aie_error {
+    u8            row;
+    u8            col;
+    u32            mod_type;
+    u8            event_id;
+};

This looks like a structure used to decode data from an external device. Assuming that is so, the wrong types are used here. Shouldn't these be the "__" types, like "__u8"? Plain u8, etc. are kernel-internal-only types.

Yes, you are correct. I will fix this.


Thanks,

Lizhi


+
+struct aie_err_info {
+    u32            err_cnt;
+    u32            ret_code;
+    u32            rsvd;
+    struct aie_error    payload[] __counted_by(err_cnt);
+};
+
+struct aie_event_category {
+    u8            event_id;
+    enum aie_error_category category;
+};




[Index of Archives]     [Linux DRI Users]     [Linux Intel Graphics]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [XFree86]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux