From: Ofir Bitton <obitton@xxxxxxxxx> User gets notification for every engine error report, but he still lacks the exact engine information. Hence, we allow user to query for the exact engine reported an error. Signed-off-by: Ofir Bitton <obitton@xxxxxxxxx> Reviewed-by: Oded Gabbay <ogabbay@xxxxxxxxxx> Signed-off-by: Oded Gabbay <ogabbay@xxxxxxxxxx> --- drivers/accel/habanalabs/common/device.c | 14 ++ drivers/accel/habanalabs/common/habanalabs.h | 17 ++ .../habanalabs/common/habanalabs_ioctl.c | 25 +++ drivers/accel/habanalabs/gaudi2/gaudi2.c | 168 ++++++++++++++++++ include/uapi/drm/habanalabs_accel.h | 16 ++ 5 files changed, 240 insertions(+) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 28be0fc325ea..80cce6b74d05 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -2701,6 +2701,20 @@ void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info) *info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR; } +void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count) +{ + struct engine_err_info *info = &hdev->captured_err_info.engine_err; + + /* Capture only the first engine error */ + if (atomic_cmpxchg(&info->event_detected, 0, 1)) + return; + + info->event.timestamp = ktime_to_ns(ktime_get()); + info->event.engine_id = engine_id; + info->event.error_count = error_count; + info->event_info_available = true; +} + void hl_enable_err_info_capture(struct hl_error_info *captured_err_info) { vfree(captured_err_info->page_fault_info.user_mappings); diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 4fecd300b8dd..201d826b0fb7 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3062,6 +3062,20 @@ struct fw_err_info { bool event_info_available; }; +/** + * struct engine_err_info - engine error information. + * @event: holds information on the event. + * @event_detected: if set as 1, then an engine event was discovered for the + * first time after the driver has finished booting-up. + * @event_info_available: indicates that an engine event info is now available. + */ +struct engine_err_info { + struct hl_info_engine_err_event event; + atomic_t event_detected; + bool event_info_available; +}; + + /** * struct hl_error_info - holds information collected during an error. * @cs_timeout: CS timeout error information. @@ -3070,6 +3084,7 @@ struct fw_err_info { * @page_fault_info: page fault information. * @hw_err: (fatal) hardware error information. * @fw_err: firmware error information. + * @engine_err: engine error information. */ struct hl_error_info { struct cs_timeout_info cs_timeout; @@ -3078,6 +3093,7 @@ struct hl_error_info { struct page_fault_info page_fault_info; struct hw_err_info hw_err; struct fw_err_info fw_err; + struct engine_err_info engine_err; }; /** @@ -3951,6 +3967,7 @@ void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_ u64 *event_mask); void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask); void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info); +void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count); void hl_enable_err_info_capture(struct hl_error_info *captured_err_info); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 549b2518fae0..097d65e493c8 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -875,6 +875,28 @@ static int fw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args) return rc ? -EFAULT : 0; } +static int engine_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer; + struct hl_device *hdev = hpriv->hdev; + u32 user_buf_size = args->return_size; + struct engine_err_info *info; + int rc; + + if (!user_buf) + return -EINVAL; + + info = &hdev->captured_err_info.engine_err; + if (!info->event_info_available) + return 0; + + if (user_buf_size < sizeof(struct hl_info_engine_err_event)) + return -ENOMEM; + + rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_engine_err_event)); + return rc ? -EFAULT : 0; +} + static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args *info_args) { void __user *buff = (void __user *) (uintptr_t) info_args->return_pointer; @@ -1001,6 +1023,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_FW_ERR_EVENT: return fw_err_info(hpriv, args); + case HL_INFO_USER_ENGINE_ERR_EVENT: + return engine_err_info(hpriv, args); + case HL_INFO_DRAM_USAGE: return dram_usage_info(hpriv, args); default: diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 70b8f744cd73..222310bf1098 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -9588,6 +9588,171 @@ static int hl_arc_event_handle(struct hl_device *hdev, u16 event_type, } } +static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type) +{ + enum gaudi2_block_types type = GAUDI2_BLOCK_TYPE_MAX; + u16 index; + + switch (event_type) { + case GAUDI2_EVENT_TPC0_AXI_ERR_RSP ... GAUDI2_EVENT_TPC24_AXI_ERR_RSP: + index = event_type - GAUDI2_EVENT_TPC0_AXI_ERR_RSP; + type = GAUDI2_BLOCK_TYPE_TPC; + break; + case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_TPC24_QM: + index = event_type - GAUDI2_EVENT_TPC0_QM; + type = GAUDI2_BLOCK_TYPE_TPC; + break; + case GAUDI2_EVENT_MME0_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE: + case GAUDI2_EVENT_MME0_SPI_BASE ... GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID: + case GAUDI2_EVENT_MME0_QM: + index = 0; + type = GAUDI2_BLOCK_TYPE_MME; + break; + case GAUDI2_EVENT_MME1_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE: + case GAUDI2_EVENT_MME1_SPI_BASE ... GAUDI2_EVENT_MME1_WAP_SOURCE_RESULT_INVALID: + case GAUDI2_EVENT_MME1_QM: + index = 1; + type = GAUDI2_BLOCK_TYPE_MME; + break; + case GAUDI2_EVENT_MME2_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME2_CTRL_AXI_ERROR_RESPONSE: + case GAUDI2_EVENT_MME2_SPI_BASE ... GAUDI2_EVENT_MME2_WAP_SOURCE_RESULT_INVALID: + case GAUDI2_EVENT_MME2_QM: + index = 2; + type = GAUDI2_BLOCK_TYPE_MME; + break; + case GAUDI2_EVENT_MME3_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME3_CTRL_AXI_ERROR_RESPONSE: + case GAUDI2_EVENT_MME3_SPI_BASE ... GAUDI2_EVENT_MME3_WAP_SOURCE_RESULT_INVALID: + case GAUDI2_EVENT_MME3_QM: + index = 3; + type = GAUDI2_BLOCK_TYPE_MME; + break; + case GAUDI2_EVENT_KDMA_CH0_AXI_ERR_RSP: + case GAUDI2_EVENT_KDMA_BM_SPMU: + case GAUDI2_EVENT_KDMA0_CORE: + return GAUDI2_ENGINE_ID_KDMA; + case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP: + case GAUDI2_EVENT_PDMA0_CORE: + case GAUDI2_EVENT_PDMA0_BM_SPMU: + case GAUDI2_EVENT_PDMA0_QM: + return GAUDI2_ENGINE_ID_PDMA_0; + case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP: + case GAUDI2_EVENT_PDMA1_CORE: + case GAUDI2_EVENT_PDMA1_BM_SPMU: + case GAUDI2_EVENT_PDMA1_QM: + return GAUDI2_ENGINE_ID_PDMA_1; + case GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE ... GAUDI2_EVENT_DEC9_AXI_ERR_RSPONSE: + index = event_type - GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE; + type = GAUDI2_BLOCK_TYPE_DEC; + break; + case GAUDI2_EVENT_DEC0_SPI ... GAUDI2_EVENT_DEC9_BMON_SPMU: + index = (event_type - GAUDI2_EVENT_DEC0_SPI) >> 1; + type = GAUDI2_BLOCK_TYPE_DEC; + break; + case GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE ... GAUDI2_EVENT_NIC11_AXI_ERROR_RESPONSE: + index = event_type - GAUDI2_EVENT_NIC0_AXI_ERROR_RESPONSE; + return GAUDI2_ENGINE_ID_NIC0_0 + (index * 2); + case GAUDI2_EVENT_NIC0_QM0 ... GAUDI2_EVENT_NIC11_QM1: + index = event_type - GAUDI2_EVENT_NIC0_QM0; + return GAUDI2_ENGINE_ID_NIC0_0 + index; + case GAUDI2_EVENT_NIC0_BMON_SPMU ... GAUDI2_EVENT_NIC11_SW_ERROR: + index = event_type - GAUDI2_EVENT_NIC0_BMON_SPMU; + return GAUDI2_ENGINE_ID_NIC0_0 + (index * 2); + case GAUDI2_EVENT_TPC0_BMON_SPMU ... GAUDI2_EVENT_TPC24_KERNEL_ERR: + index = (event_type - GAUDI2_EVENT_TPC0_BMON_SPMU) >> 1; + type = GAUDI2_BLOCK_TYPE_TPC; + break; + case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE: + case GAUDI2_EVENT_ROTATOR0_BMON_SPMU: + case GAUDI2_EVENT_ROTATOR0_ROT0_QM: + return GAUDI2_ENGINE_ID_ROT_0; + case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE: + case GAUDI2_EVENT_ROTATOR1_BMON_SPMU: + case GAUDI2_EVENT_ROTATOR1_ROT1_QM: + return GAUDI2_ENGINE_ID_ROT_1; + case GAUDI2_EVENT_HDMA0_BM_SPMU: + case GAUDI2_EVENT_HDMA0_QM: + case GAUDI2_EVENT_HDMA0_CORE: + return GAUDI2_DCORE0_ENGINE_ID_EDMA_0; + case GAUDI2_EVENT_HDMA1_BM_SPMU: + case GAUDI2_EVENT_HDMA1_QM: + case GAUDI2_EVENT_HDMA1_CORE: + return GAUDI2_DCORE0_ENGINE_ID_EDMA_1; + case GAUDI2_EVENT_HDMA2_BM_SPMU: + case GAUDI2_EVENT_HDMA2_QM: + case GAUDI2_EVENT_HDMA2_CORE: + return GAUDI2_DCORE1_ENGINE_ID_EDMA_0; + case GAUDI2_EVENT_HDMA3_BM_SPMU: + case GAUDI2_EVENT_HDMA3_QM: + case GAUDI2_EVENT_HDMA3_CORE: + return GAUDI2_DCORE1_ENGINE_ID_EDMA_1; + case GAUDI2_EVENT_HDMA4_BM_SPMU: + case GAUDI2_EVENT_HDMA4_QM: + case GAUDI2_EVENT_HDMA4_CORE: + return GAUDI2_DCORE2_ENGINE_ID_EDMA_0; + case GAUDI2_EVENT_HDMA5_BM_SPMU: + case GAUDI2_EVENT_HDMA5_QM: + case GAUDI2_EVENT_HDMA5_CORE: + return GAUDI2_DCORE2_ENGINE_ID_EDMA_1; + case GAUDI2_EVENT_HDMA6_BM_SPMU: + case GAUDI2_EVENT_HDMA6_QM: + case GAUDI2_EVENT_HDMA6_CORE: + return GAUDI2_DCORE3_ENGINE_ID_EDMA_0; + case GAUDI2_EVENT_HDMA7_BM_SPMU: + case GAUDI2_EVENT_HDMA7_QM: + case GAUDI2_EVENT_HDMA7_CORE: + return GAUDI2_DCORE3_ENGINE_ID_EDMA_1; + default: + break; + } + + switch (type) { + case GAUDI2_BLOCK_TYPE_TPC: + switch (index) { + case TPC_ID_DCORE0_TPC0 ... TPC_ID_DCORE0_TPC5: + return GAUDI2_DCORE0_ENGINE_ID_TPC_0 + index; + case TPC_ID_DCORE1_TPC0 ... TPC_ID_DCORE1_TPC5: + return GAUDI2_DCORE1_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE1_TPC0; + case TPC_ID_DCORE2_TPC0 ... TPC_ID_DCORE2_TPC5: + return GAUDI2_DCORE2_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE2_TPC0; + case TPC_ID_DCORE3_TPC0 ... TPC_ID_DCORE3_TPC5: + return GAUDI2_DCORE3_ENGINE_ID_TPC_0 + index - TPC_ID_DCORE3_TPC0; + default: + break; + } + break; + case GAUDI2_BLOCK_TYPE_MME: + switch (index) { + case MME_ID_DCORE0: return GAUDI2_DCORE0_ENGINE_ID_MME; + case MME_ID_DCORE1: return GAUDI2_DCORE1_ENGINE_ID_MME; + case MME_ID_DCORE2: return GAUDI2_DCORE2_ENGINE_ID_MME; + case MME_ID_DCORE3: return GAUDI2_DCORE3_ENGINE_ID_MME; + default: + break; + } + break; + case GAUDI2_BLOCK_TYPE_DEC: + switch (index) { + case DEC_ID_DCORE0_DEC0: return GAUDI2_DCORE0_ENGINE_ID_DEC_0; + case DEC_ID_DCORE0_DEC1: return GAUDI2_DCORE0_ENGINE_ID_DEC_1; + case DEC_ID_DCORE1_DEC0: return GAUDI2_DCORE1_ENGINE_ID_DEC_0; + case DEC_ID_DCORE1_DEC1: return GAUDI2_DCORE1_ENGINE_ID_DEC_1; + case DEC_ID_DCORE2_DEC0: return GAUDI2_DCORE2_ENGINE_ID_DEC_0; + case DEC_ID_DCORE2_DEC1: return GAUDI2_DCORE2_ENGINE_ID_DEC_1; + case DEC_ID_DCORE3_DEC0: return GAUDI2_DCORE3_ENGINE_ID_DEC_0; + case DEC_ID_DCORE3_DEC1: return GAUDI2_DCORE3_ENGINE_ID_DEC_1; + case DEC_ID_PCIE_VDEC0: return GAUDI2_PCIE_ENGINE_ID_DEC_0; + case DEC_ID_PCIE_VDEC1: return GAUDI2_PCIE_ENGINE_ID_DEC_1; + default: + break; + } + break; + default: + break; + } + + return U16_MAX; +} + static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) { struct gaudi2_device *gaudi2 = hdev->asic_specific; @@ -10010,6 +10175,9 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent } } + if (event_mask & HL_NOTIFIER_EVENT_USER_ENGINE_ERR) + hl_capture_engine_err(hdev, event_id_to_engine_id(hdev, event_type), error_count); + /* Make sure to dump an error in case no error cause was printed so far. * Note that although we have counted the errors, we use this number as * a boolean. diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index e6436f3e8ea6..f912869b151e 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ -809,6 +809,7 @@ enum hl_server_type { * HL_INFO_FW_ERR_EVENT - Retrieve information on the reported FW error. * May return 0 even though no new data is available, in that case * timestamp will be 0. + * HL_INFO_USER_ENGINE_ERR_EVENT - Retrieve the last engine id that reported an error. */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -845,6 +846,7 @@ enum hl_server_type { #define HL_INFO_FW_GENERIC_REQ 35 #define HL_INFO_HW_ERR_EVENT 36 #define HL_INFO_FW_ERR_EVENT 37 +#define HL_INFO_USER_ENGINE_ERR_EVENT 38 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -1226,6 +1228,20 @@ struct hl_info_fw_err_event { __u32 pad; }; +/** + * struct hl_info_engine_err_event - engine error info + * @timestamp: time-stamp of error occurrence + * @engine_id: engine id who reported the error. + * @error_count: Amount of errors reported. + * @pad: size padding for u64 granularity. + */ +struct hl_info_engine_err_event { + __s64 timestamp; + __u16 engine_id; + __u16 error_count; + __u32 pad; +}; + /** * struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information. * @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size -- 2.34.1