On 20/10/2023 18:58, Aravind Iddamsetty wrote: > We expose the various error counters supported on a hardware via genl > subsystem through the registered commands to userspace. The > DRM_RAS_CMD_QUERY lists the error names with config id, > DRM_RAS_CMD_READ_ONE returns the counter value for the requested config > id and the DRM_RAS_CMD_READ_ALL lists the counters for all errors along > with their names and config ids. > > v2: Rebase > > v3: > 1. presently xe_list_errors fills blank data for IGFX, prevent it by > having an early check of IS_DGFX (Michael J. Ruhl) > 2. update errors from all sources > > Cc: Ruhl, Michael J<michael.j.ruhl@xxxxxxxxx> > Signed-off-by: Aravind Iddamsetty<aravind.iddamsetty@xxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/xe/xe_netlink.c | 499 +++++++++++++++++++++++++++++++- > include/uapi/drm/xe_drm.h | 81 ++++++ > 2 files changed, 578 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/xe/xe_netlink.c b/drivers/gpu/drm/xe/xe_netlink.c > index 81d785455632..3e4cdb5e4920 100644 > --- a/drivers/gpu/drm/xe/xe_netlink.c > +++ b/drivers/gpu/drm/xe/xe_netlink.c > @@ -2,16 +2,511 @@ > /* > * Copyright © 2023 Intel Corporation > */ > +#include <drm/xe_drm.h> > + > #include "xe_device.h" > > -static int xe_genl_list_errors(struct drm_device *drm, struct sk_buff *msg, struct genl_info *info) > +#define MAX_ERROR_NAME 100 > + > +static const char * const xe_hw_error_events[] = { > + [XE_GENL_GT_ERROR_CORRECTABLE_L3_SNG] = "correctable-l3-sng", > + [XE_GENL_GT_ERROR_CORRECTABLE_GUC] = "correctable-guc", > + [XE_GENL_GT_ERROR_CORRECTABLE_SAMPLER] = "correctable-sampler", > + [XE_GENL_GT_ERROR_CORRECTABLE_SLM] = "correctable-slm", > + [XE_GENL_GT_ERROR_CORRECTABLE_EU_IC] = "correctable-eu-ic", > + [XE_GENL_GT_ERROR_CORRECTABLE_EU_GRF] = "correctable-eu-grf", > + [XE_GENL_GT_ERROR_FATAL_ARR_BIST] = "fatal-array-bist", > + [XE_GENL_GT_ERROR_FATAL_L3_DOUB] = "fatal-l3-double", > + [XE_GENL_GT_ERROR_FATAL_L3_ECC_CHK] = "fatal-l3-ecc-checker", > + 
[XE_GENL_GT_ERROR_FATAL_GUC] = "fatal-guc", > + [XE_GENL_GT_ERROR_FATAL_IDI_PAR] = "fatal-idi-parity", > + [XE_GENL_GT_ERROR_FATAL_SQIDI] = "fatal-sqidi", > + [XE_GENL_GT_ERROR_FATAL_SAMPLER] = "fatal-sampler", > + [XE_GENL_GT_ERROR_FATAL_SLM] = "fatal-slm", > + [XE_GENL_GT_ERROR_FATAL_EU_IC] = "fatal-eu-ic", > + [XE_GENL_GT_ERROR_FATAL_EU_GRF] = "fatal-eu-grf", > + [XE_GENL_GT_ERROR_FATAL_FPU] = "fatal-fpu", > + [XE_GENL_GT_ERROR_FATAL_TLB] = "fatal-tlb", > + [XE_GENL_GT_ERROR_FATAL_L3_FABRIC] = "fatal-l3-fabric", > + [XE_GENL_GT_ERROR_CORRECTABLE_SUBSLICE] = "correctable-subslice", > + [XE_GENL_GT_ERROR_CORRECTABLE_L3BANK] = "correctable-l3bank", > + [XE_GENL_GT_ERROR_FATAL_SUBSLICE] = "fatal-subslice", > + [XE_GENL_GT_ERROR_FATAL_L3BANK] = "fatal-l3bank", > + [XE_GENL_SGUNIT_ERROR_CORRECTABLE] = "sgunit-correctable", > + [XE_GENL_SGUNIT_ERROR_NONFATAL] = "sgunit-nonfatal", > + [XE_GENL_SGUNIT_ERROR_FATAL] = "sgunit-fatal", > + [XE_GENL_SOC_ERROR_NONFATAL_CSC_PSF_CMD] = "soc-nonfatal-csc-psf-cmd-parity", > + [XE_GENL_SOC_ERROR_NONFATAL_CSC_PSF_CMP] = "soc-nonfatal-csc-psf-unexpected-completion", > + [XE_GENL_SOC_ERROR_NONFATAL_CSC_PSF_REQ] = "soc-nonfatal-csc-psf-unsupported-request", > + [XE_GENL_SOC_ERROR_NONFATAL_ANR_MDFI] = "soc-nonfatal-anr-mdfi", > + [XE_GENL_SOC_ERROR_NONFATAL_MDFI_T2T] = "soc-nonfatal-mdfi-t2t", > + [XE_GENL_SOC_ERROR_NONFATAL_MDFI_T2C] = "soc-nonfatal-mdfi-t2c", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 0)] = "soc-nonfatal-hbm-ss0-0", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 1)] = "soc-nonfatal-hbm-ss0-1", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 2)] = "soc-nonfatal-hbm-ss0-2", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 3)] = "soc-nonfatal-hbm-ss0-3", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 4)] = "soc-nonfatal-hbm-ss0-4", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 5)] = "soc-nonfatal-hbm-ss0-5", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 6)] = "soc-nonfatal-hbm-ss0-6", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 7)] = "soc-nonfatal-hbm-ss0-7", > + 
[XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 8)] = "soc-nonfatal-hbm-ss1-0", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 9)] = "soc-nonfatal-hbm-ss1-1", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 10)] = "soc-nonfatal-hbm-ss1-2", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 11)] = "soc-nonfatal-hbm-ss1-3", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 12)] = "soc-nonfatal-hbm-ss1-4", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 13)] = "soc-nonfatal-hbm-ss1-5", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 14)] = "soc-nonfatal-hbm-ss1-6", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 15)] = "soc-nonfatal-hbm-ss1-7", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 0)] = "soc-nonfatal-hbm-ss2-0", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 1)] = "soc-nonfatal-hbm-ss2-1", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 2)] = "soc-nonfatal-hbm-ss2-2", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 3)] = "soc-nonfatal-hbm-ss2-3", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 4)] = "soc-nonfatal-hbm-ss2-4", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 5)] = "soc-nonfatal-hbm-ss2-5", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 6)] = "soc-nonfatal-hbm-ss2-6", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 7)] = "soc-nonfatal-hbm-ss2-7", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 8)] = "soc-nonfatal-hbm-ss3-0", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 9)] = "soc-nonfatal-hbm-ss3-1", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 10)] = "soc-nonfatal-hbm-ss3-2", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 11)] = "soc-nonfatal-hbm-ss3-3", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 12)] = "soc-nonfatal-hbm-ss3-4", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 13)] = "soc-nonfatal-hbm-ss3-5", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 14)] = "soc-nonfatal-hbm-ss3-6", > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 15)] = "soc-nonfatal-hbm-ss3-7", > + [XE_GENL_SOC_ERROR_FATAL_CSC_PSF_CMD] = "soc-fatal-csc-psf-cmd-parity", > + [XE_GENL_SOC_ERROR_FATAL_CSC_PSF_CMP] = "soc-fatal-csc-psf-unexpected-completion", > + [XE_GENL_SOC_ERROR_FATAL_CSC_PSF_REQ] = "soc-fatal-csc-psf-unsupported-request", > + [XE_GENL_SOC_ERROR_FATAL_PUNIT] = 
"soc-fatal-punit", > + [XE_GENL_SOC_ERROR_FATAL_PCIE_PSF_CMD] = "soc-fatal-pcie-psf-command-parity", > + [XE_GENL_SOC_ERROR_FATAL_PCIE_PSF_CMP] = "soc-fatal-pcie-psf-unexpected-completion", > + [XE_GENL_SOC_ERROR_FATAL_PCIE_PSF_REQ] = "soc-fatal-pcie-psf-unsupported-request", > + [XE_GENL_SOC_ERROR_FATAL_ANR_MDFI] = "soc-fatal-anr-mdfi", > + [XE_GENL_SOC_ERROR_FATAL_MDFI_T2T] = "soc-fatal-mdfi-t2t", > + [XE_GENL_SOC_ERROR_FATAL_MDFI_T2C] = "soc-fatal-mdfi-t2c", > + [XE_GENL_SOC_ERROR_FATAL_PCIE_AER] = "soc-fatal-malformed-pcie-aer", > + [XE_GENL_SOC_ERROR_FATAL_PCIE_ERR] = "soc-fatal-malformed-pcie-err", > + [XE_GENL_SOC_ERROR_FATAL_UR_COND] = "soc-fatal-ur-condition-ieh", > + [XE_GENL_SOC_ERROR_FATAL_SERR_SRCS] = "soc-fatal-from-serr-sources", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 0)] = "soc-fatal-hbm-ss0-0", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 1)] = "soc-fatal-hbm-ss0-1", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 2)] = "soc-fatal-hbm-ss0-2", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 3)] = "soc-fatal-hbm-ss0-3", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 4)] = "soc-fatal-hbm-ss0-4", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 5)] = "soc-fatal-hbm-ss0-5", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 6)] = "soc-fatal-hbm-ss0-6", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 7)] = "soc-fatal-hbm-ss0-7", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 8)] = "soc-fatal-hbm-ss1-0", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 9)] = "soc-fatal-hbm-ss1-1", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 10)] = "soc-fatal-hbm-ss1-2", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 11)] = "soc-fatal-hbm-ss1-3", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 12)] = "soc-fatal-hbm-ss1-4", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 13)] = "soc-fatal-hbm-ss1-5", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 14)] = "soc-fatal-hbm-ss1-6", > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 15)] = "soc-fatal-hbm-ss1-7", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 0)] = "soc-fatal-hbm-ss2-0", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 1)] = "soc-fatal-hbm-ss2-1", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 2)] = "soc-fatal-hbm-ss2-2", > + 
[XE_GENL_SOC_ERROR_FATAL_HBM(1, 3)] = "soc-fatal-hbm-ss2-3", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 4)] = "soc-fatal-hbm-ss2-4", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 5)] = "soc-fatal-hbm-ss2-5", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 6)] = "soc-fatal-hbm-ss2-6", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 7)] = "soc-fatal-hbm-ss2-7", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 8)] = "soc-fatal-hbm-ss3-0", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 9)] = "soc-fatal-hbm-ss3-1", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 10)] = "soc-fatal-hbm-ss3-2", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 11)] = "soc-fatal-hbm-ss3-3", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 12)] = "soc-fatal-hbm-ss3-4", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 13)] = "soc-fatal-hbm-ss3-5", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 14)] = "soc-fatal-hbm-ss3-6", > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 15)] = "soc-fatal-hbm-ss3-7", > + [XE_GENL_GSC_ERROR_CORRECTABLE_SRAM_ECC] = "gsc-correctable-sram-ecc", > + [XE_GENL_GSC_ERROR_NONFATAL_MIA_SHUTDOWN] = "gsc-nonfatal-mia-shutdown", > + [XE_GENL_GSC_ERROR_NONFATAL_MIA_INTERNAL] = "gsc-nonfatal-mia-internal", > + [XE_GENL_GSC_ERROR_NONFATAL_SRAM_ECC] = "gsc-nonfatal-sram-ecc", > + [XE_GENL_GSC_ERROR_NONFATAL_WDG_TIMEOUT] = "gsc-nonfatal-wdg-timeout", > + [XE_GENL_GSC_ERROR_NONFATAL_ROM_PARITY] = "gsc-nonfatal-rom-parity", > + [XE_GENL_GSC_ERROR_NONFATAL_UCODE_PARITY] = "gsc-nonfatal-ucode-parity", > + [XE_GENL_GSC_ERROR_NONFATAL_VLT_GLITCH] = "gsc-nonfatal-vlt-glitch", > + [XE_GENL_GSC_ERROR_NONFATAL_FUSE_PULL] = "gsc-nonfatal-fuse-pull", > + [XE_GENL_GSC_ERROR_NONFATAL_FUSE_CRC_CHECK] = "gsc-nonfatal-fuse-crc-check", > + [XE_GENL_GSC_ERROR_NONFATAL_SELF_MBIST] = "gsc-nonfatal-self-mbist", > + [XE_GENL_GSC_ERROR_NONFATAL_AON_RF_PARITY] = "gsc-nonfatal-aon-parity", > + [XE_GENL_SGGI_ERROR_NONFATAL] = "sggi-nonfatal-data-parity", > + [XE_GENL_SGLI_ERROR_NONFATAL] = "sgli-nonfatal-data-parity", > + [XE_GENL_SGCI_ERROR_NONFATAL] = "sgci-nonfatal-data-parity", > + [XE_GENL_MERT_ERROR_NONFATAL] = 
"mert-nonfatal-data-parity", > + [XE_GENL_SGGI_ERROR_FATAL] = "sggi-fatal-data-parity", > + [XE_GENL_SGLI_ERROR_FATAL] = "sgli-fatal-data-parity", > + [XE_GENL_SGCI_ERROR_FATAL] = "sgci-fatal-data-parity", > + [XE_GENL_MERT_ERROR_FATAL] = "mert-nonfatal-data-parity", > +}; > + > +static const unsigned long xe_hw_error_map[] = { > + [XE_GENL_GT_ERROR_CORRECTABLE_L3_SNG] = XE_HW_ERR_GT_CORR_L3_SNG, > + [XE_GENL_GT_ERROR_CORRECTABLE_GUC] = XE_HW_ERR_GT_CORR_GUC, > + [XE_GENL_GT_ERROR_CORRECTABLE_SAMPLER] = XE_HW_ERR_GT_CORR_SAMPLER, > + [XE_GENL_GT_ERROR_CORRECTABLE_SLM] = XE_HW_ERR_GT_CORR_SLM, > + [XE_GENL_GT_ERROR_CORRECTABLE_EU_IC] = XE_HW_ERR_GT_CORR_EU_IC, > + [XE_GENL_GT_ERROR_CORRECTABLE_EU_GRF] = XE_HW_ERR_GT_CORR_EU_GRF, > + [XE_GENL_GT_ERROR_FATAL_ARR_BIST] = XE_HW_ERR_GT_FATAL_ARR_BIST, > + [XE_GENL_GT_ERROR_FATAL_L3_DOUB] = XE_HW_ERR_GT_FATAL_L3_DOUB, > + [XE_GENL_GT_ERROR_FATAL_L3_ECC_CHK] = XE_HW_ERR_GT_FATAL_L3_ECC_CHK, > + [XE_GENL_GT_ERROR_FATAL_GUC] = XE_HW_ERR_GT_FATAL_GUC, > + [XE_GENL_GT_ERROR_FATAL_IDI_PAR] = XE_HW_ERR_GT_FATAL_IDI_PAR, > + [XE_GENL_GT_ERROR_FATAL_SQIDI] = XE_HW_ERR_GT_FATAL_SQIDI, > + [XE_GENL_GT_ERROR_FATAL_SAMPLER] = XE_HW_ERR_GT_FATAL_SAMPLER, > + [XE_GENL_GT_ERROR_FATAL_SLM] = XE_HW_ERR_GT_FATAL_SLM, > + [XE_GENL_GT_ERROR_FATAL_EU_IC] = XE_HW_ERR_GT_FATAL_EU_IC, > + [XE_GENL_GT_ERROR_FATAL_EU_GRF] = XE_HW_ERR_GT_FATAL_EU_GRF, > + [XE_GENL_GT_ERROR_FATAL_FPU] = XE_HW_ERR_GT_FATAL_FPU, > + [XE_GENL_GT_ERROR_FATAL_TLB] = XE_HW_ERR_GT_FATAL_TLB, > + [XE_GENL_GT_ERROR_FATAL_L3_FABRIC] = XE_HW_ERR_GT_FATAL_L3_FABRIC, > + [XE_GENL_GT_ERROR_CORRECTABLE_SUBSLICE] = XE_HW_ERR_GT_CORR_SUBSLICE, > + [XE_GENL_GT_ERROR_CORRECTABLE_L3BANK] = XE_HW_ERR_GT_CORR_L3BANK, > + [XE_GENL_GT_ERROR_FATAL_SUBSLICE] = XE_HW_ERR_GT_FATAL_SUBSLICE, > + [XE_GENL_GT_ERROR_FATAL_L3BANK] = XE_HW_ERR_GT_FATAL_L3BANK, > + [XE_GENL_SGUNIT_ERROR_CORRECTABLE] = XE_HW_ERR_TILE_CORR_SGUNIT, > + [XE_GENL_SGUNIT_ERROR_NONFATAL] = XE_HW_ERR_TILE_NONFATAL_SGUNIT, > + 
[XE_GENL_SGUNIT_ERROR_FATAL] = XE_HW_ERR_TILE_FATAL_SGUNIT, > + [XE_GENL_SOC_ERROR_NONFATAL_CSC_PSF_CMD] = XE_HW_ERR_SOC_NONFATAL_CSC_PSF_CMD, > + [XE_GENL_SOC_ERROR_NONFATAL_CSC_PSF_CMP] = XE_HW_ERR_SOC_NONFATAL_CSC_PSF_CMP, > + [XE_GENL_SOC_ERROR_NONFATAL_CSC_PSF_REQ] = XE_HW_ERR_SOC_NONFATAL_CSC_PSF_REQ, > + [XE_GENL_SOC_ERROR_NONFATAL_ANR_MDFI] = XE_HW_ERR_SOC_NONFATAL_ANR_MDFI, > + [XE_GENL_SOC_ERROR_NONFATAL_MDFI_T2T] = XE_HW_ERR_SOC_NONFATAL_MDFI_T2T, > + [XE_GENL_SOC_ERROR_NONFATAL_MDFI_T2C] = XE_HW_ERR_SOC_NONFATAL_MDFI_T2C, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 0)] = XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL0, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 1)] = XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL1, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 2)] = XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL2, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 3)] = XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL3, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 4)] = XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL4, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 5)] = XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL5, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 6)] = XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL6, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 7)] = XE_HW_ERR_SOC_NONFATAL_HBM0_CHNL7, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 8)] = XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL0, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 9)] = XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL1, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 10)] = XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL2, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 11)] = XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL3, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 12)] = XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL4, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 13)] = XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL5, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 14)] = XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL6, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(0, 15)] = XE_HW_ERR_SOC_NONFATAL_HBM1_CHNL7, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 0)] = XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL0, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 1)] = XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL1, > + 
[XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 2)] = XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL2, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 3)] = XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL3, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 4)] = XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL4, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 5)] = XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL5, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 6)] = XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL6, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 7)] = XE_HW_ERR_SOC_NONFATAL_HBM2_CHNL7, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 8)] = XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL0, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 9)] = XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL1, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 10)] = XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL2, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 11)] = XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL3, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 12)] = XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL4, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 13)] = XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL5, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 14)] = XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL6, > + [XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 15)] = XE_HW_ERR_SOC_NONFATAL_HBM3_CHNL7, > + [XE_GENL_SOC_ERROR_FATAL_CSC_PSF_CMD] = XE_HW_ERR_SOC_FATAL_CSC_PSF_CMD, > + [XE_GENL_SOC_ERROR_FATAL_CSC_PSF_CMP] = XE_HW_ERR_SOC_FATAL_CSC_PSF_CMP, > + [XE_GENL_SOC_ERROR_FATAL_CSC_PSF_REQ] = XE_HW_ERR_SOC_FATAL_CSC_PSF_REQ, > + [XE_GENL_SOC_ERROR_FATAL_PUNIT] = XE_HW_ERR_SOC_FATAL_PUNIT, > + [XE_GENL_SOC_ERROR_FATAL_PCIE_PSF_CMD] = XE_HW_ERR_SOC_FATAL_PCIE_PSF_CMD, > + [XE_GENL_SOC_ERROR_FATAL_PCIE_PSF_CMP] = XE_HW_ERR_SOC_FATAL_PCIE_PSF_CMP, > + [XE_GENL_SOC_ERROR_FATAL_PCIE_PSF_REQ] = XE_HW_ERR_SOC_FATAL_PCIE_PSF_REQ, > + [XE_GENL_SOC_ERROR_FATAL_ANR_MDFI] = XE_HW_ERR_SOC_FATAL_ANR_MDFI, > + [XE_GENL_SOC_ERROR_FATAL_MDFI_T2T] = XE_HW_ERR_SOC_FATAL_MDFI_T2T, > + [XE_GENL_SOC_ERROR_FATAL_MDFI_T2C] = XE_HW_ERR_SOC_FATAL_MDFI_T2C, > + [XE_GENL_SOC_ERROR_FATAL_PCIE_AER] = XE_HW_ERR_SOC_FATAL_PCIE_AER, > + [XE_GENL_SOC_ERROR_FATAL_PCIE_ERR] = 
XE_HW_ERR_SOC_FATAL_PCIE_ERR, > + [XE_GENL_SOC_ERROR_FATAL_UR_COND] = XE_HW_ERR_SOC_FATAL_UR_COND, > + [XE_GENL_SOC_ERROR_FATAL_SERR_SRCS] = XE_HW_ERR_SOC_FATAL_SERR_SRCS, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 0)] = XE_HW_ERR_SOC_FATAL_HBM0_CHNL0, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 1)] = XE_HW_ERR_SOC_FATAL_HBM0_CHNL1, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 2)] = XE_HW_ERR_SOC_FATAL_HBM0_CHNL2, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 3)] = XE_HW_ERR_SOC_FATAL_HBM0_CHNL3, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 4)] = XE_HW_ERR_SOC_FATAL_HBM0_CHNL4, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 5)] = XE_HW_ERR_SOC_FATAL_HBM0_CHNL5, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 6)] = XE_HW_ERR_SOC_FATAL_HBM0_CHNL6, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 7)] = XE_HW_ERR_SOC_FATAL_HBM0_CHNL7, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 8)] = XE_HW_ERR_SOC_FATAL_HBM1_CHNL0, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 9)] = XE_HW_ERR_SOC_FATAL_HBM1_CHNL1, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 10)] = XE_HW_ERR_SOC_FATAL_HBM1_CHNL2, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 11)] = XE_HW_ERR_SOC_FATAL_HBM1_CHNL3, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 12)] = XE_HW_ERR_SOC_FATAL_HBM1_CHNL4, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 13)] = XE_HW_ERR_SOC_FATAL_HBM1_CHNL5, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 14)] = XE_HW_ERR_SOC_FATAL_HBM1_CHNL6, > + [XE_GENL_SOC_ERROR_FATAL_HBM(0, 15)] = XE_HW_ERR_SOC_FATAL_HBM1_CHNL7, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 0)] = XE_HW_ERR_SOC_FATAL_HBM2_CHNL0, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 1)] = XE_HW_ERR_SOC_FATAL_HBM2_CHNL1, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 2)] = XE_HW_ERR_SOC_FATAL_HBM2_CHNL2, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 3)] = XE_HW_ERR_SOC_FATAL_HBM2_CHNL3, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 4)] = XE_HW_ERR_SOC_FATAL_HBM2_CHNL4, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 5)] = XE_HW_ERR_SOC_FATAL_HBM2_CHNL5, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 6)] = XE_HW_ERR_SOC_FATAL_HBM2_CHNL6, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 7)] = XE_HW_ERR_SOC_FATAL_HBM2_CHNL7, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 8)] = 
XE_HW_ERR_SOC_FATAL_HBM3_CHNL0, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 9)] = XE_HW_ERR_SOC_FATAL_HBM3_CHNL1, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 10)] = XE_HW_ERR_SOC_FATAL_HBM3_CHNL2, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 11)] = XE_HW_ERR_SOC_FATAL_HBM3_CHNL3, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 12)] = XE_HW_ERR_SOC_FATAL_HBM3_CHNL4, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 13)] = XE_HW_ERR_SOC_FATAL_HBM3_CHNL5, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 14)] = XE_HW_ERR_SOC_FATAL_HBM3_CHNL6, > + [XE_GENL_SOC_ERROR_FATAL_HBM(1, 15)] = XE_HW_ERR_SOC_FATAL_HBM3_CHNL7, > + [XE_GENL_GSC_ERROR_CORRECTABLE_SRAM_ECC] = XE_HW_ERR_GSC_CORR_SRAM, > + [XE_GENL_GSC_ERROR_NONFATAL_MIA_SHUTDOWN] = XE_HW_ERR_GSC_NONFATAL_MIA_SHUTDOWN, > + [XE_GENL_GSC_ERROR_NONFATAL_MIA_INTERNAL] = XE_HW_ERR_GSC_NONFATAL_MIA_INTERNAL, > + [XE_GENL_GSC_ERROR_NONFATAL_SRAM_ECC] = XE_HW_ERR_GSC_NONFATAL_SRAM, > + [XE_GENL_GSC_ERROR_NONFATAL_WDG_TIMEOUT] = XE_HW_ERR_GSC_NONFATAL_WDG, > + [XE_GENL_GSC_ERROR_NONFATAL_ROM_PARITY] = XE_HW_ERR_GSC_NONFATAL_ROM_PARITY, > + [XE_GENL_GSC_ERROR_NONFATAL_UCODE_PARITY] = XE_HW_ERR_GSC_NONFATAL_UCODE_PARITY, > + [XE_GENL_GSC_ERROR_NONFATAL_VLT_GLITCH] = XE_HW_ERR_GSC_NONFATAL_VLT_GLITCH, > + [XE_GENL_GSC_ERROR_NONFATAL_FUSE_PULL] = XE_HW_ERR_GSC_NONFATAL_FUSE_PULL, > + [XE_GENL_GSC_ERROR_NONFATAL_FUSE_CRC_CHECK] = XE_HW_ERR_GSC_NONFATAL_FUSE_CRC, > + [XE_GENL_GSC_ERROR_NONFATAL_SELF_MBIST] = XE_HW_ERR_GSC_NONFATAL_SELF_MBIST, > + [XE_GENL_GSC_ERROR_NONFATAL_AON_RF_PARITY] = XE_HW_ERR_GSC_NONFATAL_AON_RF_PARITY, > + [XE_GENL_SGGI_ERROR_NONFATAL] = XE_HW_ERR_TILE_NONFATAL_SGGI, > + [XE_GENL_SGLI_ERROR_NONFATAL] = XE_HW_ERR_TILE_NONFATAL_SGLI, > + [XE_GENL_SGCI_ERROR_NONFATAL] = XE_HW_ERR_TILE_NONFATAL_SGCI, > + [XE_GENL_MERT_ERROR_NONFATAL] = XE_HW_ERR_TILE_NONFATAL_MERT, > + [XE_GENL_SGGI_ERROR_FATAL] = XE_HW_ERR_TILE_FATAL_SGGI, > + [XE_GENL_SGLI_ERROR_FATAL] = XE_HW_ERR_TILE_FATAL_SGLI, > + [XE_GENL_SGCI_ERROR_FATAL] = XE_HW_ERR_TILE_FATAL_SGCI, > + [XE_GENL_MERT_ERROR_FATAL] = 
XE_HW_ERR_TILE_FATAL_MERT, > +}; > + > +static unsigned int config_gt_id(const u64 config) > +{ > + return config >> __XE_PMU_GT_SHIFT; > +} > + > +static u64 config_counter(const u64 config) > { > + return config & ~(~0ULL << __XE_PMU_GT_SHIFT); > +} > + > +static bool is_gt_error(const u64 config) > +{ > + unsigned int error; > + > + error = config_counter(config); > + if (error <= XE_GENL_GT_ERROR_FATAL_FPU) > + return true; > + > + return false; > +} > + > +static bool is_gt_vector_error(const u64 config) > +{ > + unsigned int error; > + > + error = config_counter(config); > + if (error >= XE_GENL_GT_ERROR_FATAL_TLB && > + error <= XE_GENL_GT_ERROR_FATAL_L3BANK) > + return true; > + > + return false; > +} > + > +static bool is_pvc_invalid_gt_errors(const u64 config) > +{ > + switch (config_counter(config)) { > + case XE_GENL_GT_ERROR_CORRECTABLE_L3_SNG: > + case XE_GENL_GT_ERROR_CORRECTABLE_SAMPLER: > + case XE_GENL_GT_ERROR_FATAL_ARR_BIST: > + case XE_GENL_GT_ERROR_FATAL_L3_DOUB: > + case XE_GENL_GT_ERROR_FATAL_L3_ECC_CHK: > + case XE_GENL_GT_ERROR_FATAL_IDI_PAR: > + case XE_GENL_GT_ERROR_FATAL_SQIDI: > + case XE_GENL_GT_ERROR_FATAL_SAMPLER: > + case XE_GENL_GT_ERROR_FATAL_EU_IC: > + return true; > + default: > + return false; > + } > +} > + > +static bool is_gsc_hw_error(const u64 config) > +{ > + if (config_counter(config) >= XE_GENL_GSC_ERROR_CORRECTABLE_SRAM_ECC && > + config_counter(config) <= XE_GENL_GSC_ERROR_NONFATAL_AON_RF_PARITY) > + return true; > + > + return false; > +} > + > +static bool is_soc_error(const u64 config) > +{ > + if (config_counter(config) >= XE_GENL_SOC_ERROR_NONFATAL_CSC_PSF_CMD && > + config_counter(config) <= XE_GENL_SOC_ERROR_FATAL_HBM(1, 15)) > + return true; > + > + return false; > +} > + > +static int > +config_status(struct xe_device *xe, u64 config) > +{ > + unsigned int gt_id = config_gt_id(config); > + struct xe_gt *gt = xe_device_get_gt(xe, gt_id); > + > + if (!IS_DGFX(xe)) > + return -ENODEV; > + > + if (gt->info.type 
== XE_GT_TYPE_UNINITIALIZED) > + return -ENOENT; > + > + /* GSC HW ERRORS are present on root tile of > + * platform supporting MEMORY SPARING only > + */ > + if (is_gsc_hw_error(config) && !(xe->info.platform == XE_PVC && !gt_id)) > + return -ENODEV; > + > + /* GT vectors error are valid on Platforms supporting error vectors only */ > + if (is_gt_vector_error(config) && xe->info.platform != XE_PVC) > + return -ENODEV; > + > + /* Skip gt errors not supported on pvc */ > + if (is_pvc_invalid_gt_errors(config) && xe->info.platform == XE_PVC) > + return -ENODEV; > + > + /* FATAL FPU error is valid on PVC only */ > + if (config_counter(config) == XE_GENL_GT_ERROR_FATAL_FPU && > + !(xe->info.platform == XE_PVC)) > + return -ENODEV; > + > + if (is_soc_error(config) && !(xe->info.platform == XE_PVC)) > + return -ENODEV; > + > + return (config_counter(config) >= > + ARRAY_SIZE(xe_hw_error_map)) ? -ENOENT : 0; > +} > + > +static u64 get_counter_value(struct xe_device *xe, u64 config) > +{ > + const unsigned int gt_id = config_gt_id(config); > + struct xe_gt *gt = xe_device_get_gt(xe, gt_id); > + unsigned int id = config_counter(config); > + > + if (is_gt_error(config) || is_gt_vector_error(config)) > + return xa_to_value(xa_load(&gt->errors.hw_error, xe_hw_error_map[id])); > + > + return xa_to_value(xa_load(&gt->tile->errors.hw_error, xe_hw_error_map[id])); > +} > + > +int fill_error_details(struct xe_device *xe, struct genl_info *info, struct sk_buff *new_msg) Should it be static? 
> +{ > + struct nlattr *entry_attr; > + bool counter = false; > + struct xe_gt *gt; > + int i, j; > + > + BUILD_BUG_ON(ARRAY_SIZE(xe_hw_error_events) != > + ARRAY_SIZE(xe_hw_error_map)); > + > + if (info->genlhdr->cmd == DRM_RAS_CMD_READ_ALL) > + counter = true; > + > + entry_attr = nla_nest_start(new_msg, DRM_RAS_ATTR_QUERY_REPLY); > + if (!entry_attr) > + return -EMSGSIZE; > + > + for_each_gt(gt, xe, j) { > + char str[MAX_ERROR_NAME]; > + u64 val; > + > + for (i = 0; i < ARRAY_SIZE(xe_hw_error_events); i++) { > + u64 config = XE_HW_ERROR(j, i); > + > + if (config_status(xe, config)) > + continue; > + > + /* should this be cleared everytime */ > + snprintf(str, sizeof(str), "error-gt%d-%s", j, xe_hw_error_events[i]); > + > + if (nla_put_string(new_msg, DRM_RAS_ATTR_ERROR_NAME, str)) > + goto err; > + if (nla_put_u64_64bit(new_msg, DRM_RAS_ATTR_ERROR_ID, config, DRM_ATTR_PAD)) > + goto err; > + if (counter) { > + val = get_counter_value(xe, config); > + if (nla_put_u64_64bit(new_msg, DRM_RAS_ATTR_ERROR_VALUE, val, DRM_ATTR_PAD)) > + goto err; > + } > + } > + } > + > + nla_nest_end(new_msg, entry_attr); > + > return 0; > +err: > + drm_dbg_driver(&xe->drm, "msg buff is small\n"); > + nla_nest_cancel(new_msg, entry_attr); > + nlmsg_free(new_msg); > + > + return -EMSGSIZE; > +} > + > +static int xe_genl_list_errors(struct drm_device *drm, struct sk_buff *msg, struct genl_info *info) > +{ > + struct xe_device *xe = to_xe_device(drm); > + size_t msg_size = NLMSG_DEFAULT_SIZE; > + struct sk_buff *new_msg; > + int retries = 2; > + void *usrhdr; > + int ret = 0; > + > + if (!IS_DGFX(xe)) > + return -ENODEV; > + > + do { > + new_msg = drm_genl_alloc_msg(drm, info, msg_size, &usrhdr); > + if (!new_msg) > + return -ENOMEM; > + > + ret = fill_error_details(xe, info, new_msg); > + if (!ret) > + break; > + > + msg_size += NLMSG_DEFAULT_SIZE; > + } while (retries--); > + > + if (!ret) > + ret = drm_genl_reply(new_msg, info, usrhdr); > + > + return ret; > } > > static int 
xe_genl_read_error(struct drm_device *drm, struct sk_buff *msg, struct genl_info *info) > { > - return 0; > + struct xe_device *xe = to_xe_device(drm); > + size_t msg_size = NLMSG_DEFAULT_SIZE; > + struct sk_buff *new_msg; > + void *usrhdr; > + int ret = 0; > + int retries = 2; > + u64 config, val; > + > + config = nla_get_u64(info->attrs[DRM_RAS_ATTR_ERROR_ID]); > + ret = config_status(xe, config); > + if (ret) > + return ret; > + do { > + new_msg = drm_genl_alloc_msg(drm, info, msg_size, &usrhdr); > + if (!new_msg) > + return -ENOMEM; > + > + val = get_counter_value(xe, config); > + if (nla_put_u64_64bit(new_msg, DRM_RAS_ATTR_ERROR_VALUE, val, DRM_ATTR_PAD)) { > + msg_size += NLMSG_DEFAULT_SIZE; > + continue; > + } Here ERROR_ID is provided and ERROR_VALUE is returned, but maybe we could also return ERROR_NAME for the "full picture"? Or do you think the regular flow would be to first list all errors, grep for the name of the required error, and then use its id to get the value, so userspace already has the name? > + > + break; > + } while (retries--); Is it really possible that NLMSG_DEFAULT_SIZE won't be enough for a single counter read? 
Thanks, Tomer > + > + ret = drm_genl_reply(new_msg, info, usrhdr); > + > + return ret; > } > > /* driver callbacks to DRM netlink commands*/ > diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h > index 60cc6418d9a7..dbb3f1afba5f 100644 > --- a/include/uapi/drm/xe_drm.h > +++ b/include/uapi/drm/xe_drm.h > @@ -1087,6 +1087,87 @@ struct drm_xe_vm_madvise { > #define XE_PMU_MEDIA_GROUP_BUSY(gt) ___XE_PMU_OTHER(gt, 3) > #define XE_PMU_ANY_ENGINE_GROUP_BUSY(gt) ___XE_PMU_OTHER(gt, 4) > > +/** > + * DOC: XE GENL netlink event IDs > + * TODO: Add more details > + */ > +#define XE_HW_ERROR(gt, id) \ > + ((id) | ((__u64)(gt) << __XE_PMU_GT_SHIFT)) > + > +#define XE_GENL_GT_ERROR_CORRECTABLE_L3_SNG (0) > +#define XE_GENL_GT_ERROR_CORRECTABLE_GUC (1) > +#define XE_GENL_GT_ERROR_CORRECTABLE_SAMPLER (2) > +#define XE_GENL_GT_ERROR_CORRECTABLE_SLM (3) > +#define XE_GENL_GT_ERROR_CORRECTABLE_EU_IC (4) > +#define XE_GENL_GT_ERROR_CORRECTABLE_EU_GRF (5) > +#define XE_GENL_GT_ERROR_FATAL_ARR_BIST (6) > +#define XE_GENL_GT_ERROR_FATAL_L3_DOUB (7) > +#define XE_GENL_GT_ERROR_FATAL_L3_ECC_CHK (8) > +#define XE_GENL_GT_ERROR_FATAL_GUC (9) > +#define XE_GENL_GT_ERROR_FATAL_IDI_PAR (10) > +#define XE_GENL_GT_ERROR_FATAL_SQIDI (11) > +#define XE_GENL_GT_ERROR_FATAL_SAMPLER (12) > +#define XE_GENL_GT_ERROR_FATAL_SLM (13) > +#define XE_GENL_GT_ERROR_FATAL_EU_IC (14) > +#define XE_GENL_GT_ERROR_FATAL_EU_GRF (15) > +#define XE_GENL_GT_ERROR_FATAL_FPU (16) > +#define XE_GENL_GT_ERROR_FATAL_TLB (17) > +#define XE_GENL_GT_ERROR_FATAL_L3_FABRIC (18) > +#define XE_GENL_GT_ERROR_CORRECTABLE_SUBSLICE (19) > +#define XE_GENL_GT_ERROR_CORRECTABLE_L3BANK (20) > +#define XE_GENL_GT_ERROR_FATAL_SUBSLICE (21) > +#define XE_GENL_GT_ERROR_FATAL_L3BANK (22) > +#define XE_GENL_SGUNIT_ERROR_CORRECTABLE (23) > +#define XE_GENL_SGUNIT_ERROR_NONFATAL (24) > +#define XE_GENL_SGUNIT_ERROR_FATAL (25) > +#define XE_GENL_SOC_ERROR_NONFATAL_CSC_PSF_CMD (26) > +#define 
XE_GENL_SOC_ERROR_NONFATAL_CSC_PSF_CMP (27) > +#define XE_GENL_SOC_ERROR_NONFATAL_CSC_PSF_REQ (28) > +#define XE_GENL_SOC_ERROR_NONFATAL_ANR_MDFI (29) > +#define XE_GENL_SOC_ERROR_NONFATAL_MDFI_T2T (30) > +#define XE_GENL_SOC_ERROR_NONFATAL_MDFI_T2C (31) > +#define XE_GENL_SOC_ERROR_FATAL_CSC_PSF_CMD (32) > +#define XE_GENL_SOC_ERROR_FATAL_CSC_PSF_CMP (33) > +#define XE_GENL_SOC_ERROR_FATAL_CSC_PSF_REQ (34) > +#define XE_GENL_SOC_ERROR_FATAL_PUNIT (35) > +#define XE_GENL_SOC_ERROR_FATAL_PCIE_PSF_CMD (36) > +#define XE_GENL_SOC_ERROR_FATAL_PCIE_PSF_CMP (37) > +#define XE_GENL_SOC_ERROR_FATAL_PCIE_PSF_REQ (38) > +#define XE_GENL_SOC_ERROR_FATAL_ANR_MDFI (39) > +#define XE_GENL_SOC_ERROR_FATAL_MDFI_T2T (40) > +#define XE_GENL_SOC_ERROR_FATAL_MDFI_T2C (41) > +#define XE_GENL_SOC_ERROR_FATAL_PCIE_AER (42) > +#define XE_GENL_SOC_ERROR_FATAL_PCIE_ERR (43) > +#define XE_GENL_SOC_ERROR_FATAL_UR_COND (44) > +#define XE_GENL_SOC_ERROR_FATAL_SERR_SRCS (45) > + > +#define XE_GENL_SOC_ERROR_NONFATAL_HBM(ss, n)\ > + (XE_GENL_SOC_ERROR_FATAL_SERR_SRCS + 0x1 + (ss) * 0x10 + (n)) > +#define XE_GENL_SOC_ERROR_FATAL_HBM(ss, n)\ > + (XE_GENL_SOC_ERROR_NONFATAL_HBM(1, 15) + 0x1 + (ss) * 0x10 + (n)) > + > +/* 109 is the last ID used by SOC errors */ > +#define XE_GENL_GSC_ERROR_CORRECTABLE_SRAM_ECC (110) > +#define XE_GENL_GSC_ERROR_NONFATAL_MIA_SHUTDOWN (111) > +#define XE_GENL_GSC_ERROR_NONFATAL_MIA_INTERNAL (112) > +#define XE_GENL_GSC_ERROR_NONFATAL_SRAM_ECC (113) > +#define XE_GENL_GSC_ERROR_NONFATAL_WDG_TIMEOUT (114) > +#define XE_GENL_GSC_ERROR_NONFATAL_ROM_PARITY (115) > +#define XE_GENL_GSC_ERROR_NONFATAL_UCODE_PARITY (116) > +#define XE_GENL_GSC_ERROR_NONFATAL_VLT_GLITCH (117) > +#define XE_GENL_GSC_ERROR_NONFATAL_FUSE_PULL (118) > +#define XE_GENL_GSC_ERROR_NONFATAL_FUSE_CRC_CHECK (119) > +#define XE_GENL_GSC_ERROR_NONFATAL_SELF_MBIST (120) > +#define XE_GENL_GSC_ERROR_NONFATAL_AON_RF_PARITY (121) > +#define XE_GENL_SGGI_ERROR_NONFATAL (122) > +#define 
XE_GENL_SGLI_ERROR_NONFATAL (123) > +#define XE_GENL_SGCI_ERROR_NONFATAL (124) > +#define XE_GENL_MERT_ERROR_NONFATAL (125) > +#define XE_GENL_SGGI_ERROR_FATAL (126) > +#define XE_GENL_SGLI_ERROR_FATAL (127) > +#define XE_GENL_SGCI_ERROR_FATAL (128) > +#define XE_GENL_MERT_ERROR_FATAL (129) > + > #if defined(__cplusplus) > } > #endif