Patch "EDAC/skx_common: Differentiate memory error sources" has been added to the 6.6-stable tree

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is a note to let you know that I've just added the patch titled

    EDAC/skx_common: Differentiate memory error sources

to the 6.6-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     edac-skx_common-differentiate-memory-error-sources.patch
and it can be found in the queue-6.6 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit 609c888fcfbab94ec1049072b7d48c5dfb5013a1
Author: Qiuxu Zhuo <qiuxu.zhuo@xxxxxxxxx>
Date:   Tue Oct 15 15:22:35 2024 +0800

    EDAC/skx_common: Differentiate memory error sources
    
    [ Upstream commit 2397f795735219caa9c2fe61e7bcdd0652e670d3 ]
    
    The current skx_common determines whether the memory error source is the
    near memory of the 2LM system and then retrieves the decoded error results
    from the ADXL components (near-memory vs. far-memory) accordingly.
    
    However, some memory controllers may have limitations in correctly
    reporting the memory error source, leading to the retrieval of incorrect
    decoded parts from the ADXL.
    
    To address these limitations, instead of simply determining whether the
    memory error is from the near memory of the 2LM system, it is necessary to
    distinguish the memory error source details as follows:
    
      Memory error from the near memory of the 2LM system.
      Memory error from the far memory of the 2LM system.
      Memory error from the 1LM system.
      Not a memory error.
    
    This will enable the i10nm_edac driver to take appropriate actions for
    those memory controllers that have limitations in reporting the memory
    error source.
    
    Fixes: ba987eaaabf9 ("EDAC/i10nm: Add Intel Granite Rapids server support")
    Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@xxxxxxxxx>
    Signed-off-by: Tony Luck <tony.luck@xxxxxxxxx>
    Tested-by: Diego Garcia Rodriguez <diego.garcia.rodriguez@xxxxxxxxx>
    Link: https://lore.kernel.org/r/20241015072236.24543-2-qiuxu.zhuo@xxxxxxxxx
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 8d18099fd528c..42266120ef427 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -119,7 +119,7 @@ void skx_adxl_put(void)
 }
 EXPORT_SYMBOL_GPL(skx_adxl_put);
 
-static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem)
+static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src)
 {
 	struct skx_dev *d;
 	int i, len = 0;
@@ -136,7 +136,7 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me
 	}
 
 	res->socket  = (int)adxl_values[component_indices[INDEX_SOCKET]];
-	if (error_in_1st_level_mem) {
+	if (err_src == ERR_SRC_2LM_NM) {
 		res->imc     = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ?
 			       (int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1;
 		res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ?
@@ -620,31 +620,27 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 			     optype, skx_msg);
 }
 
-static bool skx_error_in_1st_level_mem(const struct mce *m)
+static enum error_source skx_error_source(const struct mce *m)
 {
-	u32 errcode;
+	u32 errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
 
-	if (!skx_mem_cfg_2lm)
-		return false;
-
-	errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
-
-	return errcode == MCACOD_EXT_MEM_ERR;
-}
+	if (errcode != MCACOD_MEM_CTL_ERR && errcode != MCACOD_EXT_MEM_ERR)
+		return ERR_SRC_NOT_MEMORY;
 
-static bool skx_error_in_mem(const struct mce *m)
-{
-	u32 errcode;
+	if (!skx_mem_cfg_2lm)
+		return ERR_SRC_1LM;
 
-	errcode = GET_BITFIELD(m->status, 0, 15) & MCACOD_MEM_ERR_MASK;
+	if (errcode == MCACOD_EXT_MEM_ERR)
+		return ERR_SRC_2LM_NM;
 
-	return (errcode == MCACOD_MEM_CTL_ERR || errcode == MCACOD_EXT_MEM_ERR);
+	return ERR_SRC_2LM_FM;
 }
 
 int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 			void *data)
 {
 	struct mce *mce = (struct mce *)data;
+	enum error_source err_src;
 	struct decoded_addr res;
 	struct mem_ctl_info *mci;
 	char *type;
@@ -652,8 +648,10 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 	if (mce->kflags & MCE_HANDLED_CEC)
 		return NOTIFY_DONE;
 
+	err_src = skx_error_source(mce);
+
 	/* Ignore unless this is memory related with an address */
-	if (!skx_error_in_mem(mce) || !(mce->status & MCI_STATUS_ADDRV))
+	if (err_src == ERR_SRC_NOT_MEMORY || !(mce->status & MCI_STATUS_ADDRV))
 		return NOTIFY_DONE;
 
 	memset(&res, 0, sizeof(res));
@@ -667,7 +665,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 	/* Try driver decoder first */
 	if (!(driver_decode && driver_decode(&res))) {
 		/* Then try firmware decoder (ACPI DSM methods) */
-		if (!(adxl_component_count && skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce))))
+		if (!(adxl_component_count && skx_adxl_decode(&res, err_src)))
 			return NOTIFY_DONE;
 	}
 
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 11faf1db4fa48..30a795d8b8d36 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -147,6 +147,13 @@ enum {
 	INDEX_MAX
 };
 
+enum error_source {
+	ERR_SRC_1LM,
+	ERR_SRC_2LM_NM,
+	ERR_SRC_2LM_FM,
+	ERR_SRC_NOT_MEMORY,
+};
+
 #define BIT_NM_MEMCTRL	BIT_ULL(INDEX_NM_MEMCTRL)
 #define BIT_NM_CHANNEL	BIT_ULL(INDEX_NM_CHANNEL)
 #define BIT_NM_DIMM	BIT_ULL(INDEX_NM_DIMM)




[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux