Patch "ACPI: APEI: set memory failure flags as MF_ACTION_REQUIRED on synchronous events" has been added to the 6.1-stable tree

Sasha Levin <sashal@xxxxxxxxxx> · Thu, 1 Feb 2024 12:20:54 -0500

This is a note to let you know that I've just added the patch titled

    ACPI: APEI: set memory failure flags as MF_ACTION_REQUIRED on synchronous events

to the 6.1-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     acpi-apei-set-memory-failure-flags-as-mf_action_requ.patch
and it can be found in the queue-6.1 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit d92e7c45d6c89799fe5d58289fd98d1ced180d4d
Author: Shuai Xue <xueshuai@xxxxxxxxxxxxxxxxx>
Date:   Mon Dec 18 14:45:18 2023 +0800

    ACPI: APEI: set memory failure flags as MF_ACTION_REQUIRED on synchronous events
    
    [ Upstream commit a70297d2213253853e95f5b49651f924990c6d3b ]
    
    There are two major types of uncorrected recoverable (UCR) errors :
    
     - Synchronous error: The error is detected and raised at the point of
       the consumption in the execution flow, e.g. when a CPU tries to
       access a poisoned cache line. The CPU will take a synchronous error
       exception such as Synchronous External Abort (SEA) on Arm64 and
       Machine Check Exception (MCE) on X86. OS requires to take action (for
       example, offline failure page/kill failure thread) to recover this
       uncorrectable error.
    
     - Asynchronous error: The error is detected out of processor execution
       context, e.g. when an error is detected by a background scrubber.
       Some data in the memory are corrupted. But the data have not been
       consumed. OS is optional to take action to recover this uncorrectable
       error.
    
    When APEI firmware first is enabled, a platform may describe one error
    source for the handling of synchronous errors (e.g. MCE or SEA notification
    ), or for handling asynchronous errors (e.g. SCI or External Interrupt
    notification). In other words, we can distinguish synchronous errors by
    APEI notification. For synchronous errors, kernel will kill the current
    process which accessing the poisoned page by sending SIGBUS with
    BUS_MCEERR_AR. In addition, for asynchronous errors, kernel will notify the
    process who owns the poisoned page by sending SIGBUS with BUS_MCEERR_AO in
    early kill mode. However, the GHES driver always sets mf_flags to 0 so that
    all synchronous errors are handled as asynchronous errors in memory failure.
    
    To this end, set memory failure flags as MF_ACTION_REQUIRED on synchronous
    events.
    
    Signed-off-by: Shuai Xue <xueshuai@xxxxxxxxxxxxxxxxx>
    Tested-by: Ma Wupeng <mawupeng1@xxxxxxxxxx>
    Reviewed-by: Kefeng Wang <wangkefeng.wang@xxxxxxxxxx>
    Reviewed-by: Xiaofei Tan <tanxiaofei@xxxxxxxxxx>
    Reviewed-by: Baolin Wang <baolin.wang@xxxxxxxxxxxxxxxxx>
    Reviewed-by: James Morse <james.morse@xxxxxxx>
    Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@xxxxxxxxx>
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 9952f3a792ba..dd808cf65c84 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -99,6 +99,20 @@ static inline bool is_hest_type_generic_v2(struct ghes *ghes)
 	return ghes->generic->header.type == ACPI_HEST_TYPE_GENERIC_ERROR_V2;
 }
 
+/*
+ * A platform may describe one error source for the handling of synchronous
+ * errors (e.g. MCE or SEA), or for handling asynchronous errors (e.g. SCI
+ * or External Interrupt). On x86, the HEST notifications are always
+ * asynchronous, so only SEA on ARM is delivered as a synchronous
+ * notification.
+ */
+static inline bool is_hest_sync_notify(struct ghes *ghes)
+{
+	u8 notify_type = ghes->generic->notify.type;
+
+	return notify_type == ACPI_HEST_NOTIFY_SEA;
+}
+
 /*
  * This driver isn't really modular, however for the time being,
  * continuing to use module_param is the easiest way to remain
@@ -461,7 +475,7 @@ static bool ghes_do_memory_failure(u64 physical_addr, int flags)
 }
 
 static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
-				       int sev)
+				       int sev, bool sync)
 {
 	int flags = -1;
 	int sec_sev = ghes_severity(gdata->error_severity);
@@ -475,7 +489,7 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
 	    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
 		flags = MF_SOFT_OFFLINE;
 	if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
-		flags = 0;
+		flags = sync ? MF_ACTION_REQUIRED : 0;
 
 	if (flags != -1)
 		return ghes_do_memory_failure(mem_err->physical_addr, flags);
@@ -483,9 +497,11 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
 	return false;
 }
 
-static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
+static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
+				       int sev, bool sync)
 {
 	struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
+	int flags = sync ? MF_ACTION_REQUIRED : 0;
 	bool queued = false;
 	int sec_sev, i;
 	char *p;
@@ -510,7 +526,7 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int s
 		 * and don't filter out 'corrected' error here.
 		 */
 		if (is_cache && has_pa) {
-			queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0);
+			queued = ghes_do_memory_failure(err_info->physical_fault_addr, flags);
 			p += err_info->length;
 			continue;
 		}
@@ -631,6 +647,7 @@ static bool ghes_do_proc(struct ghes *ghes,
 	const guid_t *fru_id = &guid_null;
 	char *fru_text = "";
 	bool queued = false;
+	bool sync = is_hest_sync_notify(ghes);
 
 	sev = ghes_severity(estatus->error_severity);
 	apei_estatus_for_each_section(estatus, gdata) {
@@ -648,13 +665,13 @@ static bool ghes_do_proc(struct ghes *ghes,
 			ghes_edac_report_mem_error(sev, mem_err);
 
 			arch_apei_report_mem_error(sev, mem_err);
-			queued = ghes_handle_memory_failure(gdata, sev);
+			queued = ghes_handle_memory_failure(gdata, sev, sync);
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
 			ghes_handle_aer(gdata);
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
-			queued = ghes_handle_arm_hw_error(gdata, sev);
+			queued = ghes_handle_arm_hw_error(gdata, sev, sync);
 		} else {
 			void *err = acpi_hest_get_payload(gdata);