On the Arm64 platform, errors can be signaled by an asynchronous interrupt, e.g. when an error is detected by a background scrubber, or by a synchronous exception, e.g. when an uncorrected error is consumed. Both synchronous and asynchronous errors are queued and handled by a dedicated kthread in a workqueue. Commit 7f17b4a121d0 ("ACPI: APEI: Kick the memory_failure() queue for synchronous errors") keeps track of whether memory_failure() work was queued, and makes task_work pending to flush out the workqueue so that the work for a synchronous error is processed before returning to user-space. The trick ensures that the corrupted page is unmapped and poisoned. After returning to user-space, the task restarts at the current instruction, which triggers a page fault, and the kernel sends SIGBUS due to VM_FAULT_HWPOISON. Although the task can be killed by the page fault, the memory failure is handled in a kthread context, so the hwpoison-aware mechanisms, e.g. PF_MCE_EARLY and early kill, do not work as expected. To this end, separate synchronous and asynchronous error handling into different paths, as X86 does: - task work for synchronous errors. - and workqueue for asynchronous errors. Signed-off-by: Shuai Xue <xueshuai@xxxxxxxxxxxxxxxxx> --- drivers/acpi/apei/ghes.c | 118 ++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 52 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index a420759fce2d..f13c298f47e6 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -421,46 +421,80 @@ static void ghes_clear_estatus(struct ghes *ghes, ghes_ack_error(ghes->generic_v2); } -/* - * Called as task_work before returning to user-space. - * Ensure any queued work has been done before we return to the context that - * triggered the notification. 
+/** + * struct mce_task_work - for synchronous RAS event + * + * @twork: callback_head for task work + * @pfn: page frame number of corrupted page + * @flags: fine tune action taken + * + * Structure to pass task work to be handled before + * returning to userspace via task_work_add(). */ -static void ghes_kick_task_work(struct callback_head *head) +struct mce_task_work { + struct callback_head twork; + u64 pfn; + int flags; +}; + +static void memory_failure_cb(struct callback_head *twork) { - struct acpi_hest_generic_status *estatus; - struct ghes_estatus_node *estatus_node; - u32 node_len; + int ret; + struct mce_task_work *twcb = + container_of(twork, struct mce_task_work, twork); + ret = memory_failure(twcb->pfn, twcb->flags); + kfree(twcb); - estatus_node = container_of(head, struct ghes_estatus_node, task_work); - if (IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE)) - memory_failure_queue_kick(estatus_node->task_work_cpu); + if (!ret) + return; + /* + * -EHWPOISON from memory_failure() means that it already sent SIGBUS + * to the current process with the proper error info, + * -EOPNOTSUPP means hwpoison_filter() filtered the error event, + * + * In both cases, no further processing is required. 
+ */ + if (ret == -EHWPOISON || ret == -EOPNOTSUPP) + return; - estatus = GHES_ESTATUS_FROM_NODE(estatus_node); - node_len = GHES_ESTATUS_NODE_LEN(cper_estatus_len(estatus)); - gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len); + pr_err("Memory error not recovered"); + force_sig(SIGBUS); } -static bool ghes_do_memory_failure(u64 physical_addr, int flags) +static void ghes_do_memory_failure(u64 physical_addr, int flags) { unsigned long pfn; + struct mce_task_work *twcb; if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE)) - return false; + return; pfn = PHYS_PFN(physical_addr); if (!pfn_valid(pfn) && !arch_is_platform_page(physical_addr)) { pr_warn_ratelimited(FW_WARN GHES_PFX "Invalid address in generic error data: %#llx\n", physical_addr); - return false; + return; } - memory_failure_queue(pfn, flags); - return true; + if (flags == MF_ACTION_REQUIRED && current->mm) { + twcb = kmalloc(sizeof(*twcb), GFP_ATOMIC); + if (!twcb) + return; + + twcb->pfn = pfn; + twcb->flags = flags; + init_task_work(&twcb->twork, memory_failure_cb); + task_work_add(current, &twcb->twork, TWA_RESUME); + return; + } else { + memory_failure_queue(pfn, flags); + } + + return; } -static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, +static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev) { int flags = -1; @@ -468,7 +502,7 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) - return false; + return; /* iff following two events can be handled properly by now */ if (sec_sev == GHES_SEV_CORRECTED && @@ -478,15 +512,12 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, flags = (gdata->flags & CPER_SEC_SYNC) ? 
MF_ACTION_REQUIRED : 0; if (flags != -1) - return ghes_do_memory_failure(mem_err->physical_addr, flags); - - return false; + ghes_do_memory_failure(mem_err->physical_addr, flags); } -static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev) +static void ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev) { struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata); - bool queued = false; int sec_sev, i; char *p; @@ -494,7 +525,7 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int s sec_sev = ghes_severity(gdata->error_severity); if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE) - return false; + return; p = (char *)(err + 1); for (i = 0; i < err->err_info_num; i++) { @@ -510,7 +541,7 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int s * and don't filter out 'corrected' error here. */ if (is_cache && has_pa) { - queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0); + ghes_do_memory_failure(err_info->physical_fault_addr, 0); p += err_info->length; continue; } @@ -524,7 +555,7 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int s p += err_info->length; } - return queued; + return; } /* @@ -622,7 +653,7 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata, schedule_work(&entry->work); } -static bool ghes_do_proc(struct ghes *ghes, +static void ghes_do_proc(struct ghes *ghes, const struct acpi_hest_generic_status *estatus) { int sev, sec_sev; @@ -630,7 +661,6 @@ static bool ghes_do_proc(struct ghes *ghes, guid_t *sec_type; const guid_t *fru_id = &guid_null; char *fru_text = ""; - bool queued = false; sev = ghes_severity(estatus->error_severity); apei_estatus_for_each_section(estatus, gdata) { @@ -648,13 +678,13 @@ static bool ghes_do_proc(struct ghes *ghes, ghes_edac_report_mem_error(sev, mem_err); arch_apei_report_mem_error(sev, mem_err); - queued = 
ghes_handle_memory_failure(gdata, sev); + ghes_handle_memory_failure(gdata, sev); } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) { ghes_handle_aer(gdata); } else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { - queued = ghes_handle_arm_hw_error(gdata, sev); + ghes_handle_arm_hw_error(gdata, sev); } else { void *err = acpi_hest_get_payload(gdata); @@ -664,8 +694,6 @@ static bool ghes_do_proc(struct ghes *ghes, gdata->error_data_length); } } - - return queued; } static void __ghes_print_estatus(const char *pfx, @@ -961,9 +989,7 @@ static void ghes_proc_in_irq(struct irq_work *irq_work) struct ghes_estatus_node *estatus_node; struct acpi_hest_generic *generic; struct acpi_hest_generic_status *estatus; - bool task_work_pending; u32 len, node_len; - int ret; llnode = llist_del_all(&ghes_estatus_llist); /* @@ -978,26 +1004,15 @@ static void ghes_proc_in_irq(struct irq_work *irq_work) estatus = GHES_ESTATUS_FROM_NODE(estatus_node); len = cper_estatus_len(estatus); node_len = GHES_ESTATUS_NODE_LEN(len); - task_work_pending = ghes_do_proc(estatus_node->ghes, estatus); + ghes_do_proc(estatus_node->ghes, estatus); if (!ghes_estatus_cached(estatus)) { generic = estatus_node->generic; if (ghes_print_estatus(NULL, generic, estatus)) ghes_estatus_cache_add(generic, estatus); } - if (task_work_pending && current->mm) { - estatus_node->task_work.func = ghes_kick_task_work; - estatus_node->task_work_cpu = smp_processor_id(); - ret = task_work_add(current, &estatus_node->task_work, - TWA_RESUME); - if (ret) - estatus_node->task_work.func = NULL; - } - - if (!estatus_node->task_work.func) - gen_pool_free(ghes_estatus_pool, - (unsigned long)estatus_node, node_len); - + gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, + node_len); llnode = next; } } @@ -1057,7 +1072,6 @@ static int ghes_in_nmi_queue_one_entry(struct ghes *ghes, estatus_node->ghes = ghes; estatus_node->generic = ghes->generic; - estatus_node->task_work.func = NULL; estatus = 
GHES_ESTATUS_FROM_NODE(estatus_node); if (__ghes_read_estatus(estatus, buf_paddr, fixmap_idx, len)) { -- 2.20.1.12.g72788fdb