On Fri, Dec 09, 2022 at 05:54:07PM +0800, Lv Ying wrote:
> The memory uncorrected error which is detected by an external component and
> notified via an IRQ, can be called asynchronization error. If an error is
> detected as a result of user-space process accessing a corrupt memory
> location, the CPU may take an abort. On arm64 this is a
> 'synchronous external abort', and on a firmware first system it is notified
> via NOTIFY_SEA, this can be called synchronization error.
>

"synchronization error" in this context looks weird to me, maybe you mean
"synchronous error"? There are many places using "synchronization", so
please use consistent wording.

> Currently, synchronization error and asynchronization error both use
> memory_failure_queue to schedule memory_failure() exectute in kworker
> context. Commit 7f17b4a121d0 ("ACPI: APEI: Kick the memory_failure() queue
> for synchronous errors") make task_work pending to flush out the queue,
> cancel_work_sync() in memory_failure_queue_kick() will make
> memory_failure() exectute in kworker context first which will get

s/exectute/execute/

> synchronization error info from kfifo, so task_work later will get nothing
> from kfifo which doesn't work as expected. Even worse, synchronization
> error notification has NMI like properties, (it can interrupt IRQ-masked
> code), task_work may get wrong kfifo entry from interrupted
> asynchronization error which is notified by IRQ.
>
> Since the memory_failure() triggered by a synchronous exception is
> executed in the kworker context, the early_kill mode of memory_failure()
> will send wrong si_code by SIGBUS signal: current process is kworker
> thread, the actual user-space process accessing the corrupt memory location
> will be collected by find_early_kill_thread(), and then send SIGBUS with
> BUS_MCEERR_AO si_code to the actual user-space process instead of
> BUS_MCEERR_AR. The machine-manager(kvm) use the si_code: BUS_MCEERR_AO for
> 'action optional' early notifications, and BUS_MCEERR_AR for
> 'action required' synchronous/late notifications.
>
> Make memory_failure() triggered by synchronization errors execute in the
> current context, we do not need workqueue for synchronization error
> anymore, use task_work handle synchronization errors directly. Since,
> synchronization errors and asynchronization errors share the same kfifo,
> use MF_ACTION_REQUIRED flag to distinguish them. And the asynchronization
> error keeps the same as before.
>
> Currently, it's hard to distinguish synchronization error in APEI. It
> can be determined that the SEA report synchronization error, so
> currently only the synchronization error reported by SEA is distinguished
> and handled in current context.
>
> Signed-off-by: Lv Ying <lvying6@xxxxxxxxxx>
> ---
>  drivers/acpi/apei/ghes.c | 20 +++++++++-------
>  mm/memory-failure.c      | 50 +++++++++++++++++++++++++++++-----------
>  2 files changed, 48 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 9952f3a792ba..19d62ec2177f 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -423,8 +423,8 @@ static void ghes_clear_estatus(struct ghes *ghes,
>
>  /*
>   * Called as task_work before returning to user-space.
> - * Ensure any queued work has been done before we return to the context that
> - * triggered the notification.
> + * Ensure any queued corrupt page in synchronous errors has been handled before
> + * we return to the user context that triggered the notification.
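
Just to confirm my reading of the intended flow for a SEA-notified error
with this change, please correct me if I have it wrong:

        ghes_do_proc()
          ghes_handle_memory_failure(gdata, sev, ACPI_HEST_NOTIFY_SEA)
            ghes_do_memory_failure(paddr, MF_ACTION_REQUIRED)
              memory_failure_queue(pfn, MF_ACTION_REQUIRED)
                /* kfifo_put() only, no schedule_work_on() */
        ...
        ghes_kick_task_work()        /* task_work, before returning to user-space */
          memory_failure_queue_kick(cpu)
            __memory_failure_work_func(&mf_cpu->work, true)
              memory_failure(pfn, MF_ACTION_REQUIRED)

So the corrupt page is now handled in the context of the task that took the
SEA, and the kworker path is left to IRQ-notified (asynchronous) errors.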
>   */
>  static void ghes_kick_task_work(struct callback_head *head)
>  {
> @@ -461,7 +461,7 @@ static bool ghes_do_memory_failure(u64 physical_addr, int flags)
>  }
>
>  static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
> -                                       int sev)
> +                                       int sev, int notify_type)
>  {
>          int flags = -1;
>          int sec_sev = ghes_severity(gdata->error_severity);
> @@ -475,7 +475,7 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>              (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
>                  flags = MF_SOFT_OFFLINE;
>          if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
> -                flags = 0;
> +                flags = (notify_type == ACPI_HEST_NOTIFY_SEA) ? MF_ACTION_REQUIRED : 0;
>
>          if (flags != -1)
>                  return ghes_do_memory_failure(mem_err->physical_addr, flags);
> @@ -483,7 +483,8 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
>          return false;
>  }
>
> -static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
> +static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev,
> +                                     int notify_type)
>  {
>          struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
>          bool queued = false;
> @@ -510,7 +511,9 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int s
>                   * and don't filter out 'corrected' error here.
>                   */
>                  if (is_cache && has_pa) {
> -                        queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0);
> +                        queued = ghes_do_memory_failure(err_info->physical_fault_addr,
> +                                                        (notify_type == ACPI_HEST_NOTIFY_SEA) ?
> +                                                        MF_ACTION_REQUIRED : 0);
>                          p += err_info->length;
>                          continue;
>                  }
> @@ -631,6 +634,7 @@ static bool ghes_do_proc(struct ghes *ghes,
>          const guid_t *fru_id = &guid_null;
>          char *fru_text = "";
>          bool queued = false;
> +        int notify_type = ghes->generic->notify.type;
>
>          sev = ghes_severity(estatus->error_severity);
>          apei_estatus_for_each_section(estatus, gdata) {
> @@ -648,13 +652,13 @@ static bool ghes_do_proc(struct ghes *ghes,
>                                  ghes_edac_report_mem_error(sev, mem_err);
>
>                          arch_apei_report_mem_error(sev, mem_err);
> -                        queued = ghes_handle_memory_failure(gdata, sev);
> +                        queued = ghes_handle_memory_failure(gdata, sev, notify_type);
>                  }
>                  else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
>                          ghes_handle_aer(gdata);
>                  }
>                  else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
> -                        queued = ghes_handle_arm_hw_error(gdata, sev);
> +                        queued = ghes_handle_arm_hw_error(gdata, sev, notify_type);
>                  } else {
>                          void *err = acpi_hest_get_payload(gdata);
>
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index bead6bccc7f2..82238ec86acd 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -2204,7 +2204,11 @@ struct memory_failure_cpu {
>  static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
>
>  /**
> - * memory_failure_queue - Schedule handling memory failure of a page.
> + * memory_failure_queue
> + * - Schedule handling memory failure of a page for asynchronous error, memory
> + *   failure page will be executed in kworker thread
> + * - put corrupt memory info into kfifo for synchronous error, task_work will
> + *   handle them before returning to the user

I think the top description of a kernel-doc function comment needs to be
brief, so could you move the above 2 items down into the details?

Maybe the first line can be updated like below (scheduling is done
conditionally with your change):

/**
 * memory_failure_queue - Queue memory failure event
 * @pfn: Page Number of the corrupted page
 * @flags: Flags for memory failure handling
 *
 * ... (full details)

And maybe the existing comment in "full details" is obsolete since commit
7f17b4a121d0 ("ACPI: APEI: Kick the memory_failure() queue for synchronous
errors"), so could you update the whole description to explain the new
behavior, with some background information, as done in the patch description?
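
Something along these lines, maybe (just a rough sketch based on the patch
description and the existing comment, exact wording is up to you):

/**
 * memory_failure_queue - Queue memory failure event
 * @pfn: Page Number of the corrupted page
 * @flags: Flags for memory failure handling
 *
 * For asynchronous errors (e.g. reported by a background scrubber or by an
 * external component via IRQ), the event is put on a per-cpu kfifo and a
 * kworker is scheduled to handle it via memory_failure_work_func().
 *
 * For synchronous errors (MF_ACTION_REQUIRED set, e.g. notified via SEA on
 * arm64), the event is only put on the kfifo and is consumed before
 * returning to user-space, in the context of the task that triggered the
 * error, via memory_failure_queue_kick().
 *
 * Can run in IRQ context.
 */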
>   * @pfn: Page Number of the corrupted page
>   * @flags: Flags for memory failure handling
>   *
> @@ -2217,6 +2221,11 @@ static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
>   * happen outside the current execution context (e.g. when
>   * detected by a background scrubber)
>   *
> + * This function can also be used in synchronous errors which was detected as a

"... errors which was ..." seems mismatched in plurality.

> + * result of user-space accessing a corrupt memory location, just put memory

s/corrupt/corrupted/

> + * error info into kfifo, and then, task_work get and handle it in current
> + * execution context instead of scheduling kworker to handle it

Please put a period at the end of the sentence. A kernel-doc comment is
converted into auto-generated documentation, so it needs to read like
natural English text. See
https://docs.kernel.org/doc-guide/kernel-doc.html#function-documentation

> + *
>   * Can run in IRQ context.
>   */
> @@ -2230,9 +2239,10 @@ void memory_failure_queue(unsigned long pfn, int flags)
>
>          mf_cpu = &get_cpu_var(memory_failure_cpu);
>          spin_lock_irqsave(&mf_cpu->lock, proc_flags);
> -        if (kfifo_put(&mf_cpu->fifo, entry))
> -                schedule_work_on(smp_processor_id(), &mf_cpu->work);
> -        else
> +        if (kfifo_put(&mf_cpu->fifo, entry)) {
> +                if (!(entry.flags & MF_ACTION_REQUIRED))
> +                        schedule_work_on(smp_processor_id(), &mf_cpu->work);
> +        } else
>                  pr_err("buffer overflow when queuing memory failure at %#lx\n",
>                         pfn);
>          spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
> @@ -2240,12 +2250,15 @@ void memory_failure_queue(unsigned long pfn, int flags)
>  }
>  EXPORT_SYMBOL_GPL(memory_failure_queue);
>
> -static void memory_failure_work_func(struct work_struct *work)
> +/*
> + * (a)synchronous error info should be consumed by the corresponding handler
> + */
> +static void __memory_failure_work_func(struct work_struct *work, bool sync)
>  {
>          struct memory_failure_cpu *mf_cpu;
>          struct memory_failure_entry entry = { 0, };
>          unsigned long proc_flags;
> -        int gotten;
> +        int gotten, ret;
>
>          mf_cpu = container_of(work, struct memory_failure_cpu, work);
>          for (;;) {
> @@ -2256,22 +2269,31 @@ static void memory_failure_work_func(struct work_struct *work)
>                          break;
>                  if (entry.flags & MF_SOFT_OFFLINE)
>                          soft_offline_page(entry.pfn, entry.flags);
> -                else
> -                        memory_failure(entry.pfn, entry.flags);
> +                else {
> +                        if (sync && (entry.flags & MF_ACTION_REQUIRED)) {
> +                                ret = memory_failure(entry.pfn, entry.flags);
> +                                if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
> +                                        return;
> +
> +                                pr_err("Memory error not recovered");
> +                                force_sig(SIGBUS);
> +                        } else if (!sync && !(entry.flags & MF_ACTION_REQUIRED))
> +                                memory_failure(entry.pfn, entry.flags);

So if sync is true and MF_ACTION_REQUIRED is not set, memory_failure() is
not called. Does that break something?
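
As far as I can see, if an IRQ-notified (asynchronous) error is already
sitting in the kfifo when a SEA-notified one is queued, the task_work drain
pops both entries, handles only the MF_ACTION_REQUIRED one, and silently
drops the other, while the kworker that was scheduled for it later finds the
fifo empty. If that is not intended, maybe the drain loop should simply
handle whatever it pops (which is roughly what memory_failure_queue_kick()
did before this patch via cancel_work_sync() + memory_failure_work_func()).
Something like this, perhaps (untested sketch, error handling details aside):

        if (entry.flags & MF_SOFT_OFFLINE) {
                soft_offline_page(entry.pfn, entry.flags);
        } else if (sync && (entry.flags & MF_ACTION_REQUIRED)) {
                /* synchronous error, handle it in the faulting task's context */
                ret = memory_failure(entry.pfn, entry.flags);
                if (ret && ret != -EHWPOISON && ret != -EOPNOTSUPP) {
                        pr_err("Memory error not recovered");
                        force_sig(SIGBUS);
                }
        } else {
                /* asynchronous error, or an entry left over from the other path */
                memory_failure(entry.pfn, entry.flags);
        }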
Thanks,
Naoya Horiguchi

> +                }
>          }
>  }
>
> -/*
> - * Process memory_failure work queued on the specified CPU.
> - * Used to avoid return-to-userspace racing with the memory_failure workqueue.
> - */
> +static void memory_failure_work_func(struct work_struct *work)
> +{
> +        __memory_failure_work_func(work, false);
> +}
> +
>  void memory_failure_queue_kick(int cpu)
>  {
>          struct memory_failure_cpu *mf_cpu;
>
>          mf_cpu = &per_cpu(memory_failure_cpu, cpu);
> -        cancel_work_sync(&mf_cpu->work);
> -        memory_failure_work_func(&mf_cpu->work);
> +        __memory_failure_work_func(&mf_cpu->work, true);
>  }
>
>  static int __init memory_failure_init(void)
> --
> 2.36.1