RE: [PATCH v9 1/2] ACPI / APEI: Add support to notify the vendor specific HW errors

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Bjorn,

Thanks for reviewing.

I will make changes for your suggestions in both patches.

Regards,
Shiju  

>-----Original Message-----
>From: Bjorn Helgaas [mailto:helgaas@xxxxxxxxxx]
>Sent: 16 June 2020 23:57
>To: Shiju Jose <shiju.jose@xxxxxxxxxx>
>Cc: linux-acpi@xxxxxxxxxxxxxxx; linux-pci@xxxxxxxxxxxxxxx; linux-
>kernel@xxxxxxxxxxxxxxx; rjw@xxxxxxxxxxxxx; bp@xxxxxxxxx;
>james.morse@xxxxxxx; lenb@xxxxxxxxxx; tony.luck@xxxxxxxxx;
>dan.carpenter@xxxxxxxxxx; zhangliguang@xxxxxxxxxxxxxxxxx;
>andriy.shevchenko@xxxxxxxxxxxxxxx; Wangkefeng (OS Kernel Lab)
><wangkefeng.wang@xxxxxxxxxx>; jroedel@xxxxxxx; yangyicong
><yangyicong@xxxxxxxxxx>; Jonathan Cameron
><jonathan.cameron@xxxxxxxxxx>; tanxiaofei <tanxiaofei@xxxxxxxxxx>
>Subject: Re: [PATCH v9 1/2] ACPI / APEI: Add support to notify the vendor
>specific HW errors
>
>On Mon, Jun 15, 2020 at 10:53:11AM +0100, Shiju Jose wrote:
>> Add support to notify the vendor specific non-fatal HW errors to the
>> drivers for the error recovery.
>
>This doesn't actually say anything about what this patch does.  Sure, it "adds
>support," but it doesn't say anything about how that support works or how
>to use it.
>
>This should say something about a FIFO and it should mention that an event
>handler registered with ghes_register_event_notifier() will be called with
>each vendor-specific error record.
>
>> Signed-off-by: Shiju Jose <shiju.jose@xxxxxxxxxx>
>> ---
>>  drivers/acpi/apei/ghes.c | 130
>++++++++++++++++++++++++++++++++++++++-
>>  include/acpi/ghes.h      |  28 +++++++++
>>  2 files changed, 157 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index
>> 24c9642e8fc7..854d8115cdfc 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -33,6 +33,7 @@
>>  #include <linux/irq_work.h>
>>  #include <linux/llist.h>
>>  #include <linux/genalloc.h>
>> +#include <linux/kfifo.h>
>>  #include <linux/pci.h>
>>  #include <linux/pfn.h>
>>  #include <linux/aer.h>
>> @@ -63,6 +64,11 @@
>>  #define GHES_ESTATUS_CACHES_SIZE	4
>>
>>  #define GHES_ESTATUS_IN_CACHE_MAX_NSEC	10000000000ULL
>> +
>> +#define GHES_EVENT_RING_SIZE	256
>> +#define GHES_GDATA_POOL_MIN_ALLOC_ORDER	3
>> +#define GHES_GDATA_POOL_MIN_SIZE	65536
>
>Don't drop these new #defines right in the middle of the GHES_ESTATUS
>block.  The ESTATUS ones are all related, and these new ones are something
>separate.
>
>These new names should be related somehow.  The names don't make it clear
>that GHES_EVENT and GHES_GDATA are related.  IIUC GHES_GDATA is space
>for storing GHES structures, and GHES_EVENT is a FIFO of struct
>ghes_event_entry, each of which points to one of those GHES structures.
>
>>  /* Prevent too many caches are allocated because of RCU */
>>  #define GHES_ESTATUS_CACHE_ALLOCED_MAX
>	(GHES_ESTATUS_CACHES_SIZE * 3 / 2)
>>
>> @@ -122,6 +128,19 @@ static DEFINE_MUTEX(ghes_list_mutex);
>>   */
>>  static DEFINE_SPINLOCK(ghes_notify_lock_irq);
>>
>> +struct ghes_event_entry {
>> +	struct acpi_hest_generic_data *gdata;
>> +	int error_severity;
>> +};
>> +
>> +static DEFINE_KFIFO(ghes_event_ring, struct ghes_event_entry,
>> +		    GHES_EVENT_RING_SIZE);
>> +
>> +static DEFINE_SPINLOCK(ghes_event_ring_lock);
>> +
>> +static struct gen_pool *ghes_gdata_pool; static unsigned long
>> +ghes_gdata_pool_size_request;
>> +
>>  static struct gen_pool *ghes_estatus_pool;  static unsigned long
>> ghes_estatus_pool_size_request;
>>
>> @@ -188,6 +207,40 @@ int ghes_estatus_pool_init(int num_ghes)
>>  	return -ENOMEM;
>>  }
>>
>> +static int ghes_gdata_pool_init(void) {
>> +	unsigned long addr, len;
>> +	int rc;
>> +
>> +	ghes_gdata_pool =
>gen_pool_create(GHES_GDATA_POOL_MIN_ALLOC_ORDER, -1);
>> +	if (!ghes_gdata_pool)
>> +		return -ENOMEM;
>> +
>> +	if (ghes_gdata_pool_size_request < GHES_GDATA_POOL_MIN_SIZE)
>> +		ghes_gdata_pool_size_request =
>GHES_GDATA_POOL_MIN_SIZE;
>> +
>> +	len = ghes_gdata_pool_size_request;
>> +	addr = (unsigned long)vmalloc(PAGE_ALIGN(len));
>> +	if (!addr)
>> +		goto err_pool_alloc;
>> +
>> +	vmalloc_sync_mappings();
>> +
>> +	rc = gen_pool_add(ghes_gdata_pool, addr, PAGE_ALIGN(len), -1);
>> +	if (rc)
>> +		goto err_pool_add;
>> +
>> +	return 0;
>> +
>> +err_pool_add:
>> +	vfree((void *)addr);
>> +
>> +err_pool_alloc:
>> +	gen_pool_destroy(ghes_gdata_pool);
>> +
>> +	return -ENOMEM;
>> +}
>> +
>>  static int map_gen_v2(struct ghes *ghes)  {
>>  	return
>> apei_map_generic_address(&ghes->generic_v2->read_ack_register);
>> @@ -247,6 +300,10 @@ static struct ghes *ghes_new(struct
>acpi_hest_generic *generic)
>>  		goto err_unmap_status_addr;
>>  	}
>>
>> +	ghes_gdata_pool_size_request += generic->records_to_preallocate *
>> +					generic->max_sections_per_record *
>> +					generic->max_raw_data_length;
>> +
>>  	return ghes;
>>
>>  err_unmap_status_addr:
>> @@ -490,6 +547,68 @@ static void ghes_handle_aer(struct
>> acpi_hest_generic_data *gdata)  #endif  }
>>
>> +static BLOCKING_NOTIFIER_HEAD(ghes_event_notify_list);
>> +
>> +/**
>> + * ghes_register_event_notifier - register an event notifier
>> + * for the non-fatal HW errors.
>> + * @nb: pointer to the notifier_block structure of the event handler.
>> + *
>> + * return 0 : SUCCESS, non-zero : FAIL  */ int
>> +ghes_register_event_notifier(struct notifier_block *nb) {
>> +	return blocking_notifier_chain_register(&ghes_event_notify_list,
>> +nb); } EXPORT_SYMBOL_GPL(ghes_register_event_notifier);
>> +
>> +/**
>> + * ghes_unregister_event_notifier - unregister the previously
>> + * registered event notifier.
>> + * @nb: pointer to the notifier_block structure of the event handler.
>> + */
>> +void ghes_unregister_event_notifier(struct notifier_block *nb)
>> +{
>> +	blocking_notifier_chain_unregister(&ghes_event_notify_list, nb);
>> +}
>> +EXPORT_SYMBOL_GPL(ghes_unregister_event_notifier);
>> +
>> +static void ghes_event_work_func(struct work_struct *work)
>> +{
>> +	struct ghes_event_entry entry;
>> +	u32 len;
>> +
>> +	while (kfifo_get(&ghes_event_ring, &entry)) {
>> +		blocking_notifier_call_chain(&ghes_event_notify_list,
>> +					     entry.error_severity,
>> +					     entry.gdata);
>> +		len = acpi_hest_get_record_size(entry.gdata);
>> +		gen_pool_free(ghes_gdata_pool, (unsigned long)entry.gdata,
>len);
>> +	}
>> +}
>> +
>> +static DECLARE_WORK(ghes_event_work, ghes_event_work_func);
>> +
>> +static void ghes_handle_non_standard_event(struct
>acpi_hest_generic_data *gdata,
>> +					   int sev)
>> +{
>> +	u32 len;
>> +	struct ghes_event_entry event_entry;
>> +
>> +	len = acpi_hest_get_record_size(gdata);
>> +	event_entry.gdata = (void *)gen_pool_alloc(ghes_gdata_pool, len);
>> +	if (event_entry.gdata) {
>
>Make this:
>
>  if (!event_entry.gdata) {
>    pr_warn(...);
>    return;
>  }
>
>and then you won't have to indent the usual case below.
>
>> +		memcpy(event_entry.gdata, gdata, len);
>> +		event_entry.error_severity = sev;
>> +
>> +		if (kfifo_in_spinlocked(&ghes_event_ring, &event_entry, 1,
>> +					&ghes_event_ring_lock))
>> +			schedule_work(&ghes_event_work);
>> +		else
>> +			pr_warn(GHES_PFX "ghes event queue full\n");
>
>GHES_PFX is already "GHES: ", so no need to repeat "ghes" in your
>message.  You can use that space for something more useful, like the
>source ID, section type, severity, etc.  Dmesg strings that are
>completely constant are not as useful as they could be.
>
>> +	}
>> +}
>> +
>>  static void ghes_do_proc(struct ghes *ghes,
>>  			 const struct acpi_hest_generic_status *estatus)
>>  {
>> @@ -527,6 +646,7 @@ static void ghes_do_proc(struct ghes *ghes,
>>  		} else {
>>  			void *err = acpi_hest_get_payload(gdata);
>>
>> +			ghes_handle_non_standard_event(gdata, sev);
>>  			log_non_standard_event(sec_type, fru_id, fru_text,
>>  					       sec_sev, err,
>>  					       gdata->error_data_length);
>> @@ -1334,7 +1454,7 @@ static int __init ghes_init(void)
>>
>>  	rc = platform_driver_register(&ghes_platform_driver);
>>  	if (rc)
>> -		goto err;
>> +		goto exit;
>>
>>  	rc = apei_osc_setup();
>>  	if (rc == 0 && osc_sb_apei_support_acked)
>> @@ -1346,8 +1466,16 @@ static int __init ghes_init(void)
>>  	else
>>  		pr_info(GHES_PFX "Failed to enable APEI firmware first
>mode.\n");
>>
>> +	rc = ghes_gdata_pool_init();
>> +	if (rc) {
>> +		pr_warn(GHES_PFX "ghes_gdata_pool_init failed\n");
>
>I don't think this message is really meaningful to a user.  Maybe
>something about non-fatal vendor-specific errors not being logged?
>
>> +		goto err;
>
>I'm not sure this is an error that should cause the whole GHES driver
>to be unregistered.  After all, the driver *used* to work fine even
>without this vendor-specific functionality.  Seems like we ought to be
>able to fall back to the previous behavior even if we can't allocate
>the gdata pool.
>
>> +	}
>> +
>>  	return 0;
>>  err:
>> +	platform_driver_unregister(&ghes_platform_driver);
>> +exit:
>>  	return rc;
>>  }
>>  device_initcall(ghes_init);
>> diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
>> index e3f1cddb4ac8..a3dd82069069 100644
>> --- a/include/acpi/ghes.h
>> +++ b/include/acpi/ghes.h
>> @@ -50,6 +50,34 @@ enum {
>>  	GHES_SEV_PANIC = 0x3,
>>  };
>>
>> +
>
>Don't add an extra blank line here.
>
>> +#ifdef CONFIG_ACPI_APEI_GHES
>> +/**
>> + * ghes_register_event_notifier - register an event notifier
>> + * for the non-fatal HW errors.
>> + * @nb: pointer to the notifier_block structure of the event notifier.
>> + *
>> + * Return : 0 - SUCCESS, non-zero - FAIL.
>> + */
>> +int ghes_register_event_notifier(struct notifier_block *nb);
>> +
>> +/**
>> + * ghes_unregister_event_notifier - unregister the previously
>> + * registered event notifier.
>> + * @nb: pointer to the notifier_block structure of the event notifier.
>> + */
>> +void ghes_unregister_event_notifier(struct notifier_block *nb);
>> +#else
>> +static inline int ghes_register_event_notifier(struct notifier_block *nb)
>> +{
>> +	return -ENODEV;
>> +}
>> +
>> +static inline void ghes_unregister_event_notifier(struct notifier_block
>*nb)
>> +{
>> +}
>> +#endif
>> +
>>  int ghes_estatus_pool_init(int num_ghes);
>>
>>  /* From drivers/edac/ghes_edac.c */
>> --
>> 2.17.1
>>
>>



[Index of Archives]     [Linux IBM ACPI]     [Linux Power Management]     [Linux Kernel]     [Linux Laptop]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Video 4 Linux]     [Device Mapper]     [Linux Resources]

  Powered by Linux