Hi Bjorn, Thanks for reviewing. I will make changes for your suggestions in both patches. Regards, Shiju >-----Original Message----- >From: Bjorn Helgaas [mailto:helgaas@xxxxxxxxxx] >Sent: 16 June 2020 23:57 >To: Shiju Jose <shiju.jose@xxxxxxxxxx> >Cc: linux-acpi@xxxxxxxxxxxxxxx; linux-pci@xxxxxxxxxxxxxxx; linux- >kernel@xxxxxxxxxxxxxxx; rjw@xxxxxxxxxxxxx; bp@xxxxxxxxx; >james.morse@xxxxxxx; lenb@xxxxxxxxxx; tony.luck@xxxxxxxxx; >dan.carpenter@xxxxxxxxxx; zhangliguang@xxxxxxxxxxxxxxxxx; >andriy.shevchenko@xxxxxxxxxxxxxxx; Wangkefeng (OS Kernel Lab) ><wangkefeng.wang@xxxxxxxxxx>; jroedel@xxxxxxx; yangyicong ><yangyicong@xxxxxxxxxx>; Jonathan Cameron ><jonathan.cameron@xxxxxxxxxx>; tanxiaofei <tanxiaofei@xxxxxxxxxx> >Subject: Re: [PATCH v9 1/2] ACPI / APEI: Add support to notify the vendor >specific HW errors > >On Mon, Jun 15, 2020 at 10:53:11AM +0100, Shiju Jose wrote: >> Add support to notify the vendor specific non-fatal HW errors to the >> drivers for the error recovery. > >This doesn't actually say anything about what this patch does. Sure, it "adds >support," but it doesn't say anything about how that support works or how >to use it. > >This should say something about a FIFO and it should mention that an event >handler registered with ghes_register_event_notifier() will be called with >each vendor-specific error record. > >> Signed-off-by: Shiju Jose <shiju.jose@xxxxxxxxxx> >> --- >> drivers/acpi/apei/ghes.c | 130 >++++++++++++++++++++++++++++++++++++++- >> include/acpi/ghes.h | 28 +++++++++ >> 2 files changed, 157 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index >> 24c9642e8fc7..854d8115cdfc 100644 >> --- a/drivers/acpi/apei/ghes.c >> +++ b/drivers/acpi/apei/ghes.c >> @@ -33,6 +33,7 @@ >> #include <linux/irq_work.h> >> #include <linux/llist.h> >> #include <linux/genalloc.h> >> +#include <linux/kfifo.h> >> #include <linux/pci.h> >> #include <linux/pfn.h> >> #include <linux/aer.h> >> @@ -63,6 +64,11 @@ >> #define GHES_ESTATUS_CACHES_SIZE 4 >> >> #define GHES_ESTATUS_IN_CACHE_MAX_NSEC 10000000000ULL >> + >> +#define GHES_EVENT_RING_SIZE 256 >> +#define GHES_GDATA_POOL_MIN_ALLOC_ORDER 3 >> +#define GHES_GDATA_POOL_MIN_SIZE 65536 > >Don't drop these new #defines right in the middle of the GHES_ESTATUS >block. The ESTATUS ones are all related, and these new ones are something >separate. > >These new names should be related somehow. The names don't make it clear >that GHES_EVENT and GHES_GDATA are related. IIUC GHES_GDATA is space >for storing GHES structures, and GHES_EVENT is a FIFO of struct >ghes_event_entry, each of which points to one of those GHES structures. > >> /* Prevent too many caches are allocated because of RCU */ >> #define GHES_ESTATUS_CACHE_ALLOCED_MAX > (GHES_ESTATUS_CACHES_SIZE * 3 / 2) >> >> @@ -122,6 +128,19 @@ static DEFINE_MUTEX(ghes_list_mutex); >> */ >> static DEFINE_SPINLOCK(ghes_notify_lock_irq); >> >> +struct ghes_event_entry { >> + struct acpi_hest_generic_data *gdata; >> + int error_severity; >> +}; >> + >> +static DEFINE_KFIFO(ghes_event_ring, struct ghes_event_entry, >> + GHES_EVENT_RING_SIZE); >> + >> +static DEFINE_SPINLOCK(ghes_event_ring_lock); >> + >> +static struct gen_pool *ghes_gdata_pool; static unsigned long >> +ghes_gdata_pool_size_request; >> + >> static struct gen_pool *ghes_estatus_pool; static unsigned long >> ghes_estatus_pool_size_request; >> >> @@ -188,6 +207,40 @@ int ghes_estatus_pool_init(int num_ghes) >> return -ENOMEM; >> } >> >> +static int ghes_gdata_pool_init(void) { >> + unsigned long addr, len; >> + int rc; >> + >> + ghes_gdata_pool = >gen_pool_create(GHES_GDATA_POOL_MIN_ALLOC_ORDER, -1); >> + if (!ghes_gdata_pool) >> + return -ENOMEM; >> + >> + if (ghes_gdata_pool_size_request < GHES_GDATA_POOL_MIN_SIZE) >> + ghes_gdata_pool_size_request = >GHES_GDATA_POOL_MIN_SIZE; >> + >> + len = ghes_gdata_pool_size_request; >> + addr = (unsigned long)vmalloc(PAGE_ALIGN(len)); >> + if (!addr) >> + goto err_pool_alloc; >> + >> + vmalloc_sync_mappings(); >> + >> + rc = gen_pool_add(ghes_gdata_pool, addr, PAGE_ALIGN(len), -1); >> + if (rc) >> + goto err_pool_add; >> + >> + return 0; >> + >> +err_pool_add: >> + vfree((void *)addr); >> + >> +err_pool_alloc: >> + gen_pool_destroy(ghes_gdata_pool); >> + >> + return -ENOMEM; >> +} >> + >> static int map_gen_v2(struct ghes *ghes) { >> return >> apei_map_generic_address(&ghes->generic_v2->read_ack_register); >> @@ -247,6 +300,10 @@ static struct ghes *ghes_new(struct >acpi_hest_generic *generic) >> goto err_unmap_status_addr; >> } >> >> + ghes_gdata_pool_size_request += generic->records_to_preallocate * >> + generic->max_sections_per_record * >> + generic->max_raw_data_length; >> + >> return ghes; >> >> err_unmap_status_addr: >> @@ -490,6 +547,68 @@ static void ghes_handle_aer(struct >> acpi_hest_generic_data *gdata) #endif } >> >> +static BLOCKING_NOTIFIER_HEAD(ghes_event_notify_list); >> + >> +/** >> + * ghes_register_event_notifier - register an event notifier >> + * for the non-fatal HW errors. >> + * @nb: pointer to the notifier_block structure of the event handler. >> + * >> + * return 0 : SUCCESS, non-zero : FAIL */ int >> +ghes_register_event_notifier(struct notifier_block *nb) { >> + return blocking_notifier_chain_register(&ghes_event_notify_list, >> +nb); } EXPORT_SYMBOL_GPL(ghes_register_event_notifier); >> + >> +/** >> + * ghes_unregister_event_notifier - unregister the previously >> + * registered event notifier. >> + * @nb: pointer to the notifier_block structure of the event handler. >> + */ >> +void ghes_unregister_event_notifier(struct notifier_block *nb) >> +{ >> + blocking_notifier_chain_unregister(&ghes_event_notify_list, nb); >> +} >> +EXPORT_SYMBOL_GPL(ghes_unregister_event_notifier); >> + >> +static void ghes_event_work_func(struct work_struct *work) >> +{ >> + struct ghes_event_entry entry; >> + u32 len; >> + >> + while (kfifo_get(&ghes_event_ring, &entry)) { >> + blocking_notifier_call_chain(&ghes_event_notify_list, >> + entry.error_severity, >> + entry.gdata); >> + len = acpi_hest_get_record_size(entry.gdata); >> + gen_pool_free(ghes_gdata_pool, (unsigned long)entry.gdata, >len); >> + } >> +} >> + >> +static DECLARE_WORK(ghes_event_work, ghes_event_work_func); >> + >> +static void ghes_handle_non_standard_event(struct >acpi_hest_generic_data *gdata, >> + int sev) >> +{ >> + u32 len; >> + struct ghes_event_entry event_entry; >> + >> + len = acpi_hest_get_record_size(gdata); >> + event_entry.gdata = (void *)gen_pool_alloc(ghes_gdata_pool, len); >> + if (event_entry.gdata) { > >Make this: > > if (!event_entry.gdata) { > pr_warn(...); > return; > } > >and then you won't have to indent the usual case below. > >> + memcpy(event_entry.gdata, gdata, len); >> + event_entry.error_severity = sev; >> + >> + if (kfifo_in_spinlocked(&ghes_event_ring, &event_entry, 1, >> + &ghes_event_ring_lock)) >> + schedule_work(&ghes_event_work); >> + else >> + pr_warn(GHES_PFX "ghes event queue full\n"); > >GHES_PFX is already "GHES: ", so no need to repeat "ghes" in your >message. You can use that space for something more useful, like the >source ID, section type, severity, etc. Dmesg strings that are >completely constant are not as useful as they could be. > >> + } >> +} >> + >> static void ghes_do_proc(struct ghes *ghes, >> const struct acpi_hest_generic_status *estatus) >> { >> @@ -527,6 +646,7 @@ static void ghes_do_proc(struct ghes *ghes, >> } else { >> void *err = acpi_hest_get_payload(gdata); >> >> + ghes_handle_non_standard_event(gdata, sev); >> log_non_standard_event(sec_type, fru_id, fru_text, >> sec_sev, err, >> gdata->error_data_length); >> @@ -1334,7 +1454,7 @@ static int __init ghes_init(void) >> >> rc = platform_driver_register(&ghes_platform_driver); >> if (rc) >> - goto err; >> + goto exit; >> >> rc = apei_osc_setup(); >> if (rc == 0 && osc_sb_apei_support_acked) >> @@ -1346,8 +1466,16 @@ static int __init ghes_init(void) >> else >> pr_info(GHES_PFX "Failed to enable APEI firmware first >mode.\n"); >> >> + rc = ghes_gdata_pool_init(); >> + if (rc) { >> + pr_warn(GHES_PFX "ghes_gdata_pool_init failed\n"); > >I don't think this message is really meaningful to a user. Maybe >something about non-fatal vendor-specific errors not being logged? > >> + goto err; > >I'm not sure this is an error that should cause the whole GHES driver >to be unregistered. After all, the driver *used* to work fine even >without this vendor-specific functionality. Seems like we ought to be >able to fall back to the previous behavior even if we can't allocate >the gdata pool. > >> + } >> + >> return 0; >> err: >> + platform_driver_unregister(&ghes_platform_driver); >> +exit: >> return rc; >> } >> device_initcall(ghes_init); >> diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h >> index e3f1cddb4ac8..a3dd82069069 100644 >> --- a/include/acpi/ghes.h >> +++ b/include/acpi/ghes.h >> @@ -50,6 +50,34 @@ enum { >> GHES_SEV_PANIC = 0x3, >> }; >> >> + > >Don't add an extra blank line here. > >> +#ifdef CONFIG_ACPI_APEI_GHES >> +/** >> + * ghes_register_event_notifier - register an event notifier >> + * for the non-fatal HW errors. >> + * @nb: pointer to the notifier_block structure of the event notifier. >> + * >> + * Return : 0 - SUCCESS, non-zero - FAIL. >> + */ >> +int ghes_register_event_notifier(struct notifier_block *nb); >> + >> +/** >> + * ghes_unregister_event_notifier - unregister the previously >> + * registered event notifier. >> + * @nb: pointer to the notifier_block structure of the event notifier. >> + */ >> +void ghes_unregister_event_notifier(struct notifier_block *nb); >> +#else >> +static inline int ghes_register_event_notifier(struct notifier_block *nb) >> +{ >> + return -ENODEV; >> +} >> + >> +static inline void ghes_unregister_event_notifier(struct notifier_block >*nb) >> +{ >> +} >> +#endif >> + >> int ghes_estatus_pool_init(int num_ghes); >> >> /* From drivers/edac/ghes_edac.c */ >> -- >> 2.17.1 >> >>