From: Yulei Zhang <yuleixzhang@xxxxxxxxxxx> dmem handles the MCE if the pfn belongs to dmem when an MCE occurs. 1. Check whether the pfn is handled by dmem; return if true. 2. Mark the pfn in a new error bitmap defined in page. 3. Add a series of mechanisms to ensure that the MCE pfn is not allocated. Signed-off-by: Haiwei Li <lihaiwei@xxxxxxxxxxx> Signed-off-by: Yulei Zhang <yuleixzhang@xxxxxxxxxxx> --- include/linux/dmem.h | 6 +++ include/trace/events/dmem.h | 17 ++++++ mm/dmem.c | 103 +++++++++++++++++++++++++----------- mm/memory-failure.c | 6 +++ 4 files changed, 102 insertions(+), 30 deletions(-) diff --git a/include/linux/dmem.h b/include/linux/dmem.h index 59d3ef14fe42..cd17a91a7264 100644 --- a/include/linux/dmem.h +++ b/include/linux/dmem.h @@ -21,6 +21,8 @@ dmem_alloc_pages_vma(struct vm_area_struct *vma, unsigned long addr, void dmem_free_pages(phys_addr_t addr, unsigned int dpages_nr); bool is_dmem_pfn(unsigned long pfn); #define dmem_free_page(addr) dmem_free_pages(addr, 1) + +bool dmem_memory_failure(unsigned long pfn, int flags); #else static inline int dmem_reserve_init(void) { @@ -32,5 +34,9 @@ static inline bool is_dmem_pfn(unsigned long pfn) return 0; } +static inline bool dmem_memory_failure(unsigned long pfn, int flags) +{ + return false; +} #endif #endif /* _LINUX_DMEM_H */ diff --git a/include/trace/events/dmem.h b/include/trace/events/dmem.h index 10d1b90a7783..f8eeb3c63b14 100644 --- a/include/trace/events/dmem.h +++ b/include/trace/events/dmem.h @@ -62,6 +62,23 @@ TRACE_EVENT(dmem_free_pages, TP_printk("addr %#lx dpages_nr %d", (unsigned long)__entry->addr, __entry->dpages_nr) ); + +TRACE_EVENT(dmem_memory_failure, + TP_PROTO(unsigned long pfn, bool used), + TP_ARGS(pfn, used), + + TP_STRUCT__entry( + __field(unsigned long, pfn) + __field(bool, used) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->used = used; + ), + + TP_printk("pfn=%#lx used=%d", __entry->pfn, __entry->used) +); #endif /* This part must be outside protection */ diff
--git a/mm/dmem.c b/mm/dmem.c index 50cdff98675b..16438dbed3f5 100644 --- a/mm/dmem.c +++ b/mm/dmem.c @@ -431,6 +431,41 @@ static void __init dmem_uinit(void) dmem_pool.registered_pages = 0; } +/* set or clear corresponding bit on allocation bitmap based on error bitmap */ +static unsigned long dregion_alloc_bitmap_set_clear(struct dmem_region *dregion, + bool set) +{ + unsigned long pos_pfn, pos_offset; + unsigned long valid_pages, mce_dpages = 0; + phys_addr_t dpage, reserved_start_pfn; + + reserved_start_pfn = __phys_to_pfn(dregion->reserved_start_addr); + + valid_pages = dpage_to_pfn(dregion->dpage_end_pfn) - reserved_start_pfn; + pos_offset = dpage_to_pfn(dregion->dpage_start_pfn) + - reserved_start_pfn; +try_set: + pos_pfn = find_next_bit(dregion->error_bitmap, valid_pages, pos_offset); + + if (pos_pfn >= valid_pages) + return mce_dpages; + mce_dpages++; + dpage = pfn_to_dpage(pos_pfn + reserved_start_pfn); + if (set) + WARN_ON(__test_and_set_bit(dpage - dregion->dpage_start_pfn, + dregion->bitmap)); + else + WARN_ON(!__test_and_clear_bit(dpage - dregion->dpage_start_pfn, + dregion->bitmap)); + pos_offset = dpage_to_pfn(dpage + 1) - reserved_start_pfn; + goto try_set; +} + +static unsigned long dmem_region_mark_mce_dpages(struct dmem_region *dregion) +{ + return dregion_alloc_bitmap_set_clear(dregion, true); +} + static int __init dmem_region_init(struct dmem_region *dregion) { unsigned long *bitmap, nr_pages; @@ -514,6 +549,8 @@ static int dmem_alloc_region_init(struct dmem_region *dregion, dregion->dpage_start_pfn = start; dregion->dpage_end_pfn = end; + *dpages -= dmem_region_mark_mce_dpages(dregion); + dmem_pool.unaligned_pages += __phys_to_pfn((dpage_to_phys(start) - dregion->reserved_start_addr)); dmem_pool.unaligned_pages += __phys_to_pfn(dregion->reserved_end_addr @@ -558,36 +595,6 @@ dmem_alloc_bitmap_clear(struct dmem_region *dregion, phys_addr_t dpage, return err_num; } -/* set or clear corresponding bit on allocation bitmap based on error bitmap 
*/ -static unsigned long dregion_alloc_bitmap_set_clear(struct dmem_region *dregion, - bool set) -{ - unsigned long pos_pfn, pos_offset; - unsigned long valid_pages, mce_dpages = 0; - phys_addr_t dpage, reserved_start_pfn; - - reserved_start_pfn = __phys_to_pfn(dregion->reserved_start_addr); - - valid_pages = dpage_to_pfn(dregion->dpage_end_pfn) - reserved_start_pfn; - pos_offset = dpage_to_pfn(dregion->dpage_start_pfn) - - reserved_start_pfn; -try_set: - pos_pfn = find_next_bit(dregion->error_bitmap, valid_pages, pos_offset); - - if (pos_pfn >= valid_pages) - return mce_dpages; - mce_dpages++; - dpage = pfn_to_dpage(pos_pfn + reserved_start_pfn); - if (set) - WARN_ON(__test_and_set_bit(dpage - dregion->dpage_start_pfn, - dregion->bitmap)); - else - WARN_ON(!__test_and_clear_bit(dpage - dregion->dpage_start_pfn, - dregion->bitmap)); - pos_offset = dpage_to_pfn(dpage + 1) - reserved_start_pfn; - goto try_set; -} - static void dmem_uinit_check_alloc_bitmap(struct dmem_region *dregion) { unsigned long dpages, size; @@ -989,6 +996,42 @@ void dmem_free_pages(phys_addr_t addr, unsigned int dpages_nr) } EXPORT_SYMBOL(dmem_free_pages); +bool dmem_memory_failure(unsigned long pfn, int flags) +{ + struct dmem_region *dregion; + struct dmem_node *pdnode = NULL; + u64 pos; + phys_addr_t addr = __pfn_to_phys(pfn); + bool used = false; + + dregion = find_dmem_region(addr, &pdnode); + if (!dregion) + return false; + + WARN_ON(!pdnode || !dregion->error_bitmap); + + mutex_lock(&dmem_pool.lock); + pos = pfn - __phys_to_pfn(dregion->reserved_start_addr); + if (__test_and_set_bit(pos, dregion->error_bitmap)) + goto out; + + if (!dregion->bitmap || pfn < dpage_to_pfn(dregion->dpage_start_pfn) || + pfn >= dpage_to_pfn(dregion->dpage_end_pfn)) + goto out; + + pos = phys_to_dpage(addr) - dregion->dpage_start_pfn; + if (__test_and_set_bit(pos, dregion->bitmap)) { + used = true; + } else { + pr_info("MCE: free dpage, mark %#lx disabled in dmem\n", pfn); + dnode_count_free_dpages(pdnode, 
-1); + } +out: + trace_dmem_memory_failure(pfn, used); + mutex_unlock(&dmem_pool.lock); + return true; +} + bool is_dmem_pfn(unsigned long pfn) { struct dmem_node *dnode; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f1aa6433f404..c613e1ec5995 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -35,6 +35,7 @@ */ #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/dmem.h> #include <linux/page-flags.h> #include <linux/kernel-page-flags.h> #include <linux/sched/signal.h> @@ -1280,6 +1281,11 @@ int memory_failure(unsigned long pfn, int flags) if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); + if (dmem_memory_failure(pfn, flags)) { + pr_info("MCE %#lx: handled by dmem\n", pfn); + return 0; + } + p = pfn_to_online_page(pfn); if (!p) { if (pfn_valid(pfn)) { -- 2.28.0