Raw pages in a hugepage can become HWPOISON between the time userspace
maps the hugepage and the time userspace faults it in. Today, when
hugetlb faults anywhere in a hugepage that contains HWPOISON raw pages,
the result is VM_FAULT_HWPOISON_LARGE. This commit teaches the hugetlb
page fault handler to return VM_FAULT_HWPOISON_LARGE only if the
faulting address falls within a HWPOISON raw page; otherwise the fault
handler can continue to fault in the healthy raw pages.

Signed-off-by: Jiaqi Yan <jiaqiyan@xxxxxxxxxx>
---
 include/linux/mm.h  |   2 +
 mm/hugetlb.c        | 129 ++++++++++++++++++++++++++++++++++++++++++--
 mm/memory-failure.c |   1 +
 3 files changed, 127 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc192f98cb1d..7caa4530953f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3531,6 +3531,7 @@ extern const struct attribute_group memory_failure_attr_group;
  * @nr_expected_unmaps: if a VMA that maps @page when detected is eligible
  * for high granularity mapping, @page is expected to be unmapped.
  * @nr_actual_unmaps: how many times the raw page is actually unmapped.
+ * @index: index of the poisoned subpage in the folio.
  */
 struct raw_hwp_page {
 	struct llist_node node;
@@ -3538,6 +3539,7 @@ struct raw_hwp_page {
 	int nr_vmas_mapped;
 	int nr_expected_unmaps;
 	int nr_actual_unmaps;
+	unsigned long index;
 };
 
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1419176b7e51..f8ddf04ae0c4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6158,6 +6158,30 @@ static struct folio *hugetlb_try_find_lock_folio(struct address_space *mapping,
 	return folio;
 }
 
+static vm_fault_t hugetlb_no_page_hwpoison(struct mm_struct *mm,
+					   struct vm_area_struct *vma,
+					   struct folio *folio,
+					   unsigned long address,
+					   struct hugetlb_pte *hpte,
+					   unsigned int flags);
+
+#ifndef CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING
+static vm_fault_t hugetlb_no_page_hwpoison(struct mm_struct *mm,
+					   struct vm_area_struct *vma,
+					   struct folio *folio,
+					   unsigned long address,
+					   struct hugetlb_pte *hpte,
+					   unsigned int flags)
+{
+	if (unlikely(folio_test_hwpoison(folio))) {
+		return VM_FAULT_HWPOISON_LARGE |
+		       VM_FAULT_SET_HINDEX(hstate_index(hstate_vma(vma)));
+	}
+
+	return 0;
+}
+#endif
+
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			struct vm_area_struct *vma,
 			struct address_space *mapping, pgoff_t idx,
@@ -6287,13 +6311,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	/*
 	 * If memory error occurs between mmap() and fault, some process
 	 * don't have hwpoisoned swap entry for errored virtual address.
-	 * So we need to block hugepage fault by PG_hwpoison bit check.
+	 * So we need to block hugepage fault by hwpoison check:
+	 * - without HGM, the check is based on PG_hwpoison
+	 * - with HGM, check whether the raw page for the address is poisoned
 	 */
-	if (unlikely(folio_test_hwpoison(folio))) {
-		ret = VM_FAULT_HWPOISON_LARGE |
-			VM_FAULT_SET_HINDEX(hstate_index(h));
+	ret = hugetlb_no_page_hwpoison(mm, vma, folio, address, hpte, flags);
+	if (unlikely(ret))
 		goto backout_unlocked;
-	}
 
 	/* Check for page in userfault range. */
 	if (userfaultfd_minor(vma)) {
@@ -8426,6 +8450,11 @@ int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * the allocated PTEs created before splitting fails.
 	 */
 
+	/*
+	 * For none and UFFD_WP marker PTEs, try_to_unmap_one doesn't unmap
+	 * them, so delay the split until a page fault happens. See the
+	 * hugetlb_no_page_hwpoison check in hugetlb_no_page.
+	 */
 	if (unlikely(huge_pte_none_mostly(old_entry))) {
 		ret = -EAGAIN;
 		goto skip;
@@ -8479,6 +8508,96 @@ int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 }
 
+/*
+ * Given a hugetlb PTE, if we want to split it into the next smaller level
+ * PTE, return the size that the HGM walk with allocations should use.
+ * If the given hugetlb PTE is already at the smallest size (PAGE_SIZE),
+ * return -EINVAL.
+ */
+static int hgm_next_size(struct vm_area_struct *vma, struct hugetlb_pte *hpte)
+{
+	struct hstate *h = hstate_vma(vma), *tmp_h;
+	unsigned int shift;
+	unsigned long curr_size = hugetlb_pte_size(hpte);
+	unsigned long next_size;
+
+	for_each_hgm_shift(h, tmp_h, shift) {
+		next_size = 1UL << shift;
+		if (next_size < curr_size)
+			return next_size;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Check if the faulting address falls within a HWPOISON raw page.
+ * During the check, the hugetlb PTE may be split into smaller hugetlb PTEs.
+ */
+static vm_fault_t hugetlb_no_page_hwpoison(struct mm_struct *mm,
+					   struct vm_area_struct *vma,
+					   struct folio *folio,
+					   unsigned long address,
+					   struct hugetlb_pte *hpte,
+					   unsigned int flags)
+{
+	unsigned long range_start, range_end;
+	unsigned long start_index, end_index;
+	unsigned long folio_start = vma_address(folio_page(folio, 0), vma);
+	struct llist_node *t, *tnode;
+	struct llist_head *raw_hwp_head = raw_hwp_list_head(folio);
+	struct raw_hwp_page *p = NULL;
+	bool contain_hwpoison = false;
+	int hgm_size;
+	int hgm_ret = 0;
+
+	if (likely(!folio_test_hwpoison(folio)))
+		return 0;
+
+	if (!hugetlb_enable_hgm_vma(vma))
+		return VM_FAULT_HWPOISON_LARGE |
+		       VM_FAULT_SET_HINDEX(hstate_index(hstate_vma(vma)));
+
+recheck:
+	range_start = address & hugetlb_pte_mask(hpte);
+	range_end = range_start + hugetlb_pte_size(hpte);
+	start_index = (range_start - folio_start) / PAGE_SIZE;
+	end_index = start_index + hugetlb_pte_size(hpte) / PAGE_SIZE;
+
+	contain_hwpoison = false;
+	llist_for_each_safe(tnode, t, raw_hwp_head->first) {
+		p = container_of(tnode, struct raw_hwp_page, node);
+		if (start_index <= p->index && p->index < end_index) {
+			contain_hwpoison = true;
+			break;
+		}
+	}
+
+	if (!contain_hwpoison)
+		return 0;
+
+	if (hugetlb_pte_size(hpte) == PAGE_SIZE)
+		return VM_FAULT_HWPOISON;
+
+	/*
+	 * hugetlb_fault already ensured hugetlb_vma_lock_read is held.
+	 * We also checked hugetlb_pte_size(hpte) != PAGE_SIZE, so hgm_size
+	 * must be a size meaningful to HGM.
+	 */
+	hgm_size = hgm_next_size(vma, hpte);
+	VM_BUG_ON(hgm_size == -EINVAL);
+	hgm_ret = hugetlb_full_walk_alloc(hpte, vma, address, hgm_size);
+	if (hgm_ret) {
+		WARN_ON_ONCE(hgm_ret);
+		/*
+		 * If splitting with HGM fails, return as if HGM were not
+		 * eligible or not enabled.
+		 */
+		return VM_FAULT_HWPOISON_LARGE |
+		       VM_FAULT_SET_HINDEX(hstate_index(hstate_vma(vma)));
+	}
+	goto recheck;
+}
+
 #endif /* CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING */
 
 /*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 47b935918ceb..9093ba53feed 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1957,6 +1957,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
 		raw_hwp->nr_vmas_mapped = 0;
 		raw_hwp->nr_expected_unmaps = 0;
 		raw_hwp->nr_actual_unmaps = 0;
+		raw_hwp->index = folio_page_idx(folio, page);
 		llist_add(&raw_hwp->node, head);
 		if (hgm_enabled)
 			/*
-- 
2.40.1.495.gc816e09b53d-goog
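
For reference, below is a minimal userspace sketch (not part of this patch) of
how the new behavior could be exercised: the same hugetlb memfd is mapped
twice, poison is injected into one raw (4KB) page through the first mapping
with MADV_HWPOISON, and fresh faults are then taken through the second,
never-populated mapping so they go through hugetlb_no_page. With HGM eligible
on the VMA, only the access at the poisoned raw page is expected to SIGBUS.
The 2MB hugepage size, the 64KB poison offset, and the availability of
MADV_HWPOISON (CAP_SYS_ADMIN, CONFIG_MEMORY_FAILURE) are assumptions of the
demo, not requirements stated by the patch.

/* hwpoison-hgm-demo.c: illustrative only; names and sizes are assumptions. */
#define _GNU_SOURCE
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL << 20)	/* assumes 2MB default hugepage size */
#define POISON_OFFSET	(64UL << 10)	/* raw page chosen for injection */

static sigjmp_buf env;

static void sigbus_handler(int sig)
{
	(void)sig;
	siglongjmp(env, 1);
}

static void probe(char *addr, const char *what)
{
	if (sigsetjmp(env, 1) == 0) {
		*addr = 1;
		printf("%s: access OK\n", what);
	} else {
		printf("%s: SIGBUS\n", what);
	}
}

int main(void)
{
	int fd = memfd_create("hwpoison-demo", MFD_HUGETLB);
	char *a, *b;

	if (fd < 0 || ftruncate(fd, HPAGE_SIZE))
		return 1;

	/* Two mappings of the same hugepage: inject via @a, fault via @b. */
	a = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	b = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (a == MAP_FAILED || b == MAP_FAILED)
		return 1;

	signal(SIGBUS, sigbus_handler);

	/* Fault the hugepage in, then hard-offline one raw page. */
	a[0] = 1;
	if (madvise(a + POISON_OFFSET, getpagesize(), MADV_HWPOISON)) {
		perror("madvise(MADV_HWPOISON)");
		return 1;
	}

	/* Fresh faults through @b go through hugetlb_no_page(). */
	probe(b, "healthy raw page");			/* expected: access OK */
	probe(b + POISON_OFFSET, "poisoned raw page");	/* expected: SIGBUS */

	return 0;
}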