Raw pages in a hugepage can become HWPOISON between the time userspace
maps the hugepage and the time userspace faults it in. Today, when
hugetlb faults anywhere in a hugepage that contains HWPOISON raw pages,
the result is VM_FAULT_HWPOISON_LARGE. This commit teaches the hugetlb
page fault handler to return VM_FAULT_HWPOISON_LARGE only if the
faulting address falls within a HWPOISON raw page; otherwise the fault
handler can continue to fault in the healthy raw pages.

Signed-off-by: Jiaqi Yan <jiaqiyan@xxxxxxxxxx>
---
 include/linux/mm.h  |   2 +
 mm/hugetlb.c        | 129 ++++++++++++++++++++++++++++++++++++++++++--
 mm/memory-failure.c |   1 +
 3 files changed, 127 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc192f98cb1d..7caa4530953f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3531,6 +3531,7 @@ extern const struct attribute_group memory_failure_attr_group;
  * @nr_expected_unmaps: if a VMA that maps @page when detected is eligible
  * for high granularity mapping, @page is expected to be unmapped.
  * @nr_actual_unmaps: how many times the raw page is actually unmapped.
+ * @index: index of the poisoned subpage in the folio.
  */
 struct raw_hwp_page {
 	struct llist_node node;
@@ -3538,6 +3539,7 @@ struct raw_hwp_page {
 	int nr_vmas_mapped;
 	int nr_expected_unmaps;
 	int nr_actual_unmaps;
+	unsigned long index;
 };
 
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1419176b7e51..f8ddf04ae0c4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6158,6 +6158,30 @@ static struct folio *hugetlb_try_find_lock_folio(struct address_space *mapping,
 	return folio;
 }
 
+static vm_fault_t hugetlb_no_page_hwpoison(struct mm_struct *mm,
+					   struct vm_area_struct *vma,
+					   struct folio *folio,
+					   unsigned long address,
+					   struct hugetlb_pte *hpte,
+					   unsigned int flags);
+
+#ifndef CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING
+static vm_fault_t hugetlb_no_page_hwpoison(struct mm_struct *mm,
+					   struct vm_area_struct *vma,
+					   struct folio *folio,
+					   unsigned long address,
+					   struct hugetlb_pte *hpte,
+					   unsigned int flags)
+{
+	if (unlikely(folio_test_hwpoison(folio))) {
+		return VM_FAULT_HWPOISON_LARGE |
+		       VM_FAULT_SET_HINDEX(hstate_index(hstate_vma(vma)));
+	}
+
+	return 0;
+}
+#endif
+
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			struct vm_area_struct *vma,
 			struct address_space *mapping, pgoff_t idx,
@@ -6287,13 +6311,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	/*
 	 * If memory error occurs between mmap() and fault, some process
 	 * don't have hwpoisoned swap entry for errored virtual address.
-	 * So we need to block hugepage fault by PG_hwpoison bit check.
+	 * So we need to block hugepage fault by hwpoison check:
+	 * - without HGM, the check is based on PG_hwpoison
+	 * - with HGM, check whether the raw page for the address is poisoned
 	 */
-	if (unlikely(folio_test_hwpoison(folio))) {
-		ret = VM_FAULT_HWPOISON_LARGE |
-			VM_FAULT_SET_HINDEX(hstate_index(h));
+	ret = hugetlb_no_page_hwpoison(mm, vma, folio, address, hpte, flags);
+	if (unlikely(ret))
 		goto backout_unlocked;
-	}
 
 	/* Check for page in userfault range. */
 	if (userfaultfd_minor(vma)) {
@@ -8426,6 +8450,11 @@ int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * the allocated PTEs created before splitting fails.
 	 */
 
+	/*
+	 * For none and UFFD_WP marker PTEs, try_to_unmap_one doesn't unmap
+	 * them, so delay the split until a page fault happens. See the
+	 * hugetlb_no_page_hwpoison check in hugetlb_no_page.
+	 */
 	if (unlikely(huge_pte_none_mostly(old_entry))) {
 		ret = -EAGAIN;
 		goto skip;
@@ -8479,6 +8508,96 @@ int hugetlb_split_to_shift(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 }
 
+/*
+ * Given a hugetlb PTE, if we want to split it into the next smaller level
+ * PTE, return the size that the HGM walk with allocations should use.
+ * If the given hugetlb PTE is already at the smallest size (PAGE_SIZE),
+ * return -EINVAL.
+ */
+static int hgm_next_size(struct vm_area_struct *vma, struct hugetlb_pte *hpte)
+{
+	struct hstate *h = hstate_vma(vma), *tmp_h;
+	unsigned int shift;
+	unsigned long curr_size = hugetlb_pte_size(hpte);
+	unsigned long next_size;
+
+	for_each_hgm_shift(h, tmp_h, shift) {
+		next_size = 1UL << shift;
+		if (next_size < curr_size)
+			return next_size;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Check if the faulting address falls within a HWPOISON raw page.
+ * During the check, the hugetlb PTE may be split into smaller hugetlb PTEs.
+ */
+static vm_fault_t hugetlb_no_page_hwpoison(struct mm_struct *mm,
+					   struct vm_area_struct *vma,
+					   struct folio *folio,
+					   unsigned long address,
+					   struct hugetlb_pte *hpte,
+					   unsigned int flags)
+{
+	unsigned long range_start, range_end;
+	unsigned long start_index, end_index;
+	unsigned long folio_start = vma_address(folio_page(folio, 0), vma);
+	struct llist_node *t, *tnode;
+	struct llist_head *raw_hwp_head = raw_hwp_list_head(folio);
+	struct raw_hwp_page *p = NULL;
+	bool contain_hwpoison = false;
+	int hgm_size;
+	int hgm_ret = 0;
+
+	if (likely(!folio_test_hwpoison(folio)))
+		return 0;
+
+	if (!hugetlb_enable_hgm_vma(vma))
+		return VM_FAULT_HWPOISON_LARGE |
+		       VM_FAULT_SET_HINDEX(hstate_index(hstate_vma(vma)));
+
+recheck:
+	range_start = address & hugetlb_pte_mask(hpte);
+	range_end = range_start + hugetlb_pte_size(hpte);
+	start_index = (range_start - folio_start) / PAGE_SIZE;
+	end_index = start_index + hugetlb_pte_size(hpte) / PAGE_SIZE;
+
+	contain_hwpoison = false;
+	llist_for_each_safe(tnode, t, raw_hwp_head->first) {
+		p = container_of(tnode, struct raw_hwp_page, node);
+		if (start_index <= p->index && p->index < end_index) {
+			contain_hwpoison = true;
+			break;
+		}
+	}
+
+	if (!contain_hwpoison)
+		return 0;
+
+	if (hugetlb_pte_size(hpte) == PAGE_SIZE)
+		return VM_FAULT_HWPOISON;
+
+	/*
+	 * hugetlb_fault already ensured hugetlb_vma_lock_read is held.
+	 * We also checked hugetlb_pte_size(hpte) != PAGE_SIZE, so hgm_size
+	 * must be a size meaningful to HGM.
+	 */
+	hgm_size = hgm_next_size(vma, hpte);
+	VM_BUG_ON(hgm_size == -EINVAL);
+	hgm_ret = hugetlb_full_walk_alloc(hpte, vma, address, hgm_size);
+	if (hgm_ret) {
+		WARN_ON_ONCE(hgm_ret);
+		/*
+		 * If splitting with HGM fails, return as if HGM were not
+		 * eligible or not enabled.
+		 */
+		return VM_FAULT_HWPOISON_LARGE |
+		       VM_FAULT_SET_HINDEX(hstate_index(hstate_vma(vma)));
+	}
+	goto recheck;
+}
+
 #endif /* CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING */
 
 /*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 47b935918ceb..9093ba53feed 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1957,6 +1957,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
 		raw_hwp->nr_vmas_mapped = 0;
 		raw_hwp->nr_expected_unmaps = 0;
 		raw_hwp->nr_actual_unmaps = 0;
+		raw_hwp->index = folio_page_idx(folio, page);
 		llist_add(&raw_hwp->node, head);
 		if (hgm_enabled)
 			/*
-- 
2.40.1.495.gc816e09b53d-goog
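
For reference, below is a minimal userspace sketch (not part of this patch) of
how the new behavior could be exercised: the same hugetlb memfd is mapped
twice, poison is injected into one raw (4KB) page through the first mapping
with MADV_HWPOISON, and fresh faults are then taken through the second,
never-populated mapping so they go through hugetlb_no_page. With HGM eligible
on the VMA, only the access at the poisoned raw page is expected to SIGBUS.
The 2MB hugepage size, the 64KB poison offset, and the availability of
MADV_HWPOISON (CAP_SYS_ADMIN, CONFIG_MEMORY_FAILURE) are assumptions of the
demo, not requirements stated by the patch.

/* hwpoison-hgm-demo.c: illustrative only; names and sizes are assumptions. */
#define _GNU_SOURCE
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL << 20)	/* assumes 2MB default hugepage size */
#define POISON_OFFSET	(64UL << 10)	/* raw page chosen for injection */

static sigjmp_buf env;

static void sigbus_handler(int sig)
{
	(void)sig;
	siglongjmp(env, 1);
}

static void probe(char *addr, const char *what)
{
	if (sigsetjmp(env, 1) == 0) {
		*addr = 1;
		printf("%s: access OK\n", what);
	} else {
		printf("%s: SIGBUS\n", what);
	}
}

int main(void)
{
	int fd = memfd_create("hwpoison-demo", MFD_HUGETLB);
	char *a, *b;

	if (fd < 0 || ftruncate(fd, HPAGE_SIZE))
		return 1;

	/* Two mappings of the same hugepage: inject via @a, fault via @b. */
	a = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	b = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (a == MAP_FAILED || b == MAP_FAILED)
		return 1;

	signal(SIGBUS, sigbus_handler);

	/* Fault the hugepage in, then hard-offline one raw page. */
	a[0] = 1;
	if (madvise(a + POISON_OFFSET, getpagesize(), MADV_HWPOISON)) {
		perror("madvise(MADV_HWPOISON)");
		return 1;
	}

	/* Fresh faults through @b go through hugetlb_no_page(). */
	probe(b, "healthy raw page");			/* expected: access OK */
	probe(b + POISON_OFFSET, "poisoned raw page");	/* expected: SIGBUS */

	return 0;
}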