On Fri, Mar 24, 2023 at 8:34 AM Jiaqi Yan <jiaqiyan@xxxxxxxxxx> wrote: > > On Thu, Mar 23, 2023 at 2:38 PM Yang Shi <shy828301@xxxxxxxxx> wrote: > > > > On Sat, Mar 4, 2023 at 10:51 PM Jiaqi Yan <jiaqiyan@xxxxxxxxxx> wrote: > > > > > > Make __collapse_huge_page_copy return whether copying anonymous pages > > > succeeded, and make collapse_huge_page handle the return status. > > > > > > Break existing PTE scan loop into two for-loops. The first loop copies > > > source pages into target huge page, and can fail gracefully when running > > > into memory errors in source pages. If copying all pages succeeds, the > > > second loop releases and clears up these normal pages. Otherwise, the > > > second loop rolls back the page table and page states by: > > > - re-establishing the original PTEs-to-PMD connection. > > > - releasing source pages back to their LRU list. > > > > > > Tested manually: > > > 0. Enable khugepaged on system under test. > > > 1. Start a two-thread application. Each thread allocates a chunk of > > > non-huge anonymous memory buffer. > > > 2. Pick 4 random buffer locations (2 in each thread) and inject > > > uncorrectable memory errors at corresponding physical addresses. > > > 3. Signal both threads to make their memory buffer collapsible, i.e. > > > calling madvise(MADV_HUGEPAGE). > > > 4. Wait and check kernel log: khugepaged is able to recover from poisoned > > > pages and skips collapsing them. > > > 5. Signal both threads to inspect their buffer contents and make sure no > > > data corruption. 
> > > > > > Signed-off-by: Jiaqi Yan <jiaqiyan@xxxxxxxxxx> > > > --- > > > include/trace/events/huge_memory.h | 3 +- > > > mm/khugepaged.c | 148 ++++++++++++++++++++++++----- > > > 2 files changed, 128 insertions(+), 23 deletions(-) > > > > > > diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h > > > index 3e6fb05852f9a..46cce509957ba 100644 > > > --- a/include/trace/events/huge_memory.h > > > +++ b/include/trace/events/huge_memory.h > > > @@ -36,7 +36,8 @@ > > > EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ > > > EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ > > > EM( SCAN_TRUNCATED, "truncated") \ > > > - EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ > > > + EM( SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ > > > + EMe(SCAN_COPY_MC, "copy_poisoned_page") \ > > > > > > #undef EM > > > #undef EMe > > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > > > index 27956d4404134..c3c217f6ebc6e 100644 > > > --- a/mm/khugepaged.c > > > +++ b/mm/khugepaged.c > > > @@ -19,6 +19,7 @@ > > > #include <linux/page_table_check.h> > > > #include <linux/swapops.h> > > > #include <linux/shmem_fs.h> > > > +#include <linux/kmsan.h> > > > > > > #include <asm/tlb.h> > > > #include <asm/pgalloc.h> > > > @@ -55,6 +56,7 @@ enum scan_result { > > > SCAN_CGROUP_CHARGE_FAIL, > > > SCAN_TRUNCATED, > > > SCAN_PAGE_HAS_PRIVATE, > > > + SCAN_COPY_MC, > > > }; > > > > > > #define CREATE_TRACE_POINTS > > > @@ -681,47 +683,47 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, > > > return result; > > > } > > > > > > -static void __collapse_huge_page_copy(pte_t *pte, struct page *page, > > > - struct vm_area_struct *vma, > > > - unsigned long address, > > > - spinlock_t *ptl, > > > - struct list_head *compound_pagelist) > > > +static void __collapse_huge_page_copy_succeeded(pte_t *pte, > > > + pmd_t *pmd, > > > + struct vm_area_struct *vma, > > > + unsigned long address, > > > + spinlock_t *pte_ptl, > > > + struct 
list_head *compound_pagelist) > > > { > > > struct page *src_page, *tmp; > > > pte_t *_pte; > > > - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; > > > - _pte++, page++, address += PAGE_SIZE) { > > > - pte_t pteval = *_pte; > > > + pte_t pteval; > > > + unsigned long _address; > > > > > > + for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; > > > + _pte++, _address += PAGE_SIZE) { > > > + pteval = *_pte; > > > if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { > > > - clear_user_highpage(page, address); > > > add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); > > > if (is_zero_pfn(pte_pfn(pteval))) { > > > /* > > > - * ptl mostly unnecessary. > > > + * pte_ptl mostly unnecessary. > > > */ > > > - spin_lock(ptl); > > > - ptep_clear(vma->vm_mm, address, _pte); > > > - spin_unlock(ptl); > > > + spin_lock(pte_ptl); > > > > Why did you have to rename ptl to pte_ptl? It seems unnecessary. > > Thanks, I will use `ptl` in the next version. > > > > > > + pte_clear(vma->vm_mm, _address, _pte); > > > + spin_unlock(pte_ptl); > > > } > > > } else { > > > src_page = pte_page(pteval); > > > - copy_user_highpage(page, src_page, address, vma); > > > if (!PageCompound(src_page)) > > > release_pte_page(src_page); > > > /* > > > - * ptl mostly unnecessary, but preempt has to > > > - * be disabled to update the per-cpu stats > > > + * pte_ptl mostly unnecessary, but preempt has > > > + * to be disabled to update the per-cpu stats > > > * inside page_remove_rmap(). 
> > > */ > > > - spin_lock(ptl); > > > - ptep_clear(vma->vm_mm, address, _pte); > > > + spin_lock(pte_ptl); > > > + ptep_clear(vma->vm_mm, _address, _pte); > > > page_remove_rmap(src_page, vma, false); > > > - spin_unlock(ptl); > > > + spin_unlock(pte_ptl); > > > free_page_and_swap_cache(src_page); > > > } > > > } > > > - > > > list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { > > > list_del(&src_page->lru); > > > mod_node_page_state(page_pgdat(src_page), > > > @@ -733,6 +735,104 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, > > > } > > > } > > > > > > +static void __collapse_huge_page_copy_failed(pte_t *pte, > > > + pmd_t *pmd, > > > + pmd_t orig_pmd, > > > + struct vm_area_struct *vma, > > > + unsigned long address, > > > + struct list_head *compound_pagelist) > > > +{ > > > + struct page *src_page, *tmp; > > > + pte_t *_pte; > > > + pte_t pteval; > > > + unsigned long _address; > > > + spinlock_t *pmd_ptl; > > > + > > > + /* > > > + * Re-establish the PMD to point to the original page table > > > + * entry. Restoring PMD needs to be done prior to releasing > > > + * pages. Since pages are still isolated and locked here, > > > + * acquiring anon_vma_lock_write is unnecessary. > > > + */ > > > + pmd_ptl = pmd_lock(vma->vm_mm, pmd); > > > + pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd)); > > > + spin_unlock(pmd_ptl); > > > + /* > > > + * Release both raw and compound pages isolated > > > + * in __collapse_huge_page_isolate. > > > + */ > > > > It looks like the below code could be replaced by release_pte_pages() > > with advancing _pte to (pte + HPAGE_PMD_NR - 1). > > > > Yeah, but I think _pte should be (pte + HPAGE_PMD_NR) because _pte is > decremented before comparison in release_pte_pages(pte, _pte, > compound_pagelist): > > while (--_pte >= pte) {...} > > Advancing _pte to (pte + HPAGE_PMD_NR - 1) may leave the last page not released. Yeah, good catch. 
I think it is because the only user of release_pte_pages() is __collapse_huge_page_isolate(). Once the loop in it is done _pte is pte + HPAGE_PMD_NR. > > > > > > + for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; > > > + _pte++, _address += PAGE_SIZE) { > > > + pteval = *_pte; > > > + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) > > > + continue; > > > + src_page = pte_page(pteval); > > > + if (!PageCompound(src_page)) > > > + release_pte_page(src_page); > > > + } > > > + list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { > > > + list_del(&src_page->lru); > > > + release_pte_page(src_page); > > > + } > > > +} > > > + > > > +/* > > > + * __collapse_huge_page_copy - attempts to copy memory contents from raw > > > + * pages to a hugepage. Cleans up the raw pages if copying succeeds; > > > + * otherwise restores the original page table and releases isolated raw pages. > > > + * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. > > > + * > > > + * @pte: starting of the PTEs to copy from > > > + * @page: the new hugepage to copy contents to > > > + * @pmd: pointer to the new hugepage's PMD > > > + * @orig_pmd: the original raw pages' PMD > > > + * @vma: the original raw pages' virtual memory area > > > + * @address: starting address to copy > > > + * @pte_ptl: lock on raw pages' PTEs > > > + * @compound_pagelist: list that stores compound pages > > > + */ > > > +static int __collapse_huge_page_copy(pte_t *pte, > > > + struct page *page, > > > + pmd_t *pmd, > > > + pmd_t orig_pmd, > > > + struct vm_area_struct *vma, > > > + unsigned long address, > > > + spinlock_t *pte_ptl, > > > + struct list_head *compound_pagelist) > > > +{ > > > + struct page *src_page; > > > + pte_t *_pte; > > > + pte_t pteval; > > > + unsigned long _address; > > > + int result = SCAN_SUCCEED; > > > + > > > + /* > > > + * Copying pages' contents is subject to memory poison at any iteration. 
> > > + */ > > > + for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR; > > > + _pte++, page++, _address += PAGE_SIZE) { > > > + pteval = *_pte; > > > + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { > > > + clear_user_highpage(page, _address); > > > + continue; > > > + } > > > + src_page = pte_page(pteval); > > > + if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) { > > > + result = SCAN_COPY_MC; > > > + break; > > > + } > > > + } > > > + > > > + if (likely(result == SCAN_SUCCEED)) > > > + __collapse_huge_page_copy_succeeded(pte, pmd, vma, address, > > > + pte_ptl, compound_pagelist); > > > + else > > > + __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, > > > + address, compound_pagelist); > > > + > > > + return result; > > > +} > > > + > > > static void khugepaged_alloc_sleep(void) > > > { > > > DEFINE_WAIT(wait); > > > @@ -1106,9 +1206,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, > > > */ > > > anon_vma_unlock_write(vma->anon_vma); > > > > > > - __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, > > > - &compound_pagelist); > > > + result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd, > > > + vma, address, pte_ptl, > > > + &compound_pagelist); > > > pte_unmap(pte); > > > + if (unlikely(result != SCAN_SUCCEED)) > > > + goto out_up_write; > > > + > > > /* > > > * spin_lock() below is not the equivalent of smp_wmb(), but > > > * the smp_wmb() inside __SetPageUptodate() can be reused to > > > -- > > > 2.40.0.rc0.216.gc4246ad0f0-goog > > >