Re: [RFC v1 2/2] mm: khugepaged: recover from poisoned file-backed memory

Yang Shi <shy828301@xxxxxxxxx> · Mon, 28 Mar 2022 16:37:05 -0700

On Wed, Mar 23, 2022 at 4:29 PM Jiaqi Yan <jiaqiyan@xxxxxxxxxx> wrote:
>
> Make collapse_file roll back when copying pages failed.
> More concretely:
> * extract copy operations into a separate loop
> * postpone the updates for nr_none until both scan and copy succeeded
> * postpone joining small xarray entries until both scan and copy
>   succeeded
> * as for update operations to NR_XXX_THPS
>     * for SHMEM file, postpone until both scan and copy succeeded
>     * for other file, roll back if scan succeeded but copy failed
>
> Signed-off-by: Jiaqi Yan <jiaqiyan@xxxxxxxxxx>
> ---
>  include/linux/highmem.h | 18 ++++++++++
>  mm/khugepaged.c         | 75 +++++++++++++++++++++++++++--------------
>  2 files changed, 67 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/highmem.h b/include/linux/highmem.h
> index 15d0aa4d349c..fc5aa221bdb5 100644
> --- a/include/linux/highmem.h
> +++ b/include/linux/highmem.h
> @@ -315,6 +315,24 @@ static inline void copy_highpage(struct page *to, struct page *from)
>         kunmap_local(vfrom);
>  }
>
> +/*
> + * Machine check exception handled version of copy_highpage.
> + * Return true if copying page content failed; otherwise false.
> + */
> +static inline bool copy_highpage_mc(struct page *to, struct page *from)
> +{
> +       char *vfrom, *vto;
> +       unsigned long ret;
> +
> +       vfrom = kmap_local_page(from);
> +       vto = kmap_local_page(to);
> +       ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
> +       kunmap_local(vto);
> +       kunmap_local(vfrom);
> +
> +       return ret > 0;
> +}
> +
>  #endif
>
>  static inline void memcpy_page(struct page *dst_page, size_t dst_off,
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 84ed177f56ff..ed2b1cd4bbc6 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1708,12 +1708,13 @@ static void collapse_file(struct mm_struct *mm,
>  {
>         struct address_space *mapping = file->f_mapping;
>         gfp_t gfp;
> -       struct page *new_page;
> +       struct page *new_page, *page, *tmp;

It seems you removed the "struct page *page" from " if (result ==
SCAN_SUCCEED)", but keep the "struct page *page" under "for (index =
start; index < end; index++)". I think the "struct page *page" in the
for loop could be removed too.

>         pgoff_t index, end = start + HPAGE_PMD_NR;
>         LIST_HEAD(pagelist);
>         XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
>         int nr_none = 0, result = SCAN_SUCCEED;
>         bool is_shmem = shmem_file(file);
> +       bool copy_failed = false;
>         int nr;
>
>         VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
> @@ -1936,9 +1937,7 @@ static void collapse_file(struct mm_struct *mm,
>         }
>         nr = thp_nr_pages(new_page);
>
> -       if (is_shmem)
> -               __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
> -       else {
> +       if (!is_shmem) {
>                 __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
>                 filemap_nr_thps_inc(mapping);
>                 /*
> @@ -1956,34 +1955,39 @@ static void collapse_file(struct mm_struct *mm,
>                 }
>         }
>
> -       if (nr_none) {
> -               __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
> -               if (is_shmem)
> -                       __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
> -       }
> -
> -       /* Join all the small entries into a single multi-index entry */
> -       xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> -       xas_store(&xas, new_page);
>  xa_locked:
>         xas_unlock_irq(&xas);
>  xa_unlocked:
>
>         if (result == SCAN_SUCCEED) {
> -               struct page *page, *tmp;
> -
>                 /*
>                  * Replacing old pages with new one has succeeded, now we
> -                * need to copy the content and free the old pages.
> +                * attempt to copy the contents.
>                  */
>                 index = start;
> -               list_for_each_entry_safe(page, tmp, &pagelist, lru) {
> +               list_for_each_entry(page, &pagelist, lru) {
>                         while (index < page->index) {
>                                 clear_highpage(new_page + (index % HPAGE_PMD_NR));
>                                 index++;
>                         }
> -                       copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
> -                                       page);
> +                       if (copy_highpage_mc(new_page + (page->index % HPAGE_PMD_NR), page)) {
> +                               copy_failed = true;
> +                               break;
> +                       }
> +                       index++;
> +               }
> +               while (!copy_failed && index < end) {
> +                       clear_highpage(new_page + (page->index % HPAGE_PMD_NR));
> +                       index++;
> +               }
> +       }
> +
> +       if (result == SCAN_SUCCEED && !copy_failed) {

I think you could set "result" to SCAN_COPY_MC (as same as the
anonymous one), then you could drop !copy_failed and use "result"
alone afterwards.

> +               /*
> +                * Copying old pages to huge one has succeeded, now we
> +                * need to free the old pages.
> +                */
> +               list_for_each_entry_safe(page, tmp, &pagelist, lru) {
>                         list_del(&page->lru);
>                         page->mapping = NULL;
>                         page_ref_unfreeze(page, 1);
> @@ -1991,12 +1995,20 @@ static void collapse_file(struct mm_struct *mm,
>                         ClearPageUnevictable(page);
>                         unlock_page(page);
>                         put_page(page);
> -                       index++;
>                 }
> -               while (index < end) {
> -                       clear_highpage(new_page + (index % HPAGE_PMD_NR));
> -                       index++;
> +
> +               xas_lock_irq(&xas);
> +               if (is_shmem)
> +                       __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
> +               if (nr_none) {
> +                       __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
> +                       if (is_shmem)
> +                               __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
>                 }
> +               /* Join all the small entries into a single multi-index entry. */
> +               xas_set_order(&xas, start, HPAGE_PMD_ORDER);
> +               xas_store(&xas, new_page);
> +               xas_unlock_irq(&xas);
>
>                 SetPageUptodate(new_page);
>                 page_ref_add(new_page, HPAGE_PMD_NR - 1);
> @@ -2012,9 +2024,11 @@ static void collapse_file(struct mm_struct *mm,
>
>                 khugepaged_pages_collapsed++;
>         } else {
> -               struct page *page;
> -
> -               /* Something went wrong: roll back page cache changes */
> +               /*
> +                * Something went wrong:
> +                * either result != SCAN_SUCCEED or copy_failed,
> +                * roll back page cache changes
> +                */
>                 xas_lock_irq(&xas);
>                 mapping->nrpages -= nr_none;
>
> @@ -2047,6 +2061,15 @@ static void collapse_file(struct mm_struct *mm,
>                         xas_lock_irq(&xas);
>                 }
>                 VM_BUG_ON(nr_none);
> +               /*
> +                * Undo the updates of thp_nr_pages(new_page) for non-SHMEM file,
> +                * which is not updated yet for SHMEM file.
> +                * These undos are not needed if result is not SCAN_SUCCEED.
> +                */
> +               if (!is_shmem && result == SCAN_SUCCEED) {

Handling the error case when "result == SCAN_SUCCEED" looks awkward.
With the above fixed (set result to SCAN_COPY_MC) we could avoid the
awkwardness.

> +                       __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);

I'm wondering whether we may defer NR_FILE_THPS update just like SHMEM
because it has not to be updated in advance so that we have the user
visible counters update in the single place. Just filemap_nr_thps
needs to be updated in advance since it is used to sync with file open
path to truncate huge pages.

> +                       filemap_nr_thps_dec(mapping);
> +               }
>                 xas_unlock_irq(&xas);
>
>                 new_page->mapping = NULL;
> --
> 2.35.1.894.gb6a874cedc-goog
>