Re: [PATCH] shmem: support huge_fault to avoid pmd split

On Tue, Jul 26, 2022 at 5:43 AM Liu Zixian <liuzixian4@xxxxxxxxxx> wrote:
>
> Transparent hugepages on tmpfs are useful for reducing TLB misses, but
> they get split during copy-on-write faults. This happens when we
> mprotect and rewrite a code segment (which is a private file mapping)
> to hotpatch a running process.

As Matthew said, this is intentional: a tradeoff between memory
consumption and performance. Beyond that, file COW is more
complicated, and AFAIK THP is not actually supported for private file
mappings, since the COWed pages are anonymous pages mapped into a file
VMA. So private-mapping THP support would have to be added before your
usecase can be supported.

>
> We can avoid the splitting by adding a huge_fault function.
>
> Signed-off-by: Liu Zixian <liuzixian4@xxxxxxxxxx>
> ---
>  mm/shmem.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 46 insertions(+)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index a6f565308..12b2b5140 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2120,6 +2120,51 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
>         return ret;
>  }
>
> +static vm_fault_t shmem_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
> +{
> +       vm_fault_t ret = VM_FAULT_FALLBACK;
> +       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> +       struct page *old_page, *new_page;
> +       gfp_t gfp_flags = GFP_HIGHUSER_MOVABLE | __GFP_COMP;
> +
> +       /* read or shared fault will not split huge pmd */
> +       if (!(vmf->flags & FAULT_FLAG_WRITE)
> +                       || (vmf->vma->vm_flags & VM_SHARED))
> +               return VM_FAULT_FALLBACK;
> +       if (pe_size != PE_SIZE_PMD)
> +               return VM_FAULT_FALLBACK;
> +
> +       if (pmd_none(*vmf->pmd)) {
> +               if (shmem_fault(vmf) & VM_FAULT_ERROR)
> +                       goto out;
> +               if (!PageTransHuge(vmf->page))
> +                       goto out;
> +               old_page = vmf->page;
> +       } else {
> +               old_page = pmd_page(*vmf->pmd);
> +               page_remove_rmap(old_page, vmf->vma, true);
> +               pmdp_huge_clear_flush(vmf->vma, haddr, vmf->pmd);
> +               add_mm_counter(vmf->vma->vm_mm, MM_SHMEMPAGES, -HPAGE_PMD_NR);
> +       }
> +
> +       new_page = &vma_alloc_folio(gfp_flags, HPAGE_PMD_ORDER,
> +                       vmf->vma, haddr, true)->page;
> +       if (!new_page)
> +               goto out;
> +       prep_transhuge_page(new_page);
> +       copy_user_huge_page(new_page, old_page, haddr, vmf->vma, HPAGE_PMD_NR);
> +       __SetPageUptodate(new_page);
> +
> +       ret = do_set_pmd(vmf, new_page);

This also looks totally wrong IIUC. You are actually allocating
anonymous pages, but do_set_pmd() is meant for file pages. So all the
manipulation of the anonymous rmap, accounting, LRU, memcg, etc. is
actually missing.

> +
> +out:
> +       if (vmf->page) {
> +               unlock_page(vmf->page);
> +               put_page(vmf->page);
> +       }
> +       return ret;
> +}
> +
>  unsigned long shmem_get_unmapped_area(struct file *file,
>                                       unsigned long uaddr, unsigned long len,
>                                       unsigned long pgoff, unsigned long flags)
> @@ -3884,6 +3929,7 @@ static const struct super_operations shmem_ops = {
>
>  static const struct vm_operations_struct shmem_vm_ops = {
>         .fault          = shmem_fault,
> +       .huge_fault     = shmem_huge_fault,
>         .map_pages      = filemap_map_pages,
>  #ifdef CONFIG_NUMA
>         .set_policy     = shmem_set_policy,
> --
> 2.33.0