Re: [RFC PATCH v2 2/2] Implement sharing/unsharing of PMDs for FS/DAX

"Kirill A. Shutemov" <kirill@xxxxxxxxxxxxx> · Wed, 12 Jun 2019 05:33:36 +0300

On Fri, Jun 07, 2019 at 12:51:03PM -0700, Larry Bassel wrote:
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 3a54c9d..1c1ed4e 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4653,9 +4653,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
>  }
>  
>  #ifdef CONFIG_ARCH_HAS_HUGE_PMD_SHARE
> -static unsigned long page_table_shareable(struct vm_area_struct *svma,
> -				struct vm_area_struct *vma,
> -				unsigned long addr, pgoff_t idx)
> +unsigned long page_table_shareable(struct vm_area_struct *svma,
> +				   struct vm_area_struct *vma,
> +				   unsigned long addr, pgoff_t idx)
>  {
>  	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
>  				svma->vm_start;
> @@ -4678,7 +4678,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
>  	return saddr;
>  }
>  
> -static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
> +bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
>  {
>  	unsigned long base = addr & PUD_MASK;
>  	unsigned long end = base + PUD_SIZE;

This is going to be build error. mm/hugetlb.o doesn't build unlessp CONFIG_HUGETLBFS=y.

And I think both functions doesn't cover all DAX cases: VMA can be not
aligned (due to vm_start and/or vm_pgoff) to 2M even if the file has 2M
ranges allocated. See transhuge_vma_suitable().

And as I said before, nothing guarantees contiguous 2M ranges on backing
storage.

And in general I found piggybacking on hugetlb hacky.

The solution has to stand on its own with own justification. Saying it
worked for hugetlb and it has to work here would not fly. hugetlb is much
more restrictive on use cases. THP has more corner cases.

> diff --git a/mm/memory.c b/mm/memory.c
> index ddf20bd..1ca8f75 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3932,6 +3932,109 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_ARCH_HAS_HUGE_PMD_SHARE
> +static pmd_t *huge_pmd_offset(struct mm_struct *mm,
> +			      unsigned long addr, unsigned long sz)
> +{
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +
> +	pgd = pgd_offset(mm, addr);
> +	if (!pgd_present(*pgd))
> +		return NULL;
> +	p4d = p4d_offset(pgd, addr);
> +	if (!p4d_present(*p4d))
> +		return NULL;
> +
> +	pud = pud_offset(p4d, addr);
> +	if (sz != PUD_SIZE && pud_none(*pud))
> +		return NULL;
> +	/* hugepage or swap? */
> +	if (pud_huge(*pud) || !pud_present(*pud))
> +		return (pmd_t *)pud;

So do we or do we not support PUD pages? This is just broken.
> +
> +	pmd = pmd_offset(pud, addr);
> +	if (sz != PMD_SIZE && pmd_none(*pmd))
> +		return NULL;
> +	/* hugepage or swap? */
> +	if (pmd_huge(*pmd) || !pmd_present(*pmd))
> +		return pmd;
> +
> +	return NULL;
> +}
> +

-- 
 Kirill A. Shutemov