On Wed, Jul 12, 2023 at 12:02 AM Yin Fengwei <fengwei.yin@xxxxxxxxx> wrote:
>
> Current kernel only lock base size folio during mlock syscall.
> Add large folio support with following rules:
>   - Only mlock large folio when it's in VM_LOCKED VMA range
>
>   - If there is cow folio, mlock the cow folio as cow folio
>     is also in VM_LOCKED VMA range.
>
>   - munlock will apply to the large folio which is in VMA range
>     or cross the VMA boundary.
>
> The last rule is used to handle the case that the large folio is
> mlocked, later the VMA is split in the middle of large folio
> and this large folio become cross VMA boundary.
>
> Signed-off-by: Yin Fengwei <fengwei.yin@xxxxxxxxx>
> ---
>  mm/mlock.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 99 insertions(+), 5 deletions(-)
>
> diff --git a/mm/mlock.c b/mm/mlock.c
> index 0a0c996c5c214..f49e079066870 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -305,6 +305,95 @@ void munlock_folio(struct folio *folio)
>         local_unlock(&mlock_fbatch.lock);
>  }
>
> +static inline bool should_mlock_folio(struct folio *folio,
> +                               struct vm_area_struct *vma)
> +{
> +       if (vma->vm_flags & VM_LOCKED)
> +               return (!folio_test_large(folio) ||
> +                               folio_within_vma(folio, vma));
> +
> +       /*
> +        * For unlock, allow munlock large folio which is partially
> +        * mapped to VMA. As it's possible that large folio is
> +        * mlocked and VMA is split later.
> +        *
> +        * During memory pressure, such kind of large folio can
> +        * be split. And the pages are not in VM_LOCKed VMA
> +        * can be reclaimed.
> +        */
> +
> +       return true;

Looks good, or just

  should_mlock_folio() // or whatever name you see fit, can_mlock_folio()?
  {
    return !(vma->vm_flags & VM_LOCKED) || folio_within_vma();
  }

> +}
> +
> +static inline unsigned int get_folio_mlock_step(struct folio *folio,
> +               pte_t pte, unsigned long addr, unsigned long end)
> +{
> +       unsigned int nr;
> +
> +       nr = folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte);
> +       return min_t(unsigned int, nr, (end - addr) >> PAGE_SHIFT);
> +}
> +
> +void mlock_folio_range(struct folio *folio, struct vm_area_struct *vma,
> +               pte_t *pte, unsigned long addr, unsigned int nr)
> +{
> +       struct folio *cow_folio;
> +       unsigned int step = 1;
> +
> +       mlock_folio(folio);
> +       if (nr == 1)
> +               return;
> +
> +       for (; nr > 0; pte += step, addr += (step << PAGE_SHIFT), nr -= step) {
> +               pte_t ptent;
> +
> +               step = 1;
> +               ptent = ptep_get(pte);
> +
> +               if (!pte_present(ptent))
> +                       continue;
> +
> +               cow_folio = vm_normal_folio(vma, addr, ptent);
> +               if (!cow_folio || cow_folio == folio) {
> +                       continue;
> +               }
> +
> +               mlock_folio(cow_folio);
> +               step = get_folio_mlock_step(folio, ptent,
> +                               addr, addr + (nr << PAGE_SHIFT));
> +       }
> +}
> +
> +void munlock_folio_range(struct folio *folio, struct vm_area_struct *vma,
> +               pte_t *pte, unsigned long addr, unsigned int nr)
> +{
> +       struct folio *cow_folio;
> +       unsigned int step = 1;
> +
> +       munlock_folio(folio);
> +       if (nr == 1)
> +               return;
> +
> +       for (; nr > 0; pte += step, addr += (step << PAGE_SHIFT), nr -= step) {
> +               pte_t ptent;
> +
> +               step = 1;
> +               ptent = ptep_get(pte);
> +
> +               if (!pte_present(ptent))
> +                       continue;
> +
> +               cow_folio = vm_normal_folio(vma, addr, ptent);
> +               if (!cow_folio || cow_folio == folio) {
> +                       continue;
> +               }
> +
> +               munlock_folio(cow_folio);
> +               step = get_folio_mlock_step(folio, ptent,
> +                               addr, addr + (nr << PAGE_SHIFT));
> +       }
> +}

I'll finish the above later.
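Side note, since the step arithmetic in get_folio_mlock_step() is easy to misread: below is a minimal userspace mirror of it, only to show how the walk skips the rest of a large folio. The step_pages() helper and the pfn/size numbers are made up for illustration; this is not the kernel code path.

  #include <stdio.h>

  #define PAGE_SHIFT 12

  /* Mirrors: nr = folio_pfn + folio_nr_pages - pte_pfn, capped to the range. */
  static unsigned int step_pages(unsigned long folio_pfn, unsigned int folio_nr,
                                 unsigned long pte_pfn, unsigned long addr,
                                 unsigned long end)
  {
          /* pages of the folio left, starting at the page this pte maps */
          unsigned int in_folio = folio_pfn + folio_nr - pte_pfn;
          /* pages left in the range being walked */
          unsigned int in_range = (end - addr) >> PAGE_SHIFT;

          return in_folio < in_range ? in_folio : in_range;
  }

  int main(void)
  {
          /*
           * 16-page folio at pfn 0x1000, current pte maps pfn 0x1004, and
           * 16 pages are left in the walked range: 12 pages remain in the
           * folio, so the walk advances by 12.
           */
          printf("%u\n", step_pages(0x1000, 16, 0x1004, 0x200000, 0x210000));
          return 0;
  }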
>  static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>                             unsigned long end, struct mm_walk *walk)
>
> @@ -314,6 +403,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>         pte_t *start_pte, *pte;
>         pte_t ptent;
>         struct folio *folio;
> +       unsigned int step = 1;
>
>         ptl = pmd_trans_huge_lock(pmd, vma);
>         if (ptl) {
> @@ -329,24 +419,28 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>                 goto out;
>         }
>
> -       start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> +       pte = start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>         if (!start_pte) {
>                 walk->action = ACTION_AGAIN;
>                 return 0;
>         }
> -       for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
> +
> +       for (; addr != end; pte += step, addr += (step << PAGE_SHIFT)) {
> +               step = 1;
>                 ptent = ptep_get(pte);
>                 if (!pte_present(ptent))
>                         continue;
>                 folio = vm_normal_folio(vma, addr, ptent);
>                 if (!folio || folio_is_zone_device(folio))
>                         continue;
> -               if (folio_test_large(folio))
> +               if (!should_mlock_folio(folio, vma))
>                         continue;
> +
> +               step = get_folio_mlock_step(folio, ptent, addr, end);
>                 if (vma->vm_flags & VM_LOCKED)
> -                       mlock_folio(folio);
> +                       mlock_folio_range(folio, vma, pte, addr, step);
>                 else
> -                       munlock_folio(folio);
> +                       munlock_folio_range(folio, vma, pte, addr, step);
>         }
>         pte_unmap(start_pte);
>  out:

Looks good.
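As an aside, the VMA-split case the last changelog rule talks about can be set up from userspace roughly like below. This is only a sketch: whether the region is really backed by a PMD-size folio depends on alignment and THP settings, so treat it as an illustration of the scenario, not a guaranteed reproducer.

  #include <stddef.h>
  #include <string.h>
  #include <sys/mman.h>

  int main(void)
  {
          size_t len = 2UL << 20;                 /* 2MB region */
          char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          if (p == MAP_FAILED)
                  return 1;

          madvise(p, len, MADV_HUGEPAGE);         /* ask for a large folio */
          memset(p, 1, len);                      /* fault the whole range in */

          mlock(p, len);                          /* folio sits in a VM_LOCKED VMA */

          /* split the VMA in the middle; a large folio may now cross the boundary */
          mprotect(p + len / 2, len / 2, PROT_READ);

          /* munlock on one half has to cope with a folio crossing the VMA boundary */
          munlock(p, len / 2);

          munmap(p, len);
          return 0;
  }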