"Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxx> writes: > On 5/20/21 6:16 PM, Peter Xu wrote: >> On Thu, May 20, 2021 at 01:56:54PM +0530, Aneesh Kumar K.V wrote: >>>> This seems to work at least for my userfaultfd test on shmem, however I don't >>>> fully understand the commit message [1] on: How do we guarantee we're not >>>> moving a thp pte? >>>> >>> >>> move_page_tables() checks for pmd_trans_huge() and ends up calling >>> move_huge_pmd if it is a THP entry. >> >> Sorry to be unclear: what if a huge pud thp? >> > > I am still checking. Looking at the code before commit > c49dd340180260c6239e453263a9a244da9a7c85, I don't see kernel handling > huge pud thp. I haven't studied huge pud thp enough to understand > whether c49dd340180260c6239e453263a9a244da9a7c85 intent to add that > support. > > We can do a move_huge_pud() like we do for huge pmd thp. But I am not > sure whether we handle those VMA's earlier and restrict mremap on them? something like this? (not even compile tested). I am still not sure whether this is really needed or we handle DAX VMA's in some other form. diff --git a/mm/mremap.c b/mm/mremap.c index 47c255b60150..037a7bd311f1 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -324,10 +324,51 @@ static inline bool move_normal_pud(struct vm_area_struct *vma, } #endif + +static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) +{ + spinlock_t *old_ptl, *new_ptl; + struct mm_struct *mm = vma->vm_mm; + pud_t pud; + + /* + * The destination pud shouldn't be established, free_pgtables() + * should have released it. + */ + if (WARN_ON_ONCE(!pud_none(*new_pud))) + return false; + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_lock prevents deadlock. + */ + old_ptl = pud_lock(vma->vm_mm, old_pud); + new_ptl = pud_lockptr(mm, new_pud); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + + /* Clear the pud */ + pud = *old_pud; + pud_clear(old_pud); + + VM_BUG_ON(!pud_none(*new_pud)); + + /* Set the new pud */ + set_pud_at(mm, new_addr, new_pud, pud); + flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + + return true; +} + enum pgt_entry { NORMAL_PMD, HPAGE_PMD, NORMAL_PUD, + HPAGE_PUD, }; /* @@ -347,6 +388,7 @@ static __always_inline unsigned long get_extent(enum pgt_entry entry, mask = PMD_MASK; size = PMD_SIZE; break; + case HPAGE_PUD: case NORMAL_PUD: mask = PUD_MASK; size = PUD_SIZE; @@ -395,6 +437,12 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, move_huge_pmd(vma, old_addr, new_addr, old_entry, new_entry); break; + case HPAGE_PUD: + moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE_PUD) && + move_huge_pud(vma, old_addr, new_addr, old_entry, + new_entry); + break; + default: WARN_ON_ONCE(1); break; @@ -429,15 +477,23 @@ unsigned long move_page_tables(struct vm_area_struct *vma, * PUD level if possible. 
*/ extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr); - if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) { - pud_t *old_pud, *new_pud; - old_pud = get_old_pud(vma->vm_mm, old_addr); - if (!old_pud) + old_pud = get_old_pud(vma->vm_mm, old_addr); + if (!old_pud) + continue; + new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr); + if (!new_pud) + break; + if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) { + if (extent == HPAGE_PUD_SIZE) { + move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr, + old_pud, new_pud, need_rmap_locks); + /* We ignore and continue on error? */ continue; - new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr); - if (!new_pud) - break; + } + } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) { + pud_t *old_pud, *new_pud; + if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr, old_pud, new_pud, need_rmap_locks)) continue; > > Are huge pud thp only allowed with DAX vmas? > > > -aneesh
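
For comparison, here is the existing pmd-level dispatch in
move_page_tables() that the quoted reply refers to. This is a
paraphrase of mm/mremap.c as of roughly v5.12, not part of the patch
above, and the exact lines may differ:

        extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
        old_pmd = get_old_pmd(vma->vm_mm, old_addr);
        if (!old_pmd)
                continue;
        new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
        if (!new_pmd)
                break;
        if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
            pmd_devmap(*old_pmd)) {
                /* A huge pmd is moved as a single unit when it fits... */
                if (extent == HPAGE_PMD_SIZE &&
                    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
                                   old_pmd, new_pmd, need_rmap_locks))
                        continue;
                /* ...otherwise it is split and moved as normal ptes. */
                split_huge_pmd(vma, old_pmd, old_addr);
                if (pmd_trans_unstable(old_pmd))
                        continue;
        } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
                   extent == PMD_SIZE) {
                /* optimized move of a normal (non-huge) pmd, elided */
        }

The proposed pud hunk mirrors this structure, except that it has no
split fallback at the pud level, which appears to be why the failure
case is only marked with the "/* We ignore and continue on error? */"
comment rather than handled.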