Instead of splitting the compound page with FOLL_SPLIT, allow uprobe to split only the PMD mapping for huge pages. Introduce a helper function, mm_address_trans_huge(mm, address), to test whether the address in mm is mapped by a transparent huge page (THP) at the PMD level. If preallocating the page table for the PMD split fails, fall back to FOLL_SPLIT. Signed-off-by: Song Liu <songliubraving@xxxxxx> --- include/linux/huge_mm.h | 8 ++++++++ kernel/events/uprobes.c | 38 ++++++++++++++++++++++++++++++++------ mm/huge_memory.c | 24 ++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2d8a40fd06e4..4832d6580969 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -163,6 +163,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page, pgtable_t prealloc_pgtable); +bool mm_address_trans_huge(struct mm_struct *mm, unsigned long address); + void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); @@ -302,6 +304,12 @@ static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page, pgtable_t prealloc_pgtable) {} +static inline bool mm_address_trans_huge(struct mm_struct *mm, + unsigned long address) +{ + return false; +} + #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ba49da99d2a2..56eeccc2f7a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -26,6 +26,7 @@ #include <linux/percpu-rwsem.h> #include <linux/task_work.h> #include <linux/shmem_fs.h> +#include <asm/pgalloc.h> #include <linux/uprobes.h> @@ -153,7 +154,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, { struct mm_struct *mm = vma->vm_mm; struct page_vma_mapped_walk pvmw = { - .page = old_page, + .page = compound_head(old_page), .vma = vma, .address = addr, }; @@ -165,8 +166,6 @@ static 
int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); - VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); - if (!orig) { err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, false); @@ -188,7 +187,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, get_page(new_page); if (orig) { - page_add_file_rmap(new_page, false); + page_add_file_rmap(compound_head(new_page), + PageTransHuge(compound_head(new_page))); inc_mm_counter(mm, mm_counter_file(new_page)); dec_mm_counter(mm, MM_ANONPAGES); } else { @@ -207,7 +207,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, false); + page_remove_rmap(compound_head(old_page), + PageTransHuge(compound_head(old_page))); if (!page_mapped(old_page)) try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); @@ -475,17 +476,42 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; pgoff_t index; + pgtable_t prealloc_pgtable = NULL; + unsigned long foll_flags = FOLL_FORCE; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); + /* do not FOLL_SPLIT yet */ + ret = get_user_pages_remote(NULL, mm, vaddr, 1, + foll_flags, &old_page, &vma, NULL); + + if (ret <= 0) + return ret; + + if (mm_address_trans_huge(mm, vaddr)) { + prealloc_pgtable = pte_alloc_one(mm); + if (likely(prealloc_pgtable)) { + split_huge_pmd_address(vma, vaddr, false, NULL, + prealloc_pgtable); + goto verify; + } else { + /* fallback to FOLL_SPLIT */ + foll_flags |= FOLL_SPLIT; + put_page(old_page); + } + } else { + goto verify; + } + retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT, &old_page, 
&vma, NULL); + foll_flags, &old_page, &vma, NULL); if (ret <= 0) return ret; +verify: ret = verify_opcode(old_page, vaddr, &opcode); if (ret <= 0) goto put_old; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index dcb0e30213af..4714871353c0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2360,6 +2360,30 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, ____split_huge_pmd(vma, pmd, address, freeze, page, prealloc_pgtable); } +bool mm_address_trans_huge(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return false; + + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return false; + + pud = pud_offset(p4d, address); + if (!pud_present(*pud)) + return false; + + pmd = pmd_offset(pud, address); + + return pmd_trans_huge(*pmd); +} + void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, -- 2.17.1