On Fri, Mar 21, 2025 at 12:37:13PM +0100, David Hildenbrand wrote:

SNIP

> +static int __uprobe_write_opcode(struct vm_area_struct *vma,
> +                struct folio_walk *fw, struct folio *folio,
> +                unsigned long opcode_vaddr, uprobe_opcode_t opcode)
> +{
> +        const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
> +        const bool is_register = !!is_swbp_insn(&opcode);
> +        bool pmd_mappable;
> +
> +        /* For now, we'll only handle PTE-mapped folios. */
> +        if (fw->level != FW_LEVEL_PTE)
> +                return -EFAULT;
> +
> +        /*
> +         * See can_follow_write_pte(): we'd actually prefer a writable PTE here,
> +         * but the VMA might not be writable.
> +         */
> +        if (!pte_write(fw->pte)) {
> +                if (!PageAnonExclusive(fw->page))
> +                        return -EFAULT;
> +                if (unlikely(userfaultfd_pte_wp(vma, fw->pte)))
> +                        return -EFAULT;
> +                /* SOFTDIRTY is handled via pte_mkdirty() below. */
> +        }
> +
> +        /*
> +         * We'll temporarily unmap the page and flush the TLB, such that we can
> +         * modify the page atomically.
> +         */
> +        flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
> +        fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
> +        copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
> +
> +        /*
> +         * When unregistering, we may only zap a PTE if uffd is disabled and
> +         * there are no unexpected folio references ...
> +         */
> +        if (is_register || userfaultfd_missing(vma) ||
> +            (folio_ref_count(folio) != folio_mapcount(folio) + 1 +
> +             folio_test_swapcache(folio) * folio_nr_pages(folio)))
> +                goto remap;
> +
> +        /*
> +         * ... and the mapped page is identical to the original page that
> +         * would get faulted in on next access.
> +         */
> +        if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable))
> +                goto remap;
> +
> +        dec_mm_counter(vma->vm_mm, MM_ANONPAGES);
> +        folio_remove_rmap_pte(folio, fw->page, vma);
> +        if (!folio_mapped(folio) && folio_test_swapcache(folio) &&
> +            folio_trylock(folio)) {
> +                folio_free_swap(folio);
> +                folio_unlock(folio);
> +        }
> +        folio_put(folio);

hi,
it's probably ok and I'm missing something, but why do we call
folio_put() here? I'd think it's done by the folio_put() call in
uprobe_write_opcode() (rough sketch of how I read the reference flow
at the bottom of this mail).

thanks,
jirka

> +
> +        return pmd_mappable;
> +remap:
> +        /*
> +         * Make sure that our copy_to_page() changes become visible before the
> +         * set_pte_at() write.
> +         */
> +        smp_wmb();
> +        /* We modified the page. Make sure to mark the PTE dirty. */
> +        set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte));
> +        return 0;
> +}
> +
>  /*
>   * NOTE:
>   * Expect the breakpoint instruction to be the smallest size instruction for
> @@ -475,116 +480,115 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
>   * uprobe_write_opcode - write the opcode at a given virtual address.
>   * @auprobe: arch specific probepoint information.
>   * @vma: the probed virtual memory area.
> - * @vaddr: the virtual address to store the opcode.
> - * @opcode: opcode to be written at @vaddr.
> + * @opcode_vaddr: the virtual address to store the opcode.
> + * @opcode: opcode to be written at @opcode_vaddr.
>   *
>   * Called with mm->mmap_lock held for read or write.
>   * Return 0 (success) or a negative errno.
>   */
>  int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
> -                unsigned long vaddr, uprobe_opcode_t opcode)
> +                const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
>  {
> +        const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
>          struct mm_struct *mm = vma->vm_mm;
>          struct uprobe *uprobe;
> -        struct page *old_page, *new_page;
>          int ret, is_register, ref_ctr_updated = 0;
> -        bool orig_page_huge = false;
>          unsigned int gup_flags = FOLL_FORCE;
> +        struct mmu_notifier_range range;
> +        struct folio_walk fw;
> +        struct folio *folio;
> +        struct page *page;
>
>          is_register = is_swbp_insn(&opcode);
>          uprobe = container_of(auprobe, struct uprobe, arch);
>
> -retry:
> +        if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
> +                return -EINVAL;
> +
> +        /*
> +         * When registering, we have to break COW to get an exclusive anonymous
> +         * page that we can safely modify. Use FOLL_WRITE to trigger a write
> +         * fault if required. When unregistering, we might be lucky and the
> +         * anon page is already gone. So defer write faults until really
> +         * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
> +         * cannot deal with PMDs yet.
> +         */
>          if (is_register)
> -                gup_flags |= FOLL_SPLIT_PMD;
> -        /* Read the page with vaddr into memory */
> -        ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &old_page, NULL);
> -        if (ret != 1)
> -                return ret;
> +                gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
>
> -        ret = verify_opcode(old_page, vaddr, &opcode);
> +retry:
> +        ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL);
>          if (ret <= 0)
> -                goto put_old;
> -
> -        if (is_zero_page(old_page)) {
> -                ret = -EINVAL;
> -                goto put_old;
> -        }
> +                goto out;
> +        folio = page_folio(page);
>
> -        if (WARN(!is_register && PageCompound(old_page),
> -                 "uprobe unregister should never work on compound page\n")) {
> -                ret = -EINVAL;
> -                goto put_old;
> +        ret = verify_opcode(page, opcode_vaddr, &opcode);
> +        if (ret <= 0) {
> +                folio_put(folio);
> +                goto out;
>          }
>
>          /* We are going to replace instruction, update ref_ctr. */
>          if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
>                  ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
> -                if (ret)
> -                        goto put_old;
> +                if (ret) {
> +                        folio_put(folio);
> +                        goto out;
> +                }
>
>                  ref_ctr_updated = 1;
>          }
>
>          ret = 0;
> -        if (!is_register && !PageAnon(old_page))
> -                goto put_old;
> -
> -        ret = anon_vma_prepare(vma);
> -        if (ret)
> -                goto put_old;
> -
> -        ret = -ENOMEM;
> -        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
> -        if (!new_page)
> -                goto put_old;
> -
> -        __SetPageUptodate(new_page);
> -        copy_highpage(new_page, old_page);
> -        copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
> +        if (unlikely(!folio_test_anon(folio))) {
> +                VM_WARN_ON_ONCE(is_register);
> +                folio_put(folio);
> +                goto out;
> +        }
>
>          if (!is_register) {
> -                struct page *orig_page;
> -                pgoff_t index;
> -
> -                VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
> -
> -                index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
> -                orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
> -                                          index);
> -
> -                if (orig_page) {
> -                        if (PageUptodate(orig_page) &&
> -                            pages_identical(new_page, orig_page)) {
> -                                /* let go new_page */
> -                                put_page(new_page);
> -                                new_page = NULL;
> -
> -                                if (PageCompound(orig_page))
> -                                        orig_page_huge = true;
> -                        }
> -                        put_page(orig_page);
> -                }
> +                /*
> +                 * In the common case, we'll be able to zap the page when
> +                 * unregistering. So trigger MMU notifiers now, as we won't
> +                 * be able to do it under PTL.
> +                 */
> +                mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> +                                        vaddr, vaddr + PAGE_SIZE);
> +                mmu_notifier_invalidate_range_start(&range);
> +        }
> +
> +        ret = -EAGAIN;
> +        /* Walk the page tables again, to perform the actual update. */
> +        if (folio_walk_start(&fw, vma, vaddr, 0)) {
> +                if (fw.page == page)
> +                        ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
> +                folio_walk_end(&fw, vma);
>          }
>
> -        ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
> -        if (new_page)
> -                put_page(new_page);
> -put_old:
> -        put_page(old_page);
> +        if (!is_register)
> +                mmu_notifier_invalidate_range_end(&range);
>
> -        if (unlikely(ret == -EAGAIN))
> +        folio_put(folio);
> +        switch (ret) {
> +        case -EFAULT:
> +                gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
> +                fallthrough;
> +        case -EAGAIN:
>                  goto retry;
> +        default:
> +                break;
> +        }
>
> +out:
>          /* Revert back reference counter if instruction update failed. */
> -        if (ret && is_register && ref_ctr_updated)
> +        if (ret < 0 && is_register && ref_ctr_updated)
>                  update_ref_ctr(uprobe, mm, -1);
>
>          /* try collapse pmd for compound page */
> -        if (!ret && orig_page_huge)
> +        if (ret > 0)
>                  collapse_pte_mapped_thp(mm, vaddr, false);
>
> -        return ret;
> +        return ret < 0 ? ret : 0;
>  }
>
>  /**
> --
> 2.48.1
>
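PS: to spell out how I was reading the reference flow (just a simplified
sketch of my understanding, abridged from the patch above, not the actual
code), which is where the folio_put() I asked about confused me:

        /* uprobe_write_opcode() */
        ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL);
        folio = page_folio(page);       /* holds the GUP reference */
        ...
        if (folio_walk_start(&fw, vma, vaddr, 0)) {
                if (fw.page == page)
                        /* zap path does folio_remove_rmap_pte() + folio_put() */
                        ret = __uprobe_write_opcode(vma, &fw, folio,
                                                    opcode_vaddr, opcode);
                folio_walk_end(&fw, vma);
        }
        ...
        folio_put(folio);               /* drops the GUP reference */

so I wasn't sure which reference the folio_put() inside
__uprobe_write_opcode() is meant to drop.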